Compare commits

...

189 Commits

Author SHA1 Message Date
Jon Taylor
2b1f056aa7 sketch for runner as a module 2025-07-24 20:15:06 +01:00
Mark Backman
2be615066c Merge pull request #2261 from pipecat-ai/mb/foundational-requirements
Foundational requirements.txt: add silero, websocket optional dep, re…
2025-07-24 11:06:16 -07:00
Mark Backman
1bb821a07d Foundational requirements.txt: add silero, websocket optional dep, remove fastapi 2025-07-24 13:49:44 -04:00
Filipi da Silva Fuchter
d8bcb81f35 Merge pull request #2259 from pipecat-ai/filipi/eleven_labs_delayed_messages
Play delayed messages from `ElevenLabsTTSService` if they still belong to the current context.
2025-07-24 12:07:06 -03:00
Filipi da Silva Fuchter
3ce0ab8c6d Removing extra space.
Co-authored-by: Mark Backman <mark@daily.co>
2025-07-24 12:05:17 -03:00
Filipi Fuchter
097d786431 Fixing ruff format. 2025-07-24 12:03:17 -03:00
Filipi Fuchter
662f04879c Play delayed messages from ElevenLabsTTSService if they still belong to the current context. 2025-07-24 12:00:14 -03:00
Mark Backman
7a69f57e11 Merge pull request #2255 from pipecat-ai/mb/pyproject-versions-for-uv
pyproject.toml dependency updates to support better cross compatibility
2025-07-24 06:43:35 -07:00
Mark Backman
5b7b4efdc9 Add broader version support for stable core dependencies, up to the next major version 2025-07-24 09:40:52 -04:00
Mark Backman
cfa26524ca Add support for fastapi>=0.115.6,<0.117.0 2025-07-24 09:37:42 -04:00
Mark Backman
3d4ab7158d pyproject.toml dependency updates to support better cross compatibility 2025-07-24 09:37:42 -04:00
Mark Backman
26d1ca3c98 Merge pull request #2256 from pipecat-ai/mb/refactor-neuphonic-http
NeuphonicHttpTTSService: Refactor to use POST API
2025-07-24 06:36:23 -07:00
Mark Backman
083b32887e NeuphonicHttpTTSService: Refactor to use POST API 2025-07-24 01:05:37 -04:00
Mark Backman
3391929127 Merge pull request #2252 from pipecat-ai/mb/example-axios-version-bump
Update axios in daily-pstn-server example due to transitive vulnerabi…
2025-07-23 13:30:58 -07:00
Mark Backman
ebf9bc2741 Merge pull request #2246 from ydlamba/ydlamba/missing-livekit-event
fix(livekit): emit on_audio_track_subscribed event
2025-07-23 11:27:10 -07:00
Mark Backman
f5edde42f6 Update axios in daily-pstn-server example due to transitive vulnerability with form-data 2025-07-23 14:22:13 -04:00
Filipi da Silva Fuchter
37bb7ef926 Merge pull request #2239 from pipecat-ai/filipi/daily_log
Added `set_log_level` to `DailyTransport`
2025-07-23 14:48:34 -03:00
Filipi Fuchter
a63d1530a4 Added set_log_level to DailyTransport. 2025-07-23 14:43:53 -03:00
Yash Dev Lamba
960bc9df5b chore(changelog): add entry for LiveKitTransport audio subscribed event fix 2025-07-23 22:41:20 +05:30
Mark Backman
e2a153ee01 Merge pull request #2242 from pipecat-ai/mb/websockets-14
Upgrade websockets to support asyncio implementation
2025-07-23 08:58:08 -07:00
Mark Backman
300f19ad23 Port to the websockets asyncio implementation, support for websockets 13 and 14 2025-07-23 11:54:25 -04:00
Mark Backman
7955080da2 Change extra_headers to additional_headers, update websocket version support 2025-07-23 11:53:43 -04:00
Mark Backman
994e82c1ef Merge pull request #2243 from pipecat-ai/mb/word-wrangler-twilio-readme
Update Word Wrangler phone bot README to include deployment info
2025-07-23 07:04:19 -07:00
Mark Backman
b07b947352 Merge pull request #2244 from pipecat-ai/mb/upgrade-deepgram-4.7.0
Deepgram: Update optional dependency to 4.7.0
2025-07-23 07:04:02 -07:00
Filipi da Silva Fuchter
a6527c3856 Merge pull request #2240 from pipecat-ai/filipi/sig_term
Adding support for handle_sigterm
2025-07-23 08:15:50 -03:00
Yash Dev Lamba
0e6874b605 fix(livekit): emit on_audio_track_subscribed event 2025-07-23 08:23:45 +05:30
Mark Backman
9ba172c49f Merge pull request #2236 from dbtreasure/fix/python-311-compatibility
Fix Python 3.11+ compatibility by pinning numba/llvmlite versions
2025-07-22 18:20:38 -07:00
dbtreasure
f710c94b6e Address code review feedback: remove explicit llvmlite pin
- Remove explicit llvmlite>=0.44.0 pin as numba>=0.61.0 automatically pulls compatible version
- Add changelog entry for Python 3.11+ dependency fix

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-22 18:45:32 -06:00
dbtreasure
6e3a0a2d5d Add explicit numba/llvmlite pins for Python 3.11+ compatibility
Fixes dependency resolution issues where transitive dependencies
through resampy would install incompatible versions:
- numba>=0.61.0 (supports Python 3.10-3.13)
- llvmlite>=0.44.0 (supports Python 3.10-3.13)

Previously, older versions (numba 0.53.1, llvmlite 0.36.0) only
supported Python 3.6-3.9, causing deployment failures on Python 3.11+.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-22 18:45:02 -06:00
Mark Backman
9530b8b842 Merge pull request #2235 from pipecat-ai/mb/nltk-tokenizer
Update match_endofsentence to use NLTK sentence tokenizer
2025-07-22 17:22:23 -07:00
Mark Backman
26c937af87 Update match_endofsentence to use NLTK sentence tokenizer 2025-07-22 20:19:29 -04:00
Mark Backman
976f6168f0 Deepgram: Update optional dependency to 4.7.0 2025-07-22 20:15:30 -04:00
Mark Backman
0be64e0fd9 Update Word Wrangler phone bot README to include deployment info 2025-07-22 20:10:20 -04:00
Filipi Fuchter
7d527c3a6b Mentioning the new field in the changelog. 2025-07-22 19:32:52 -03:00
Filipi Fuchter
c6f6930c27 Adding support for handle_sigterm 2025-07-22 17:24:07 -03:00
Mark Backman
c33dfe8309 Merge pull request #2233 from pipecat-ai/mb/enable-tracing-flag
fix: enable_tracing PipelineParam controls the service class decorators
2025-07-22 08:14:32 -07:00
Mark Backman
769cd1ef06 fix: enable_tracing PipelineParam controls the service class decorators 2025-07-22 11:10:53 -04:00
Mark Backman
6d72f60571 Merge pull request #2234 from pipecat-ai/mb/fix-minimax-pitch
fix: MiniMaxHttpTTSService pitch, add base_url arg
2025-07-22 08:10:01 -07:00
Mark Backman
e8d0712ac1 Merge pull request #2238 from pipecat-ai/mb/patch-form-data
Fix form-data vulnerability in pipecat-cloud-daily-pstn-server
2025-07-22 08:09:49 -07:00
Mark Backman
88b2c817ac Fix form-data vulnerability in pipecat-cloud-daily-pstn-server 2025-07-22 10:08:25 -04:00
Mark Backman
f8f6c9918d Merge pull request #2237 from pipecat-ai/mb/pipecat-cloud-example-pipeline-runner-args
Update Pipecat Cloud example to use handle_sigint=False in PipelineRu…
2025-07-22 06:55:56 -07:00
Mark Backman
8ee608bbfe Update Pipecat Cloud example to use handle_sigint=False in PipelineRunner args 2025-07-22 09:52:57 -04:00
Mark Backman
fad2ba4570 Merge pull request #2204 from yousifa/mcp-FunctionCallParams 2025-07-22 05:01:32 -07:00
Mark Backman
f609f7eb53 fix: MiniMaxHttpTTSService pitch, add base_url arg 2025-07-21 21:16:35 -04:00
Mark Backman
ea09813a2b Merge pull request #2227 from pipecat-ai/mb/fix-11labs-wordtimestamps
fix: Improve ElevenLabsTTSService word/timestamp calcuation accuracy
2025-07-21 16:07:07 -07:00
Mark Backman
53abfc27a7 fix: Improve ElevenLabsTTSService word/timestamp calcuation accuracy 2025-07-21 18:48:38 -04:00
Mark Backman
9c72e96a2c Merge pull request #2230 from pipecat-ai/mb/livekit-tenacity
Livekit: change tenacity supported versions
2025-07-21 15:28:38 -07:00
Mark Backman
f66c67c4ab Merge pull request #2232 from pipecat-ai/mb/fix-ollama-args
Fix: Ollama kwargs error
2025-07-21 15:26:13 -07:00
Mark Backman
b623face03 Add Ollama function calling example 14u 2025-07-21 17:52:16 -04:00
Mark Backman
698d60f3ae fix: OLLamaLLMService pass base_url as kwarg 2025-07-21 17:51:11 -04:00
Mark Backman
c9717a23a5 Livekit: change tenacity supported versions 2025-07-21 17:30:18 -04:00
Mark Backman
d981ce6e56 Merge pull request #2226 from pipecat-ai/mb/11labs-speed-docstring
Fix 11Labs speed docstring
2025-07-21 13:21:45 -07:00
Mark Backman
1bbd3bd8ab Fix 11Labs speed docstring 2025-07-21 14:58:12 -04:00
Kwindla Hultman Kramer
a20915caa7 Merge pull request #2224 from pipecat-ai/khk/mps
Add MPS backend auto-detection to local smart-turn v2
2025-07-21 09:24:51 -07:00
Vanessa Pyne
28cab5a606 Merge pull request #1932 from getchannel/groundingMetadata
Add groundingMetadata to Gemini Multimodal Live Service
2025-07-21 10:09:26 -05:00
Vanessa Pyne
cfea56064d small merge-main nit fixes - gemini_multimodal_live events.py 2025-07-21 09:54:15 -05:00
Vanessa Pyne
8467d87cfc small main-merge fixes - gemini.py 2025-07-21 09:52:32 -05:00
Kwindla Hultman Kramer
b20d020bea Add MPS backend auto-detection to local smart-turn v2 2025-07-20 20:18:45 -04:00
Pete
948257c66e Merge branch 'main' into groundingMetadata 2025-07-20 19:54:30 -04:00
Pete
b54d1fb7fd Resolve merge conflict and remove duplicate File API initialization
- Remove duplicate file_api initialization lines
- Keep grounding metadata tracking functionality
- Maintain clean code structure
2025-07-20 19:15:40 -04:00
Pete
ec361df0d1 Fix final ruff linting issues
- Remove duplicate import in __init__.py
- Clean up extra blank lines in gemini.py
- Remove extra blank line in _create_single_response method
2025-07-20 18:58:54 -04:00
Pete
b1a5cddde4 Refactor whitespace and formatting in multiple files
- Clean up unnecessary whitespace in `gemini.py`, `events.py`, and `file_api.py`
- Ensure consistent formatting in `26g-gemini-multimodal-live-groundingMetadata.py`
- Improve readability by aligning code and removing trailing spaces
2025-07-20 18:40:12 -04:00
Pete
e165d38277 remove truncated logging from debug 2025-07-20 18:27:21 -04:00
Pete
8ba340a8a5 remove debug logging 2025-07-20 18:21:42 -04:00
kompfner
d4e33663b2 Merge pull request #2214 from pipecat-ai/pk/fix-google-llm-context
Fixed an issue in `GoogleLLMContext` where it would inject the `syste…
2025-07-18 09:28:28 -04:00
marcus-daily
d7d1b16dad Removing old import 2025-07-18 12:48:06 +01:00
marcus-daily
0bc2ea13f2 Updating changelog 2025-07-18 12:48:06 +01:00
marcus-daily
b5d1301221 Fix linter warnings 2025-07-18 12:48:06 +01:00
marcus-daily
ed8f30ec71 Add support for running smart-turn-v2 locally 2025-07-18 12:48:06 +01:00
kompfner
a74a935ca0 Merge pull request #1910 from matejmarinko-soniox/main
Add Soniox STT service integration
2025-07-17 09:29:07 -04:00
Paul Kompfner
7cfd56699b Fixed an issue in GoogleLLMContext where it would inject the system_message as a "user" message into cases where it was not meant to; it was only meant to do that when there were no "regular" (non-function-call) messages in the context, to ensure that inference would run properly. 2025-07-16 16:07:53 -04:00
Matej Marinko
cb984237a7 Fix lint error 2025-07-16 16:54:28 +02:00
Matej Marinko
c969fdddb9 Rename and simplify VAD finalization parameter usage 2025-07-16 09:47:34 +02:00
Mark Backman
9931ad2ce1 Merge pull request #2199 from Dev-Khant/add-host-support-in-Mem0
Add `host` support in Mem0 Memory
2025-07-15 11:41:15 -07:00
Filipi da Silva Fuchter
fd73feb645 Merge pull request #2201 from pipecat-ai/filipi/stt_issue
Only create the EmulateUserStartedSpeakingFrame if we have received a transcription
2025-07-15 13:56:11 -03:00
Yousif Astarabadi
ee78428a2a formatted 2025-07-14 20:38:28 -07:00
Yousif Astarabadi
ae02249255 mcp_tool_wrapper using FunctionCallParams 2025-07-14 20:31:22 -07:00
Filipi Fuchter
727af2e6fb Only create the EmulateUserStartedSpeakingFrame if we have received a transcription. 2025-07-14 17:38:03 -03:00
Mark Backman
8fd5576879 Merge pull request #2198 from Allenmylath/patch-24
Update app.py
2025-07-14 06:37:42 -07:00
kompfner
1f85dcee7c Merge pull request #2171 from pipecat-ai/pk/aws-strands-demo
Minimal AWS Strands demo
2025-07-14 09:32:16 -04:00
Dev Khant
138890bc5c Add support in Mem0 Memory 2025-07-14 18:08:25 +05:30
Filipi da Silva Fuchter
a094efc9e6 Merge pull request #2196 from pipecat-ai/mb/lmnt-model
LmntTTSService: update the default model to blizzard
2025-07-14 09:15:17 -03:00
allenmylath
1f9e2fdecc Update app.py
misleading comment. no endpoints.py
2025-07-14 14:02:35 +05:30
Mark Backman
4a2b4660bc LmntTTSService: update the default model to blizzard 2025-07-13 10:54:43 -07:00
Mark Backman
b3ac90015a Merge pull request #2195 from Trinary-Projects/transformers_ver_patch
Update transformers dep. to >=4.48.0 for Ultravox
2025-07-11 23:31:47 -07:00
Jaideep
2fe06f0a4e Update pyproject.toml 2025-07-12 11:34:45 +05:30
Mark Backman
1836a7484e Merge pull request #2193 from pipecat-ai/mb/changelog-0.0.76
Prepare changelog for 0.0.76 release
2025-07-11 16:15:34 -07:00
Mark Backman
25a5c5aaab Prepare changelog for 0.0.76 release 2025-07-11 16:08:08 -07:00
mattie ruth backman
24694e2558 Changelog entry 2025-07-11 14:30:12 -07:00
mattie ruth backman
2325edd9ba Add a text entry box to the simple-chatbot example 2025-07-11 14:30:12 -07:00
mattie ruth backman
fad5713ade Fix append-to-context function call 2025-07-11 14:30:12 -07:00
Paul Kompfner
fe8573322f AWS Strands demos 2025-07-11 16:42:27 -04:00
Mark Backman
06c1255abe fix: use a different aggregation timeout for emulated user speech (#2185)
* fix: use a different aggregation timeout for emulated user speech

* Add SpeechControlParamsFrame

* Update test_context_aggregator tests
2025-07-11 16:33:44 -04:00
Mark Backman
f108a67635 Merge pull request #2189 from pipecat-ai/mb/numpy-version-bump
Update numpy, transformers to support newer versions
2025-07-11 12:02:02 -07:00
Mark Backman
bf580d061d Update numpy, transformers to support newer versions 2025-07-11 11:58:31 -07:00
Filipi da Silva Fuchter
b005bd7b98 Merge pull request #2184 from pipecat-ai/filipi/twilio_issue
Fixing an issue where Pipecat was not receiving the user's audio
2025-07-11 15:32:28 -03:00
Filipi Fuchter
75f8baab33 Mentioning the fixes in the changelog. 2025-07-11 11:56:16 -03:00
Matej Marinko
5c3fb73cef Rename example 2025-07-11 16:07:24 +02:00
Filipi Fuchter
5c3f4180b9 Refactored VAD analyzer to process multiple audio frames in a single iteration if needed. 2025-07-11 10:59:32 -03:00
Mark Backman
6cd6e7ceed Merge pull request #2186 from pipecat-ai/mb/fix-pre-commit-config
Update .pre-commit-config.yaml to use pyproject.toml linting rules
2025-07-11 06:34:01 -07:00
Filipi Fuchter
1a146c2a64 Not serializing a JSON in case we have no audio. 2025-07-11 10:15:09 -03:00
Filipi Fuchter
eaeb9e6efa Not creating InputAudioRawFrame in case we don't have bytes. Fixed for Pilvo, Exotel and Telnyx. 2025-07-11 09:51:38 -03:00
Matej Marinko
2e84c91748 Remove outdated parameter 2025-07-11 08:52:39 +02:00
Matej Marinko
650d45c1f4 Use single sample rate parameter 2025-07-11 08:27:06 +02:00
Filipi Fuchter
f4f65024ef Refactoring the test client to use the new version of the Pipecat Client SDK. 2025-07-10 21:57:25 -03:00
Filipi Fuchter
1200aa4fb8 Not creating InputAudioRawFrame in case we don't have bytes. 2025-07-10 21:56:34 -03:00
Filipi da Silva Fuchter
6762363685 Merge pull request #2183 from pipecat-ai/filipi/parallel_pipeline_issue
Fixed an issue in ParallelPipeline that caused errors when attempting to drain the queues.
2025-07-10 21:51:04 -03:00
Filipi Fuchter
b2ead325c4 Fixed an issue in ParallelPipeline that caused errors when attempting to drain the queues. 2025-07-10 21:50:35 -03:00
Mark Backman
4e24b915cc Update .pre-commit-config.yaml to use pyproject.toml linting rules 2025-07-10 16:10:27 -07:00
kompfner
b610ee26ba Merge pull request #2181 from pipecat-ai/pk/fix-aws-nova-sonic-pipeline-freeze
Fix a pipeline freeze when using AWS Nova Sonic. The freeze occurs if…
2025-07-10 16:30:55 -04:00
Paul Kompfner
2b867f1613 Fix a pipeline freeze when using AWS Nova Sonic. The freeze occurs if the user starts speaking before we've finished sending the "trigger " audio (AWS Nova Sonic can only start speaking in response to a user utterance, so we have a simulated user utterance to "trigger" the bot speaking without the user having actually spoken first). 2025-07-10 15:57:05 -04:00
Mark Backman
7b8fe565c7 Merge pull request #2182 from pipecat-ai/mb/run-example-usage
run.py: Add example usage to the module docstring
2025-07-10 12:48:29 -07:00
Mark Backman
a246862910 run.py: Add example usage to the module docstring 2025-07-10 11:41:49 -07:00
Filipi da Silva Fuchter
106809f3fd Merge pull request #2166 from carolin-tavus/remove-persona-microphone-check
feat: Remove persona microphone check
2025-07-10 15:28:35 -03:00
carolin-tavus
f0d8499f7e feat: avoid checking microphone enabled 2025-07-10 09:40:27 +00:00
Mark Backman
332ca3d55e Merge pull request #2177 from pipecat-ai/mb/fix-ruff-improvements
Make fix-ruff.sh more flexible, use pyproject rules
2025-07-09 12:33:05 -07:00
Mark Backman
a48f5d5796 Make fix-ruff.sh more flexible, use pyproject rules 2025-07-09 11:48:17 -07:00
Mark Backman
f04f047428 Merge pull request #2176 from pipecat-ai/mb/pre-commit-config
Add docstring checking to .pre-commit-config.yaml
2025-07-09 11:47:25 -07:00
Mark Backman
4e61fd33ea Add docstring checking to .pre-commit-config.yaml 2025-07-09 11:18:40 -07:00
Matej Marinko
61ac77be72 Update docs 2025-07-09 11:59:45 +02:00
Matej Marinko
c093eb5b63 Move config to main file 2025-07-09 10:20:37 +02:00
Matej Marinko
98e24131bd Send raw result 2025-07-09 09:59:04 +02:00
Matej Marinko
7becce9e8c Add transcript tracing 2025-07-09 09:37:58 +02:00
Matej Marinko
3cdaeb719a Update examples to new format 2025-07-09 09:28:43 +02:00
Matej Marinko
8daaea5969 Minor code cleanup 2025-07-09 09:03:02 +02:00
matejmarinko-soniox
dc47516e14 Update src/pipecat/services/soniox/config.py
Co-authored-by: Mark Backman <m.backman@gmail.com>
2025-07-09 08:04:59 +02:00
Mark Backman
0fcc4f822f Merge pull request #2173 from captaincaius/fix-nextjs-webhook-example-null-check
fix nextjs webhook example num_endpoints null check
2025-07-08 14:10:16 -07:00
Captain Caius
c0ed061ff5 fix nextjs webhook example num_endpoints null check 2025-07-08 13:40:26 -07:00
Mark Backman
d98b6b418d Release prep for 0.0.75 (#2169)
* Update CHANGELOG for 0.0.75 release

* Add RTVI changes to CHANGELOG

* Update CHANGELOG, add deprecation directives to rtvi.py

---------

Co-authored-by: mattie ruth backman <mattieruth@gmail.com>
2025-07-08 15:39:44 -04:00
Mark Backman
deea29b5e8 Merge pull request #2170 from pipecat-ai/mb/update-packages-0.0.75
Package updates to run the release evals
2025-07-08 12:02:22 -07:00
Mark Backman
0bdbc83ed9 Package updates to run the release evals 2025-07-08 11:39:49 -07:00
Mark Backman
6c591f0990 Merge pull request #2167 from pipecat-ai/mb/fix-riva-watchdog
RivaSTTService: reset the watchdog in an async function
2025-07-08 11:29:44 -07:00
Mark Backman
b55b9c257b RivaSTTService: remove reset_watchdog, which is handled in the WatchdogQueue already 2025-07-08 11:19:47 -07:00
Mark Backman
5156c21d14 Merge pull request #2168 from pipecat-ai/mb/fix-neuophonic-tts
Fix: NeuphonicTTSService to use latest websocket API
2025-07-08 11:17:58 -07:00
Mark Backman
a9d824753b Fix: NeuphonicTTSService to use latest websocket API 2025-07-08 11:08:08 -07:00
Filipi da Silva Fuchter
3c6a208101 Merge pull request #2148 from pipecat-ai/filipi/aws_bedrock
Refactoring AWSBedrockLLMService to work async
2025-07-08 12:14:28 -03:00
Mark Backman
b1032a1ca4 Merge pull request #2151 from pipecat-ai/mb/ollama-kwargs
OLLamaLLMService: Pass kwargs
2025-07-08 07:21:09 -07:00
Mark Backman
931f34fccd OLLamaLLMService: Pass kwargs 2025-07-08 07:17:18 -07:00
Mark Backman
f2509adec1 Merge pull request #2162 from pipecat-ai/mb/tts-service-aggregate-sentences
TTS services: Add aggregate_sentences arg
2025-07-08 07:14:58 -07:00
Filipi Fuchter
285b82eb65 Mentioning the AWSBedrockLLMService and AWSPollyTTSService refactors in the changelog. 2025-07-08 07:30:30 -03:00
Filipi Fuchter
74da197304 Refactored AWSBedrockLLMService and AWSPollyTTSService to work asynchronously using aioboto3 instead of the boto3 library. 2025-07-08 07:28:23 -03:00
Matej Marinko
0f727248d2 Merge branch 'main' of github.com:pipecat-ai/pipecat 2025-07-08 08:20:10 +02:00
mattie ruth backman
a6de16f92f Bump all client dependencies to use client-js/react/transports 1.0.0 2025-07-07 15:56:08 -07:00
mattie ruth backman
fc09854d7f fix cam light always on 2025-07-07 15:56:08 -07:00
mattie ruth backman
2959029151 PR Review fixes 2025-07-07 15:56:08 -07:00
mattie ruth backman
e590441b7b Add support for about info in ready messages and add deprecation comments to deprecated types 2025-07-07 15:56:08 -07:00
mattie ruth backman
dc41ec7cb1 Updated all examples with clients to use the new PipecatClient 2025-07-07 15:56:08 -07:00
mattie ruth backman
43049c865c Add support for new RTVI client message protocol: handling and responding 2025-07-07 15:56:08 -07:00
mattie ruth backman
c4a9fc7f88 video-transport typescript formatting 2025-07-07 15:56:08 -07:00
mattie ruth backman
faf4026cf4 Add device controls to the simple chatbot example 2025-07-07 15:56:08 -07:00
Mark Backman
f53f45a6cd TTS services: Add aggregate_sentences arg 2025-07-07 15:38:31 -07:00
Mark Backman
e04e876f44 Merge pull request #2156 from shahrukhx01/add-additional-whisper-models
WhisperSTTService: Add additional whisper model variants
2025-07-07 12:53:06 -07:00
shahrukhx01
a84e7e30da WhisperSTTService: Add additional whisper model variants 2025-07-07 21:43:48 +02:00
Pete
7ed4fe50d4 Update gemini.py
-FunctionCallFromLLM
-Delete duplicate Gemini imports
2025-07-03 19:39:44 -04:00
Pete
6f66ec1727 Update gemini.py
tab indentation fix
2025-07-03 18:55:21 -04:00
Pete
c7e758fc36 Merge branch 'main' into groundingMetadata 2025-07-03 18:47:47 -04:00
Pete
14c22234bb Fix parameter name consistency in parse_server_event function
- Change function body to use 'str' parameter consistently
- Matches pattern used in OpenAI Realtime Beta service
- Fixes bug where parameter was named 'str' but body used 'message_str'
- Maintains consistency with existing codebase patterns
2025-07-03 18:02:24 -04:00
Pete
d565e9ae53 Update grounding metadata example with final refinements
- Reorganize imports and transport_params structure
- Remove copyright header for consistency
- Enhance grounding metadata logging with better formatting
- Remove unnecessary PipelineParams configuration
- Update message content formatting

Completes incorporation of draft PR #2121 changes
2025-07-03 17:53:55 -04:00
Pete
4951c97eab Clean up verbose logging in grounding metadata implementation
- Remove debug logging from grounding metadata event handlers
- Simplify logging in _process_grounding_metadata method
- Clean up example file logging for better readability
- Remove verbose event parsing comments

Based on suggestions from draft PR #2121
2025-07-03 17:49:27 -04:00
Pete
9b38f3e2fa Delete examples/foundational/26f-gemini-multimodal-live-files-api.py 2025-07-03 17:15:18 -04:00
Pete
a297e4208e Merge branch 'main' into groundingMetadata 2025-06-30 19:48:55 -04:00
Pete
1cf0b35ac1 Merge branch 'main' into groundingMetadata 2025-06-24 22:00:16 -04:00
Matej Marinko
c54084b7a4 Fix deadlock on STT service stop 2025-06-23 14:18:29 +02:00
Pete
e3fe040017 Update gemini.py 2025-06-21 14:43:15 -04:00
Pete
ae5e3e2dc4 Merge branch 'main' into groundingMetadata 2025-06-21 12:16:32 -04:00
Pete
77378d2779 Merge branch 'pipecat-ai:main' into groundingMetadata 2025-06-21 12:08:49 -04:00
Pete
4106f0dabe Merge branch 'pipecat-ai:main' into main 2025-06-21 10:54:25 -04:00
Pete
2ed1ed6821 Merge branch 'pipecat-ai:main' into main 2025-06-14 16:23:27 -04:00
Matej Marinko
6d3a38842d Merge branch 'main' of github.com:pipecat-ai/pipecat 2025-06-12 11:32:38 +02:00
Pete
7360f79413 Merge branch 'pipecat-ai:main' into main 2025-06-11 13:16:19 -04:00
Pete
8d55e13750 remove audio_transcriber from gemini.py
unecessary import removed.
2025-06-10 11:22:16 -04:00
Pete
737e8e79c9 Merge branch 'main' into groundingMetadata 2025-06-10 11:12:35 -04:00
Pete
4d977fede0 Merge branch 'main' into main 2025-06-10 11:07:59 -04:00
getchannel
8070e156d8 Add groundingMetadata events.py 2025-05-30 18:07:09 -04:00
getchannel
43c6f1f5cd Add groundingMetadata and logging gemini.py 2025-05-30 18:01:15 -04:00
getchannel
f53f5445ba Create 26g-gemini-multimodal-live-groundingMetadata.py 2025-05-30 17:36:36 -04:00
getchannel
7263d11ee4 update correct upload endpoint file_api.py 2025-05-30 13:41:55 -04:00
getchannel
f2d5b9ad69 Create 26f-gemini-multimodal-live-files-api.py
This is an example to test usage of the Files API integration. Specifically with the Gemini Multimodal Live Service.
2025-05-30 13:04:52 -04:00
getchannel
40c7e3c52c Update gemini.py 2025-05-30 12:19:40 -04:00
Matej Marinko
ee5fea4221 Fix auto finalization cycle 2025-05-29 14:58:35 +02:00
Matej Marinko
db7b60cfe9 Auto finalize fix 2025-05-29 13:24:53 +02:00
Matej Marinko
51b79bd6a1 Minor code style changes 2025-05-29 10:11:11 +02:00
Matej Marinko
95fe762776 Fix typo 2025-05-29 09:23:37 +02:00
Matej Marinko
2968c846ce Add Soniox STT service 2025-05-28 09:35:21 +02:00
getchannel
e27da96cdc Rename file_api to file_api.py
added proper .py to file name.
2025-05-13 22:01:02 -04:00
getchannel
d86502e79a add file_api __init__.py 2025-05-09 10:53:31 -04:00
getchannel
59c7744590 add FileData class events.py 2025-05-09 10:52:04 -04:00
getchannel
949971dea9 Create file_api 2025-05-09 10:51:24 -04:00
getchannel
cd4a893c65 add FileAPI to gemini.py 2025-05-09 10:50:27 -04:00
138 changed files with 9796 additions and 4587 deletions

View File

@@ -4,5 +4,5 @@ repos:
hooks:
- id: ruff
language_version: python3
args: [ --select, I, ]
args: [--fix]
- id: ruff-format

View File

@@ -9,17 +9,186 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added a new field `handle_sigterm` to `PipelineRunner`. It defaults to `False`.
This field handles SIGTERM signals. The `handle_sigint` field still defaults
to `True`, but now it handles only SIGINT signals.
- Added foundational example `14u-function-calling-ollama.py` for Ollama
function calling.
- Added `LocalSmartTurnAnalyzerV2`, which supports local on-device inference
with the new `smart-turn-v2` turn detection model.
- Added `set_log_level` to `DailyTransport`, allowing setting the logging level
for Daily's internal logging system.
### Changed
- Play delayed messages from `ElevenLabsTTSService` if they still belong to the
current context.
- Dependency compatibility improvements: Relaxed version constraints for core
dependencies to support broader version ranges while maintaining stability:
- `aiohttp`, `Markdown`, `nltk`, `numpy`, `Pillow`, `pydantic`, `openai`,
`numba`: Now support up to the next major version (e.g. `numpy>=1.26.4,<3`)
- `pyht`: Relaxed to `>=0.1.6` to resolve `grpcio` conflicts with
`nvidia-riva-client`
- `fastapi`: Updated to support versions `>=0.115.6,<0.117.0`
- `torch`/`torchaudio`: Changed from exact pinning (`==2.5.0`) to compatible
range (`~=2.5.0`)
- `aws_sdk_bedrock_runtime`: Added Python 3.12+ constraint via environment
marker
- `numba`: Reduced minimum version to `0.60.0` for better compatibility
- Changed `NeuphonicHttpTTSService` to use a POST based request instead of the
`pyneuphonic` package. This removes a package requirement, allowing Neuphonic
to work with more services.
- Updated the `deepgram` optional dependency to 4.7.0, which downgrades the
`tasks cancelled error` to a debug log. This removes the log from appearing
in Pipecat logs upon leaving.
- Upgraded the `websockets` implementation to the new asyncio implementation.
Along with this change, we're updating support for versions >=13.1.0 and
<15.0.0. All services have been update to use the asyncio implementation.
- Updated `MiniMaxHttpTTSService` with a `base_url` arg where you can specify
the Global endpoint (default) or Mainland China.
- Replaced regex-based sentence detection in `match_endofsentence` with NLTK's
punkt_tab tokenizer for more reliable sentence boundary detection.
- Changed the `livekit` optional dependency for `tenacity` to
`tenacity>=8.2.3,<10.0.0` in order to support the `google-genai` package.
- For `LmntTTSService`, changed the default `model` to `blizzard`, LMNT's
recommended model.
### Fixed
- Fixed a dependency issue for uv users where an `llvmlite` version required python 3.9.
- Fixed an issue in `MiniMaxHttpTTSService` where the `pitch` param was the
incorrect type.
- Fixed an issue with OpenTelemetry tracing where the `enable_tracing` flag did
not disable the internal tracing decorator functions.
- Fixed an issue in `OLLamaLLMService` where kwargs were not passed correctly
to the parent class.
- Fixed an issue in `ElevenLabsTTSService` where the word/timestamp pairs were
calculating word boundaries incorrectly.
- Fixed an issue where, in some edge cases, the `EmulateUserStartedSpeakingFrame`
could be created even if we didn't have a transcription.
- Fixed an issue in `GoogleLLMContext` where it would inject the
`system_message` as a "user" message into cases where it was not meant to;
it was only meant to do that when there were no "regular" (non-function-call)
messages in the context, to ensure that inference would run properly.
- Fixed an issue in `LiveKitTransport` where the `on_audio_track_subscribed` was never emitted.
## [0.0.76] - 2025-07-11
### Added
- Added `SpeechControlParamsFrame`, a new `SystemFrame` that notifies
downstream processors of the VAD and Turn analyzer params. This frame is
pushed by the `BaseInputTransport` at Start and any time a
`VADParamsUpdateFrame` is received.
### Changed
- Two package dependencies have been updated:
- `numpy` now supports 1.26.0 and newer
- `transformers` now supports 4.48.0 and newer
### Fixed
- Fixed an issue with RTVI's handling of `append-to-context`.
- Fixed an issue where using audio input with a sample rate requiring resampling
could result in empty audio being passed to STT services, causing errors.
- Fixed the VAD analyzer to process the full audio buffer as long as it contains
more than the minimum required bytes per iteration, instead of only analyzing
the first chunk.
- Fixed an issue in ParallelPipeline that caused errors when attempting to drain
the queues.
- Fixed an issue with emulated VAD timeout inconsistency in
`LLMUserContextAggregator`. Previously, emulated VAD scenarios (where
transcription is received without VAD detection) used a hardcoded
`aggregation_timeout` (default 0.5s) instead of matching the VAD's
`stop_secs` parameter (default 0.8s). This created different user experiences
between real VAD and emulated VAD scenarios. Now, emulated VAD timeouts
automatically synchronize with the VAD's `stop_secs` parameter.
- Fix a pipeline freeze when using AWS Nova Sonic, which would occur if the
user started early, while the bot was still working through
`trigger_assistant_response()`.
## [0.0.75] - 2025-07-08
### Added
- Added an `aggregate_sentences` arg in `CartesiaTTSService`,
`ElevenLabsTTSService`, `NeuphonicTTSService` and `RimeTTSService`, where the
default value is True. When `aggregate_sentences` is True, the `TTSService`
aggregates the LLM streamed tokens into sentences by default. Note: setting
the value to False requires a custom processor before the `TTSService` to
aggregate LLM tokens.
- Added `kwargs` to the `OLLamaLLMService` to allow for configuration args to
be passed to Ollama.
- Added call hang-up error handling in `TwilioFrameSerializer`, which handles
the case where the user has hung up before the `TwilioFrameSerializer` hangs
up the call.
### Changed
- Updated `RTVIObserver` and `RTVIProcessor` to match the new RTVI 1.0.0 protocol.
This includes:
- Deprecating support for all messages related to service configuaration and
actions.
- Adding support for obtaining and logging data about client, including its
RTVI version and optionally included system information (OS/browser/etc.)
- Adding support for handling the new `client-message` RTVI message through
either a `on_client_message` event handler or listening for a new
`RTVIClientMessageFrame`
- Adding support for responding to a `client-message` with a `server-response`
via either a direct call on the `RTVIProcessor` or via pushing a new
`RTVIServerResponseFrame`
- Adding built-in support for handling the new `append-to-context` RTVI message
which allows a client to add to the user or assistant llm context. No extra
code is required for supporting this behavior.
- Updating all JavaScript and React client RTVI examples to use versions 1.0.0
of the clients.
Get started migrating to RTVI protocol 1.0.0 by following the migration guide:
https://docs.pipecat.ai/client/migration-guide
- Refactored `AWSBedrockLLMService` and `AWSPollyTTSService` to work
asynchronously using `aioboto3` instead of the `boto3` library.
- The `UserIdleProcessor` now handles the scenario where function calls take
longer than the idle timeout duration. This allows you to use the
`UserIdleProcessor` in conjunction with function calls that take a while to
return a result.
### Fixed
- Updated the `NeuphonicTTSService` to work with the updated websocket API.
- Fixed an issue with `RivaSTTService` where the watchdog feature was causing
an error on initialization.
### Performance
- Remove unncessary push task in each `FrameProcessor`.

View File

@@ -53,7 +53,7 @@ You can connect to Pipecat from any platform using our official SDKs:
| Category | Services |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |

View File

@@ -11,3 +11,10 @@ ruff~=0.12.1
setuptools~=78.1.1
setuptools_scm~=8.3.1
python-dotenv~=1.1.1
# For running examples
uvicorn
python-dotenv
fastapi
aiohttp
aiortc

View File

@@ -77,6 +77,7 @@ autodoc_mock_imports = [
"openpipe",
"simli",
"soundfile",
"soniox",
"pipecat_ai_krisp",
"pyaudio",
"_tkinter",

View File

@@ -46,6 +46,7 @@ pipecat-ai[sambanova]
pipecat-ai[silero]
pipecat-ai[simli]
pipecat-ai[soundfile]
pipecat-ai[soniox]
pipecat-ai[speechmatics]
pipecat-ai[tavus]
pipecat-ai[together]

View File

@@ -109,6 +109,9 @@ MINIMAX_GROUP_ID=...
# Sarvam AI
SARVAM_API_KEY=...
# Soniox
SONIOX_API_KEY=
# Speechmatics
SPEECHMATICS_API_KEY=...

View File

@@ -0,0 +1,60 @@
# AWS Strands Examples
This folder contains two Python examples demonstrating how to use Pipecat with the AWS Strands agent.
## Overview
These examples show how to delegate complex, multi-step tasks to a Strands agent, which can reason step-by-step and call tools to accomplish user requests.
These examples are intentionally simplified for demonstration, using mock API calls. They work best if you ask it:
> What's the weather where the Golden Gate Bridge is?
## Example Scripts
### `black-box.py`
A minimal example that demonstrates how to use the Strands agent with Pipecat. The agent can handle multi-step queries by calling tools, but does not explain its reasoning out loud.
### `explain-thinking.py`
An enhanced example where the Strands agent explains each step of its reasoning in clear, simple language as it works through a multi-step task.
## Quick Start
1. **Clone the repository and navigate to this example:**
```bash
git clone https://github.com/pipecat-ai/pipecat.git
cd pipecat/examples/aws-strands
```
2. **Set up a virtual environment:**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
3. **Install dependencies:**
```bash
pip install -r requirements.txt
```
4. **Configure environment variables:**
Copy the provided `env.example` file to `.env` and fill in the necessary credentials:
```bash
cp env.example .env
# Then edit .env with your preferred editor
```
5. **Run an example:**
```bash
python black-box.py
# or
python explain-thinking.py
```

View File

@@ -0,0 +1,206 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
from dotenv import load_dotenv
from loguru import logger
from strands import Agent, tool
from strands.models import BedrockModel
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
"""This example demonstrates how to use the Strands agent with Pipecat.
You can delegate complex, multi-step tasks to the Strands agent, which can cycle through LLM-based reasoning and tool calls to accomplish the task.
Try asking: "What's the weather where the Golden Gate Bridge is?"
"""
# Strands agent tools
@tool
def get_location_name_from_landmark(landmark: str) -> str:
"""
Get the location name from a landmark.
Args:
landmark (str): The name of the landmark, e.g. "Golden Gate Bridge".
"""
# Simulate fetching location
return "San Francisco, CA"
@tool
def get_lat_long_from_location_name(location: str) -> dict:
"""
Get the latitude and longitude for a location name.
Args:
location (str): The city and state, e.g. "San Francisco, CA".
"""
# Simulate fetching lat/long from a geocoding service
return {"lat": 37.7749, "long": -122.4194}
@tool
def get_current_weather_from_lat_long(lat: float, long: float) -> dict:
"""
Get the current weather for a specific latitude and longitude.
Args:
lat (float): The latitude of the location.
long (float): The longitude of the location.
"""
# Simulate fetching weather data from a weather service
return {"conditions": "nice", "temperature": "75"}
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
strands_agent = Agent(
model=BedrockModel(
model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0", max_tokens=64000
),
tools=[
get_location_name_from_landmark,
get_lat_long_from_location_name,
get_current_weather_from_lat_long,
],
system_prompt="""
You are a helpful personal assistant who can look up information about places and weather.
Your key capabilities:
1. Look up where landmarks are located.
2. Find latitude and longitude for a location.
3. Look up the current weather for a specific latitude and longitude.
Explain each step of your reasoning in clear, simple, and concise language. Your responses will be converted to audio, so avoid special characters and numbered lists.
""",
)
async def handle_location_or_weather_related_queries(params: FunctionCallParams, query: str):
"""
Handle location or weather related queries.
Args:
query (str): The user's query, e.g. "What's the weather where the Golden Gate Bridge is?".
"""
# Run in a background thread
# (Otherwise the agent blocks the event loop; one effect of that is that we don't hear
# "let me check on that" until the agent finishes)
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, strands_agent, query)
await params.result_callback(result.message)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm.register_direct_function(handle_location_or_weather_related_queries)
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
tools = ToolsSchema(standard_tools=[handle_location_or_weather_related_queries])
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. Start by suggesting that the user ask about the weather where the Golden Gate Bridge is.",
},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=handle_sigint)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -0,0 +1,8 @@
OPENAI_API_KEY=
CARTESIA_API_KEY=
DEEPGRAM_API_KEY=
DAILY_API_KEY=
DAILY_SAMPLE_ROOM_URL=
AWS_SECRET_ACCESS_KEY=
AWS_ACCESS_KEY_ID=
AWS_REGION=

View File

@@ -0,0 +1,249 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import threading
import time
from dotenv import load_dotenv
from loguru import logger
from strands import Agent, tool
from strands.models import BedrockModel
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
"""This example demonstrates how to use the Strands agent with Pipecat in a way where the agent explains its reasoning step-by-step.
You can delegate complex, multi-step tasks to the Strands agent, which can cycle through LLM-based reasoning and tool calls to accomplish the task.
Try asking: "What's the weather where the Golden Gate Bridge is?"
"""
# Strands agent tools
@tool
def get_location_name_from_landmark(landmark: str) -> str:
"""
Get the location name from a landmark.
Args:
landmark (str): The name of the landmark, e.g. "Golden Gate Bridge".
"""
# Simulate fetching location (slowly)
time.sleep(3)
return "San Francisco, CA"
@tool
def get_lat_long_from_location_name(location: str) -> dict:
"""
Get the latitude and longitude for a location name.
Args:
location (str): The city and state, e.g. "San Francisco, CA".
"""
# Simulate fetching lat/long from a geocoding service (slowly)
time.sleep(3)
return {"lat": 37.7749, "long": -122.4194}
@tool
def get_current_weather_from_lat_long(lat: float, long: float) -> dict:
"""
Get the current weather for a specific latitude and longitude.
Args:
lat (float): The latitude of the location.
long (float): The longitude of the location.
"""
# Simulate fetching weather data from a weather service (slowly)
time.sleep(3)
return {"conditions": "nice", "temperature": "75"}
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
next_strands_message_is_last = False
strands_messages_queue = asyncio.Queue()
def strands_callback_handler(**kwargs):
"""
Handle events from the Strands agent.
"""
nonlocal next_strands_message_is_last
if "event" in kwargs:
event_obj = kwargs["event"]
if event_obj and "messageStop" in event_obj:
message_stop = event_obj["messageStop"]
if message_stop and "stopReason" in message_stop:
stop_reason = message_stop["stopReason"]
if stop_reason == "end_turn":
next_strands_message_is_last = True
elif "message" in kwargs:
message_obj = kwargs["message"]
if message_obj and "content" in message_obj and "role" in message_obj:
role = message_obj["role"]
content = message_obj["content"]
if role == "assistant" and isinstance(content, list):
for content_obj in content:
if isinstance(content_obj, dict) and "text" in content_obj:
message = content_obj["text"]
if not next_strands_message_is_last:
strands_messages_queue.put_nowait(message)
async def process_strands_messages():
while True:
message = await strands_messages_queue.get()
await tts.queue_frame(TTSSpeakFrame(message))
strands_messages_queue.task_done()
asyncio.create_task(process_strands_messages())
strands_agent = Agent(
model=BedrockModel(
model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0", max_tokens=64000
),
tools=[
get_location_name_from_landmark,
get_lat_long_from_location_name,
get_current_weather_from_lat_long,
],
system_prompt="""
You are a helpful personal assistant who can look up information about places and weather.
Your key capabilities:
1. Look up where landmarks are located.
2. Find latitude and longitude for a location.
3. Look up the current weather for a specific latitude and longitude.
Explain each step of your reasoning in clear, simple, and concise language. Your responses will be converted to audio, so avoid special characters and numbered lists.
""",
callback_handler=strands_callback_handler,
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
async def handle_location_or_weather_related_queries(params: FunctionCallParams, query: str):
"""
Handle location or weather related queries.
Args:
query (str): The user's query, e.g. "What's the weather where the Golden Gate Bridge is?".
"""
# Run in a background thread
# (Otherwise the agent blocks the event loop; one effect of that is that we don't hear
# the agent's "thinking" messages until the agent finishes)
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, strands_agent, query)
await params.result_callback(result.message)
llm.register_direct_function(handle_location_or_weather_related_queries)
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
tools = ToolsSchema(standard_tools=[handle_location_or_weather_related_queries])
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. Start by suggesting that the user ask about the weather where the Golden Gate Bridge is.",
},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=handle_sigint)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -2,4 +2,5 @@ fastapi
uvicorn
python-dotenv
pipecat-ai[webrtc,daily,deepgram,cartesia]
pipecat-ai-small-webrtc-prebuilt
pipecat-ai-small-webrtc-prebuilt
strands-agents

File diff suppressed because it is too large Load Diff

View File

@@ -15,7 +15,7 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.10"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0"
}
}

View File

@@ -5,7 +5,7 @@
*/
/**
* RTVI Client Implementation
* Pipecat Client Implementation
*
* This client connects to an RTVI-compatible bot server using WebRTC (via Daily).
* It handles audio/video streaming and manages the connection lifecycle.
@@ -16,7 +16,7 @@
* - Browser with WebRTC support
*/
import { RTVIClient, RTVIEvent } from '@pipecat-ai/client-js';
import { PipecatClient, RTVIEvent } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
/**
@@ -26,7 +26,7 @@ import { DailyTransport } from '@pipecat-ai/daily-transport';
class ChatbotClient {
constructor() {
// Initialize client state
this.rtviClient = null;
this.pcClient = null;
this.setupDOMElements();
this.initializeClientAndTransport();
this.setupEventListeners();
@@ -59,7 +59,7 @@ class ChatbotClient {
this.disconnectBtn.addEventListener('click', () => this.disconnect());
// Populate device selector
this.rtviClient.getAllMics().then((mics) => {
this.pcClient.getAllMics().then((mics) => {
console.log('Available mics:', mics);
mics.forEach((device) => {
const option = document.createElement('option');
@@ -71,16 +71,16 @@ class ChatbotClient {
this.deviceSelector.addEventListener('change', (event) => {
const selectedDeviceId = event.target.value;
console.log('Selected device ID:', selectedDeviceId);
this.rtviClient.updateMic(selectedDeviceId);
this.pcClient.updateMic(selectedDeviceId);
});
// Handle mic mute/unmute toggle
const micToggleBtn = document.getElementById('mic-toggle-btn');
micToggleBtn.addEventListener('click', () => {
let micEnabled = this.rtviClient.isMicEnabled;
let micEnabled = this.pcClient.isMicEnabled;
micToggleBtn.textContent = micEnabled ? 'Unmute Mic' : 'Mute Mic';
this.rtviClient.enableMic(!micEnabled);
this.pcClient.enableMic(!micEnabled);
// Add logic to mute/unmute the mic
if (micEnabled) {
console.log('Mic muted');
@@ -93,23 +93,12 @@ class ChatbotClient {
}
/**
* Set up the RTVI client and Daily transport
* Set up the Pipecat client and Daily transport
*/
async initializeClientAndTransport() {
// Initialize the RTVI client with a DailyTransport and our configuration
this.rtviClient = new RTVIClient({
// Initialize the Pipecat client with a DailyTransport and our configuration
this.pcClient = new PipecatClient({
transport: new DailyTransport(),
params: {
// REPLACE WITH YOUR MODAL URL ENDPOINT
baseUrl:
'https://<Modal workspace>--pipecat-modal-bot-launcher.modal.run',
endpoints: {
connect: '/connect',
},
requestData: {
bot_name: 'openai',
},
},
enableMic: true, // Enable microphone for user input
enableCam: false,
callbacks: {
@@ -176,8 +165,8 @@ class ChatbotClient {
// Set up listeners for media track events
this.setupTrackListeners();
await this.rtviClient.initDevices();
window.client = this.rtviClient;
await this.pcClient.initDevices();
window.client = this.pcClient;
}
/**
@@ -212,10 +201,10 @@ class ChatbotClient {
* This is called when the bot is ready or when the transport state changes to ready
*/
setupMediaTracks() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Get current tracks from the client
const tracks = this.rtviClient.tracks();
const tracks = this.pcClient.tracks();
// Set up any available bot tracks
if (tracks.bot?.audio) {
@@ -231,10 +220,10 @@ class ChatbotClient {
* This handles new tracks being added during the session
*/
setupTrackListeners() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStarted, (track, participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local) {
if (track.kind === 'audio') {
@@ -253,7 +242,7 @@ class ChatbotClient {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStopped, (track, participant) => {
if (participant.local) {
this.log('Local mic muted');
return;
@@ -311,21 +300,27 @@ class ChatbotClient {
/**
* Initialize and connect to the bot
* This sets up the RTVI client, initializes devices, and establishes the connection
* This sets up the Pipecat client, initializes devices, and establishes the connection
*/
async connect() {
try {
const botSelector = document.getElementById('bot-selector');
const selectedBot = botSelector.value;
this.rtviClient.params.requestData.bot_name = selectedBot;
// Initialize audio/video devices
this.log('Initializing devices...');
await this.rtviClient.initDevices();
await this.pcClient.initDevices();
// Connect to the bot
this.log(`Connecting to bot: ${selectedBot}`);
await this.rtviClient.connect();
await this.pcClient.connect({
// REPLACE WITH YOUR MODAL URL ENDPOINT
endpoint:
'https://<your-workspace>--pipecat-modal-fastapi-app.modal.run/connect',
requestData: {
bot_name: selectedBot,
},
});
this.log('Connection complete');
} catch (error) {
@@ -336,9 +331,9 @@ class ChatbotClient {
this.updateStatus('Error');
// Clean up if there's an error
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
} catch (disconnectError) {
this.log(`Error during disconnect: ${disconnectError.message}`);
}
@@ -350,10 +345,10 @@ class ChatbotClient {
* Disconnect from the bot and clean up media resources
*/
async disconnect() {
if (this.rtviClient) {
if (this.pcClient) {
try {
// Disconnect the RTVI client
await this.rtviClient.disconnect();
// Disconnect the Pipecat client
await this.pcClient.disconnect();
// Clean up audio
if (this.botAudio.srcObject) {

View File

@@ -301,7 +301,7 @@ def fastapi_app():
allow_headers=["*"],
)
# Include the endpoints from endpoints.py
# Include the endpoints from this file
web_app.include_router(router)
return web_app

View File

@@ -1,2 +1,3 @@
python-dotenv==1.0.1
modal==0.71.3
modal==1.0.5
fastapi[all]

View File

@@ -8,7 +8,7 @@
"name": "my-daily-app",
"version": "0.1.0",
"dependencies": {
"axios": "^1.6.0",
"axios": "^1.11.0",
"next": "^14.0.0",
"pino": "^8.15.0",
"react": "^18.2.0",
@@ -1165,13 +1165,13 @@
}
},
"node_modules/axios": {
"version": "1.8.4",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz",
"integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==",
"version": "1.11.0",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz",
"integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==",
"license": "MIT",
"dependencies": {
"follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"form-data": "^4.0.4",
"proxy-from-env": "^1.1.0"
}
},
@@ -2436,14 +2436,15 @@
}
},
"node_modules/form-data": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.2.tgz",
"integrity": "sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w==",
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz",
"integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==",
"license": "MIT",
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
"es-set-tostringtag": "^2.1.0",
"hasown": "^2.0.2",
"mime-types": "^2.1.12"
},
"engines": {

View File

@@ -9,7 +9,7 @@
"lint": "next lint"
},
"dependencies": {
"axios": "^1.6.0",
"axios": "^1.11.0",
"next": "^14.0.0",
"pino": "^8.15.0",
"react": "^18.2.0",

View File

@@ -103,7 +103,7 @@ export default async function handler(req, res) {
const sip_config = {
display_name: From,
sip_mode: 'dial-in',
num_endpoints: call_transfer !== null ? 2 : 1,
num_endpoints: (call_transfer !== undefined && call_transfer !== null) ? 2 : 1,
codecs: {"audio": ["OPUS"]},
};
daily_room_properties.sip = sip_config;

View File

@@ -90,7 +90,7 @@ async def main(transport: DailyTransport):
logger.info("Participant left: {}", participant)
await task.cancel()
runner = PipelineRunner()
runner = PipelineRunner(handle_sigint=False, force_gc=True)
await runner.run(task)

View File

@@ -44,7 +44,7 @@ Try the hosted version of the demo here: https://pcc-smart-turn.vercel.app/.
4. Run the server:
```bash
LOCAL=1 python server.py
LOCAL_RUN=1 python server.py
```
### Run the client

File diff suppressed because it is too large Load Diff

View File

@@ -9,9 +9,9 @@
"lint": "next lint"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/client-react": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.10",
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/client-react": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0",
"next": "15.3.1",
"react": "^19.0.0",
"react-dom": "^19.0.0"

View File

@@ -1,5 +1,5 @@
import './globals.css';
import { RTVIProvider } from '@/providers/RTVIProvider';
import { PipecatProvider } from '@/providers/PipecatProvider';
export const metadata = {
title: 'Pipecat React Client',
@@ -20,7 +20,7 @@ export default function RootLayout({
<link rel="icon" href="/favicon.svg" type="image/svg+xml" />
</head>
<body>
<RTVIProvider>{children}</RTVIProvider>
<PipecatProvider>{children}</PipecatProvider>
</body>
</html>
);

View File

@@ -1,22 +1,22 @@
'use client';
import {
RTVIClientAudio,
RTVIClientVideo,
useRTVIClientTransportState,
PipecatClientAudio,
PipecatClientVideo,
usePipecatClientTransportState,
} from '@pipecat-ai/client-react';
import { ConnectButton } from '../components/ConnectButton';
import { StatusDisplay } from '../components/StatusDisplay';
import { DebugDisplay } from '../components/DebugDisplay';
function BotVideo() {
const transportState = useRTVIClientTransportState();
const transportState = usePipecatClientTransportState();
const isConnected = transportState !== 'disconnected';
return (
<div className="bot-container">
<div className="video-container">
{isConnected && <RTVIClientVideo participant="bot" fit="cover" />}
{isConnected && <PipecatClientVideo participant="bot" fit="cover" />}
</div>
</div>
);
@@ -35,7 +35,7 @@ export default function Home() {
</div>
<DebugDisplay />
<RTVIClientAudio />
<PipecatClientAudio />
</div>
);
}

View File

@@ -1,11 +1,17 @@
import {
useRTVIClient,
useRTVIClientTransportState,
usePipecatClient,
usePipecatClientTransportState,
} from '@pipecat-ai/client-react';
// Get the API base URL from environment variables
// Default to "/api" if not specified
// "/api" is the default for Next.js API routes and used
// for the Pipecat Cloud deployed agent
const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || '/api';
export function ConnectButton() {
const client = useRTVIClient();
const transportState = useRTVIClientTransportState();
const client = usePipecatClient();
const transportState = usePipecatClientTransportState();
const isConnected = ['connected', 'ready'].includes(transportState);
const handleClick = async () => {
@@ -18,7 +24,10 @@ export function ConnectButton() {
if (isConnected) {
await client.disconnect();
} else {
await client.connect();
await client.connect({
endpoint: `${API_BASE_URL}/connect`,
requestData: { foo: 'bar' },
});
}
} catch (error) {
console.error('Connection error:', error);

View File

@@ -6,7 +6,7 @@ import {
TranscriptData,
BotLLMTextData,
} from '@pipecat-ai/client-js';
import { useRTVIClient, useRTVIClientEvent } from '@pipecat-ai/client-react';
import { usePipecatClient, useRTVIClientEvent } from '@pipecat-ai/client-react';
import './DebugDisplay.css';
interface SmartTurnResultData {
@@ -20,7 +20,7 @@ interface SmartTurnResultData {
export function DebugDisplay() {
const debugLogRef = useRef<HTMLDivElement>(null);
const client = useRTVIClient();
const client = usePipecatClient();
const log = useCallback((message: string) => {
if (!debugLogRef.current) return;

View File

@@ -1,7 +1,7 @@
import { useRTVIClientTransportState } from '@pipecat-ai/client-react';
import { usePipecatClientTransportState } from '@pipecat-ai/client-react';
export function StatusDisplay() {
const transportState = useRTVIClientTransportState();
const transportState = usePipecatClientTransportState();
return (
<div className="status">

View File

@@ -0,0 +1,28 @@
'use client';
import { PipecatClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { PipecatClientProvider } from '@pipecat-ai/client-react';
import { PropsWithChildren, useEffect, useState } from 'react';
export function PipecatProvider({ children }: PropsWithChildren) {
const [client, setClient] = useState<PipecatClient | null>(null);
useEffect(() => {
const pcClient = new PipecatClient({
transport: new DailyTransport(),
enableMic: true,
enableCam: false,
});
setClient(pcClient);
}, []);
if (!client) {
return null;
}
return (
<PipecatClientProvider client={client}>{children}</PipecatClientProvider>
);
}

View File

@@ -1,43 +0,0 @@
'use client';
import { RTVIClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { RTVIClientProvider } from '@pipecat-ai/client-react';
import { PropsWithChildren, useEffect, useState } from 'react';
// Get the API base URL from environment variables
// Default to "/api" if not specified
// "/api" is the default for Next.js API routes and used
// for the Pipecat Cloud deployed agent
const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || '/api';
console.log('Using API base URL:', API_BASE_URL);
export function RTVIProvider({ children }: PropsWithChildren) {
const [client, setClient] = useState<RTVIClient | null>(null);
useEffect(() => {
const transport = new DailyTransport();
const rtviClient = new RTVIClient({
transport,
params: {
baseUrl: API_BASE_URL,
endpoints: {
connect: '/connect',
},
requestData: { foo: 'bar' },
},
enableMic: true,
enableCam: false,
});
setClient(rtviClient);
}, []);
if (!client) {
return null;
}
return <RTVIClientProvider client={client}>{children}</RTVIClientProvider>;
}

View File

@@ -45,7 +45,7 @@ from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
# Check if we're in local development mode
LOCAL = os.getenv("LOCAL")
LOCAL = os.getenv("LOCAL_RUN")
logger.remove()
logger.add(sys.stderr, level="DEBUG")

View File

@@ -20,7 +20,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.transports.services.daily import DailyLogLevel, DailyParams, DailyTransport
load_dotenv(override=True)
@@ -43,6 +43,7 @@ async def main():
vad_analyzer=SileroVADAnalyzer(),
),
)
transport.set_log_level(DailyLogLevel.Info)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),

View File

@@ -0,0 +1,109 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.soniox.stt import SonioxSTTService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
stt = SonioxSTTService(
api_key=os.getenv("SONIOX_API_KEY"),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=handle_sigint)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -7,6 +7,7 @@
import argparse
import os
import aiohttp
from dotenv import load_dotenv
from loguru import logger
@@ -50,60 +51,63 @@ transport_params = {
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
# Create an HTTP session
async with aiohttp.ClientSession() as session:
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = NeuphonicHttpTTSService(
api_key=os.getenv("NEUPHONIC_API_KEY"),
voice_id="fc854436-2dac-4d21-aa69-ae17b54e98eb", # Emily
)
tts = NeuphonicHttpTTSService(
api_key=os.getenv("NEUPHONIC_API_KEY"),
voice_id="fc854436-2dac-4d21-aa69-ae17b54e98eb", # Emily
aiohttp_session=session,
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
runner = PipelineRunner(handle_sigint=handle_sigint)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
await runner.run(task)
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=handle_sigint)
await runner.run(task)
if __name__ == "__main__":

View File

@@ -0,0 +1,81 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.soniox.stt import SonioxSTTService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
stt = SonioxSTTService(
api_key=os.getenv("SONIOX_API_KEY"),
)
tl = TranscriptionLogger()
pipeline = Pipeline([transport.input(), stt, tl])
task = PipelineTask(pipeline)
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
await task.cancel()
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -0,0 +1,162 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.ollama.llm import OLLamaLLMService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
await params.result_callback({"conditions": "nice", "temperature": "75"})
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OLLamaLLMService(model="llama3.2") # Update to the model you're running locally
# You can also register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=handle_sigint)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -0,0 +1,165 @@
import argparse
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import Frame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.services.google.frames import LLMSearchResponseFrame
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_in_enabled=False,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_in_enabled=False,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_in_enabled=False,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
}
SYSTEM_INSTRUCTION = """
You are a helpful AI assistant that actively uses Google Search to provide up-to-date, accurate information.
IMPORTANT: For ANY question about current events, news, recent developments, real-time information, or anything that might have changed recently, you MUST use the google_search tool to get the latest information.
You should use Google Search for:
- Current news and events
- Recent developments in any field
- Today's weather, stock prices, or other real-time data
- Any question that starts with "what's happening", "latest", "recent", "current", "today", etc.
- When you're not certain about recent information
Always be proactive about using search when the user asks about anything that could benefit from real-time information.
Your output will be converted to audio so don't include special characters in your answers.
Respond to what the user said in a creative and helpful way, always using search for current information.
"""
class GroundingMetadataProcessor(FrameProcessor):
"""Processor to capture and display grounding metadata from Gemini Live API."""
def __init__(self):
super().__init__()
self._grounding_count = 0
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, LLMSearchResponseFrame):
self._grounding_count += 1
logger.info(f"\n\n🔍 GROUNDING METADATA RECEIVED #{self._grounding_count}\n")
logger.info(f"📝 Search Result Text: {frame.search_result[:200]}...")
if frame.rendered_content:
logger.info(f"🔗 Rendered Content: {frame.rendered_content}")
if frame.origins:
logger.info(f"📍 Number of Origins: {len(frame.origins)}")
for i, origin in enumerate(frame.origins):
logger.info(f" Origin {i + 1}: {origin.site_title} - {origin.site_uri}")
if origin.results:
logger.info(f" Results: {len(origin.results)} items")
# Always push the frame downstream
await self.push_frame(frame, direction)
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting Gemini Live Grounding Metadata Test Bot")
# Create tools using ToolsSchema with custom tools for Gemini
tools = ToolsSchema(
standard_tools=[], # No standard function declarations needed
custom_tools={AdapterType.GEMINI: [{"google_search": {}}, {"code_execution": {}}]},
)
llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
system_instruction=SYSTEM_INSTRUCTION,
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
transcribe_user_audio=True,
tools=tools,
)
# Create a processor to capture grounding metadata
grounding_processor = GroundingMetadataProcessor()
messages = [
{
"role": "user",
"content": "Please introduce yourself and let me know that you can help with current information by searching the web. Ask me what current information I'd like to know about.",
},
]
# Set up conversation context and management
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
grounding_processor, # Add our grounding processor here
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
await task.cancel()
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)
if __name__ == "__main__":
from pipecat.examples.run import main
main(run_example, transport_params=transport_params)

View File

@@ -11,7 +11,7 @@ from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn import LocalSmartTurnAnalyzer
from pipecat.audio.turn.smart_turn.local_smart_turn_v2 import LocalSmartTurnAnalyzerV2
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
@@ -37,7 +37,7 @@ load_dotenv(override=True)
# # Hugging Face uses LFS to store large model files, including .mlpackage
# git lfs install
# # Clone the repo with the smart_turn_classifier.mlpackage
# git clone https://huggingface.co/pipecat-ai/smart-turn
# git clone https://huggingface.co/pipecat-ai/smart-turn-v2
#
# Then set the env variable:
# export LOCAL_SMART_TURN_MODEL_PATH=./smart-turn
@@ -52,7 +52,7 @@ transport_params = {
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzer(
turn_analyzer=LocalSmartTurnAnalyzerV2(
smart_turn_model_path=smart_turn_model_path, params=SmartTurnParams()
),
),
@@ -60,7 +60,7 @@ transport_params = {
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzer(
turn_analyzer=LocalSmartTurnAnalyzerV2(
smart_turn_model_path=smart_turn_model_path, params=SmartTurnParams()
),
),
@@ -68,7 +68,7 @@ transport_params = {
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzer(
turn_analyzer=LocalSmartTurnAnalyzerV2(
smart_turn_model_path=smart_turn_model_path, params=SmartTurnParams()
),
),

View File

@@ -20,11 +20,10 @@ import {
} from '@pipecat-ai/client-js';
import {
ProtobufFrameSerializer,
WebSocketTransport
} from "@pipecat-ai/websocket-transport";
WebSocketTransport,
} from '@pipecat-ai/websocket-transport';
class RecordingSerializer extends ProtobufFrameSerializer {
private lastTimestamp: number | null = null;
private recordingAudioToSend: boolean = false;
private _recordedAudio: { data: ArrayBuffer; delay: number }[] = [];
@@ -40,7 +39,11 @@ class RecordingSerializer extends ProtobufFrameSerializer {
}
// @ts-ignore
serializeAudio(data: ArrayBuffer, sampleRate: number, numChannels: number): Uint8Array | null {
serializeAudio(
data: ArrayBuffer,
sampleRate: number,
numChannels: number
): Uint8Array | null {
if (this.recordingAudioToSend) {
const now = Date.now();
// Compute delay since last packet
@@ -55,13 +58,13 @@ class RecordingSerializer extends ProtobufFrameSerializer {
}
public get recordedAudio() {
return this._recordedAudio
return this._recordedAudio;
}
}
class WebsocketClientApp {
private ENABLE_RECORDING_MODE = false
private RECORDING_TIME_MS = 10000
private ENABLE_RECORDING_MODE = false;
private RECORDING_TIME_MS = 10000;
private rtviClient: RTVIClient | null = null;
private connectBtn: HTMLButtonElement | null = null;
@@ -71,7 +74,7 @@ class WebsocketClientApp {
private botAudio: HTMLAudioElement;
private declare websocketTransport: WebSocketTransport;
private sendRecordedAudio: boolean = false
private sendRecordedAudio: boolean = false;
private declare recordingSerializer: RecordingSerializer;
private playBtn: HTMLButtonElement | null = null;
@@ -91,8 +94,12 @@ class WebsocketClientApp {
* Set up references to DOM elements and create necessary media elements
*/
private setupDOMElements(): void {
this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement;
this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement;
this.connectBtn = document.getElementById(
'connect-btn'
) as HTMLButtonElement;
this.disconnectBtn = document.getElementById(
'disconnect-btn'
) as HTMLButtonElement;
this.statusSpan = document.getElementById('connection-status');
this.debugLog = document.getElementById('debug-log');
this.playBtn = document.getElementById('play-btn') as HTMLButtonElement;
@@ -105,8 +112,12 @@ class WebsocketClientApp {
private setupEventListeners(): void {
this.connectBtn?.addEventListener('click', () => this.connect());
this.disconnectBtn?.addEventListener('click', () => this.disconnect());
this.playBtn?.addEventListener('click', () => this.startSendingRecordedAudio());
this.stopBtn?.addEventListener('click', () => this.stopSendingRecordedAudio());
this.playBtn?.addEventListener('click', () =>
this.startSendingRecordedAudio()
);
this.stopBtn?.addEventListener('click', () =>
this.stopSendingRecordedAudio()
);
}
/**
@@ -165,7 +176,9 @@ class WebsocketClientApp {
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`);
this.log(
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
);
});
}
@@ -175,7 +188,10 @@ class WebsocketClientApp {
*/
private setupAudioTrack(track: MediaStreamTrack): void {
this.log('Setting up audio track');
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
const oldTrack = this.botAudio.srcObject.getAudioTracks()[0];
if (oldTrack?.id === track.id) return;
}
@@ -190,27 +206,17 @@ class WebsocketClientApp {
try {
const startTime = Date.now();
this.recordingSerializer = new RecordingSerializer()
const transport = this.ENABLE_RECORDING_MODE ?
new WebSocketTransport({
serializer: this.recordingSerializer,
recorderSampleRate: 8000,
playerSampleRate:8000
}) :
new WebSocketTransport({
serializer: new ProtobufFrameSerializer(),
recorderSampleRate: 8000,
playerSampleRate:8000
});
this.websocketTransport = transport
this.recordingSerializer = new RecordingSerializer();
const ws_opts = {
serializer: this.ENABLE_RECORDING_MODE
? this.recordingSerializer
: new ProtobufFrameSerializer(),
recorderSampleRate: 8000,
playerSampleRate: 8000,
};
const RTVIConfig: RTVIClientOptions = {
transport,
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:7860',
endpoints: { connect: '/connect' },
},
transport: new WebSocketTransport(ws_opts),
enableMic: true,
enableCam: false,
callbacks: {
@@ -238,27 +244,34 @@ class WebsocketClientApp {
onMessageError: (error) => console.error('Message error:', error),
onError: (error) => console.error('Error:', error),
},
}
};
this.rtviClient = new RTVIClient(RTVIConfig);
this.websocketTransport = this.rtviClient.transport;
this.setupTrackListeners();
this.log('Initializing devices...');
await this.rtviClient.initDevices();
this.log('Connecting to bot...');
await this.rtviClient.connect();
await this.rtviClient.connect({
endpoint: 'http://localhost:7860/connect',
});
const timeTaken = Date.now() - startTime;
this.log(`Connection complete, timeTaken: ${timeTaken}`);
if (this.ENABLE_RECORDING_MODE) {
this.log(`Starting to recording the next ${(this.RECORDING_TIME_MS/1000)}s of audio`);
this.recordingSerializer.startRecording()
await this.sleep(this.RECORDING_TIME_MS)
this.recordingSerializer.stopRecording()
this.log("Recording stopped");
this.rtviClient.enableMic(false)
this.startSendingRecordedAudio()
this.log(
`Starting to recording the next ${
this.RECORDING_TIME_MS / 1000
}s of audio`
);
this.recordingSerializer.startRecording();
await this.sleep(this.RECORDING_TIME_MS);
this.recordingSerializer.stopRecording();
this.log('Recording stopped');
this.rtviClient.enableMic(false);
this.startSendingRecordedAudio();
}
} catch (error) {
this.log(`Error connecting: ${(error as Error).message}`);
@@ -280,11 +293,16 @@ class WebsocketClientApp {
public async disconnect(): Promise<void> {
if (this.rtviClient) {
try {
this.stopSendingRecordedAudio()
this.stopSendingRecordedAudio();
await this.rtviClient.disconnect();
this.rtviClient = null;
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop());
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
this.botAudio.srcObject
.getAudioTracks()
.forEach((track) => track.stop());
this.botAudio.srcObject = null;
}
} catch (error) {
@@ -294,21 +312,21 @@ class WebsocketClientApp {
}
private startSendingRecordedAudio() {
this.sendRecordedAudio = true
this.sendRecordedAudio = true;
if (this.playBtn) this.playBtn.disabled = true;
if (this.stopBtn) this.stopBtn.disabled = false;
void this.replayAudio()
void this.replayAudio();
}
private stopSendingRecordedAudio() {
if (this.stopBtn) this.stopBtn.disabled = true;
if (this.playBtn) this.playBtn.disabled = false;
this.sendRecordedAudio = false
this.sendRecordedAudio = false;
}
private async replayAudio() {
if (this.sendRecordedAudio) {
this.log("Sending recorded audio")
this.log('Sending recorded audio');
for (const chunk of this.recordingSerializer.recordedAudio) {
await this.sleep(chunk.delay);
this.websocketTransport.handleUserAudioStream(chunk.data);
@@ -316,14 +334,13 @@ class WebsocketClientApp {
const randomDelay = 1000 + Math.random() * (10000 - 500);
await this.sleep(randomDelay);
void this.replayAudio()
void this.replayAudio();
}
}
private sleep(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms));
return new Promise((resolve) => setTimeout(resolve, ms));
}
}
declare global {

File diff suppressed because it is too large Load Diff

View File

@@ -18,7 +18,7 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.8"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0"
}
}

View File

@@ -5,7 +5,7 @@
*/
/**
* RTVI Client Implementation
* Pipecat Client Implementation
*
* This client connects to an RTVI-compatible bot server using WebRTC (via Daily).
* It handles audio/video streaming and manages the connection lifecycle.
@@ -18,20 +18,22 @@
import {
Participant,
RTVIClient,
RTVIClientOptions,
PipecatClient,
PipecatClientOptions,
RTVIEvent,
} from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import {
DailyEventCallbacks,
DailyTransport,
} from '@pipecat-ai/daily-transport';
import SoundUtils from './util/soundUtils';
import { InstantVoiceHelper } from './util/instantVoiceHelper';
/**
* InstantVoiceClient handles the connection and media management for a real-time
* voice and video interaction with an AI bot.
*/
class InstantVoiceClient {
private declare rtviClient: RTVIClient;
private declare pcClient: PipecatClient;
private connectBtn: HTMLButtonElement | null = null;
private disconnectBtn: HTMLButtonElement | null = null;
private statusSpan: HTMLElement | null = null;
@@ -46,7 +48,7 @@ class InstantVoiceClient {
document.body.appendChild(this.botAudio);
this.setupDOMElements();
this.setupEventListeners();
this.initializeRTVIClient();
this.initializePipecatClient();
}
/**
@@ -72,16 +74,11 @@ class InstantVoiceClient {
this.disconnectBtn?.addEventListener('click', () => this.disconnect());
}
private initializeRTVIClient(): void {
const RTVIConfig: RTVIClientOptions = {
private initializePipecatClient(): void {
const PipecatConfig: PipecatClientOptions = {
transport: new DailyTransport({
bufferLocalAudioUntilBotReady: true,
}),
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:7860',
endpoints: { connect: '/connect' },
},
enableMic: true,
enableCam: false,
callbacks: {
@@ -113,30 +110,23 @@ class InstantVoiceClient {
onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
onMessageError: (error) => console.error('Message error:', error),
onError: (error) => console.error('Error:', error),
},
onAudioBufferingStarted: () => {
SoundUtils.beep();
this.updateBufferingStatus('Yes');
this.log(
`onMicCaptureStarted, timeTaken: ${Date.now() - this.startTime}`
);
},
onAudioBufferingStopped: () => {
this.updateBufferingStatus('No');
this.log(
`onMicCaptureStopped, timeTaken: ${Date.now() - this.startTime}`
);
},
} as DailyEventCallbacks,
};
this.rtviClient = new RTVIClient(RTVIConfig);
this.rtviClient.registerHelper(
'transport',
new InstantVoiceHelper({
callbacks: {
onAudioBufferingStarted: () => {
SoundUtils.beep();
this.updateBufferingStatus('Yes');
this.log(
`onMicCaptureStarted, timeTaken: ${Date.now() - this.startTime}`
);
},
onAudioBufferingStopped: () => {
this.updateBufferingStatus('No');
this.log(
`onMicCaptureStopped, timeTaken: ${Date.now() - this.startTime}`
);
},
},
})
);
this.pcClient = new PipecatClient(PipecatConfig);
this.setupTrackListeners();
}
@@ -182,8 +172,8 @@ class InstantVoiceClient {
* This is called when the bot is ready or when the transport state changes to ready
*/
setupMediaTracks() {
if (!this.rtviClient) return;
const tracks = this.rtviClient.tracks();
if (!this.pcClient) return;
const tracks = this.pcClient.tracks();
if (tracks.bot?.audio) {
this.setupAudioTrack(tracks.bot.audio);
}
@@ -194,10 +184,10 @@ class InstantVoiceClient {
* This handles new tracks being added during the session
*/
setupTrackListeners() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStarted, (track, participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local && track.kind === 'audio') {
this.setupAudioTrack(track);
@@ -205,7 +195,7 @@ class InstantVoiceClient {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
);
@@ -230,22 +220,25 @@ class InstantVoiceClient {
/**
* Initialize and connect to the bot
* This sets up the RTVI client, initializes devices, and establishes the connection
* This sets up the Pipecat client, initializes devices, and establishes the connection
*/
public async connect(): Promise<void> {
try {
this.startTime = Date.now();
this.log('Connecting to bot...');
await this.rtviClient.connect();
await this.pcClient.connect({
// The baseURL and endpoint of your bot server that the client will connect to
endpoint: 'http://localhost:7860/connect',
});
} catch (error) {
this.log(`Error connecting: ${(error as Error).message}`);
this.updateStatus('Error');
this.updateBufferingStatus('No');
// Clean up if there's an error
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
} catch (disconnectError) {
this.log(`Error during disconnect: ${disconnectError}`);
}
@@ -258,7 +251,7 @@ class InstantVoiceClient {
*/
public async disconnect(): Promise<void> {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject

View File

@@ -1,39 +0,0 @@
import {RTVIClientHelper, RTVIClientHelperOptions, RTVIMessage} from "@pipecat-ai/client-js";
import {DailyRTVIMessageType} from '@pipecat-ai/daily-transport';
export type InstantVoiceHelperCallbacks = Partial<{
onAudioBufferingStarted: () => void;
onAudioBufferingStopped: () => void;
}>;
// --- Interface and class
export interface InstantVoiceHelperOptions extends RTVIClientHelperOptions {
callbacks?: InstantVoiceHelperCallbacks;
}
export class InstantVoiceHelper extends RTVIClientHelper {
protected declare _options: InstantVoiceHelperOptions;
constructor(options: InstantVoiceHelperOptions) {
super(options);
}
handleMessage(rtviMessage: RTVIMessage): void {
switch (rtviMessage.type) {
case DailyRTVIMessageType.AUDIO_BUFFERING_STARTED:
if (this._options.callbacks?.onAudioBufferingStarted) {
this._options.callbacks?.onAudioBufferingStarted()
}
break;
case DailyRTVIMessageType.AUDIO_BUFFERING_STOPPED:
if (this._options.callbacks?.onAudioBufferingStopped) {
this._options.callbacks?.onAudioBufferingStopped()
}
break;
}
}
getMessageTypes(): string[] {
return [DailyRTVIMessageType.AUDIO_BUFFERING_STARTED, DailyRTVIMessageType.AUDIO_BUFFERING_STOPPED];
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -15,7 +15,7 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.8"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0"
}
}

View File

@@ -5,7 +5,7 @@
*/
/**
* RTVI Client Implementation
* Pipecat Client Implementation
*
* This client connects to an RTVI-compatible bot server using WebRTC (via Daily).
* It handles audio/video streaming and manages the connection lifecycle.
@@ -16,78 +16,9 @@
* - Browser with WebRTC support
*/
import {
LogLevel,
RTVIClient,
RTVIClientHelper,
RTVIEvent,
} from '@pipecat-ai/client-js';
import { LogLevel, PipecatClient, RTVIEvent } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
class SearchResponseHelper extends RTVIClientHelper {
constructor(contentPanel) {
super();
this.contentPanel = contentPanel;
}
handleMessage(rtviMessage) {
console.log('SearchResponseHelper, received message:', rtviMessage);
if (rtviMessage.data) {
// Clear existing content
this.contentPanel.innerHTML = '';
// Create a container for all content
const contentContainer = document.createElement('div');
contentContainer.className = 'content-container';
// Add the search_result
if (rtviMessage.data.search_result) {
const searchResultDiv = document.createElement('div');
searchResultDiv.className = 'search-result';
searchResultDiv.textContent = rtviMessage.data.search_result;
contentContainer.appendChild(searchResultDiv);
}
// Add the sources
if (rtviMessage.data.origins) {
const sourcesDiv = document.createElement('div');
sourcesDiv.className = 'sources';
const sourcesTitle = document.createElement('h3');
sourcesTitle.className = 'sources-title';
sourcesTitle.textContent = 'Sources:';
sourcesDiv.appendChild(sourcesTitle);
rtviMessage.data.origins.forEach((origin) => {
const sourceLink = document.createElement('a');
sourceLink.className = 'source-link';
sourceLink.href = origin.site_uri;
sourceLink.target = '_blank';
sourceLink.textContent = origin.site_title;
sourcesDiv.appendChild(sourceLink);
});
contentContainer.appendChild(sourcesDiv);
}
// Add the rendered_content in an iframe
if (rtviMessage.data.rendered_content) {
const iframe = document.createElement('iframe');
iframe.className = 'iframe-container';
iframe.srcdoc = rtviMessage.data.rendered_content;
contentContainer.appendChild(iframe);
}
// Append the content container to the content panel
this.contentPanel.appendChild(contentContainer);
}
}
getMessageTypes() {
return ['bot-llm-search-response'];
}
}
/**
* ChatbotClient handles the connection and media management for a real-time
* voice and video interaction with an AI bot.
@@ -95,7 +26,7 @@ class SearchResponseHelper extends RTVIClientHelper {
class ChatbotClient {
constructor() {
// Initialize client state
this.rtviClient = null;
this.pcClient = null;
this.setupDOMElements();
this.setupEventListeners();
}
@@ -160,10 +91,10 @@ class ChatbotClient {
* This is called when the bot is ready or when the transport state changes to ready
*/
setupMediaTracks() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Get current tracks from the client
const tracks = this.rtviClient.tracks();
const tracks = this.pcClient.tracks();
// Set up any available bot tracks
if (tracks.bot?.audio) {
@@ -176,10 +107,10 @@ class ChatbotClient {
* This handles new tracks being added during the session
*/
setupTrackListeners() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStarted, (track, participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local && track.kind === 'audio') {
this.setupAudioTrack(track);
@@ -187,7 +118,7 @@ class ChatbotClient {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(
`Track stopped event: ${track.kind} from ${
participant?.name || 'unknown'
@@ -213,20 +144,13 @@ class ChatbotClient {
/**
* Initialize and connect to the bot
* This sets up the RTVI client, initializes devices, and establishes the connection
* This sets up the Pipecat client, initializes devices, and establishes the connection
*/
async connect() {
try {
// Initialize the RTVI client with a Daily WebRTC transport and our configuration
this.rtviClient = new RTVIClient({
// Initialize the Pipecat client with a Daily WebRTC transport and our configuration
this.pcClient = new PipecatClient({
transport: new DailyTransport(),
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:7860',
endpoints: {
connect: '/connect',
},
},
enableMic: true, // Enable microphone for user input
enableCam: false,
callbacks: {
@@ -251,6 +175,8 @@ class ChatbotClient {
this.setupMediaTracks();
}
},
// Handle search response events
onBotLlmSearchResponse: this.handleSearchResponse.bind(this),
// Handle bot connection events
onBotConnected: (participant) => {
this.log(`Bot connected: ${JSON.stringify(participant)}`);
@@ -281,22 +207,22 @@ class ChatbotClient {
},
},
});
//this.rtviClient.setLogLevel(LogLevel.DEBUG)
this.rtviClient.registerHelper(
'llm',
new SearchResponseHelper(this.searchResultContainer)
);
//this.pcClient.setLogLevel(LogLevel.DEBUG)
// Set up listeners for media track events
this.setupTrackListeners();
// Initialize audio devices
this.log('Initializing devices...');
await this.rtviClient.initDevices();
await this.pcClient.initDevices();
// Connect to the bot
this.log('Connecting to bot...');
await this.rtviClient.connect();
await this.pcClient.connect({
// The baseURL and endpoint of your bot server that the client will connect to
endpoint: 'http://localhost:7860/connect',
});
this.log('Connection complete');
} catch (error) {
@@ -306,9 +232,9 @@ class ChatbotClient {
this.updateStatus('Error');
// Clean up if there's an error
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
} catch (disconnectError) {
this.log(`Error during disconnect: ${disconnectError.message}`);
}
@@ -320,11 +246,11 @@ class ChatbotClient {
* Disconnect from the bot and clean up media resources
*/
async disconnect() {
if (this.rtviClient) {
if (this.pcClient) {
try {
// Disconnect the RTVI client
await this.rtviClient.disconnect();
this.rtviClient = null;
// Disconnect the Pipecat client
await this.pcClient.disconnect();
this.pcClient = null;
// Clean up audio
if (this.botAudio.srcObject) {
@@ -339,6 +265,57 @@ class ChatbotClient {
}
}
}
handleSearchResponse(response) {
console.log('SearchResponseHelper, received message:', response);
// Clear existing content
this.searchResultContainer.innerHTML = '';
// Create a container for all content
const contentContainer = document.createElement('div');
contentContainer.className = 'content-container';
// Add the search_result
if (response.search_result) {
const searchResultDiv = document.createElement('div');
searchResultDiv.className = 'search-result';
searchResultDiv.textContent = response.search_result;
contentContainer.appendChild(searchResultDiv);
}
// Add the sources
if (response.origins) {
const sourcesDiv = document.createElement('div');
sourcesDiv.className = 'sources';
const sourcesTitle = document.createElement('h3');
sourcesTitle.className = 'sources-title';
sourcesTitle.textContent = 'Sources:';
sourcesDiv.appendChild(sourcesTitle);
response.origins.forEach((origin) => {
const sourceLink = document.createElement('a');
sourceLink.className = 'source-link';
sourceLink.href = origin.site_uri;
sourceLink.target = '_blank';
sourceLink.textContent = origin.site_title;
sourcesDiv.appendChild(sourceLink);
});
contentContainer.appendChild(sourcesDiv);
}
// Add the rendered_content in an iframe
if (response.rendered_content) {
const iframe = document.createElement('iframe');
iframe.className = 'iframe-container';
iframe.srcdoc = response.rendered_content;
contentContainer.appendChild(iframe);
}
// Append the content container to the content panel
this.searchResultContainer.appendChild(contentContainer);
}
}
// Initialize the client when the page loads

File diff suppressed because it is too large Load Diff

View File

@@ -18,7 +18,7 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.2",
"@pipecat-ai/small-webrtc-transport": "^0.0.2"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/small-webrtc-transport": "^1.0.0"
}
}

View File

@@ -1,217 +1,236 @@
import { SmallWebRTCTransport } from '@pipecat-ai/small-webrtc-transport';
import {
SmallWebRTCTransport
} from "@pipecat-ai/small-webrtc-transport";
import {Participant, RTVIClient, RTVIClientOptions, Transport} from "@pipecat-ai/client-js";
BotLLMTextData,
Participant,
PipecatClient,
PipecatClientOptions,
TranscriptData,
TransportState,
} from '@pipecat-ai/client-js';
class WebRTCApp {
private declare connectBtn: HTMLButtonElement;
private declare disconnectBtn: HTMLButtonElement;
private declare muteBtn: HTMLButtonElement;
private declare connectBtn: HTMLButtonElement;
private declare disconnectBtn: HTMLButtonElement;
private declare muteBtn: HTMLButtonElement;
private declare audioInput: HTMLSelectElement;
private declare videoInput: HTMLSelectElement;
private declare audioCodec: HTMLSelectElement;
private declare videoCodec: HTMLSelectElement;
private declare audioInput: HTMLSelectElement;
private declare videoInput: HTMLSelectElement;
private declare audioCodec: HTMLSelectElement;
private declare videoCodec: HTMLSelectElement;
private declare videoElement: HTMLVideoElement;
private declare audioElement: HTMLAudioElement;
private declare videoElement: HTMLVideoElement;
private declare audioElement: HTMLAudioElement;
private debugLog: HTMLElement | null = null;
private statusSpan: HTMLElement | null = null;
private debugLog: HTMLElement | null = null;
private statusSpan: HTMLElement | null = null;
private declare smallWebRTCTransport: SmallWebRTCTransport;
private declare pcClient: PipecatClient;
private declare smallWebRTCTransport: SmallWebRTCTransport;
private declare rtviClient: RTVIClient;
constructor() {
this.setupDOMElements();
this.setupDOMEventListeners();
this.initializePipecatClient();
void this.populateDevices();
}
constructor() {
this.setupDOMElements();
this.setupDOMEventListeners();
this.initializeRTVIClient()
void this.populateDevices();
private initializePipecatClient(): void {
const opts: PipecatClientOptions = {
transport: new SmallWebRTCTransport({ connectionUrl: '/api/offer' }),
enableMic: true,
enableCam: true,
callbacks: {
onTransportStateChanged: (state: TransportState) => {
this.log(`Transport state: ${state}`);
},
onConnected: () => {
this.onConnectedHandler();
},
onBotReady: () => {
this.log('Bot is ready.');
},
onDisconnected: () => {
this.onDisconnectedHandler();
},
onUserStartedSpeaking: () => {
this.log('User started speaking.');
},
onUserStoppedSpeaking: () => {
this.log('User stopped speaking.');
},
onBotStartedSpeaking: () => {
this.log('Bot started speaking.');
},
onBotStoppedSpeaking: () => {
this.log('Bot stopped speaking.');
},
onUserTranscript: (transcript: TranscriptData) => {
if (transcript.final) {
this.log(`User transcript: ${transcript.text}`);
}
},
onBotTranscript: (data: BotLLMTextData) => {
this.log(`Bot transcript: ${data.text}`);
},
onTrackStarted: (
track: MediaStreamTrack,
participant?: Participant
) => {
if (participant?.local) {
return;
}
this.onBotTrackStarted(track);
},
onServerMessage: (msg: unknown) => {
this.log(`Server message: ${msg}`);
},
},
};
this.pcClient = new PipecatClient(opts);
this.smallWebRTCTransport = this.pcClient.transport as SmallWebRTCTransport;
}
private setupDOMElements(): void {
this.connectBtn = document.getElementById(
'connect-btn'
) as HTMLButtonElement;
this.disconnectBtn = document.getElementById(
'disconnect-btn'
) as HTMLButtonElement;
this.muteBtn = document.getElementById('mute-btn') as HTMLButtonElement;
this.audioInput = document.getElementById(
'audio-input'
) as HTMLSelectElement;
this.videoInput = document.getElementById(
'video-input'
) as HTMLSelectElement;
this.audioCodec = document.getElementById(
'audio-codec'
) as HTMLSelectElement;
this.videoCodec = document.getElementById(
'video-codec'
) as HTMLSelectElement;
this.videoElement = document.getElementById(
'bot-video'
) as HTMLVideoElement;
this.audioElement = document.getElementById(
'bot-audio'
) as HTMLAudioElement;
this.debugLog = document.getElementById('debug-log');
this.statusSpan = document.getElementById('connection-status');
}
private setupDOMEventListeners(): void {
this.connectBtn.addEventListener('click', () => this.start());
this.disconnectBtn.addEventListener('click', () => this.stop());
this.audioInput.addEventListener('change', (e) => {
// @ts-ignore
let audioDevice = e.target?.value;
this.pcClient.updateMic(audioDevice);
});
this.videoInput.addEventListener('change', (e) => {
// @ts-ignore
let videoDevice = e.target?.value;
this.pcClient.updateCam(videoDevice);
});
this.muteBtn.addEventListener('click', () => {
let isCamEnabled = this.pcClient.isCamEnabled;
this.pcClient.enableCam(!isCamEnabled);
this.muteBtn.textContent = isCamEnabled ? '📵' : '📷';
});
}
private log(message: string): void {
if (!this.debugLog) return;
const entry = document.createElement('div');
entry.textContent = `${new Date().toISOString()} - ${message}`;
if (message.startsWith('User: ')) {
entry.style.color = '#2196F3';
} else if (message.startsWith('Bot: ')) {
entry.style.color = '#4CAF50';
}
this.debugLog.appendChild(entry);
this.debugLog.scrollTop = this.debugLog.scrollHeight;
}
private initializeRTVIClient(): void {
const transport = new SmallWebRTCTransport();
const RTVIConfig: RTVIClientOptions = {
params: {
baseUrl: "/api/offer"
},
transport: transport as Transport,
enableMic: true,
enableCam: true,
callbacks: {
onTransportStateChanged: (state) => {
this.log(`Transport state: ${state}`)
},
onConnected: () => {
this.onConnectedHandler()
},
onBotReady: () => {
this.log("Bot is ready.")
},
onDisconnected: () => {
this.onDisconnectedHandler()
},
onUserStartedSpeaking: () => {
this.log("User started speaking.")
},
onUserStoppedSpeaking: () => {
this.log("User stopped speaking.")
},
onBotStartedSpeaking: () => {
this.log("Bot started speaking.")
},
onBotStoppedSpeaking: () => {
this.log("Bot stopped speaking.")
},
onUserTranscript: (transcript) => {
if (transcript.final) {
this.log(`User transcript: ${transcript.text}`)
}
},
onBotTranscript: (transcript) => {
this.log(`Bot transcript: ${transcript.text}`)
},
onTrackStarted: (track: MediaStreamTrack, participant?: Participant) => {
if (participant?.local) {
return
}
this.onBotTrackStarted(track)
},
onServerMessage: (msg) => {
this.log(`Server message: ${msg}`)
}
},
}
RTVIConfig.customConnectHandler = () => Promise.resolve();
this.rtviClient = new RTVIClient(RTVIConfig);
this.smallWebRTCTransport = transport
private clearAllLogs() {
this.debugLog!.innerText = '';
}
private updateStatus(status: string): void {
if (this.statusSpan) {
this.statusSpan.textContent = status;
}
this.log(`Status: ${status}`);
}
private setupDOMElements(): void {
this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement;
this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement;
this.muteBtn = document.getElementById('mute-btn') as HTMLButtonElement;
private onConnectedHandler() {
this.updateStatus('Connected');
if (this.connectBtn) this.connectBtn.disabled = true;
if (this.disconnectBtn) this.disconnectBtn.disabled = false;
}
this.audioInput = document.getElementById('audio-input') as HTMLSelectElement;
this.videoInput = document.getElementById('video-input') as HTMLSelectElement;
this.audioCodec = document.getElementById('audio-codec') as HTMLSelectElement;
this.videoCodec = document.getElementById('video-codec') as HTMLSelectElement;
private onDisconnectedHandler() {
this.updateStatus('Disconnected');
if (this.connectBtn) this.connectBtn.disabled = false;
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
}
this.videoElement = document.getElementById('bot-video') as HTMLVideoElement;
this.audioElement = document.getElementById('bot-audio') as HTMLAudioElement;
this.debugLog = document.getElementById('debug-log');
this.statusSpan = document.getElementById('connection-status');
private onBotTrackStarted(track: MediaStreamTrack) {
if (track.kind === 'video') {
this.videoElement.srcObject = new MediaStream([track]);
} else {
this.audioElement.srcObject = new MediaStream([track]);
}
}
private setupDOMEventListeners(): void {
this.connectBtn.addEventListener("click", () => this.start());
this.disconnectBtn.addEventListener("click", () => this.stop());
this.audioInput.addEventListener("change", (e) => {
// @ts-ignore
let audioDevice = e.target?.value
this.rtviClient.updateMic(audioDevice)
})
this.videoInput.addEventListener("change", (e) => {
// @ts-ignore
let videoDevice = e.target?.value
this.rtviClient.updateCam(videoDevice)
})
this.muteBtn.addEventListener('click', () => {
let isCamEnabled = this.rtviClient.isCamEnabled
this.rtviClient.enableCam(!isCamEnabled)
this.muteBtn.textContent = isCamEnabled ? '📵' : '📷';
});
private async populateDevices(): Promise<void> {
const populateSelect = (
select: HTMLSelectElement,
devices: MediaDeviceInfo[]
): void => {
let counter = 1;
devices.forEach((device) => {
const option = document.createElement('option');
option.value = device.deviceId;
option.text = device.label || 'Device #' + counter;
select.appendChild(option);
counter += 1;
});
};
try {
const audioDevices = await this.pcClient.getAllMics();
populateSelect(this.audioInput, audioDevices);
const videoDevices = await this.pcClient.getAllCams();
populateSelect(this.videoInput, videoDevices);
} catch (e) {
alert(e);
}
}
private log(message: string): void {
if (!this.debugLog) return;
const entry = document.createElement('div');
entry.textContent = `${new Date().toISOString()} - ${message}`;
if (message.startsWith('User: ')) {
entry.style.color = '#2196F3';
} else if (message.startsWith('Bot: ')) {
entry.style.color = '#4CAF50';
}
this.debugLog.appendChild(entry);
this.debugLog.scrollTop = this.debugLog.scrollHeight;
private async start(): Promise<void> {
this.clearAllLogs();
this.connectBtn.disabled = true;
this.updateStatus('Connecting');
this.smallWebRTCTransport.setAudioCodec(this.audioCodec.value);
this.smallWebRTCTransport.setVideoCodec(this.videoCodec.value);
try {
await this.pcClient.connect();
} catch (e) {
console.log(`Failed to connect ${e}`);
this.stop();
}
}
private clearAllLogs() {
this.debugLog!.innerText = ''
}
private updateStatus(status: string): void {
if (this.statusSpan) {
this.statusSpan.textContent = status;
}
this.log(`Status: ${status}`);
}
private onConnectedHandler() {
this.updateStatus('Connected');
if (this.connectBtn) this.connectBtn.disabled = true;
if (this.disconnectBtn) this.disconnectBtn.disabled = false;
}
private onDisconnectedHandler() {
this.updateStatus('Disconnected');
if (this.connectBtn) this.connectBtn.disabled = false;
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
}
private onBotTrackStarted(track: MediaStreamTrack) {
if (track.kind === 'video') {
this.videoElement.srcObject = new MediaStream([track]);
} else {
this.audioElement.srcObject = new MediaStream([track]);
}
}
private async populateDevices(): Promise<void> {
const populateSelect = (select: HTMLSelectElement, devices: MediaDeviceInfo[]): void => {
let counter = 1;
devices.forEach((device) => {
const option = document.createElement('option');
option.value = device.deviceId;
option.text = device.label || ('Device #' + counter);
select.appendChild(option);
counter += 1;
});
};
try {
const audioDevices = await this.rtviClient.getAllMics();
populateSelect(this.audioInput, audioDevices);
const videoDevices = await this.rtviClient.getAllCams();
populateSelect(this.videoInput, videoDevices);
} catch (e) {
alert(e);
}
}
private async start(): Promise<void> {
this.clearAllLogs()
this.connectBtn.disabled = true;
this.updateStatus("Connecting")
this.smallWebRTCTransport.setAudioCodec(this.audioCodec.value)
this.smallWebRTCTransport.setVideoCodec(this.videoCodec.value)
try {
await this.rtviClient.connect()
} catch (e) {
console.log(`Failed to connect ${e}`)
this.stop()
}
}
private stop(): void {
void this.rtviClient.disconnect()
}
private stop(): void {
void this.pcClient.disconnect();
}
}
// Create the WebRTCConnection instance

View File

@@ -1,40 +1,51 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>AI Chatbot</title>
</head>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AI Chatbot</title>
</head>
<body>
<div class="container">
<div class="status-bar">
<div class="status">
Status: <span id="connection-status">Disconnected</span>
</div>
<div class="controls">
<button id="connect-btn">Connect</button>
<button id="disconnect-btn" disabled>Disconnect</button>
</div>
</div>
<div class="main-content">
<div class="bot-container">
<div id="bot-video-container">
<body>
<div class="container">
<div class="status-bar">
<div class="status">
Status: <span id="connection-status">Disconnected</span>
</div>
<audio id="bot-audio" autoplay></audio>
<div class="controls">
<button id="connect-btn">Connect</button>
<button id="disconnect-btn" disabled>Disconnect</button>
</div>
</div>
<div class="main-content">
<div class="bot-container">
<div id="bot-video-container"></div>
<audio id="bot-audio" autoplay></audio>
</div>
</div>
<div class="device-bar">
<div class="device-controls">
<select id="device-selector"></select>
<button id="mic-toggle-btn">Unmute Mic</button>
</div>
<div class="text-input-container">
<input
type="text"
id="text-input"
placeholder="Type your message..." />
<button id="send-text-btn" disabled>Send</button>
</div>
</div>
<div class="debug-panel">
<h3>Debug Info</h3>
<div id="debug-log"></div>
</div>
</div>
<div class="debug-panel">
<h3>Debug Info</h3>
<div id="debug-log"></div>
</div>
</div>
<script type="module" src="/src/app.js"></script>
<link rel="stylesheet" href="/src/style.css">
</body>
</html>
<script type="module" src="/src/app.js"></script>
<link rel="stylesheet" href="/src/style.css" />
</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@@ -15,7 +15,7 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.8"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0"
}
}

View File

@@ -5,7 +5,7 @@
*/
/**
* RTVI Client Implementation
* Pipecat Client Implementation
*
* This client connects to an RTVI-compatible bot server using WebRTC (via Daily).
* It handles audio/video streaming and manages the connection lifecycle.
@@ -16,7 +16,7 @@
* - Browser with WebRTC support
*/
import { RTVIClient, RTVIEvent } from '@pipecat-ai/client-js';
import { PipecatClient, RTVIEvent } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
/**
@@ -26,9 +26,8 @@ import { DailyTransport } from '@pipecat-ai/daily-transport';
class ChatbotClient {
constructor() {
// Initialize client state
this.rtviClient = null;
this.pcClient = null;
this.setupDOMElements();
this.setupEventListeners();
this.initializeClientAndTransport();
}
@@ -42,6 +41,9 @@ class ChatbotClient {
this.statusSpan = document.getElementById('connection-status');
this.debugLog = document.getElementById('debug-log');
this.botVideoContainer = document.getElementById('bot-video-container');
this.deviceSelector = document.getElementById('device-selector');
this.micToggleBtn = document.getElementById('mic-toggle-btn');
this.sendTextBtn = document.getElementById('send-text-btn');
// Create an audio element for bot's voice output
this.botAudio = document.createElement('audio');
@@ -54,25 +56,78 @@ class ChatbotClient {
* Set up event listeners for connect/disconnect buttons
*/
setupEventListeners() {
this.connectBtn.addEventListener('click', () => this.connect());
this.connectBtn.addEventListener('click', () => {
console.log('click');
this.connect();
});
this.disconnectBtn.addEventListener('click', () => this.disconnect());
// Populate device selector
this.pcClient.getAllMics().then((mics) => {
console.log('Available mics:', mics);
mics.forEach((device) => {
const option = document.createElement('option');
option.value = device.deviceId;
option.textContent = device.label || `Microphone ${device.deviceId}`;
this.deviceSelector.appendChild(option);
});
});
this.deviceSelector.addEventListener('change', (event) => {
const selectedDeviceId = event.target.value;
console.log('Selected device ID:', selectedDeviceId);
this.pcClient.updateMic(selectedDeviceId);
});
// Handle mic mute/unmute toggle
const micToggleBtn = document.getElementById('mic-toggle-btn');
micToggleBtn.addEventListener('click', async () => {
if (this.pcClient.state === 'disconnected') {
await this.pcClient.initDevices();
} else {
this.pcClient.enableMic(!this.pcClient.isMicEnabled);
}
});
const textInput = document.getElementById('text-input');
const sendTextToLLM = () => {
this.sendTextBtn.disabled = true; // Disable button to prevent multiple clicks
const text = textInput.value.trim();
if (text) {
void this.pcClient.appendToContext({
role: 'user',
content: text,
run_immediately: true,
});
}
textInput.value = ''; // Clear the input
this.sendTextBtn.disabled = false; // Re-enable button after sending
};
this.sendTextBtn.addEventListener('click', sendTextToLLM);
// Also handle Enter key in the input
textInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') {
sendTextToLLM();
}
});
}
updateMicToggleButton(micEnabled) {
console.log('Mic enabled:', micEnabled, this.pcClient?.isMicEnabled);
this.micToggleBtn.textContent = micEnabled ? 'Mute Mic' : 'Unmute Mic';
}
/**
* Set up the RTVI client and Daily transport
* Set up the Pipecat client and Daily transport
*/
initializeClientAndTransport() {
// Initialize the RTVI client with a DailyTransport and our configuration
this.rtviClient = new RTVIClient({
async initializeClientAndTransport() {
console.log('Initializing Pipecat client and transport...');
// Initialize the Pipecat client with a DailyTransport and our configuration
this.pcClient = new PipecatClient({
transport: new DailyTransport(),
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:7860',
endpoints: {
connect: '/connect',
},
},
enableMic: true, // Enable microphone for user input
enableMic: true,
enableCam: false,
callbacks: {
// Handle connection state changes
@@ -86,7 +141,9 @@ class ChatbotClient {
this.updateStatus('Disconnected');
this.connectBtn.disabled = false;
this.disconnectBtn.disabled = true;
this.sendTextBtn.disabled = true;
this.log('Client disconnected');
this.updateMicToggleButton(false);
},
// Handle transport state changes
onTransportStateChanged: (state) => {
@@ -106,6 +163,7 @@ class ChatbotClient {
onBotReady: (data) => {
this.log(`Bot ready: ${JSON.stringify(data)}`);
this.setupMediaTracks();
this.sendTextBtn.disabled = false;
},
// Transcript events
onUserTranscript: (data) => {
@@ -121,14 +179,20 @@ class ChatbotClient {
onMessageError: (error) => {
console.log('Message error:', error);
},
onMicUpdated: (data) => {
console.log('Mic updated:', data);
this.deviceSelector.value = data.deviceId;
},
onError: (error) => {
console.log('Error:', JSON.stringify(error));
},
},
});
window.client = this; // Expose client globally for debugging
// Set up listeners for media track events
this.setupTrackListeners();
this.setupEventListeners();
}
/**
@@ -163,10 +227,10 @@ class ChatbotClient {
* This is called when the bot is ready or when the transport state changes to ready
*/
setupMediaTracks() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Get current tracks from the client
const tracks = this.rtviClient.tracks();
const tracks = this.pcClient.tracks();
// Set up any available bot tracks
if (tracks.bot?.audio) {
@@ -182,27 +246,34 @@ class ChatbotClient {
* This handles new tracks being added during the session
*/
setupTrackListeners() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
// Only handle non-local (bot) tracks
this.pcClient.on(RTVIEvent.TrackStarted, (track, participant) => {
if (!participant?.local) {
if (track.kind === 'audio') {
this.setupAudioTrack(track);
} else if (track.kind === 'video') {
this.setupVideoTrack(track);
}
} else if (track.kind === 'audio') {
console.log(`Local audio track started: `, this.pcClient.tracks());
// If local audio track starts, update mic
this.updateMicToggleButton(true);
}
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(
`Track stopped event: ${track.kind} from ${
participant?.name || 'unknown'
participant ? (participant.local ? 'local' : 'bot') : 'unknown'
}`
);
if (participant?.local && track.kind === 'audio') {
// If local audio track stops, update mic toggle button
this.updateMicToggleButton(false);
}
});
}
@@ -251,17 +322,16 @@ class ChatbotClient {
/**
* Initialize and connect to the bot
* This sets up the RTVI client, initializes devices, and establishes the connection
* This sets up the Pipecat client, initializes devices, and establishes the connection
*/
async connect() {
try {
// Initialize audio/video devices
this.log('Initializing devices...');
await this.rtviClient.initDevices();
// Connect to the bot
this.log('Connecting to bot...');
await this.rtviClient.connect();
await this.pcClient.connect({
endpoint: 'http://localhost:7860/connect',
timeout: 25000,
});
this.log('Connection complete');
} catch (error) {
@@ -271,9 +341,9 @@ class ChatbotClient {
this.updateStatus('Error');
// Clean up if there's an error
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
} catch (disconnectError) {
this.log(`Error during disconnect: ${disconnectError.message}`);
}
@@ -285,10 +355,10 @@ class ChatbotClient {
* Disconnect from the bot and clean up media resources
*/
async disconnect() {
if (this.rtviClient) {
if (this.pcClient) {
try {
// Disconnect the RTVI client
await this.rtviClient.disconnect();
// Disconnect the Pipecat client
await this.pcClient.disconnect();
// Clean up audio
if (this.botAudio.srcObject) {

View File

@@ -10,7 +10,8 @@ body {
margin: 0 auto;
}
.status-bar {
.status-bar,
.device-bar {
display: flex;
justify-content: space-between;
align-items: center;
@@ -20,7 +21,24 @@ body {
margin-bottom: 20px;
}
.controls button {
.device-bar {
flex-direction: column;
gap: 10px;
}
.controls,
.device-controls {
display: flex;
align-items: center;
gap: 10px; /* Adds spacing between elements */
}
.device-controls {
margin-left: auto;
}
.controls button,
.device-controls button {
padding: 8px 16px;
margin-left: 10px;
border: none;
@@ -28,6 +46,56 @@ body {
cursor: pointer;
}
#bot-selector,
#device-selector {
padding: 8px 16px;
padding-right: 40px;
border: none;
border-radius: 4px;
background-color: #6c757d; /* Gray background */
color: white; /* White text */
cursor: pointer;
appearance: none; /* Removes default browser styling for dropdowns */
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='white'%3E%3Cpath d='M7 10l5 5 5-5z'/%3E%3C/svg%3E"); /* Custom arrow */
background-repeat: no-repeat;
background-position: right 8px center; /* Position the arrow */
}
#bot-selector:focus,
#device-selector:focus {
outline: none;
box-shadow: 0 0 4px rgba(0, 0, 0, 0.3); /* Add a subtle focus effect */
}
.text-input-container {
display: flex;
gap: 8px;
margin-left: 10px;
width: 100%;
flex: 1;
}
#text-input {
flex: 1;
padding: 8px 16px;
border: 1px solid #e0e0e0;
border-radius: 4px;
min-width: 200px;
}
#send-text-btn {
padding: 8px 16px;
border: none;
border-radius: 4px;
background-color: #007bff;
color: white;
flex-shrink: 0;
}
#send-text-btn:hover {
background-color: #0056b3;
}
#connect-btn {
background-color: #4caf50;
color: white;
@@ -38,6 +106,9 @@ body {
color: white;
}
#mic-toggle-btn {
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;

File diff suppressed because it is too large Load Diff

View File

@@ -10,9 +10,9 @@
"preview": "vite preview"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/client-react": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.8",
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/client-react": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0",
"react": "^18.3.1",
"react-dom": "^18.3.1"
},

View File

@@ -1,22 +1,22 @@
import {
RTVIClientAudio,
RTVIClientVideo,
useRTVIClientTransportState,
PipecatClientAudio,
PipecatClientVideo,
usePipecatClientTransportState,
} from '@pipecat-ai/client-react';
import { RTVIProvider } from './providers/RTVIProvider';
import { PipecatProvider } from './providers/PipecatProvider';
import { ConnectButton } from './components/ConnectButton';
import { StatusDisplay } from './components/StatusDisplay';
import { DebugDisplay } from './components/DebugDisplay';
import './App.css';
function BotVideo() {
const transportState = useRTVIClientTransportState();
const transportState = usePipecatClientTransportState();
const isConnected = transportState !== 'disconnected';
return (
<div className="bot-container">
<div className="video-container">
{isConnected && <RTVIClientVideo participant="bot" fit="cover" />}
{isConnected && <PipecatClientVideo participant="bot" fit="cover" />}
</div>
</div>
);
@@ -35,16 +35,16 @@ function AppContent() {
</div>
<DebugDisplay />
<RTVIClientAudio />
<PipecatClientAudio />
</div>
);
}
function App() {
return (
<RTVIProvider>
<PipecatProvider>
<AppContent />
</RTVIProvider>
</PipecatProvider>
);
}

View File

@@ -1,16 +1,16 @@
import {
useRTVIClient,
useRTVIClientTransportState,
usePipecatClient,
usePipecatClientTransportState,
} from '@pipecat-ai/client-react';
export function ConnectButton() {
const client = useRTVIClient();
const transportState = useRTVIClientTransportState();
const client = usePipecatClient();
const transportState = usePipecatClientTransportState();
const isConnected = ['connected', 'ready'].includes(transportState);
const handleClick = async () => {
if (!client) {
console.error('RTVI client is not initialized');
console.error('Pipecat client is not initialized');
return;
}
@@ -18,7 +18,7 @@ export function ConnectButton() {
if (isConnected) {
await client.disconnect();
} else {
await client.connect();
await client.connect({ endpoint: 'http://localhost:7860/connect' });
}
} catch (error) {
console.error('Connection error:', error);

View File

@@ -6,12 +6,12 @@ import {
TranscriptData,
BotLLMTextData,
} from '@pipecat-ai/client-js';
import { useRTVIClient, useRTVIClientEvent } from '@pipecat-ai/client-react';
import { usePipecatClient, useRTVIClientEvent } from '@pipecat-ai/client-react';
import './DebugDisplay.css';
export function DebugDisplay() {
const debugLogRef = useRef<HTMLDivElement>(null);
const client = useRTVIClient();
const client = usePipecatClient();
const log = useCallback((message: string) => {
if (!debugLogRef.current) return;

View File

@@ -1,7 +1,7 @@
import { useRTVIClientTransportState } from '@pipecat-ai/client-react';
import { usePipecatClientTransportState } from '@pipecat-ai/client-react';
export function StatusDisplay() {
const transportState = useRTVIClientTransportState();
const transportState = usePipecatClientTransportState();
return (
<div className="status">

View File

@@ -0,0 +1,16 @@
import { type PropsWithChildren } from 'react';
import { PipecatClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { PipecatClientProvider } from '@pipecat-ai/client-react';
const client = new PipecatClient({
transport: new DailyTransport(),
enableMic: true,
enableCam: false,
});
export function PipecatProvider({ children }: PropsWithChildren) {
return (
<PipecatClientProvider client={client}>{children}</PipecatClientProvider>
);
}

View File

@@ -1,22 +0,0 @@
import { type PropsWithChildren } from 'react';
import { RTVIClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { RTVIClientProvider } from '@pipecat-ai/client-react';
const transport = new DailyTransport();
const client = new RTVIClient({
transport,
params: {
baseUrl: 'http://localhost:7860',
endpoints: {
connect: '/connect',
},
},
enableMic: true,
enableCam: false,
});
export function RTVIProvider({ children }: PropsWithChildren) {
return <RTVIClientProvider client={client}>{children}</RTVIClientProvider>;
}

File diff suppressed because it is too large Load Diff

View File

@@ -19,7 +19,7 @@
"vite": "^6.0.2"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.4.0",
"@pipecat-ai/websocket-transport": "^0.4.2"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/websocket-transport": "^1.0.0"
}
}

View File

@@ -5,21 +5,22 @@
*/
import {
RTVIClient,
RTVIClientOptions,
RTVIEvent,
BotLLMTextData,
Participant,
PipecatClient,
PipecatClientOptions,
RTVIEvent, RTVIMessage, TranscriptData,
} from '@pipecat-ai/client-js';
import {
WebSocketTransport,
TwilioSerializer,
} from "@pipecat-ai/websocket-transport";
} from '@pipecat-ai/websocket-transport';
class WebsocketClientApp {
private static STREAM_SID = 'ws_mock_stream_sid';
private static CALL_SID = 'ws_mock_call_sid';
private static STREAM_SID = "ws_mock_stream_sid"
private static CALL_SID = "ws_mock_call_sid"
private rtviClient: RTVIClient | null = null;
private rtviClient: PipecatClient | null = null;
private connectBtn: HTMLButtonElement | null = null;
private disconnectBtn: HTMLButtonElement | null = null;
private statusSpan: HTMLElement | null = null;
@@ -38,8 +39,12 @@ class WebsocketClientApp {
* Set up references to DOM elements and create necessary media elements
*/
private setupDOMElements(): void {
this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement;
this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement;
this.connectBtn = document.getElementById(
'connect-btn'
) as HTMLButtonElement;
this.disconnectBtn = document.getElementById(
'disconnect-btn'
) as HTMLButtonElement;
this.statusSpan = document.getElementById('connection-status');
this.debugLog = document.getElementById('debug-log');
}
@@ -80,13 +85,23 @@ class WebsocketClientApp {
}
private async emulateTwilioMessages() {
const connectedMessage={"event": "connected", "protocol": "Call", "version": "1.0.0"}
const connectedMessage = {
event: 'connected',
protocol: 'Call',
version: '1.0.0',
};
const websocketTransport = this.rtviClient?.transport as WebSocketTransport
void websocketTransport?.sendRawMessage(connectedMessage)
const websocketTransport = this.rtviClient?.transport as WebSocketTransport;
void websocketTransport?.sendRawMessage(connectedMessage);
const startMessage={"event": "start", "start": {"streamSid": WebsocketClientApp.STREAM_SID, "callSid": WebsocketClientApp.CALL_SID}}
void websocketTransport?.sendRawMessage(startMessage)
const startMessage = {
event: 'start',
start: {
streamSid: WebsocketClientApp.STREAM_SID,
callSid: WebsocketClientApp.CALL_SID,
},
};
void websocketTransport?.sendRawMessage(startMessage);
}
/**
@@ -109,7 +124,7 @@ class WebsocketClientApp {
if (!this.rtviClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.rtviClient.on(RTVIEvent.TrackStarted, (track: MediaStreamTrack, participant?: Participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local && track.kind === 'audio') {
this.setupAudioTrack(track);
@@ -117,8 +132,10 @@ class WebsocketClientApp {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`);
this.rtviClient.on(RTVIEvent.TrackStopped, (track: MediaStreamTrack, participant?: Participant) => {
this.log(
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
);
});
}
@@ -128,7 +145,10 @@ class WebsocketClientApp {
*/
private setupAudioTrack(track: MediaStreamTrack): void {
this.log('Setting up audio track');
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
const oldTrack = this.botAudio.srcObject.getAudioTracks()[0];
if (oldTrack?.id === track.id) return;
}
@@ -143,23 +163,19 @@ class WebsocketClientApp {
try {
const startTime = Date.now();
const transport = new WebSocketTransport({
const ws_opts = {
serializer: new TwilioSerializer(),
recorderSampleRate: 8000,
playerSampleRate: 8000
});
const RTVIConfig: RTVIClientOptions = {
transport,
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:8765',
endpoints: { connect: '/' },
},
playerSampleRate: 8000,
ws_url: 'http://localhost:8765/ws',
};
const RTVIConfig: PipecatClientOptions = {
transport: new WebSocketTransport(ws_opts),
enableMic: true,
enableCam: false,
callbacks: {
onConnected: () => {
this.emulateTwilioMessages()
this.emulateTwilioMessages();
this.updateStatus('Connected');
if (this.connectBtn) this.connectBtn.disabled = true;
if (this.disconnectBtn) this.disconnectBtn.disabled = false;
@@ -170,27 +186,21 @@ class WebsocketClientApp {
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
this.log('Client disconnected');
},
onBotReady: (data) => {
onBotReady: (data: any) => {
this.log(`Bot ready: ${JSON.stringify(data)}`);
this.setupMediaTracks();
},
onUserTranscript: (data) => {
onUserTranscript: (data: TranscriptData) => {
if (data.final) {
this.log(`User: ${data.text}`);
}
},
onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
onMessageError: (error) => console.error('Message error:', error),
onError: (error) => console.error('Error:', error),
onBotTranscript: (data: BotLLMTextData) => this.log(`Bot: ${data.text}`),
onMessageError: (error: RTVIMessage) => console.error('Message error:', error),
onError: (error: RTVIMessage) => console.error('Error:', error),
},
}
// @ts-ignore
RTVIConfig.customConnectHandler = () => Promise.resolve(
{
ws_url: "/ws",
}
);
this.rtviClient = new RTVIClient(RTVIConfig);
};
this.rtviClient = new PipecatClient(RTVIConfig);
this.setupTrackListeners();
this.log('Initializing devices...');
@@ -223,8 +233,13 @@ class WebsocketClientApp {
try {
await this.rtviClient.disconnect();
this.rtviClient = null;
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop());
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
this.botAudio.srcObject
.getAudioTracks()
.forEach((track) => track.stop());
this.botAudio.srcObject = null;
}
} catch (error) {
@@ -232,7 +247,6 @@ class WebsocketClientApp {
}
}
}
}
declare global {

File diff suppressed because it is too large Load Diff

View File

@@ -19,8 +19,8 @@
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.4.0",
"@pipecat-ai/websocket-transport": "^0.4.2",
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/websocket-transport": "^1.0.0",
"protobufjs": "^7.4.0"
}
}

View File

@@ -5,7 +5,7 @@
*/
/**
* RTVI Client Implementation
* Pipecat Client Implementation
*
* This client connects to an RTVI-compatible bot server using WebSocket.
*
@@ -14,16 +14,14 @@
*/
import {
RTVIClient,
RTVIClientOptions,
PipecatClient,
PipecatClientOptions,
RTVIEvent,
} from '@pipecat-ai/client-js';
import {
WebSocketTransport
} from "@pipecat-ai/websocket-transport";
import { WebSocketTransport } from '@pipecat-ai/websocket-transport';
class WebsocketClientApp {
private rtviClient: RTVIClient | null = null;
private pcClient: PipecatClient | null = null;
private connectBtn: HTMLButtonElement | null = null;
private disconnectBtn: HTMLButtonElement | null = null;
private statusSpan: HTMLElement | null = null;
@@ -31,7 +29,7 @@ class WebsocketClientApp {
private botAudio: HTMLAudioElement;
constructor() {
console.log("WebsocketClientApp");
console.log('WebsocketClientApp');
this.botAudio = document.createElement('audio');
this.botAudio.autoplay = true;
//this.botAudio.playsInline = true;
@@ -45,8 +43,12 @@ class WebsocketClientApp {
* Set up references to DOM elements and create necessary media elements
*/
private setupDOMElements(): void {
this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement;
this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement;
this.connectBtn = document.getElementById(
'connect-btn'
) as HTMLButtonElement;
this.disconnectBtn = document.getElementById(
'disconnect-btn'
) as HTMLButtonElement;
this.statusSpan = document.getElementById('connection-status');
this.debugLog = document.getElementById('debug-log');
}
@@ -91,8 +93,8 @@ class WebsocketClientApp {
* This is called when the bot is ready or when the transport state changes to ready
*/
setupMediaTracks() {
if (!this.rtviClient) return;
const tracks = this.rtviClient.tracks();
if (!this.pcClient) return;
const tracks = this.pcClient.tracks();
if (tracks.bot?.audio) {
this.setupAudioTrack(tracks.bot.audio);
}
@@ -103,10 +105,10 @@ class WebsocketClientApp {
* This handles new tracks being added during the session
*/
setupTrackListeners() {
if (!this.rtviClient) return;
if (!this.pcClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.pcClient.on(RTVIEvent.TrackStarted, (track, participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local && track.kind === 'audio') {
this.setupAudioTrack(track);
@@ -114,8 +116,10 @@ class WebsocketClientApp {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`);
this.pcClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.log(
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
);
});
}
@@ -125,7 +129,10 @@ class WebsocketClientApp {
*/
private setupAudioTrack(track: MediaStreamTrack): void {
this.log('Setting up audio track');
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
const oldTrack = this.botAudio.srcObject.getAudioTracks()[0];
if (oldTrack?.id === track.id) return;
}
@@ -134,21 +141,15 @@ class WebsocketClientApp {
/**
* Initialize and connect to the bot
* This sets up the RTVI client, initializes devices, and establishes the connection
* This sets up the Pipecat client, initializes devices, and establishes the connection
*/
public async connect(): Promise<void> {
try {
const startTime = Date.now();
//const transport = new DailyTransport();
const transport = new WebSocketTransport();
const RTVIConfig: RTVIClientOptions = {
transport,
params: {
// The baseURL and endpoint of your bot server that the client will connect to
baseUrl: 'http://localhost:7860',
endpoints: { connect: '/connect' },
},
const PipecatConfig: PipecatClientOptions = {
transport: new WebSocketTransport(),
enableMic: true,
enableCam: false,
callbacks: {
@@ -176,15 +177,20 @@ class WebsocketClientApp {
onMessageError: (error) => console.error('Message error:', error),
onError: (error) => console.error('Error:', error),
},
}
this.rtviClient = new RTVIClient(RTVIConfig);
};
this.pcClient = new PipecatClient(PipecatConfig);
// @ts-ignore
window.pcClient = this.pcClient; // Expose for debugging
this.setupTrackListeners();
this.log('Initializing devices...');
await this.rtviClient.initDevices();
await this.pcClient.initDevices();
this.log('Connecting to bot...');
await this.rtviClient.connect();
await this.pcClient.connect({
// The baseURL and endpoint of your bot server that the client will connect to
endpoint: 'http://localhost:7860/connect',
});
const timeTaken = Date.now() - startTime;
this.log(`Connection complete, timeTaken: ${timeTaken}`);
@@ -192,9 +198,9 @@ class WebsocketClientApp {
this.log(`Error connecting: ${(error as Error).message}`);
this.updateStatus('Error');
// Clean up if there's an error
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
await this.pcClient.disconnect();
} catch (disconnectError) {
this.log(`Error during disconnect: ${disconnectError}`);
}
@@ -206,12 +212,17 @@ class WebsocketClientApp {
* Disconnect from the bot and clean up media resources
*/
public async disconnect(): Promise<void> {
if (this.rtviClient) {
if (this.pcClient) {
try {
await this.rtviClient.disconnect();
this.rtviClient = null;
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop());
await this.pcClient.disconnect();
this.pcClient = null;
if (
this.botAudio.srcObject &&
'getAudioTracks' in this.botAudio.srcObject
) {
this.botAudio.srcObject
.getAudioTracks()
.forEach((track) => track.stop());
this.botAudio.srcObject = null;
}
} catch (error) {
@@ -219,7 +230,6 @@ class WebsocketClientApp {
}
}
}
}
declare global {

View File

@@ -295,6 +295,22 @@ This project uses TypeScript, React, and Next.js, making it a perfect fit for [V
Again, we'll use Pipecat Cloud. Follow the steps from above. The only difference will be the secrets required; in addition to a GOOGLE_API_KEY, you'll need `GOOGLE_APPLICATION_CREDENTIALS` in the format of a .json file with your [Google Cloud service account](https://console.cloud.google.com/iam-admin/serviceaccounts) information.
You'll need to modify the Dockerfile so that the credentials.json and word_list.py are accessible. This Dockerfile will work:
```Dockerfile
FROM dailyco/pipecat-base:latest
COPY ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
COPY ./word_list.py word_list.py
COPY ./credentials.json credentials.json
COPY ./bot_phone_twilio.py bot.py
```
Note: Your `credentials.json` file should have your Google service account credentials.
#### Buy and Configure a Twilio Number
Check out the [Twilio Websocket Telephony guide](https://docs.pipecat.daily.co/pipecat-in-production/telephony/twilio-mediastreams) for a step-by-step walkthrough on how to purchase a phone number, configure your TwiML, and make or receive calls.

File diff suppressed because it is too large Load Diff

View File

@@ -9,11 +9,12 @@
"lint": "next lint"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/client-react": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.10",
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/client-react": "^1.0.0",
"@pipecat-ai/daily-transport": "^1.0.0",
"@tabler/icons-react": "^3.31.0",
"@tailwindcss/postcss": "^4.1.3",
"jotai": "^2.12.5",
"js-confetti": "^0.12.0",
"next": "15.2.4",
"react": "^19.0.0",

View File

@@ -1,16 +1,26 @@
import { useEffect, useCallback } from 'react';
import {
useRTVIClient,
useRTVIClientTransportState,
usePipecatClient,
usePipecatClientTransportState,
} from '@pipecat-ai/client-react';
import { CONNECTION_STATES } from '@/constants/gameConstants';
import { useConfigurationSettings } from '@/contexts/Configuration';
// Get the API base URL from environment variables
// Default to "/api" if not specified
// "/api" is the default for Next.js API routes and used
// for the Pipecat Cloud deployed agent
const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || '/api';
console.log('Using API base URL:', API_BASE_URL);
export function useConnectionState(
onConnected?: () => void,
onDisconnected?: () => void
) {
const client = useRTVIClient();
const transportState = useRTVIClientTransportState();
const client = usePipecatClient();
const transportState = usePipecatClientTransportState();
const config = useConfigurationSettings();
const isConnected = CONNECTION_STATES.ACTIVE.includes(transportState);
const isConnecting = CONNECTION_STATES.CONNECTING.includes(transportState);
@@ -35,12 +45,17 @@ export function useConnectionState(
if (isConnected) {
await client.disconnect();
} else {
await client.connect();
await client.connect({
endpoint: `${API_BASE_URL}/connect`,
requestData: {
personality: config.personality,
},
});
}
} catch (error) {
console.error('Connection error:', error);
}
}, [client, isConnected]);
}, [client, config, isConnected]);
return {
isConnected,

View File

@@ -1,15 +1,15 @@
import { ConfigurationProvider } from "@/contexts/Configuration";
import { RTVIProvider } from "@/providers/RTVIProvider";
import { RTVIClientAudio } from "@pipecat-ai/client-react";
import type { AppProps } from "next/app";
import { Nunito } from "next/font/google";
import Head from "next/head";
import "../styles/globals.css";
import { ConfigurationProvider } from '@/contexts/Configuration';
import { PipecatProvider } from '@/providers/PipecatProvider';
import { PipecatClientAudio } from '@pipecat-ai/client-react';
import type { AppProps } from 'next/app';
import { Nunito } from 'next/font/google';
import Head from 'next/head';
import '../styles/globals.css';
const nunito = Nunito({
subsets: ["latin"],
display: "swap",
variable: "--font-sans",
subsets: ['latin'],
display: 'swap',
variable: '--font-sans',
});
export default function App({ Component, pageProps }: AppProps) {
@@ -21,10 +21,10 @@ export default function App({ Component, pageProps }: AppProps) {
</Head>
<main className={`${nunito.variable}`}>
<ConfigurationProvider>
<RTVIProvider>
<RTVIClientAudio />
<PipecatProvider>
<PipecatClientAudio />
<Component {...pageProps} />
</RTVIProvider>
</PipecatProvider>
</ConfigurationProvider>
</main>
</>

View File

@@ -1,11 +1,11 @@
import type { NextApiRequest, NextApiResponse } from "next";
import type { NextApiRequest, NextApiResponse } from 'next';
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method !== "POST") {
return res.status(405).json({ error: "Method not allowed" });
if (req.method !== 'POST') {
return res.status(405).json({ error: 'Method not allowed' });
}
try {
@@ -15,16 +15,16 @@ export default async function handler(
if (!personality) {
return res
.status(400)
.json({ error: "Missing required configuration parameters" });
.json({ error: 'Missing required configuration parameters' });
}
const response = await fetch(
`https://api.pipecat.daily.co/v1/public/${process.env.AGENT_NAME}/start`,
{
method: "POST",
method: 'POST',
headers: {
Authorization: `Bearer ${process.env.PIPECAT_CLOUD_API_KEY}`,
"Content-Type": "application/json",
'Content-Type': 'application/json',
},
body: JSON.stringify({
createDailyRoom: true,
@@ -37,15 +37,15 @@ export default async function handler(
const data = await response.json();
console.log("Response from API:", JSON.stringify(data, null, 2));
console.log('Response from API:', JSON.stringify(data, null, 2));
// Transform the response to match what RTVI client expects
// Transform the response to match what Pipecat client expects
return res.status(200).json({
room_url: data.dailyRoom,
token: data.dailyToken,
});
} catch (error) {
console.error("Error starting agent:", error);
return res.status(500).json({ error: "Failed to start agent" });
console.error('Error starting agent:', error);
return res.status(500).json({ error: 'Failed to start agent' });
}
}

View File

@@ -0,0 +1,43 @@
'use client';
import { PipecatClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { PipecatClientProvider } from '@pipecat-ai/client-react';
import { PropsWithChildren, useEffect, useState, useRef } from 'react';
export function PipecatProvider({ children }: PropsWithChildren) {
const [client, setClient] = useState<PipecatClient | null>(null);
const clientCreated = useRef(false);
useEffect(() => {
// Only create the client once
if (clientCreated.current) return;
const pcClient = new PipecatClient({
transport: new DailyTransport(),
enableMic: true,
enableCam: false,
});
setClient(pcClient);
clientCreated.current = true;
// Cleanup when component unmounts
return () => {
if (pcClient) {
pcClient.disconnect().catch((err) => {
console.error('Error disconnecting client:', err);
});
}
clientCreated.current = false;
};
}, []);
if (!client) {
return null;
}
return (
<PipecatClientProvider client={client}>{children}</PipecatClientProvider>
);
}

View File

@@ -1,72 +0,0 @@
"use client";
import { RTVIClient } from "@pipecat-ai/client-js";
import { DailyTransport } from "@pipecat-ai/daily-transport";
import { RTVIClientProvider } from "@pipecat-ai/client-react";
import { PropsWithChildren, useEffect, useState, useRef } from "react";
import { useConfigurationSettings } from "@/contexts/Configuration";
// Get the API base URL from environment variables
// Default to "/api" if not specified
// "/api" is the default for Next.js API routes and used
// for the Pipecat Cloud deployed agent
const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || "/api";
console.log("Using API base URL:", API_BASE_URL);
export function RTVIProvider({ children }: PropsWithChildren) {
const [client, setClient] = useState<RTVIClient | null>(null);
const config = useConfigurationSettings();
const clientCreated = useRef(false);
useEffect(() => {
// Only create the client once
if (clientCreated.current) return;
const transport = new DailyTransport();
const rtviClient = new RTVIClient({
transport,
params: {
baseUrl: API_BASE_URL,
endpoints: {
connect: "/connect",
},
requestData: {
personality: config.personality,
},
},
enableMic: true,
enableCam: false,
});
setClient(rtviClient);
clientCreated.current = true;
// Cleanup when component unmounts
return () => {
if (rtviClient) {
rtviClient.disconnect().catch((err) => {
console.error("Error disconnecting client:", err);
});
}
clientCreated.current = false;
};
}, []);
// Update the connectParams when config changes
useEffect(() => {
if (!client) return;
// Update the connect params without recreating the client
client.params.requestData = {
personality: config.personality,
};
}, [client, config.personality]);
if (!client) {
return null;
}
return <RTVIClientProvider client={client}>{children}</RTVIClientProvider>;
}

View File

@@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
@@ -198,16 +199,15 @@ async def bot(args: DailySessionArguments):
# Local development
async def local_daily():
async def local_daily(args: DailySessionArguments):
"""Daily transport for local development."""
from runner import configure
# from runner import configure
try:
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
room_url=args.room_url,
token=args.token,
bot_name="Bot",
params=DailyParams(
audio_in_enabled=True,
@@ -217,7 +217,7 @@ async def local_daily():
)
test_config = {
"personality": "witty",
"personality": args.personality,
}
await main(transport, test_config)
@@ -227,7 +227,24 @@ async def local_daily():
# Local development entry point
if LOCAL_RUN and __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Run the Word Wrangler bot in local development mode"
)
parser.add_argument(
"-u", "--room-url", type=str, default=os.getenv("DAILY_SAMPLE_ROOM_URL", "")
)
parser.add_argument(
"-t", "--token", type=str, default=os.getenv("DAILY_SAMPLE_ROOM_TOKEN", None)
)
parser.add_argument(
"-p",
"--personality",
default="witty",
choices=["friendly", "professional", "enthusiastic", "thoughtful", "witty"],
help="Personality preset for the bot (friendly, professional, enthusiastic, thoughtful, witty)",
)
args = parser.parse_args()
try:
asyncio.run(local_daily())
asyncio.run(local_daily(args))
except Exception as e:
logger.exception(f"Failed to run in local mode: {e}")

View File

@@ -160,14 +160,15 @@ async def rtvi_connect(request: Request) -> Dict[Any, Any]:
Raises:
HTTPException: If room creation, token generation, or bot startup fails
"""
print("Creating room for RTVI connection")
body = await request.json()
print("Creating room for RTVI connection", body)
room_url, token = await create_room_and_token()
print(f"Room URL: {room_url}")
# Start the bot process
try:
proc = subprocess.Popen(
[f"python3 -m bot -u {room_url} -t {token}"],
[f"python3 -m bot -u {room_url} -t {token} -p {body.get('personality', 'witty')}"],
shell=True,
bufsize=1,
cwd=os.path.dirname(os.path.abspath(__file__)),

View File

@@ -20,19 +20,22 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
"aiohttp~=3.11.12",
"aiohttp>=3.11.12,<4",
"audioop-lts~=0.2.1; python_version>='3.13'",
"docstring_parser~=0.16",
"loguru~=0.7.3",
"Markdown~=3.7",
"numpy~=1.26.4",
"Pillow~=11.1.0",
"Markdown>=3.7,<4",
"nltk>=3.9.1,<4",
"numpy>=1.26.4,<3",
"Pillow>=11.1.0,<12",
"protobuf~=5.29.3",
"pydantic~=2.10.6",
"pydantic>=2.10.6,<3",
"pyloudnorm~=0.1.1",
"resampy~=0.4.3",
"soxr~=0.5.0",
"openai~=1.70.0",
"openai>=1.74.0,<2",
# Explicit dependency pins for Python 3.11+ compatibility
"numba>=0.60.0,<1",
]
[project.urls]
@@ -41,59 +44,60 @@ Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic~=0.49.0" ]
assemblyai = [ "websockets~=13.1" ]
aws = [ "boto3~=1.37.16", "websockets~=13.1" ]
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2" ]
assemblyai = [ "websockets>=13.1,<15.0" ]
aws = [ "aioboto3~=15.0.0", "websockets>=13.1,<15.0" ]
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.0.2; python_version>='3.12'" ]
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
cartesia = [ "cartesia~=2.0.3", "websockets~=13.1" ]
cartesia = [ "cartesia~=2.0.3", "websockets>=13.1,<15.0" ]
cerebras = []
deepseek = []
daily = [ "daily-python~=0.19.4" ]
deepgram = [ "deepgram-sdk~=4.1.0" ]
elevenlabs = [ "websockets~=13.1" ]
deepgram = [ "deepgram-sdk~=4.7.0" ]
elevenlabs = [ "websockets>=13.1,<15.0" ]
fal = [ "fal-client~=0.5.9" ]
fireworks = []
fish = [ "ormsgpack~=1.7.0", "websockets~=13.1" ]
gladia = [ "websockets~=13.1" ]
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "websockets~=13.1" ]
fish = [ "ormsgpack~=1.7.0", "websockets>=13.1,<15.0" ]
gladia = [ "websockets>=13.1,<15.0" ]
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "websockets>=13.1,<15.0" ]
grok = []
groq = [ "groq~=0.23.0" ]
gstreamer = [ "pygobject~=3.50.0" ]
krisp = [ "pipecat-ai-krisp~=0.4.0" ]
koala = [ "pvkoala~=2.0.3" ]
langchain = [ "langchain~=0.3.20", "langchain-community~=0.3.20", "langchain-openai~=0.3.9" ]
livekit = [ "livekit~=0.22.0", "livekit-api~=0.8.2", "tenacity~=9.0.0" ]
lmnt = [ "websockets~=13.1" ]
livekit = [ "livekit~=0.22.0", "livekit-api~=0.8.2", "tenacity>=8.2.3,<10.0.0" ]
lmnt = [ "websockets>=13.1,<15.0" ]
local = [ "pyaudio~=0.2.14" ]
mcp = [ "mcp[cli]~=1.9.4" ]
mem0 = [ "mem0ai~=0.1.94" ]
mlx-whisper = [ "mlx-whisper~=0.4.2" ]
moondream = [ "einops~=0.8.0", "timm~=1.0.13", "transformers~=4.48.0" ]
moondream = [ "einops~=0.8.0", "timm~=1.0.13", "transformers>=4.48.0" ]
nim = []
neuphonic = [ "pyneuphonic~=1.5.13", "websockets~=13.1" ]
neuphonic = [ "websockets>=13.1,<15.0" ]
noisereduce = [ "noisereduce~=3.0.3" ]
openai = [ "websockets~=13.1" ]
openai = [ "websockets>=13.1,<15.0" ]
openpipe = [ "openpipe~=4.50.0" ]
openrouter = []
perplexity = []
playht = [ "pyht~=0.1.12", "websockets~=13.1" ]
playht = [ "pyht>=0.1.6", "websockets>=13.1,<15.0" ]
qwen = []
rime = [ "websockets~=13.1" ]
riva = [ "nvidia-riva-client~=2.19.1" ]
rime = [ "websockets>=13.1,<15.0" ]
riva = [ "nvidia-riva-client~=2.21.1" ]
sambanova = []
sentry = [ "sentry-sdk~=2.23.1" ]
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch==2.5.0", "torchaudio==2.5.0" ]
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch~=2.5.0", "torchaudio~=2.5.0" ]
remote-smart-turn = []
silero = [ "onnxruntime~=1.20.1" ]
simli = [ "simli-ai~=0.1.10"]
soniox = [ "websockets>=13.1,<15.0" ]
soundfile = [ "soundfile~=0.13.0" ]
speechmatics = [ "speechmatics-rt>=0.3.1" ]
tavus=[]
together = []
tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ]
ultravox = [ "transformers~=4.48.0", "vllm~=0.7.3" ]
ultravox = [ "transformers>=4.48.0", "vllm~=0.7.3" ]
webrtc = [ "aiortc~=1.11.0", "opencv-python~=4.11.0.86" ]
websocket = [ "websockets~=13.1", "fastapi~=0.115.6" ]
websocket = [ "websockets>=13.1,<15.0", "fastapi>=0.115.6,<0.117.0" ]
whisper = [ "faster-whisper~=1.1.1" ]
[tool.setuptools.packages.find]
@@ -148,3 +152,6 @@ convention = "google"
command_line = "--module pytest"
source = ["src"]
omit = ["*/tests/*"]
[project.scripts]
pipecat = "pipecat.__main__:main"

View File

@@ -1,5 +1,10 @@
ruff format src
ruff format examples
ruff format tests
ruff format scripts
ruff check --select I,D --fix
#!/bin/bash
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
echo "Running ruff format..."
ruff format "$PROJECT_ROOT"
echo "Running ruff check..."
ruff check --fix "$PROJECT_ROOT"

101
src/pipecat/__main__.py Normal file
View File

@@ -0,0 +1,101 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import importlib.util
import sys
from pathlib import Path
from typing import Any, Callable
from loguru import logger
def load_bot_module(file_path: str, function_name: str = "run_example"):
"""Load a bot module from a Python file and return the specified function.
Args:
file_path: Path to the Python file containing the bot
function_name: Name of the function to load (default: run_example)
Returns:
The callable function from the module
Raises:
SystemExit: If the file doesn't exist, isn't a Python file, or the function isn't found
"""
logger.info(f"Loading bot module from: {file_path}")
logger.info(f"Looking for function: {function_name}")
file_path_obj = Path(file_path)
if not file_path_obj.exists():
print(f"Error: File '{file_path}' not found", file=sys.stderr)
sys.exit(1)
if not file_path_obj.suffix == ".py":
print(f"Error: File '{file_path}' is not a Python file", file=sys.stderr)
sys.exit(1)
# Import the module
try:
logger.info(f"Importing module from: {file_path}")
spec = importlib.util.spec_from_file_location("bot_module", file_path_obj)
if spec is None or spec.loader is None:
print(f"Error: Could not load module from '{file_path}'", file=sys.stderr)
sys.exit(1)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
logger.info(f"Successfully imported module: {module.__name__}")
except Exception as e:
print(f"Error importing module from '{file_path}': {e}", file=sys.stderr)
sys.exit(1)
# Find the function to run
if not hasattr(module, function_name):
print(f"Error: Function '{function_name}' not found in '{file_path}'", file=sys.stderr)
print(
f"Available functions: {[name for name in dir(module) if not name.startswith('_')]}", file=sys.stderr)
sys.exit(1)
run_example = getattr(module, function_name)
if not callable(run_example):
print(f"Error: '{function_name}' is not a callable function", file=sys.stderr)
sys.exit(1)
logger.info(f"Successfully loaded function: {function_name}")
return run_example
def main():
"""Main entry point for the pipecat command line tool.
This function is called by the entry point script and handles argument parsing
and module loading before calling the actual main execution logic.
"""
# Set up argument parser for our specific arguments
parser = argparse.ArgumentParser(description="Run a Pipecat bot from a Python file")
parser.add_argument("file", help="Python file containing the bot to run")
parser.add_argument("--function", "-f", default="run_example",
help="Function name to run (default: run_example)")
# Parse our arguments first
args, remaining_args = parser.parse_known_args()
# Load the bot module and get the function
run_example = load_bot_module(args.file, args.function)
# Set sys.argv to the remaining arguments for the run_main function
sys.argv = [sys.argv[0]] + remaining_args
# Import run_main only when we need it
from pipecat.examples.run import main as run_main
# Call the main function from pipecat.examples.run
run_main(run_example)
if __name__ == "__main__":
main()

View File

@@ -76,6 +76,16 @@ class BaseTurnAnalyzer(ABC):
"""
pass
@property
@abstractmethod
def params(self):
"""Get the current turn analyzer parameters.
Returns:
Current turn analyzer configuration parameters.
"""
pass
@abstractmethod
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
"""Appends audio data for analysis.

View File

@@ -87,6 +87,15 @@ class BaseSmartTurn(BaseTurnAnalyzer):
"""
return self._speech_triggered
@property
def params(self) -> SmartTurnParams:
"""Get the current smart turn parameters.
Returns:
Current smart turn configuration parameters.
"""
return self._params
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
"""Append audio data for turn analysis.

View File

@@ -0,0 +1,196 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Local PyTorch turn analyzer for on-device ML inference using the smart-turn-v2 model.
This module provides a smart turn analyzer that uses PyTorch models for
local end-of-turn detection without requiring network connectivity.
"""
from typing import Any, Dict
import numpy as np
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
try:
import torch
import torch.nn.functional as F
from torch import nn
from transformers import (
Wav2Vec2Config,
Wav2Vec2Model,
Wav2Vec2PreTrainedModel,
Wav2Vec2Processor,
)
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use LocalSmartTurnAnalyzerV2, you need to `pip install pipecat-ai[local-smart-turn]`."
)
raise Exception(f"Missing module: {e}")
class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
"""Local turn analyzer using the smart-turn-v2 PyTorch model.
Provides end-of-turn detection using locally-stored PyTorch models,
enabling offline operation without network dependencies. Uses
Wav2Vec2 architecture for audio sequence classification.
"""
def __init__(self, *, smart_turn_model_path: str, **kwargs):
"""Initialize the local PyTorch smart-turn-v2 analyzer.
Args:
smart_turn_model_path: Path to directory containing the PyTorch model
and feature extractor files. If empty, uses default HuggingFace model.
**kwargs: Additional arguments passed to BaseSmartTurn.
"""
super().__init__(**kwargs)
if not smart_turn_model_path:
# Define the path to the pretrained model on Hugging Face
smart_turn_model_path = "pipecat-ai/smart-turn-v2"
logger.debug("Loading Local Smart Turn v2 model...")
# Load the pretrained model for sequence classification
self._turn_model = _Wav2Vec2ForEndpointing.from_pretrained(smart_turn_model_path)
# Load the corresponding feature extractor for preprocessing audio
self._turn_processor = Wav2Vec2Processor.from_pretrained(smart_turn_model_path)
# Use platform-optimized backend if available (MPS for Apple silicon, CUDA for NVIDIA)
self._device = "cpu"
if torch.backends.mps.is_available():
self._device = "mps"
elif torch.cuda.is_available():
self._device = "cuda"
# Move model to selected device and set it to evaluation mode
self._turn_model = self._turn_model.to(self._device)
self._turn_model.eval()
logger.debug("Loaded Local Smart Turn v2")
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
"""Predict end-of-turn using local PyTorch model."""
inputs = self._turn_processor(
audio_array,
sampling_rate=16000,
padding="max_length",
truncation=True,
max_length=16000 * 16, # 16 seconds at 16kHz
return_attention_mask=True,
return_tensors="pt",
)
# Move inputs to device
inputs = {k: v.to(self._device) for k, v in inputs.items()}
# Run inference
with torch.no_grad():
outputs = self._turn_model(**inputs)
# The model returns sigmoid probabilities directly in the logits field
probability = outputs["logits"][0].item()
# Make prediction (1 for Complete, 0 for Incomplete)
prediction = 1 if probability > 0.5 else 0
return {
"prediction": prediction,
"probability": probability,
}
class _Wav2Vec2ForEndpointing(Wav2Vec2PreTrainedModel):
def __init__(self, config: Wav2Vec2Config):
super().__init__(config)
self.wav2vec2 = Wav2Vec2Model(config)
self.pool_attention = nn.Sequential(
nn.Linear(config.hidden_size, 256), nn.Tanh(), nn.Linear(256, 1)
)
self.classifier = nn.Sequential(
nn.Linear(config.hidden_size, 256),
nn.LayerNorm(256),
nn.GELU(),
nn.Dropout(0.1),
nn.Linear(256, 64),
nn.GELU(),
nn.Linear(64, 1),
)
for module in self.classifier:
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=0.1)
if module.bias is not None:
module.bias.data.zero_()
for module in self.pool_attention:
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=0.1)
if module.bias is not None:
module.bias.data.zero_()
def attention_pool(self, hidden_states, attention_mask):
# Calculate attention weights
attention_weights = self.pool_attention(hidden_states)
if attention_mask is None:
raise ValueError("attention_mask must be provided for attention pooling")
attention_weights = attention_weights + (
(1.0 - attention_mask.unsqueeze(-1).to(attention_weights.dtype)) * -1e9
)
attention_weights = F.softmax(attention_weights, dim=1)
# Apply attention to hidden states
weighted_sum = torch.sum(hidden_states * attention_weights, dim=1)
return weighted_sum
def forward(self, input_values, attention_mask=None, labels=None):
outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
hidden_states = outputs[0]
# Create transformer padding mask
if attention_mask is not None:
input_length = attention_mask.size(1)
hidden_length = hidden_states.size(1)
ratio = input_length / hidden_length
indices = (torch.arange(hidden_length, device=attention_mask.device) * ratio).long()
attention_mask = attention_mask[:, indices]
attention_mask = attention_mask.bool()
else:
attention_mask = None
pooled = self.attention_pool(hidden_states, attention_mask)
logits = self.classifier(pooled)
if torch.isnan(logits).any():
raise ValueError("NaN values detected in logits")
if labels is not None:
# Calculate positive sample weight based on batch statistics
pos_weight = ((labels == 0).sum() / (labels == 1).sum()).clamp(min=0.1, max=10.0)
loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
labels = labels.float()
loss = loss_fct(logits.view(-1), labels.view(-1))
# Add L2 regularization for classifier layers
l2_lambda = 0.01
l2_reg = torch.tensor(0.0, device=logits.device)
for param in self.classifier.parameters():
l2_reg += torch.norm(param)
loss += l2_lambda * l2_reg
probs = torch.sigmoid(logits.detach())
return {"loss": loss, "logits": probs}
probs = torch.sigmoid(logits)
return {"logits": probs}

View File

@@ -183,36 +183,37 @@ class VADAnalyzer(ABC):
if len(self._vad_buffer) < num_required_bytes:
return self._vad_state
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
while len(self._vad_buffer) >= num_required_bytes:
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
confidence = self.voice_confidence(audio_frames)
confidence = self.voice_confidence(audio_frames)
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if (
self._vad_state == VADState.STARTING

View File

@@ -9,6 +9,21 @@
This module provides a unified interface for running Pipecat examples across
different transport types including Daily.co, WebRTC, and Twilio. It handles
setup, configuration, and lifecycle management for each transport type.
Example usage:
SmallWebRTCTransport::
python bot.py --transport webrtc
DailyTransport::
python bot.py --transport daily
Twilio::
python bot.py --transport twilio --proxy username.ngrok.io
# Note: Concurrently, run an ngrok tunnel to your local server:
# ngrok http 7860
"""
import argparse

View File

@@ -28,6 +28,7 @@ from typing import (
)
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.metrics.metrics import MetricsData
from pipecat.transcriptions.language import Language
@@ -613,6 +614,7 @@ class StartFrame(SystemFrame):
audio_out_sample_rate: Output audio sample rate in Hz.
allow_interruptions: Whether to allow user interruptions.
enable_metrics: Whether to enable performance metrics collection.
enable_tracing: Whether to enable OpenTelemetry tracing.
enable_usage_metrics: Whether to enable usage metrics collection.
interruption_strategies: List of interruption handling strategies.
report_only_initial_ttfb: Whether to report only initial time-to-first-byte.
@@ -622,6 +624,7 @@ class StartFrame(SystemFrame):
audio_out_sample_rate: int = 24000
allow_interruptions: bool = False
enable_metrics: bool = False
enable_tracing: bool = False
enable_usage_metrics: bool = False
interruption_strategies: List[BaseInterruptionStrategy] = field(default_factory=list)
report_only_initial_ttfb: bool = False
@@ -1145,6 +1148,23 @@ class OutputDTMFUrgentFrame(DTMFFrame, SystemFrame):
pass
@dataclass
class SpeechControlParamsFrame(SystemFrame):
"""Frame for notifying processors of speech control parameter changes.
This includes parameters for both VAD (Voice Activity Detection) and
turn-taking analysis. It allows downstream processors to adjust their
behavior based on updated interaction control settings.
Parameters:
vad_params: Current VAD parameters.
turn_params: Current turn-taking analysis parameters.
"""
vad_params: Optional[VADParams] = None
turn_params: Optional[SmartTurnParams] = None
#
# Control frames
#

View File

@@ -273,12 +273,17 @@ class ParallelPipeline(BasePipeline):
if not self._down_task:
self._down_task = self.create_task(self._process_down_queue())
async def _drain_queue(self, queue: asyncio.Queue):
try:
while not queue.empty():
queue.get_nowait()
except asyncio.QueueEmpty:
logger.debug(f"Draining {self} queue already empty")
async def _drain_queues(self):
"""Drain all frames from upstream and downstream queues."""
while not self._up_queue.empty:
await self._up_queue.get()
while not self._down_queue.empty:
await self._down_queue.get()
await self._drain_queue(self._up_queue)
await self._drain_queue(self._down_queue)
async def _handle_interruption(self):
"""Handle interruption by cancelling tasks, draining queues, and restarting."""

View File

@@ -38,14 +38,16 @@ class PipelineRunner(BaseObject):
handle_sigint: bool = True,
force_gc: bool = False,
loop: Optional[asyncio.AbstractEventLoop] = None,
handle_sigterm: bool = False,
):
"""Initialize the pipeline runner.
Args:
name: Optional name for the runner instance.
handle_sigint: Whether to automatically handle SIGINT/SIGTERM signals.
handle_sigint: Whether to automatically handle SIGINT signals.
force_gc: Whether to force garbage collection after task completion.
loop: Event loop to use. If None, uses the current running loop.
handle_sigterm: Whether to automatically handle SIGTERM signals.
"""
super().__init__(name=name)
@@ -57,6 +59,9 @@ class PipelineRunner(BaseObject):
if handle_sigint:
self._setup_sigint()
if handle_sigterm:
self._setup_sigterm()
async def run(self, task: PipelineTask):
"""Run a pipeline task to completion.
@@ -96,6 +101,10 @@ class PipelineRunner(BaseObject):
"""Set up signal handlers for graceful shutdown."""
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGINT, lambda *args: self._sig_handler())
def _setup_sigterm(self):
"""Set up signal handlers for graceful shutdown."""
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGTERM, lambda *args: self._sig_handler())
def _sig_handler(self):

View File

@@ -638,6 +638,7 @@ class PipelineTask(BasePipelineTask):
audio_in_sample_rate=self._params.audio_in_sample_rate,
audio_out_sample_rate=self._params.audio_out_sample_rate,
enable_metrics=self._params.enable_metrics,
enable_tracing=self._enable_tracing,
enable_usage_metrics=self._params.enable_usage_metrics,
report_only_initial_ttfb=self._params.report_only_initial_ttfb,
interruption_strategies=self._params.interruption_strategies,

View File

@@ -19,6 +19,8 @@ from typing import Dict, List, Literal, Optional, Set
from loguru import logger
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import (
BotInterruptionFrame,
BotStartedSpeakingFrame,
@@ -43,6 +45,7 @@ from pipecat.frames.frames import (
LLMSetToolsFrame,
LLMTextFrame,
OpenAILLMContextAssistantTimestampFrame,
SpeechControlParamsFrame,
StartFrame,
StartInterruptionFrame,
TextFrame,
@@ -67,9 +70,13 @@ class LLMUserAggregatorParams:
aggregation_timeout: Maximum time in seconds to wait for additional
transcription content before pushing aggregated result. This
timeout is used only when the transcription is slow to arrive.
turn_emulated_vad_timeout: Maximum time in seconds to wait for emulated
VAD when using turn-based analysis. Applied when transcription is
received but VAD didn't detect speech (e.g., whispered utterances).
"""
aggregation_timeout: float = 0.5
turn_emulated_vad_timeout: float = 0.8
@dataclass
@@ -390,6 +397,9 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
"""
super().__init__(context=context, role="user", **kwargs)
self._params = params or LLMUserAggregatorParams()
self._vad_params: Optional[VADParams] = None
self._turn_params: Optional[SmartTurnParams] = None
if "aggregation_timeout" in kwargs:
import warnings
@@ -477,6 +487,10 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
self.set_tools(frame.tools)
elif isinstance(frame, LLMSetToolChoiceFrame):
self.set_tool_choice(frame.tool_choice)
elif isinstance(frame, SpeechControlParamsFrame):
self._vad_params = frame.vad_params
self._turn_params = frame.turn_params
await self.push_frame(frame, direction)
else:
await self.push_frame(frame, direction)
@@ -618,9 +632,40 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
async def _aggregation_task_handler(self):
while True:
try:
await asyncio.wait_for(
self._aggregation_event.wait(), self._params.aggregation_timeout
)
# The _aggregation_task_handler handles two distinct timeout scenarios:
#
# 1. When emulating_vad=True: Wait for emulated VAD timeout before
# pushing aggregation (simulating VAD behavior when no actual VAD
# detection occurred).
#
# 2. When emulating_vad=False: Use aggregation_timeout as a buffer
# to wait for potential late-arriving transcription frames after
# a real VAD event.
#
# For emulated VAD scenarios, the timeout strategy depends on whether
# a turn analyzer is configured:
#
# - WITH turn analyzer: Use turn_emulated_vad_timeout parameter because
# the VAD's stop_secs is set very low (e.g. 0.2s) for rapid speech
# chunking to feed the turn analyzer. This low value is too fast
# for emulated VAD scenarios where we need to allow users time to
# finish speaking (e.g. 0.8s).
#
# - WITHOUT turn analyzer: Use VAD's stop_secs directly to maintain
# consistent user experience between real VAD detection and
# emulated VAD scenarios.
if not self._emulating_vad:
timeout = self._params.aggregation_timeout
elif self._turn_params:
timeout = self._params.turn_emulated_vad_timeout
else:
# Use VAD stop_secs when no turn analyzer is present, fallback if no VAD params
timeout = (
self._vad_params.stop_secs
if self._vad_params
else self._params.turn_emulated_vad_timeout
)
await asyncio.wait_for(self._aggregation_event.wait(), timeout)
await self._maybe_emulate_user_speaking()
except asyncio.TimeoutError:
if not self._user_speaking:
@@ -648,7 +693,11 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
# to emulate VAD (i.e. user start/stopped speaking), but we do it only
# if the bot is not speaking. If the bot is speaking and we really have
# a short utterance we don't really want to interrupt the bot.
if not self._user_speaking and not self._waiting_for_aggregation:
if (
not self._user_speaking
and not self._waiting_for_aggregation
and len(self._aggregation) > 0
):
if self._bot_speaking:
# If we reached this case and the bot is speaking, let's ignore
# what the user said.

View File

@@ -44,6 +44,7 @@ from pipecat.frames.frames import (
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesAppendFrame,
LLMTextFrame,
MetricsFrame,
StartFrame,
@@ -71,13 +72,14 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.llm_service import (
FunctionCallParams, # TODO(aleix): we shouldn't import `services` from `processors`
)
from pipecat.services.openai.llm import OpenAIContextAggregatorPair
from pipecat.transports.base_input import BaseInputTransport
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.transports.base_transport import BaseTransport
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
from pipecat.utils.string import match_endofsentence
RTVI_PROTOCOL_VERSION = "0.3.0"
RTVI_PROTOCOL_VERSION = "1.0.0"
RTVI_MESSAGE_LABEL = "rtvi-ai"
RTVIMessageLiteral = Literal["rtvi-ai"]
@@ -90,6 +92,10 @@ class RTVIServiceOption(BaseModel):
Defines a configurable option that can be set for an RTVI service,
including its name, type, and handler function.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -104,6 +110,10 @@ class RTVIService(BaseModel):
Represents a service that can be configured and used within the RTVI protocol,
containing a name and list of configurable options.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -122,6 +132,10 @@ class RTVIActionArgumentData(BaseModel):
"""Data for an RTVI action argument.
Contains the name and value of an argument passed to an RTVI action.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -132,6 +146,10 @@ class RTVIActionArgument(BaseModel):
"""Definition of an RTVI action argument.
Specifies the name and expected type of an argument for an RTVI action.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -143,6 +161,10 @@ class RTVIAction(BaseModel):
Represents an action that can be executed within the RTVI protocol,
including its service, name, arguments, and handler function.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
service: str
@@ -166,6 +188,10 @@ class RTVIServiceOptionConfig(BaseModel):
"""Configuration value for an RTVI service option.
Contains the name and value to set for a specific service option.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -176,6 +202,10 @@ class RTVIServiceConfig(BaseModel):
"""Configuration for an RTVI service.
Contains the service name and list of option configurations to apply.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
service: str
@@ -186,6 +216,10 @@ class RTVIConfig(BaseModel):
"""Complete RTVI configuration.
Contains the full configuration for all RTVI services.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
config: List[RTVIServiceConfig]
@@ -196,10 +230,15 @@ class RTVIConfig(BaseModel):
#
# deprecated
class RTVIUpdateConfig(BaseModel):
"""Request to update RTVI configuration.
Contains new configuration settings and whether to interrupt the bot.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
config: List[RTVIServiceConfig]
@@ -210,6 +249,10 @@ class RTVIActionRunArgument(BaseModel):
"""Argument for running an RTVI action.
Contains the name and value of an argument to pass to an action.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
name: str
@@ -220,6 +263,10 @@ class RTVIActionRun(BaseModel):
"""Request to run an RTVI action.
Contains the service, action name, and optional arguments.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
service: str
@@ -234,12 +281,80 @@ class RTVIActionFrame(DataFrame):
Parameters:
rtvi_action_run: The action to execute.
message_id: Optional message ID for response correlation.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
rtvi_action_run: RTVIActionRun
message_id: Optional[str] = None
class RTVIRawClientMessageData(BaseModel):
"""Data structure expected from client messages sent to the RTVI server."""
t: str
d: Optional[Any] = None
class RTVIClientMessage(BaseModel):
"""Cleansed data structure for client messages for handling."""
msg_id: str
type: str
data: Optional[Any] = None
@dataclass
class RTVIClientMessageFrame(SystemFrame):
"""A frame for sending messages from the client to the RTVI server.
This frame is meant for custom messaging from the client to the server
and expects a server-response message.
"""
msg_id: str
type: str
data: Optional[Any] = None
@dataclass
class RTVIServerResponseFrame(SystemFrame):
"""A frame for responding to a client RTVI message.
This frame should be sent in response to an RTVIClientMessageFrame
and include the original RTVIClientMessageFrame to ensure the response
is properly attributed to the original request. To respond with an error,
set the `error` field to a string describing the error. This will result
in the client receiving a `response-error` message instead of a
`server-response` message.
"""
client_msg: RTVIClientMessageFrame
data: Optional[Any] = None
error: Optional[str] = None
class RTVIRawServerResponseData(BaseModel):
"""Data structure for server responses to client messages."""
t: str
d: Optional[Any] = None
class RTVIServerResponse(BaseModel):
"""The RTVI-formatted message response from the server to the client.
This message is used to respond to custom messages sent by the client.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
type: Literal["server-response"] = "server-response"
id: str
data: RTVIRawServerResponseData
class RTVIMessage(BaseModel):
"""Base RTVI message structure.
@@ -269,7 +384,7 @@ class RTVIErrorResponseData(BaseModel):
class RTVIErrorResponse(BaseModel):
"""RTVI error response message.
Sent in response to a client request that resulted in an error.
RTVI Formatted error response message for relaying failed client requests.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -285,13 +400,13 @@ class RTVIErrorData(BaseModel):
"""
error: str
fatal: bool
fatal: bool # Indicates the pipeline has stopped due to this error
class RTVIError(BaseModel):
"""RTVI error event message.
Sent when an error occurs that isn't in response to a specific request.
RTVI Formatted error message for relaying errors in the pipeline.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -303,6 +418,10 @@ class RTVIDescribeConfigData(BaseModel):
"""Data for describing available RTVI configuration.
Contains the list of available services and their options.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
config: List[RTVIService]
@@ -312,6 +431,10 @@ class RTVIDescribeConfig(BaseModel):
"""Message describing available RTVI configuration.
Sent in response to a describe-config request.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -324,6 +447,10 @@ class RTVIDescribeActionsData(BaseModel):
"""Data for describing available RTVI actions.
Contains the list of available actions that can be executed.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
actions: List[RTVIAction]
@@ -333,6 +460,10 @@ class RTVIDescribeActions(BaseModel):
"""Message describing available RTVI actions.
Sent in response to a describe-actions request.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -345,6 +476,10 @@ class RTVIConfigResponse(BaseModel):
"""Response containing current RTVI configuration.
Sent in response to a get-config request.
.. deprecated:: 0.0.75
Pipeline Configuration has been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -357,6 +492,10 @@ class RTVIActionResponseData(BaseModel):
"""Data for an RTVI action response.
Contains the result of executing an action.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
result: ActionResult
@@ -366,6 +505,10 @@ class RTVIActionResponse(BaseModel):
"""Response to an RTVI action execution.
Sent after successfully executing an action.
.. deprecated:: 0.0.75
Actions have been removed as part of the RTVI protocol 1.0.0.
Use custom client and server messages instead.
"""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
@@ -374,6 +517,30 @@ class RTVIActionResponse(BaseModel):
data: RTVIActionResponseData
class AboutClientData(BaseModel):
"""Data about the RTVI client.
Contains information about the client, including which RTVI library it
is using, what platform it is on and any additional details, if available.
"""
library: str
library_version: Optional[str] = None
platform: Optional[str] = None
platform_version: Optional[str] = None
platform_details: Optional[Any] = None
class RTVIClientReadyData(BaseModel):
"""Data format of client ready messages.
Contains the RTVIprotocol version and client information.
"""
version: str
about: AboutClientData
class RTVIBotReadyData(BaseModel):
"""Data for bot ready notification.
@@ -381,7 +548,10 @@ class RTVIBotReadyData(BaseModel):
"""
version: str
config: List[RTVIServiceConfig]
# The config field is deprecated and will not be included if
# the client's rtvi version is 1.0.0 or higher.
config: Optional[List[RTVIServiceConfig]] = None
about: Optional[Mapping[str, Any]] = None
class RTVIBotReady(BaseModel):
@@ -418,6 +588,25 @@ class RTVILLMFunctionCallMessage(BaseModel):
data: RTVILLMFunctionCallMessageData
class RTVIAppendToContextData(BaseModel):
"""Data format for appending messages to the context.
Contains the role, content, and whether to run the message immediately.
"""
role: Literal["user", "assistant"] | str
content: Any
run_immediately: bool = False
class RTVIAppendToContext(BaseModel):
"""RTVI Message format to append content to the LLM context."""
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
type: Literal["append-to-context"] = "append-to-context"
data: RTVIAppendToContextData
class RTVILLMFunctionCallStartMessageData(BaseModel):
"""Data for LLM function call start notification.
@@ -752,6 +941,11 @@ class RTVIObserver(BaseObserver):
elif isinstance(frame, RTVIServerMessageFrame):
message = RTVIServerMessage(data=frame.data)
await self.push_transport_message_urgent(message)
elif isinstance(frame, RTVIServerResponseFrame):
if frame.error is not None:
await self._send_error_response(frame)
else:
await self._send_server_response(frame)
if mark_as_seen:
self._frames_seen.add(frame.id)
@@ -879,6 +1073,22 @@ class RTVIObserver(BaseObserver):
message = RTVIMetricsMessage(data=metrics)
await self.push_transport_message_urgent(message)
async def _send_server_response(self, frame: RTVIServerResponseFrame):
"""Send a response to the client for a specific request."""
message = RTVIServerResponse(
id=str(frame.client_msg.msg_id),
data=RTVIRawServerResponseData(t=frame.client_msg.type, d=frame.data),
)
await self.push_transport_message_urgent(message)
async def _send_error_response(self, frame: RTVIServerResponseFrame):
"""Send a response to the client for a specific request."""
if self._params.errors_enabled:
message = RTVIErrorResponse(
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
)
await self.push_transport_message_urgent(message)
class RTVIProcessor(FrameProcessor):
"""Main processor for handling RTVI protocol messages and actions.
@@ -908,6 +1118,7 @@ class RTVIProcessor(FrameProcessor):
self._bot_ready = False
self._client_ready = False
self._client_ready_id = ""
self._client_version = []
self._errors_enabled = True
self._registered_actions: Dict[str, RTVIAction] = {}
@@ -921,6 +1132,7 @@ class RTVIProcessor(FrameProcessor):
self._register_event_handler("on_bot_started")
self._register_event_handler("on_client_ready")
self._register_event_handler("on_client_message")
self._input_transport = None
self._transport = transport
@@ -936,6 +1148,15 @@ class RTVIProcessor(FrameProcessor):
Args:
action: The action to register.
"""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"The actions API is deprecated, use server and client messages instead.",
DeprecationWarning,
)
id = self._action_id(action.service, action.action)
self._registered_actions[id] = action
@@ -945,6 +1166,15 @@ class RTVIProcessor(FrameProcessor):
Args:
service: The service to register.
"""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"The actions API is deprecated, use server and client messages instead.",
DeprecationWarning,
)
self._registered_services[service.name] = service
async def set_client_ready(self):
@@ -970,6 +1200,22 @@ class RTVIProcessor(FrameProcessor):
"""Send a bot interruption frame upstream."""
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
async def send_server_message(self, data: Any):
"""Send a server message to the client."""
message = RTVIServerMessage(data=data)
await self._send_server_message(message)
async def send_server_response(self, client_msg: RTVIClientMessage, data: Any):
"""Send a server response for a given client message."""
message = RTVIServerResponse(
id=client_msg.msg_id, data=RTVIRawServerResponseData(t=client_msg.type, d=data)
)
await self._send_server_message(message)
async def send_error_response(self, client_msg: RTVIClientMessage, error: str):
"""Send an error response for a given client message."""
await self._send_error_response(id=client_msg.msg_id, error=error)
async def send_error(self, error: str):
"""Send an error message to the client.
@@ -1013,9 +1259,6 @@ class RTVIProcessor(FrameProcessor):
function_name: Name of the function being called.
llm: The LLM processor making the call.
context: The LLM context.
Note:
This method is deprecated. Use handle_function_call() instead.
"""
import warnings
@@ -1136,7 +1379,15 @@ class RTVIProcessor(FrameProcessor):
try:
match message.type:
case "client-ready":
await self._handle_client_ready(message.id)
data = None
try:
data = RTVIClientReadyData.model_validate(message.data)
except ValidationError:
# Not all clients have been updated to RTVI 1.0.0.
# For now, that's okay, we just log their info as unknown.
data = None
pass
await self._handle_client_ready(message.id, data)
case "describe-actions":
await self._handle_describe_actions(message.id)
case "describe-config":
@@ -1148,6 +1399,9 @@ class RTVIProcessor(FrameProcessor):
await self._handle_update_config(message.id, update_config)
case "disconnect-bot":
await self.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
case "client-message":
data = RTVIRawClientMessageData.model_validate(message.data)
await self._handle_client_message(message.id, data)
case "action":
action = RTVIActionRun.model_validate(message.data)
action_frame = RTVIActionFrame(message_id=message.id, rtvi_action_run=action)
@@ -1155,6 +1409,9 @@ class RTVIProcessor(FrameProcessor):
case "llm-function-call-result":
data = RTVILLMFunctionCallResultData.model_validate(message.data)
await self._handle_function_call_result(data)
case "append-to-context":
data = RTVIAppendToContextData.model_validate(message.data)
await self._handle_update_context(data)
case "raw-audio" | "raw-audio-batch":
await self._handle_audio_buffer(message.data)
@@ -1168,9 +1425,20 @@ class RTVIProcessor(FrameProcessor):
await self._send_error_response(message.id, f"Exception processing message: {e}")
logger.warning(f"Exception processing message: {e}")
async def _handle_client_ready(self, request_id: str):
"""Handle a client-ready message."""
logger.debug("Received client-ready")
async def _handle_client_ready(self, request_id: str, data: RTVIClientReadyData | None):
"""Handle the client-ready message from the client."""
version = data.version if data else "unknown"
logger.debug(f"Received client-ready: version {version}")
if version == "unknown":
self._client_version = [0, 3, 0] # Default to 0.3.0 if unknown
else:
try:
self._client_version = [int(v) for v in version.split(".")]
except ValueError:
logger.warning(f"Invalid client version format: {version}")
self._client_version = [0, 3, 0]
about = data.about if data else {"library": "unknown"}
logger.debug(f"Client Details: {about}")
if self._input_transport:
await self._input_transport.start_audio_in_streaming()
@@ -1201,18 +1469,45 @@ class RTVIProcessor(FrameProcessor):
async def _handle_describe_config(self, request_id: str):
"""Handle a describe-config request."""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Configuration helpers are deprecated. If your application needs this behavior, use custom server and client messages.",
DeprecationWarning,
)
services = list(self._registered_services.values())
message = RTVIDescribeConfig(id=request_id, data=RTVIDescribeConfigData(config=services))
await self._push_transport_message(message)
async def _handle_describe_actions(self, request_id: str):
"""Handle a describe-actions request."""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"The Actions API is deprecated, use custom server and client messages instead.",
DeprecationWarning,
)
actions = list(self._registered_actions.values())
message = RTVIDescribeActions(id=request_id, data=RTVIDescribeActionsData(actions=actions))
await self._push_transport_message(message)
async def _handle_get_config(self, request_id: str):
"""Handle a get-config request."""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Configuration helpers are deprecated. If your application needs this behavior, use custom server and client messages.",
DeprecationWarning,
)
message = RTVIConfigResponse(id=request_id, data=self._config)
await self._push_transport_message(message)
@@ -1230,6 +1525,15 @@ class RTVIProcessor(FrameProcessor):
async def _update_service_config(self, config: RTVIServiceConfig):
"""Update configuration for a specific service."""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Configuration helpers are deprecated. If your application needs this behavior, use custom server and client messages.",
DeprecationWarning,
)
service = self._registered_services[config.service]
for option in config.options:
handler = service._options_dict[option.name].handler
@@ -1238,6 +1542,15 @@ class RTVIProcessor(FrameProcessor):
async def _update_config(self, data: RTVIConfig, interrupt: bool):
"""Update the RTVI configuration."""
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Configuration helpers are deprecated. If your application needs this behavior, use custom server and client messages.",
DeprecationWarning,
)
if interrupt:
await self.interrupt_bot()
for service_config in data.config:
@@ -1248,6 +1561,33 @@ class RTVIProcessor(FrameProcessor):
await self._update_config(RTVIConfig(config=data.config), data.interrupt)
await self._handle_get_config(request_id)
async def _handle_update_context(self, data: RTVIAppendToContextData):
if data.run_immediately:
await self.interrupt_bot()
frame = LLMMessagesAppendFrame(
messages=[{"role": data.role, "content": data.content}],
run_llm=data.run_immediately,
)
await self.push_frame(frame)
async def _handle_client_message(self, msg_id: str, data: RTVIRawClientMessageData):
"""Handle a client message frame."""
if not data:
await self._send_error_response(msg_id, "Malformed client message")
return
# Create a RTVIClientMessageFrame to push the message
frame = RTVIClientMessageFrame(msg_id=msg_id, type=data.t, data=data.d)
await self.push_frame(frame)
await self._call_event_handler(
"on_client_message",
RTVIClientMessage(
msg_id=msg_id,
type=data.t,
data=data.d,
),
)
async def _handle_function_call_result(self, data):
"""Handle a function call result from the client."""
frame = FunctionCallResultFrame(
@@ -1278,12 +1618,19 @@ class RTVIProcessor(FrameProcessor):
async def _send_bot_ready(self):
"""Send the bot-ready message to the client."""
config = None
if self._client_version[0] < 1:
config = self._config.config
message = RTVIBotReady(
id=self._client_ready_id,
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=self._config.config),
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=config),
)
await self._push_transport_message(message)
async def _send_server_message(self, message: RTVIServerMessage | RTVIServerResponse):
"""Send a message or response to the client."""
await self._push_transport_message(message)
async def _send_error_frame(self, frame: ErrorFrame):
"""Send an error frame as an RTVI error message."""
if self._errors_enabled:

View File

@@ -108,6 +108,10 @@ class ExotelFrameSerializer(FrameSerializer):
serialized_data = await self._output_resampler.resample(
data, frame.sample_rate, self._exotel_sample_rate
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("ascii")
answer = {
@@ -144,6 +148,9 @@ class ExotelFrameSerializer(FrameSerializer):
self._exotel_sample_rate,
self._sample_rate,
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
# Input: Exotel takes PCM data, so just resample to match sample_rate
audio_frame = InputAudioRawFrame(

View File

@@ -132,6 +132,10 @@ class PlivoFrameSerializer(FrameSerializer):
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "playAudio",
@@ -227,6 +231,10 @@ class PlivoFrameSerializer(FrameSerializer):
deserialized_data = await ulaw_to_pcm(
payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -155,6 +155,10 @@ class TelnyxFrameSerializer(FrameSerializer):
else:
raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "media",
@@ -262,6 +266,10 @@ class TelnyxFrameSerializer(FrameSerializer):
else:
raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -132,6 +132,10 @@ class TwilioFrameSerializer(FrameSerializer):
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "media",
@@ -235,6 +239,10 @@ class TwilioFrameSerializer(FrameSerializer):
deserialized_data = await ulaw_to_pcm(
payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -44,6 +44,7 @@ from .models import (
try:
import websockets
from websockets.asyncio.client import connect as websocket_connect
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error('In order to use AssemblyAI, you need to `pip install "pipecat-ai[assemblyai]"`.')
@@ -190,9 +191,9 @@ class AssemblyAISTTService(STTService):
"Authorization": self._api_key,
"User-Agent": f"AssemblyAI/1.0 (integration=Pipecat/{pipecat_version})",
}
self._websocket = await websockets.connect(
self._websocket = await websocket_connect(
ws_url,
extra_headers=headers,
additional_headers=headers,
)
self._connected = True
self._receive_task = self.create_task(self._receive_task_handler())

View File

@@ -55,7 +55,7 @@ from pipecat.services.llm_service import LLMService
from pipecat.utils.tracing.service_decorators import traced_llm
try:
import boto3
import aioboto3
import httpx
from botocore.config import Config
except ModuleNotFoundError as e:
@@ -749,13 +749,17 @@ class AWSBedrockLLMService(LLMService):
read_timeout=300, # 5 minutes
retries={"max_attempts": 3},
)
session = boto3.Session(
aws_access_key_id=aws_access_key,
aws_secret_access_key=aws_secret_key,
aws_session_token=aws_session_token,
region_name=aws_region,
)
self._client = session.client(service_name="bedrock-runtime", config=client_config)
self._aws_session = aioboto3.Session()
# Store AWS session parameters for creating client in async context
self._aws_params = {
"aws_access_key_id": aws_access_key,
"aws_secret_access_key": aws_secret_key,
"aws_session_token": aws_session_token,
"region_name": aws_region,
"config": client_config,
}
self.set_model_name(model)
self._settings = {
@@ -903,70 +907,74 @@ class AWSBedrockLLMService(LLMService):
logger.debug(f"Calling AWS Bedrock model with: {request_params}")
# Call AWS Bedrock with streaming
response = self._client.converse_stream(**request_params)
async with self._aws_session.client(
service_name="bedrock-runtime", **self._aws_params
) as client:
# Call AWS Bedrock with streaming
response = await client.converse_stream(**request_params)
await self.stop_ttfb_metrics()
await self.stop_ttfb_metrics()
# Process the streaming response
tool_use_block = None
json_accumulator = ""
# Process the streaming response
tool_use_block = None
json_accumulator = ""
function_calls = []
for event in response["stream"]:
self.reset_watchdog()
function_calls = []
# Handle text content
if "contentBlockDelta" in event:
delta = event["contentBlockDelta"]["delta"]
if "text" in delta:
await self.push_frame(LLMTextFrame(delta["text"]))
completion_tokens_estimate += self._estimate_tokens(delta["text"])
elif "toolUse" in delta and "input" in delta["toolUse"]:
# Handle partial JSON for tool use
json_accumulator += delta["toolUse"]["input"]
completion_tokens_estimate += self._estimate_tokens(
delta["toolUse"]["input"]
)
async for event in response["stream"]:
self.reset_watchdog()
# Handle tool use start
elif "contentBlockStart" in event:
content_block_start = event["contentBlockStart"]["start"]
if "toolUse" in content_block_start:
tool_use_block = {
"id": content_block_start["toolUse"].get("toolUseId", ""),
"name": content_block_start["toolUse"].get("name", ""),
}
json_accumulator = ""
# Handle text content
if "contentBlockDelta" in event:
delta = event["contentBlockDelta"]["delta"]
if "text" in delta:
await self.push_frame(LLMTextFrame(delta["text"]))
completion_tokens_estimate += self._estimate_tokens(delta["text"])
elif "toolUse" in delta and "input" in delta["toolUse"]:
# Handle partial JSON for tool use
json_accumulator += delta["toolUse"]["input"]
completion_tokens_estimate += self._estimate_tokens(
delta["toolUse"]["input"]
)
# Handle message completion with tool use
elif "messageStop" in event and "stopReason" in event["messageStop"]:
if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
try:
arguments = json.loads(json_accumulator) if json_accumulator else {}
# Handle tool use start
elif "contentBlockStart" in event:
content_block_start = event["contentBlockStart"]["start"]
if "toolUse" in content_block_start:
tool_use_block = {
"id": content_block_start["toolUse"].get("toolUseId", ""),
"name": content_block_start["toolUse"].get("name", ""),
}
json_accumulator = ""
# Only call function if it's not the no_operation tool
if not using_noop_tool:
function_calls.append(
FunctionCallFromLLM(
context=context,
tool_call_id=tool_use_block["id"],
function_name=tool_use_block["name"],
arguments=arguments,
# Handle message completion with tool use
elif "messageStop" in event and "stopReason" in event["messageStop"]:
if event["messageStop"]["stopReason"] == "tool_use" and tool_use_block:
try:
arguments = json.loads(json_accumulator) if json_accumulator else {}
# Only call function if it's not the no_operation tool
if not using_noop_tool:
function_calls.append(
FunctionCallFromLLM(
context=context,
tool_call_id=tool_use_block["id"],
function_name=tool_use_block["name"],
arguments=arguments,
)
)
)
else:
logger.debug("Ignoring no_operation tool call")
except json.JSONDecodeError:
logger.error(f"Failed to parse tool arguments: {json_accumulator}")
else:
logger.debug("Ignoring no_operation tool call")
except json.JSONDecodeError:
logger.error(f"Failed to parse tool arguments: {json_accumulator}")
# Handle usage metrics if available
if "metadata" in event and "usage" in event["metadata"]:
usage = event["metadata"]["usage"]
prompt_tokens += usage.get("inputTokens", 0)
completion_tokens += usage.get("outputTokens", 0)
cache_read_input_tokens += usage.get("cacheReadInputTokens", 0)
cache_creation_input_tokens += usage.get("cacheWriteInputTokens", 0)
# Handle usage metrics if available
if "metadata" in event and "usage" in event["metadata"]:
usage = event["metadata"]["usage"]
prompt_tokens += usage.get("inputTokens", 0)
completion_tokens += usage.get("outputTokens", 0)
cache_read_input_tokens += usage.get("cacheReadInputTokens", 0)
cache_creation_input_tokens += usage.get("cacheWriteInputTokens", 0)
await self.run_function_calls(function_calls)
except asyncio.CancelledError:

View File

@@ -36,6 +36,8 @@ from pipecat.utils.tracing.service_decorators import traced_stt
try:
import websockets
from websockets.asyncio.client import connect as websocket_connect
from websockets.protocol import State
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
@@ -133,7 +135,7 @@ class AWSTranscribeSTTService(STTService):
while retry_count < max_retries:
try:
await self._connect()
if self._ws_client and self._ws_client.open:
if self._ws_client and self._ws_client.state is State.OPEN:
logger.info("Successfully established WebSocket connection")
return
logger.warning("WebSocket connection not established after connect")
@@ -174,7 +176,7 @@ class AWSTranscribeSTTService(STTService):
"""
try:
# Ensure WebSocket is connected
if not self._ws_client or not self._ws_client.open:
if not self._ws_client or self._ws_client.state is State.CLOSED:
logger.debug("WebSocket not connected, attempting to reconnect...")
try:
await self._connect()
@@ -208,7 +210,7 @@ class AWSTranscribeSTTService(STTService):
async def _connect(self):
"""Connect to AWS Transcribe with connection state management."""
if self._ws_client and self._ws_client.open and self._receive_task:
if self._ws_client and self._ws_client.state is State.OPEN and self._receive_task:
logger.debug(f"{self} Already connected")
return
@@ -238,7 +240,7 @@ class AWSTranscribeSTTService(STTService):
)
# Add required headers
extra_headers = {
additional_headers = {
"Origin": "https://localhost",
"Sec-WebSocket-Key": websocket_key,
"Sec-WebSocket-Version": "13",
@@ -268,9 +270,9 @@ class AWSTranscribeSTTService(STTService):
logger.debug(f"{self} Connecting to WebSocket with URL: {presigned_url[:100]}...")
# Connect with the required headers and settings
self._ws_client = await websockets.connect(
self._ws_client = await websocket_connect(
presigned_url,
extra_headers=extra_headers,
additional_headers=additional_headers,
subprotocols=["mqtt"],
ping_interval=None,
ping_timeout=None,
@@ -299,7 +301,7 @@ class AWSTranscribeSTTService(STTService):
self._receive_task = None
try:
if self._ws_client and self._ws_client.open:
if self._ws_client and self._ws_client.state is State.OPEN:
# Send end-stream message
end_stream = {"message-type": "event", "event": "end"}
await self._ws_client.send(json.dumps(end_stream))
@@ -341,7 +343,7 @@ class AWSTranscribeSTTService(STTService):
async def _receive_loop(self):
"""Background task to receive and process messages from AWS Transcribe."""
while True:
if not self._ws_client or not self._ws_client.open:
if not self._ws_client or self._ws_client.state is State.CLOSED:
logger.warning(f"{self} WebSocket closed in receive loop")
break

Some files were not shown because too many files have changed in this diff Show More