Compare commits

...

756 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
53f675f5cf Merge pull request #727 from pipecat-ai/aleix/pipecat-0.0.49
update CHANGELOG for 0.0.49
2024-11-18 06:27:12 +08:00
Aleix Conchillo Flaqué
8173e4ce55 update CHANGELOG for 0.0.49 2024-11-17 23:26:09 +01:00
Aleix Conchillo Flaqué
5445bb0363 rtvi: add on_bot_started event 2024-11-17 22:40:00 +01:00
Mark Backman
a2a94724e5 Merge pull request #725 from pipecat-ai/mb/fix-simple-chatbot
Fix simple-chatbot example
2024-11-16 12:10:05 -05:00
Aleix Conchillo Flaqué
a8f9b0635a Merge pull request #722 from pipecat-ai/aleix/more-dailin-events
transports(daily): add more dial-in events
2024-11-17 01:09:01 +08:00
Mark Backman
4273a31fd5 Fix simple-chatbot example 2024-11-16 07:48:42 -05:00
Aleix Conchillo Flaqué
67f975a2c8 transports(daily): add more dial-in events 2024-11-16 01:22:50 +01:00
Mark Backman
d0bca67666 Merge pull request #716 from pipecat-ai/mb/mute-stt-service
Add STTMuteFilter to un/mute the STT
2024-11-14 19:55:00 -05:00
Mark Backman
966974bfc6 Change STTMuteProcessor to STTMuteFilter 2024-11-14 19:47:37 -05:00
Mark Backman
f807f233bd Suppress UserStartedSpeakingFrame and UserStoppedSpeakingFrame when muted 2024-11-14 17:11:51 -05:00
Mark Backman
33108f5798 Code review feedback 2024-11-14 17:05:08 -05:00
Mark Backman
52de825af8 Update CHANGELOG 2024-11-14 13:47:08 -05:00
Mark Backman
5fe679039c Add STTMuteProcessor to un/mute the STT 2024-11-14 13:35:02 -05:00
Kwindla Hultman Kramer
534f710f5d Merge pull request #688 from pipecat-ai/khk/natural-conversation
More work on llm-as-judge phrase endpointing
2024-11-14 09:15:16 -08:00
Mark Backman
53a11744a8 Merge pull request #712 from pipecat-ai/aleix/some-languages-tweaks
some languages tweaks
2024-11-14 09:33:26 -05:00
Mark Backman
72412cc0c4 Code review feedback 2024-11-14 09:31:04 -05:00
Mark Backman
b77ac07bc6 Merge pull request #715 from pipecat-ai/mb/update-readme-2
Add visual divider below Pipecat README image
2024-11-14 08:54:25 -05:00
Mark Backman
eb6926e0ce Add visual divider below Pipecat README image 2024-11-14 08:51:07 -05:00
Mark Backman
3b2c9de944 Merge pull request #713 from pipecat-ai/mb/update-readme
Update README
2024-11-14 08:45:28 -05:00
Mark Backman
27ff868e5a Move CONTRIBUTING to top directory 2024-11-14 08:43:03 -05:00
Mark Backman
57ef525a8e Update README 2024-11-14 08:43:03 -05:00
Aleix Conchillo Flaqué
d1db54d5fe examples(playht): use a 2.0 engine 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
4f88fc0eb8 services(tts): initialize language to the proper language code 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
37d1f4c4e1 services(tts): some language to service language cleanup 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
ef9e86d997 services(playht): make sure we only skip wav header no matter the size 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
2d2ef5a417 services(playht): voice engine is Play3.0-mini 2024-11-13 17:19:23 +01:00
Aleix Conchillo Flaqué
c1fff00586 services(playht): fix language codes 2024-11-13 17:19:23 +01:00
Mark Backman
0af2196f50 Merge pull request #708 from pipecat-ai/mb/add-rime-ai
Add RimeTTSService
2024-11-12 18:29:53 -05:00
Mark Backman
cd42320788 Update changelog 2024-11-12 18:28:04 -05:00
Mark Backman
70fce52499 Merge pull request #710 from pipecat-ai/mb/update-readme-krisp
Update Krisp README instructions
2024-11-12 11:15:25 -05:00
Mark Backman
70b60c0593 Update Krisp README instructions 2024-11-12 10:26:12 -05:00
Jon Taylor
2d8aa03f31 Merge pull request #706 from pipecat-ai/jpt/modal-example
barebones modal.com deployment example
2024-11-12 11:41:00 +00:00
Kwindla Hultman Kramer
581ff26704 Merge pull request #707 from pipecat-ai/khk/clean-up
tiny PR to remove old comment lines
2024-11-11 21:14:16 -08:00
Kwindla Hultman Kramer
335178ff06 some gemini audio input examples 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
ee53535f41 gemini audio-in with no transcription 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
91ac40307e small fix and more prompt examples 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
b6c2c1f730 anthropic natural conversation example using claude haiku 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
b56c789ae4 fixes for proposed judge pipeline 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
bd435d9e62 missing commit 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
55a81df84f contributing to llm-as-judge phrase endpointing work 2024-11-11 21:04:50 -08:00
Kwindla Hultman Kramer
87434460f5 temp hacking 2024-11-11 21:04:50 -08:00
Mark Backman
958ec42e8d Add Rime.ai TTS service 2024-11-11 21:58:09 -05:00
Jon Taylor
d1fff60d1d barebones modal.com deployment example 2024-11-11 22:30:07 +00:00
Kwindla Hultman Kramer
1438e5654a remove old comment 2024-11-10 16:08:10 -08:00
Aleix Conchillo Flaqué
1d4be0139a Merge pull request #705 from pipecat-ai/aleix/prepare-0.0.48
update CHANGELOG for 0.0.48
2024-11-10 14:08:33 -08:00
Aleix Conchillo Flaqué
f58c3ee322 update CHANGELOG for 0.0.48 2024-11-10 23:01:03 +01:00
Aleix Conchillo Flaqué
379750df91 Merge pull request #704 from pipecat-ai/aleix/cartesia-tts-stopped-frame
services(cartesia): generated TTSStoppedFrame after no more audio
2024-11-10 05:17:36 -08:00
Aleix Conchillo Flaqué
d125a38737 services(cartesia): generated TTSStoppedFrame after no more audio
The TTSStoppedFrame should be generated when the TTS services stoped generating
audio not when the bot stops speaking.
2024-11-10 09:55:45 +01:00
Mark Backman
446bb0aeaf Merge pull request #702 from pipecat-ai/mb/azure-websocket
Add an Azure TTS websocket service
2024-11-09 17:41:53 -05:00
Aleix Conchillo Flaqué
d839080834 Merge pull request #642 from pipecat-ai/aleix/input-queues-block-frames
introduce frame processor input queues block frames
2024-11-09 14:30:17 -08:00
Mark Backman
9b85d0642b Add a changelog entry 2024-11-09 12:37:29 -05:00
Mark Backman
230b51a117 Add an Azure TTS websocket service 2024-11-09 12:37:29 -05:00
Mark Backman
3a965ca396 Merge pull request #701 from pipecat-ai/khk/anthropic-function-calling-fix
fixes for anthropic function calling
2024-11-09 06:39:34 -05:00
Kwindla Hultman Kramer
33fc5bf990 improved 20c-persistent-context-anthropic.py 2024-11-08 16:42:30 -08:00
Kwindla Hultman Kramer
a54ca08405 fixes for anthropic function calling 2024-11-08 16:33:02 -08:00
Filipi da Silva Fuchter
4379db43ed Merge pull request #689 from pipecat-ai/filipi/krisp
Making pipecat work with Krisp
2024-11-08 16:22:52 -03:00
Filipi Fuchter
e915c676aa Added support for Krisp audio filter 2024-11-08 16:18:10 -03:00
Mark Backman
e0a003afa1 Merge pull request #695 from pipecat-ai/mb/initialize-azure-lang
Initialize the speech_recognition_language for Azure TTS
2024-11-08 06:40:40 -05:00
James Hush
d5666727ce feat: toggle looping with soundfile mixer (#693)
* feat: toggle looping with soundfile mixer

* Implement PR changes
2024-11-07 21:08:37 -08:00
Mark Backman
f6d7402530 Update changelog 2024-11-07 15:16:03 -05:00
Mark Backman
aefe190c9f Initialize the speech_recognition_language for Azure TTS 2024-11-07 15:14:05 -05:00
Vanessa Pyne
29925a8f21 Merge pull request #551 from Allenmylath/patch-3
Frame types and short descriptionCreate Frames.md
2024-11-07 10:05:32 -06:00
Aleix Conchillo Flaqué
beb3271168 services(tts): make sure word timestamp is reset properly 2024-11-06 18:54:12 -08:00
Aleix Conchillo Flaqué
b959ac6e1e Merge pull request #694 from pipecat-ai/aleix/daily-add-on-transcription-message
transports(daily): call on_transcription_message event handler
2024-11-06 15:21:17 -08:00
Aleix Conchillo Flaqué
17f4286942 transports(daily): call on_transcription_message event handler 2024-11-06 15:10:58 -08:00
Aleix Conchillo Flaqué
ce89bbb16e tts(elevenlabs): support pausing and resuming frames while speaking 2024-11-06 14:38:33 -08:00
Aleix Conchillo Flaqué
865768039b processors: remove block_on_frames and add pause_processing_frames() instead 2024-11-06 14:20:25 -08:00
Aleix Conchillo Flaqué
7071482583 try to use queue_frame() instead of process_frame() 2024-11-06 14:18:21 -08:00
Aleix Conchillo Flaqué
5353d13151 update CHANGELOG 2024-11-06 13:16:58 -08:00
Aleix Conchillo Flaqué
a9e565f355 processors: fix input queue interruptions 2024-11-06 13:12:24 -08:00
Aleix Conchillo Flaqué
b6f0c16591 examples: restore EndFrame() on 01 and 02 foundational 2024-11-06 13:05:03 -08:00
Aleix Conchillo Flaqué
49005d02f5 services(tts): use TTSSpeakFrame in say() method 2024-11-06 13:05:03 -08:00
Aleix Conchillo Flaqué
6d8b885071 transports(base_output): push bot started/stopped frames downstream 2024-11-06 13:04:37 -08:00
Aleix Conchillo Flaqué
2eccb33e73 processors: allow passing a callback when queued frame is processed 2024-11-06 13:04:37 -08:00
Aleix Conchillo Flaqué
22ca4c5a02 processors: cancel input task and empty queue with interruptions 2024-11-06 13:04:37 -08:00
Aleix Conchillo Flaqué
84f26ac1ca processors: introduce input queues
Frame processors can now decide if they should continue processing frames or
not, and if so also decide when to continue processing frames. For example,
asynchronous TTS services will stop processing frames until they have generated
all the audio for an LLM response.
2024-11-06 12:13:49 -08:00
Aleix Conchillo Flaqué
74937411e6 Merge pull request #691 from pipecat-ai/aleix/rtvi-manual-bot-ready
rtvi: bot-ready message needs to be sent manual
2024-11-06 10:53:25 -08:00
Aleix Conchillo Flaqué
8aab068ffd rtvi: bot-ready message needs to be sent manual 2024-11-05 10:52:54 -08:00
Aleix Conchillo Flaqué
bd50201ce4 transports(daily): just make it clear we subscribe to camera 2024-11-04 17:32:46 -08:00
Aleix Conchillo Flaqué
6082da284e Merge pull request #611 from pipecat-ai/aleix/audio-filters
introduce audio filters
2024-11-04 16:34:47 -08:00
Aleix Conchillo Flaqué
358c458265 transports(base_input): handle filter contorl frames 2024-11-04 16:19:52 -08:00
Aleix Conchillo Flaqué
807dbbe326 audio(noisereduce): allow enabling/disabling filter 2024-11-04 16:13:29 -08:00
Aleix Conchillo Flaqué
3c116b291d audio(mixers): some cosmetics 2024-11-04 15:37:08 -08:00
Aleix Conchillo Flaqué
0dd413ee90 audio(filters): add noisereduce filter 2024-11-04 15:37:08 -08:00
Aleix Conchillo Flaqué
abc8ede3d7 introduce audio filters 2024-11-04 15:37:08 -08:00
Aleix Conchillo Flaqué
126324ca1b Merge pull request #687 from pipecat-ai/aleix/transport-audio-mixers
introduce transport audio mixers
2024-11-04 13:14:36 -08:00
Aleix Conchillo Flaqué
602915ae18 examples(websocket-server): allow interruptions 2024-11-04 13:05:02 -08:00
Aleix Conchillo Flaqué
0ac9e2dd3f transports(network): synchronize with time before sending data 2024-11-04 13:04:18 -08:00
Aleix Conchillo Flaqué
a9ef5ca95d examples: add bot background sound example 2024-11-03 11:13:02 -08:00
Aleix Conchillo Flaqué
81c476dd4c introduce output transport audio mixers 2024-11-03 11:13:02 -08:00
Kwindla Hultman Kramer
151242d3a0 Merge pull request #666 from pipecat-ai/khk/realtime-pipecat-vad
Support using Pipecat turn detection instead of OpenAI Realtime API turn detection
2024-11-02 08:36:31 -07:00
Kwindla Hultman Kramer
93c6e5098c added comment explaining config of TurnDetection 2024-11-02 08:24:54 -07:00
Aleix Conchillo Flaqué
4455b2a428 rtvi: create queues before tasks 2024-11-01 23:06:50 -07:00
Aleix Conchillo Flaqué
94062592ef base_output: generate smaller audio frames of the same incoming type 2024-11-01 23:06:50 -07:00
Aleix Conchillo Flaqué
d2401a76c8 base_output: only generate bot speaking with TTS audio frames 2024-11-01 23:06:50 -07:00
Aleix Conchillo Flaqué
e2b1b56e86 examples: don't require room token if using an STT 2024-11-01 23:06:50 -07:00
Mark Backman
84bd767312 Merge pull request #685 from pipecat-ai/mb/add-recording-events
Add recording events and callbacks
2024-11-01 12:02:46 -04:00
Mark Backman
802c29e9e1 Add recording events and callbacks 2024-11-01 10:20:00 -04:00
Aleix Conchillo Flaqué
f83381860c Merge pull request #677 from pipecat-ai/aleix/add-notifier-and-notifier-filters
add notifiers and more frame filters
2024-10-31 15:55:07 -07:00
Aleix Conchillo Flaqué
4dad1bfe49 examples: add foundational/22-natural-conversation.py 2024-10-31 12:10:33 -07:00
marcus-daily
9ee8896b64 Removing unnecessary ruff arguments from README 2024-10-31 18:02:29 +00:00
marcus-daily
5f7a2f66d4 Add .idea to .gitignore 2024-10-31 18:02:29 +00:00
marcus-daily
76e5f1e847 Remove unnecessary ruff params in CI 2024-10-31 15:07:28 +00:00
marcus-daily
6975340d6c Set Ruff config for the project 2024-10-31 15:07:28 +00:00
marcus-daily
0f4cf56418 Load dotenv in simple chatbot server (fixes #415) 2024-10-31 12:08:30 +00:00
Aleix Conchillo Flaqué
018e51e8a3 add notifiers and more frame filters 2024-10-30 16:36:17 -07:00
Vanessa Pyne
b050143952 Merge pull request #676 from RonakAgarwalVani/fix/chunk-choices-delta-none
Fix uncaught exception when accessing 'tool_calls' in NoneType delta in response handling
2024-10-30 14:44:32 -05:00
Mark Backman
98ea1f0791 Merge pull request #675 from pipecat-ai/mb/playht-add-request-id
Add a request_id to each TTS sequence
2024-10-30 13:56:15 -04:00
Mark Backman
8272c35527 Use a request_id in TTS commands for the PlayHT websocket service 2024-10-30 13:54:18 -04:00
Mark Backman
e973e82e05 Merge pull request #672 from pipecat-ai/mb/fix-playht
Fix PlayHT TTFB metrics
2024-10-30 13:53:02 -04:00
RonakAgarwalVani
d1396bf618 Update openai.py 2024-10-30 14:26:49 +05:30
Vanessa Pyne
8186e423de Merge pull request #637 from pipecat-ai/vp-issue-template
docs: add ISSUE_TEMPLATE.md
2024-10-29 15:08:42 -05:00
vipyne
3010addb8b docs: add CONTRIBUTING.md 2024-10-29 15:03:07 -05:00
vipyne
029e0d391e docs: add ISSUE_TEMPLATE.md 2024-10-29 15:03:07 -05:00
Vanessa Pyne
bf31223577 Merge pull request #671 from pipecat-ai/vp-issue-635
docs: small fix
2024-10-29 14:34:13 -05:00
vipyne
42cc79154f docs: small fix 2024-10-29 14:33:57 -05:00
Mark Backman
05b857006a Update changelog 2024-10-28 20:56:29 -04:00
Mark Backman
2e57d21b89 Fix ttfb metrics 2024-10-28 20:27:24 -04:00
Aleix Conchillo Flaqué
fa05ec46be Merge pull request #667 from pipecat-ai/aleix/base-output-bot-speaking-detection
transports(base_output): use audio frames for bot speaking detection
2024-10-28 10:54:54 -07:00
Aleix Conchillo Flaqué
e3ce619284 transports(base_output): use audio frames for bot speaking detection 2024-10-28 10:07:37 -07:00
Vanessa Pyne
fb512dcd74 Merge pull request #630 from MoofSoup/update-readme
docs: simplify readme
2024-10-28 10:26:30 -05:00
Aleix Conchillo Flaqué
ca15d97383 Merge pull request #662 from pipecat-ai/aleix/daily-transport-async-functions
transports(daily): make functions async
2024-10-25 16:14:06 -07:00
Aleix Conchillo Flaqué
b32448e967 transports(daily): make functions async 2024-10-25 15:01:52 -07:00
Aleix Conchillo Flaqué
7e30da6183 Merge pull request #661 from pipecat-ai/aleix/allow-updating-subscritption-before
transports(daily): allow updating subscriptions before join
2024-10-25 15:00:34 -07:00
Aleix Conchillo Flaqué
a6dd2600d2 examples(tavus): await update_subscriptions 2024-10-25 14:56:56 -07:00
Aleix Conchillo Flaqué
b905b57dfc transports(daily): allow updating subscriptions before join 2024-10-25 14:46:17 -07:00
Kwindla Hultman Kramer
e1a7edfb58 make it possible to use Pipecat turn detection instead of OpenAI turn detection 2024-10-25 15:59:48 -05:00
Aleix Conchillo Flaqué
1b30b1fc23 Merge pull request #665 from pipecat-ai/aleix/fix-bot-started-stopped-speaking
transports(base_output): fix constant bot started/stopped speaking fr…
2024-10-25 13:00:38 -07:00
Aleix Conchillo Flaqué
55026898f6 transports(base_output): use vad stop secs for bot stopped speaking 2024-10-25 12:59:15 -07:00
Aleix Conchillo Flaqué
4283557894 audio(vad): expose params property 2024-10-25 12:59:15 -07:00
Aleix Conchillo Flaqué
5ab00e01aa transports(base_output): fix constant bot started/stopped speaking frames 2024-10-25 12:10:24 -07:00
Aleix Conchillo Flaqué
fcfc729e83 Merge pull request #664 from pipecat-ai/aleix/fix-aws-stuttering
services(aws): read stream and resample in a thread
2024-10-25 11:49:28 -07:00
Aleix Conchillo Flaqué
4eacb34fd8 services(aws): read stream and resample in a thread 2024-10-25 11:22:28 -07:00
Aleix Conchillo Flaqué
3a8aacccf7 Merge pull request #663 from pipecat-ai/aleix/audio-resampling-with-resampy
audio: use resamply for audio resampling
2024-10-25 10:16:20 -07:00
roey
54c0bf0c70 Adding TavusVideoService layer (#617)
Co-authored-by: roey <159067767+roey-tavus@users.noreply.github.com>
Co-authored-by: Mert Gerdan <mert@tavus.io>
Co-authored-by: Aleix Conchillo Flaqué <aleix@daily.co>
2024-10-25 09:46:25 -07:00
Aleix Conchillo Flaqué
778b05a252 audio: use resamply for audio resampling 2024-10-25 09:22:22 -07:00
Mark Backman
f16a416c2b Merge pull request #660 from pipecat-ai/mb/add-gemini-inputs
Add input params to Google Gemini
2024-10-24 20:58:19 -04:00
Aleix Conchillo Flaqué
1be63bccb8 Merge pull request #647 from pipecat-ai/aleix/daily-transport-only-transcribe-users
transport(daily): only transcribe users
2024-10-24 17:40:34 -07:00
Mark Backman
37820ac0df Add input params to Google Gemini 2024-10-24 20:12:41 -04:00
Aleix Conchillo Flaqué
8ea80d43f4 transports(daily): only transcribe user audio 2024-10-24 17:06:43 -07:00
Aleix Conchillo Flaqué
e117d70a00 update to daily-python 0.12.0 2024-10-24 16:49:19 -07:00
Aleix Conchillo Flaqué
2ba753272a Merge pull request #658 from pipecat-ai/aleix/default-to-24000-sample-rate
update TTS and transport output sample rate to 24000
2024-10-24 16:48:41 -07:00
Aleix Conchillo Flaqué
60c8c2f6e9 examples(15a): use daily transcription instead of local whisper 2024-10-24 16:47:41 -07:00
Aleix Conchillo Flaqué
cfb48200c2 services(azure): support sample rates 2024-10-24 16:47:35 -07:00
Aleix Conchillo Flaqué
6d317c6e8e audio: don't resample if same sample rate 2024-10-24 16:47:35 -07:00
Aleix Conchillo Flaqué
158d52856f transports(livekit): fix VADAnalyzer import 2024-10-24 16:47:35 -07:00
Aleix Conchillo Flaqué
92a69e404f update TTS and transport output sample rate to 24000 2024-10-24 16:47:35 -07:00
Aleix Conchillo Flaqué
d24c6185d8 Merge pull request #654 from pipecat-ai/aleix/daily-allow-completion-futures
transport(daily): allow completion futures
2024-10-24 14:28:53 -07:00
Mark Backman
1fd21578a6 Merge pull request #657 from pipecat-ai/mb/add-elevenlabs-output-format-type
Add ElevenLabs output format type
2024-10-24 17:07:04 -04:00
Mark Backman
700db87127 Merge pull request #656 from pipecat-ai/mb/add-gemini-metrics
Add Gemini token usage metrics
2024-10-24 17:04:56 -04:00
Mark Backman
6f1310569c Add ElevenLabs output format type 2024-10-24 17:03:45 -04:00
Aleix Conchillo Flaqué
14cedb0be8 Merge pull request #655 from pipecat-ai/aleix/fix-together-params
services(together): fix together AI InputParams
2024-10-24 13:51:38 -07:00
Mark Backman
fae97f9051 Add Gemini token usage metrics 2024-10-24 16:37:21 -04:00
Aleix Conchillo Flaqué
d930a46e64 services(together): fix together AI InputParams 2024-10-24 13:08:35 -07:00
Aleix Conchillo Flaqué
2e6b5d1843 transports(daily): fix aiohttp timeout 2024-10-24 11:44:30 -07:00
Aleix Conchillo Flaqué
88362db034 transports(daily): no more need for an output message queue 2024-10-24 11:44:30 -07:00
Aleix Conchillo Flaqué
f7f0c44c32 transports(daily): don't block event handlers 2024-10-24 11:44:30 -07:00
Mark Backman
33553b71d4 Merge pull request #653 from pipecat-ai/mb/align-tts-constructors
Align TTSService constructors
2024-10-24 13:52:43 -04:00
Mark Backman
be8ca505cd Merge pull request #652 from pipecat-ai/khk/more-gemini
Gemini new context manager and rewrite to use google data structures internally
2024-10-24 13:47:38 -04:00
Mark Backman
e957cce422 Align TTSService constructors 2024-10-24 13:42:06 -04:00
Mark Backman
418a13a4ec Merge pull request #650 from pipecat-ai/mb/assembly-fix
AssemblyAI: don't disconnect on language change
2024-10-24 11:26:56 -04:00
Mark Backman
fc445c0a1f Merge pull request #649 from pipecat-ai/mb/open-ai-max-tokens
Add max_tokens and max_completion_tokens inputs for OpenAI
2024-10-24 11:26:44 -04:00
Mark Backman
f0c65468ed AssemblyAI: don't disconnect on language change 2024-10-24 08:30:48 -04:00
Mark Backman
ce6a2bdcf7 Add max tokens inputs to OpenAI 2024-10-24 07:03:45 -04:00
Mark Backman
673542e235 Merge pull request #646 from pipecat-ai/mb/grok-function-calling
Support function calling for Grok
2024-10-23 21:56:38 -04:00
Kwindla Hultman Kramer
e032b0b70a gemini context aggregators 2024-10-23 18:44:09 -07:00
Mark Backman
e39f7e965b Support function calling for Grok 2024-10-23 17:22:26 -04:00
Mattie Ruth
d26751e968 add missing PipelineParams to enable the metrics (#645) 2024-10-23 16:46:46 -04:00
Aleix Conchillo Flaqué
e0ca4a9c23 Merge pull request #643 from pipecat-ai/aleix/daily-update-subscriptions
transports(daily): add update_subscriptions()
2024-10-22 17:07:07 -07:00
Aleix Conchillo Flaqué
801e52c095 transports(daily): add update_subscriptions() 2024-10-22 15:02:55 -07:00
Aleix Conchillo Flaqué
a46eaa838b Merge pull request #641 from pipecat-ai/aleix/prepare-0.0.47
prepare 0.0.47
2024-10-22 10:30:42 -07:00
Aleix Conchillo Flaqué
7c432499db update CHANGELOG for 0.0.47 2024-10-22 10:02:50 -07:00
Aleix Conchillo Flaqué
8d75fcc9f0 use warnings package to report deprecated code 2024-10-22 10:02:21 -07:00
Aleix Conchillo Flaqué
61d73f81ae Merge pull request #639 from pipecat-ai/aleix/daily-transcription-model
transport(daily): use "nova-2-general" for transcription
2024-10-22 09:40:43 -07:00
Aleix Conchillo Flaqué
951255def9 transport(daily): use "nova-2-general" for transcription 2024-10-22 09:40:03 -07:00
Moof Soup
bf5a7c3562 docs: Clarify README example and token usage
clarified readme example
2024-10-21 19:54:34 -07:00
Mark Backman
e556f34094 Merge pull request #638 from pipecat-ai/mb/fix-silero-vad-import
Fix Silero VAD import issue
2024-10-21 20:48:06 -04:00
Mark Backman
ccc3691620 Fix Silero VAD import issue 2024-10-21 20:39:20 -04:00
Vanessa Pyne
5321affda7 Merge pull request #588 from Allenmylath/patch-11
Update README.md
2024-10-21 11:20:05 -05:00
Mark Backman
e5ad8dc67b Merge pull request #627 from pipecat-ai/mb/upgrade-gladia-to-v2-api
Update GladiaSTTService to use the Gladia V2 API
2024-10-21 12:01:20 -04:00
Mark Backman
46927805bc Update GladiaSTTService to use the Gladia V2 API 2024-10-21 07:10:38 -04:00
Aleix Conchillo Flaqué
b6b1ef0a40 Merge pull request #589 from Allenmylath/patch-12
Update Dockerfile
2024-10-20 10:59:43 -07:00
Mark Backman
e62f762382 Merge pull request #625 from pipecat-ai/mb/add-assemblyai-stt
Add support for AssemblyAI STT
2024-10-20 13:59:33 -04:00
Aleix Conchillo Flaqué
dbfda14342 Merge pull request #587 from Allenmylath/patch-9
Update env.example
2024-10-20 10:58:50 -07:00
Aleix Conchillo Flaqué
fee85418cd Merge pull request #620 from gregschwartz/main
Start agent/call/bot at localhost root
2024-10-20 10:14:10 -07:00
Mark Backman
015faa3dbd Update CHANGELOG and README 2024-10-20 08:57:57 -04:00
Mark Backman
1dbf4ff27d Add AssemblyAI STT service 2024-10-20 08:57:57 -04:00
Aleix Conchillo Flaqué
4f1b2dce9b Merge pull request #624 from pvilchez/fix_enable_usage_metrics
Fixing `enable_usage_metrics` setting.
2024-10-20 01:00:12 -07:00
Paul Vilchez
5640bd9447 Fixing a config mismatch which caused usage stats to only report when enable_metrics was true. 2024-10-20 03:33:13 -04:00
Aleix Conchillo Flaqué
ee5ae0d631 Merge pull request #621 from pipecat-ai/aleix/prepare-0.0.46
update CHANGELOG for 0.0.46
2024-10-19 18:26:05 -07:00
Aleix Conchillo Flaqué
4b8a4b86fe update CHANGELOG for 0.0.46 2024-10-19 18:25:29 -07:00
Aleix Conchillo Flaqué
3556c9ce0f Merge pull request #618 from pipecat-ai/aleix/examples-switch-to-llm-context
examples: use OpenAILLMContext in all the examples
2024-10-19 18:24:39 -07:00
Aleix Conchillo Flaqué
f971dbe027 examples(audio-recording): record audio into a file 2024-10-19 18:24:00 -07:00
Aleix Conchillo Flaqué
3815e9dec3 examples: fix dialin-chatbot python arguments 2024-10-19 18:24:00 -07:00
Aleix Conchillo Flaqué
320f622255 examples: upgrade storytelling frontend packages 2024-10-19 18:24:00 -07:00
Aleix Conchillo Flaqué
be4bdabdf4 examples: use OpenAILLMContext in all the examples 2024-10-19 18:24:00 -07:00
Greg Schwartz
1fa52b62aa Put start agent/call at localhost root. Before you had to read in the docs to go to /start, or /start_call or /start_bot. Which isn't mentioned in the console output, and is inconsistent, adding friction to learning the codebase 2024-10-19 16:18:43 -07:00
Aleix Conchillo Flaqué
4f66e5d55f Merge pull request #619 from pipecat-ai/aleix/split-vad
move SileroVAD processor to processors package
2024-10-18 23:30:07 -07:00
Aleix Conchillo Flaqué
3502509d3e move SileroVAD processor to processors package 2024-10-18 23:28:29 -07:00
Aleix Conchillo Flaqué
d71ea1c0e0 Merge pull request #615 from DamienDeepgram/patch-1
Update default Deepgram model
2024-10-18 22:47:30 -07:00
Kwindla Hultman Kramer
07712cdb16 gemini function calling and partial implementation of standard context stuff 2024-10-18 17:14:57 -07:00
DamienDeepgram
13f232bafc Update default model 2024-10-18 15:33:50 -07:00
Aleix Conchillo Flaqué
9dd3354b89 Merge pull request #613 from pipecat-ai/aleix/examples-endframe
examples: use EndFrame() when the participant leaves
2024-10-18 11:18:26 -07:00
Aleix Conchillo Flaqué
8c006c24a3 README: update example 2024-10-18 11:18:03 -07:00
Aleix Conchillo Flaqué
4550545528 examples: use EndFrame() when the participant leaves 2024-10-18 11:18:03 -07:00
Aleix Conchillo Flaqué
020f371ecb pyproject: update onnxruntime to support python 3.12 2024-10-18 10:20:28 -07:00
Aleix Conchillo Flaqué
f3c0767c81 Merge pull request #610 from pipecat-ai/aleix/stt-push-audio
allow STT services to passthrough audio frames
2024-10-17 21:02:30 -07:00
Aleix Conchillo Flaqué
c9318ecd5c examples: minor fixes 2024-10-17 16:15:09 -07:00
Aleix Conchillo Flaqué
12eb9437c1 services(stt): allow STT service to passthrough audio 2024-10-17 16:15:09 -07:00
Aleix Conchillo Flaqué
71c8c0dcdb Merge pull request #609 from pipecat-ai/aleix/livekit-force-specifying-vad
livekit force specifying vad
2024-10-17 14:08:55 -07:00
Aleix Conchillo Flaqué
8108423742 transport(livekit): force specifying a vad analyzer
Don't default to SileroVADAnalyzer(). Also, resample to input sample rate.
2024-10-17 14:06:43 -07:00
Aleix Conchillo Flaqué
d67e08be4d Merge pull request #608 from pipecat-ai/aleix/add-audio-utils-and-resample
add audio utils and resample
2024-10-17 14:00:49 -07:00
Aleix Conchillo Flaqué
d3f4ac61b6 move utils.audio to audio.utils and add resample_audio() 2024-10-17 13:59:32 -07:00
Aleix Conchillo Flaqué
c6d28bb0db Merge pull request #607 from pipecat-ai/aleix/pipecat-vad-deprecation
move vad package to audio.vad
2024-10-17 13:51:20 -07:00
Aleix Conchillo Flaqué
2a37b2459a move vad package to audio.vad 2024-10-17 13:49:16 -07:00
Mark Backman
d1000f2fe4 Merge pull request #606 from pipecat-ai/mb/add-playht-options
PlayHT: Add websocket TTS service; rename existing service to PlayHTHttpTTSService, upgrade client, add input params
2024-10-17 16:46:59 -04:00
Mark Backman
e2d7af4b62 Update changelog 2024-10-17 16:16:29 -04:00
Mark Backman
da3810f1a2 Add websocket support for PlayHT 2024-10-17 15:41:33 -04:00
Aleix Conchillo Flaqué
eb21597d1a Merge pull request #603 from pipecat-ai/aleix/silero-vad-processor-fixes
vad: add support for interruption to SileroVAD processor
2024-10-17 10:48:39 -07:00
Aleix Conchillo Flaqué
e3eea0c02f vad: add support for interruption to SileroVAD processor 2024-10-17 10:48:25 -07:00
Mark Backman
45606e177c Add input options to PlayHT, upgrade to latest PlayHT model 2024-10-17 11:56:12 -04:00
Aleix Conchillo Flaqué
197d7b3e2b Merge pull request #604 from natestraub/patch-1
services(livekit) - Stop Sending EndFrame when Participant Disconnects
2024-10-17 08:48:57 -07:00
Nathan Straub
d4ec6827ce services(livekit) - Stop Sending EndFrame when Participant Disconnects
How It Works Now:
A participant disconnecting triggers and EndFrame, invoking stop() on the input and output transports and causing the LiveKit room to disconnect.  

Proposal:
Match the daily implementation, and just trigger the callbacks in the LiveKitTransport.  Leave it up to the implementor to decide whether to send EndFrames when this happens.
2024-10-16 23:53:31 -07:00
Aleix Conchillo Flaqué
e31d1152db Merge pull request #601 from pipecat-ai/aleix/openai-realtime-misc
services(openai): rename OpenAILLMServiceRealtimeBeta to OpenAIRealti…
2024-10-16 16:20:18 -07:00
Mark Backman
bb48a81103 Merge pull request #602 from pipecat-ai/mb/adjust-logger-levels
Adjust log levels for log messages
2024-10-16 18:00:35 -04:00
Mark Backman
55f1ae2564 Adjust log levels for log messages 2024-10-16 17:30:47 -04:00
Kwindla Hultman Kramer
280691b1b3 explanatory comment in 19-openai-realtime-beta.py 2024-10-16 14:27:48 -07:00
Kwindla Hultman Kramer
93c9e219ce fix for message handling bug on initialization 2024-10-16 12:40:20 -07:00
Aleix Conchillo Flaqué
edd44cc181 services(openai): rename OpenAILLMServiceRealtimeBeta to OpenAIRealtimeBetaLLMService 2024-10-16 10:20:19 -07:00
Aleix Conchillo Flaqué
4075b19f7c Merge pull request #600 from pipecat-ai/aleix/prepare-0.0.45
update CHANGELOG to 0.0.45
2024-10-16 09:18:37 -07:00
Aleix Conchillo Flaqué
bb14918a33 update CHANGELOG to 0.0.45 2024-10-16 09:17:33 -07:00
Mark Backman
2aee8a12f8 Merge pull request #599 from pipecat-ai/mb/remove-metrics-from-transport
Move metrics from transport to rtvi
2024-10-16 11:39:58 -04:00
Mark Backman
5760fadb44 Update changelog 2024-10-16 11:33:56 -04:00
Mark Backman
af5a7e9092 Move metrics from transport to rtvi 2024-10-16 11:33:56 -04:00
Mark Backman
8d9a7486d1 Merge pull request #598 from pipecat-ai/mb/add-daily-metrics-message-frame
Comply with RTVI format for sending metrics data via Daily transport
2024-10-16 10:14:44 -04:00
Mark Backman
00d0f9ae48 Comply with RTVI format for sending metrics data 2024-10-16 09:00:38 -04:00
Aleix Conchillo Flaqué
d255b7d1b2 Merge pull request #596 from pipecat-ai/aleix/prepare-0.0.44
prepare for pipecat 0.0.44
2024-10-15 18:13:07 -07:00
Aleix Conchillo Flaqué
4eb2c95b63 update CHANGELOG for 0.0.44 2024-10-15 17:51:01 -07:00
Aleix Conchillo Flaqué
3910aeb4de transports(daily): don't send messages if not joined 2024-10-15 17:51:01 -07:00
Aleix Conchillo Flaqué
713dcb7a4d transports(daily): cancel messages task when canceling 2024-10-15 17:51:01 -07:00
Aleix Conchillo Flaqué
04da51c7d8 transport(base_output): push EndFrame downstream at the right time 2024-10-15 17:51:01 -07:00
Aleix Conchillo Flaqué
e52d18e42d processors(audiobuffer): make functions public 2024-10-15 15:31:59 -07:00
Aleix Conchillo Flaqué
0c4a513ca2 Merge pull request #595 from pipecat-ai/aleix/bot-speaking-system-frames
bot speaking system frames
2024-10-15 15:30:11 -07:00
Aleix Conchillo Flaqué
4a71eacac3 rtvi: reset bot transcription with interruptions 2024-10-15 14:58:21 -07:00
Aleix Conchillo Flaqué
f0d89e57ad frames: some frames need to be SystemFrames
We want to process user and bot started/stopped speaking frames as fast as
possible. If we queue them they might be processed too late.
2024-10-15 14:37:56 -07:00
Mark Backman
79b52d4301 Merge pull request #594 from pipecat-ai/mb/more-text-filter-massaging
More edge case handling for text filtering
2024-10-15 14:51:43 -04:00
Mark Backman
bb00dbefbc More edge case handling for text filtering 2024-10-15 14:08:27 -04:00
Aleix Conchillo Flaqué
0c250c0603 Merge pull request #583 from pipecat-ai/aleix/add-pts-to-llm-full-response-end-frame
add pts to llm full response end frame
2024-10-15 10:39:50 -07:00
Aleix Conchillo Flaqué
7bbaf4dfe9 rtvi: merge TTS/TTSText and LLM/LLMText processors 2024-10-15 10:24:43 -07:00
Aleix Conchillo Flaqué
3a3bf3fe34 services(cartesia): schedule TTSStoppedFrame after text 2024-10-15 10:06:28 -07:00
Aleix Conchillo Flaqué
616aa54f75 ruff formatting 2024-10-15 10:06:28 -07:00
Aleix Conchillo Flaqué
164f06415c servcies(cartesia): no need to send LLMFullResponseEndFrame
Interruptions are already handled by context aggregators.
2024-10-15 10:06:28 -07:00
Aleix Conchillo Flaqué
51bc4839d1 transport(base_output): simplify code 2024-10-15 10:06:28 -07:00
Aleix Conchillo Flaqué
6d778e0491 services: add pts to LLMFullResponseEndFrame in WordTTSService 2024-10-15 10:06:28 -07:00
Aleix Conchillo Flaqué
fc4fa2faaa Merge pull request #593 from pipecat-ai/aleix/bot-transcription-processor
rtvi: add RTVIBotTranscriptionProcessor to send `bot-transcription`
2024-10-15 10:03:39 -07:00
Aleix Conchillo Flaqué
90b7f65545 rtvi: add RTVIBotTranscriptionProcessor to send bot-transcription 2024-10-15 10:03:20 -07:00
Kwindla Hultman Kramer
f7b7f0d680 Merge pull request #541 from pipecat-ai/khk/openai-realtime-beta
openai realtime beta
2024-10-14 21:02:06 -07:00
Kwindla Hultman Kramer
5431c44e51 remove two debug lines 2024-10-14 21:01:20 -07:00
Kwindla Hultman Kramer
40b3e50815 fix system, consecutive same role, and empty message parsing for anthropic 2024-10-14 20:56:42 -07:00
allenmylath
ec98a13a08 Update Dockerfile
utils and assets not used in this example hence removed
2024-10-15 08:18:16 +05:30
allenmylath
b999b76f70 Update README.md
readme description still shows simple-chatbot definition hence made more accurate description
2024-10-15 08:14:43 +05:30
allenmylath
b64dbe7bb4 Update env.example
canonical api url is also used from env.
2024-10-15 08:10:07 +05:30
Kwindla Hultman Kramer
2f6232fac9 fix for initial-messages with single message, and hoisting system message into instructions 2024-10-14 18:14:35 -07:00
Aleix Conchillo Flaqué
b4f2525c76 Merge pull request #585 from pipecat-ai/aleix/daily-urgent-transport-message-hang
transports(daily): send transport messages in a task
2024-10-14 16:31:10 -07:00
Aleix Conchillo Flaqué
8e956a4e88 Merge pull request #584 from pipecat-ai/aleix/urgent-bot-tts-audio
rtvi: bot-tts-audio messages should also be urgent
2024-10-14 16:25:35 -07:00
Aleix Conchillo Flaqué
7b9712daad transports(daily): send transport messages in a task
We queue transport messages and send them in a task to avoid potential hangs by
sending urgent transport messages from a transport event handler.
2024-10-14 16:19:53 -07:00
Kwindla Hultman Kramer
d4269acd67 user started/stopped speaking frames and interruption frames 2024-10-14 16:07:04 -07:00
Kwindla Hultman Kramer
d2ae82fb38 added back in missing LLMFullResponseStartFrame and LLMFullResponseEndFrame 2024-10-14 15:18:50 -07:00
Lewis Wolfgang
270949e6cd Merge pull request #582 from pipecat-ai/lewis/update_readme_aboutsilerofirstrun
Minor README update about Silero VAD.
2024-10-14 16:26:28 -04:00
Aleix Conchillo Flaqué
cfada94c13 rtvi: bot-tts-audio messages should also be urgent 2024-10-14 12:46:11 -07:00
Lewis Wolfgang
68fd6f7c44 Minor README update about Silero VAD.
We no longer download the model during first run - it's part of the repo.
2024-10-14 13:11:16 -04:00
Mark Backman
96bfcc3dca Merge pull request #571 from pipecat-ai/mb/add-code-filtering
Add code and table filtering option to MarkdownTextFilter
2024-10-14 12:54:16 -04:00
Mark Backman
b0890b1f75 Code review fixes 2024-10-14 12:52:16 -04:00
Aleix Conchillo Flaqué
802b3e42c4 Merge pull request #579 from Allenmylath/patch-16
Update Dockerfile
2024-10-14 08:58:02 -07:00
Aleix Conchillo Flaqué
bd134839ff Merge pull request #578 from Allenmylath/patch-15
Create Dockerfile
2024-10-14 08:57:34 -07:00
Aleix Conchillo Flaqué
428ce63e17 Merge pull request #575 from Allenmylath/patch-12
Update README.md
2024-10-14 08:55:12 -07:00
Aleix Conchillo Flaqué
46d6cde383 Merge pull request #574 from Allenmylath/patch-11
Update requirements.txt
2024-10-14 08:54:44 -07:00
allenmylath
6de82b3c11 Create .env.example (#562)
* Create .env.example

.env.example file with required env variables not added hence adding

* Rename .env.example to env.example

file name corrected as directed
2024-10-14 08:52:46 -07:00
Mark Backman
ec0bc7a057 A few bug fixes 2024-10-14 09:44:20 -04:00
allenmylath
c62156a4c3 Update Dockerfile
assets and utils files not found hence removed
2024-10-14 12:00:29 +05:30
allenmylath
e8618a07d0 Create Dockerfile
there is Dockerfile in other examples. this docker file assumes that there is a .env file(i added env.example in another pull request)
2024-10-14 11:49:35 +05:30
allenmylath
0ba99514a9 Update README.md
env.example added hence addying copy command will be necessary
2024-10-14 11:22:56 +05:30
allenmylath
837c8dad27 Update requirements.txt
whisper not used but deepgram used hence changed
2024-10-14 11:20:12 +05:30
allenmylath
0e69625a01 Rename frames.md to frame.md
edited again to frame.md
2024-10-14 10:07:47 +05:30
allenmylath
4e0823fced Rename Frames.md to frames.md
file name changed as requested
2024-10-14 10:05:26 +05:30
Kwindla Hultman Kramer
6f2a464451 conversation save/load for openai, openai-realtime, and anthropic 2024-10-13 18:12:03 -07:00
Kwindla Hultman Kramer
ac4c5ab369 response content item truncation when interrupted 2024-10-13 14:38:04 -07:00
Kwindla Hultman Kramer
9e95419301 much cleanup 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
f390ec9608 temp commit; debugging 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
ce8a83efba tools frame support and wip message resetting/loading 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
e5a2bf9564 context management improvements 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
7838018686 fix default response properties getting appended to ResponseCreateEvent 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
31916ed9fd turn on/off openai vad 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
3a2fbc2b19 send user started/stopped speaking event from openai realtime events
send user started/stopped speaking event from openai realtime events
2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
43520b44da add 'failed' case to Response event object 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
ab4a8d791a RTVI processors should use TextFrame not TextFrame and all subclasses 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
40dc546b81 function call fix and user transcription frames 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
5426891feb added input audio pause setting. no frame to update that state, yet. 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
1c5ccd3406 fixes for settings updates, context updates, and response creation 2024-10-12 21:58:11 -07:00
Mark Backman
3a745bfa3f Handle self._context of None 2024-10-12 21:58:11 -07:00
Mark Backman
ac4e39991e Update ai_services for OpenAI Realtime param inputs 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
c870832da6 types seem complete; some ws error handling 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
e782016c57 renamed a file 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
00badaf98e more pydantic cleanup 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
7dfac0163b bits of pydantic 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
09a3c2a82d major functionality working (not configurable, occasional timing bugs maybe) 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
c32c65014b definitely broke something in the pipeline 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
f082eb10a2 small cleanup 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
b8898e449e lots of debugging statements. multiple function calls broken 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
d1f6d229ca space exploration prompt 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
4fa0318005 configurability via constructor 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
93ebb9d541 working 19-openai-realtime-beta.py example 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
16101c79c5 beginning of realtime impl 2024-10-12 21:58:11 -07:00
Kwindla Hultman Kramer
c866b3f2c9 Merge pull request #572 from pipecat-ai/khk/fix-deepgram-settings
fix for Deepgram settings not merging properly
2024-10-12 20:07:04 -07:00
Mark Backman
c26a45721f Set inputs as Optional 2024-10-12 21:52:56 -04:00
Mark Backman
d9c900f872 Satisfy minimal text requirements for Cartesia and OpenAI 2024-10-12 21:27:37 -04:00
chadbailey59
73becbad29 fixed parallel async function calls bug (#569) 2024-10-12 17:45:24 -05:00
Aleix Conchillo Flaqué
f1df3de263 Merge pull request #560 from Allenmylath/patch-7
Update requirements.txt aiohttp missing
2024-10-12 14:52:24 -07:00
Aleix Conchillo Flaqué
3bc5c8cda7 Merge pull request #557 from Allenmylath/patch-4
Update env.example wrong tts service in env
2024-10-12 14:51:54 -07:00
Aleix Conchillo Flaqué
7b3b1058b2 Merge pull request #559 from Allenmylath/patch-6
Update server.py
2024-10-12 14:51:24 -07:00
Aleix Conchillo Flaqué
87473f857f Merge pull request #558 from Allenmylath/patch-5
Update env.example wrong tts
2024-10-12 14:50:52 -07:00
Aleix Conchillo Flaqué
a96209185c Merge pull request #546 from Allenmylath/patch-2
Update README.md
2024-10-12 14:46:15 -07:00
Aleix Conchillo Flaqué
34cc2ed1a1 Merge pull request #532 from nmaswood/nmaswood/format-logs
Format and Support Unicode for LLM Message Debug Logs
2024-10-12 14:42:58 -07:00
Aleix Conchillo Flaqué
667aa0c25a Merge pull request #542 from joachimchauvet/main
Update LiveKit audio transport for changes introduced in v0.0.42
2024-10-12 14:13:02 -07:00
Mark Backman
12707f4ff7 _settings needs to be Dict 2024-10-12 12:19:54 -04:00
Kwindla Hultman Kramer
53451899a7 fix for Deepgram settings not merging 2024-10-11 21:07:39 -07:00
Aleix Conchillo Flaqué
dc73b20c0b Merge pull request #451 from Canonical-AI-Inc/recording
Audio recording FrameProcessor
2024-10-11 13:48:19 -07:00
Adrian Cowham
4330374ba4 passing kwargs and forcing keyword-only arguments 2024-10-11 12:01:51 -07:00
Adrian Cowham
79c8aa2c4a ruff formatting 2024-10-11 11:35:02 -07:00
Adrian Cowham
083d221dd2 PR feedback 2024-10-11 11:29:01 -07:00
Mark Backman
74d47b725f Add table filtering 2024-10-11 14:10:47 -04:00
Adrian Cowham
917e482876 Merge branch 'main' into recording 2024-10-11 10:36:04 -07:00
Adrian Cowham
522d931950 better interruption handling by moving the processors after the transport output 2024-10-11 10:33:12 -07:00
Mark Backman
d10c7ac7ce Add Changelog entry 2024-10-11 13:28:34 -04:00
Mark Backman
84705427c5 Add code filtering option to MarkdownTextFilter 2024-10-11 11:11:58 -04:00
Aleix Conchillo Flaqué
66a76af341 Merge pull request #567 from pipecat-ai/aleix/prepare-0.0.43
update CHANGELOG for 0.0.43
2024-10-10 14:09:18 -07:00
Aleix Conchillo Flaqué
d402d91c2f update CHANGELOG for 0.0.43 2024-10-10 14:06:18 -07:00
Mark Backman
b05130a089 Merge pull request #566 from pipecat-ai/mb/make-markdown-modifiable
Mark the Markdown processor a util, and allow it to take inputs
2024-10-10 17:00:19 -04:00
Mark Backman
b3cc0779f0 Update the changelog 2024-10-10 16:49:20 -04:00
Mark Backman
cbecae40a9 Mark the Markdown processor a util, and allow it to take inputs 2024-10-10 16:43:48 -04:00
Mark Backman
5b8753c8b6 Add speak_code input param 2024-10-10 13:17:37 -04:00
Mark Backman
3c5f9457f1 More edge case improvements 2024-10-10 12:07:00 -04:00
Mark Backman
e32e56d0bc Merge pull request #565 from pipecat-ai/mb/add-markdown-remover
Add a new processor which removes markdown and special chars from TTS text
2024-10-10 07:16:42 -04:00
Mark Backman
788aec665b Add a new processor which removes markdown and special chars from TTS text 2024-10-10 07:11:31 -04:00
Mark Backman
3cada03a92 Merge pull request #564 from pipecat-ai/mb/bot-tts-text-urgent
Make bot-tts-text messages urgent
2024-10-08 19:26:46 -04:00
Mark Backman
e21fb520f9 Make bot-tts-text messages urgent 2024-10-08 17:07:08 -04:00
allenmylath
864f4d385f Update requirements.txt aiohttp missing
aiohttp is not included but uded in code
2024-10-08 16:39:25 +05:30
allenmylath
26ac2878ae Update server.py
desccription of fastapi sagrgumentparser wrongly shown as stroy teller instead of patient-intake
2024-10-08 15:18:26 +05:30
allenmylath
cac63f5565 Update env.example wrong tts
cartesian used in code but elevenlabs in .env example
2024-10-08 14:24:23 +05:30
allenmylath
aadffd6199 Update env.example wrong tts service in env
cartesian used in code but env got elevenlabs
2024-10-08 14:15:54 +05:30
Aleix Conchillo Flaqué
3403197a90 Merge pull request #552 from pipecat-ai/aleix/rtvi-user-llm-text
rtvi: add RTVIUserLLMTextProcessor
2024-10-07 08:33:29 -07:00
Aleix Conchillo Flaqué
8cdb9ab1ad rtvi: internal transport message should be urgent 2024-10-07 08:04:14 -07:00
Mark Backman
5dbf26d283 Handle cases where text is either a list or a string 2024-10-07 07:21:32 -04:00
Mark Backman
8001bab9b0 Remove another instance of urgent=true 2024-10-07 06:58:32 -04:00
Aleix Conchillo Flaqué
12d0686adc rtvi: rename bot-audio to bot-tts-audio 2024-10-06 16:50:55 -07:00
Aleix Conchillo Flaqué
a28a5e954a add TransportMessageSystemFrame 2024-10-06 16:50:12 -07:00
Aleix Conchillo Flaqué
bb966a89d2 rtvi: add RTVIUserLLMTextProcessor 2024-10-06 01:05:58 -07:00
Aleix Conchillo Flaqué
4a74eb3321 use isinstance tuples 2024-10-06 00:45:27 -07:00
Aleix Conchillo Flaqué
1f54ee6991 pyproject: update deepgram to 3.7.3 2024-10-06 00:40:47 -07:00
Allenmylath
40af3571f0 Create Frames.md
Made asmall explanation for diffrent types of frames in pipcat
2024-10-05 22:04:03 +05:30
joachimchauvet
86143f79a1 use new InputAudioRawFrame and OutputAudioRawFrame 2024-10-05 14:17:27 +03:00
joachimchauvet
b373bc82b5 match behavior of Daily's on_first_participant_joined 2024-10-05 14:17:27 +03:00
Mark Backman
ea2a05a04b Merge pull request #545 from pipecat-ai/mb/fix-language-handling
Improve language string handling for TTS services
2024-10-04 10:03:06 -04:00
Mark Backman
5692ca586c Merge pull request #547 from pipecat-ai/mb/update-test-requirements
Update fastapi version in test-requirements.txt
2024-10-04 08:28:05 -04:00
Mark Backman
a11ad81f02 Update fastapi version in test-requirements.txt 2024-10-04 07:35:48 -04:00
Allenmylath
805efdb144 Update README.md
the description provided is that of simple chatbot and also the video of simple chatbot hence changed
2024-10-04 10:19:38 +05:30
Mark Backman
c49b31e6ad Add CHANGELOG entry 2024-10-03 23:13:59 -04:00
Mark Backman
7796a272ce Improve language handling for TTS services 2024-10-03 23:09:27 -04:00
Adrian Cowham
678e87fd31 comment back in some code 2024-10-03 14:12:23 -07:00
Adrian Cowham
4d81a2ebfe nuked the code that marks user audio in favor for InputAudioRawFrame. also moving to stereo instead of mono with the human and bot on their own channel. 2024-10-03 14:10:03 -07:00
Adrian Cowham
2d82702e04 merge from main 2024-10-03 09:42:06 -07:00
Mark Backman
27dcf83f37 Merge pull request #543 from pipecat-ai/mb/fix-deepgram-stt-language
Deepgram: disconnect and reconnect on language change
2024-10-03 12:40:27 -04:00
Mark Backman
72db83528d Update changelog 2024-10-03 12:37:26 -04:00
Mark Backman
45c7d36b2e Deepgram: disconnect and reconnect on language change 2024-10-03 12:31:42 -04:00
Aleix Conchillo Flaqué
65eeb0f1f6 Merge pull request #540 from pipecat-ai/cb/interruption-fix
Fixed RTVI `tts:interrupt` action not interrupting
2024-10-02 13:46:52 -07:00
Aleix Conchillo Flaqué
1d7d0bb1ea Merge pull request #539 from pipecat-ai/aleix/pipecat-0.0.42-fixes
pipecat 0.0.42 fixes
2024-10-02 13:34:28 -07:00
Aleix Conchillo Flaqué
598936bc53 services: apply service language code before using service 2024-10-02 13:30:01 -07:00
Chad Bailey
b1bf6f7733 fixed botinterruptionframe 2024-10-02 19:43:51 +00:00
Aleix Conchillo Flaqué
75d27aeb9f examples(storytelling): update packages 2024-10-02 12:00:00 -07:00
Aleix Conchillo Flaqué
0a37caf4b4 openai: fix image json logging 2024-10-02 11:57:50 -07:00
Aleix Conchillo Flaqué
6db65f4335 cartesia: use model_name instead of model_id 2024-10-02 11:57:36 -07:00
Aleix Conchillo Flaqué
3648874301 gladia: fix languages 2024-10-02 11:57:25 -07:00
Aleix Conchillo Flaqué
8bcb5d7fd2 services: async generators should yield frames 2024-10-02 11:57:08 -07:00
Aleix Conchillo Flaqué
8c01a900cd google: allow using GOOGLE_APPLICATION_CREDENTIALS 2024-10-02 11:56:01 -07:00
Mark Backman
d378e699d2 Merge pull request #538 from Allenmylath/patch-2
Update env.example for wrong tts
2024-10-02 12:53:50 -04:00
Mark Backman
c25c375c41 Merge pull request #537 from pipecat-ai/mb/fix-nested-strings
Fix nested strings issue
2024-10-02 12:39:00 -04:00
Allenmylath
70c3ff31fd Update env.example
elevenlabs is not used in code instead cartesian is used hence changed
2024-10-02 21:59:51 +05:30
Mark Backman
cd2e29f285 Fix nested strings issue 2024-10-02 12:26:30 -04:00
Aleix Conchillo Flaqué
6d4d7d763d Merge pull request #534 from pipecat-ai/aleix/prepare-0.0.42
update CHANGELOG for 0.0.42
2024-10-02 08:36:32 -07:00
Aleix Conchillo Flaqué
6c1851eef8 update CHANGELOG for 0.0.42 2024-10-02 08:36:17 -07:00
Mark Backman
096a15eef6 Merge pull request #527 from pipecat-ai/mb/google-tts-inputs
Further consolidate service update settings into a single ServiceUpdateSettingsFrame class
2024-10-02 11:13:25 -04:00
Mark Backman
3d642df2b0 Revert aligning voice_id name in TTS service constructor 2024-10-02 11:07:48 -04:00
Mark Backman
d75a02dc51 Use Language enum and set languages accordingly 2024-10-01 21:03:01 -04:00
Mark Backman
28643b453d Update to use LLM, STT, TTS subclasses and remove setter methods 2024-10-01 20:30:27 -04:00
Nasr Maswood
d5635de5f6 add new lines and unicode to JSON debug logs 2024-10-01 13:31:58 -04:00
Mark Backman
88cca7bf68 Consolidate service UpdateSettingsFrame into a single ServiceUpdateSettingsFrame 2024-10-01 11:01:04 -04:00
Mark Backman
a397b859fe Add support for gender and google_style inputs to Google TTS 2024-10-01 10:39:45 -04:00
Kwindla Hultman Kramer
8aae4e9856 Merge pull request #531 from pipecat-ai/khk/function-calling-improvements 2024-10-01 07:23:38 -07:00
Kwindla Hultman Kramer
92d8b37229 implement vision for openai 2024-09-30 21:49:29 -07:00
Kwindla Hultman Kramer
0801fc578b Merge pull request #530 from pipecat-ai/khk/tts-say-fix
fix for multi-sentence tts say utterances
2024-09-30 20:59:53 -07:00
Kwindla Hultman Kramer
0d5cb84531 function calling testing and improvements 2024-09-30 20:59:28 -07:00
Kwindla Hultman Kramer
47b943a117 Merge pull request #522 from pipecat-ai/rebase-openai-multi-function-call
Handle parallel function calls for OpenAI LLMs
2024-09-30 16:23:37 -07:00
Kwindla Hultman Kramer
128355add5 fix for multi-sentence tts say utterances 2024-09-30 16:19:31 -07:00
Kwindla Hultman Kramer
0499fe41e4 get rid of some debug log lines used during development 2024-09-30 16:08:33 -07:00
Kwindla Hultman Kramer
6ad3437fd2 throw error if the llm tries to call a function that's not registered 2024-09-30 16:08:33 -07:00
Kwindla Hultman Kramer
a5c73ec829 handle openai multiple function calls 2024-09-30 16:08:30 -07:00
JeevanReddy
def04ac0ce openai can give multiple tool calls, current implementation assumes only one function call at a time. Fixed this to handle multiple function calls. 2024-09-30 16:07:56 -07:00
Kwindla Hultman Kramer
5d63615b1b Merge pull request #528 from pipecat-ai/khk/sentence-splits
TTS sentence aggregation fix
2024-09-30 16:07:21 -07:00
Kwindla Hultman Kramer
90ee284fe0 Merge pull request #520 from pipecat-ai/khk/context-frame-push
pushing context frames from assistant aggregators
2024-09-30 16:06:54 -07:00
Kwindla Hultman Kramer
539e0b66fb small fix as per aleix 2024-09-30 16:05:32 -07:00
Kwindla Hultman Kramer
fef393dcac assistant aggregator switch for space padding or not 2024-09-30 16:05:32 -07:00
Kwindla Hultman Kramer
ed607d5c4b typo fix 2024-09-30 16:05:32 -07:00
Kwindla Hultman Kramer
37da7e44cd whitespace fix 2024-09-30 16:05:32 -07:00
Kwindla Hultman Kramer
69c7edd60c pushing context frames from assistant aggregators 2024-09-30 16:05:28 -07:00
Aleix Conchillo Flaqué
392f210371 Merge pull request #524 from pipecat-ai/aleix/everything-is-async
all frame processors are asynchrnous
2024-09-30 15:59:03 -07:00
Mark Backman
9a63df1ea1 Merge pull request #529 from pipecat-ai/mb/daily-python-0-11-0
Update daily-python to 0.11.0
2024-09-30 18:29:27 -04:00
Mark Backman
f8a75cede9 Update daily-python to 0.11.0 2024-09-30 18:22:38 -04:00
Aleix Conchillo Flaqué
4d1e370e02 pipeline(task): since everything is async tasks should wait for EndFrame 2024-09-30 15:11:21 -07:00
Aleix Conchillo Flaqué
d080a31a5c tests: fix langchanin tests 2024-09-30 15:11:21 -07:00
Aleix Conchillo Flaqué
a90ebdfe7c syncparallelpipeline: fix now that all frames are asynchronous 2024-09-30 15:11:21 -07:00
Aleix Conchillo Flaqué
c8995b82e5 all frame processors are asynchrnous
In this commit we make all frame processors asynchronous, that is, they have an
internal queue and they push frames using a task from that queue.
2024-09-30 15:11:21 -07:00
Kwindla Hultman Kramer
6b7f924af6 tts sentence aggregation fix 2024-09-30 14:33:08 -07:00
Mark Backman
51580e5349 Merge pull request #526 from pipecat-ai/mb/google-tts-lang-update
Set Google TTS default language to en-US
2024-09-30 15:32:43 -04:00
Mark Backman
ed49cebf2c Set Google TTS default language to en-US 2024-09-30 15:16:46 -04:00
Mark Backman
46ac76701e Merge pull request #517 from pipecat-ai/mb/update-settings-frame
Consolidate update frames classes into a single UpdateSettingsFrame class
2024-09-30 12:56:45 -04:00
Mark Backman
1f77863aef Code review feedback 2024-09-30 12:50:40 -04:00
Mark Backman
d7555609fd Add TTS update settings options 2024-09-30 12:50:40 -04:00
Mark Backman
7fe118ce63 Align use of language param across TTS services 2024-09-30 12:50:40 -04:00
Mark Backman
44a349386c Consolidate update frames classes into a single UpdateSettingsFrame class 2024-09-30 12:50:39 -04:00
Mark Backman
97cba92fa5 Merge pull request #516 from pipecat-ai/mb/google-tts
Add Google TTS
2024-09-30 12:25:16 -04:00
Aleix Conchillo Flaqué
d9b16d4f73 services: import cosmetics 2024-09-27 13:32:27 -07:00
Aleix Conchillo Flaqué
50b6580fbb livekit: add license notice 2024-09-27 13:28:33 -07:00
Mark Backman
e7548f9494 Code review feedback 2024-09-27 08:02:44 -04:00
Mark Backman
830d2df671 Add Google TTS 2024-09-27 07:36:20 -04:00
Aleix Conchillo Flaqué
13b50a07db Merge pull request #515 from pipecat-ai/aleix/rtvi-frame-processors
RTVI frame processors
2024-09-27 00:48:09 -07:00
Aleix Conchillo Flaqué
4501dca133 Merge pull request #467 from joachimchauvet/main
Add LiveKit audio transport
2024-09-26 22:58:25 -07:00
Aleix Conchillo Flaqué
2c8e566507 rtvi: update version to 0.2 2024-09-26 22:42:36 -07:00
Aleix Conchillo Flaqué
6e8a202107 rtvi: fix handling transport messages 2024-09-26 22:42:19 -07:00
Aleix Conchillo Flaqué
2a05cd35b0 rtvi: add multiple RTVI frame processors 2024-09-26 22:42:08 -07:00
Mark Backman
55a70cde8f Merge pull request #514 from pipecat-ai/mb/aws-polly-tts
Add AWS Polly TTS support
2024-09-26 22:20:13 -04:00
Mark Backman
706c00d897 Code review feedback 2024-09-26 22:13:37 -04:00
Aleix Conchillo Flaqué
d323ea9e95 async_generator: keep pushing frames downstream 2024-09-26 16:44:49 -07:00
Aleix Conchillo Flaqué
b8ece84c6e services: super should be super() 2024-09-26 10:39:26 -07:00
Mark Backman
a018112a13 Merge pull request #510 from pipecat-ai/mb/deepgram-tts-http
Improve usability of Deepgram TTS: use Deepgram client, remove aiohttp
2024-09-26 13:38:42 -04:00
Mark Backman
d3a477902b Add changelog entry 2024-09-26 13:35:59 -04:00
Mark Backman
298b151486 Add setter methods 2024-09-26 13:35:59 -04:00
Mark Backman
6a6ea251ae Add AWS Polly TTS support 2024-09-26 13:35:59 -04:00
Aleix Conchillo Flaqué
c7c709a0a7 github: cache venv when running tests 2024-09-26 10:32:22 -07:00
Aleix Conchillo Flaqué
6ac57b4854 Merge pull request #494 from badbye/full-width-punctuations
add full-width punctuations as end of the sentence
2024-09-26 10:17:10 -07:00
Aleix Conchillo Flaqué
f5e0b946c7 services(cartesia): fix string formatting 2024-09-26 09:08:37 -07:00
Mark Backman
b1818cc370 Merge pull request #435 from golbin/main
Add speed and emotion options for Cartesia.
2024-09-26 07:14:59 -04:00
Jin Kim
d05717a1bd Apply Ruff formater 2024-09-26 19:52:25 +09:00
Aleix Conchillo Flaqué
d11daee31a Merge pull request #509 from pipecat-ai/aleix/frameprocessor-event-handlers
frame processor event handlers
2024-09-25 19:50:30 -07:00
Mark Backman
73da8c1910 Improve usability of Deepgram TTS: use Deepgram client, remove aiohttp 2024-09-25 22:43:10 -04:00
Aleix Conchillo Flaqué
f06aa300d0 rtvi: add on_bot_ready event 2024-09-25 16:52:18 -07:00
Aleix Conchillo Flaqué
c4e94e280e processors: add support for event handlers 2024-09-25 16:35:33 -07:00
Kwindla Hultman Kramer
8f2941c575 Merge pull request #492 from pipecat-ai/khk/flush-more-audio
add calls to flush_audio for say() and rtvi action
2024-09-25 12:35:50 -07:00
joachimchauvet
447baad5c3 update send_metrics() to support changes introduced in #474 2024-09-25 21:38:55 +03:00
Mark Backman
2703813e8a Merge pull request #496 from pipecat-ai/mb/azure-tts-inputs
Add Azure TTS input params
2024-09-25 14:38:01 -04:00
Mark Backman
521e152150 Merge pull request #495 from pipecat-ai/mb/elevenlabs-input-lang
Add language_code support for ElevenLabs TTS
2024-09-25 14:37:44 -04:00
Kwindla Hultman Kramer
3d43ad0f4d actually save the file 2024-09-25 10:59:00 -07:00
Kwindla Hultman Kramer
3621fceae2 fixes as noted by aleix 2024-09-25 09:19:58 -07:00
Aleix Conchillo Flaqué
e123f33c03 Merge pull request #506 from pipecat-ai/aleix/async-generator-processor
processors: add AsyncGeneratorProcessor
2024-09-25 00:04:09 -07:00
Aleix Conchillo Flaqué
b8713666c2 processors: add AsyncGeneratorProcessor 2024-09-25 00:01:04 -07:00
Aleix Conchillo Flaqué
cf0ab85e2c Merge pull request #505 from pipecat-ai/aleix/init-task-variables
initialize task variables and add minor description
2024-09-24 23:59:38 -07:00
Aleix Conchillo Flaqué
8502c7c801 Merge pull request #504 from pipecat-ai/aleix/rtvi-handle-frame
rtvi: add RTVIProcessor.handle_message()
2024-09-24 23:59:26 -07:00
Aleix Conchillo Flaqué
e89814dc6b Merge pull request #503 from pipecat-ai/aleix/end-cancel-task-frames
frames: add EndTaskFrame and CancelTaskFrame
2024-09-24 23:59:10 -07:00
Aleix Conchillo Flaqué
9461bacf0d pyproject: update fastapi to 0.115.0 2024-09-24 19:24:37 -07:00
Aleix Conchillo Flaqué
e276dcbab7 initialize task variables and add minor description 2024-09-24 19:19:00 -07:00
Aleix Conchillo Flaqué
1a3de0e819 rtvi: add RTVIProcessor.handle_message() 2024-09-24 19:12:06 -07:00
Aleix Conchillo Flaqué
ee3786fe15 frames: add EndTaskFrame and CancelTaskFrame 2024-09-24 19:10:22 -07:00
Aleix Conchillo Flaqué
31b5667cee frames: log text with [] so we can distinguish spaces better 2024-09-24 13:10:40 -07:00
Aleix Conchillo Flaqué
a483f1a083 rtvi: handle all actions from the action task 2024-09-24 10:48:15 -07:00
Aleix Conchillo Flaqué
2ecec1c9f8 Merge pull request #500 from pipecat-ai/aleix/rtvi-action-frames-task
RTVI action frames task
2024-09-24 10:13:43 -07:00
Aleix Conchillo Flaqué
08ac311971 rtvi: use task to process incoming action frames 2024-09-24 09:36:53 -07:00
Aleix Conchillo Flaqué
cb49b6a0d6 rtvi: add llm-text and tts-text server messages 2024-09-24 09:36:43 -07:00
Aleix Conchillo Flaqué
016da177db Merge pull request #499 from mercuryyy/main
Fix syntax error in deepgram.py
2024-09-24 09:10:05 -07:00
joachimchauvet
ec5998bc36 remove _internal_push_frame from LiveKitInputTransport 2024-09-24 14:54:37 +03:00
mercuryyy
b1e17ee347 Fix syntax error in deepgram.py 2024-09-24 07:45:29 -04:00
joachimchauvet
b6e1d6e6ae format with ruff 2024-09-24 10:21:02 +03:00
joachimchauvet
fa609f1afc adjust output sample rate and create user token 2024-09-24 10:16:54 +03:00
joachimchauvet
470b5eafe7 move tenacity imports inside try block 2024-09-24 10:16:54 +03:00
joachimchauvet
2e5b0c1d6b add tenacity dependency 2024-09-24 10:16:54 +03:00
joachimchauvet
a9390d96a1 add LiveKit audio transport 2024-09-24 10:16:54 +03:00
Mark Backman
8ee9621d66 Add setter functions 2024-09-23 21:12:01 -04:00
Jin Kim
49f2123893 Apply and Fix upstream changes for Cartesia 2024-09-24 07:59:26 +09:00
Jin Kim
cf72129852 Merge remote-tracking branch 'upstream/main' 2024-09-24 07:18:22 +09:00
Mark Backman
8edee8155d Add input params to Azure TTS 2024-09-23 17:52:23 -04:00
chadbailey59
c262b272fa Added RTVIActionFrame (#464)
* added RTVIActionFrame

* server-sent events

* reverted log changes

* fixup
2024-09-23 14:51:17 -05:00
Aleix Conchillo Flaqué
9ef9c1c58a Merge pull request #497 from pipecat-ai/aleix/ruff-formater
introduce Ruff formatting
2024-09-23 10:42:54 -07:00
Aleix Conchillo Flaqué
c7ff79a652 processors: fix formatting string 2024-09-23 09:53:37 -07:00
Aleix Conchillo Flaqué
da81df5284 github: install dev-requirements when running tests 2024-09-23 09:53:37 -07:00
Aleix Conchillo Flaqué
a4420dc88b README: add vscode and emacs ruff instructions 2024-09-23 09:53:37 -07:00
Aleix Conchillo Flaqué
eeb8338dce introduce Ruff formatting 2024-09-23 09:53:37 -07:00
Cyril S.
dfa4ac81fd Implement Sentry instrumentation for performance and error tracking (#470)
* feat: Add Sentry support in FrameProcessor

This update add optional Sentry integration for performance tracking and error monitoring.

Key changes include:

- Add conditional Sentry import and initialization check
- Implement Sentry spans in FrameProcessorMetrics to measure TTFB (Time To First Byte) and processing time when Sentry is available
- Maintain existing metrics functionality with MetricsFrame regardless of Sentry availability

* feat: Enable metrics in DeepgramSTTService for Sentry

This commit enhances the DeepgramSTTService class to enable metrics generation for use with Sentry.

Key changes include:

1. Enable general metrics generation:
   - Implement `can_generate_metrics` method, returning True when VAD is enabled
   - This allows metrics to be collected and used by both Sentry and the metrics system in frame_processor.py

2. Integrate Sentry-compatible performance tracking:
   - Add start_ttfb_metrics and start_processing_metrics calls in the VAD speech detection handler
   - Implement stop_ttfb_metrics call when receiving transcripts
   - Add stop_processing_metrics for final transcripts

3. Enhance VAD support for metrics:
   - Add `vad_enabled` property to check VAD event availability
   - Implement VAD-based speech detection handler for precise metric timing

These changes enable detailed performance tracking via both Sentry and the general metrics system when VAD is active. This allows for better monitoring and analysis of the speech-to-text process, providing valuable insights through Sentry and any other metrics consumers in the pipeline.

* Update frame_processor.py

* Refactor to support flexible metrics implementation

- Modified the __init__ method to accept a metrics parameter that is either FrameProcessorMetrics or one of its subclasses
- Updated the metrics initialization to create an instance with the processor's name
- Moved all FrameProcessorMetrics-related logic to a new processors\metrics\base.py file

* Implement flexible metrics system with Sentry integration

1. Created a new metrics module in processors/metrics/

2. Implemented FrameProcessorMetrics base class in base.py:

3. Implemented SentryMetrics class in sentry.py:
   - Inherits from FrameProcessorMetrics
   - Integrates with Sentry SDK for advanced metrics tracking
   - Implements Sentry-specific span creation and management for TTFB and processing metrics
   - Handles cases where Sentry is not available or initialized
2024-09-23 08:44:14 -07:00
Lewis Wolfgang
ea16dca8aa Merge pull request #469 from pipecat-ai/lewis/remove_torch_dependency
Remove torch dependency for using silero_vad
2024-09-23 09:59:40 -04:00
Mark Backman
306632b29a Add language_code support for ElevenLabs TTS 2024-09-23 09:01:02 -04:00
duyalei
4533ed014f add full-width punctuations as end of the sentence 2024-09-23 16:35:00 +08:00
Jin Kim
68cc4186ad Merge remote-tracking branch 'upstream/main' 2024-09-23 16:34:31 +09:00
Mark Backman
9a4e749c7c Merge pull request #491 from pipecat-ai/mb/elevenlabs-inputs
Add voice_settings and optimize_streaming_latency to ElevenLabs
2024-09-22 21:54:21 -04:00
Mark Backman
55c645c614 Add voice_settings and optimize_streaming_latency to ElevenLabs 2024-09-22 13:58:50 -04:00
Mark Backman
a1024bb365 Merge pull request #490 from pipecat-ai/mb/llm-rtvi-service-option
Add control frames for LLM param updates
2024-09-21 20:10:17 -04:00
Mark Backman
dfc82c3ba4 Merge pull request #486 from pipecat-ai/mb/llm-extra-params
Add extra input param to LLMs
2024-09-21 18:25:47 -04:00
Mark Backman
9e27a8aad0 Add control frames for LLM param updates 2024-09-21 00:02:58 -04:00
Mark Backman
c73111afea Add extra input param to LLMs 2024-09-21 00:01:25 -04:00
Kwindla Hultman Kramer
26a64afd8d Merge pull request #485 from pipecat-ai/khk/metrics-model-exclude-none
fixup for serialization issue
2024-09-20 18:24:19 -07:00
Kwindla Hultman Kramer
78a3f081de fixup for serialization issue 2024-09-20 18:21:06 -07:00
Mark Backman
e8f8a49646 Merge pull request #484 from pipecat-ai/mb/llm-input-params
Add input params for OpenAI, Anthropic, Together AI LLMs
2024-09-20 20:35:49 -04:00
Mark Backman
219304c5ee Added Changelog entries 2024-09-20 20:31:42 -04:00
Mark Backman
f3fd312b83 Add Together AI interruptible example 2024-09-20 20:21:19 -04:00
Mark Backman
357e66d64d Input params for Together AI LLM 2024-09-20 20:21:19 -04:00
Mark Backman
4fa1ea8c4b Input params for Anthropic LLM 2024-09-20 20:21:19 -04:00
Mark Backman
3b81cd462d Input params to OpenAI LLM 2024-09-20 20:21:19 -04:00
Aleix Conchillo Flaqué
14acf05a26 Merge pull request #480 from pipecat-ai/aleix/input-output-frames
introduce input/output audio and image frames
2024-09-20 14:44:37 -07:00
Mattie Ruth
58d9c84bc9 Merge pull request #474 from pipecat-ai/ruthless/improve-metrics-types-2
Ruthless/improve metrics types 2
2024-09-20 09:47:24 -04:00
Aleix Conchillo Flaqué
7e39d9ad3d introduce input/output audio and image frames
We now distinguish between input and output audio and image frames. We introduce
`InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and
`OutputImageRawFrame` (and other subclasses of those). The input frames usually
come from an input transport and are meant to be processed inside the pipeline
to generate new frames. However, the input frames will not be sent through an
output transport. The output frames can also be processed by any frame processor
in the pipeline and they are allowed to be sent by the output transport.
2024-09-19 23:11:03 -07:00
mattie ruth backman
a4edb3dab1 Cleanup on aisle METRICS. Note: See below, this is a breaking change
1. Fleshed out MetricsFrames and broke it into a proper set of types
2. Add model_name as a property to the AIService so that it can be
   automatically included in metrics and also remove that
   overhead from all the various services themselves

Breaking change!

Because of the types improvements, the MetricsFrame type has
changed. Each frame will have a list of metrics simlilar to before
except each item in the list will only contain one type of metric:
"ttfb", "tokens", "characters", or "processing". Previously these
fields would be in every entry but set to None if they didn't apply.

While this changes internal handling of the MetricsFrame, it does NOT
break the RTVI/daily messaging of metrics. That format remains the same.

Also. Remember to use model_name for accessing a service's current
model and set_model_name for setting it.
2024-09-19 21:30:34 -04:00
Mattie Ruth
ed409d0460 Merge pull request #478 from pipecat-ai/ruthless/get-tests-running
Ruthless/get tests running
2024-09-19 21:01:27 -04:00
mattie ruth backman
50b45ac2da get the test infrastructure running again
disable broken tests for now
2024-09-19 20:58:17 -04:00
Kwindla Hultman Kramer
29bcbc68c5 Merge pull request #479 from pipecat-ai/khk/small-fixes
fix small issues that crept into main
2024-09-19 17:25:27 -07:00
Kwindla Hultman Kramer
affbe9ac7d fix small issues that crept into main 2024-09-19 17:17:33 -07:00
Aleix Conchillo Flaqué
1790fa452f Merge pull request #436 from pipecat-ai/aleix/frameprocessor-single-task
introduce synchronous and asynchronous frame processors
2024-09-19 11:22:56 -07:00
Aleix Conchillo Flaqué
607a246572 updated CHANGELOG with sync/async frame processors 2024-09-19 01:32:17 -07:00
Aleix Conchillo Flaqué
4f1b06e6b2 pipeline: renamed ParallelTask to SyncParallelPipeline 2024-09-19 01:32:17 -07:00
Aleix Conchillo Flaqué
62e9a33a70 examples: use CartesiaHttpTTSService to synchronize frames 2024-09-19 01:32:17 -07:00
Aleix Conchillo Flaqué
3298f935ef services(fal,moondream): add missing **kwargs 2024-09-19 01:32:17 -07:00
Aleix Conchillo Flaqué
0e8f56c752 services: move TTSService push_stop_frames to AsyncTTSService 2024-09-19 01:32:15 -07:00
Aleix Conchillo Flaqué
8224538372 services(cartesia): added CartesiaHttpTTSService 2024-09-19 01:31:12 -07:00
Aleix Conchillo Flaqué
fbf6eef68f transports(base_output): wait for sink tasks before canceling audio/video tasks 2024-09-19 01:31:12 -07:00
Aleix Conchillo Flaqué
f078d156de frames: StartFrame is now a SystemFrame 2024-09-19 01:31:12 -07:00
Aleix Conchillo Flaqué
23d6eed5ea transports: input()/output() return subclass instead of base class 2024-09-19 01:31:12 -07:00
Aleix Conchillo Flaqué
0ed3d118d6 services(moondream); update revision to 2024-08-26 2024-09-19 01:31:12 -07:00
Aleix Conchillo Flaqué
337f048864 introduce synchronous and asynchronous frame processors
Pipecat has a pipeline-based architecture. The pipeline consists of frame
processors linked to each other. The elements travelling across the pipeline are
called frames.

To have a deterministic behavior the frames travelling through the pipeline
should always be ordered, except system frames which are out-of-band frames. To
achieve that, each frame processor should only output frames from a single task.

There are synchronous and asynchronous frame processors. The synchronous
processors push output frames from the same task that they receive input frames,
and therefore only pushing frames from one task. Asynchrnous frame processors
can have internal tasks to perform things asynchrnously (e.g. receiving data
from a websocket) but they also have a single task where they push frames from.
2024-09-19 01:31:10 -07:00
Mark Backman
6f3c421621 Merge pull request #475 from pipecat-ai/mb/tts-sample-rate
Add sample_rate setting to TTS services
2024-09-18 14:59:09 -04:00
Mark Backman
eadd68d40b Add sample_rate setting to TTS services 2024-09-18 14:50:20 -04:00
Lewis Wolfgang
71202e3cd5 Remove torch dependency for using silero_vad 2024-09-17 16:48:52 -04:00
Jin Kim
75008d8f11 Add speed and emotion setting method to Cartesia TTS service 2024-09-18 00:51:45 +09:00
Jin Kim
2da0ecbe3c Revert "model_id" as a main argument 2024-09-18 00:38:12 +09:00
Jin Kim
c7f814b2dc Merge remote-tracking branch 'upstream/main' 2024-09-18 00:33:29 +09:00
Aleix Conchillo Flaqué
13a4a05388 Merge pull request #466 from pipecat-ai/aleix/elevenlabs-cartesia-close-websocket-first
services(cartesia,elevenlabs): close websocket before the receiving task
2024-09-16 23:55:28 -07:00
Aleix Conchillo Flaqué
20c019ae16 services(cartesia,elevenlabs): close websocket before the receiving task 2024-09-16 23:54:21 -07:00
Adrian Cowham
387a36dd8a missed a debug print statement 2024-09-16 17:43:42 -07:00
Aleix Conchillo Flaqué
d9d6571c73 Merge pull request #465 from kunal-cai/ks--fix-ws
[Cartesia] Fix streaming truncation bug with Twilio Fast API WS
2024-09-16 17:17:13 -07:00
Kunal Shah
540cad4844 Undo sorting 2024-09-16 16:07:19 -07:00
Kunal Shah
0a26b650c0 Undo sorting 2024-09-16 16:06:25 -07:00
Kunal Shah
adaac003e5 [Cartesia] Fix streaming truncation bug with Twilio Fast API WS 2024-09-16 15:59:06 -07:00
Adrian Cowham
2e02ab740d PR feedback 2024-09-15 20:59:17 -07:00
Aleix Conchillo Flaqué
3d4f125071 Merge pull request #454 from pipecat-ai/aleix/initial-pipeline-clock-support
initial pipeline clock support
2024-09-13 13:51:04 -07:00
Aleix Conchillo Flaqué
bce87f8717 update CHANGELOG.md 2024-09-13 13:50:03 -07:00
Aleix Conchillo Flaqué
1fe940bd6b servceis(cartesia,elevenlabs): use word start times instead 2024-09-13 13:10:44 -07:00
Aleix Conchillo Flaqué
cb36a71381 fix some linting 2024-09-13 09:56:12 -07:00
Aleix Conchillo Flaqué
5acc4928fe examples: add 07d-interruptible-elevenlabs.py 2024-09-13 09:43:18 -07:00
Aleix Conchillo Flaqué
434493b8aa services(elevenlabs): implement word-by-word support through websockets 2024-09-13 09:31:35 -07:00
Aleix Conchillo Flaqué
f08b25dbb2 examples: assistant aggregator should always goes after transport 2024-09-12 00:37:34 -07:00
Aleix Conchillo Flaqué
3665734972 transports(output): initial sink clock synchronization 2024-09-12 00:37:34 -07:00
Aleix Conchillo Flaqué
a98d78cdea services(lmnt): change to subclass of AsyncTTSService 2024-09-12 00:37:34 -07:00
Aleix Conchillo Flaqué
80f6d74e80 services(cartesia): change to subclass of AsyncWordTTSService 2024-09-12 00:37:34 -07:00
Aleix Conchillo Flaqué
02d926e9bd services: create AsyncTTSService and AsyncWordTTSService 2024-09-12 00:31:48 -07:00
Aleix Conchillo Flaqué
7749692f72 processors: get pipeline clock from StartFrame 2024-09-12 00:31:48 -07:00
Aleix Conchillo Flaqué
7807cbeb39 pipeline(task): add a clock to the pipeline task 2024-09-12 00:31:48 -07:00
Aleix Conchillo Flaqué
72f231b327 frames: add a presentation timestamp (pts) to each frame 2024-09-12 00:31:48 -07:00
Aleix Conchillo Flaqué
3cbe97d346 clocks: added new BaseClock and SystemClock 2024-09-12 00:31:48 -07:00
Adrian Cowham
b4eff2028f Merge branch 'main' into recording 2024-09-10 10:18:57 -07:00
Adrian Cowham
f411bf33fd adding a frame processor with the ability to save a conversation to a buffer and another frame processor to upload audio to Canonical for evaluation and metrics collection. Examples included 2024-09-10 10:15:48 -07:00
Kwindla Hultman Kramer
b880e1a60e Merge pull request #448 from pipecat-ai/khk/aggregation-leading-space
fix for leading space in context aggregator strings
2024-09-10 09:57:35 -07:00
Aleix Conchillo Flaqué
886046e696 Merge pull request #445 from dleybz/patch-1
Update requirements.txt
2024-09-09 17:54:33 -07:00
Aleix Conchillo Flaqué
9106a5f8ae Merge pull request #449 from pipecat-ai/aleix/audio-out-bitrate
transports(daily): allow setting audio output bitrate (default 96kpbs)
2024-09-09 08:39:06 -07:00
Aleix Conchillo Flaqué
98286336bf transports(daily): allow setting audio output bitrate (default 96kpbs)
Fixes #388
2024-09-08 19:39:17 -07:00
Jin Kim
fa0deededa Add voice options and make to use InputParams for Cartesia. 2024-09-09 10:53:23 +09:00
Kwindla Hultman Kramer
081b001c8b fix for leading space in context aggregator strings 2024-09-07 16:42:52 -07:00
Danny D. Leybzon
c92531a02f Update requirements.txt
request.form() throws an error if you don't have python-multipart installed
2024-09-06 20:22:18 +02:00
Aleix Conchillo Flaqué
748a7af602 update CHANGELOG.md 2024-09-05 19:05:29 -07:00
Aleix Conchillo Flaqué
f4a0de6327 Merge pull request #444 from pipecat-ai/aleix/elevenlabs-streaming
services(elevenlabs): add elevenlabs package and use streaming
2024-09-05 11:24:12 -07:00
Aleix Conchillo Flaqué
e405d7af9f services(elevenlabs): add elevenlabs package and use streaming 2024-09-05 11:20:01 -07:00
Aashraya
51cd7fd285 twiliohandle interruption (#422)
* add interuption handler in twilio serializer

* fix autopep8

* revert ruff autoformatting

* address pr comments

* change interruption frame to user started frame in serializer

* remove overrrident handle interrupt

* remove unused import

* change userstarted to interuption frame
2024-09-02 11:06:38 -07:00
Aleix Conchillo Flaqué
aba5f89174 Merge pull request #437 from soof-golan/soof-obj-id-generation
Generate ids with itertools.count
2024-09-02 10:53:48 -07:00
Soof Golan
5c0f5a1613 Generate ids with itertools.count
Avoids the critical section with threading.Lock in favor of itertools.count.

`count` objects are threadsafe, and their critical section is implemented in C and provide better performance that Python level locking.
2024-09-02 15:39:58 +02:00
Aleix Conchillo Flaqué
7c342f7ba2 Merge pull request #433 from pipecat-ai/aleix/process-all-startframes
StartFrame should be the first frame every processor receives
2024-08-30 14:17:38 -07:00
Aleix Conchillo Flaqué
37e2388758 StartFrame should be the first frame every processor receives
Fixes #427
2024-08-29 22:43:44 -07:00
Aleix Conchillo Flaqué
05f0492a8d Merge pull request #421 from pipecat-ai/aleix/improve-multi-lingual-support
improve multi lingual support
2024-08-29 13:19:40 -07:00
Aleix Conchillo Flaqué
c0ac5c6ae8 services(lmnt): fix example and update README and CHANGELOG 2024-08-29 11:11:24 -07:00
Aleix Conchillo Flaqué
be923687fb processors(rtvi): user decices if bot interrupts on update config 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
5f32fb125d updated CHANGELOG.md 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
ae6fbb3146 services: just set model, voice, language independently 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
864768635a services: add voice and language to set_model() 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
d7c9679977 services: allow TTSModelUpdateFrame to also update language and voice 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
fedfc366f6 services(deepgram): fix strenum values 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
b3b39626e1 services: allow switching STT language and mdoel at the same time 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
4e0ece17b6 services: added support for setting STT model and language 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
fd3fdacdee transcriptions: added more languages 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
a253606d50 services(daily): on_joined now returns all data not only participant 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
568d9dc0a3 services(whisper): inherit from SegmentedSTTService 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
6629b853c5 services(deepgram): inherit from STTService instead of AsyncAIService 2024-08-29 11:00:03 -07:00
Aleix Conchillo Flaqué
3931cb3235 services(cartesia): allow setting language and language voices 2024-08-29 11:00:01 -07:00
Aleix Conchillo Flaqué
38cd86ad52 services: added language to transcription frames 2024-08-29 10:59:02 -07:00
Aleix Conchillo Flaqué
c0cdabf61d frames: adde TTSLanguageUpdateFrame and TTSLanguageVoicesUpdateFrame 2024-08-29 10:59:02 -07:00
Aleix Conchillo Flaqué
51270a96c5 frames: add language to transcription frames 2024-08-29 10:59:02 -07:00
Kwindla Hultman Kramer
84d72c0d5c Merge pull request #425 from pipecat-ai/khk/rtvi-together-function-calling
fixup type mismatches between rtvi data structures and together.py
2024-08-28 13:11:52 -07:00
Aleix Conchillo Flaqué
79aca8169a Merge pull request #391 from sharvil/pr/add-lmnt
LMNT TTS
2024-08-27 21:40:46 -07:00
Kwindla Hultman Kramer
b9d362bd62 fixup type mismatches between rtvi data structures and together.py 2024-08-27 17:39:21 -07:00
Sharvil Nanavati
87c4a1bee1 Move stop frame task creation into TTSService.start 2024-08-27 04:45:21 +00:00
Sharvil Nanavati
c979762b70 Handle cancellation, stopping, and restarting 2024-08-27 01:24:00 +00:00
Sharvil Nanavati
1d92fc3199 Merge branch 'main' into pr/add-lmnt 2024-08-24 10:07:52 -07:00
Sharvil Nanavati
8ac7fb1a67 Use a single long-lived Task to push TTSStoppedFrame 2024-08-24 16:18:07 +00:00
Sharvil Nanavati
60c3d33def Default LMNT to 24kHz, add example 2024-08-24 15:40:29 +00:00
Sharvil Nanavati
8a39d3f4eb services: add a generic mechanism to produce TTSStoppedFrames 2024-08-24 15:40:12 +00:00
Aleix Conchillo Flaqué
e038767b6f Merge pull request #413 from pipecat-ai/aleix/pipecat-0.0.41
prepare pipecat 0.0.41
2024-08-22 17:01:43 -07:00
Aleix Conchillo Flaqué
0c46b3e481 prepare pipecat 0.0.41 2024-08-22 11:50:20 -07:00
Aleix Conchillo Flaqué
d42f072ff5 examples: fix studypal errors and update requirements 2024-08-22 11:50:05 -07:00
Aleix Conchillo Flaqué
9b6f29c24a Merge pull request #414 from pipecat-ai/aleix/add-livekit-dependency
added livekit dependency
2024-08-22 10:55:43 -07:00
Aleix Conchillo Flaqué
873d5dc23f added livekit dependency 2024-08-22 10:54:18 -07:00
Aleix Conchillo Flaqué
6d141fd47f Merge pull request #396 from nulyang/feat/livekit-serializers
Add livekit audio serializers
2024-08-22 10:44:24 -07:00
Aleix Conchillo Flaqué
c6f6cb2947 Merge pull request #412 from pipecat-ai/aleix/fastapi-variable-clash
transports(fastapi): fix variable name clash
2024-08-22 09:50:23 -07:00
Aleix Conchillo Flaqué
0eb189ce7f transports(fastapi): fix variable name clash 2024-08-22 08:50:03 -07:00
Sharvil Nanavati
f4fd7b7028 LMNT TTS 2024-08-22 00:47:41 +00:00
Aleix Conchillo Flaqué
21de8e0a35 transport(out): log bot started/stopped speaking 2024-08-21 17:23:44 -07:00
Aleix Conchillo Flaqué
6f55d494bd frames: use VADParams type in VADParamsUpdateFrame 2024-08-21 17:23:12 -07:00
Aleix Conchillo Flaqué
d216edc567 Merge pull request #409 from aashsach/anthropic-empty-tool-argument
handle empty parameters for anthropic function calling
2024-08-21 16:14:51 -07:00
Aashraya
ec6063ecc4 system is not a list, it is handled and assisgned as string 2024-08-21 16:31:50 +05:30
Aashraya
40fe4ce6fb handle empty parameters for anthropic function calling 2024-08-21 15:49:36 +05:30
Aleix Conchillo Flaqué
31d87a4048 update CHANGELOG.md for 0.0.40 2024-08-20 11:48:40 -07:00
Aleix Conchillo Flaqué
ac8b171fa9 Merge pull request #406 from pipecat-ai/hush/cartesiaDocs
Hush/cartesia docs
2024-08-20 11:17:52 -07:00
James Hush
1f06d78213 github: remove *requirements.txt from tests.yaml 2024-08-20 11:16:25 -07:00
James Hush
28eba17df8 docs: update Cartesia references 2024-08-20 11:13:13 -07:00
Aleix Conchillo Flaqué
dfc2e62339 Merge pull request #405 from pipecat-ai/aleix/revert-dailysettings-aliases
Revert "transports(daily): use aliases in DailyDialinSettings"
2024-08-20 08:53:31 -07:00
Aleix Conchillo Flaqué
80c89a39c9 processors(rtvi): add support for client-ready message (fix) 2024-08-20 07:54:11 -07:00
Aleix Conchillo Flaqué
9d1c16e996 Revert "transports(daily): use aliases in DailyDialinSettings"
This reverts commit 47d375309d.
2024-08-20 07:52:35 -07:00
Aleix Conchillo Flaqué
86604c2353 examples(studypal): use aiohttp instead of requests 2024-08-19 18:11:30 -07:00
Aleix Conchillo Flaqué
8f31a02938 Merge pull request #403 from yashn35/studypal-demo
Add studypal
2024-08-19 17:39:19 -07:00
Aleix Conchillo Flaqué
47d375309d transports(daily): use aliases in DailyDialinSettings 2024-08-19 17:27:43 -07:00
Yash Narayan
980265ca97 Add studypal 2024-08-19 16:58:29 -07:00
Aleix Conchillo Flaqué
90479fff95 processors(rtvi): add set_client_ready() 2024-08-19 16:41:43 -07:00
Aleix Conchillo Flaqué
1ce1fcb0ce Merge pull request #401 from pipecat-ai/aleix/use-cartesia-in-more-examples
examples: use Cartesia TTS in most examples
2024-08-19 16:07:35 -07:00
Aleix Conchillo Flaqué
1a662376fc examples: use Cartesia TTS in most examples 2024-08-19 15:31:34 -07:00
Aleix Conchillo Flaqué
1d24f926ec Merge pull request #400 from pipecat-ai/aleix/rtvi-client-ready
processors(rtvi): add support for client-ready message
2024-08-19 10:53:49 -07:00
Aleix Conchillo Flaqué
4f2c37c940 processors(rtvi): add support for client-ready message 2024-08-19 10:33:18 -07:00
Aleix Conchillo Flaqué
042115a6bb processors(rtvi): update initial config when sending bot-ready message 2024-08-19 09:32:27 -07:00
Aleix Conchillo Flaqué
c9f1469b41 transports(daily/helpers): add server error message to the logs 2024-08-19 08:44:05 -07:00
Aleix Conchillo Flaqué
54c9f604c9 updated CHANGELOG with VADParamsUpdateFrame 2024-08-18 21:20:40 -07:00
Kwindla Hultman Kramer
56fbcd6562 Merge pull request #397 from pipecat-ai/khk/rtvi-vad-params
VADParamsUpdateFrame and handling thereof
2024-08-18 21:14:58 -07:00
Kwindla Hultman Kramer
e6b0500568 make VADAnalyzer:set_params() public 2024-08-18 21:11:18 -07:00
Aleix Conchillo Flaqué
41038b6673 Merge pull request #394 from pipecat-ai/aleix/fix-function-calling-examples
fix function calling examples
2024-08-18 20:55:29 -07:00
Aleix Conchillo Flaqué
26d03f26c9 services(openai, anthropic): a None result should not run inference 2024-08-18 20:48:43 -07:00
Aleix Conchillo Flaqué
f3a4e54996 function calling: start callback should have function name first 2024-08-18 20:48:20 -07:00
Kwindla Hultman Kramer
925e80bb20 VADParamsUpdateFrame and handling thereof 2024-08-18 13:34:46 -07:00
nulyang
9bda09b1a8 serializers(livekit): Add audio serializers 2024-08-18 23:40:32 +08:00
Aleix Conchillo Flaqué
ef0d0531fa services: moved request_image_frame() to LLMService 2024-08-17 23:59:38 -07:00
Aleix Conchillo Flaqué
6520f20ffe fix function calling examples 2024-08-17 23:32:39 -07:00
Aleix Conchillo Flaqué
ebc4e0924b Merge pull request #387 from pipecat-ai/aleix/update-reqs-081624
update pyproject.toml and remove requirements files
2024-08-17 23:29:47 -07:00
Aleix Conchillo Flaqué
9e7c0e6033 Merge pull request #390 from sharvil/pr/websocket-fix
transports(websocket): fix `_audio_buffer` being accidentally overwritten
2024-08-17 23:26:35 -07:00
Aleix Conchillo Flaqué
cf5720f316 update CHANGELOG.md 2024-08-17 21:00:32 -07:00
Kwindla Hultman Kramer
655b468269 Merge pull request #393 from pipecat-ai/khk/anthropic-tools-ordering
fix for out-of-order image messages in anthropic context
2024-08-17 15:07:27 -07:00
Kwindla Hultman Kramer
17f8c93e44 fix for out-of-order image messages in anthropic context 2024-08-17 14:47:29 -07:00
Aleix Conchillo Flaqué
5b4061b0d5 processors(rtvi): fix send_error() 2024-08-16 23:46:57 -07:00
Aleix Conchillo Flaqué
6ce0227e98 processors(rtvi): error-response should always include and error 2024-08-16 23:23:55 -07:00
Aleix Conchillo Flaqué
a583a28850 processors(rtvi): error message should use error field 2024-08-16 23:22:27 -07:00
Aleix Conchillo Flaqué
32daf65adc processors(rtvi): send to the client if errors are fatal 2024-08-16 23:17:55 -07:00
Aleix Conchillo Flaqué
e22c80610e frames: add new FatalErrorFrame 2024-08-16 23:17:31 -07:00
Sharvil Nanavati
374f1e7e01 transports(websocket): fix _audio_buffer being accidentally overwritten
`BaseOutputTransport` declares an `_audio_buffer` instance variable.
`WebsocketServerOutputTransport` accidentally reuses that variable
internally assuming it's class-local and not inherited.

This PR renames the variable in `WebsocketServerOutputTransport`
to avoid the name collision.
2024-08-17 05:28:05 +00:00
Aleix Conchillo Flaqué
d2dfa93bf1 processors(rtvi): send bot-ready when participant joins 2024-08-16 13:58:21 -07:00
Aleix Conchillo Flaqué
fa8c6712c6 transports(daily): fix multiple DailyTransport initialization 2024-08-16 13:32:34 -07:00
Aleix Conchillo Flaqué
4c2b84cb4d update pyproject.toml and remove requirements files 2024-08-16 09:28:46 -07:00
Aleix Conchillo Flaqué
b57c9d569b Merge pull request #352 from pipecat-ai/aleix/rtvi-0.1
processors(rtvi): rtvi 0.1 message protocol
2024-08-15 17:35:50 -07:00
Aleix Conchillo Flaqué
f0e50ba000 Merge pull request #336 from nulyang/fix/azure-transcriptionframe
services(azure): fix TranscriptionFrame parameter type
2024-08-15 17:08:56 -07:00
Mattie Ruth
4a6638f749 Merge pull request #385 from pipecat-ai/mrkb/anthropic-beta-caching
Mrkb/anthropic beta caching
2024-08-15 18:26:51 -04:00
Aleix Conchillo Flaqué
31577252f3 processors(rtvi): handle ErrorFrames 2024-08-15 15:23:31 -07:00
Aleix Conchillo Flaqué
5d71c50080 transports(daily): make sure audio_in_task exists before canceling 2024-08-15 15:23:07 -07:00
Aleix Conchillo Flaqué
981269d594 pipeline(task): process ErrorFrame in same task and stop pipeline task 2024-08-15 15:22:40 -07:00
mattie ruth backman
848db985fc bump anthropic in 3.10 requirements 2024-08-15 16:51:48 -04:00
mattie ruth backman
d5d8e31447 add cache tokens to metrics event 2024-08-15 16:51:48 -04:00
Aleix Conchillo Flaqué
66670a2370 Merge pull request #384 from pipecat-ai/aleix/enable-prompt-caching-frames
services(anthropic): allow setting enable prompt caching via frame
2024-08-15 13:26:39 -07:00
Aleix Conchillo Flaqué
5637f349c6 services(anthropic): allow setting enable prompt caching via frame 2024-08-15 12:43:29 -07:00
Aleix Conchillo Flaqué
93248e1d00 Merge pull request #382 from pipecat-ai/khk/anthropic-beta-caching
Support for Anthropic prompt caching beta
2024-08-15 12:34:54 -07:00
Kwindla Hultman Kramer
187769357f update version number of anthropic dependency 2024-08-15 12:28:41 -07:00
Aleix Conchillo Flaqué
5be6422cc8 Revert "processors(rtvi): process options in the order they are defined"
This reverts commit 61ac83e2d9.
2024-08-15 11:51:00 -07:00
Aleix Conchillo Flaqué
8670b2d994 utils: add match_endofsentence and use it in processors 2024-08-15 11:26:25 -07:00
Aleix Conchillo Flaqué
0bc6db428d processors(rtvi): implement bot-started-speaking and bot-stopped-speaking 2024-08-15 11:05:10 -07:00
Aleix Conchillo Flaqué
67d565930e services: send TTSStartFrame/TTSStopFrame when really needed 2024-08-15 11:05:10 -07:00
Aleix Conchillo Flaqué
b2a7ff6fd3 processors(rtvi): all transport messages should be urgent 2024-08-15 11:05:10 -07:00
Aleix Conchillo Flaqué
425a730d7c transports(base_output): send urgent transport messages immediately 2024-08-15 11:05:10 -07:00
Aleix Conchillo Flaqué
84c5709722 frames: add urgent field to TransportMessageFrame 2024-08-15 11:05:10 -07:00
Kwindla Hultman Kramer
94deec01c9 okay, both files now 2024-08-15 00:57:10 -07:00
Kwindla Hultman Kramer
6e0dd4a779 Anthropic beta prompt caching 2024-08-15 00:54:43 -07:00
Kwindla Hultman Kramer
14bde340dd Merge pull request #381 from pipecat-ai/khk/anthropic-fixup-0814.2
Fixup anthropic context set_messages
2024-08-14 23:34:31 -07:00
Kwindla Hultman Kramer
253765c611 and fixing anthropic demos 2024-08-14 23:14:20 -07:00
Kwindla Hultman Kramer
2b26d7182f replaces 379 2024-08-14 22:40:09 -07:00
Aleix Conchillo Flaqué
61ac83e2d9 processors(rtvi): process options in the order they are defined 2024-08-14 22:26:49 -07:00
Aleix Conchillo Flaqué
d5c7b28cad Merge pull request #380 from pipecat-ai/aleix/rtvi-0.1-context-aggregators-updates
processors(aggregators): multiple LLM aggregators updates
2024-08-14 20:43:50 -07:00
Aleix Conchillo Flaqué
959580a708 processors(logger): fix linting 2024-08-14 20:39:24 -07:00
Aleix Conchillo Flaqué
3a5cd17ea3 processors(aggregators): multiple LLM aggregators updates 2024-08-14 20:23:18 -07:00
Kwindla Hultman Kramer
b78981bb9d Merge pull request #374 from pipecat-ai/khk/together
Together.ai service implementation with Llama 3.1 function calling
2024-08-14 17:29:07 -07:00
Kwindla Hultman Kramer
a6d90b0a00 linting fixes to anthropic.py 2024-08-14 17:27:00 -07:00
Aleix Conchillo Flaqué
67016492f2 transports(daily/helpers): add delete_room_from_url() 2024-08-14 17:14:02 -07:00
Aleix Conchillo Flaqué
2c38089527 processors(rtvi): handle incoming messages in a separate task 2024-08-14 15:34:02 -07:00
Kwindla Hultman Kramer
48f68ba6dc Service for together.ai, including Llama 3.1 function calling support 2024-08-13 15:01:54 -07:00
Aleix Conchillo Flaqué
574df4ba3d processors(rtvi): make sure to send bot-ready when transport is joined 2024-08-13 13:25:15 -07:00
Aleix Conchillo Flaqué
49ca16d125 pipeline(task): only send initial metrics frames if metrics enabled 2024-08-13 12:22:37 -07:00
Aleix Conchillo Flaqué
87525b085e processors(rtvi): linting and make send_error() public 2024-08-13 11:21:51 -07:00
Aleix Conchillo Flaqué
6b53c6add3 transports(daily): DailyTransport default DailyParams 2024-08-13 11:13:18 -07:00
Kwindla Hultman Kramer
29ca1b7855 Anthropic tool use core Pipecat pieces refactored (#369)
* processors(rtvi): rtvi 0.1 message protocol

* added a single function call handler

* wip - function calling

* fixup

* fixup

* fixup

* processors(rtvi): no need for configure_on_start()

* processors(rtvi): add new option values if they haven't been set yet

* Add the model name to the LLM usage metrics

* wip - anthropic tool calling

* still wip - anthropic tool use and vision

* anthropic tools and vision working

* anthropic tool calling and vision

* Cartesia error handling

* Anthropic tool use core Pipecat pieces refactored as per plan

* aleix has good ideas

* Usage metrics for Anthropic LLMs

* fix function call result state not getting cleared bug

* Pass **kwargs through from AnthropicLLMService constructor

* about to tinker with anthropic

* added openai function calling

* openai function calling

* fixup

---------

Co-authored-by: Aleix Conchillo Flaqué <aleix@daily.co>
Co-authored-by: Chad Bailey <chadbailey@gmail.com>
Co-authored-by: mattie ruth backman <mattieruth@gmail.com>
Co-authored-by: chadbailey59 <chadbailey59@users.noreply.github.com>
2024-08-13 13:01:24 -05:00
Aleix Conchillo Flaqué
a42d0c9907 processors(rtvi): add interrupt_bot() 2024-08-13 09:22:43 -07:00
marcus-daily
8bc6ceaa3d Fixing pep8 2024-08-13 15:32:23 +01:00
marcus-daily
0b8a1ab5d1 Handle describe-actions message 2024-08-13 15:32:23 +01:00
Brian Hill
358c287db2 chore: Enable build without git 2024-08-12 11:38:41 -04:00
Brian Hill
2e68453655 Merge pull request #371 from pipecat-ai/cbrianhill/allow-build-without-git
chore: Enable build without git
2024-08-12 10:15:55 -04:00
Brian Hill
89b8a9de7d chore: Enable build without git 2024-08-12 09:36:25 -04:00
Aleix Conchillo Flaqué
c4c2058df9 processors(rtvi): handle frames pushed from outside in order 2024-08-11 23:09:11 -07:00
Aleix Conchillo Flaqué
0d85c0085f processors(rtvi): interrupt the bot if a new config is received 2024-08-11 23:09:11 -07:00
Mattie Ruth
6fa8a8f84f Merge pull request #365 from pipecat-ai/ruthless/metrics 2024-08-11 20:35:05 -04:00
mattie ruth backman
a97775bff3 Add the model name to the LLM usage metrics 2024-08-11 12:08:46 -04:00
Aleix Conchillo Flaqué
32640e054d processors(rtvi): add new option values if they haven't been set yet 2024-08-10 21:25:39 -07:00
Aleix Conchillo Flaqué
aa42da5658 processors(rtvi): no need for configure_on_start() 2024-08-10 21:25:21 -07:00
nulyang
900a94a825 services(azure): fix TranscriptionFrame parameter type 2024-08-10 13:00:03 +08:00
Aleix Conchillo Flaqué
c37552de70 processors(rtvi): add support for action responses 2024-08-09 18:12:37 -07:00
Aleix Conchillo Flaqué
916b37926c processors(rtvi): rtvi 0.1 message protocol 2024-08-09 17:24:38 -07:00
Aleix Conchillo Flaqué
2b76c3c15a update macos-py3.10-requirements 2024-08-09 17:18:30 -07:00
Aleix Conchillo Flaqué
cedd7dde18 update linux-py3.10-requirements.txt 2024-08-09 17:14:46 -07:00
Lewis Wolfgang
d088608d8e Merge pull request #340 from pipecat-ai/lewis/silero-vad-via-pip
Install Silero VAD via pip
2024-08-09 13:27:29 -04:00
Aleix Conchillo Flaqué
06ee29bb8b Merge pull request #359 from pipecat-ai/aleix/twilio-elevenlabs-sample-rates
twilio and elevenlabs sample rates
2024-08-09 09:38:35 -07:00
Aleix Conchillo Flaqué
d255e954d6 services(elevenlabs): allow specifying output_format 2024-08-09 09:38:20 -07:00
Aleix Conchillo Flaqué
6a7ab6b8ac serializers(twilio): allow specifying input and output sample rates 2024-08-09 09:37:51 -07:00
Aleix Conchillo Flaqué
45b18cc0b1 Merge pull request #358 from pipecat-ai/aleix/daily-create-room-exp-fixes
transports(daily): fixed create_room expirations
2024-08-09 09:37:01 -07:00
Aleix Conchillo Flaqué
0479431f0a Merge pull request #357 from pipecat-ai/aleix/daily-on-participant-updated
transports(daily): added on_participant_updated event
2024-08-09 09:36:46 -07:00
Aleix Conchillo Flaqué
ec58dbd791 transports(daily): added on_participant_updated event
Fixes #353
2024-08-09 09:36:24 -07:00
Aleix Conchillo Flaqué
91de68aab3 Merge pull request #355 from pipecat-ai/aleix/usage-metrics-update
processors(base): add start_llm_usage_metrics and start_tts_usage_met…
2024-08-09 09:35:36 -07:00
Aleix Conchillo Flaqué
85efc30145 Merge pull request #356 from pipecat-ai/aleix/eleven_turbo_v2_5
services(elevenlabs): update default model to eleven_turbo_v2_5
2024-08-09 09:34:47 -07:00
Aleix Conchillo Flaqué
0032594f21 transports(daily): fixed create_room expirations
Fixes #348
2024-08-08 22:04:22 -07:00
Aleix Conchillo Flaqué
829fdc5679 services(elevenlabs): update default model to eleven_turbo_v2_5
Fixes #349
2024-08-08 21:38:18 -07:00
Aleix Conchillo Flaqué
22e176e329 processors(base): add start_llm_usage_metrics and start_tts_usage_metrics 2024-08-08 16:46:56 -07:00
Lewis Wolfgang
826a70a137 Merge pull request #354 from pipecat-ai/lewis/delete_room_by_name
Add delete_room_by_name to DailyRESTHelper
2024-08-08 17:09:21 -04:00
Lewis Wolfgang
dd0ea674af Treat 404 (room not found) as a success for deletion 2024-08-08 16:57:58 -04:00
Lewis Wolfgang
a4761b8921 Add delete_room_by_name to DailyRESTHelper 2024-08-08 16:31:01 -04:00
chadbailey59
3958bb7903 Additional LLM and TTS metrics (#343)
* added llm and tts usage metrics

* Metrics debug logging

* cleanup
2024-08-07 08:55:51 -05:00
Aleix Conchillo Flaqué
83a037a7ce Merge pull request #345 from pipecat-ai/aleix/base-output-render-time-fixes
transports(base_output): improve render sleep computation
2024-08-06 17:30:47 -07:00
Aleix Conchillo Flaqué
a3eb8337a6 Merge pull request #342 from pipecat-ai/aleix/base-output-transport-push-audio
transport(base_output): push audio downstream
2024-08-06 17:30:32 -07:00
Aleix Conchillo Flaqué
541072f8e0 transports(base_output): improve render sleep computation 2024-08-06 17:20:41 -07:00
Aleix Conchillo Flaqué
881248cbd6 transport(base_output): push audio downstream 2024-08-05 14:00:09 -07:00
Aleix Conchillo Flaqué
d4979f5e64 Merge pull request #337 from pipecat-ai/aleix/audio-video-sync-and-gstreamer
audio/video sync and gstreamer
2024-08-05 09:28:11 -07:00
Aleix Conchillo Flaqué
4133cd03bb processors(gstreamer): add clock_sync property 2024-08-05 09:23:25 -07:00
Lewis Wolfgang
9f07c3ca27 Fly.io example: remove step to cache silero models.
No longer necessary.
2024-08-05 10:12:35 -04:00
Lewis Wolfgang
b20bacb9ed Remove no longer needed code 2024-08-05 10:10:39 -04:00
Lewis Wolfgang
97cfbfee1d Install silero via pip 2024-08-05 10:01:27 -04:00
Aleix Conchillo Flaqué
fa7c941792 examples(gstreamer): add new GStreamer examples 2024-08-04 12:29:36 -07:00
Aleix Conchillo Flaqué
4738879f32 processors(gstreamer): add new GStreamerPipelineSource 2024-08-04 12:29:34 -07:00
Aleix Conchillo Flaqué
d5d88f756a transport(output): improve audio and image handling for video use cases 2024-08-04 12:29:08 -07:00
Aleix Conchillo Flaqué
65b136bf15 Merge pull request #334 from pipecat-ai/aleix/cleanup-examples-remove-requests
cleanup examples and remove requests
2024-08-01 22:05:01 -07:00
Aleix Conchillo Flaqué
bee0b238e4 examples(storytelling-chatbot): include package-lock.json 2024-08-01 18:23:30 -07:00
Aleix Conchillo Flaqué
c891168ffb services: revert optional aiohttp.ClientSession 2024-08-01 18:22:56 -07:00
Aleix Conchillo Flaqué
6376c2f6aa transport(websocket): fix cancel 2024-08-01 18:09:16 -07:00
Aleix Conchillo Flaqué
4d9b7cdd61 DailyRESTHelper now receives an aiohttp client session 2024-08-01 18:08:57 -07:00
Aleix Conchillo Flaqué
8263d1dd6f update CHANGELOG with latest changes 2024-07-31 23:44:07 -07:00
Aleix Conchillo Flaqué
faf41c0b36 services: ignore yielded None values 2024-07-31 23:41:03 -07:00
Aleix Conchillo Flaqué
27a09c0b2c cleanup examples and remove requests library 2024-07-31 23:39:51 -07:00
Aleix Conchillo Flaqué
3db7f6a284 Merge pull request #333 from pipecat-ai/aleix/allow-internal-http-sessions-rebased
services: allow internal http sessions if none is given
2024-07-31 21:57:00 -07:00
Aleix Conchillo Flaqué
3bfeb5b5ef services: allow internal http sessions if none is given 2024-07-31 21:56:19 -07:00
Aleix Conchillo Flaqué
62a7a555b5 Merge pull request #330 from pipecat-ai/aleix/stop-and-cancel-are-different
EndFrame tries to end gracefully CancelFrame cancels tasks
2024-07-31 15:51:29 -07:00
301 changed files with 28252 additions and 10326 deletions

View File

@@ -1,4 +1,4 @@
name: lint
name: format
on:
workflow_dispatch:
@@ -12,12 +12,12 @@ on:
- "docs/**"
concurrency:
group: build-lint-${{ github.event.pull_request.number || github.ref }}
group: build-format-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
autopep8:
name: "Formatting lints"
ruff-format:
name: "Formatting checker"
runs-on: ubuntu-latest
steps:
- name: Checkout repo
@@ -25,7 +25,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Setup virtual environment
run: |
python -m venv .venv
@@ -34,11 +34,8 @@ jobs:
source .venv/bin/activate
python -m pip install --upgrade pip
pip install -r dev-requirements.txt
- name: autopep8
id: autopep8
- name: Ruff formatter
id: ruff
run: |
source .venv/bin/activate
autopep8 --max-line-length 100 --exit-code -r -d --exclude "*_pb2.py" -a -a src/
- name: Fail if autopep8 requires changes
if: steps.autopep8.outputs.exit-code == 2
run: exit 1
ruff format --diff

View File

@@ -20,21 +20,24 @@ jobs:
name: "Unit and Integration Tests"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Checkout repo
uses: actions/checkout@v4
- name: Set up Python
id: setup_python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Cache virtual environment
uses: actions/cache@v3
with:
# We are hashing requirements-dev.txt and requirements-extra.txt which
# contain all dependencies needed to run the tests and examples.
key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version}}-${{ hashFiles('linux-py3.10-requirements.txt') }}-${{ hashFiles('dev-requirements.txt') }}
# We are hashing dev-requirements.txt and test-requirements.txt which
# contain all dependencies needed to run the tests.
key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version}}-${{ hashFiles('dev-requirements.txt') }}-${{ hashFiles('test-requirements.txt') }}
path: .venv
- name: Install system packages
run: sudo apt-get install -y portaudio19-dev
id: install_system_packages
run: |
sudo apt-get install -y portaudio19-dev
- name: Setup virtual environment
run: |
python -m venv .venv
@@ -42,8 +45,8 @@ jobs:
run: |
source .venv/bin/activate
python -m pip install --upgrade pip
pip install -r linux-py3.10-requirements.txt -r dev-requirements.txt
pip install -r dev-requirements.txt -r test-requirements.txt
- name: Test with pytest
run: |
source .venv/bin/activate
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
pytest --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests

1
.gitignore vendored
View File

@@ -4,6 +4,7 @@ __pycache__/
*~
venv
.venv
/.idea
#*#
# Distribution / packaging

View File

@@ -1,14 +1,503 @@
# Changelog
All notable changes to **pipecat** will be documented in this file.
All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [0.0.49] - 2024-11-17
### Added
- Added RTVI `on_bot_started` event which is useful in a single turn
interaction.
- Added `DailyTransport` events `dialin-connected`, `dialin-stopped`,
`dialin-error` and `dialin-warning`. Needs daily-python >= 0.13.0.
- Added `RimeHttpTTSService` and the `07q-interruptible-rime.py` foundational
example.
- Added `STTMuteFilter`, a general-purpose processor that combines STT
muting and interruption control. When active, it prevents both transcription
and interruptions during bot speech. The processor supports multiple
strategies: `FIRST_SPEECH` (mute only during bot's first
speech), `ALWAYS` (mute during all bot speech), or `CUSTOM` (using provided
callback).
- Added `STTMuteFrame`, a control frame that enables/disables speech
transcription in STT services.
## [0.0.48] - 2024-11-10 "Antonio release"
### Added
- There's now an input queue in each frame processor. When you call
`FrameProcessor.push_frame()` this will internally call
`FrameProcessor.queue_frame()` on the next processor (upstream or downstream)
and the frame will be internally queued (except system frames). Then, the
queued frames will get processed. With this input queue it is also possible
for FrameProcessors to block processing more frames by calling
`FrameProcessor.pause_processing_frames()`. The way to resume processing
frames is by calling `FrameProcessor.resume_processing_frames()`.
- Added audio filter `NoisereduceFilter`.
- Introduce input transport audio filters (`BaseAudioFilter`). Audio filters can
be used to remove background noises before audio is sent to VAD.
- Introduce output transport audio mixers (`BaseAudioMixer`). Output transport
audio mixers can be used, for example, to add background sounds or any other
audio mixing functionality before the output audio is actually written to the
transport.
- Added `GatedOpenAILLMContextAggregator`. This aggregator keeps the last
received OpenAI LLM context frame and it doesn't let it through until the
notifier is notified.
- Added `WakeNotifierFilter`. This processor expects a list of frame types and
will execute a given callback predicate when a frame of any of those type is
being processed. If the callback returns true the notifier will be notified.
- Added `NullFilter`. A null filter doesn't push any frames upstream or
downstream. This is usually used to disable one of the pipelines in
`ParallelPipeline`.
- Added `EventNotifier`. This can be used as a very simple synchronization
feature between processors.
- Added `TavusVideoService`. This is an integration for Tavus digital twins.
(see https://www.tavus.io/)
- Added `DailyTransport.update_subscriptions()`. This allows you to have fine
grained control of what media subscriptions you want for each participant in a
room.
- Added audio filter `KrispFilter`.
### Changed
- The following `DailyTransport` functions are now `async` which means they need
to be awaited: `start_dialout`, `stop_dialout`, `start_recording`,
`stop_recording`, `capture_participant_transcription` and
`capture_participant_video`.
- Changed default output sample rate to 24000. This changes all TTS service to
output to 24000 and also the default output transport sample rate. This
improves audio quality at the cost of some extra bandwidth.
- `AzureTTSService` now uses Azure websockets instead of HTTP requests.
- The previous `AzureTTSService` HTTP implementation is now
`AzureHttpTTSService`.
### Fixed
- Websocket transports (FastAPI and Websocket) now synchronize with time before
sending data. This allows for interruptions to just work out of the box.
- Improved bot speaking detection for all TTS services by using actual bot
audio.
- Fixed an issue that was generating constant bot started/stopped speaking
frames for HTTP TTS services.
- Fixed an issue that was causing stuttering with AWS TTS service.
- Fixed an issue with PlayHTTTSService, where the TTFB metrics were reporting
very small time values.
- Fixed an issue where AzureTTSService wasn't initializing the specified
language.
### Other
- Add `23-bot-background-sound.py` foundational example.
- Added a new foundational example `22-natural-conversation.py`. This example
shows how to achieve a more natural conversation detecting when the user ends
statement.
## [0.0.47] - 2024-10-22
### Added
- Added `AssemblyAISTTService` and corresponding foundational examples
`07o-interruptible-assemblyai.py` and `13d-assemblyai-transcription.py`.
- Added a foundational example for Gladia transcription:
`13c-gladia-transcription.py`
### Changed
- Updated `GladiaSTTService` to use the V2 API.
- Changed `DailyTransport` transcription model to `nova-2-general`.
### Fixed
- Fixed an issue that would cause an import error when importing
`SileroVADAnalyzer` from the old package `pipecat.vad.silero`.
- Fixed `enable_usage_metrics` to control LLM/TTS usage metrics separately
from `enable_metrics`.
## [0.0.46] - 2024-10-19
### Added
- Added `audio_passthrough` parameter to `STTService`. If enabled it allows
audio frames to be pushed downstream in case other processors need them.
- Added input parameter options for `PlayHTTTSService` and
`PlayHTHttpTTSService`.
### Changed
- Changed `DeepgramSTTService` model to `nova-2-general`.
- Moved `SileroVAD` audio processor to `processors.audio.vad`.
- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has
been added.
- `PlayHTTTSService` now uses PlayHT websockets instead of HTTP requests.
- The previous `PlayHTTTSService` HTTP implementation is now
`PlayHTHttpTTSService`.
- `PlayHTTTSService` and `PlayHTHttpTTSService` now use a `voice_engine` of
`PlayHT3.0-mini`, which allows for multi-lingual support.
- Renamed `OpenAILLMServiceRealtimeBeta` to `OpenAIRealtimeBetaLLMService` to
match other services.
### Deprecated
- `LLMUserResponseAggregator` and `LLMAssistantResponseAggregator` are
mostly deprecated, use `OpenAILLMContext` instead.
- The `vad` package is now deprecated and `audio.vad` should be used
instead. The `avd` package will get removed in a future release.
### Fixed
- Fixed an issue that would cause an error if no VAD analyzer was passed to
`LiveKitTransport` params.
- Fixed `SileroVAD` processor to support interruptions properly.
### Other
- Added `examples/foundational/07-interruptible-vad.py`. This is the same as
`07-interruptible.py` but using the `SileroVAD` processor instead of passing
the `VADAnalyzer` in the transport.
## [0.0.45] - 2024-10-16
### Changed
- Metrics messages have moved out from the transport's base output into RTVI.
## [0.0.44] - 2024-10-15
### Added
- Added support for OpenAI Realtime API with the new
`OpenAILLMServiceRealtimeBeta` processor.
(see https://platform.openai.com/docs/guides/realtime/overview)
- Added `RTVIBotTranscriptionProcessor` which will send the RTVI
`bot-transcription` protocol message. These are TTS text aggregated (into
sentences) messages.
- Added new input params to the `MarkdownTextFilter` utility. You can set
`filter_code` to filter code from text and `filter_tables` to filter tables
from text.
- Added `CanonicalMetricsService`. This processor uses the new
`AudioBufferProcessor` to capture conversation audio and later send it to
Canonical AI.
(see https://canonical.chat/)
- Added `AudioBufferProcessor`. This processor can be used to buffer mixed user and
bot audio. This can later be saved into an audio file or processed by some
audio analyzer.
- Added `on_first_participant_joined` event to `LiveKitTransport`.
### Changed
- LLM text responses are now logged properly as unicode characters.
- `UserStartedSpeakingFrame`, `UserStoppedSpeakingFrame`,
`BotStartedSpeakingFrame`, `BotStoppedSpeakingFrame`, `BotSpeakingFrame` and
`UserImageRequestFrame` are now based from `SystemFrame`
### Fixed
- Merge `RTVIBotLLMProcessor`/`RTVIBotLLMTextProcessor` and
`RTVIBotTTSProcessor`/`RTVIBotTTSTextProcessor` to avoid out of order issues.
- Fixed an issue in RTVI protocol that could cause a `bot-llm-stopped` or
`bot-tts-stopped` message to be sent before a `bot-llm-text` or `bot-tts-text`
message.
- Fixed `DeepgramSTTService` constructor settings not being merged with default
ones.
- Fixed an issue in Daily transport that would cause tasks to be hanging if
urgent transport messages were being sent from a transport event handler.
- Fixed an issue in `BaseOutputTransport` that would cause `EndFrame` to be
pushed downed too early and call `FrameProcessor.cleanup()` before letting the
transport stop properly.
## [0.0.43] - 2024-10-10
### Added
- Added a new util called `MarkdownTextFilter` which is a subclass of a new
base class called `BaseTextFilter`. This is a configurable utility which
is intended to filter text received by TTS services.
- Added new `RTVIUserLLMTextProcessor`. This processor will send an RTVI
`user-llm-text` message with the user content's that was sent to the LLM.
### Changed
- `TransportMessageFrame` doesn't have an `urgent` field anymore, instead
there's now a `TransportMessageUrgentFrame` which is a `SystemFrame` and
therefore skip all internal queuing.
- For TTS services, convert inputted languages to match each service's language
format
### Fixed
- Fixed an issue where changing a language with the Deepgram STT service
wouldn't apply the change. This was fixed by disconnecting and reconnecting
when the language changes.
## [0.0.42] - 2024-10-02
### Added
- `SentryMetrics` has been added to report frame processor metrics to
Sentry. This is now possible because `FrameProcessorMetrics` can now be passed
to `FrameProcessor`.
- Added Google TTS service and corresponding foundational example
`07n-interruptible-google.py`
- Added AWS Polly TTS support and `07m-interruptible-aws.py` as an example.
- Added InputParams to Azure TTS service.
- Added `LivekitTransport` (audio-only for now).
- RTVI 0.2.0 is now supported.
- All `FrameProcessors` can now register event handlers.
```
tts = SomeTTSService(...)
@tts.event_handler("on_connected"):
async def on_connected(processor):
...
```
- Added `AsyncGeneratorProcessor`. This processor can be used together with a
`FrameSerializer` as an async generator. It provides a `generator()` function
that returns an `AsyncGenerator` and that yields serialized frames.
- Added `EndTaskFrame` and `CancelTaskFrame`. These are new frames that are
meant to be pushed upstream to tell the pipeline task to stop nicely or
immediately respectively.
- Added configurable LLM parameters (e.g., temperature, top_p, max_tokens, seed)
for OpenAI, Anthropic, and Together AI services along with corresponding
setter functions.
- Added `sample_rate` as a constructor parameter for TTS services.
- Pipecat has a pipeline-based architecture. The pipeline consists of frame
processors linked to each other. The elements traveling across the pipeline
are called frames.
To have a deterministic behavior the frames traveling through the pipeline
should always be ordered, except system frames which are out-of-band
frames. To achieve that, each frame processor should only output frames from a
single task.
In this version all the frame processors have their own task to push
frames. That is, when `push_frame()` is called the given frame will be put
into an internal queue (with the exception of system frames) and a frame
processor task will push it out.
- Added pipeline clocks. A pipeline clock is used by the output transport to
know when a frame needs to be presented. For that, all frames now have an
optional `pts` field (prensentation timestamp). There's currently just one
clock implementation `SystemClock` and the `pts` field is currently only used
for `TextFrame`s (audio and image frames will be next).
- A clock can now be specified to `PipelineTask` (defaults to
`SystemClock`). This clock will be passed to each frame processor via the
`StartFrame`.
- Added `CartesiaHttpTTSService`.
- `DailyTransport` now supports setting the audio bitrate to improve audio
quality through the `DailyParams.audio_out_bitrate` parameter. The new
default is 96kbps.
- `DailyTransport` now uses the number of audio output channels (1 or 2) to set
mono or stereo audio when needed.
- Interruptions support has been added to `TwilioFrameSerializer` when using
`FastAPIWebsocketTransport`.
- Added new `LmntTTSService` text-to-speech service.
(see https://www.lmnt.com/)
- Added `TTSModelUpdateFrame`, `TTSLanguageUpdateFrame`, `STTModelUpdateFrame`,
and `STTLanguageUpdateFrame` frames to allow you to switch models, language
and voices in TTS and STT services.
- Added new `transcriptions.Language` enum.
### Changed
- Context frames are now pushed downstream from assistant context aggregators.
- Removed Silero VAD torch dependency.
- Updated individual update settings frame classes into a single
`ServiceUpdateSettingsFrame` class.
- We now distinguish between input and output audio and image frames. We
introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame`
and `OutputImageRawFrame` (and other subclasses of those). The input frames
usually come from an input transport and are meant to be processed inside the
pipeline to generate new frames. However, the input frames will not be sent
through an output transport. The output frames can also be processed by any
frame processor in the pipeline and they are allowed to be sent by the output
transport.
- `ParallelTask` has been renamed to `SyncParallelPipeline`. A
`SyncParallelPipeline` is a frame processor that contains a list of different
pipelines to be executed concurrently. The difference between a
`SyncParallelPipeline` and a `ParallelPipeline` is that, given an input frame,
the `SyncParallelPipeline` will wait for all the internal pipelines to
complete. This is achieved by making sure the last processor in each of the
pipelines is synchronous (e.g. an HTTP-based service that waits for the
response).
- `StartFrame` is back a system frame to make sure it's processed immediately by
all processors. `EndFrame` stays a control frame since it needs to be ordered
allowing the frames in the pipeline to be processed.
- Updated `MoondreamService` revision to `2024-08-26`.
- `CartesiaTTSService` and `ElevenLabsTTSService` now add presentation
timestamps to their text output. This allows the output transport to push the
text frames downstream at almost the same time the words are spoken. We say
"almost" because currently the audio frames don't have presentation timestamp
but they should be played at roughly the same time.
- `DailyTransport.on_joined` event now returns the full session data instead of
just the participant.
- `CartesiaTTSService` is now a subclass of `TTSService`.
- `DeepgramSTTService` is now a subclass of `STTService`.
- `WhisperSTTService` is now a subclass of `SegmentedSTTService`. A
`SegmentedSTTService` is a `STTService` where the provided audio is given in a
big chunk (i.e. from when the user starts speaking until the user stops
speaking) instead of a continous stream.
### Fixed
- Fixed OpenAI multiple function calls.
- Fixed a Cartesia TTS issue that would cause audio to be truncated in some
cases.
- Fixed a `BaseOutputTransport` issue that would stop audio and video rendering
tasks (after receiving and `EndFrame`) before the internal queue was emptied,
causing the pipeline to finish prematurely.
- `StartFrame` should be the first frame every processor receives to avoid
situations where things are not initialized (because initialization happens on
`StartFrame`) and other frames come in resulting in undesired behavior.
### Performance
- `obj_id()` and `obj_count()` now use `itertools.count` avoiding the need of
`threading.Lock`.
### Other
- Pipecat now uses Ruff as its formatter (https://github.com/astral-sh/ruff).
## [0.0.41] - 2024-08-22
### Added
- Added `LivekitFrameSerializer` audio frame serializer.
### Fixed
- Fix `FastAPIWebsocketOutputTransport` variable name clash with subclass.
- Fix an `AnthropicLLMService` issue with empty arguments in function calling.
### Other
- Fixed `studypal` example errors.
## [0.0.40] - 2024-08-20
### Added
- VAD parameters can now be dynamicallt updated using the
`VADParamsUpdateFrame`.
- `ErrorFrame` has now a `fatal` field to indicate the bot should exit if a
fatal error is pushed upstream (false by default). A new `FatalErrorFrame`
that sets this flag to true has been added.
- `AnthropicLLMService` now supports function calling and initial support for
prompt caching.
(see https://www.anthropic.com/news/prompt-caching)
- `ElevenLabsTTSService` can now specify ElevenLabs input parameters such as
`output_format`.
- `TwilioFrameSerializer` can now specify Twilio's and Pipecat's desired sample
rates to use.
- Added new `on_participant_updated` event to `DailyTransport`.
- Added `DailyRESTHelper.delete_room_by_name()` and
`DailyRESTHelper.delete_room_by_url()`.
- Added LLM and TTS usage metrics. Those are enabled when
`PipelineParams.enable_usage_metrics` is True.
- `AudioRawFrame`s are now pushed downstream from the base output
transport. This allows capturing the exact words the bot says by adding an STT
service at the end of the pipeline.
- Added new `GStreamerPipelineSource`. This processor can generate image or
audio frames from a GStreamer pipeline (e.g. reading an MP4 file, and RTP
stream or anything supported by GStreamer).
- Added `TransportParams.audio_out_is_live`. This flag is False by default and
it is useful to indicate we should not synchronize audio with sporadic images.
- Added new `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame` control
frames. These frames are pushed upstream and they should wrap
`BotSpeakingFrame`.
@@ -17,6 +506,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Support RTVI message protocol 0.1. This includes new messages, support for
messages responses, support for actions, configuration, webhooks and a bunch
of new cool stuff.
(see https://docs.rtvi.ai/)
- `SileroVAD` dependency is now imported via pip's `silero-vad` package.
- `ElevenLabsTTSService` now uses `eleven_turbo_v2_5` model by default.
- `BotSpeakingFrame` is now a control frame.
- `StartFrame` is now a control frame similar to `EndFrame`.
@@ -26,6 +524,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- `TTSStartFrame` and `TTSStopFrame` are now sent when TTS really starts and
stops. This allows for knowing when the bot starts and stops speaking even
with asynchronous services (like Cartesia).
- Fixed `AzureSTTService` transcription frame timestamps.
- Fixed an issue with `DailyRESTHelper.create_room()` expirations which would
cause this function to stop working after the initial expiration elapsed.
- Improved `EndFrame` and `CancelFrame` handling. `EndFrame` should end things
gracefully while a `CancelFrame` should cancel all running tasks as soon as
possible.
- Fixed an issue in `AIService` that would cause a yielded `None` value to be
processed.
- RTVI's `bot-ready` message is now sent when the RTVI pipeline is ready and
a first participant joins.
@@ -35,6 +549,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed a `BaseInputTransport` issue that was causing start/stop interruptions
incoming frames to not cancel tasks and be processed properly.
### Other
- Added `studypal` example (from to the Cartesia folks!).
- Most examples now use Cartesia.
- Added examples `foundational/19a-tools-anthropic.py`,
`foundational/19b-tools-video-anthropic.py` and
`foundational/19a-tools-togetherai.py`.
- Added examples `foundational/18-gstreamer-filesrc.py` and
`foundational/18a-gstreamer-videotestsrc.py` that show how to use
`GStreamerPipelineSource`
- Remove `requests` library usage.
- Cleanup examples and use `DailyRESTHelper`.
## [0.0.39] - 2024-07-23
### Fixed
@@ -134,7 +666,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- It is now possible to specify a Silero VAD version when using `SileroVADAnalyzer`
or `SileroVAD`.
- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like
- Added `AysncFrameProcessor` and `AsyncAIService`. Some services like
`DeepgramSTTService` need to process things asynchronously. For example, audio
is sent to Deepgram but transcriptions are not returned immediately. In these
cases we still require all frames (except system frames) to be pushed
@@ -151,7 +683,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `WhisperSTTService` model can now also be a string.
- Added missing * keyword separators in services.
- Added missing \* keyword separators in services.
### Fixed
@@ -228,7 +760,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added new `TwilioFrameSerializer`. This is a new serializer that knows how to
serialize and deserialize audio frames from Twilio.
- Added Daily transport event: `on_dialout_answered`. See
- Added Daily transport event: `on_dialout_answered`. See
https://reference-python.daily.co/api_reference.html#daily.EventHandler
- Added new `AzureSTTService`. This allows you to use Azure Speech-To-Text.
@@ -468,7 +1000,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Daily transport support for dial-in use cases.
- Added Daily transport events: `on_dialout_connected`, `on_dialout_stopped`,
`on_dialout_error` and `on_dialout_warning`. See
`on_dialout_error` and `on_dialout_warning`. See
https://reference-python.daily.co/api_reference.html#daily.EventHandler
## [0.0.21] - 2024-05-22

165
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,165 @@
## Contributing to Pipecat
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
2. **Clone the repository**: Clone your forked repository to your local machine.
```bash
git clone https://github.com/your-username/pipecat
```
3. **Create a branch**: For your contribution, create a new branch.
```bash
git checkout -b your-branch-name
```
4. **Make your changes**: Edit or add files as necessary.
5. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
6. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
```bash
git commit -m "Description of your changes"
```
7. **Push your changes**: Push your branch to your forked repository.
```bash
git push origin your-branch-name
```
9. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
> Important: Describe the changes you've made clearly!
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at pipecat-ai@daily.co.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

190
README.md
View File

@@ -1,15 +1,20 @@
<div align="center">
<h1><div align="center">
 <img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
</div>
</div></h1>
# Pipecat
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat) <a href="https://app.commanddash.io/agent/github_pipecat-ai_pipecat"><img src="https://img.shields.io/badge/AI-Code%20Agent-EB9FDA"></a>
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) [![Discord](https://img.shields.io/discord/1239284677165056021
)](https://discord.gg/pipecat)
Pipecat is an open source Python framework for building voice and multimodal conversational agents. It handles the complex orchestration of AI services, network transport, audio processing, and multimodal interactions, letting you focus on creating engaging experiences.
`pipecat` is a framework for building voice (and multimodal) conversational agents. Things like personal coaches, meeting assistants, [story-telling toys for kids](https://storytelling-chatbot.fly.dev/), customer support bots, [intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0), and snarky social companions.
## What you can build
Take a look at some example apps:
- **Voice Assistants**: [Natural, real-time conversations with AI](https://demo.dailybots.ai/)
- **Interactive Agents**: Personal coaches and meeting assistants
- **Multimodal Apps**: Combine voice, video, images, and text
- **Creative Tools**: [Story-telling experiences](https://storytelling-chatbot.fly.dev/) and social companions
- **Business Solutions**: [Customer intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0) and support bots
## See it in action
<p float="left">
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>&nbsp;
@@ -19,86 +24,105 @@ Take a look at some example apps:
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
</p>
## Getting started with voice agents
## Key features
- **Voice-first Design**: Built-in speech recognition, TTS, and conversation handling
- **Flexible Integration**: Works with popular AI services (OpenAI, ElevenLabs, etc.)
- **Pipeline Architecture**: Build complex apps from simple, reusable components
- **Real-time Processing**: Frame-based pipeline architecture for fluid interactions
- **Production Ready**: Enterprise-grade WebRTC and Websocket support
## Getting started
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when youre ready. You can also add a 📞 telephone number, 🖼️ image output, 📺 video input, use different LLMs, and more.
```shell
# install the module
# Install the module
pip install pipecat-ai
# set up an .env file with API keys
# Set up your environment
cp dot-env.template .env
```
By default, in order to minimize dependencies, only the basic framework functionality is available. Some third-party AI services require additional dependencies that you can install with:
To keep things lightweight, only the core framework is included by default. If you need support for third-party AI services, you can add the necessary dependencies with:
```shell
pip install "pipecat-ai[option,...]"
```
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
Available options include:
- **AI services**: `anthropic`, `azure`, `deepgram`, `gladia`, `google`, `fal`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`, `xtts`
- **Transports**: `local`, `websocket`, `daily`
| Category | Services | Install Command Example |
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/api-reference/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/api-reference/services/stt/azure), [Deepgram](https://docs.pipecat.ai/api-reference/services/stt/deepgram), [Gladia](https://docs.pipecat.ai/api-reference/services/stt/gladia), [Whisper](https://docs.pipecat.ai/api-reference/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` |
| LLMs | [Anthropic](https://docs.pipecat.ai/api-reference/services/llm/anthropic), [Azure](https://docs.pipecat.ai/api-reference/services/llm/azure), [Fireworks AI](https://docs.pipecat.ai/api-reference/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/api-reference/services/llm/gemini), [Ollama](https://docs.pipecat.ai/api-reference/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/api-reference/services/llm/openai), [Together AI](https://docs.pipecat.ai/api-reference/services/llm/together) | `pip install "pipecat-ai[openai]"` |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/api-reference/services/tts/aws), [Azure](https://docs.pipecat.ai/api-reference/services/tts/azure), [Cartesia](https://docs.pipecat.ai/api-reference/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/api-reference/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/api-reference/services/tts/elevenlabs), [Google](https://docs.pipecat.ai/api-reference/services/tts/google), [LMNT](https://docs.pipecat.ai/api-reference/services/tts/lmnt), [OpenAI](https://docs.pipecat.ai/api-reference/services/tts/openai), [PlayHT](https://docs.pipecat.ai/api-reference/services/tts/playht), [Rime](https://docs.pipecat.ai/api-reference/services/tts/rime), [XTTS](https://docs.pipecat.ai/api-reference/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` |
| Speech-to-Speech | [OpenAI Realtime](https://docs.pipecat.ai/api-reference/services/s2s/openai) | `pip install "pipecat-ai[openai]"` |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/api-reference/services/transport/daily), WebSocket, Local | `pip install "pipecat-ai[daily]"` |
| Video | [Tavus](https://docs.pipecat.ai/api-reference/services/video/tavus) | `pip install "pipecat-ai[tavus]"` |
| Vision & Image | [Moondream](https://docs.pipecat.ai/api-reference/services/vision/moondream), [fal](https://docs.pipecat.ai/api-reference/services/image-generation/fal) | `pip install "pipecat-ai[moondream]"` |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/api-reference/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/api-reference/utilities/audio/krisp-filter), [Noisereduce](https://docs.pipecat.ai/api-reference/utilities/audio/noisereduce-filter) | `pip install "pipecat-ai[silero]"` |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/api-reference/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/api-reference/services/analytics/sentry) | `pip install "pipecat-ai[canonical]"` |
📚 [View full services documentation →](https://docs.pipecat.ai/api-reference/services/supported-services)
## Code examples
- [foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [Example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
## A simple voice agent running locally
Here is a very basic Pipecat bot that greets a user when they join a real-time session. We'll use [Daily](https://daily.co) for real-time media transport, and [ElevenLabs](https://elevenlabs.io/) for text-to-speech.
Here is a very basic Pipecat bot that greets a user when they join a real-time session. We'll use [Daily](https://daily.co) for real-time media transport, and [Cartesia](https://cartesia.ai/) for text-to-speech.
```python
#app.py
import asyncio
import aiohttp
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
async def main():
async with aiohttp.ClientSession() as session:
# Use Daily as a real-time media transport (WebRTC)
transport = DailyTransport(
room_url=...,
token=...,
bot_name="Bot Name",
params=DailyParams(audio_out_enabled=True))
# Use Daily as a real-time media transport (WebRTC)
transport = DailyTransport(
room_url=...,
token="", # leave empty. Note: token is _not_ your api key
bot_name="Bot Name",
params=DailyParams(audio_out_enabled=True))
# Use Eleven Labs for Text-to-Speech
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=...,
voice_id=...,
)
# Use Cartesia for Text-to-Speech
tts = CartesiaTTSService(
api_key=...,
voice_id=...
)
# Simple pipeline that will process text to speech and output the result
pipeline = Pipeline([tts, transport.output()])
# Simple pipeline that will process text to speech and output the result
pipeline = Pipeline([tts, transport.output()])
# Create Pipecat processor that can run one or more pipelines tasks
runner = PipelineRunner()
# Create Pipecat processor that can run one or more pipelines tasks
runner = PipelineRunner()
# Assign the task callable to run the pipeline
task = PipelineTask(pipeline)
# Assign the task callable to run the pipeline
task = PipelineTask(pipeline)
# Register an event handler to play audio when a
# participant joins the transport WebRTC session
@transport.event_handler("on_participant_joined")
async def on_new_participant_joined(transport, participant):
participant_name = participant["info"]["userName"] or ''
# Queue a TextFrame that will get spoken by the TTS service (Eleven Labs)
await task.queue_frames([TextFrame(f"Hello there, {participant_name}!"), EndFrame()])
# Register an event handler to play audio when a
# participant joins the transport WebRTC session
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
# Queue a TextFrame that will get spoken by the TTS service (Cartesia)
await task.queue_frame(TextFrame(f"Hello there, {participant_name}!"))
# Run the pipeline task
await runner.run(task)
# Register an event handler to exit the application when the user leaves.
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
# Run the pipeline task
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())
@@ -110,8 +134,7 @@ Run it with:
python app.py
```
Daily provides a prebuilt WebRTC user interface. Whilst the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
Daily provides a prebuilt WebRTC user interface. While the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
## WebRTC for production use
@@ -121,19 +144,6 @@ One way to get up and running quickly with WebRTC is to sign up for a Daily deve
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard.
## What is VAD?
Voice Activity Detection &mdash; very important for knowing when a user has finished speaking to your bot. If you are not using press-to-talk, and want Pipecat to detect when the user has finished talking, VAD is an essential component for a natural feeling conversation.
Pipecat makes use of WebRTC VAD by default when using a WebRTC transport layer. Optionally, you can use Silero VAD for improved accuracy at the cost of higher CPU usage.
```shell
pip install pipecat-ai[silero]
```
The first time your run your bot with Silero, startup may take a while whilst it downloads and caches the model in the background. You can check the progress of this in the console.
## Hacking on the framework itself
_Note that you may need to set up a virtual environment before following the instructions below. For instance, you might need to run the following from the root of the repo:_
@@ -146,20 +156,20 @@ source venv/bin/activate
From the root of this repo, run the following:
```shell
pip install -r dev-requirements.txt -r {env}-requirements.txt
pip install -r dev-requirements.txt
python -m build
```
This builds the package. To use the package locally (eg to run sample files), run
This builds the package. To use the package locally (e.g. to run sample files), run
```shell
pip install --editable .
pip install --editable ".[option,...]"
```
If you want to use this package from another directory, you can run:
```shell
pip install path_to_this_repo
pip install "path_to_this_repo[option,...]"
```
### Running tests
@@ -167,27 +177,29 @@ pip install path_to_this_repo
From the root directory, run:
```shell
pytest --doctest-modules --ignore-glob="*to_be_updated*" src tests
pytest --doctest-modules --ignore-glob="*to_be_updated*" --ignore-glob=*pipeline_source* src tests
```
## Setting up your editor
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting.
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
### Emacs
You can use [use-package](https://github.com/jwiegley/use-package) to install [py-autopep8](https://codeberg.org/ideasman42/emacs-py-autopep8) package and configure `autopep8` arguments:
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
```elisp
(use-package py-autopep8
(use-package lazy-ruff
:ensure t
:defer t
:hook ((python-mode . py-autopep8-mode))
:hook ((python-mode . lazy-ruff-mode))
:config
(setq py-autopep8-options '("-a" "-a", "--max-line-length=100")))
(setq lazy-ruff-format-command "ruff format")
(setq lazy-ruff-only-format-block t)
(setq lazy-ruff-only-format-region t)
(setq lazy-ruff-only-format-buffer t))
```
`autopep8` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
```elisp
(use-package pyvenv-auto
@@ -200,22 +212,32 @@ You can use [use-package](https://github.com/jwiegley/use-package) to install [p
### Visual Studio Code
Install the
[autopep8](https://marketplace.visualstudio.com/items?itemName=ms-python.autopep8) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, enable formatting on save and configure `autopep8` arguments:
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
```json
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8",
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true
},
"autopep8.args": [
"-a",
"-a",
"--max-line-length=100"
],
}
```
## Contributing
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
- **Found a bug?** Open an [issue](https://github.com/pipecat-ai/pipecat/issues)
- **Have a feature idea?** Start a [discussion](https://discord.gg/pipecat)
- **Want to contribute code?** Check our [CONTRIBUTING.md](CONTRIBUTING.md) guide
- **Documentation improvements?** [Docs](https://github.com/pipecat-ai/docs) PRs are always welcome
Before submitting a pull request, please check existing issues and PRs to avoid duplicates.
We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
## Getting help
➡️ [Join our Discord](https://discord.gg/pipecat)
➡️ [Read the docs](https://docs.pipecat.ai)
➡️ [Reach us on X](https://x.com/pipecat_ai)

View File

@@ -1,8 +1,8 @@
autopep8~=2.1.0
build~=1.2.1
grpcio-tools~=1.62.2
pip-tools~=7.4.1
pyright~=1.1.367
pytest~=8.2.0
setuptools~=71.1.0
pyright~=1.1.376
pytest~=8.3.2
ruff~=0.6.7
setuptools~=72.2.0
setuptools_scm~=8.1.0

22
docs/ISSUE_TEMPLATE.md Normal file
View File

@@ -0,0 +1,22 @@
# Description
Is this reporting a bug or feature request?
If reporting a bug, please fill out the following:
### Environment
- pipecat-ai version:
- python version:
- OS:
### Issue description
Provide a clear description of the issue.
### Repro steps
List the steps to reproduce the issue.
### Expected behavior
### Actual behavior
### Logs

View File

@@ -0,0 +1 @@
#### Please describe the changes in your PR. If it is addressing an issue, please reference that as well.

113
docs/frame.md Normal file
View File

@@ -0,0 +1,113 @@
# Understanding Different Frame Types in the Pipecat System
In the Pipecat system, frames are used to represent different types of data and control signals that flow through the pipeline. Understanding these frame types is crucial for working with the system effectively. This tutorial will cover the main categories of frames and their specific uses.
## 1. Base Frame Classes
### Frame
The `Frame` class is the base class for all frames. It includes:
- `id`: A unique identifier
- `name`: A descriptive name
- `pts`: Presentation timestamp (optional)
### DataFrame
`DataFrame` is a subclass of `Frame` and serves as a base for most data-carrying frames.
## 2. Audio Frames
### AudioRawFrame
Represents a chunk of audio with properties:
- `audio`: Raw audio data
- `sample_rate`: Audio sample rate
- `num_channels`: Number of audio channels
Subclasses include:
- `InputAudioRawFrame`: For audio from input sources
- `OutputAudioRawFrame`: For audio to be played by output devices
- `TTSAudioRawFrame`: For audio generated by Text-to-Speech services
## 3. Image Frames
### ImageRawFrame
Represents an image with properties:
- `image`: Raw image data
- `size`: Image dimensions
- `format`: Image format (e.g., JPEG, PNG)
Subclasses include:
- `InputImageRawFrame`: For images from input sources
- `OutputImageRawFrame`: For images to be displayed
- `UserImageRawFrame`: For images associated with a specific user
- `VisionImageRawFrame`: For images with associated text for description
- `URLImageRawFrame`: For images with an associated URL
### SpriteFrame
Represents an animated sprite, containing a list of `ImageRawFrame` objects.
## 4. Text and Transcription Frames
### TextFrame
Represents a chunk of text, used for various purposes in the pipeline.
### TranscriptionFrame
A specialized `TextFrame` for speech transcriptions, including:
- `user_id`: ID of the speaking user
- `timestamp`: When the transcription was generated
- `language`: Detected language of the speech
### InterimTranscriptionFrame
Similar to `TranscriptionFrame`, but for interim (not final) transcriptions.
## 5. LLM (Language Model) Frames
### LLMMessagesFrame
Contains a list of messages for an LLM service to process.
### LLMMessagesAppendFrame and LLMMessagesUpdateFrame
Used to modify the current context of LLM messages.
### LLMSetToolsFrame
Specifies tools (functions) available for the LLM to use.
### LLMEnablePromptCachingFrame
Controls prompt caching in certain LLMs.
## 6. System and Control Frames
### SystemFrame
Base class for system-level frames.
Important system frames include:
- `StartFrame`: Initiates a pipeline
- `CancelFrame`: Stops a pipeline immediately
- `ErrorFrame`: Notifies of errors (with `FatalErrorFrame` for unrecoverable errors)
- `EndTaskFrame` and `CancelTaskFrame`: Control pipeline tasks
- `StartInterruptionFrame` and `StopInterruptionFrame`: Indicate user speech for interruptions
### ControlFrame
Base class for control-flow frames.
Notable control frames:
- `EndFrame`: Signals the end of a pipeline
- `LLMFullResponseStartFrame` and `LLMFullResponseEndFrame`: Bracket LLM responses
- `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame`: Indicate user speech activity
- `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`: Indicate bot speech activity
- `TTSStartedFrame` and `TTSStoppedFrame`: Bracket Text-to-Speech responses
## 7. Special Purpose Frames
### AppFrame
Base class for application-specific custom frames.
### MetricsFrame
Contains performance metrics data.
### FunctionCallInProgressFrame and FunctionCallResultFrame
Used for handling LLM function (tool) calls.
### ServiceUpdateSettingsFrame
Base class for updating service settings, with specific subclasses for LLM, TTS, and STT services.
## Conclusion
Understanding these frame types is essential for working with the Pipecat system. Each frame type serves a specific purpose in the pipeline, whether it's carrying data (like audio or images), controlling the flow of the pipeline, or managing system-level operations. By using the appropriate frame types, you can effectively process and transmit various kinds of information through your pipeline.

View File

@@ -1,6 +1,11 @@
# Anthropic
ANTHROPIC_API_KEY=...
# AWS
AWS_SECRET_ACCESS_KEY=...
AWS_ACCESS_KEY_ID=...
AWS_REGION=...
# Azure
AZURE_SPEECH_REGION=...
AZURE_SPEECH_API_KEY=...
@@ -30,6 +35,10 @@ FIREWORKS_API_KEY=...
# Gladia
GLADIA_API_KEY=...
# LMNT
LMNT_API_KEY=...
LMNT_VOICE_ID=...
# PlayHT
PLAY_HT_USER_ID=...
PLAY_HT_API_KEY=...
@@ -37,5 +46,13 @@ PLAY_HT_API_KEY=...
# OpenAI
OPENAI_API_KEY=...
#OpenPipe
# OpenPipe
OPENPIPE_API_KEY=...
# Tavus
TAVUS_API_KEY=...
TAVUS_REPLICA_ID=...
TAVUS_PERSONA_ID=...
#Krisp
KRISP_MODEL_PATH=...

View File

@@ -41,6 +41,7 @@ Next, follow the steps in the README for each demo.
| [Patient intake](patient-intake) | A chatbot that can call functions in response to user input. | Deepgram, ElevenLabs, OpenAI, Daily, Daily Prebuilt UI |
| [Dialin Chatbot](dialin-chatbot) | A chatbot that connects to an incoming phone call from Daily or Twilio. | Deepgram, ElevenLabs, OpenAI, Daily, Twilio |
| [Twilio Chatbot](twilio-chatbot) | A chatbot that connects to an incoming phone call from Twilio. | Deepgram, ElevenLabs, OpenAI, Daily, Twilio |
| [studypal](studypal) | A chatbot to have a conversation about any article on the web | |
> [!IMPORTANT]
> These example projects use Daily as a WebRTC transport and can be joined using their hosted Prebuilt UI.

161
examples/canonical-metrics/.gitignore vendored Normal file
View File

@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
recordings/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
runpod.toml

View File

@@ -0,0 +1,10 @@
FROM python:3.10-bullseye
RUN mkdir /app
COPY *.py /app/
COPY requirements.txt /app/
WORKDIR /app
RUN pip3 install -r requirements.txt
EXPOSE 7860
CMD ["python3", "server.py"]

View File

@@ -0,0 +1,66 @@
# Chatbot with canonical-metrics
This project implements a chatbot using a pipeline architecture that integrates audio processing, transcription, and a language model for conversational interactions. The chatbot operates within a daily communication environment, utilizing various services for text-to-speech and language model responses.
## Features
- **Audio Input and Output**: Captures microphone input and plays back audio responses.
- **Voice Activity Detection**: Utilizes Silero VAD to manage audio input intelligently.
- **Text-to-Speech**: Integrates ElevenLabs TTS service to convert text responses into audio.
- **Language Model Interaction**: Uses OpenAI's GPT-4 model to generate responses based on user input.
- **Transcription Services**: Captures and transcribes participant speech for analytics.
- **Metrics Collection**: Sends audio data for analysis via Canonical Metrics Service.
## Requirements
- Python 3.10+
- `python-dotenv`
- Additional libraries from the `pipecat` package.
## Setup
1. Clone the repository.
2. Install the required packages.
3. Set up environment variables for API keys:
- `OPENAI_API_KEY`
- `ELEVENLABS_API_KEY`
- `CANONICAL_API_KEY`
- `CANONICAL_API_URL`
4. Run the script.
## Usage
The chatbot introduces itself and engages in conversations, providing brief and creative responses. Designed for flexibility, it can support multiple languages with appropriate configuration.
## Events
- Participants joining or leaving the call are handled dynamically, adjusting the chatbot's behavior accordingly.
The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded.
## Get started
```python
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
cp env.example .env # and add your credentials
```
## Run the server
```bash
python server.py
```
Then, visit `http://localhost:7860/` in your browser to start a chatbot session.
## Build and test the Docker image
```
docker build -t chatbot .
docker run --env-file .env -p 7860:7860 chatbot
```

View File

@@ -0,0 +1,146 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import uuid
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.services.canonical import CanonicalMetricsService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
audio_out_enabled=True,
audio_in_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_audio_passthrough=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
#
# Spanish
#
# transcription_settings=DailyTranscriptionSettings(
# language="es",
# tier="nova",
# model="2-general"
# )
),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY"),
#
# English
#
voice_id="cgSgspJ2msm6clMCkdW9",
aiohttp_session=session,
#
# Spanish
#
# model="eleven_multilingual_v2",
# voice_id="gD1IexrzCvsXPHUuT0s3",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
#
# English
#
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself. Keep all your responses to 12 words or fewer.",
#
# Spanish
#
# "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
"""
CanonicalMetrics uses AudioBufferProcessor under the hood to buffer the audio. On
call completion, CanonicalMetrics will send the audio buffer to Canonical for
analysis. Visit https://voice.canonical.chat to learn more.
"""
audio_buffer_processor = AudioBufferProcessor()
canonical = CanonicalMetricsService(
audio_buffer_processor=audio_buffer_processor,
aiohttp_session=session,
api_key=os.getenv("CANONICAL_API_KEY"),
api_url=os.getenv("CANONICAL_API_URL"),
call_id=str(uuid.uuid4()),
assistant="pipecat-chatbot",
assistant_speaks_first=True,
)
pipeline = Pipeline(
[
transport.input(), # microphone
context_aggregator.user(),
llm,
tts,
transport.output(),
audio_buffer_processor, # captures audio into a buffer
canonical, # uploads audio buffer to Canonical AI for metrics
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
print(f"Participant left: {participant}")
await task.queue_frame(EndFrame())
@transport.event_handler("on_call_state_updated")
async def on_call_state_updated(transport, state):
if state == "left":
await task.queue_frame(EndFrame())
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,6 @@
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev)
DAILY_API_KEY=7df...
OPENAI_API_KEY=sk-PL...
ELEVENLABS_API_KEY=aeb...
CANONICAL_API_KEY=can...
CANONICAL_API_URL=

View File

@@ -0,0 +1,5 @@
python-dotenv
fastapi[all]
uvicorn
pipecat-ai[daily,openai,silero,elevenlabs,canonical]

View File

@@ -0,0 +1,56 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import aiohttp
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper
async def configure(aiohttp_session: aiohttp.ClientSession):
parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=False, help="URL of the Daily room to join"
)
parser.add_argument(
"-k",
"--apikey",
type=str,
required=False,
help="Daily API Key (needed to create an owner token for the room)",
)
args, unknown = parser.parse_known_args()
url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
key = args.apikey or os.getenv("DAILY_API_KEY")
if not url:
raise Exception(
"No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL."
)
if not key:
raise Exception(
"No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers."
)
daily_rest_helper = DailyRESTHelper(
daily_api_key=key,
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
# Create a meeting token for the given room with an expiration 1 hour in
# the future.
expiry_time: float = 60 * 60
token = await daily_rest_helper.get_token(url, expiry_time)
return (url, token)
return (url, token)

View File

@@ -0,0 +1,139 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import subprocess
from contextlib import asynccontextmanager
import aiohttp
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, RedirectResponse
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
MAX_BOTS_PER_ROOM = 1
# Bot sub-process dict for status reporting and concurrency control
bot_procs = {}
daily_helpers = {}
load_dotenv(override=True)
def cleanup():
# Clean up function, just to be extra safe
for entry in bot_procs.values():
proc = entry[0]
proc.terminate()
proc.wait()
@asynccontextmanager
async def lifespan(app: FastAPI):
aiohttp_session = aiohttp.ClientSession()
daily_helpers["rest"] = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY", ""),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
yield
await aiohttp_session.close()
cleanup()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
async def start_agent(request: Request):
print(f"!!! Creating room")
room = await daily_helpers["rest"].create_room(DailyRoomParams())
print(f"!!! Room URL: {room.url}")
# Ensure the room property is present
if not room.url:
raise HTTPException(
status_code=500,
detail="Missing 'room' property in request data. Cannot start agent without a target room!",
)
# Check if there is already an existing process running in this room
num_bots_in_room = sum(
1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None
)
if num_bots_in_room >= MAX_BOTS_PER_ROOM:
raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}")
# Get the token for the room
token = await daily_helpers["rest"].get_token(room.url)
if not token:
raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}")
# Spawn a new agent, and join the user session
# Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
try:
proc = subprocess.Popen(
[f"python3 -m bot -u {room.url} -t {token}"],
shell=True,
bufsize=1,
cwd=os.path.dirname(os.path.abspath(__file__)),
)
bot_procs[proc.pid] = (proc, room.url)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")
return RedirectResponse(room.url)
@app.get("/status/{pid}")
def get_status(pid: int):
# Look up the subprocess
proc = bot_procs.get(pid)
# If the subprocess doesn't exist, return an error
if not proc:
raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found")
# Check the status of the subprocess
if proc[0].poll() is None:
status = "running"
else:
status = "finished"
return JSONResponse({"bot_id": pid, "status": status})
if __name__ == "__main__":
import uvicorn
default_host = os.getenv("HOST", "0.0.0.0")
default_port = int(os.getenv("FAST_API_PORT", "7860"))
parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server")
parser.add_argument("--host", type=str, default=default_host, help="Host address")
parser.add_argument("--port", type=int, default=default_port, help="Port number")
parser.add_argument("--reload", action="store_true", help="Reload code on change")
config = parser.parse_args()
uvicorn.run(
"server:app",
host=config.host,
port=config.port,
reload=config.reload,
)

View File

@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
runpod.toml

View File

@@ -0,0 +1,15 @@
FROM python:3.10-bullseye
RUN mkdir /app
RUN mkdir /app/assets
RUN mkdir /app/utils
COPY *.py /app/
COPY requirements.txt /app/
WORKDIR /app
RUN pip3 install -r requirements.txt
EXPOSE 7860
CMD ["python3", "server.py"]

View File

@@ -0,0 +1,37 @@
# Simple Chatbot
<img src="image.png" width="420px">
This app connects you to a chatbot powered by GPT-4, complete with animations generated by Stable Video Diffusion.
See a video of it in action: https://x.com/kwindla/status/1778628911817183509
And a quick video walkthrough of the code: https://www.loom.com/share/13df1967161f4d24ade054e7f8753416
The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded.
## Get started
```python
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
cp env.example .env # and add your credentials
```
## Run the server
```bash
python server.py
```
Then, visit `http://localhost:7860/` in your browser to start a chatbot session.
## Build and test the Docker image
```
docker build -t chatbot .
docker run --env-file .env -p 7860:7860 chatbot
```

View File

@@ -0,0 +1,141 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
import datetime
import wave
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def save_audio(audiobuffer):
if audiobuffer.has_audio():
merged_audio = audiobuffer.merge_audio_buffers()
filename = f"conversation_recording{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
with wave.open(filename, "wb") as wf:
wf.setnchannels(2)
wf.setsampwidth(2)
wf.setframerate(audiobuffer._sample_rate)
wf.writeframes(merged_audio)
print(f"Merged audio saved to {filename}")
else:
print("No audio data to save")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
audio_out_enabled=True,
audio_in_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_audio_passthrough=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
#
# Spanish
#
# transcription_settings=DailyTranscriptionSettings(
# language="es",
# tier="nova",
# model="2-general"
# )
),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY"),
#
# English
#
voice_id="cgSgspJ2msm6clMCkdW9",
aiohttp_session=session,
#
# Spanish
#
# model="eleven_multilingual_v2",
# voice_id="gD1IexrzCvsXPHUuT0s3",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
#
# English
#
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself. Keep all your response to 12 words or fewer.",
#
# Spanish
#
# "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
audiobuffer = AudioBufferProcessor()
pipeline = Pipeline(
[
transport.input(), # microphone
context_aggregator.user(),
llm,
tts,
transport.output(),
audiobuffer, # used to buffer the audio in the pipeline
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
print(f"Participant left: {participant}")
await task.queue_frame(EndFrame())
await save_audio(audiobuffer)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,4 @@
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev)
DAILY_API_KEY=7df...
OPENAI_API_KEY=sk-PL...
ELEVENLABS_API_KEY=aeb...

View File

@@ -0,0 +1,4 @@
python-dotenv
fastapi[all]
uvicorn
pipecat-ai[daily,openai,silero,elevenlabs]

View File

@@ -0,0 +1,56 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import aiohttp
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper
async def configure(aiohttp_session: aiohttp.ClientSession):
parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
parser.add_argument(
"-u", "--url", type=str, required=False, help="URL of the Daily room to join"
)
parser.add_argument(
"-k",
"--apikey",
type=str,
required=False,
help="Daily API Key (needed to create an owner token for the room)",
)
args, unknown = parser.parse_known_args()
url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
key = args.apikey or os.getenv("DAILY_API_KEY")
if not url:
raise Exception(
"No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL."
)
if not key:
raise Exception(
"No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers."
)
daily_rest_helper = DailyRESTHelper(
daily_api_key=key,
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
# Create a meeting token for the given room with an expiration 1 hour in
# the future.
expiry_time: float = 60 * 60
token = await daily_rest_helper.get_token(url, expiry_time)
return (url, token)
return (url, token)

View File

@@ -0,0 +1,139 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import subprocess
from contextlib import asynccontextmanager
import aiohttp
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, RedirectResponse
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
MAX_BOTS_PER_ROOM = 1
# Bot sub-process dict for status reporting and concurrency control
bot_procs = {}
daily_helpers = {}
load_dotenv(override=True)
def cleanup():
# Clean up function, just to be extra safe
for entry in bot_procs.values():
proc = entry[0]
proc.terminate()
proc.wait()
@asynccontextmanager
async def lifespan(app: FastAPI):
aiohttp_session = aiohttp.ClientSession()
daily_helpers["rest"] = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY", ""),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
yield
await aiohttp_session.close()
cleanup()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
async def start_agent(request: Request):
print(f"!!! Creating room")
room = await daily_helpers["rest"].create_room(DailyRoomParams())
print(f"!!! Room URL: {room.url}")
# Ensure the room property is present
if not room.url:
raise HTTPException(
status_code=500,
detail="Missing 'room' property in request data. Cannot start agent without a target room!",
)
# Check if there is already an existing process running in this room
num_bots_in_room = sum(
1 for proc in bot_procs.values() if proc[1] == room.url and proc[0].poll() is None
)
if num_bots_in_room >= MAX_BOTS_PER_ROOM:
raise HTTPException(status_code=500, detail=f"Max bot limited reach for room: {room.url}")
# Get the token for the room
token = await daily_helpers["rest"].get_token(room.url)
if not token:
raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}")
# Spawn a new agent, and join the user session
# Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
try:
proc = subprocess.Popen(
[f"python3 -m bot -u {room.url} -t {token}"],
shell=True,
bufsize=1,
cwd=os.path.dirname(os.path.abspath(__file__)),
)
bot_procs[proc.pid] = (proc, room.url)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")
return RedirectResponse(room.url)
@app.get("/status/{pid}")
def get_status(pid: int):
# Look up the subprocess
proc = bot_procs.get(pid)
# If the subprocess doesn't exist, return an error
if not proc:
raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found")
# Check the status of the subprocess
if proc[0].poll() is None:
status = "running"
else:
status = "finished"
return JSONResponse({"bot_id": pid, "status": status})
if __name__ == "__main__":
import uvicorn
default_host = os.getenv("HOST", "0.0.0.0")
default_port = int(os.getenv("FAST_API_PORT", "7860"))
parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server")
parser.add_argument("--host", type=str, default=default_host, help="Host address")
parser.add_argument("--port", type=int, default=default_port, help="Port number")
parser.add_argument("--reload", action="store_true", help="Reload code on change")
config = parser.parse_args()
uvicorn.run(
"server:app",
host=config.host,
port=config.port,
reload=config.reload,
)

View File

@@ -9,8 +9,5 @@ COPY *.py .
COPY ./requirements.txt requirements.txt
RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
# Install models
RUN python3 install_deps.py
# Start the FastAPI server
CMD python3 bot_runner.py --port ${FAST_API_PORT}

View File

@@ -2,8 +2,6 @@
This project modifies the `bot_runner.py` server to launch a new machine for each user session. This is a recommended approach for production vs. running shell processess as your deployment will quickly run out of system resources under load.
To speed up machine boot times, we also download and cache Silero VAD as part of the Dockerfile (`install_deps.py`). If you are using other custom models, you can add them here too.
For this example, we are using Daily as a WebRTC transport and provisioning a new room and token for each session. You can use another transport, such as WebSockets, by modifying the `bot.py` and `bot_runner.py` files accordingly.
## Setting up your fly.io deployment
@@ -14,7 +12,7 @@ You can copy the `example-fly.toml` as a reference. Be sure to change the app na
### Create your .env file
Copy the base `env.example` to `.env` and enter the necessary API keys.
Copy the base `env.example` to `.env` and enter the necessary API keys.
`FLY_APP_NAME` should match that in the `fly.toml` file.
@@ -32,12 +30,10 @@ Note: you can do this manually via the fly.io dashboard under the "secrets" sub-
`fly deploy`
## Connecting to your bot
Send a post request to your running fly.io instance:
`curl --location --request POST 'https://YOUR_FLY_APP_NAME/start_bot'`
`curl --location --request POST 'https://YOUR_FLY_APP_NAME/'`
This request will wait until the machine enters into a `starting` state, before returning the a room URL and token to join.

View File

@@ -1,22 +1,22 @@
import asyncio
import aiohttp
import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -27,71 +27,69 @@ daily_api_url = os.getenv("DAILY_API_URL", "https://api.daily.co/v1")
async def main(room_url: str, token: str):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_url=daily_api_url,
api_key=daily_api_key,
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
)
)
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_url=daily_api_url,
api_key=daily_api_key,
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your output will be converted to audio so don't include special characters other than '!' or '?' in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying hello.",
},
]
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your output will be converted to audio so don't include special characters other than '!' or '?' in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying hello.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
pipeline = Pipeline(
[
transport.input(),
tma_in,
context_aggregator.user(),
llm,
tts,
transport.output(),
tma_out,
])
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
@transport.event_handler("on_call_state_updated")
async def on_call_state_updated(transport, state):
if state == "left":
await task.queue_frame(EndFrame())
@transport.event_handler("on_call_state_updated")
async def on_call_state_updated(transport, state):
if state == "left":
await task.queue_frame(EndFrame())
runner = PipelineRunner()
runner = PipelineRunner()
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":

View File

@@ -1,15 +1,29 @@
import os
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import argparse
import subprocess
import requests
import os
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomParams
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pipecat.transports.services.helpers.daily_rest import (
DailyRESTHelper,
DailyRoomObject,
DailyRoomProperties,
DailyRoomParams,
)
from dotenv import load_dotenv
load_dotenv(override=True)
@@ -17,93 +31,100 @@ load_dotenv(override=True)
MAX_SESSION_TIME = 5 * 60 # 5 minutes
REQUIRED_ENV_VARS = [
'DAILY_API_KEY',
'OPENAI_API_KEY',
'ELEVENLABS_API_KEY',
'ELEVENLABS_VOICE_ID',
'FLY_API_KEY',
'FLY_APP_NAME',]
"DAILY_API_KEY",
"OPENAI_API_KEY",
"ELEVENLABS_API_KEY",
"ELEVENLABS_VOICE_ID",
"FLY_API_KEY",
"FLY_APP_NAME",
]
FLY_API_HOST = os.getenv("FLY_API_HOST", "https://api.machines.dev/v1")
FLY_APP_NAME = os.getenv("FLY_APP_NAME", "pipecat-fly-example")
FLY_API_KEY = os.getenv("FLY_API_KEY", "")
FLY_HEADERS = {
'Authorization': f"Bearer {FLY_API_KEY}",
'Content-Type': 'application/json'
}
FLY_HEADERS = {"Authorization": f"Bearer {FLY_API_KEY}", "Content-Type": "application/json"}
daily_rest_helper = DailyRESTHelper(
os.getenv("DAILY_API_KEY", ""),
os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'))
daily_helpers = {}
# ----------------- API ----------------- #
app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
aiohttp_session = aiohttp.ClientSession()
daily_helpers["rest"] = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY", ""),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
yield
await aiohttp_session.close()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
allow_headers=["*"],
)
# ----------------- Main ----------------- #
def spawn_fly_machine(room_url: str, token: str):
# Use the same image as the bot runner
res = requests.get(f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS)
if res.status_code != 200:
raise Exception(f"Unable to get machine info from Fly: {res.text}")
image = res.json()[0]['config']['image']
async def spawn_fly_machine(room_url: str, token: str):
async with aiohttp.ClientSession() as session:
# Use the same image as the bot runner
async with session.get(
f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS
) as r:
if r.status != 200:
text = await r.text()
raise Exception(f"Unable to get machine info from Fly: {text}")
# Machine configuration
cmd = f"python3 bot.py -u {room_url} -t {token}"
cmd = cmd.split()
worker_props = {
"config": {
"image": image,
"auto_destroy": True,
"init": {
"cmd": cmd
data = await r.json()
image = data[0]["config"]["image"]
# Machine configuration
cmd = f"python3 bot.py -u {room_url} -t {token}"
cmd = cmd.split()
worker_props = {
"config": {
"image": image,
"auto_destroy": True,
"init": {"cmd": cmd},
"restart": {"policy": "no"},
"guest": {"cpu_kind": "shared", "cpus": 1, "memory_mb": 1024},
},
"restart": {
"policy": "no"
},
"guest": {
"cpu_kind": "shared",
"cpus": 1,
"memory_mb": 1024
}
},
}
}
# Spawn a new machine instance
async with session.post(
f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines", headers=FLY_HEADERS, json=worker_props
) as r:
if r.status != 200:
text = await r.text()
raise Exception(f"Problem starting a bot worker: {text}")
# Spawn a new machine instance
res = requests.post(
f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines",
headers=FLY_HEADERS,
json=worker_props)
data = await r.json()
# Wait for the machine to enter the started state
vm_id = data["id"]
if res.status_code != 200:
raise Exception(f"Problem starting a bot worker: {res.text}")
# Wait for the machine to enter the started state
vm_id = res.json()['id']
res = requests.get(
f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started",
headers=FLY_HEADERS)
if res.status_code != 200:
raise Exception(f"Bot was unable to enter started state: {res.text}")
async with session.get(
f"{FLY_API_HOST}/apps/{FLY_APP_NAME}/machines/{vm_id}/wait?state=started",
headers=FLY_HEADERS,
) as r:
if r.status != 200:
text = await r.text()
raise Exception(f"Bot was unable to enter started state: {text}")
print(f"Machine joined room: {room_url}")
@app.post("/start_bot")
@app.post("/")
async def start_bot(request: Request) -> JSONResponse:
try:
data = await request.json()
@@ -117,29 +138,23 @@ async def start_bot(request: Request) -> JSONResponse:
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", "")
if not room_url:
params = DailyRoomParams(
properties=DailyRoomProperties()
)
params = DailyRoomParams(properties=DailyRoomProperties())
try:
room: DailyRoomObject = daily_rest_helper.create_room(params=params)
room: DailyRoomObject = await daily_helpers["rest"].create_room(params=params)
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Unable to provision room {e}")
raise HTTPException(status_code=500, detail=f"Unable to provision room {e}")
else:
# Check passed room URL exists, we should assume that it already has a sip set up
try:
room: DailyRoomObject = daily_rest_helper.get_room_from_url(room_url)
room: DailyRoomObject = await daily_helpers["rest"].get_room_from_url(room_url)
except Exception:
raise HTTPException(
status_code=500, detail=f"Room not found: {room_url}")
raise HTTPException(status_code=500, detail=f"Room not found: {room_url}")
# Give the agent a token to join the session
token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME)
if not room or not token:
raise HTTPException(
status_code=500, detail=f"Failed to get token for room: {room_url}")
raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room_url}")
# Launch a new fly.io machine, or run as a shell process (not recommended)
run_as_process = os.getenv("RUN_AS_PROCESS", False)
@@ -150,24 +165,26 @@ async def start_bot(request: Request) -> JSONResponse:
[f"python3 -m bot -u {room.url} -t {token}"],
shell=True,
bufsize=1,
cwd=os.path.dirname(os.path.abspath(__file__)))
cwd=os.path.dirname(os.path.abspath(__file__)),
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to start subprocess: {e}")
raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")
else:
try:
spawn_fly_machine(room.url, token)
await spawn_fly_machine(room.url, token)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to spawn VM: {e}")
raise HTTPException(status_code=500, detail=f"Failed to spawn VM: {e}")
# Grab a token for the user to join with
user_token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
user_token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME)
return JSONResponse(
{
"room_url": room.url,
"token": user_token,
}
)
return JSONResponse({
"room_url": room.url,
"token": user_token,
})
if __name__ == "__main__":
# Check environment variables
@@ -176,24 +193,19 @@ if __name__ == "__main__":
raise Exception(f"Missing environment variable: {env_var}.")
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
parser.add_argument("--host", type=str,
default=os.getenv("HOST", "0.0.0.0"), help="Host address")
parser.add_argument("--port", type=int,
default=os.getenv("PORT", 7860), help="Port number")
parser.add_argument("--reload", action="store_true",
default=False, help="Reload code on change")
parser.add_argument(
"--host", type=str, default=os.getenv("HOST", "0.0.0.0"), help="Host address"
)
parser.add_argument("--port", type=int, default=os.getenv("PORT", 7860), help="Port number")
parser.add_argument(
"--reload", action="store_true", default=False, help="Reload code on change"
)
config = parser.parse_args()
try:
import uvicorn
uvicorn.run(
"bot_runner:app",
host=config.host,
port=config.port,
reload=config.reload
)
uvicorn.run("bot_runner:app", host=config.host, port=config.port, reload=config.reload)
except KeyboardInterrupt:
print("Pipecat runner shutting down...")

View File

@@ -1,4 +0,0 @@
import torch
# Download (cache) the Silero VAD model
torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=True)

View File

@@ -1,6 +1,5 @@
pipecat-ai[daily,openai,silero]
fastapi
uvicorn
requests
python-dotenv
loguru
loguru

View File

@@ -0,0 +1,91 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
dist/
*.egg-info/
*.egg
.installed.cfg
.eggs/
downloads/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
MANIFEST
# Virtual Environments
venv/
env/
.env
.venv/
ENV/
env.bak/
venv.bak/
# IDE
.idea/
.vscode/
.spyderproject
.spyproject
.ropeproject
# Testing and Coverage
.coverage
.coverage.*
htmlcov/
.pytest_cache/
.tox/
.nox/
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
cover/
# Logs and Databases
*.log
*.db
db.sqlite3
db.sqlite3-journal
pip-log.txt
# System Files
.DS_Store
Thumbs.db
desktop.ini
*.swp
*.swo
*.bak
*.tmp
*~
# Build and Documentation
docs/_build/
.pybuilder/
target/
instance/
.webassets-cache
.pdm.toml
.pdm-python
.pdm-build/
__pypackages__/
# Other
*.mo
*.pot
*.sage.py
.mypy_cache/
.dmypy.json
dmypy.json
.pyre/
.pytype/
cython_debug/
.ipynb_checkpoints

View File

@@ -0,0 +1,37 @@
# Deploying Pipecat to Modal.com
Barebones deployment example for [modal.com](https://www.modal.com)
1. Install dependencies
```bash
python -m venv venv
source venv/bin/active # or OS equivalent
pip install -r requirements.txt
```
2. Setup .env
```bash
cp env.example .env
```
Alternatively, you can configure your Modal app to use [secrets](https://modal.com/docs/guide/secrets)
3. Test the app locally
```bash
modal serve app.py
```
4. Deploy to production
```bash
modal deploy app.py
```
## Configuration options
This app sets some sensible defaults for reducing cold starts, such as `minkeep_warm=1`, which will keep at least 1 warm instance ready for your bot function.
It has been configured to only allow a concurrency of 1 (`max_inputs=1`) as each user will require their own running function.

View File

@@ -0,0 +1,75 @@
import os
import aiohttp
import modal
from fastapi import HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
from bot import _voice_bot_process
MAX_SESSION_TIME = 15 * 60 # 15 minutes
app = modal.App("pipecat-modal")
image = modal.Image.debian_slim(python_version="3.12").pip_install_from_requirements(
"requirements.txt"
)
@app.function(
image=image,
cpu=1.0,
secrets=[modal.Secret.from_dotenv()],
keep_warm=1,
enable_memory_snapshot=True,
max_inputs=1, # Do not reuse instances across requests
retries=0,
)
def launch_bot_process(room_url: str, token: str):
_voice_bot_process(room_url, token)
@app.function(
image=image,
secrets=[modal.Secret.from_dotenv()],
)
@modal.web_endpoint(method="POST")
async def start():
from pipecat.transports.services.helpers.daily_rest import (
DailyRESTHelper,
DailyRoomParams,
)
logger.info("Request received")
async with aiohttp.ClientSession() as session:
daily_rest_helper = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY", ""),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=session,
)
# Create new Daily room
room = await daily_rest_helper.create_room(DailyRoomParams())
if not room.url:
raise HTTPException(
status_code=500,
detail="Unable to create room",
)
logger.info(f"Created room: {room.url}")
# Create bot token for room
token = await daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
if not token:
raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room.url}")
logger.info(f"Bot token created: {token}")
# Spawn a new bot process
launch_bot_process.spawn(room_url=room.url, token=token)
# Return room URL to the user to join
# Note: in production, you would want to return a token to the user
return JSONResponse(content={"room_url": room.url, token: token})

View File

@@ -0,0 +1,90 @@
import asyncio
import os
import sys
from dotenv import load_dotenv
from loguru import logger
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token: str):
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
transport = DailyTransport(
room_url,
token,
"bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY", ""), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22"
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
runner = PipelineRunner()
await runner.run(task)
def _voice_bot_process(room_url: str, token: str):
asyncio.run(main(room_url, token))

View File

@@ -0,0 +1,3 @@
DAILY_API_KEY=
OPENAI_API_KEY=
CARTESIA_API_KEY=

View File

@@ -0,0 +1,5 @@
python-dotenv==1.0.1
modal==0.65.48
pipecat-ai[daily,silero,cartesia,openai]==0.0.48
fastapi==0.115.4
aiohttp==3.10.10

View File

@@ -1,24 +1,22 @@
import asyncio
import aiohttp
import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
from pipecat.frames.frames import (
LLMMessagesFrame,
EndFrame
)
from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings
from pipecat.vad.silero import SileroVADAnalyzer
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -29,75 +27,70 @@ daily_api_url = os.getenv("DAILY_API_URL", "https://api.daily.co/v1")
async def main(room_url: str, token: str, callId: str, callDomain: str):
async with aiohttp.ClientSession() as session:
# diallin_settings are only needed if Daily's SIP URI is used
# If you are handling this via Twilio, Telnyx, set this to None
# and handle call-forwarding when on_dialin_ready fires.
diallin_settings = DailyDialinSettings(
call_id=callId,
call_domain=callDomain
)
# diallin_settings are only needed if Daily's SIP URI is used
# If you are handling this via Twilio, Telnyx, set this to None
# and handle call-forwarding when on_dialin_ready fires.
diallin_settings = DailyDialinSettings(call_id=callId, call_domain=callDomain)
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_url=daily_api_url,
api_key=daily_api_key,
dialin_settings=diallin_settings,
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
)
)
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_url=daily_api_url,
api_key=daily_api_key,
dialin_settings=diallin_settings,
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Oh, hello! Who dares dial me at this hour?!'.",
},
]
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Oh, hello! Who dares dial me at this hour?!'.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
pipeline = Pipeline(
[
transport.input(),
tma_in,
context_aggregator.user(),
llm,
tts,
transport.output(),
tma_out,
])
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
runner = PipelineRunner()
runner = PipelineRunner()
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":

View File

@@ -6,40 +6,62 @@ provisioning a room and starting a Pipecat bot in response.
Refer to README for more information.
"""
import aiohttp
import os
import argparse
import subprocess
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomSipParams, DailyRoomParams
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, PlainTextResponse
from twilio.twiml.voice_response import VoiceResponse
from pipecat.transports.services.helpers.daily_rest import (
DailyRESTHelper,
DailyRoomObject,
DailyRoomProperties,
DailyRoomSipParams,
DailyRoomParams,
)
from dotenv import load_dotenv
load_dotenv(override=True)
# ------------ Configuration ------------ #
MAX_SESSION_TIME = 5 * 60 # 5 minutes
REQUIRED_ENV_VARS = ['OPENAI_API_KEY', 'DAILY_API_KEY',
'ELEVENLABS_API_KEY', 'ELEVENLABS_VOICE_ID']
daily_rest_helper = DailyRESTHelper(
os.getenv("DAILY_API_KEY", ""),
os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'))
REQUIRED_ENV_VARS = ["OPENAI_API_KEY", "DAILY_API_KEY", "ELEVENLABS_API_KEY", "ELEVENLABS_VOICE_ID"]
daily_helpers = {}
# ----------------- API ----------------- #
app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
aiohttp_session = aiohttp.ClientSession()
daily_helpers["rest"] = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY", ""),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=aiohttp_session,
)
yield
await aiohttp_session.close()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
allow_headers=["*"],
)
"""
@@ -53,61 +75,49 @@ action using the Twilio Client library.
"""
def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"):
async def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"):
if not room_url:
params = DailyRoomParams(
properties=DailyRoomProperties(
# Note: these are the default values, except for the display name
sip=DailyRoomSipParams(
display_name="dialin-user",
video=False,
sip_mode="dial-in",
num_endpoints=1
display_name="dialin-user", video=False, sip_mode="dial-in", num_endpoints=1
)
)
)
print(f"Creating new room...")
room: DailyRoomObject = daily_rest_helper.create_room(params=params)
room: DailyRoomObject = await daily_helpers["rest"].create_room(params=params)
else:
# Check passed room URL exist (we assume that it already has a sip set up!)
try:
print(f"Joining existing room: {room_url}")
room: DailyRoomObject = daily_rest_helper.get_room_from_url(
room_url)
room: DailyRoomObject = await daily_helpers["rest"].get_room_from_url(room_url)
except Exception:
raise HTTPException(
status_code=500, detail=f"Room not found: {room_url}")
raise HTTPException(status_code=500, detail=f"Room not found: {room_url}")
print(f"Daily room: {room.url} {room.config.sip_endpoint}")
# Give the agent a token to join the session
token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
token = await daily_helpers["rest"].get_token(room.url, MAX_SESSION_TIME)
if not room or not token:
raise HTTPException(
status_code=500, detail=f"Failed to get room or token token")
raise HTTPException(status_code=500, detail=f"Failed to get room or token token")
# Spawn a new agent, and join the user session
# Note: this is mostly for demonstration purposes (refer to 'deployment' in docs)
if vendor == "daily":
bot_proc = f"python3 -m bot_daily -u {room.url} -t {token} -i {
callId} -d {callDomain}"
bot_proc = f"python3 -m bot_daily -u {room.url} -t {token} -i {callId} -d {callDomain}"
else:
bot_proc = f"python3 -m bot_twilio -u {room.url} -t {
token} -i {callId} -s {room.config.sip_endpoint}"
bot_proc = f"python3 -m bot_twilio -u {room.url} -t {token} -i {callId} -s {room.config.sip_endpoint}"
try:
subprocess.Popen(
[bot_proc],
shell=True,
bufsize=1,
cwd=os.path.dirname(os.path.abspath(__file__))
[bot_proc], shell=True, bufsize=1, cwd=os.path.dirname(os.path.abspath(__file__))
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to start subprocess: {e}")
raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")
return room
@@ -130,18 +140,16 @@ async def twilio_start_bot(request: Request):
pass
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", None)
callId = data.get('CallSid')
callId = data.get("CallSid")
if not callId:
raise HTTPException(
status_code=500, detail="Missing 'CallSid' in request")
raise HTTPException(status_code=500, detail="Missing 'CallSid' in request")
print("CallId: %s" % callId)
# create room and tell the bot to join the created room
# note: Twilio does not require a callDomain
room: DailyRoomObject = _create_daily_room(
room_url, callId, None, "twilio")
room: DailyRoomObject = await _create_daily_room(room_url, callId, None, "twilio")
print(f"Put Twilio on hold...")
# We have the room and the SIP URI,
@@ -151,7 +159,8 @@ async def twilio_start_bot(request: Request):
# http://com.twilio.music.classical.s3.amazonaws.com/BusyStrings.mp3
resp = VoiceResponse()
resp.play(
url="http://com.twilio.sounds.music.s3.amazonaws.com/MARKOVICHAMP-Borghestral.mp3", loop=10)
url="http://com.twilio.sounds.music.s3.amazonaws.com/MARKOVICHAMP-Borghestral.mp3", loop=10
)
return str(resp)
@@ -173,19 +182,14 @@ async def daily_start_bot(request: Request) -> JSONResponse:
callId = data.get("callId", None)
callDomain = data.get("callDomain", None)
except Exception:
raise HTTPException(
status_code=500,
detail="Missing properties 'callId' or 'callDomain'")
raise HTTPException(status_code=500, detail="Missing properties 'callId' or 'callDomain'")
print(f"CallId: {callId}, CallDomain: {callDomain}")
room: DailyRoomObject = _create_daily_room(
room_url, callId, callDomain, "daily")
room: DailyRoomObject = await _create_daily_room(room_url, callId, callDomain, "daily")
# Grab a token for the user to join with
return JSONResponse({
"room_url": room.url,
"sipUri": room.config.sip_endpoint
})
return JSONResponse({"room_url": room.url, "sipUri": room.config.sip_endpoint})
# ----------------- Main ----------------- #
@@ -197,24 +201,18 @@ if __name__ == "__main__":
raise Exception(f"Missing environment variable: {env_var}.")
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
parser.add_argument("--host", type=str,
default=os.getenv("HOST", "0.0.0.0"), help="Host address")
parser.add_argument("--port", type=int,
default=os.getenv("PORT", 7860), help="Port number")
parser.add_argument("--reload", action="store_true",
default=True, help="Reload code on change")
parser.add_argument(
"--host", type=str, default=os.getenv("HOST", "0.0.0.0"), help="Host address"
)
parser.add_argument("--port", type=int, default=os.getenv("PORT", 7860), help="Port number")
parser.add_argument("--reload", action="store_true", default=True, help="Reload code on change")
config = parser.parse_args()
try:
import uvicorn
uvicorn.run(
"bot_runner:app",
host=config.host,
port=config.port,
reload=config.reload
)
uvicorn.run("bot_runner:app", host=config.host, port=config.port, reload=config.reload)
except KeyboardInterrupt:
print("Pipecat runner shutting down...")

View File

@@ -1,117 +1,112 @@
import asyncio
import aiohttp
import os
import sys
import argparse
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
from pipecat.frames.frames import (
LLMMessagesFrame,
EndFrame
)
from pipecat.frames.frames import LLMMessagesFrame, EndFrame
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from twilio.rest import Client
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
twilio_account_sid = os.getenv('TWILIO_ACCOUNT_SID')
twilio_auth_token = os.getenv('TWILIO_AUTH_TOKEN')
twilio_account_sid = os.getenv("TWILIO_ACCOUNT_SID")
twilio_auth_token = os.getenv("TWILIO_AUTH_TOKEN")
twilioclient = Client(twilio_account_sid, twilio_auth_token)
daily_api_key = os.getenv("DAILY_API_KEY", "")
async def main(room_url: str, token: str, callId: str, sipUri: str):
async with aiohttp.ClientSession() as session:
# diallin_settings are only needed if Daily's SIP URI is used
# If you are handling this via Twilio, Telnyx, set this to None
# and handle call-forwarding when on_dialin_ready fires.
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_key=daily_api_key,
dialin_settings=None, # Not required for Twilio
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
)
)
# dialin_settings are only needed if Daily's SIP URI is used
# If you are handling this via Twilio, Telnyx, set this to None
# and handle call-forwarding when on_dialin_ready fires.
transport = DailyTransport(
room_url,
token,
"Chatbot",
DailyParams(
api_key=daily_api_key,
dialin_settings=None, # Not required for Twilio
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Hello! Who dares dial me at this hour?!'.",
},
]
messages = [
{
"role": "system",
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Hello! Who dares dial me at this hour?!'.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
pipeline = Pipeline(
[
transport.input(),
tma_in,
context_aggregator.user(),
llm,
tts,
transport.output(),
tma_out,
])
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
@transport.event_handler("on_dialin_ready")
async def on_dialin_ready(transport, cdata):
# For Twilio, Telnyx, etc. You need to update the state of the call
# and forward it to the sip_uri..
print(f"Forwarding call: {callId} {sipUri}")
@transport.event_handler("on_dialin_ready")
async def on_dialin_ready(transport, cdata):
# For Twilio, Telnyx, etc. You need to update the state of the call
# and forward it to the sip_uri..
print(f"Forwarding call: {callId} {sipUri}")
try:
# The TwiML is updated using Twilio's client library
call = twilioclient.calls(callId).update(
twiml=f'<Response><Dial><Sip>{sipUri}</Sip></Dial></Response>'
)
except Exception as e:
raise Exception(f"Failed to forward call: {str(e)}")
try:
# The TwiML is updated using Twilio's client library
call = twilioclient.calls(callId).update(
twiml=f"<Response><Dial><Sip>{sipUri}</Sip></Dial></Response>"
)
except Exception as e:
raise Exception(f"Failed to forward call: {str(e)}")
runner = PipelineRunner()
await runner.run(task)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":

View File

@@ -1,7 +1,6 @@
pipecat-ai[daily,openai,silero]
pipecat-ai[daily,elevenlabs,openai,silero]
fastapi
uvicorn
requests
python-dotenv
loguru
twilio
twilio
python-multipart

View File

@@ -9,11 +9,11 @@ import aiohttp
import os
import sys
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
@@ -21,21 +21,24 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
async def main():
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True))
(room_url, _) = await configure(session)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
transport = DailyTransport(
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
runner = PipelineRunner()
@@ -44,13 +47,15 @@ async def main(room_url):
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_participant_joined")
async def on_new_participant_joined(transport, participant):
participant_name = participant["info"]["userName"] or ''
await task.queue_frames([TextFrame(f"Hello there, {participant_name}!"), EndFrame()])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
await task.queue_frames(
[TTSSpeakFrame(f"Hello there, {participant_name}!"), EndFrame()]
)
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -9,17 +9,18 @@ import aiohttp
import os
import sys
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.audio import LocalAudioTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -27,26 +28,24 @@ logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
transport = LocalAudioTransport(TransportParams(audio_out_enabled=True))
transport = LocalAudioTransport(TransportParams(audio_out_enabled=True))
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
pipeline = Pipeline([tts, transport.output()])
pipeline = Pipeline([tts, transport.output()])
task = PipelineTask(pipeline)
task = PipelineTask(pipeline)
async def say_something():
await asyncio.sleep(1)
await task.queue_frames([TextFrame("Hello there!"), EndFrame()])
async def say_something():
await asyncio.sleep(1)
await task.queue_frames([TTSSpeakFrame("Hello there, how is it going!"), EndFrame()])
runner = PipelineRunner()
runner = PipelineRunner()
await asyncio.gather(runner.run(task), say_something())
await asyncio.gather(runner.run(task), say_something())
if __name__ == "__main__":

View File

@@ -0,0 +1,111 @@
import argparse
import asyncio
import os
import sys
import aiohttp
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.livekit import LiveKitParams, LiveKitTransport
from livekit import api
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
def generate_token(room_name: str, participant_name: str, api_key: str, api_secret: str) -> str:
token = api.AccessToken(api_key, api_secret)
token.with_identity(participant_name).with_name(participant_name).with_grants(
api.VideoGrants(
room_join=True,
room=room_name,
)
)
return token.to_jwt()
async def configure_livekit():
parser = argparse.ArgumentParser(description="LiveKit AI SDK Bot Sample")
parser.add_argument(
"-r", "--room", type=str, required=False, help="Name of the LiveKit room to join"
)
parser.add_argument("-u", "--url", type=str, required=False, help="URL of the LiveKit server")
args, unknown = parser.parse_known_args()
room_name = args.room or os.getenv("LIVEKIT_ROOM_NAME")
url = args.url or os.getenv("LIVEKIT_URL")
api_key = os.getenv("LIVEKIT_API_KEY")
api_secret = os.getenv("LIVEKIT_API_SECRET")
if not room_name:
raise Exception(
"No LiveKit room specified. Use the -r/--room option from the command line, or set LIVEKIT_ROOM_NAME in your environment."
)
if not url:
raise Exception(
"No LiveKit server URL specified. Use the -u/--url option from the command line, or set LIVEKIT_URL in your environment."
)
if not api_key or not api_secret:
raise Exception(
"LIVEKIT_API_KEY and LIVEKIT_API_SECRET must be set in environment variables."
)
token = generate_token(room_name, "Say One Thing", api_key, api_secret)
user_token = generate_token(room_name, "User", api_key, api_secret)
logger.info(f"User token: {user_token}")
return (url, token, room_name)
async def main():
async with aiohttp.ClientSession() as session:
(url, token, room_name) = await configure_livekit()
transport = LiveKitTransport(
url=url,
token=token,
room_name=room_name,
params=LiveKitParams(audio_out_enabled=True),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
runner = PipelineRunner()
task = PipelineTask(Pipeline([tts, transport.output()]))
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant_id):
await asyncio.sleep(1)
await task.queue_frame(
TextFrame(
"Hello there! How are you doing today? Would you like to talk about the weather?"
)
)
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -13,7 +13,7 @@ from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -22,35 +22,34 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
async def main():
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
None,
"Say One Thing From an LLM",
DailyParams(audio_out_enabled=True))
(room_url, _) = await configure(session)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
transport = DailyTransport(
room_url, None, "Say One Thing From an LLM", DailyParams(audio_out_enabled=True)
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
}]
}
]
runner = PipelineRunner()
@@ -64,5 +63,4 @@ async def main(room_url):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -9,7 +9,7 @@ import aiohttp
import os
import sys
from pipecat.frames.frames import TextFrame
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
@@ -21,29 +21,26 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
"Show a still frame image",
DailyParams(
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024
)
DailyParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024),
)
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="square_hd"
),
params=FalImageGenService.InputParams(image_size="square_hd"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"),
)
@@ -54,15 +51,14 @@ async def main(room_url):
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut
# down.
await task.queue_frames([TextFrame("a cat in the style of picasso")])
await task.queue_frame(TextFrame("a cat in the style of picasso"))
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -22,6 +22,7 @@ from pipecat.transports.local.tk import TkLocalTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -35,15 +36,11 @@ async def main():
transport = TkLocalTransport(
tk_root,
TransportParams(
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024))
TransportParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024),
)
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="square_hd"
),
params=FalImageGenService.InputParams(image_size="square_hd"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"),
)
@@ -56,7 +53,7 @@ async def main():
runner = PipelineRunner()
async def run_tk():
while runner.is_active():
while not task.has_finished():
tk_root.update()
tk_root.update_idletasks()
await asyncio.sleep(0.1)

View File

@@ -4,6 +4,10 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
#
# This example broken on latest pipecat and needs updating.
#
import aiohttp
import asyncio
import os
@@ -24,14 +28,17 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(room_url, None, "Static And Dynamic Speech")
meeting = TransportServiceOutput(transport, mic_enabled=True)
@@ -52,8 +59,7 @@ async def main(room_url: str):
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
messages = [{"role": "system",
"content": "tell the user a joke about llamas"}]
messages = [{"role": "system", "content": "tell the user a joke about llamas"}]
# Start a task to run the LLM to create a joke, and convert the LLM
# output to audio frames. This task will run in parallel with generating
@@ -71,8 +77,7 @@ async def main(room_url: str):
]
)
merge_pipeline = SequentialMergePipeline(
[simple_tts_pipeline, llm_pipeline])
merge_pipeline = SequentialMergePipeline([simple_tts_pipeline, llm_pipeline])
await asyncio.gather(
transport.run(merge_pipeline),
@@ -82,5 +87,4 @@ async def main(room_url: str):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -13,23 +13,19 @@ from dataclasses import dataclass
from pipecat.frames.frames import (
AppFrame,
EndFrame,
Frame,
ImageRawFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
TextFrame
TextFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.parallel_task import ParallelTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.aggregators.gated import GatedAggregator
from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
from pipecat.processors.aggregators.sentence import SentenceAggregator
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -38,6 +34,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -73,8 +70,10 @@ class MonthPrepender(FrameProcessor):
await self.push_frame(frame, direction)
async def main(room_url):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
@@ -83,48 +82,46 @@ async def main(room_url):
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024
)
camera_out_height=1024,
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
tts = CartesiaHttpTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="square_hd"
),
params=FalImageGenService.InputParams(image_size="square_hd"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"),
)
gated_aggregator = GatedAggregator(
gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame),
gate_close_fn=lambda frame: isinstance(frame, LLMFullResponseStartFrame),
start_open=False
)
sentence_aggregator = SentenceAggregator()
month_prepender = MonthPrepender()
llm_full_response_aggregator = LLMFullResponseAggregator()
pipeline = Pipeline([
llm, # LLM
sentence_aggregator, # Aggregates LLM output into full sentences
ParallelTask( # Run pipelines in parallel aggregating the result
[month_prepender, tts], # Create "Month: sentence" and output audio
[llm_full_response_aggregator, imagegen] # Aggregate full LLM response
),
gated_aggregator, # Queues everything until an image is available
transport.output() # Transport output
])
# With `SyncParallelPipeline` we synchronize audio and images by pushing
# them basically in order (e.g. I1 A1 A1 A1 I2 A2 A2 A2 A2 I3 A3). To do
# that, each pipeline runs concurrently and `SyncParallelPipeline` will
# wait for the input frame to be processed.
#
# Note that `SyncParallelPipeline` requires the last processor in each
# of the pipelines to be synchronous. In this case, we use
# `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP
# requests and wait for the response.
pipeline = Pipeline(
[
llm, # LLM
sentence_aggregator, # Aggregates LLM output into full sentences
SyncParallelPipeline( # Run pipelines in parallel aggregating the result
[month_prepender, tts], # Create "Month: sentence" and output audio
[imagegen], # Generate image
),
transport.output(), # Transport output
]
)
frames = []
for month in [
@@ -150,8 +147,6 @@ async def main(room_url):
frames.append(MonthFrame(month=month))
frames.append(LLMMessagesFrame(messages))
frames.append(EndFrame())
runner = PipelineRunner()
task = PipelineTask(pipeline)
@@ -162,5 +157,4 @@ async def main(room_url):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -11,18 +11,25 @@ import sys
import tkinter as tk
from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.frames.frames import (
Frame,
OutputAudioRawFrame,
TTSAudioRawFrame,
URLImageRawFrame,
LLMMessagesFrame,
TextFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
from pipecat.processors.aggregators.sentence import SentenceAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from pipecat.transports.local.tk import TkLocalTransport, TkOutputTransport
from loguru import logger
@@ -42,7 +49,12 @@ async def main():
runner = PipelineRunner()
async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
messages = [
{
"role": "system",
"content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
}
]
class ImageDescription(FrameProcessor):
def __init__(self):
@@ -60,14 +72,17 @@ async def main():
def __init__(self):
super().__init__()
self.audio = bytearray()
self.frame = None
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, AudioRawFrame):
if isinstance(frame, TTSAudioRawFrame):
self.audio.extend(frame.audio)
self.frame = AudioRawFrame(
bytes(self.audio), frame.sample_rate, frame.num_channels)
self.frame = OutputAudioRawFrame(
bytes(self.audio), frame.sample_rate, frame.num_channels
)
await self.push_frame(frame, direction)
class ImageGrabber(FrameProcessor):
def __init__(self):
@@ -79,24 +94,22 @@ async def main():
if isinstance(frame, URLImageRawFrame):
self.frame = frame
await self.push_frame(frame, direction)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
tts = CartesiaHttpTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="square_hd"
),
params=FalImageGenService.InputParams(image_size="square_hd"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"))
key=os.getenv("FAL_KEY"),
)
aggregator = LLMFullResponseAggregator()
sentence_aggregator = SentenceAggregator()
description = ImageDescription()
@@ -104,13 +117,27 @@ async def main():
image_grabber = ImageGrabber()
pipeline = Pipeline([
llm,
aggregator,
description,
ParallelPipeline([tts, audio_grabber],
[imagegen, image_grabber])
])
# With `SyncParallelPipeline` we synchronize audio and images by
# pushing them basically in order (e.g. I1 A1 A1 A1 I2 A2 A2 A2 A2
# I3 A3). To do that, each pipeline runs concurrently and
# `SyncParallelPipeline` will wait for the input frame to be
# processed.
#
# Note that `SyncParallelPipeline` requires the last processor in
# each of the pipelines to be synchronous. In this case, we use
# `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP
# requests and wait for the response.
pipeline = Pipeline(
[
llm, # LLM
sentence_aggregator, # Aggregates LLM output into full sentences
description, # Store sentence
SyncParallelPipeline(
[tts, audio_grabber], # Generate and store audio for the given sentence
[imagegen, image_grabber], # Generate and storeimage for the given sentence
),
]
)
task = PipelineTask(pipeline)
await task.queue_frame(LLMMessagesFrame(messages))
@@ -131,20 +158,19 @@ async def main():
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_width=1024,
camera_out_height=1024))
camera_out_height=1024,
),
)
pipeline = Pipeline([transport.output()])
task = PipelineTask(pipeline)
# We only specify 5 months as we create tasks all at once and we might
# get rate limited otherwise.
# We only specify a few months as we create tasks all at once and we
# might get rate limited otherwise.
months: list[str] = [
"January",
"February",
# "March",
# "April",
# "May",
]
# We create one task per month. This will be executed concurrently.

View File

@@ -5,37 +5,61 @@
#
import asyncio
import aiohttp
import os
import sys
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.processors.logger import FrameLogger
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, LLMMessagesFrame, MetricsFrame
from pipecat.metrics.metrics import (
LLMUsageMetricsData,
ProcessingMetricsData,
TTFBMetricsData,
TTSUsageMetricsData,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
class MetricsLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, MetricsFrame):
for d in frame.data:
if isinstance(d, TTFBMetricsData):
print(f"!!! MetricsFrame: {frame}, ttfb: {d.value}")
elif isinstance(d, ProcessingMetricsData):
print(f"!!! MetricsFrame: {frame}, processing: {d.value}")
elif isinstance(d, LLMUsageMetricsData):
tokens = d.value
print(
f"!!! MetricsFrame: {frame}, tokens: {
tokens.prompt_tokens}, characters: {
tokens.completion_tokens}"
)
elif isinstance(d, TTSUsageMetricsData):
print(f"!!! MetricsFrame: {frame}, characters: {d.value}")
await self.push_frame(frame, direction)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -44,23 +68,18 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
fl = FrameLogger("!!! after LLM", "red")
fltts = FrameLogger("@@@ out of tts", "green")
flend = FrameLogger("### out of the end", "magenta")
ml = MetricsLogger()
messages = [
{
@@ -68,29 +87,32 @@ async def main(room_url: str, token):
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
pipeline = Pipeline([
transport.input(),
tma_in,
llm,
fl,
tts,
fltts,
transport.output(),
tma_out,
flend
])
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
task = PipelineTask(pipeline)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
ml,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
PipelineParams(enable_metrics=True, enable_usage_metrics=True),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -99,5 +121,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -11,19 +11,16 @@ import sys
from PIL import Image
from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.transports.services.daily import DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.transports.services.daily import DailyParams
from runner import configure
@@ -31,6 +28,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -52,15 +50,29 @@ class ImageSyncAggregator(FrameProcessor):
await super().process_frame(frame, direction)
if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
await self.push_frame(
OutputImageRawFrame(
image=self._speaking_image_bytes,
size=(1024, 1024),
format=self._speaking_image_format,
)
)
await self.push_frame(frame)
await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format))
await self.push_frame(
OutputImageRawFrame(
image=self._waiting_image_bytes,
size=(1024, 1024),
format=self._waiting_image_format,
)
)
else:
await self.push_frame(frame)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -73,18 +85,15 @@ async def main(room_url: str, token):
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
)
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaHttpTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -93,30 +102,32 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
image_sync_aggregator = ImageSyncAggregator(
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
)
pipeline = Pipeline([
transport.input(),
image_sync_aggregator,
tma_in,
llm,
tts,
transport.output(),
tma_out
])
pipeline = Pipeline(
[
transport.input(),
image_sync_aggregator,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant["info"]["userName"] or ''
transport.capture_participant_transcription(participant["id"])
participant_name = participant.get("info", {}).get("userName", "")
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([TextFrame(f"Hi there {participant_name}!")])
runner = PipelineRunner()
@@ -125,5 +136,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -0,0 +1,103 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.audio.vad.silero import SileroVAD
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
transcription_enabled=True,
),
)
vad = SileroVAD()
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
vad,
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -9,30 +9,32 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -41,19 +43,16 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -62,30 +61,35 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(
allow_interruptions=True,
enable_metrics=True,
report_only_initial_ttfb=True,
))
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -94,5 +98,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -5,34 +5,34 @@
#
import asyncio
import aiohttp
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -41,19 +41,18 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = AnthropicLLMService(
api_key=os.getenv("ANTHROPIC_API_KEY"),
model="claude-3-opus-20240229")
api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-opus-20240229"
)
# todo: think more about how to handle system prompts in a more general way. OpenAI,
# Google, and Anthropic all have slightly different approaches to providing a system
@@ -65,23 +64,25 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([LLMMessagesFrame(messages)])
@@ -91,5 +92,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -10,16 +10,18 @@ import sys
import aiohttp
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.processors.frameworks.langchain import LangchainProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
@@ -32,6 +34,7 @@ from loguru import logger
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
@@ -47,8 +50,10 @@ def get_session_history(session_id: str) -> BaseChatMessageHistory:
return message_store[session_id]
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -61,27 +66,29 @@ async def main(room_url: str, token):
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
prompt = ChatPromptTemplate.from_messages(
[
("system",
"Be nice and helpful. Answer very briefly and without special characters like `#` or `*`. "
"Your response will be synthesized to voice and those characters will create unnatural sounds.",
),
(
"system",
"Be nice and helpful. Answer very briefly and without special characters like `#` or `*`. "
"Your response will be synthesized to voice and those characters will create unnatural sounds.",
),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
])
]
)
chain = prompt | ChatOpenAI(model="gpt-4o", temperature=0.7)
history_chain = RunnableWithMessageHistory(
chain,
get_session_history,
history_messages_key="chat_history",
input_messages_key="input")
input_messages_key="input",
)
lc = LangchainProcessor(history_chain)
tma_in = LLMUserResponseAggregator()
@@ -89,12 +96,12 @@ async def main(room_url: str, token):
pipeline = Pipeline(
[
transport.input(), # Transport user input
tma_in, # User responses
lc, # Langchain
tts, # TTS
transport.output(), # Transport bot output
tma_out, # Assistant spoken responses
transport.input(), # Transport user input
tma_in, # User responses
lc, # Langchain
tts, # TTS
transport.output(), # Transport bot output
tma_out, # Assistant spoken responses
]
)
@@ -102,17 +109,13 @@ async def main(room_url: str, token):
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
lc.set_participant_id(participant["id"])
# Kick off the conversation.
# the `LLMMessagesFrame` will be picked up by the LangchainProcessor using
# only the content of the last message to inject it in the prompt defined
# above. So no role is required here.
messages = [(
{
"content": "Please briefly introduce yourself to the user."
}
)]
messages = [({"content": "Please briefly introduce yourself to the user."})]
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -121,5 +124,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -5,57 +5,51 @@
#
import asyncio
import aiohttp
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
token,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True
)
vad_audio_passthrough=True,
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
voice="aura-helios-en"
)
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -64,27 +58,27 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
stt, # STT
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -93,5 +87,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -1,94 +0,0 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_sample_rate=44100,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
sample_rate=44100,
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
tma_out, # Goes before the transport because cartesia has word-level timestamps!
transport.output(), # Transport bot output
])
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,99 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -8,86 +8,95 @@ import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.playht import PlayHTTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=16000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
)
tts = PlayHTTTSService(
user_id=os.getenv("PLAYHT_USER_ID"),
api_key=os.getenv("PLAYHT_API_KEY"),
voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
)
tts = PlayHTTTSService(
user_id=os.getenv("PLAYHT_USER_ID"),
api_key=os.getenv("PLAYHT_API_KEY"),
voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
params=PlayHTTTSService.InputParams(language=Language.EN),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
runner = PipelineRunner()
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -4,19 +4,19 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
@@ -24,77 +24,80 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=16000,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
)
stt = AzureSTTService(
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
region=os.getenv("AZURE_SPEECH_REGION"),
)
stt = AzureSTTService(
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
region=os.getenv("AZURE_SPEECH_REGION"),
)
tts = AzureTTSService(
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
region=os.getenv("AZURE_SPEECH_REGION"),
)
tts = AzureTTSService(
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
region=os.getenv("AZURE_SPEECH_REGION"),
)
llm = AzureLLMService(
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
model=os.getenv("AZURE_CHATGPT_MODEL"),
)
llm = AzureLLMService(
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
model=os.getenv("AZURE_CHATGPT_MODEL"),
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
stt, # STT
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
runner = PipelineRunner()
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -8,85 +8,81 @@ import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.openai import OpenAITTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
)
tts = OpenAITTSService(
api_key=os.getenv("OPENAI_API_KEY"),
voice="alloy"
)
tts = OpenAITTSService(api_key=os.getenv("OPENAI_API_KEY"), voice="alloy")
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
runner = PipelineRunner()
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,18 +9,15 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openpipe import OpenPipeLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
@@ -28,14 +25,17 @@ from loguru import logger
import time
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -44,14 +44,13 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
timestamp = int(time.time())
@@ -59,9 +58,7 @@ async def main(room_url: str, token):
api_key=os.getenv("OPENAI_API_KEY"),
openpipe_api_key=os.getenv("OPENPIPE_API_KEY"),
model="gpt-4o",
tags={
"conversation_id": f"pipecat-{timestamp}"
}
tags={"conversation_id": f"pipecat-{timestamp}"},
)
messages = [
@@ -70,26 +67,28 @@ async def main(room_url: str, token):
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -98,5 +97,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,31 +9,32 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.xtts import XTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -43,19 +44,17 @@ async def main(room_url: str, token):
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
)
),
)
tts = XTTSService(
aiohttp_session=session,
voice_id="Claribel Dervla",
language="en",
base_url="http://localhost:8000"
base_url="http://localhost:8000",
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -64,26 +63,27 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -92,5 +92,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -5,36 +5,35 @@
#
import asyncio
import aiohttp
import os
import sys
from pipecat.frames.frames import LLMMessagesFrame
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gladia import GladiaSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.xtts import XTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -44,22 +43,19 @@ async def main(room_url: str, token):
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
)
),
)
stt = GladiaSTTService(
api_key=os.getenv("GLADIA_API_KEY"),
)
tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
voice="aura-helios-en"
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -68,34 +64,39 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
stt, # STT
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
# Register an event handler to exit the application when the user leaves.
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -0,0 +1,91 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.lmnt import LmntTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = LmntTTSService(api_key=os.getenv("LMNT_API_KEY"), voice_id="morgan")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User respones
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,109 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.ai_services import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = TogetherLLMService(
api_key=os.getenv("TOGETHER_API_KEY"),
model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
params=TogetherLLMService.InputParams(
temperature=1.0,
top_p=0.9,
top_k=40,
extra={
"frequency_penalty": 2.0,
"presence_penalty": 0.0,
},
),
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond in plain language. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
user_aggregator = context_aggregator.user()
assistant_aggregator = context_aggregator.assistant()
pipeline = Pipeline(
[
transport.input(), # Transport user input
user_aggregator, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
assistant_aggregator, # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True, enable_metrics=True, enable_usage_metrics=True
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.aws import AWSTTSService
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = AWSTTSService(
api_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
region=os.getenv("AWS_REGION"),
voice_id="Amy",
params=AWSTTSService.InputParams(engine="neural", language="en-GB", rate="1.05"),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,96 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.google import GoogleTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = GoogleTTSService(
voice_id="en-US-Neural2-J",
params=GoogleTTSService.InputParams(language="en-US", rate="1.05"),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User respones
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,97 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.assemblyai import AssemblyAISTTService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
stt = AssemblyAISTTService(
api_key=os.getenv("ASSEMBLYAI_API_KEY"),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,274 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
import google.ai.generativelanguage as glm
from dataclasses import dataclass
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.frames.frames import (
LLMFullResponseStartFrame,
LLMFullResponseEndFrame,
InputAudioRawFrame,
Frame,
StartInterruptionFrame,
TextFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
marker = "|----|"
system_message = f"""
You are a helpful LLM in a WebRTC call. Your goals are to be helpful and brief in your responses.
You are expert at transcribing audio to text. You will receive a mixture of audio and text input. When
asked to transcribe what the user said, output an exact, word-for-word transcription.
Your output will be converted to audio so don't include special characters in your answers.
Each time you answer, you should respond in three parts.
1. Transcribe exactly what the user said.
2. Output the separator field '{marker}'.
3. Respond to the user's input in a helpful, creative way using only simple text and punctuation.
Example:
User: How many ounces are in a pound?
You: How many ounces are in a pound?
{marker}
There are 16 ounces in a pound.
"""
@dataclass
class MagicDemoTranscriptionFrame(Frame):
text: str
class UserAudioCollector(FrameProcessor):
def __init__(self, context, user_context_aggregator):
super().__init__()
self._context = context
self._user_context_aggregator = user_context_aggregator
self._audio_frames = []
self._start_secs = 0.2 # this should match VAD start_secs (hardcoding for now)
self._user_speaking = False
async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)
if isinstance(frame, TranscriptionFrame):
# We could gracefully handle both audio input and text/transcription input ...
# but let's leave that as an exercise to the reader. :-)
return
if isinstance(frame, UserStartedSpeakingFrame):
self._user_speaking = True
elif isinstance(frame, UserStoppedSpeakingFrame):
self._user_speaking = False
self._context.add_audio_frames_message(audio_frames=self._audio_frames)
await self._user_context_aggregator.push_frame(
self._user_context_aggregator.get_context_frame()
)
elif isinstance(frame, InputAudioRawFrame):
if self._user_speaking:
self._audio_frames.append(frame)
else:
# Append the audio frame to our buffer. Treat the buffer as a ring buffer, dropping the oldest
# frames as necessary. Assume all audio frames have the same duration.
self._audio_frames.append(frame)
frame_duration = len(frame.audio) / 16 * frame.num_channels / frame.sample_rate
buffer_duration = frame_duration * len(self._audio_frames)
while buffer_duration > self._start_secs:
self._audio_frames.pop(0)
buffer_duration -= frame_duration
await self.push_frame(frame, direction)
class TranscriptExtractor(FrameProcessor):
def __init__(self, context):
super().__init__()
self._context = context
self._accumulator = ""
self._processing_llm_response = False
self._accumulating_transcript = False
def reset(self):
self._accumulator = ""
self._processing_llm_response = False
self._accumulating_transcript = False
async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)
if isinstance(frame, LLMFullResponseStartFrame):
self._processing_llm_response = True
self._accumulating_transcript = True
elif isinstance(frame, TextFrame) and self._processing_llm_response:
if self._accumulating_transcript:
text = frame.text
split_index = text.find(marker)
if split_index < 0:
self._accumulator += frame.text
# do not push this frame
return
else:
self._accumulating_transcript = False
self._accumulator += text[:split_index]
frame.text = text[split_index + len(marker) :]
await self.push_frame(frame)
return
elif isinstance(frame, LLMFullResponseEndFrame):
await self.push_frame(MagicDemoTranscriptionFrame(text=self._accumulator.strip()))
self.reset()
await self.push_frame(frame, direction)
class TanscriptionContextFixup(FrameProcessor):
def __init__(self, context):
super().__init__()
self._context = context
self._transcript = "THIS IS A TRANSCRIPT"
def swap_user_audio(self):
if not self._transcript:
return
message = self._context.messages[-2]
last_part = message.parts[-1]
if (
message.role == "user"
and last_part.inline_data
and last_part.inline_data.mime_type == "audio/wav"
):
self._context.messages[-2] = glm.Content(
role="user", parts=[glm.Part(text=self._transcript)]
)
def add_transcript_back_to_inference_output(self):
if not self._transcript:
return
message = self._context.messages[-1]
last_part = message.parts[-1]
if message.role == "model" and last_part.text:
self._context.messages[-1].parts[-1].text += f"\n\n{marker}\n{self._transcript}\n"
async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)
if isinstance(frame, MagicDemoTranscriptionFrame):
self._transcript = frame.text
elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(
frame, StartInterruptionFrame
):
self.swap_user_audio()
self.add_transcript_back_to_inference_output()
self._transcript = ""
await self.push_frame(frame, direction)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
# No transcription at all. just audio input to Gemini!
# transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = GoogleLLMService(model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY"))
messages = [
{
"role": "system",
"content": system_message,
},
{
"role": "user",
"content": "Start by saying hello.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
audio_collector = UserAudioCollector(context, context_aggregator.user())
pull_transcript_out_of_llm_output = TranscriptExtractor(context)
fixup_context_messages = TanscriptionContextFixup(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
audio_collector,
context_aggregator.user(), # User responses
llm, # LLM
pull_transcript_out_of_llm_output,
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
fixup_context_messages,
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,95 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator,
LLMUserResponseAggregator,
)
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.audio.filters.krisp_filter import KrispFilter
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
audio_in_filter=KrispFilter(),
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out, # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,100 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.rime import RimeHttpTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = RimeHttpTTSService(
api_key=os.getenv("RIME_API_KEY", ""),
voice_id="rex",
params=RimeHttpTTSService.InputParams(reduce_latency=True),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -3,18 +3,19 @@ import aiohttp
import asyncio
import logging
import os
from pipecat.pipeline.aggregators import SentenceAggregator
from pipecat.processors.aggregators import SentenceAggregator
from pipecat.pipeline.pipeline import Pipeline
from pipecat.transports.daily_transport import DailyTransport
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
from pipecat.services.fal_ai_services import FalImageGenService
from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from pipecat.transports.services.daily import DailyTransport
from pipecat.services.azure import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from runner import configure
from dotenv import load_dotenv
load_dotenv(override=True)
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
@@ -22,8 +23,10 @@ logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)
async def main(room_url: str):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
@@ -51,9 +54,7 @@ async def main(room_url: str):
voice_id="jBpfuIE2acCO8z3wKNLl",
)
dalle = FalImageGenService(
params=FalImageGenService.InputParams(
image_size="1024x1024"
),
params=FalImageGenService.InputParams(image_size="1024x1024"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"),
)
@@ -73,13 +74,11 @@ async def main(room_url: str):
async def get_text_and_audio(messages) -> Tuple[str, bytearray]:
"""This function streams text from the LLM and uses the TTS service to convert
that text to speech as it's received. """
that text to speech as it's received."""
source_queue = asyncio.Queue()
sink_queue = asyncio.Queue()
sentence_aggregator = SentenceAggregator()
pipeline = Pipeline(
[llm, sentence_aggregator, tts1], source_queue, sink_queue
)
pipeline = Pipeline([llm, sentence_aggregator, tts1], source_queue, sink_queue)
await source_queue.put(LLMMessagesFrame(messages))
await source_queue.put(EndFrame())
@@ -144,5 +143,4 @@ async def main(room_url: str):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -4,12 +4,21 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import sys
from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
InputImageRawFrame,
OutputAudioRawFrame,
OutputImageRawFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.services.daily import DailyTransport, DailyParams
from runner import configure
@@ -17,38 +26,64 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url, token):
transport = DailyTransport(
room_url, token, "Test",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
camera_out_width=1280,
camera_out_height=720
class MirrorProcessor(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, InputAudioRawFrame):
await self.push_frame(
OutputAudioRawFrame(
audio=frame.audio,
sample_rate=frame.sample_rate,
num_channels=frame.num_channels,
)
)
elif isinstance(frame, InputImageRawFrame):
await self.push_frame(
OutputImageRawFrame(image=frame.image, size=frame.size, format=frame.format)
)
else:
await self.push_frame(frame, direction)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Test",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
camera_out_width=1280,
camera_out_height=720,
),
)
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_video(participant["id"])
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_video(participant["id"])
pipeline = Pipeline([transport.input(), transport.output()])
pipeline = Pipeline([transport.input(), MirrorProcessor(), transport.output()])
runner = PipelineRunner()
runner = PipelineRunner()
task = PipelineTask(pipeline)
task = PipelineTask(pipeline)
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -4,14 +4,23 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import sys
import tkinter as tk
from pipecat.frames.frames import (
Frame,
InputAudioRawFrame,
InputImageRawFrame,
OutputAudioRawFrame,
OutputImageRawFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
from pipecat.transports.services.daily import DailyParams, DailyTransport
@@ -21,46 +30,73 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url, token):
tk_root = tk.Tk()
tk_root.title("Local Mirror")
class MirrorProcessor(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
daily_transport = DailyTransport(room_url, token, "Test", DailyParams(audio_in_enabled=True))
if isinstance(frame, InputAudioRawFrame):
await self.push_frame(
OutputAudioRawFrame(
audio=frame.audio,
sample_rate=frame.sample_rate,
num_channels=frame.num_channels,
)
)
elif isinstance(frame, InputImageRawFrame):
await self.push_frame(
OutputImageRawFrame(image=frame.image, size=frame.size, format=frame.format)
)
else:
await self.push_frame(frame, direction)
tk_transport = TkLocalTransport(
tk_root,
TransportParams(
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
camera_out_width=1280,
camera_out_height=720))
@daily_transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_video(participant["id"])
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
pipeline = Pipeline([daily_transport.input(), tk_transport.output()])
tk_root = tk.Tk()
tk_root.title("Local Mirror")
task = PipelineTask(pipeline)
daily_transport = DailyTransport(
room_url, token, "Test", DailyParams(audio_in_enabled=True, audio_in_sample_rate=24000)
)
async def run_tk():
while not task.has_finished():
tk_root.update()
tk_root.update_idletasks()
await asyncio.sleep(0.1)
tk_transport = TkLocalTransport(
tk_root,
TransportParams(
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
camera_out_width=1280,
camera_out_height=720,
),
)
runner = PipelineRunner()
@daily_transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_video(participant["id"])
await asyncio.gather(runner.run(task), run_tk())
pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()])
task = PipelineTask(pipeline)
async def run_tk():
while not task.has_finished():
tk_root.update()
tk_root.update_idletasks()
await asyncio.sleep(0.1)
runner = PipelineRunner()
await asyncio.gather(runner.run(task), run_tk())
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,31 +9,32 @@ import aiohttp
import os
import sys
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -42,19 +43,16 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
@@ -64,24 +62,27 @@ async def main(room_url: str, token):
]
hey_robot_filter = WakeCheckFilter(["hey robot", "hey, robot"])
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
pipeline = Pipeline([
transport.input(), # Transport user input
hey_robot_filter, # Filter out speech not directed at the robot
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
hey_robot_filter, # Filter out speech not directed at the robot
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
await tts.say("Hi! If you want to talk to me, just say 'Hey Robot'.")
runner = PipelineRunner()
@@ -90,5 +91,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -10,31 +10,29 @@ import os
import sys
import wave
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
Frame,
AudioRawFrame,
LLMFullResponseEndFrame,
LLMMessagesFrame,
OutputAudioRawFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMUserResponseAggregator,
LLMAssistantResponseAggregator,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.logger import FrameLogger
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaHttpTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -53,12 +51,12 @@ for file in sound_files:
filename = os.path.splitext(os.path.basename(full_path))[0]
# Open the image and convert it to bytes
with wave.open(full_path) as audio_file:
sounds[file] = AudioRawFrame(audio_file.readframes(-1),
audio_file.getframerate(), audio_file.getnchannels())
sounds[file] = OutputAudioRawFrame(
audio_file.readframes(-1), audio_file.getframerate(), audio_file.getnchannels()
)
class OutboundSoundEffectWrapper(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -71,7 +69,6 @@ class OutboundSoundEffectWrapper(FrameProcessor):
class InboundSoundEffectWrapper(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -83,8 +80,10 @@ class InboundSoundEffectWrapper(FrameProcessor):
await self.push_frame(frame, direction)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -93,18 +92,15 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id="ErXwobaYiN019PkySvjV",
tts = CartesiaHttpTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
messages = [
@@ -114,29 +110,31 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
out_sound = OutboundSoundEffectWrapper()
in_sound = InboundSoundEffectWrapper()
fl = FrameLogger("LLM Out")
fl2 = FrameLogger("Transcription In")
pipeline = Pipeline([
transport.input(),
tma_in,
in_sound,
fl2,
llm,
fl,
tts,
out_sound,
transport.output(),
tma_out
])
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
in_sound,
fl2,
llm,
fl,
tts,
out_sound,
transport.output(),
context_aggregator.assistant(),
]
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
await tts.say("Hi, I'm listening!")
await transport.send_audio(sounds["ding1.wav"])
@@ -148,5 +146,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -16,16 +17,16 @@ from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.moondream import MoondreamService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -33,7 +34,6 @@ logger.add(sys.stderr, level="DEBUG")
class UserImageRequester(FrameProcessor):
def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id
@@ -45,12 +45,16 @@ class UserImageRequester(FrameProcessor):
await super().process_frame(frame, direction)
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
)
await self.push_frame(frame, direction)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -59,14 +63,8 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
vad_analyzer=SileroVADAnalyzer(),
),
)
user_response = UserResponseAggregator()
@@ -78,28 +76,29 @@ async def main(room_url: str, token):
# If you run into weird description, try with use_cpu=True
moondream = MoondreamService()
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(participant["id"], framerate=0)
await transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])
pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
moondream,
tts,
transport.output()
])
pipeline = Pipeline(
[
transport.input(),
user_response,
image_requester,
vision_aggregator,
moondream,
tts,
transport.output(),
]
)
task = PipelineTask(pipeline)
@@ -107,6 +106,6 @@ async def main(room_url: str, token):
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -16,16 +17,16 @@ from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -33,7 +34,6 @@ logger.add(sys.stderr, level="DEBUG")
class UserImageRequester(FrameProcessor):
def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id
@@ -45,12 +45,16 @@ class UserImageRequester(FrameProcessor):
await super().process_frame(frame, direction)
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
)
await self.push_frame(frame, direction)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -60,8 +64,8 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
user_response = UserResponseAggregator()
@@ -71,31 +75,32 @@ async def main(room_url: str, token):
vision_aggregator = VisionImageFrameAggregator()
google = GoogleLLMService(
model="gemini-1.5-flash-latest",
api_key=os.getenv("GOOGLE_API_KEY"))
model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY")
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(participant["id"], framerate=0)
await transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])
pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
google,
tts,
transport.output()
])
pipeline = Pipeline(
[
transport.input(),
user_response,
image_requester,
vision_aggregator,
google,
tts,
transport.output(),
]
)
task = PipelineTask(pipeline)
@@ -103,6 +108,6 @@ async def main(room_url: str, token):
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -16,16 +17,16 @@ from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -33,7 +34,6 @@ logger.add(sys.stderr, level="DEBUG")
class UserImageRequester(FrameProcessor):
def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id
@@ -45,12 +45,16 @@ class UserImageRequester(FrameProcessor):
await super().process_frame(frame, direction)
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
)
await self.push_frame(frame, direction)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -59,8 +63,8 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
user_response = UserResponseAggregator()
@@ -69,33 +73,31 @@ async def main(room_url: str, token):
vision_aggregator = VisionImageFrameAggregator()
openai = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o"
)
openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(participant["id"], framerate=0)
await transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])
pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
openai,
tts,
transport.output()
])
pipeline = Pipeline(
[
transport.input(),
user_response,
image_requester,
vision_aggregator,
openai,
tts,
transport.output(),
]
)
task = PipelineTask(pipeline)
@@ -103,6 +105,6 @@ async def main(room_url: str, token):
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -16,16 +17,16 @@ from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -33,7 +34,6 @@ logger.add(sys.stderr, level="DEBUG")
class UserImageRequester(FrameProcessor):
def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id
@@ -45,12 +45,16 @@ class UserImageRequester(FrameProcessor):
await super().process_frame(frame, direction)
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
)
await self.push_frame(frame, direction)
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -59,8 +63,8 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
user_response = UserResponseAggregator()
@@ -69,33 +73,31 @@ async def main(room_url: str, token):
vision_aggregator = VisionImageFrameAggregator()
anthropic = AnthropicLLMService(
api_key=os.getenv("ANTHROPIC_API_KEY"),
model="claude-3-sonnet-20240229"
)
anthropic = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(participant["id"], framerate=0)
await transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])
pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
anthropic,
tts,
transport.output()
])
pipeline = Pipeline(
[
transport.input(),
user_response,
image_requester,
vision_aggregator,
anthropic,
tts,
transport.output(),
]
)
task = PipelineTask(pipeline)
@@ -103,6 +105,6 @@ async def main(room_url: str, token):
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import sys
@@ -20,6 +21,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -27,7 +29,6 @@ logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -35,23 +36,26 @@ class TranscriptionLogger(FrameProcessor):
print(f"Transcription: {frame.text}")
async def main(room_url: str):
transport = DailyTransport(room_url, None, "Transcription bot",
DailyParams(audio_in_enabled=True))
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
stt = WhisperSTTService()
transport = DailyTransport(
room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True)
)
tl = TranscriptionLogger()
stt = WhisperSTTService()
pipeline = Pipeline([transport.input(), stt, tl])
tl = TranscriptionLogger()
task = PipelineTask(pipeline)
pipeline = Pipeline([transport.input(), stt, tl])
runner = PipelineRunner()
task = PipelineTask(pipeline)
await runner.run(task)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -19,6 +19,7 @@ from pipecat.transports.local.audio import LocalAudioTransport
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -26,7 +27,6 @@ logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

View File

@@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import aiohttp
import asyncio
import os
import sys
@@ -13,7 +14,7 @@ from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.services.deepgram import DeepgramSTTService, LiveOptions, Language
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
@@ -21,6 +22,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -28,7 +30,6 @@ logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
@@ -36,23 +37,29 @@ class TranscriptionLogger(FrameProcessor):
print(f"Transcription: {frame.text}")
async def main(room_url: str):
transport = DailyTransport(room_url, None, "Transcription bot",
DailyParams(audio_in_enabled=True))
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
stt = DeepgramSTTService(os.getenv("DEEPGRAM_API_KEY"))
transport = DailyTransport(
room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True)
)
tl = TranscriptionLogger()
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
# live_options=LiveOptions(language=Language.FR),
)
pipeline = Pipeline([transport.input(), stt, tl])
tl = TranscriptionLogger()
task = PipelineTask(pipeline)
pipeline = Pipeline([transport.input(), stt, tl])
runner = PipelineRunner()
task = PipelineTask(pipeline)
await runner.run(task)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url))
asyncio.run(main())

View File

@@ -0,0 +1,63 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.gladia import GladiaSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True)
)
stt = GladiaSTTService(
api_key=os.getenv("GLADIA_API_KEY"),
# live_options=LiveOptions(language=Language.FR),
)
tl = TranscriptionLogger()
pipeline = Pipeline([transport.input(), stt, tl])
task = PipelineTask(pipeline)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,62 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.assemblyai import AssemblyAISTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url, None, "Transcription bot", DailyParams(audio_in_enabled=True)
)
stt = AssemblyAISTTService(
api_key=os.getenv("ASSEMBLYAI_API_KEY"),
)
tl = TranscriptionLogger()
pipeline = Pipeline([transport.input(), stt, tl])
task = PipelineTask(pipeline)
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -9,19 +9,13 @@ import aiohttp
import os
import sys
from pipecat.frames.frames import TextFrame
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from pipecat.processors.logger import FrameLogger
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam
@@ -30,22 +24,30 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def start_fetch_weather(llm):
await llm.push_frame(TextFrame("Let me think."))
async def start_fetch_weather(function_name, llm, context):
# note: we can't push a frame to the LLM here. the bot
# can interrupt itself and/or cause audio overlapping glitches.
# possible question for Aleix and Chad about what the right way
# to trigger speech is, now, with the new queues/async/sync refactors.
# await llm.push_frame(TextFrame("Let me check on that."))
logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}")
async def fetch_weather_from_api(llm, args):
return {"conditions": "nice", "temperature": "75"}
async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
await result_callback({"conditions": "nice", "temperature": "75"})
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -54,26 +56,19 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm.register_function(
"get_current_weather",
fetch_weather_from_api,
start_callback=start_fetch_weather)
fl_in = FrameLogger("Inner")
fl_out = FrameLogger("Outer")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
# Register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather)
tools = [
ChatCompletionToolParam(
@@ -90,17 +85,15 @@ async def main(room_url: str, token):
},
"format": {
"type": "string",
"enum": [
"celsius",
"fahrenheit"],
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": [
"location",
"format"],
"required": ["location", "format"],
},
})]
},
)
]
messages = [
{
"role": "system",
@@ -109,26 +102,34 @@ async def main(room_url: str, token):
]
context = OpenAILLMContext(messages, tools)
tma_in = LLMUserContextAggregator(context)
tma_out = LLMAssistantContextAggregator(context)
pipeline = Pipeline([
fl_in,
transport.input(),
tma_in,
llm,
fl_out,
tts,
transport.output(),
tma_out
])
context_aggregator = llm.create_context_aggregator(context)
task = PipelineTask(pipeline)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
@ transport.event_handler("on_first_participant_joined")
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await tts.say("Hi! Ask me about the weather in San Francisco.")
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
@@ -136,5 +137,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -0,0 +1,118 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def get_weather(function_name, tool_call_id, arguments, llm, context, result_callback):
location = arguments["location"]
await result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = AnthropicLLMService(
api_key=os.getenv("ANTHROPIC_API_KEY"), model="claude-3-5-sonnet-20240620"
)
llm.register_function("get_weather", get_weather)
tools = [
{
"name": "get_weather",
"description": "Get the current weather in a given location",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
}
},
"required": ["location"],
},
}
]
# todo: test with very short initial user message
# messages = [{"role": "system",
# "content": "You are a helpful assistant who can report the weather in any location in the universe. Respond concisely. Your response will be turned into speech so use only simple words and punctuation."},
# {"role": "user",
# "content": " Start the conversation by introducing yourself."}]
messages = [{"role": "user", "content": "Say 'hello' to start the conversation."}]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User spoken responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses and tool context
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,174 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.anthropic import AnthropicLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
video_participant_id = None
async def get_weather(function_name, tool_call_id, arguments, llm, context, result_callback):
location = arguments["location"]
await result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
async def get_image(function_name, tool_call_id, arguments, llm, context, result_callback):
question = arguments["question"]
await llm.request_image_frame(user_id=video_participant_id, text_content=question)
async def main():
global llm
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = AnthropicLLMService(
api_key=os.getenv("ANTHROPIC_API_KEY"),
# model="claude-3-5-sonnet-20240620",
model="claude-3-5-sonnet-latest",
enable_prompt_caching_beta=True,
)
llm.register_function("get_weather", get_weather)
llm.register_function("get_image", get_image)
tools = [
{
"name": "get_weather",
"description": "Get the current weather in a given location",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
}
},
"required": ["location"],
},
},
{
"name": "get_image",
"description": "Get an image from the video stream.",
"input_schema": {
"type": "object",
"properties": {
"question": {
"type": "string",
"description": "The question that the user is asking about the image.",
}
},
"required": ["question"],
},
},
]
# todo: test with very short initial user message
system_prompt = """\
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
Your response will be turned into speech so use only simple words and punctuation.
You have access to two tools: get_weather and get_image.
You can respond to questions about the weather using the get_weather tool.
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
indicate you should use the get_image tool are:
- What do you see?
- What's in the video?
- Can you describe the video?
- Tell me about what you see.
- Tell me something interesting about what you see.
- What's happening in the video?
If you need to use a tool, simply use the tool. Do not tell the user the tool you are using. Be brief and concise.
"""
messages = [
{
"role": "system",
"content": [
{
"type": "text",
"text": system_prompt,
}
],
},
{"role": "user", "content": "Start the conversation by introducing yourself."},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User speech to text
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses and tool context
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
global video_participant_id
video_participant_id = participant["id"]
await transport.capture_participant_transcription(video_participant_id)
await transport.capture_participant_video(video_participant_id, framerate=0)
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,136 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext
from pipecat.services.together import TogetherLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from openai.types.chat import ChatCompletionToolParam
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def start_fetch_weather(function_name, llm, context):
# note: we can't push a frame to the LLM here. the bot
# can interrupt itself and/or cause audio overlapping glitches.
# possible question for Aleix and Chad about what the right way
# to trigger speech is, now, with the new queues/async/sync refactors.
# await llm.push_frame(TextFrame("Let me check on that."))
logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}")
async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
await result_callback({"conditions": "nice", "temperature": "75"})
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = TogetherLLMService(
api_key=os.getenv("TOGETHER_API_KEY"),
model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
)
# Register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather)
tools = [
ChatCompletionToolParam(
type="function",
function={
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
)
]
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
# await tts.say("Hi! Ask me about the weather in San Francisco.")
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,167 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from openai.types.chat import ChatCompletionToolParam
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
video_participant_id = None
async def get_weather(function_name, tool_call_id, arguments, llm, context, result_callback):
location = arguments["location"]
await result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
async def get_image(function_name, tool_call_id, arguments, llm, context, result_callback):
logger.debug(f"!!! IN get_image {video_participant_id}, {arguments}")
question = arguments["question"]
await llm.request_image_frame(user_id=video_participant_id, text_content=question)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
llm.register_function("get_weather", get_weather)
llm.register_function("get_image", get_image)
tools = [
ChatCompletionToolParam(
type="function",
function={
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
),
ChatCompletionToolParam(
type="function",
function={
"name": "get_image",
"description": "Get an image from the video stream.",
"parameters": {
"type": "object",
"properties": {
"question": {
"type": "string",
"description": "The question to ask the AI to generate an image of",
},
},
"required": ["question"],
},
},
),
]
system_prompt = """\
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
Your response will be turned into speech so use only simple words and punctuation.
You have access to two tools: get_weather and get_image.
You can respond to questions about the weather using the get_weather tool.
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
indicate you should use the get_image tool are:
- What do you see?
- What's in the video?
- Can you describe the video?
- Tell me about what you see.
- Tell me something interesting about what you see.
- What's happening in the video?
"""
messages = [
{"role": "system", "content": system_prompt},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
global video_participant_id
video_participant_id = participant["id"]
await transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(video_participant_id, framerate=0)
# Kick off the conversation.
await tts.say("Hi! Ask me about the weather in San Francisco.")
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,173 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.services.openai import OpenAILLMContext
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
video_participant_id = None
async def get_weather(function_name, tool_call_id, arguments, llm, context, result_callback):
location = arguments["location"]
await result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
async def get_image(function_name, tool_call_id, arguments, llm, context, result_callback):
logger.debug(f"!!! IN get_image {video_participant_id}, {arguments}")
question = arguments["question"]
await llm.request_image_frame(user_id=video_participant_id, text_content=question)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = GoogleLLMService(model="gemini-1.5-flash-latest", api_key=os.getenv("GOOGLE_API_KEY"))
llm.register_function("get_weather", get_weather)
llm.register_function("get_image", get_image)
tools = [
{
"function_declarations": [
{
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
{
"name": "get_image",
"description": "Get and image from the camera or video stream.",
"parameters": {
"type": "object",
"properties": {
"question": {
"type": "string",
"description": "The question to to use when running inference on the acquired image.",
},
},
"required": ["question"],
},
},
]
}
]
system_prompt = """\
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
Your response will be turned into speech so use only simple words and punctuation.
You have access to two tools: get_weather and get_image.
You can respond to questions about the weather using the get_weather tool.
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
indicate you should use the get_image tool are:
- What do you see?
- What's in the video?
- Can you describe the video?
- Tell me about what you see.
- Tell me something interesting about what you see.
- What's happening in the video?
"""
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": "Say hello."},
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
global video_participant_id
video_participant_id = participant["id"]
await transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_video(video_participant_id, framerate=0)
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -4,26 +4,22 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import aiohttp
import asyncio
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantContextAggregator,
LLMUserContextAggregator
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam
@@ -32,6 +28,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -40,10 +37,14 @@ logger.add(sys.stderr, level="DEBUG")
current_voice = "News Lady"
async def switch_voice(llm, args):
async def switch_voice(function_name, tool_call_id, args, llm, context, result_callback):
global current_voice
current_voice = args["voice"]
return {"voice": f"You are now using your {current_voice} voice. Your responses should now be as if you were a {current_voice}."}
await result_callback(
{
"voice": f"You are now using your {current_voice} voice. Your responses should now be as if you were a {current_voice}."
}
)
async def news_lady_filter(frame) -> bool:
@@ -58,8 +59,10 @@ async def barbershop_man_filter(frame) -> bool:
return current_voice == "Barbershop Man"
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -68,8 +71,8 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
news_lady = CartesiaTTSService(
@@ -87,9 +90,7 @@ async def main(room_url: str, token):
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
llm.register_function("switch_voice", switch_voice)
tools = [
@@ -108,7 +109,9 @@ async def main(room_url: str, token):
},
"required": ["voice"],
},
})]
},
)
]
messages = [
{
"role": "system",
@@ -117,32 +120,35 @@ async def main(room_url: str, token):
]
context = OpenAILLMContext(messages, tools)
tma_in = LLMUserContextAggregator(context)
tma_out = LLMAssistantContextAggregator(context)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
ParallelPipeline( # TTS (one of the following vocies)
[FunctionFilter(news_lady_filter), news_lady], # News Lady voice
[FunctionFilter(british_lady_filter), british_lady], # British Lady voice
[FunctionFilter(barbershop_man_filter), barbershop_man], # Barbershop Man voice
),
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
ParallelPipeline( # TTS (one of the following vocies)
[FunctionFilter(news_lady_filter), news_lady], # News Lady voice
[FunctionFilter(british_lady_filter), british_lady], # British Lady voice
[FunctionFilter(barbershop_man_filter), barbershop_man], # Barbershop Man voice
),
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{
"role": "system",
"content": f"Please introduce yourself to the user and let them know the voices you can do. Your initial responses should be as if you were a {current_voice}."})
"content": f"Please introduce yourself to the user and let them know the voices you can do. Your initial responses should be as if you were a {current_voice}.",
}
)
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -151,5 +157,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -9,22 +9,17 @@ import aiohttp
import os
import sys
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame, TTSUpdateSettingsFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantContextAggregator,
LLMUserContextAggregator
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.filters.function_filter import FunctionFilter
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.whisper import Model, WhisperSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer
from openai.types.chat import ChatCompletionToolParam
@@ -33,6 +28,7 @@ from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -41,10 +37,10 @@ logger.add(sys.stderr, level="DEBUG")
current_language = "English"
async def switch_language(llm, args):
async def switch_language(function_name, tool_call_id, args, llm, context, result_callback):
global current_language
current_language = args["language"]
return {"voice": f"Your answers from now on should be in {current_language}."}
await result_callback({"voice": f"Your answers from now on should be in {current_language}."})
async def english_filter(frame) -> bool:
@@ -55,39 +51,34 @@ async def spanish_filter(frame) -> bool:
return current_language == "Spanish"
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
"Pipecat",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True
)
vad_audio_passthrough=True,
),
)
stt = WhisperSTTService(model=Model.LARGE)
english_tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id="pNInz6obpgDQGcFmaJgB",
english_tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
spanish_tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
model="eleven_multilingual_v2",
voice_id="9F4C8ztpNUmXkdDDbz3J",
spanish_tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="846d6cb0-2301-48b6-9683-48f5618ea2f6", # Spanish-speaking Lady
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o")
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
llm.register_function("switch_language", switch_language)
tools = [
@@ -106,7 +97,9 @@ async def main(room_url: str, token):
},
"required": ["language"],
},
})]
},
)
]
messages = [
{
"role": "system",
@@ -115,32 +108,34 @@ async def main(room_url: str, token):
]
context = OpenAILLMContext(messages, tools)
tma_in = LLMUserContextAggregator(context)
tma_out = LLMAssistantContextAggregator(context)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
stt, # STT
tma_in, # User responses
llm, # LLM
ParallelPipeline( # TTS (bot will speak the chosen language)
[FunctionFilter(english_filter), english_tts], # English
[FunctionFilter(spanish_filter), spanish_tts], # Spanish
),
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
ParallelPipeline( # TTS (bot will speak the chosen language)
[FunctionFilter(english_filter), english_tts], # English
[FunctionFilter(spanish_filter), spanish_tts], # Spanish
),
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append(
{
"role": "system",
"content": f"Please introduce yourself to the user and let them know the languages you speak. Your initial responses should be in {current_language}."})
"content": f"Please introduce yourself to the user and let them know the languages you speak. Your initial responses should be in {current_language}.",
}
)
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
@@ -149,5 +144,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

View File

@@ -5,35 +5,38 @@
#
import asyncio
import aiohttp
import os
import sys
import json
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import (
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.deepgram import DeepgramTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
from pipecat.vad.silero import SileroVADAnalyzer
from pipecat.transports.services.daily import (
DailyParams,
DailyTransport,
DailyTransportMessageFrame,
)
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main(room_url: str, token):
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
transport = DailyTransport(
room_url,
token,
@@ -42,15 +45,15 @@ async def main(room_url: str, token):
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = DeepgramTTSService(
aiohttp_session=session,
api_key=os.getenv("DEEPGRAM_API_KEY"),
voice="aura-asteria-en",
base_url="http://0.0.0.0:8080/v1/speak"
base_url="http://0.0.0.0:8080/v1/speak",
)
llm = OpenAILLMService(
@@ -59,7 +62,7 @@ async def main(room_url: str, token):
# model="gpt-4o"
# Or, to use a local vLLM (or similar) api server
model="meta-llama/Meta-Llama-3-8B-Instruct",
base_url="http://0.0.0.0:8000/v1"
base_url="http://0.0.0.0:8000/v1",
)
messages = [
@@ -69,17 +72,19 @@ async def main(room_url: str, token):
},
]
tma_in = LLMUserResponseAggregator(messages)
tma_out = LLMAssistantResponseAggregator(messages)
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline([
transport.input(), # Transport user input
tma_in, # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
tma_out # Assistant spoken responses
])
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(),
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(),
]
)
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
@@ -87,13 +92,12 @@ async def main(room_url: str, token):
# bot can "hear" and respond to them.
@transport.event_handler("on_participant_joined")
async def on_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
await transport.capture_participant_transcription(participant["id"])
# When the first participant joins, the bot should introduce itself.
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
# Handle "latency-ping" messages. The client will send app messages that look like
@@ -110,14 +114,18 @@ async def main(room_url: str, token):
logger.debug(f"Received latency ping app message: {message}")
ts = message["latency-ping"]["ts"]
# Send immediately
transport.output().send_message(DailyTransportMessageFrame(
message={"latency-pong-msg-handler": {"ts": ts}},
participant_id=sender))
transport.output().send_message(
DailyTransportMessageFrame(
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
)
)
# And push to the pipeline for the Daily transport.output to send
await tma_in.push_frame(
await task.queue_frame(
DailyTransportMessageFrame(
message={"latency-pong-pipeline-delivery": {"ts": ts}},
participant_id=sender))
participant_id=sender,
)
)
except Exception as e:
logger.debug(f"message handling error: {e} - {message}")
@@ -126,5 +134,4 @@ async def main(room_url: str, token):
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
asyncio.run(main())

Some files were not shown because too many files have changed in this diff Show More