Compare commits
333 Commits
v0.0.17
...
khk/vad-ga
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e4388ad9fc | ||
|
|
153c10781d | ||
|
|
c7a188cdf8 | ||
|
|
ca30184237 | ||
|
|
ae466b07f2 | ||
|
|
a5adbb5124 | ||
|
|
5bbbc1f849 | ||
|
|
949e90bc63 | ||
|
|
99f8693db9 | ||
|
|
675c041e7b | ||
|
|
b9617a3fde | ||
|
|
8dff460307 | ||
|
|
cce1ddb183 | ||
|
|
9f2741e21c | ||
|
|
a56def9585 | ||
|
|
a4c02b412f | ||
|
|
3cf49e5306 | ||
|
|
b53f8886f1 | ||
|
|
ece76d36a3 | ||
|
|
3d43683b3d | ||
|
|
611790bf05 | ||
|
|
8691d14289 | ||
|
|
dd402da9e5 | ||
|
|
2fd04248f1 | ||
|
|
0ac42006f8 | ||
|
|
66e331248d | ||
|
|
4be3e8c87d | ||
|
|
dac033fe61 | ||
|
|
d302cbb114 | ||
|
|
e3b407db28 | ||
|
|
4ef623f09e | ||
|
|
253530a63d | ||
|
|
4f38d989f5 | ||
|
|
84074e90ee | ||
|
|
38aee7d8f2 | ||
|
|
64198313c6 | ||
|
|
d61b6c301c | ||
|
|
83d1931266 | ||
|
|
c31f2ab285 | ||
|
|
0ddc5721b4 | ||
|
|
98bd183bc4 | ||
|
|
aaa154524c | ||
|
|
beced68337 | ||
|
|
94823ab952 | ||
|
|
0b6a19802f | ||
|
|
c4a2d2197c | ||
|
|
269d06aa15 | ||
|
|
dfef1f2c54 | ||
|
|
b62beaba0b | ||
|
|
adf414e40f | ||
|
|
dc64e57f63 | ||
|
|
d3e410b2ac | ||
|
|
c544b2474b | ||
|
|
18243de358 | ||
|
|
6625895d1f | ||
|
|
f9ecce739e | ||
|
|
0075dd8386 | ||
|
|
eef1cde816 | ||
|
|
8d867c30c6 | ||
|
|
42c668b7ae | ||
|
|
b62227b4ae | ||
|
|
25ef0cb87b | ||
|
|
e195941aa5 | ||
|
|
e09eef1dd7 | ||
|
|
7c13663a4e | ||
|
|
5753869e5e | ||
|
|
ba878a19f4 | ||
|
|
55a9de78cd | ||
|
|
ff51fc9091 | ||
|
|
a4f857ee34 | ||
|
|
3250d74bef | ||
|
|
c086160239 | ||
|
|
6cdccaff53 | ||
|
|
a9ab8de25d | ||
|
|
2a29cb18a5 | ||
|
|
4193a4f415 | ||
|
|
0226ec450a | ||
|
|
020b8ebb35 | ||
|
|
1170b30c1b | ||
|
|
0004d4a906 | ||
|
|
cb27e86266 | ||
|
|
77a3b2ea5c | ||
|
|
099e65f3b6 | ||
|
|
befb8db120 | ||
|
|
9992d826b1 | ||
|
|
18604e1a39 | ||
|
|
312c569182 | ||
|
|
b43e0ed130 | ||
|
|
289debea34 | ||
|
|
ccd6af7016 | ||
|
|
effc69e4e4 | ||
|
|
c7a0d0db64 | ||
|
|
50d69a1ca4 | ||
|
|
8a6b8fe70a | ||
|
|
c4e53aea71 | ||
|
|
ad5125e93f | ||
|
|
8d92cbac93 | ||
|
|
0225443ec8 | ||
|
|
71e1d0a334 | ||
|
|
83f69e02fd | ||
|
|
e1b2da1ff0 | ||
|
|
5eb1b90a4b | ||
|
|
9c4ee74b91 | ||
|
|
f65f566829 | ||
|
|
c8ad3123b7 | ||
|
|
8cefce28cf | ||
|
|
a834d26885 | ||
|
|
810e3cd551 | ||
|
|
f258fa96cd | ||
|
|
757ec61f14 | ||
|
|
2c933f43d8 | ||
|
|
cc5bfa8af8 | ||
|
|
de9f3e55f1 | ||
|
|
ed0c986218 | ||
|
|
72c27215b6 | ||
|
|
c23b14f768 | ||
|
|
81282f9c4d | ||
|
|
2b324f6f81 | ||
|
|
049f110344 | ||
|
|
448a0307a8 | ||
|
|
7390e42f5c | ||
|
|
ee880d229f | ||
|
|
9cd07d81f8 | ||
|
|
b453d089c3 | ||
|
|
7410fe1d1e | ||
|
|
6323a77431 | ||
|
|
0aedaa8553 | ||
|
|
6554479d39 | ||
|
|
ce2ebd3198 | ||
|
|
13ea1efc96 | ||
|
|
ef380321cf | ||
|
|
294b037730 | ||
|
|
7603996612 | ||
|
|
3048d2b0b1 | ||
|
|
0bb47a09d2 | ||
|
|
1afe6901d9 | ||
|
|
3e019fb512 | ||
|
|
e069aa9608 | ||
|
|
0b32e42d25 | ||
|
|
8d18be5069 | ||
|
|
e715d99d0c | ||
|
|
dc28590247 | ||
|
|
139f158ea1 | ||
|
|
4b2a18837f | ||
|
|
b4340d0185 | ||
|
|
90d11398e6 | ||
|
|
bf8c73b25b | ||
|
|
21cd21de1b | ||
|
|
c25f6e56e7 | ||
|
|
a1f1d1995c | ||
|
|
390582d7f3 | ||
|
|
e765a29ca2 | ||
|
|
cf5c244487 | ||
|
|
a5eb30a93d | ||
|
|
ac7bc35944 | ||
|
|
ddfd721f6e | ||
|
|
aee3916cd1 | ||
|
|
3eff1e559b | ||
|
|
1a542c91fa | ||
|
|
cd60a84f8a | ||
|
|
3dd4bac6e6 | ||
|
|
06ff9cfede | ||
|
|
2d1ed9a304 | ||
|
|
50b51c05f6 | ||
|
|
5ce4b8dd5b | ||
|
|
2f4467b5a5 | ||
|
|
e91ab54a69 | ||
|
|
6a33432c82 | ||
|
|
135654a080 | ||
|
|
7b708a2bee | ||
|
|
b515c28417 | ||
|
|
854ffb0323 | ||
|
|
891b7b22ea | ||
|
|
c8d37a7227 | ||
|
|
489060881d | ||
|
|
d56a4cce1b | ||
|
|
7eb9dfde38 | ||
|
|
571e10f83e | ||
|
|
af202d4fe5 | ||
|
|
4057fbbcfd | ||
|
|
5cdb8a79a1 | ||
|
|
a674b43243 | ||
|
|
ac41f13b7c | ||
|
|
003b9887b1 | ||
|
|
ba45c2ab5b | ||
|
|
9d36a48a80 | ||
|
|
20a525635e | ||
|
|
659eceea95 | ||
|
|
d462c03d00 | ||
|
|
6591e07eb4 | ||
|
|
fe71825954 | ||
|
|
43516f84fe | ||
|
|
0849edb00b | ||
|
|
dd3b4083eb | ||
|
|
89673a4040 | ||
|
|
410dbd3dfc | ||
|
|
7085b1ea3f | ||
|
|
8683cae719 | ||
|
|
0197efa524 | ||
|
|
16e76caa33 | ||
|
|
1f5240694d | ||
|
|
f087151db7 | ||
|
|
0b691ff597 | ||
|
|
ae049961b7 | ||
|
|
0d6eee705f | ||
|
|
58d20ec9dc | ||
|
|
38befe1dc1 | ||
|
|
2f335100a5 | ||
|
|
3fef818843 | ||
|
|
428c8af77e | ||
|
|
54fccd2e25 | ||
|
|
66c6a5dc0f | ||
|
|
92561ae19d | ||
|
|
b85e93410b | ||
|
|
593993ba97 | ||
|
|
7b8b606278 | ||
|
|
7116ad0607 | ||
|
|
c507044277 | ||
|
|
5f45a9d90f | ||
|
|
e31e87aabd | ||
|
|
2957416d90 | ||
|
|
b9b761b67a | ||
|
|
a7539e9317 | ||
|
|
75575c0c68 | ||
|
|
77b3e08214 | ||
|
|
956b783c1a | ||
|
|
e90c080470 | ||
|
|
37aabaa03a | ||
|
|
3e289a7bef | ||
|
|
6dd5e3fdf5 | ||
|
|
e60df3c7c0 | ||
|
|
42f772beed | ||
|
|
3655c4a0fc | ||
|
|
012dbffd94 | ||
|
|
4b39efeee3 | ||
|
|
19caf750fd | ||
|
|
296611714f | ||
|
|
4c3d19cc8b | ||
|
|
a3ba07c7a3 | ||
|
|
a1579808b2 | ||
|
|
aecb9f5816 | ||
|
|
a5d42a526c | ||
|
|
a9472f8116 | ||
|
|
b19243ab75 | ||
|
|
2bf094b950 | ||
|
|
d5f106ae19 | ||
|
|
920745345a | ||
|
|
143033d7db | ||
|
|
335990c145 | ||
|
|
6d24e836b0 | ||
|
|
278a2fed56 | ||
|
|
c444004eec | ||
|
|
72cf7896d7 | ||
|
|
31af5f8177 | ||
|
|
6a68d9a57e | ||
|
|
39f41ab25e | ||
|
|
624cc1e987 | ||
|
|
08a15e5cdd | ||
|
|
4cd4787e4d | ||
|
|
65afee2808 | ||
|
|
00ece864ec | ||
|
|
6d6d9bea5a | ||
|
|
7c213f8533 | ||
|
|
3685c19b2d | ||
|
|
650a2b4da4 | ||
|
|
afea6f38f6 | ||
|
|
c45d428551 | ||
|
|
4e594aa9b0 | ||
|
|
32f91c5f31 | ||
|
|
a32ece897a | ||
|
|
88f6436aaa | ||
|
|
fac43cea06 | ||
|
|
a9e6aeed54 | ||
|
|
fa9f49f5bb | ||
|
|
2a6183aba5 | ||
|
|
b1a622971b | ||
|
|
5b72faccb4 | ||
|
|
c8732544c7 | ||
|
|
d4219b16b8 | ||
|
|
0c33432f64 | ||
|
|
95bd58cced | ||
|
|
8d7d1a7e24 | ||
|
|
3768cb2f2c | ||
|
|
d4b2741608 | ||
|
|
aef2152dcc | ||
|
|
d0b0221b97 | ||
|
|
b4758cd989 | ||
|
|
681250f114 | ||
|
|
fd13d3c50e | ||
|
|
674b8bb0cd | ||
|
|
5d9a962146 | ||
|
|
e130aada72 | ||
|
|
76709a9a39 | ||
|
|
acd2d55b84 | ||
|
|
fcec0eb812 | ||
|
|
e9965347b5 | ||
|
|
5a83f75e0d | ||
|
|
91c706a201 | ||
|
|
34384881bc | ||
|
|
71ba28753e | ||
|
|
32d2f0db66 | ||
|
|
e1169a4e82 | ||
|
|
0e5711e62d | ||
|
|
0ddfa3de5b | ||
|
|
661aa79b7c | ||
|
|
2c32cc2f27 | ||
|
|
d7bb0bc5cb | ||
|
|
d5644c3ab9 | ||
|
|
09ab8e3efd | ||
|
|
2f683529ec | ||
|
|
6ac012a82b | ||
|
|
075194cb54 | ||
|
|
269f070051 | ||
|
|
3342c9d7c2 | ||
|
|
b468b2f926 | ||
|
|
af1c7d0023 | ||
|
|
34670eef79 | ||
|
|
979739c1b7 | ||
|
|
83ed6870b9 | ||
|
|
57a568986a | ||
|
|
e828e26b5b | ||
|
|
825738440e | ||
|
|
147bd1a075 | ||
|
|
209e97f372 | ||
|
|
47f8627432 | ||
|
|
cc6713837a | ||
|
|
728fe0ad88 | ||
|
|
dbba45349f | ||
|
|
40ccf46b4b | ||
|
|
077bb9f20a | ||
|
|
e4c990c677 | ||
|
|
1c8b9d813a | ||
|
|
83812f2671 |
2
.github/workflows/publish_test.yaml
vendored
2
.github/workflows/publish_test.yaml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
|
||||
368
CHANGELOG.md
368
CHANGELOG.md
@@ -5,6 +5,374 @@ All notable changes to **pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.0.35] - 2024-06-28
|
||||
|
||||
### Changed
|
||||
|
||||
- `FastAPIWebsocketParams` now require a serializer.
|
||||
|
||||
- `TwilioFrameSerializer` now requires a `streamSid`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Silero VAD number of frames needs to be 512 for 16000 sample rate or 256 for
|
||||
8000 sample rate.
|
||||
|
||||
## [0.0.34] - 2024-06-25
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
|
||||
interruptions to ignore transcriptions.
|
||||
|
||||
- Fixed an issue introduced in 0.0.33 that would cause the LLM to generate
|
||||
shorter output.
|
||||
|
||||
## [0.0.33] - 2024-06-25
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
|
||||
expects a voice ID instead of a voice name (you can get the voice ID from
|
||||
Cartesia's playground). You can also specify the audio `sample_rate` and
|
||||
`encoding` instead of the previous `output_format`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
|
||||
cause static audio issues and interruptions to not work properly when dealing
|
||||
with multiple LLMs sentences.
|
||||
|
||||
- Fixed an issue that could mix new LLM responses with previous ones when
|
||||
handling interruptions.
|
||||
|
||||
- Fixed a Daily transport blocking situation that occurred while reading audio
|
||||
frames after a participant left the room. Needs daily-python >= 0.10.1.
|
||||
|
||||
## [0.0.32] - 2024-06-22
|
||||
|
||||
### Added
|
||||
|
||||
- Allow specifying a `DeepgramSTTService` url which allows using on-prem
|
||||
Deepgram.
|
||||
|
||||
- Added new `FastAPIWebsocketTransport`. This is a new websocket transport that
|
||||
can be integrated with FastAPI websockets.
|
||||
|
||||
- Added new `TwilioFrameSerializer`. This is a new serializer that knows how to
|
||||
serialize and deserialize audio frames from Twilio.
|
||||
|
||||
- Added Daily transport event: `on_dialout_answered`. See
|
||||
https://reference-python.daily.co/api_reference.html#daily.EventHandler
|
||||
|
||||
- Added new `AzureSTTService`. This allows you to use Azure Speech-To-Text.
|
||||
|
||||
### Performance
|
||||
|
||||
- Convert `BaseOutputTransport` and `BaseOutputTransport` to fully use asyncio
|
||||
and remove the use of threads.
|
||||
|
||||
### Other
|
||||
|
||||
- Added `twilio-chatbot`. This is an example that shows how to integrate Twilio
|
||||
phone numbers with a Pipecat bot.
|
||||
|
||||
- Updated `07f-interruptible-azure.py` to use `AzureLLMService`,
|
||||
`AzureSTTService` and `AzureTTSService`.
|
||||
|
||||
## [0.0.31] - 2024-06-13
|
||||
|
||||
### Performance
|
||||
|
||||
- Break long audio frames into 20ms chunks instead of 10ms.
|
||||
|
||||
## [0.0.30] - 2024-06-13
|
||||
|
||||
### Added
|
||||
|
||||
- Added `report_only_initial_ttfb` to `PipelineParams`. This will make it so
|
||||
only the initial TTFB metrics after the user stops talking are reported.
|
||||
|
||||
- Added `OpenPipeLLMService`. This service will let you run OpenAI through
|
||||
OpenPipe's SDK.
|
||||
|
||||
- Allow specifying frame processors' name through a new `name` constructor
|
||||
argument.
|
||||
|
||||
- Added `DeepgramSTTService`. This service has an ongoing websocket
|
||||
connection. To handle this, it subclasses `AIService` instead of
|
||||
`STTService`. The output of this service will be pushed from the same task,
|
||||
except system frames like `StartFrame`, `CancelFrame` or
|
||||
`StartInterruptionFrame`.
|
||||
|
||||
### Changed
|
||||
|
||||
- `FrameSerializer.deserialize()` can now return `None` in case it is not
|
||||
possible to desearialize the given data.
|
||||
|
||||
- `daily_rest.DailyRoomProperties` now allows extra unknown parameters.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `DailyRoomProperties.exp` always had the same old
|
||||
timestamp unless set by the user.
|
||||
|
||||
- Fixed a couple of issues with `WebsocketServerTransport`. It needed to use
|
||||
`push_audio_frame()` and also VAD was not working properly.
|
||||
|
||||
- Fixed an issue that would cause LLM aggregator to fail with small
|
||||
`VADParams.stop_secs` values.
|
||||
|
||||
- Fixed an issue where `BaseOutputTransport` would send longer audio frames
|
||||
preventing interruptions.
|
||||
|
||||
### Other
|
||||
|
||||
- Added new `07h-interruptible-openpipe.py` example. This example shows how to
|
||||
use OpenPipe to run OpenAI LLMs and get the logs stored in OpenPipe.
|
||||
|
||||
- Added new `dialin-chatbot` example. This examples shows how to call the bot
|
||||
using a phone number.
|
||||
|
||||
## [0.0.29] - 2024-06-07
|
||||
|
||||
### Added
|
||||
|
||||
- Added a new `FunctionFilter`. This filter will let you filter frames based on
|
||||
a given function, except system messages which should never be filtered.
|
||||
|
||||
- Added `FrameProcessor.can_generate_metrics()` method to indicate if a
|
||||
processor can generate metrics. In the future this might get an extra argument
|
||||
to ask for a specific type of metric.
|
||||
|
||||
- Added `BasePipeline`. All pipeline classes should be based on this class. All
|
||||
subclasses should implement a `processors_with_metrics()` method that returns
|
||||
a list of all `FrameProcessor`s in the pipeline that can generate metrics.
|
||||
|
||||
- Added `enable_metrics` to `PipelineParams`.
|
||||
|
||||
- Added `MetricsFrame`. The `MetricsFrame` will report different metrics in the
|
||||
system. Right now, it can report TTFB (Time To First Byte) values for
|
||||
different services, that is the time spent between the arrival of a `Frame` to
|
||||
the processor/service until the first `DataFrame` is pushed downstream. If
|
||||
metrics are enabled an intial `MetricsFrame` with all the services in the
|
||||
pipeline will be sent.
|
||||
|
||||
- Added TTFB metrics and debug logging for TTS services.
|
||||
|
||||
### Changed
|
||||
|
||||
- Moved `ParallelTask` to `pipecat.pipeline.parallel_task`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed PlayHT TTS service to work properly async.
|
||||
|
||||
## [0.0.28] - 2024-06-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with `SileroVADAnalyzer` that would cause memory to keep
|
||||
growing indefinitely.
|
||||
|
||||
## [0.0.27] - 2024-06-05
|
||||
|
||||
### Added
|
||||
|
||||
- Added `DailyTransport.participants()` and `DailyTransport.participant_counts()`.
|
||||
|
||||
## [0.0.26] - 2024-06-05
|
||||
|
||||
### Added
|
||||
|
||||
- Added `OpenAITTSService`.
|
||||
|
||||
- Allow passing `output_format` and `model_id` to `CartesiaTTSService` to change
|
||||
audio sample format and the model to use.
|
||||
|
||||
- Added `DailyRESTHelper` which helps you create Daily rooms and tokens in an
|
||||
easy way.
|
||||
|
||||
- `PipelineTask` now has a `has_finished()` method to indicate if the task has
|
||||
completed. If a task is never ran `has_finished()` will return False.
|
||||
|
||||
- `PipelineRunner` now supports SIGTERM. If received, the runner will be
|
||||
canceled.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `BaseInputTransport` and `BaseOutputTransport` where
|
||||
stopping push tasks before pushing `EndFrame` frames could cause the bots to
|
||||
get stuck.
|
||||
|
||||
- Fixed an error closing local audio transports.
|
||||
|
||||
- Fixed an issue with Deepgram TTS that was introduced in the previous release.
|
||||
|
||||
- Fixed `AnthropicLLMService` interruptions. If an interruption occurred, a
|
||||
`user` message could be appended after the previous `user` message. Anthropic
|
||||
does not allow that because it requires alternate `user` and `assistant`
|
||||
messages.
|
||||
|
||||
### Performance
|
||||
|
||||
- The `BaseInputTransport` does not pull audio frames from sub-classes any
|
||||
more. Instead, sub-classes now push audio frames into a queue in the base
|
||||
class. Also, `DailyInputTransport` now pushes audio frames every 20ms instead
|
||||
of 10ms.
|
||||
|
||||
- Remove redundant camera input thread from `DailyInputTransport`. This should
|
||||
improve performance a little bit when processing participant videos.
|
||||
|
||||
- Load Cartesia voice on startup.
|
||||
|
||||
## [0.0.25] - 2024-05-31
|
||||
|
||||
### Added
|
||||
|
||||
- Added WebsocketServerTransport. This will create a websocket server and will
|
||||
read messages coming from a client. The messages are serialized/deserialized
|
||||
with protobufs. See `examples/websocket-server` for a detailed example.
|
||||
|
||||
- Added function calling (LLMService.register_function()). This will allow the
|
||||
LLM to call functions you have registered when needed. For example, if you
|
||||
register a function to get the weather in Los Angeles and ask the LLM about
|
||||
the weather in Los Angeles, the LLM will call your function.
|
||||
See https://platform.openai.com/docs/guides/function-calling
|
||||
|
||||
- Added new `LangchainProcessor`.
|
||||
|
||||
- Added Cartesia TTS support (https://cartesia.ai/)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed SileroVAD frame processor.
|
||||
|
||||
- Fixed an issue where `camera_out_enabled` would cause the highg CPU usage if
|
||||
no image was provided.
|
||||
|
||||
### Performance
|
||||
|
||||
- Removed unnecessary audio input tasks.
|
||||
|
||||
## [0.0.24] - 2024-05-29
|
||||
|
||||
### Added
|
||||
|
||||
- Exposed `on_dialin_ready` for Daily transport SIP endpoint handling. This
|
||||
notifies when the Daily room SIP endpoints are ready. This allows integrating
|
||||
with third-party services like Twilio.
|
||||
|
||||
- Exposed Daily transport `on_app_message` event.
|
||||
|
||||
- Added Daily transport `on_call_state_updated` event.
|
||||
|
||||
- Added Daily transport `start_recording()`, `stop_recording` and
|
||||
`stop_dialout`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Added `PipelineParams`. This replaces the `allow_interruptions` argument in
|
||||
`PipelineTask` and will allow future parameters in the future.
|
||||
|
||||
- Fixed Deepgram Aura TTS base_url and added ErrorFrame reporting.
|
||||
|
||||
- GoogleLLMService `api_key` argument is now mandatory.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Daily tranport `dialin-ready` doesn't not block anymore and it now handles
|
||||
timeouts.
|
||||
|
||||
- Fixed AzureLLMService.
|
||||
|
||||
## [0.0.23] - 2024-05-23
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue handling Daily transport `dialin-ready` event.
|
||||
|
||||
## [0.0.22] - 2024-05-23
|
||||
|
||||
### Added
|
||||
|
||||
- Added Daily transport `start_dialout()` to be able to make phone or SIP calls.
|
||||
See https://reference-python.daily.co/api_reference.html#daily.CallClient.start_dialout
|
||||
|
||||
- Added Daily transport support for dial-in use cases.
|
||||
|
||||
- Added Daily transport events: `on_dialout_connected`, `on_dialout_stopped`,
|
||||
`on_dialout_error` and `on_dialout_warning`. See
|
||||
https://reference-python.daily.co/api_reference.html#daily.EventHandler
|
||||
|
||||
## [0.0.21] - 2024-05-22
|
||||
|
||||
### Added
|
||||
|
||||
- Added vision support to Anthropic service.
|
||||
|
||||
- Added `WakeCheckFilter` which allows you to pass information downstream only
|
||||
if you say a certain phrase/word.
|
||||
|
||||
### Changed
|
||||
|
||||
- `Filter` has been renamed to `FrameFilter` and it's now under
|
||||
`processors/filters`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed Anthropic service to use new frame types.
|
||||
|
||||
- Fixed an issue in `LLMUserResponseAggregator` and `UserResponseAggregator`
|
||||
that would cause frames after a brief pause to not be pushed to the LLM.
|
||||
|
||||
- Clear the audio output buffer if we are interrupted.
|
||||
|
||||
- Re-add exponential smoothing after volume calculation. This makes sure the
|
||||
volume value being used doesn't fluctuate so much.
|
||||
|
||||
## [0.0.20] - 2024-05-22
|
||||
|
||||
### Added
|
||||
|
||||
- In order to improve interruptions we now compute a loudness level using
|
||||
[pyloudnorm](https://github.com/csteinmetz1/pyloudnorm). The audio coming
|
||||
WebRTC transports (e.g. Daily) have an Automatic Gain Control (AGC) algorithm
|
||||
applied to the signal, however we don't do that on our local PyAudio
|
||||
signals. This means that currently incoming audio from PyAudio is kind of
|
||||
broken. We will fix it in future releases.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `StartInterruptionFrame` would cause
|
||||
`LLMUserResponseAggregator` to push the accumulated text causing the LLM
|
||||
respond in the wrong task. The `StartInterruptionFrame` should not trigger any
|
||||
new LLM response because that would be spoken in a different task.
|
||||
|
||||
- Fixed an issue where tasks and threads could be paused because the executor
|
||||
didn't have more tasks available. This was causing issues when cancelling and
|
||||
recreating tasks during interruptions.
|
||||
|
||||
## [0.0.19] - 2024-05-20
|
||||
|
||||
### Changed
|
||||
|
||||
- `LLMUserResponseAggregator` and `LLMAssistantResponseAggregator` internal
|
||||
messages are now exposed through the `messages` property.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `LLMAssistantResponseAggregator` was not accumulating the
|
||||
full response but short sentences instead. If there's an interruption we only
|
||||
accumulate what the bot has spoken until now in a long response as well.
|
||||
|
||||
## [0.0.18] - 2024-05-20
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `DailyOuputTransport` where transport messages were not
|
||||
being sent.
|
||||
|
||||
## [0.0.17] - 2024-05-19
|
||||
|
||||
### Added
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 2-Clause License
|
||||
|
||||
Copyright (c) 2024, Kwindla Hultman Kramer
|
||||
Copyright (c) 2024, Daily
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
@@ -39,7 +39,7 @@ pip install "pipecat-ai[option,...]"
|
||||
|
||||
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
|
||||
|
||||
- **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `openpipe`, `playht`, `silero`, `whisper`
|
||||
- **Transports**: `local`, `websocket`, `daily`
|
||||
|
||||
## Code examples
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
autopep8~=2.1.0
|
||||
build~=1.2.1
|
||||
grpcio-tools~=1.62.2
|
||||
pip-tools~=7.4.1
|
||||
pyright~=1.1.367
|
||||
pytest~=8.2.0
|
||||
setuptools~=69.5.1
|
||||
setuptools_scm~=8.1.0
|
||||
|
||||
@@ -33,3 +33,6 @@ PLAY_HT_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
|
||||
#OpenPipe
|
||||
OPENPIPE_API_KEY=...
|
||||
|
||||
@@ -32,13 +32,15 @@ Next, follow the steps in the README for each demo.
|
||||
|
||||
## Projects:
|
||||
|
||||
| Project | Description | Services |
|
||||
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------- |
|
||||
| [Simple Chatbot](simple-chatbot) | Basic voice-driven conversational bot. A good starting point for learning the flow of the framework. | Deepgram, OpenAI, Daily, Daily Prebuilt UI |
|
||||
| [Storytelling Chatbot](storytelling-chatbot) | Stitches together multiple third-party services to create a collaborative storytime experience. | Deepgram, ElevenLabs, Open AI, Fal, Daily, Custom UI |
|
||||
| [Translation Chatbot](translation-chatbot) | Listens for user speech, then translates that speech to Spanish and speaks the translation back. Demonstrates multi-participant use-cases. | Deepgram, Azure, OpenAI, Daily, Daily Prebuilt UI |
|
||||
| [Moondream Chatbot](moondream-chatbot) | Demonstrates how to add vision capabilities to GPT4. **Note: works best with a GPU** | Deepgram, OpenAI, Moondream, Daily, Daily Prebuilt UI |
|
||||
| Function-calling Chatbot (TBC) | A chatbot that can call functions in response to user input | Deepgram, OpenAI, Fireworks, Daily, Daily Prebuilt UI |
|
||||
| Project | Description | Services |
|
||||
|----------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------|
|
||||
| [Simple Chatbot](simple-chatbot) | Basic voice-driven conversational bot. A good starting point for learning the flow of the framework. | Deepgram, ElevenLabs, OpenAI, Daily, Daily Prebuilt UI |
|
||||
| [Storytelling Chatbot](storytelling-chatbot) | Stitches together multiple third-party services to create a collaborative storytime experience. | Deepgram, ElevenLabs, OpenAI, Fal, Daily, Custom UI |
|
||||
| [Translation Chatbot](translation-chatbot) | Listens for user speech, then translates that speech to Spanish and speaks the translation back. Demonstrates multi-participant use-cases. | Deepgram, Azure, OpenAI, Daily, Daily Prebuilt UI |
|
||||
| [Moondream Chatbot](moondream-chatbot) | Demonstrates how to add vision capabilities to GPT4. **Note: works best with a GPU** | Deepgram, ElevenLabs, OpenAI, Moondream, Daily, Daily Prebuilt UI |
|
||||
| [Patient intake](patient-intake) | A chatbot that can call functions in response to user input. | Deepgram, ElevenLabs, OpenAI, Daily, Daily Prebuilt UI |
|
||||
| [Dialin Chatbot](dialin-chatbot) | A chatbot that connects to an incoming phone call from Daily or Twilio. | Deepgram, ElevenLabs, OpenAI, Daily, Twilio |
|
||||
| [Twilio Chatbot](twilio-chatbot) | A chatbot that connects to an incoming phone call from Twilio. | Deepgram, ElevenLabs, OpenAI, Daily, Twilio |
|
||||
|
||||
> [!IMPORTANT]
|
||||
> These example projects use Daily as a WebRTC transport and can be joined using their hosted Prebuilt UI.
|
||||
|
||||
3
examples/dialin-chatbot/.dockerignore
Normal file
3
examples/dialin-chatbot/.dockerignore
Normal file
@@ -0,0 +1,3 @@
|
||||
**/.DS_Store
|
||||
.env
|
||||
.env.*
|
||||
165
examples/dialin-chatbot/.gitignore
vendored
Normal file
165
examples/dialin-chatbot/.gitignore
vendored
Normal file
@@ -0,0 +1,165 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
runpod.toml
|
||||
|
||||
# custom script to recursively upgrade items in requirements.py
|
||||
upgrade_requirements.py
|
||||
.DS_Store
|
||||
40
examples/dialin-chatbot/Dockerfile
Normal file
40
examples/dialin-chatbot/Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
||||
FROM python:3.11-bullseye
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG USE_PERSISTENT_DATA
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
# Expose FastAPI port
|
||||
ENV FAST_API_PORT=7860
|
||||
EXPOSE 7860
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install --no-install-recommends -y \
|
||||
build-essential \
|
||||
git \
|
||||
ffmpeg \
|
||||
google-perftools \
|
||||
ca-certificates curl gnupg \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set up a new user named "user" with user ID 1000
|
||||
RUN useradd -m -u 1000 user
|
||||
|
||||
# Set home to the user's home directory
|
||||
ENV HOME=/home/user \
|
||||
PATH=/home/user/.local/bin:$PATH \
|
||||
PYTHONPATH=$HOME/app \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# Switch to the "user" user
|
||||
USER user
|
||||
|
||||
# Set the working directory to the user's home directory
|
||||
WORKDIR $HOME/app
|
||||
|
||||
# Install Python dependencies
|
||||
COPY *.py .
|
||||
COPY ./requirements.txt requirements.txt
|
||||
RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
|
||||
|
||||
# Start the FastAPI server
|
||||
CMD python3 bot_runner.py --host "0.0.0.0" --port ${FAST_API_PORT}
|
||||
85
examples/dialin-chatbot/README.md
Normal file
85
examples/dialin-chatbot/README.md
Normal file
@@ -0,0 +1,85 @@
|
||||
<div align="center">
|
||||
<img alt="pipecat" width="300px" height="auto" src="image.png">
|
||||
</div>
|
||||
|
||||
# Dialin example
|
||||
|
||||
Example project that demonstrates how to add phone number dialin to your Pipecat bots. We include examples for both Daily (`bot_daily.py`) and Twilio (`bot_twilio.py`), depending on who you want to use as a phone vendor.
|
||||
|
||||
- 🔁 Transport: Daily WebRTC
|
||||
- 💬 Speech-to-Text: Deepgram via Daily transport
|
||||
- 🤖 LLM: GPT4-o / OpenAI
|
||||
- 🔉 Text-to-Speech: ElevenLabs
|
||||
|
||||
#### Should I use Daily or Twilio as a vendor?
|
||||
|
||||
If you're starting from scratch, using Daily to provision phone numbers alongside Daily as a transport offers some convenience (such as automatic call forwarding.)
|
||||
|
||||
If you already have Twilio numbers and workflows that you want to connect to your Pipecat bots, there is some additional configuration required (you'll need to create a `on_dialin_ready` and use the Twilio client to trigger the forward.)
|
||||
|
||||
You can read more about this, as well as see respective walkthroughs in our docs.
|
||||
|
||||
## Setup
|
||||
|
||||
```shell
|
||||
# Install the requirements
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Setup your env
|
||||
mv env.example .env
|
||||
```
|
||||
|
||||
## Using Daily numbers
|
||||
|
||||
Run `bot_runner.py` to handle incoming HTTP requests:
|
||||
|
||||
`python bot_runner.py --host localhost`
|
||||
|
||||
Then target the following URL:
|
||||
|
||||
`POST /daily_start_bot`
|
||||
|
||||
For more configuration options, please consult Daily's API documentation.
|
||||
|
||||
|
||||
## Using Twilio numbers
|
||||
|
||||
As above, but target the following URL:
|
||||
|
||||
`POST /twilio_start_bot`
|
||||
|
||||
For more configuration options, please consult Twilio's API documentation.
|
||||
|
||||
## Deployment example
|
||||
|
||||
A Dockerfile is included in this demo for convenience. Here is an example of how to build and deploy your bot to [fly.io](https://fly.io).
|
||||
|
||||
*Please note: This demo spawns agents as subprocesses for convenience / demonstration purposes. You would likely not want to do this in production as it would limit concurrency to available system resources. For more information on how to deploy your bots using VMs, refer to the Pipecat documentation.*
|
||||
|
||||
### Build the docker image
|
||||
|
||||
`docker build -t tag:project .`
|
||||
|
||||
### Launch the fly project
|
||||
|
||||
`mv fly.example.toml fly.toml`
|
||||
|
||||
`fly launch` (using the included fly.toml)
|
||||
|
||||
### Setup your secrets on Fly
|
||||
|
||||
Set the necessary secrets (found in `env.example`)
|
||||
|
||||
`fly secrets set DAILY_API_KEY=... OPENAI_API_KEY=... ELEVENLABS_API_KEY=... ELEVENLABS_VOICE_ID=...`
|
||||
|
||||
If you're using Twilio as a number vendor:
|
||||
|
||||
`fly secrets set TWILIO_ACCOUNT_SID=... TWILIO_AUTH_TOKEN=...`
|
||||
|
||||
### Deploy!
|
||||
|
||||
`fly deploy`
|
||||
|
||||
## Need to do something more advanced?
|
||||
|
||||
This demo covers the basics of bot telephony. If you want to know more about working with PSTN / SIP, please ping us on [Discord](https://discord.gg/pipecat).
|
||||
111
examples/dialin-chatbot/bot_daily.py
Normal file
111
examples/dialin-chatbot/bot_daily.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
from pipecat.frames.frames import (
|
||||
LLMMessagesFrame,
|
||||
EndFrame
|
||||
)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyDialinSettings
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
daily_api_key = os.getenv("DAILY_API_KEY", "")
|
||||
daily_api_url = os.getenv("DAILY_API_URL", "https://api.daily.co/v1")
|
||||
|
||||
|
||||
async def main(room_url: str, token: str, callId: str, callDomain: str):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# diallin_settings are only needed if Daily's SIP URI is used
|
||||
# If you are handling this via Twilio, Telnyx, set this to None
|
||||
# and handle call-forwarding when on_dialin_ready fires.
|
||||
diallin_settings = DailyDialinSettings(
|
||||
call_id=callId,
|
||||
call_domain=callDomain
|
||||
)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
api_url=daily_api_url,
|
||||
api_key=daily_api_key,
|
||||
dialin_settings=diallin_settings,
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
transcription_enabled=True,
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Oh, hello! Who dares dial me at this hour?!'.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
tma_in,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
tma_out,
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
await task.queue_frame(EndFrame())
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Pipecat Simple ChatBot")
|
||||
parser.add_argument("-u", type=str, help="Room URL")
|
||||
parser.add_argument("-t", type=str, help="Token")
|
||||
parser.add_argument("-i", type=str, help="Call ID")
|
||||
parser.add_argument("-d", type=str, help="Call Domain")
|
||||
config = parser.parse_args()
|
||||
|
||||
asyncio.run(main(config.u, config.t, config.i, config.d))
|
||||
220
examples/dialin-chatbot/bot_runner.py
Normal file
220
examples/dialin-chatbot/bot_runner.py
Normal file
@@ -0,0 +1,220 @@
|
||||
"""
|
||||
bot_runner.py
|
||||
|
||||
HTTP service that listens for incoming calls from either Daily or Twilio,
|
||||
provisioning a room and starting a Pipecat bot in response.
|
||||
|
||||
Refer to README for more information.
|
||||
"""
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomSipParams, DailyRoomParams
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, PlainTextResponse
|
||||
from twilio.twiml.voice_response import VoiceResponse
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# ------------ Configuration ------------ #
|
||||
|
||||
MAX_SESSION_TIME = 5 * 60 # 5 minutes
|
||||
REQUIRED_ENV_VARS = ['OPENAI_API_KEY', 'DAILY_API_KEY',
|
||||
'ELEVENLABS_API_KEY', 'ELEVENLABS_VOICE_ID']
|
||||
|
||||
daily_rest_helper = DailyRESTHelper(
|
||||
os.getenv("DAILY_API_KEY", ""),
|
||||
os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'))
|
||||
|
||||
|
||||
# ----------------- API ----------------- #
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"]
|
||||
)
|
||||
|
||||
"""
|
||||
Create Daily room, tell the bot if the room is created for Twilio's SIP or Daily's SIP (vendor).
|
||||
When the vendor is Daily, the bot handles the call forwarding automatically,
|
||||
i.e, forwards the call from the "hold music state" to the Daily Room's SIP URI.
|
||||
|
||||
Alternatively, when the vendor is Twilio (not Daily), the bot is responsible for
|
||||
updating the state on Twilio. So when `dialin-ready` fires, it takes appropriate
|
||||
action using the Twilio Client library.
|
||||
"""
|
||||
|
||||
|
||||
def _create_daily_room(room_url, callId, callDomain=None, vendor="daily"):
|
||||
if not room_url:
|
||||
params = DailyRoomParams(
|
||||
properties=DailyRoomProperties(
|
||||
# Note: these are the default values, except for the display name
|
||||
sip=DailyRoomSipParams(
|
||||
display_name="dialin-user",
|
||||
video=False,
|
||||
sip_mode="dial-in",
|
||||
num_endpoints=1
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Creating new room...")
|
||||
room: DailyRoomObject = daily_rest_helper.create_room(params=params)
|
||||
|
||||
else:
|
||||
# Check passed room URL exist (we assume that it already has a sip set up!)
|
||||
try:
|
||||
print(f"Joining existing room: {room_url}")
|
||||
room: DailyRoomObject = daily_rest_helper.get_room_from_url(
|
||||
room_url)
|
||||
except Exception:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Room not found: {room_url}")
|
||||
|
||||
print(f"Daily room: {room.url} {room.config.sip_endpoint}")
|
||||
|
||||
# Give the agent a token to join the session
|
||||
token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
|
||||
|
||||
if not room or not token:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to get room or token token")
|
||||
|
||||
# Spawn a new agent, and join the user session
|
||||
# Note: this is mostly for demonstration purposes (refer to 'deployment' in docs)
|
||||
if vendor == "daily":
|
||||
bot_proc = f"python3 -m bot_daily -u {room.url} -t {token} -i {
|
||||
callId} -d {callDomain}"
|
||||
else:
|
||||
bot_proc = f"python3 -m bot_twilio -u {room.url} -t {
|
||||
token} -i {callId} -s {room.config.sip_endpoint}"
|
||||
|
||||
try:
|
||||
subprocess.Popen(
|
||||
[bot_proc],
|
||||
shell=True,
|
||||
bufsize=1,
|
||||
cwd=os.path.dirname(os.path.abspath(__file__))
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to start subprocess: {e}")
|
||||
|
||||
return room
|
||||
|
||||
|
||||
@app.post("/twilio_start_bot", response_class=PlainTextResponse)
|
||||
async def twilio_start_bot(request: Request):
|
||||
print(f"POST /twilio_voice_bot")
|
||||
|
||||
# twilio_start_bot is invoked directly by Twilio (as a web hook).
|
||||
# On Twilio, under Active Numbers, pick the phone number
|
||||
# Click Configure and under Voice Configuration,
|
||||
# "a call comes in" choose webhook and point the URL to
|
||||
# where this code is hosted.
|
||||
data = {}
|
||||
try:
|
||||
# shouldnt have received json, twilio sends form data
|
||||
form_data = await request.form()
|
||||
data = dict(form_data)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", None)
|
||||
callId = data.get('CallSid')
|
||||
|
||||
if not callId:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Missing 'CallSid' in request")
|
||||
|
||||
print("CallId: %s" % callId)
|
||||
|
||||
# create room and tell the bot to join the created room
|
||||
# note: Twilio does not require a callDomain
|
||||
room: DailyRoomObject = _create_daily_room(
|
||||
room_url, callId, None, "twilio")
|
||||
|
||||
print(f"Put Twilio on hold...")
|
||||
# We have the room and the SIP URI,
|
||||
# but we do not know if the Daily SIP Worker and the Bot have joined the call
|
||||
# put the call on hold until the 'on_dialin_ready' fires.
|
||||
# Then, the bot will update the called sid with the sip uri.
|
||||
# http://com.twilio.music.classical.s3.amazonaws.com/BusyStrings.mp3
|
||||
resp = VoiceResponse()
|
||||
resp.play(
|
||||
url="http://com.twilio.sounds.music.s3.amazonaws.com/MARKOVICHAMP-Borghestral.mp3", loop=10)
|
||||
return str(resp)
|
||||
|
||||
|
||||
@app.post("/daily_start_bot")
|
||||
async def daily_start_bot(request: Request) -> JSONResponse:
|
||||
# The /daily_start_bot is invoked when a call is received on Daily's SIP URI
|
||||
# daily_start_bot will create the room, put the call on hold until
|
||||
# the bot and sip worker are ready. Daily will automatically
|
||||
# forward the call to the SIP URi when dialin_ready fires.
|
||||
|
||||
# Use specified room URL, or create a new one if not specified
|
||||
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", None)
|
||||
# Get the dial-in properties from the request
|
||||
try:
|
||||
data = await request.json()
|
||||
if "test" in data:
|
||||
# Pass through any webhook checks
|
||||
return JSONResponse({"test": True})
|
||||
callId = data.get("callId", None)
|
||||
callDomain = data.get("callDomain", None)
|
||||
except Exception:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Missing properties 'callId' or 'callDomain'")
|
||||
|
||||
print(f"CallId: {callId}, CallDomain: {callDomain}")
|
||||
room: DailyRoomObject = _create_daily_room(
|
||||
room_url, callId, callDomain, "daily")
|
||||
|
||||
# Grab a token for the user to join with
|
||||
return JSONResponse({
|
||||
"room_url": room.url,
|
||||
"sipUri": room.config.sip_endpoint
|
||||
})
|
||||
|
||||
# ----------------- Main ----------------- #
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Check environment variables
|
||||
for env_var in REQUIRED_ENV_VARS:
|
||||
if env_var not in os.environ:
|
||||
raise Exception(f"Missing environment variable: {env_var}.")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
|
||||
parser.add_argument("--host", type=str,
|
||||
default=os.getenv("HOST", "0.0.0.0"), help="Host address")
|
||||
parser.add_argument("--port", type=int,
|
||||
default=os.getenv("PORT", 7860), help="Port number")
|
||||
parser.add_argument("--reload", action="store_true",
|
||||
default=True, help="Reload code on change")
|
||||
|
||||
config = parser.parse_args()
|
||||
|
||||
try:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(
|
||||
"bot_runner:app",
|
||||
host=config.host,
|
||||
port=config.port,
|
||||
reload=config.reload
|
||||
)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("Pipecat runner shutting down...")
|
||||
125
examples/dialin-chatbot/bot_twilio.py
Normal file
125
examples/dialin-chatbot/bot_twilio.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
from pipecat.frames.frames import (
|
||||
LLMMessagesFrame,
|
||||
EndFrame
|
||||
)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from twilio.rest import Client
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
twilio_account_sid = os.getenv('TWILIO_ACCOUNT_SID')
|
||||
twilio_auth_token = os.getenv('TWILIO_AUTH_TOKEN')
|
||||
twilioclient = Client(twilio_account_sid, twilio_auth_token)
|
||||
|
||||
daily_api_key = os.getenv("DAILY_API_KEY", "")
|
||||
|
||||
|
||||
async def main(room_url: str, token: str, callId: str, sipUri: str):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# diallin_settings are only needed if Daily's SIP URI is used
|
||||
# If you are handling this via Twilio, Telnyx, set this to None
|
||||
# and handle call-forwarding when on_dialin_ready fires.
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
api_key=daily_api_key,
|
||||
dialin_settings=None, # Not required for Twilio
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
transcription_enabled=True,
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by saying 'Hello! Who dares dial me at this hour?!'.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
tma_in,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
tma_out,
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
await task.queue_frame(EndFrame())
|
||||
|
||||
@transport.event_handler("on_dialin_ready")
|
||||
async def on_dialin_ready(transport, cdata):
|
||||
# For Twilio, Telnyx, etc. You need to update the state of the call
|
||||
# and forward it to the sip_uri..
|
||||
print(f"Forwarding call: {callId} {sipUri}")
|
||||
|
||||
try:
|
||||
# The TwiML is updated using Twilio's client library
|
||||
call = twilioclient.calls(callId).update(
|
||||
twiml=f'<Response><Dial><Sip>{sipUri}</Sip></Dial></Response>'
|
||||
)
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to forward call: {str(e)}")
|
||||
|
||||
runner = PipelineRunner()
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Pipecat Simple ChatBot")
|
||||
parser.add_argument("-u", type=str, help="Room URL")
|
||||
parser.add_argument("-t", type=str, help="Token")
|
||||
parser.add_argument("-i", type=str, help="Call ID")
|
||||
parser.add_argument("-s", type=str, help="SIP URI")
|
||||
config = parser.parse_args()
|
||||
|
||||
asyncio.run(main(config.u, config.t, config.i, config.s))
|
||||
8
examples/dialin-chatbot/env.example
Normal file
8
examples/dialin-chatbot/env.example
Normal file
@@ -0,0 +1,8 @@
|
||||
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (optional: for joining the bot to the same room repeatedly for local dev)
|
||||
DAILY_API_KEY=.
|
||||
DAILY_API_URL=api.daily.co/v1
|
||||
OPENAI_API_KEY=
|
||||
ELEVENLABS_API_KEY=
|
||||
ELEVENLABS_VOICE_ID=
|
||||
TWILIO_ACCOUNT_SID=
|
||||
TWILIO_AUTH_TOKEN=
|
||||
19
examples/dialin-chatbot/fly.example.toml
Normal file
19
examples/dialin-chatbot/fly.example.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
# fly.toml app configuration file generated for pipecat-dialin-demo on 2024-06-03T15:57:57+02:00
|
||||
#
|
||||
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
|
||||
#
|
||||
|
||||
app = 'pipecat-dialin-demo'
|
||||
primary_region = 'sjc'
|
||||
|
||||
[build]
|
||||
|
||||
[http_service]
|
||||
internal_port = 7860
|
||||
force_https = true
|
||||
auto_stop_machines = true
|
||||
auto_start_machines = true
|
||||
min_machines_running = 1
|
||||
|
||||
[[vm]]
|
||||
size = 'performance-1x'
|
||||
BIN
examples/dialin-chatbot/image.png
Normal file
BIN
examples/dialin-chatbot/image.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 19 KiB |
7
examples/dialin-chatbot/requirements.txt
Normal file
7
examples/dialin-chatbot/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
pipecat-ai[daily,openai,silero]
|
||||
fastapi
|
||||
uvicorn
|
||||
requests
|
||||
python-dotenv
|
||||
loguru
|
||||
twilio
|
||||
165
examples/fast-chatbot/.gitignore
vendored
Normal file
165
examples/fast-chatbot/.gitignore
vendored
Normal file
@@ -0,0 +1,165 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
runpod.toml
|
||||
|
||||
# custom script to recursively upgrade items in requirements.py
|
||||
upgrade_requirements.py
|
||||
.DS_Store
|
||||
165
examples/fast-chatbot/bot-classic-pipeline.js
Normal file
165
examples/fast-chatbot/bot-classic-pipeline.js
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from loguru import logger
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.frames.frames import LLMMessagesFrame, EndFrame
|
||||
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
)
|
||||
|
||||
from helpers import (
|
||||
ClearableDeepgramTTSService,
|
||||
AudioVolumeTimer,
|
||||
TranscriptionTimingLogger
|
||||
)
|
||||
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level=os.getenv("LOG_LEVEL", "DEBUG"))
|
||||
|
||||
|
||||
class BotSettings(BaseModel):
|
||||
room_url: str
|
||||
room_token: str
|
||||
bot_name: str = "Pipecat"
|
||||
prompt: Optional[str] = "You are a helpful assistant."
|
||||
deepgram_api_key: Optional[str] = os.getenv("DEEPGRAM_API_KEY", None)
|
||||
deepgram_voice: Optional[str] = os.getenv("DEEPGRAM_VOICE", "aura-asteria-en")
|
||||
deepgram_tts_base_url: Optional[str] = os.getenv(
|
||||
"DEEPGRAM_TTS_BASE_URL", "https://api.deepgram.com/v1/speak")
|
||||
deepgram_stt_base_url: Optional[str] = os.getenv(
|
||||
"DEEPGRAM_STT_BASE_URL", "https://api.deepgram.com/v1/speak")
|
||||
openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY", None),
|
||||
openai_model: Optional[str] = os.getenv("OPENAI_MODEL", None),
|
||||
openai_base_url: Optional[str] = os.getenv("OPENAI_BASE_URL", None)
|
||||
vad_stop_secs: Optional[float] = os.getenv("VAD_STOP_SECS", 0.200)
|
||||
|
||||
|
||||
async def main(settings: BotSettings):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
settings.room_url,
|
||||
settings.room_token,
|
||||
settings.bot_name,
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(
|
||||
stop_secs=settings.vad_stop_secs
|
||||
)),
|
||||
vad_audio_passthrough=True
|
||||
)
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
name="STT",
|
||||
api_key=settings.deepgram_api_key,
|
||||
url=settings.deepgram_stt_base_url
|
||||
)
|
||||
|
||||
tts = ClearableDeepgramTTSService(
|
||||
name="Voice",
|
||||
aiohttp_session=session,
|
||||
api_key=settings.deepgram_api_key,
|
||||
voice=settings.deepgram_voice,
|
||||
**({'base_url': url} if (url := settings.deepgram_tts_base_url) else {})
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
name="Groq Llama 3 70B",
|
||||
api_key=settings.openai_api_key,
|
||||
model=settings.openai_model,
|
||||
base_url=settings.openai_base_url,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": settings.prompt,
|
||||
},
|
||||
]
|
||||
|
||||
avt = AudioVolumeTimer()
|
||||
tl = TranscriptionTimingLogger(avt)
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
avt, # Audio volume timer
|
||||
stt, # Speech-to-text
|
||||
tl, # Transcription timing logger
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out, # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
report_only_initial_ttfb=True
|
||||
))
|
||||
|
||||
# When the participant leaves, we exit the bot.
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
await task.queue_frame(EndFrame())
|
||||
|
||||
# When the first participant joins, the bot should introduce itself.
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
# Provide some air whilst tracks subscribe
|
||||
time.sleep(2)
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Introduce yourself by saying 'hello, I'm FastBot, how can I help you today?'"})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot")
|
||||
parser.add_argument("-s", "--settings", type=str, required=True, help="Pipecat bot settings")
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
try:
|
||||
settings = BotSettings.model_validate_json(args.settings)
|
||||
print(f"settings: {settings.json()}")
|
||||
asyncio.run(main(settings))
|
||||
except ValidationError as e:
|
||||
print(e)
|
||||
192
examples/fast-chatbot/bot-vad-gated.py
Normal file
192
examples/fast-chatbot/bot-vad-gated.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from loguru import logger
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
from pipecat.vad.vad_analyzer import VADParams
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.services.openai import OpenAILLMService, OpenAILLMContext
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.frames.frames import LLMMessagesFrame, EndFrame
|
||||
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
)
|
||||
|
||||
from helpers import (
|
||||
GreedyLLMAggregator,
|
||||
ClearableDeepgramTTSService,
|
||||
VADGate,
|
||||
AudioVolumeTimer,
|
||||
TranscriptionTimingLogger
|
||||
)
|
||||
|
||||
# from helpers import (
|
||||
# ClearableDeepgramTTSService,
|
||||
# AudioVolumeTimer,
|
||||
# TranscriptionTimingLogger
|
||||
# )
|
||||
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level=os.getenv("LOG_LEVEL", "DEBUG"))
|
||||
|
||||
|
||||
class BotSettings(BaseModel):
|
||||
room_url: str
|
||||
room_token: str
|
||||
bot_name: str = "Pipecat"
|
||||
prompt: Optional[str] = "You are a helpful assistant."
|
||||
deepgram_api_key: Optional[str] = os.getenv("DEEPGRAM_API_KEY", None)
|
||||
deepgram_voice: Optional[str] = os.getenv("DEEPGRAM_VOICE", "aura-asteria-en")
|
||||
deepgram_tts_base_url: Optional[str] = os.getenv(
|
||||
"DEEPGRAM_TTS_BASE_URL", "https://api.deepgram.com/v1/speak")
|
||||
deepgram_stt_base_url: Optional[str] = os.getenv(
|
||||
"DEEPGRAM_STT_BASE_URL", "https://api.deepgram.com/v1/speak")
|
||||
openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY", None),
|
||||
openai_model: Optional[str] = os.getenv("OPENAI_MODEL", None),
|
||||
openai_base_url: Optional[str] = os.getenv("OPENAI_BASE_URL", None)
|
||||
vad_stop_secs: Optional[float] = os.getenv("VAD_STOP_SECS", 0.200)
|
||||
|
||||
|
||||
async def main(settings: BotSettings):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
settings.room_url,
|
||||
settings.room_token,
|
||||
settings.bot_name,
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(
|
||||
stop_secs=settings.vad_stop_secs
|
||||
)),
|
||||
vad_audio_passthrough=True
|
||||
)
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
name="STT",
|
||||
api_key=settings.deepgram_api_key,
|
||||
url=settings.deepgram_stt_base_url
|
||||
)
|
||||
|
||||
tts = ClearableDeepgramTTSService(
|
||||
name="Voice",
|
||||
aiohttp_session=session,
|
||||
api_key=settings.deepgram_api_key,
|
||||
voice=settings.deepgram_voice,
|
||||
**({'base_url': url} if (url := settings.deepgram_tts_base_url) else {})
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
name="LLM",
|
||||
api_key=settings.openai_api_key,
|
||||
model=settings.openai_model,
|
||||
base_url=settings.openai_base_url,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": settings.prompt,
|
||||
},
|
||||
]
|
||||
|
||||
# avt = AudioVolumeTimer()
|
||||
# tl = TranscriptionTimingLogger(avt)
|
||||
|
||||
# tma_in = LLMUserResponseAggregator(messages)
|
||||
# tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
# pipeline = Pipeline([
|
||||
# transport.input(), # Transport user input
|
||||
# avt, # Audio volume timer
|
||||
# stt, # Speech-to-text
|
||||
# tl, # Transcription timing logger
|
||||
# tma_in, # User responses
|
||||
# llm, # LLM
|
||||
# tts, # TTS
|
||||
# transport.output(), # Transport bot output
|
||||
# tma_out, # Assistant spoken responses
|
||||
# ])
|
||||
|
||||
ctx = OpenAILLMContext()
|
||||
greedy = GreedyLLMAggregator(name="greedy", context=ctx)
|
||||
gate = VADGate(name="gate", vad_analyzer=transport.input().vad_analyzer(), context=ctx)
|
||||
avt = AudioVolumeTimer()
|
||||
tl = TranscriptionTimingLogger(avt)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
avt,
|
||||
stt,
|
||||
tl,
|
||||
greedy,
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
gate,
|
||||
transport.output(), # Transport bot output
|
||||
# FrameLogger()
|
||||
])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
report_only_initial_ttfb=True
|
||||
))
|
||||
|
||||
# When the participant leaves, we exit the bot.
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
await task.queue_frame(EndFrame())
|
||||
|
||||
# When the first participant joins, the bot should introduce itself.
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
# Provide some air whilst tracks subscribe
|
||||
time.sleep(2)
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Introduce yourself by saying 'hello, I'm FastBot, how can I help you today?'"})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot")
|
||||
parser.add_argument("-s", "--settings", type=str, required=True, help="Pipecat bot settings")
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
try:
|
||||
settings = BotSettings.model_validate_json(args.settings)
|
||||
# print(f"settings: {settings.json()}")
|
||||
asyncio.run(main(settings))
|
||||
except ValidationError as e:
|
||||
print(e)
|
||||
1
examples/fast-chatbot/bot.py
Symbolic link
1
examples/fast-chatbot/bot.py
Symbolic link
@@ -0,0 +1 @@
|
||||
bot-classic-pipeline.js
|
||||
164
examples/fast-chatbot/bot_runner.py
Normal file
164
examples/fast-chatbot/bot_runner.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
|
||||
from pydantic import BaseModel, ValidationError
|
||||
from typing import Optional
|
||||
|
||||
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomParams
|
||||
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from bot import BotSettings
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# ------------ Configuration ------------ #
|
||||
|
||||
MAX_SESSION_TIME = 5 * 60 # 5 minutes
|
||||
REQUIRED_ENV_VARS = ['DAILY_API_URL', 'DAILY_API_KEY', 'DEEPGRAM_API_KEY']
|
||||
|
||||
daily_rest_helper = DailyRESTHelper(
|
||||
os.getenv("DAILY_API_KEY", ""),
|
||||
os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'))
|
||||
|
||||
|
||||
class RunnerSettings(BaseModel):
|
||||
prompt: Optional[
|
||||
str] = "You are a fast, low-latency chatbot. Your goal is to demonstrate voice-driven AI capabilities at human-like speeds. The technology powering you is Daily for transport, Groq for AI inference, Llama 3 (70-B version) LLM, and Deepgram for speech-to-text and text-to-speech. You are running on servers in Oregon. Respond to what the user said in a creative and helpful way, but keep responses short and legible. Ensure responses contain only words. Check again that you have not included special characters other than '?' or '!'."
|
||||
deepgram_voice: Optional[str] = os.getenv("DEEPGRAM_VOICE")
|
||||
openai_model: Optional[str] = os.getenv("OPENAI_MODEL", "gpt-4o")
|
||||
openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY")
|
||||
test: Optional[bool] = None
|
||||
|
||||
# ----------------- API ----------------- #
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"]
|
||||
)
|
||||
|
||||
# ----------------- Main ----------------- #
|
||||
|
||||
|
||||
@app.post("/start_bot")
|
||||
async def start_bot(request: Request) -> JSONResponse:
|
||||
runner_settings = RunnerSettings()
|
||||
try:
|
||||
request_body = await request.body()
|
||||
if len(request_body) > 0:
|
||||
runner_settings = RunnerSettings.model_validate_json(request_body)
|
||||
except ValidationError as e:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Invalid request: {e}")
|
||||
except Exception as e:
|
||||
# If no data in request, pass
|
||||
pass
|
||||
|
||||
# Is this a webhook creation request?
|
||||
if runner_settings.test is not None:
|
||||
return JSONResponse({"test": True})
|
||||
|
||||
# Use specified room URL, or create a new one if not specified
|
||||
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", "")
|
||||
|
||||
if not room_url:
|
||||
params = DailyRoomParams(
|
||||
properties=DailyRoomProperties()
|
||||
)
|
||||
try:
|
||||
room: DailyRoomObject = daily_rest_helper.create_room(params=params)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Unable to provision room {e}")
|
||||
else:
|
||||
# Check passed room URL exists, we should assume that it already has a sip set up
|
||||
try:
|
||||
room: DailyRoomObject = daily_rest_helper.get_room_from_url(room_url)
|
||||
except Exception:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Room not found: {room_url}")
|
||||
|
||||
# Give the agent a token to join the session
|
||||
token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
|
||||
|
||||
if not room or not token:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to get token for room: {room_url}")
|
||||
|
||||
# Spawn a new agent, and join the user session
|
||||
try:
|
||||
bot_settings = BotSettings(
|
||||
room_url=room.url,
|
||||
room_token=token,
|
||||
prompt=runner_settings.prompt,
|
||||
deepgram_voice=runner_settings.deepgram_voice,
|
||||
openai_model=runner_settings.openai_model,
|
||||
openai_api_key=runner_settings.openai_api_key,
|
||||
)
|
||||
bot_settings_str = bot_settings.model_dump_json(exclude_none=True)
|
||||
|
||||
subprocess.Popen(
|
||||
[f"python3 -m bot -s '{bot_settings_str}'"],
|
||||
shell=True,
|
||||
bufsize=1,
|
||||
cwd=os.path.dirname(os.path.abspath(__file__)))
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to start subprocess: {e}")
|
||||
|
||||
# Grab a token for the user to join with
|
||||
user_token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
|
||||
|
||||
return JSONResponse({
|
||||
"room_url": room.url,
|
||||
"token": user_token,
|
||||
})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Check environment variables
|
||||
for env_var in REQUIRED_ENV_VARS:
|
||||
if env_var not in os.environ:
|
||||
raise Exception(f"Missing environment variable: {env_var}.")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
|
||||
parser.add_argument("--host", type=str,
|
||||
default=os.getenv("HOST", "0.0.0.0"), help="Host address")
|
||||
parser.add_argument("--port", type=int,
|
||||
default=os.getenv("PORT", 7860), help="Port number")
|
||||
parser.add_argument("--reload", action="store_true",
|
||||
default=True, help="Reload code on change")
|
||||
|
||||
config = parser.parse_args()
|
||||
|
||||
try:
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(
|
||||
"bot_runner:app",
|
||||
host=config.host,
|
||||
port=config.port,
|
||||
reload=config.reload
|
||||
)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("Pipecat runner shutting down...")
|
||||
12
examples/fast-chatbot/env.example
Normal file
12
examples/fast-chatbot/env.example
Normal file
@@ -0,0 +1,12 @@
|
||||
DAILY_SAMPLE_ROOM_URL= #optional: use the same room each time, or create a new one if unset
|
||||
DAILY_API_KEY=
|
||||
DAILY_API_URL=
|
||||
|
||||
DEEPGRAM_API_KEY=
|
||||
DEEPGRAM_VOICE=
|
||||
DEEPGRAM_STT_URL=
|
||||
DEEPGRAM_TTS_BASE_URL=
|
||||
|
||||
OPENAI_API_KEY=
|
||||
OPENAI_MODEL=
|
||||
OPENAI_BASE_URL=
|
||||
267
examples/fast-chatbot/helpers.py
Normal file
267
examples/fast-chatbot/helpers.py
Normal file
@@ -0,0 +1,267 @@
|
||||
from loguru import logger
|
||||
import asyncio
|
||||
import math
|
||||
import struct
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
AudioRawFrame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
TextFrame,
|
||||
StartInterruptionFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
TTSStoppedFrame,
|
||||
MetricsFrame
|
||||
)
|
||||
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
from pipecat.services.deepgram import DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame
|
||||
|
||||
|
||||
class GreedyLLMAggregator(FrameProcessor):
|
||||
def __init__(self, context: OpenAILLMContext = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.context: OpenAILLMContext = context if context else OpenAILLMContext()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
logger.debug(f"{frame}")
|
||||
|
||||
try:
|
||||
if isinstance(frame, InterimTranscriptionFrame):
|
||||
return
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
# append transcribed text to last "user" frame
|
||||
if self.context.messages and self.context.messages[-1]["role"] == "user":
|
||||
last_frame = self.context.messages.pop()
|
||||
else:
|
||||
last_frame = {"role": "user", "content": ""}
|
||||
|
||||
last_frame["content"] += " " + frame.text
|
||||
self.context.messages.append(last_frame)
|
||||
|
||||
oai_context_frame = OpenAILLMContextFrame(context=self.context)
|
||||
logger.debug(f"pushing frame {oai_context_frame}")
|
||||
await self.push_frame(oai_context_frame)
|
||||
return
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
except Exception as e:
|
||||
logger.debug(f"error: {e}")
|
||||
|
||||
|
||||
class ClearableDeepgramTTSService(DeepgramTTSService):
|
||||
def __init___(self, **kwargs):
|
||||
super().__init(**kwargs)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
self._current_sentence = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class BufferedSentence:
|
||||
audio_frames: List[AudioRawFrame] = field(default_factory=list)
|
||||
text_frame: TextFrame = None
|
||||
|
||||
|
||||
class VADGate(FrameProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vad_analyzer: VADAnalyzer = None,
|
||||
context: OpenAILLMContext = None,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.vad_analyzer = vad_analyzer
|
||||
self.context = context
|
||||
|
||||
self._audio_pusher_task = None
|
||||
self._expect_text_frame_next = False
|
||||
self._sentences: List[BufferedSentence] = []
|
||||
|
||||
# queue output from tts one sentence at a time. associate a buffer of audio frames with the content of
|
||||
# each text frame.
|
||||
#
|
||||
# start a coroutine to service the queue and send sentences down the pipeline when possible.
|
||||
# 1. do not send anything when we are not in VADState.QUIET
|
||||
# 2. if we are in VADState.QUIET, send a sentence, estimate how long it will take for that sentence
|
||||
# to output, sleep until it's time to send another sentence
|
||||
# 3. each time we send a sentence, append it to the conversation context
|
||||
# 3. when the sentence buffer becomes empty, cancel the coroutine
|
||||
# 4. if we get a new LLMFullResponse, treat that as a cancellation, too
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
try:
|
||||
|
||||
# A TTSService will emit a series of AudioRawFrame objects, then a TTSStoppedFrame,
|
||||
# then a TextFrame.
|
||||
|
||||
if self._expect_text_frame_next:
|
||||
self._expect_text_frame_next = False
|
||||
if isinstance(frame, TextFrame):
|
||||
self._sentences[-1].text_frame = frame
|
||||
else:
|
||||
logger.debug(f"expected a text frame, but received {frame}")
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
else:
|
||||
if isinstance(frame, TextFrame):
|
||||
logger.error(f"XXXXXXXXXXXXXXXXXXX received a text frame, wasn't expecting it.")
|
||||
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
# if our buffer is empty or has a "finished" sentence at the end,
|
||||
# then we need to start buffering a new sentence
|
||||
if not self._sentences or self._sentences[-1].text_frame:
|
||||
self._sentences.append(BufferedSentence())
|
||||
self._sentences[-1].audio_frames.append(frame)
|
||||
await self.maybe_start_audio_pusher_task()
|
||||
return
|
||||
|
||||
if isinstance(frame, TTSStoppedFrame):
|
||||
self._expect_text_frame_next = True
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# There are two ways we can be interrupted. During greedy inference, a new
|
||||
# LLM response can start. Or, during playout, we can get a traditional
|
||||
# user interruption frame.
|
||||
if (isinstance(frame, LLMFullResponseStartFrame) or
|
||||
isinstance(frame, StartInterruptionFrame)):
|
||||
logger.debug(f"{frame} - Handle interruption in VADGate")
|
||||
self._sentences = []
|
||||
if self._audio_pusher_task:
|
||||
self._audio_pusher_task.cancel()
|
||||
self._audio_pusher_task = None
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
except Exception as e:
|
||||
logger.debug(f"error: {e}")
|
||||
|
||||
async def maybe_start_audio_pusher_task(self):
|
||||
try:
|
||||
if self._audio_pusher_task:
|
||||
return
|
||||
self._audio_pusher_task = self.get_event_loop().create_task(self.push_audio())
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Exception {e}")
|
||||
|
||||
async def push_audio(self):
|
||||
try:
|
||||
while True:
|
||||
if not self._sentences:
|
||||
await asyncio.sleep(0.01)
|
||||
continue
|
||||
|
||||
if self.vad_analyzer._vad_state != VADState.QUIET:
|
||||
await asyncio.sleep(0.01)
|
||||
continue
|
||||
|
||||
# we only want to push completed sentence buffers
|
||||
if not self._sentences[0].text_frame:
|
||||
await asyncio.sleep(0.01)
|
||||
continue
|
||||
|
||||
s = self._sentences.pop(0)
|
||||
if not s.audio_frames:
|
||||
continue
|
||||
sample_rate = s.audio_frames[0].sample_rate
|
||||
duration = 0
|
||||
logger.debug(f"Pushing {len(s.audio_frames)} audio frames")
|
||||
for frame in s.audio_frames:
|
||||
await self.push_frame(frame)
|
||||
# assume linear16 encoding (2 bytes per sample). todo: add some more
|
||||
# metadata to AudioRawFrame, maybe
|
||||
duration += (len(frame.audio) / 2 / frame.num_channels) / sample_rate
|
||||
await asyncio.sleep(duration - 20 / 1000)
|
||||
if self.context:
|
||||
logger.debug(f"Appending assistant message to context: [{s.text_frame.text}]")
|
||||
self.context.messages.append(
|
||||
{"role": "assistant", "content": s.text_frame.text}
|
||||
)
|
||||
await self.push_frame(s.text_frame)
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Exception {e}")
|
||||
|
||||
|
||||
class TranscriptionTimingLogger(FrameProcessor):
|
||||
def __init__(self, avt):
|
||||
super().__init__()
|
||||
self.name = "Transcription"
|
||||
self._avt = avt
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
try:
|
||||
await super().process_frame(frame, direction)
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
elapsed = time.time() - self._avt.last_transition_ts
|
||||
logger.debug(f"Transcription TTF: {elapsed}")
|
||||
await self.push_frame(MetricsFrame(ttfb={self.name: elapsed}))
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
except Exception as e:
|
||||
logger.debug(f"Exception {e}")
|
||||
|
||||
|
||||
class AudioVolumeTimer(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.last_transition_ts = 0
|
||||
self._prev_volume = -80
|
||||
self._speech_volume_threshold = -50
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
volume = self.calculate_volume(frame)
|
||||
# print(f"Audio volume: {volume:.2f} dB")
|
||||
if (volume >= self._speech_volume_threshold and
|
||||
self._prev_volume < self._speech_volume_threshold):
|
||||
# logger.debug("transition above speech volume threshold")
|
||||
self.last_transition_ts = time.time()
|
||||
elif (volume < self._speech_volume_threshold and
|
||||
self._prev_volume >= self._speech_volume_threshold):
|
||||
# logger.debug("transition below non-speech volume threshold")
|
||||
self.last_transition_ts = time.time()
|
||||
self._prev_volume = volume
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def calculate_volume(self, frame: AudioRawFrame) -> float:
|
||||
if frame.num_channels != 1:
|
||||
raise ValueError(f"Expected 1 channel, got {frame.num_channels}")
|
||||
|
||||
# Unpack audio data into 16-bit integers
|
||||
fmt = f"{len(frame.audio) // 2}h"
|
||||
audio_samples = struct.unpack(fmt, frame.audio)
|
||||
|
||||
# Calculate RMS
|
||||
sum_squares = sum(sample**2 for sample in audio_samples)
|
||||
rms = math.sqrt(sum_squares / len(audio_samples))
|
||||
|
||||
# Convert RMS to decibels (dB)
|
||||
# Reference: maximum value for 16-bit audio is 32767
|
||||
if rms > 0:
|
||||
db = 20 * math.log10(rms / 32767)
|
||||
else:
|
||||
db = -96 # Minimum value (almost silent)
|
||||
|
||||
return db
|
||||
6
examples/fast-chatbot/requirements.txt
Normal file
6
examples/fast-chatbot/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
pipecat-ai[daily,openai,silero,deepgram]
|
||||
fastapi
|
||||
uvicorn
|
||||
requests
|
||||
python-dotenv
|
||||
loguru
|
||||
@@ -44,7 +44,7 @@ async def main(room_url):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
|
||||
@@ -23,11 +23,11 @@ from pipecat.frames.frames import (
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.pipeline.parallel_task import ParallelTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.gated import GatedAggregator
|
||||
from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.aggregators.parallel_task import ParallelTask
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.fal import FalImageGenService
|
||||
@@ -59,6 +59,8 @@ class MonthPrepender(FrameProcessor):
|
||||
self.prepend_to_next_text_frame = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, MonthFrame):
|
||||
self.most_recent_month = frame.month
|
||||
elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame):
|
||||
@@ -93,7 +95,7 @@ async def main(room_url):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
params=FalImageGenService.InputParams(
|
||||
|
||||
@@ -50,6 +50,8 @@ async def main():
|
||||
self.text = ""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
self.text = frame.text
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -60,6 +62,8 @@ async def main():
|
||||
self.audio = bytearray()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
self.audio.extend(frame.audio)
|
||||
self.frame = AudioRawFrame(
|
||||
@@ -71,12 +75,14 @@ async def main():
|
||||
self.frame = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, URLImageRawFrame):
|
||||
self.frame = frame
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
@@ -156,7 +162,7 @@ async def main():
|
||||
await runner.stop_when_done()
|
||||
|
||||
async def run_tk():
|
||||
while True:
|
||||
while not task.has_finished():
|
||||
tk_root.update()
|
||||
tk_root.update_idletasks()
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
@@ -56,10 +56,11 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
fl_in = FrameLogger("Inner")
|
||||
fl_out = FrameLogger("Outer")
|
||||
fl = FrameLogger("!!! after LLM", "red")
|
||||
fltts = FrameLogger("@@@ out of tts", "green")
|
||||
flend = FrameLogger("### out of the end", "magenta")
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -71,14 +72,15 @@ async def main(room_url: str, token):
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
fl_in,
|
||||
transport.input(),
|
||||
tma_in,
|
||||
llm,
|
||||
fl_out,
|
||||
fl,
|
||||
tts,
|
||||
fltts,
|
||||
transport.output(),
|
||||
tma_out
|
||||
tma_out,
|
||||
flend
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
@@ -15,14 +15,15 @@ from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from runner import configure
|
||||
@@ -48,6 +49,8 @@ class ImageSyncAggregator(FrameProcessor):
|
||||
self._waiting_image_bytes = self._waiting_image.tobytes()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if not isinstance(frame, SystemFrame):
|
||||
await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
|
||||
await self.push_frame(frame)
|
||||
@@ -66,7 +69,9 @@ async def main(room_url: str, token):
|
||||
audio_out_enabled=True,
|
||||
camera_out_width=1024,
|
||||
camera_out_height=1024,
|
||||
transcription_enabled=True
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
@@ -78,7 +83,7 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -87,8 +92,8 @@ async def main(room_url: str, token):
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(messages)
|
||||
tma_out = LLMAssistantContextAggregator(messages)
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
|
||||
|
||||
@@ -12,7 +12,7 @@ import sys
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
@@ -53,7 +53,7 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -74,7 +74,11 @@ async def main(room_url: str, token):
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, allow_interruptions=True)
|
||||
task = PipelineTask(pipeline, PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
report_only_initial_ttfb=True,
|
||||
))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
|
||||
95
examples/foundational/07a-interruptible-anthropic.py
Normal file
95
examples/foundational/07a-interruptible-anthropic.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
model="claude-3-opus-20240229")
|
||||
|
||||
# todo: think more about how to handle system prompts in a more general way. OpenAI,
|
||||
# Google, and Anthropic all have slightly different approaches to providing a system
|
||||
# prompt.
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative, helpful, and brief way. Say hello.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
125
examples/foundational/07b-interruptible-langchain.py
Normal file
125
examples/foundational/07b-interruptible-langchain.py
Normal file
@@ -0,0 +1,125 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.processors.frameworks.langchain import LangchainProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
||||
from langchain_community.chat_message_histories import ChatMessageHistory
|
||||
from langchain_core.chat_history import BaseChatMessageHistory
|
||||
from langchain_core.runnables.history import RunnableWithMessageHistory
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from runner import configure
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
message_store = {}
|
||||
|
||||
|
||||
def get_session_history(session_id: str) -> BaseChatMessageHistory:
|
||||
if session_id not in message_store:
|
||||
message_store[session_id] = ChatMessageHistory()
|
||||
return message_store[session_id]
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
prompt = ChatPromptTemplate.from_messages(
|
||||
[
|
||||
("system",
|
||||
"Be nice and helpful. Answer very briefly and without special characters like `#` or `*`. "
|
||||
"Your response will be synthesized to voice and those characters will create unnatural sounds.",
|
||||
),
|
||||
MessagesPlaceholder("chat_history"),
|
||||
("human", "{input}"),
|
||||
])
|
||||
chain = prompt | ChatOpenAI(model="gpt-4o", temperature=0.7)
|
||||
history_chain = RunnableWithMessageHistory(
|
||||
chain,
|
||||
get_session_history,
|
||||
history_messages_key="chat_history",
|
||||
input_messages_key="input")
|
||||
lc = LangchainProcessor(history_chain)
|
||||
|
||||
tma_in = LLMUserResponseAggregator()
|
||||
tma_out = LLMAssistantResponseAggregator()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
lc, # Langchain
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out, # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
lc.set_participant_id(participant["id"])
|
||||
# Kick off the conversation.
|
||||
# the `LLMMessagesFrame` will be picked up by the LangchainProcessor using
|
||||
# only the content of the last message to inject it in the prompt defined
|
||||
# above. So no role is required here.
|
||||
messages = [(
|
||||
{
|
||||
"content": "Please briefly introduce yourself to the user."
|
||||
}
|
||||
)]
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
97
examples/foundational/07c-interruptible-deepgram.py
Normal file
97
examples/foundational/07c-interruptible-deepgram.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.deepgram import DeepgramSTTService, DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True
|
||||
)
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
voice="aura-helios-en"
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
92
examples/foundational/07d-interruptible-cartesia.py
Normal file
92
examples/foundational/07d-interruptible-cartesia.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
93
examples/foundational/07e-interruptible-playht.py
Normal file
93
examples/foundational/07e-interruptible-playht.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.playht import PlayHTTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=16000,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = PlayHTTTSService(
|
||||
user_id=os.getenv("PLAYHT_USER_ID"),
|
||||
api_key=os.getenv("PLAYHT_API_KEY"),
|
||||
voice_url="s3://voice-cloning-zero-shot/801a663f-efd0-4254-98d0-5c175514c3e8/jennifer/manifest.json",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
100
examples/foundational/07f-interruptible-azure.py
Normal file
100
examples/foundational/07f-interruptible-azure.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.azure import AzureLLMService, AzureSTTService, AzureTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=16000,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True,
|
||||
)
|
||||
)
|
||||
|
||||
stt = AzureSTTService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
|
||||
tts = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
92
examples/foundational/07g-interruptible-openai-tts.py
Normal file
92
examples/foundational/07g-interruptible-openai-tts.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.openai import OpenAITTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=24000,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
voice="alloy"
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
102
examples/foundational/07h-interruptible-openpipe.py
Normal file
102
examples/foundational/07h-interruptible-openpipe.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openpipe import OpenPipeLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
timestamp = int(time.time())
|
||||
llm = OpenPipeLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
openpipe_api_key=os.getenv("OPENPIPE_API_KEY"),
|
||||
model="gpt-4o",
|
||||
tags={
|
||||
"conversation_id": f"pipecat-{timestamp}"
|
||||
}
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -30,6 +30,7 @@ async def main(room_url, token):
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_is_live=True,
|
||||
camera_out_width=1280,
|
||||
camera_out_height=720
|
||||
)
|
||||
|
||||
@@ -38,6 +38,7 @@ async def main(room_url, token):
|
||||
TransportParams(
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_is_live=True,
|
||||
camera_out_width=1280,
|
||||
camera_out_height=720))
|
||||
|
||||
@@ -47,15 +48,15 @@ async def main(room_url, token):
|
||||
|
||||
pipeline = Pipeline([daily_transport.input(), tk_transport.output()])
|
||||
|
||||
runner = PipelineRunner()
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
async def run_tk():
|
||||
while runner.is_active():
|
||||
while not task.has_finished():
|
||||
tk_root.update()
|
||||
tk_root.update_idletasks()
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
runner = PipelineRunner()
|
||||
|
||||
await asyncio.gather(runner.run(task), run_tk())
|
||||
|
||||
|
||||
94
examples/foundational/10-wake-phrase.py
Normal file
94
examples/foundational/10-wake-phrase.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Robot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant. Respond to what the user said in a creative and helpful way. Keep your responses brief.",
|
||||
},
|
||||
]
|
||||
|
||||
hey_robot_filter = WakeCheckFilter(["hey robot", "hey, robot"])
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
hey_robot_filter, # Filter out speech not directed at the robot
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
await tts.say("Hi! If you want to talk to me, just say 'Hey Robot'.")
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,189 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
SystemFrame,
|
||||
TextFrame,
|
||||
ImageRawFrame,
|
||||
SpriteFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMUserContextAggregator,
|
||||
LLMAssistantContextAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
sprites = {}
|
||||
image_files = [
|
||||
"sc-default.png",
|
||||
"sc-talk.png",
|
||||
"sc-listen-1.png",
|
||||
"sc-think-1.png",
|
||||
"sc-think-2.png",
|
||||
"sc-think-3.png",
|
||||
"sc-think-4.png",
|
||||
]
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in image_files:
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
with Image.open(full_path) as img:
|
||||
sprites[file] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
|
||||
|
||||
# When the bot isn't talking, show a static image of the cat listening
|
||||
quiet_frame = sprites["sc-listen-1.png"]
|
||||
|
||||
# When the bot is talking, build an animation from two sprites
|
||||
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
|
||||
talking = [random.choice(talking_list) for x in range(30)]
|
||||
talking_frame = SpriteFrame(talking)
|
||||
|
||||
# TODO: Support "thinking" as soon as we get a valid transcript, while LLM
|
||||
# is processing
|
||||
thinking_list = [
|
||||
sprites["sc-think-1.png"],
|
||||
sprites["sc-think-2.png"],
|
||||
sprites["sc-think-3.png"],
|
||||
sprites["sc-think-4.png"],
|
||||
]
|
||||
thinking_frame = SpriteFrame(thinking_list)
|
||||
|
||||
|
||||
class NameCheckFilter(FrameProcessor):
|
||||
def __init__(self, names: list[str]):
|
||||
super().__init__()
|
||||
self._names = names
|
||||
self._sentence = ""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, SystemFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
content: str = ""
|
||||
|
||||
# TODO: split up transcription by participant
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
content = frame.text
|
||||
self._sentence += content
|
||||
if self._sentence.endswith((".", "?", "!")):
|
||||
if any(name in self._sentence for name in self._names):
|
||||
await self.push_frame(TextFrame(self._sentence))
|
||||
self._sentence = ""
|
||||
else:
|
||||
self._sentence = ""
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class ImageSyncAggregator(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await self.push_frame(talking_frame)
|
||||
await self.push_frame(frame)
|
||||
await self.push_frame(quiet_frame)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Santa Cat",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_width=720,
|
||||
camera_out_height=1280,
|
||||
camera_out_framerate=10,
|
||||
transcription_enabled=True
|
||||
)
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="jBpfuIE2acCO8z3wKNLl",
|
||||
)
|
||||
isa = ImageSyncAggregator()
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(messages)
|
||||
tma_out = LLMAssistantContextAggregator(messages)
|
||||
ncf = NameCheckFilter(["Santa Cat", "Santa"])
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
isa,
|
||||
ncf,
|
||||
tma_in,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
tma_out
|
||||
])
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
# Send some greeting at the beginning.
|
||||
await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.")
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
|
||||
async def starting_image():
|
||||
await transport.send_image(quiet_frame)
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
await asyncio.gather(runner.run(task), starting_image())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -19,15 +19,16 @@ from pipecat.frames.frames import (
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMUserContextAggregator,
|
||||
LLMAssistantContextAggregator,
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -59,6 +60,8 @@ for file in sound_files:
|
||||
class OutboundSoundEffectWrapper(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self.push_frame(sounds["ding1.wav"])
|
||||
# In case anything else downstream needs it
|
||||
@@ -70,6 +73,8 @@ class OutboundSoundEffectWrapper(FrameProcessor):
|
||||
class InboundSoundEffectWrapper(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, LLMMessagesFrame):
|
||||
await self.push_frame(sounds["ding2.wav"])
|
||||
# In case anything else downstream needs it
|
||||
@@ -84,12 +89,17 @@ async def main(room_url: str, token):
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(audio_out_enabled=True, transcription_enabled=True)
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
@@ -104,8 +114,8 @@ async def main(room_url: str, token):
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(messages)
|
||||
tma_out = LLMAssistantContextAggregator(messages)
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
|
||||
@@ -42,6 +42,8 @@ class UserImageRequester(FrameProcessor):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -42,6 +42,8 @@ class UserImageRequester(FrameProcessor):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -62,19 +64,15 @@ async def main(room_url: str, token):
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
vision_aggregator = VisionImageFrameAggregator()
|
||||
|
||||
google = GoogleLLMService(model="gemini-1.5-flash-latest")
|
||||
google = GoogleLLMService(
|
||||
model="gemini-1.5-flash-latest",
|
||||
api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
|
||||
@@ -42,6 +42,8 @@ class UserImageRequester(FrameProcessor):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -61,12 +63,6 @@ async def main(room_url: str, token):
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
108
examples/foundational/12c-describe-video-anthropic.py
Normal file
108
examples/foundational/12c-describe-video-anthropic.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
|
||||
def __init__(self, participant_id: str | None = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Describe participant video",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
vision_aggregator = VisionImageFrameAggregator()
|
||||
|
||||
anthropic = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
model="claude-3-sonnet-20240229"
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
await tts.say("Hi there! Feel free to ask me what I see.")
|
||||
transport.capture_participant_video(participant["id"], framerate=0)
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
image_requester.set_participant_id(participant["id"])
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
user_response,
|
||||
image_requester,
|
||||
vision_aggregator,
|
||||
anthropic,
|
||||
tts,
|
||||
transport.output()
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -29,6 +29,8 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
|
||||
@@ -16,8 +16,6 @@ from pipecat.services.whisper import WhisperSTTService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.local.audio import LocalAudioTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -30,11 +28,13 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
|
||||
async def main(room_url: str):
|
||||
async def main():
|
||||
transport = LocalAudioTransport(TransportParams(audio_in_enabled=True))
|
||||
|
||||
stt = WhisperSTTService()
|
||||
@@ -51,5 +51,4 @@ async def main(room_url: str):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
asyncio.run(main())
|
||||
|
||||
58
examples/foundational/13b-deepgram-transcription.py
Normal file
58
examples/foundational/13b-deepgram-transcription.py
Normal file
@@ -0,0 +1,58 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
|
||||
async def main(room_url: str):
|
||||
transport = DailyTransport(room_url, None, "Transcription bot",
|
||||
DailyParams(audio_in_enabled=True))
|
||||
|
||||
stt = DeepgramSTTService(os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
pipeline = Pipeline([transport.input(), stt, tl])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url))
|
||||
140
examples/foundational/14-function-calling.py
Normal file
140
examples/foundational/14-function-calling.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def start_fetch_weather(llm):
|
||||
await llm.push_frame(TextFrame("Let me think."))
|
||||
|
||||
|
||||
async def fetch_weather_from_api(llm, args):
|
||||
return {"conditions": "nice", "temperature": "75"}
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
llm.register_function(
|
||||
"get_current_weather",
|
||||
fetch_weather_from_api,
|
||||
start_callback=start_fetch_weather)
|
||||
|
||||
fl_in = FrameLogger("Inner")
|
||||
fl_out = FrameLogger("Outer")
|
||||
|
||||
tools = [
|
||||
ChatCompletionToolParam(
|
||||
type="function",
|
||||
function={
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"celsius",
|
||||
"fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
"required": [
|
||||
"location",
|
||||
"format"],
|
||||
},
|
||||
})]
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
tma_in = LLMUserContextAggregator(context)
|
||||
tma_out = LLMAssistantContextAggregator(context)
|
||||
pipeline = Pipeline([
|
||||
fl_in,
|
||||
transport.input(),
|
||||
tma_in,
|
||||
llm,
|
||||
fl_out,
|
||||
tts,
|
||||
transport.output(),
|
||||
tma_out
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
@ transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
await tts.say("Hi! Ask me about the weather in San Francisco.")
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
155
examples/foundational/15-switch-voices.py
Normal file
155
examples/foundational/15-switch-voices.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.filters.function_filter import FunctionFilter
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
current_voice = "News Lady"
|
||||
|
||||
|
||||
async def switch_voice(llm, args):
|
||||
global current_voice
|
||||
current_voice = args["voice"]
|
||||
return {"voice": f"You are now using your {current_voice} voice. Your responses should now be as if you were a {current_voice}."}
|
||||
|
||||
|
||||
async def news_lady_filter(frame) -> bool:
|
||||
return current_voice == "News Lady"
|
||||
|
||||
|
||||
async def british_lady_filter(frame) -> bool:
|
||||
return current_voice == "British Lady"
|
||||
|
||||
|
||||
async def barbershop_man_filter(frame) -> bool:
|
||||
return current_voice == "Barbershop Man"
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Pipecat",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
news_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
|
||||
)
|
||||
|
||||
british_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
|
||||
)
|
||||
|
||||
barbershop_man = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
llm.register_function("switch_voice", switch_voice)
|
||||
|
||||
tools = [
|
||||
ChatCompletionToolParam(
|
||||
type="function",
|
||||
function={
|
||||
"name": "switch_voice",
|
||||
"description": "Switch your voice only when the user asks you to",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"voice": {
|
||||
"type": "string",
|
||||
"description": "The voice the user wants you to use",
|
||||
},
|
||||
},
|
||||
"required": ["voice"],
|
||||
},
|
||||
})]
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities. Respond to what the user said in a creative and helpful way. Your output should not include non-alphanumeric characters. You can do the following voices: 'News Lady', 'British Lady' and 'Barbershop Man'.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
tma_in = LLMUserContextAggregator(context)
|
||||
tma_out = LLMAssistantContextAggregator(context)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
ParallelPipeline( # TTS (one of the following vocies)
|
||||
[FunctionFilter(news_lady_filter), news_lady], # News Lady voice
|
||||
[FunctionFilter(british_lady_filter), british_lady], # British Lady voice
|
||||
[FunctionFilter(barbershop_man_filter), barbershop_man], # Barbershop Man voice
|
||||
),
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user and let them know the voices you can do. Your initial responses should be as if you were a {current_voice}."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
153
examples/foundational/15a-switch-languages.py
Normal file
153
examples/foundational/15a-switch-languages.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.filters.function_filter import FunctionFilter
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.whisper import Model, WhisperSTTService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
current_language = "English"
|
||||
|
||||
|
||||
async def switch_language(llm, args):
|
||||
global current_language
|
||||
current_language = args["language"]
|
||||
return {"voice": f"Your answers from now on should be in {current_language}."}
|
||||
|
||||
|
||||
async def english_filter(frame) -> bool:
|
||||
return current_language == "English"
|
||||
|
||||
|
||||
async def spanish_filter(frame) -> bool:
|
||||
return current_language == "Spanish"
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Pipecat",
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True
|
||||
)
|
||||
)
|
||||
|
||||
stt = WhisperSTTService(model=Model.LARGE)
|
||||
|
||||
english_tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="pNInz6obpgDQGcFmaJgB",
|
||||
)
|
||||
|
||||
spanish_tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
model="eleven_multilingual_v2",
|
||||
voice_id="9F4C8ztpNUmXkdDDbz3J",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
llm.register_function("switch_language", switch_language)
|
||||
|
||||
tools = [
|
||||
ChatCompletionToolParam(
|
||||
type="function",
|
||||
function={
|
||||
"name": "switch_language",
|
||||
"description": "Switch to another language when the user asks you to",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"language": {
|
||||
"type": "string",
|
||||
"description": "The language the user wants you to speak",
|
||||
},
|
||||
},
|
||||
"required": ["language"],
|
||||
},
|
||||
})]
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities. Respond to what the user said in a creative and helpful way. Your output should not include non-alphanumeric characters. You can speak the following languages: 'English' and 'Spanish'.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
tma_in = LLMUserContextAggregator(context)
|
||||
tma_out = LLMAssistantContextAggregator(context)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
ParallelPipeline( # TTS (bot will speak the chosen language)
|
||||
[FunctionFilter(english_filter), english_tts], # English
|
||||
[FunctionFilter(spanish_filter), spanish_tts], # Spanish
|
||||
),
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user and let them know the languages you speak. Your initial responses should be in {current_language}."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
130
examples/foundational/16-gpu-container-local-bot.py
Normal file
130
examples/foundational/16-gpu-container-local-bot.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.deepgram import DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = DeepgramTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
voice="aura-asteria-en",
|
||||
base_url="http://0.0.0.0:8080/v1/speak"
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
# To use OpenAI
|
||||
# api_key=os.getenv("OPENAI_API_KEY"),
|
||||
# model="gpt-4o"
|
||||
# Or, to use a local vLLM (or similar) api server
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
base_url="http://0.0.0.0:8000/v1"
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
|
||||
|
||||
# When a participant joins, start transcription for that participant so the
|
||||
# bot can "hear" and respond to them.
|
||||
@transport.event_handler("on_participant_joined")
|
||||
async def on_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
|
||||
# When the first participant joins, the bot should introduce itself.
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
# Handle "latency-ping" messages. The client will send app messages that look like
|
||||
# this:
|
||||
# { "latency-ping": { ts: <client-side timestamp> }}
|
||||
#
|
||||
# We want to send an immediate pong back to the client from this handler function.
|
||||
# Also, we will push a frame into the top of the pipeline and send it after the
|
||||
#
|
||||
@transport.event_handler("on_app_message")
|
||||
async def on_app_message(transport, message, sender):
|
||||
try:
|
||||
if "latency-ping" in message:
|
||||
logger.debug(f"Received latency ping app message: {message}")
|
||||
ts = message["latency-ping"]["ts"]
|
||||
# Send immediately
|
||||
transport.output().send_message(DailyTransportMessageFrame(
|
||||
message={"latency-pong-msg-handler": {"ts": ts}},
|
||||
participant_id=sender))
|
||||
# And push to the pipeline for the Daily transport.output to send
|
||||
await tma_in.push_frame(
|
||||
DailyTransportMessageFrame(
|
||||
message={"latency-pong-pipeline-delivery": {"ts": ts}},
|
||||
participant_id=sender))
|
||||
except Exception as e:
|
||||
logger.debug(f"message handling error: {e} - {message}")
|
||||
|
||||
runner = PipelineRunner()
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -1,25 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package pipecat_proto;
|
||||
|
||||
message TextFrame {
|
||||
string text = 1;
|
||||
}
|
||||
|
||||
message AudioFrame {
|
||||
bytes audio = 1;
|
||||
}
|
||||
|
||||
message TranscriptionFrame {
|
||||
string text = 1;
|
||||
string participant_id = 2;
|
||||
string timestamp = 3;
|
||||
}
|
||||
|
||||
message Frame {
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
}
|
||||
@@ -1,134 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<script src="//cdn.jsdelivr.net/npm/protobufjs@7.X.X/dist/protobuf.min.js"></script>
|
||||
<title>WebSocket Audio Stream</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>WebSocket Audio Stream</h1>
|
||||
<button id="startAudioBtn">Start Audio</button>
|
||||
<button id="stopAudioBtn">Stop Audio</button>
|
||||
<script>
|
||||
const SAMPLE_RATE = 16000;
|
||||
const BUFFER_SIZE = 8192;
|
||||
const MIN_AUDIO_SIZE = 6400;
|
||||
|
||||
let audioContext;
|
||||
let microphoneStream;
|
||||
let scriptProcessor;
|
||||
let source;
|
||||
let frame;
|
||||
let audioChunks = [];
|
||||
let isPlaying = false;
|
||||
let ws;
|
||||
|
||||
const proto = protobuf.load("frames.proto", (err, root) => {
|
||||
if (err) throw err;
|
||||
frame = root.lookupType("pipecat_proto.Frame");
|
||||
});
|
||||
|
||||
function initWebSocket() {
|
||||
ws = new WebSocket('ws://localhost:8765');
|
||||
|
||||
ws.addEventListener('open', () => console.log('WebSocket connection established.'));
|
||||
ws.addEventListener('message', handleWebSocketMessage);
|
||||
ws.addEventListener('close', (event) => console.log("WebSocket connection closed.", event.code, event.reason));
|
||||
ws.addEventListener('error', (event) => console.error('WebSocket error:', event));
|
||||
}
|
||||
|
||||
async function handleWebSocketMessage(event) {
|
||||
const arrayBuffer = await event.data.arrayBuffer();
|
||||
enqueueAudioFromProto(arrayBuffer);
|
||||
}
|
||||
|
||||
function enqueueAudioFromProto(arrayBuffer) {
|
||||
const parsedFrame = frame.decode(new Uint8Array(arrayBuffer));
|
||||
if (!parsedFrame?.audio) return false;
|
||||
|
||||
const frameCount = parsedFrame.audio.data.length / 2;
|
||||
const audioOutBuffer = audioContext.createBuffer(1, frameCount, SAMPLE_RATE);
|
||||
const nowBuffering = audioOutBuffer.getChannelData(0);
|
||||
const view = new Int16Array(parsedFrame.audio.data.buffer);
|
||||
|
||||
for (let i = 0; i < frameCount; i++) {
|
||||
const word = view[i];
|
||||
nowBuffering[i] = ((word + 32768) % 65536 - 32768) / 32768.0;
|
||||
}
|
||||
|
||||
audioChunks.push(audioOutBuffer);
|
||||
if (!isPlaying) playNextChunk();
|
||||
}
|
||||
|
||||
function playNextChunk() {
|
||||
if (audioChunks.length === 0) {
|
||||
isPlaying = false;
|
||||
return;
|
||||
}
|
||||
|
||||
isPlaying = true;
|
||||
const audioOutBuffer = audioChunks.shift();
|
||||
const source = audioContext.createBufferSource();
|
||||
source.buffer = audioOutBuffer;
|
||||
source.connect(audioContext.destination);
|
||||
source.onended = playNextChunk;
|
||||
source.start();
|
||||
}
|
||||
|
||||
function startAudio() {
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
alert('getUserMedia is not supported in your browser.');
|
||||
return;
|
||||
}
|
||||
|
||||
navigator.mediaDevices.getUserMedia({ audio: true })
|
||||
.then((stream) => {
|
||||
microphoneStream = stream;
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
scriptProcessor = audioContext.createScriptProcessor(BUFFER_SIZE, 1, 1);
|
||||
source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(scriptProcessor);
|
||||
scriptProcessor.connect(audioContext.destination);
|
||||
|
||||
const audioBuffer = [];
|
||||
const skipRatio = Math.floor(audioContext.sampleRate / (SAMPLE_RATE * 2));
|
||||
|
||||
scriptProcessor.onaudioprocess = (event) => {
|
||||
const rawLeftChannelData = event.inputBuffer.getChannelData(0);
|
||||
for (let i = 0; i < rawLeftChannelData.length; i += skipRatio) {
|
||||
const normalized = ((rawLeftChannelData[i] * 32768.0) + 32768) % 65536 - 32768;
|
||||
const swappedBytes = ((normalized & 0xff) << 8) | ((normalized >> 8) & 0xff);
|
||||
audioBuffer.push(swappedBytes);
|
||||
}
|
||||
|
||||
if (audioBuffer.length >= MIN_AUDIO_SIZE) {
|
||||
const audioFrame = frame.create({ audio: { audio: audioBuffer.slice(0, MIN_AUDIO_SIZE) } });
|
||||
const encodedFrame = new Uint8Array(frame.encode(audioFrame).finish());
|
||||
ws.send(encodedFrame);
|
||||
audioBuffer.splice(0, MIN_AUDIO_SIZE);
|
||||
}
|
||||
};
|
||||
|
||||
initWebSocket();
|
||||
})
|
||||
.catch((error) => console.error('Error accessing microphone:', error));
|
||||
}
|
||||
|
||||
function stopAudio() {
|
||||
if (ws) {
|
||||
ws.close();
|
||||
scriptProcessor.disconnect();
|
||||
source.disconnect();
|
||||
ws = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
document.getElementById('startAudioBtn').addEventListener('click', startAudio);
|
||||
document.getElementById('stopAudioBtn').addEventListener('click', stopAudio);
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -1,50 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from pipecat.pipeline.frame_processor import FrameProcessor
|
||||
from pipecat.pipeline.frames import TextFrame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
|
||||
from pipecat.transports.websocket_transport import WebsocketTransport
|
||||
from pipecat.services.whisper_ai_services import WhisperSTTService
|
||||
|
||||
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("pipecat")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class WhisperTranscriber(FrameProcessor):
|
||||
async def process_frame(self, frame):
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcribed: {frame.text}")
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = WebsocketTransport(
|
||||
mic_enabled=True,
|
||||
speaker_enabled=True,
|
||||
)
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([
|
||||
WhisperSTTService(),
|
||||
WhisperTranscriber(),
|
||||
tts,
|
||||
])
|
||||
|
||||
@transport.on_connection
|
||||
async def queue_frame():
|
||||
await pipeline.queue_frames([TextFrame("Hello there!")])
|
||||
|
||||
await transport.run(pipeline)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -74,6 +74,8 @@ class TalkingAnimation(FrameProcessor):
|
||||
self._is_talking = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
if not self._is_talking:
|
||||
await self.push_frame(talking_frame)
|
||||
@@ -93,6 +95,8 @@ class UserImageRequester(FrameProcessor):
|
||||
self.participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self.participant_id and isinstance(frame, TextFrame):
|
||||
if frame.text == user_request_answer:
|
||||
await self.push_frame(UserImageRequestFrame(self.participant_id), FrameDirection.UPSTREAM)
|
||||
@@ -107,6 +111,8 @@ class TextFilterProcessor(FrameProcessor):
|
||||
self.text = text
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
if frame.text != self.text:
|
||||
await self.push_frame(frame)
|
||||
@@ -116,6 +122,8 @@ class TextFilterProcessor(FrameProcessor):
|
||||
|
||||
class ImageFilterProcessor(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if not isinstance(frame, ImageRawFrame):
|
||||
await self.push_frame(frame)
|
||||
|
||||
@@ -145,7 +153,7 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
ta = TalkingAnimation()
|
||||
|
||||
|
||||
16
examples/patient-intake/Dockerfile
Normal file
16
examples/patient-intake/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM python:3.10-bullseye
|
||||
|
||||
RUN mkdir /app
|
||||
RUN mkdir /app/assets
|
||||
RUN mkdir /app/utils
|
||||
COPY *.py /app/
|
||||
COPY requirements.txt /app/
|
||||
copy assets/* /app/assets/
|
||||
copy utils/* /app/utils/
|
||||
|
||||
WORKDIR /app
|
||||
RUN pip3 install -r requirements.txt
|
||||
|
||||
EXPOSE 7860
|
||||
|
||||
CMD ["python3", "server.py"]
|
||||
37
examples/patient-intake/README.md
Normal file
37
examples/patient-intake/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Simple Chatbot
|
||||
|
||||
<img src="image.png" width="420px">
|
||||
|
||||
This app connects you to a chatbot powered by GPT-4, complete with animations generated by Stable Video Diffusion.
|
||||
|
||||
See a video of it in action: https://x.com/kwindla/status/1778628911817183509
|
||||
|
||||
And a quick video walkthrough of the code: https://www.loom.com/share/13df1967161f4d24ade054e7f8753416
|
||||
|
||||
ℹ️ The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded.
|
||||
|
||||
## Get started
|
||||
|
||||
```python
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
|
||||
cp env.example .env # and add your credentials
|
||||
|
||||
```
|
||||
|
||||
## Run the server
|
||||
|
||||
```bash
|
||||
python server.py
|
||||
```
|
||||
|
||||
Then, visit `http://localhost:7860/start` in your browser to start a chatbot session.
|
||||
|
||||
## Build and test the Docker image
|
||||
|
||||
```
|
||||
docker build -t chatbot .
|
||||
docker run --env-file .env -p 7860:7860 chatbot
|
||||
```
|
||||
BIN
examples/patient-intake/assets/clack-short-quiet.wav
Normal file
BIN
examples/patient-intake/assets/clack-short-quiet.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/clack-short.wav
Normal file
BIN
examples/patient-intake/assets/clack-short.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/clack.wav
Normal file
BIN
examples/patient-intake/assets/clack.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding.wav
Normal file
BIN
examples/patient-intake/assets/ding.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding2.wav
Normal file
BIN
examples/patient-intake/assets/ding2.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding3.wav
Normal file
BIN
examples/patient-intake/assets/ding3.wav
Normal file
Binary file not shown.
355
examples/patient-intake/bot.py
Normal file
355
examples/patient-intake/bot.py
Normal file
@@ -0,0 +1,355 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
import wave
|
||||
|
||||
from typing import List
|
||||
|
||||
from openai._types import NotGiven, NOT_GIVEN
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionToolParam,
|
||||
)
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame, OpenAILLMService
|
||||
from pipecat.services.ai_services import AIService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
sounds = {}
|
||||
sound_files = [
|
||||
"clack-short.wav",
|
||||
"clack.wav",
|
||||
"clack-short-quiet.wav",
|
||||
"ding.wav",
|
||||
"ding2.wav",
|
||||
]
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
for file in sound_files:
|
||||
# Build the full path to the sound file
|
||||
full_path = os.path.join(script_dir, "assets", file)
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the sound and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[file] = AudioRawFrame(audio_file.readframes(-1),
|
||||
audio_file.getframerate(), audio_file.getnchannels())
|
||||
|
||||
|
||||
class IntakeProcessor:
|
||||
def __init__(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
llm: AIService,
|
||||
tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._context: OpenAILLMContext = context
|
||||
self._llm = llm
|
||||
print(f"Initializing context from IntakeProcessor")
|
||||
self._context.add_message({"role": "system", "content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Chad Bailey. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function."})
|
||||
self._context.set_tools([
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "verify_birthday",
|
||||
"description": "Use this function to verify the user has provided their correct birthday.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"birthday": {
|
||||
"type": "string",
|
||||
"description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}])
|
||||
# Create an allowlist of functions that the LLM can call
|
||||
self._functions = [
|
||||
"verify_birthday",
|
||||
"list_prescriptions",
|
||||
"list_allergies",
|
||||
"list_conditions",
|
||||
"list_visit_reasons",
|
||||
]
|
||||
|
||||
async def verify_birthday(self, llm, args):
|
||||
if args["birthday"] == "1983-01-01":
|
||||
self._context.set_tools(
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "list_prescriptions",
|
||||
"description": "Once the user has provided a list of their prescription medications, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prescriptions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"medication": {
|
||||
"type": "string",
|
||||
"description": "The medication's name",
|
||||
},
|
||||
"dosage": {
|
||||
"type": "string",
|
||||
"description": "The prescription's dosage",
|
||||
},
|
||||
},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}])
|
||||
# It's a bit weird to push this to the LLM, but it gets it into the pipeline
|
||||
await llm.push_frame(sounds["ding2.wav"], FrameDirection.DOWNSTREAM)
|
||||
# We don't need the function call in the context, so just return a new
|
||||
# system message and let the framework re-prompt
|
||||
return [{"role": "system", "content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages."}]
|
||||
else:
|
||||
# The user provided an incorrect birthday; ask them to try again
|
||||
return [{"role": "system", "content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function."}]
|
||||
|
||||
async def start_prescriptions(self, llm):
|
||||
print(f"!!! doing start prescriptions")
|
||||
# Move on to allergies
|
||||
self._context.set_tools(
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "list_allergies",
|
||||
"description": "Once the user has provided a list of their allergies, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"allergies": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "What the user is allergic to",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}])
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function."})
|
||||
print(f"!!! about to await llm process frame in start prescrpitions")
|
||||
await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)
|
||||
print(f"!!! past await process frame in start prescriptions")
|
||||
|
||||
async def start_allergies(self, llm):
|
||||
print("!!! doing start allergies")
|
||||
# Move on to conditions
|
||||
self._context.set_tools(
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "list_conditions",
|
||||
"description": "Once the user has provided a list of their medical conditions, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"conditions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "The user's medical condition",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
])
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function."})
|
||||
await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def start_conditions(self, llm):
|
||||
print("!!! doing start conditions")
|
||||
# Move on to visit reasons
|
||||
self._context.set_tools(
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "list_visit_reasons",
|
||||
"description": "Once the user has provided a list of the reasons they are visiting a doctor today, call this function.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"visit_reasons": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "The user's reason for visiting the doctor",
|
||||
}},
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}])
|
||||
self._context.add_message(
|
||||
{"role": "system", "content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function."})
|
||||
await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def start_visit_reasons(self, llm):
|
||||
print("!!! doing start visit reasons")
|
||||
# move to finish call
|
||||
self._context.set_tools([])
|
||||
self._context.add_message({"role": "system",
|
||||
"content": "Now, thank the user and end the conversation."})
|
||||
await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def save_data(self, llm, args):
|
||||
logger.info(f"!!! Saving data: {args}")
|
||||
# Since this is supposed to be "async", returning None from the callback
|
||||
# will prevent adding anything to context or re-prompting
|
||||
return None
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_width=1024,
|
||||
camera_out_height=576,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
transcription_enabled=True,
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# transcription_settings=DailyTranscriptionSettings(
|
||||
# language="es",
|
||||
# tier="nova",
|
||||
# model="2-general"
|
||||
# )
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
#
|
||||
# English
|
||||
#
|
||||
voice_id="pNInz6obpgDQGcFmaJgB",
|
||||
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# model="eleven_multilingual_v2",
|
||||
# voice_id="gD1IexrzCvsXPHUuT0s3",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
messages = []
|
||||
context = OpenAILLMContext(messages=messages)
|
||||
user_context = LLMUserContextAggregator(context)
|
||||
assistant_context = LLMAssistantContextAggregator(context)
|
||||
|
||||
intake = IntakeProcessor(context, llm)
|
||||
llm.register_function("verify_birthday", intake.verify_birthday)
|
||||
llm.register_function(
|
||||
"list_prescriptions",
|
||||
intake.save_data,
|
||||
start_callback=intake.start_prescriptions)
|
||||
llm.register_function(
|
||||
"list_allergies",
|
||||
intake.save_data,
|
||||
start_callback=intake.start_allergies)
|
||||
llm.register_function(
|
||||
"list_conditions",
|
||||
intake.save_data,
|
||||
start_callback=intake.start_conditions)
|
||||
llm.register_function(
|
||||
"list_visit_reasons",
|
||||
intake.save_data,
|
||||
start_callback=intake.start_visit_reasons)
|
||||
|
||||
fl = FrameLogger("LLM Output")
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport input
|
||||
user_context, # User responses
|
||||
llm, # LLM
|
||||
fl, # Frame logger
|
||||
tts, # TTS
|
||||
transport.output(), # Transport output
|
||||
assistant_context, # Assistant responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=False))
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
print(f"Context is: {context}")
|
||||
await task.queue_frames([OpenAILLMContextFrame(context)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
4
examples/patient-intake/env.example
Normal file
4
examples/patient-intake/env.example
Normal file
@@ -0,0 +1,4 @@
|
||||
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev)
|
||||
DAILY_API_KEY=7df...
|
||||
OPENAI_API_KEY=sk-PL...
|
||||
ELEVENLABS_API_KEY=aeb...
|
||||
BIN
examples/patient-intake/image.png
Normal file
BIN
examples/patient-intake/image.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 733 KiB |
5
examples/patient-intake/requirements.txt
Normal file
5
examples/patient-intake/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
python-dotenv
|
||||
requests
|
||||
fastapi[all]
|
||||
uvicorn
|
||||
pipecat-ai[daily,openai,silero]
|
||||
58
examples/patient-intake/runner.py
Normal file
58
examples/patient-intake/runner.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
|
||||
def configure():
|
||||
parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
|
||||
parser.add_argument(
|
||||
"-u",
|
||||
"--url",
|
||||
type=str,
|
||||
required=False,
|
||||
help="URL of the Daily room to join")
|
||||
parser.add_argument(
|
||||
"-k",
|
||||
"--apikey",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Daily API Key (needed to create an owner token for the room)",
|
||||
)
|
||||
|
||||
args, unknown = parser.parse_known_args()
|
||||
|
||||
url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
|
||||
key = args.apikey or os.getenv("DAILY_API_KEY")
|
||||
|
||||
if not url:
|
||||
raise Exception(
|
||||
"No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")
|
||||
|
||||
if not key:
|
||||
raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")
|
||||
|
||||
# Create a meeting token for the given room with an expiration 1 hour in
|
||||
# the future.
|
||||
room_name: str = urllib.parse.urlparse(url).path[1:]
|
||||
expiration: float = time.time() + 60 * 60
|
||||
|
||||
res: requests.Response = requests.post(
|
||||
f"https://api.daily.co/v1/meeting-tokens",
|
||||
headers={
|
||||
"Authorization": f"Bearer {key}"},
|
||||
json={
|
||||
"properties": {
|
||||
"room_name": room_name,
|
||||
"is_owner": True,
|
||||
"exp": expiration}},
|
||||
)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(
|
||||
f"Failed to create meeting token: {res.status_code} {res.text}")
|
||||
|
||||
token: str = res.json()["token"]
|
||||
|
||||
return (url, token)
|
||||
124
examples/patient-intake/server.py
Normal file
124
examples/patient-intake/server.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
import atexit
|
||||
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, RedirectResponse
|
||||
|
||||
from utils.daily_helpers import create_room as _create_room, get_token
|
||||
|
||||
MAX_BOTS_PER_ROOM = 1
|
||||
|
||||
# Bot sub-process dict for status reporting and concurrency control
|
||||
bot_procs = {}
|
||||
|
||||
|
||||
def cleanup():
|
||||
# Clean up function, just to be extra safe
|
||||
for proc in bot_procs.values():
|
||||
proc.terminate()
|
||||
proc.wait()
|
||||
|
||||
|
||||
atexit.register(cleanup)
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.get("/start")
|
||||
async def start_agent(request: Request):
|
||||
print(f"!!! Creating room")
|
||||
room_url, room_name = _create_room()
|
||||
print(f"!!! Room URL: {room_url}")
|
||||
# Ensure the room property is present
|
||||
if not room_url:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Missing 'room' property in request data. Cannot start agent without a target room!")
|
||||
|
||||
# Check if there is already an existing process running in this room
|
||||
num_bots_in_room = sum(
|
||||
1 for proc in bot_procs.values() if proc[1] == room_url and proc[0].poll() is None)
|
||||
if num_bots_in_room >= MAX_BOTS_PER_ROOM:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Max bot limited reach for room: {room_url}")
|
||||
|
||||
# Get the token for the room
|
||||
token = get_token(room_url)
|
||||
|
||||
if not token:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to get token for room: {room_url}")
|
||||
|
||||
# Spawn a new agent, and join the user session
|
||||
# Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
|
||||
try:
|
||||
proc = subprocess.Popen(
|
||||
[
|
||||
f"python3 -m bot -u {room_url} -t {token}"
|
||||
],
|
||||
shell=True,
|
||||
bufsize=1,
|
||||
cwd=os.path.dirname(os.path.abspath(__file__))
|
||||
)
|
||||
bot_procs[proc.pid] = (proc, room_url)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to start subprocess: {e}")
|
||||
|
||||
return RedirectResponse(room_url)
|
||||
|
||||
|
||||
@app.get("/status/{pid}")
|
||||
def get_status(pid: int):
|
||||
# Look up the subprocess
|
||||
proc = bot_procs.get(pid)
|
||||
|
||||
# If the subprocess doesn't exist, return an error
|
||||
if not proc:
|
||||
raise HTTPException(
|
||||
status_code=404, detail=f"Bot with process id: {pid} not found")
|
||||
|
||||
# Check the status of the subprocess
|
||||
if proc[0].poll() is None:
|
||||
status = "running"
|
||||
else:
|
||||
status = "finished"
|
||||
|
||||
return JSONResponse({"bot_id": pid, "status": status})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
default_host = os.getenv("HOST", "0.0.0.0")
|
||||
default_port = int(os.getenv("FAST_API_PORT", "7860"))
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Daily Storyteller FastAPI server")
|
||||
parser.add_argument("--host", type=str,
|
||||
default=default_host, help="Host address")
|
||||
parser.add_argument("--port", type=int,
|
||||
default=default_port, help="Port number")
|
||||
parser.add_argument("--reload", action="store_true",
|
||||
help="Reload code on change")
|
||||
|
||||
config = parser.parse_args()
|
||||
print(f"to join a test room, visit http://localhost:{config.port}/start")
|
||||
uvicorn.run(
|
||||
"server:app",
|
||||
host=config.host,
|
||||
port=config.port,
|
||||
reload=config.reload,
|
||||
)
|
||||
109
examples/patient-intake/utils/daily_helpers.py
Normal file
109
examples/patient-intake/utils/daily_helpers.py
Normal file
@@ -0,0 +1,109 @@
|
||||
|
||||
import urllib.parse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
daily_api_path = os.getenv("DAILY_API_URL") or "api.daily.co/v1"
|
||||
daily_api_key = os.getenv("DAILY_API_KEY")
|
||||
|
||||
|
||||
def create_room() -> tuple[str, str]:
|
||||
"""
|
||||
Helper function to create a Daily room.
|
||||
# See: https://docs.daily.co/reference/rest-api/rooms
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing the room URL and room name.
|
||||
|
||||
Raises:
|
||||
Exception: If the request to create the room fails or if the response does not contain the room URL or room name.
|
||||
"""
|
||||
room_props = {
|
||||
"exp": time.time() + 60 * 60, # 1 hour
|
||||
"enable_chat": True,
|
||||
"enable_emoji_reactions": True,
|
||||
"eject_at_room_exp": True,
|
||||
"enable_prejoin_ui": False, # Important for the bot to be able to join headlessly
|
||||
}
|
||||
res = requests.post(
|
||||
f"https://{daily_api_path}/rooms",
|
||||
headers={"Authorization": f"Bearer {daily_api_key}"},
|
||||
json={
|
||||
"properties": room_props
|
||||
},
|
||||
)
|
||||
if res.status_code != 200:
|
||||
raise Exception(f"Unable to create room: {res.text}")
|
||||
|
||||
data = res.json()
|
||||
room_url: str = data.get("url")
|
||||
room_name: str = data.get("name")
|
||||
if room_url is None or room_name is None:
|
||||
raise Exception("Missing room URL or room name in response")
|
||||
|
||||
return room_url, room_name
|
||||
|
||||
|
||||
def get_name_from_url(room_url: str) -> str:
|
||||
"""
|
||||
Extracts the name from a given room URL.
|
||||
|
||||
Args:
|
||||
room_url (str): The URL of the room.
|
||||
|
||||
Returns:
|
||||
str: The extracted name from the room URL.
|
||||
"""
|
||||
return urllib.parse.urlparse(room_url).path[1:]
|
||||
|
||||
|
||||
def get_token(room_url: str) -> str:
|
||||
"""
|
||||
Retrieves a meeting token for the specified Daily room URL.
|
||||
# See: https://docs.daily.co/reference/rest-api/meeting-tokens
|
||||
|
||||
Args:
|
||||
room_url (str): The URL of the Daily room.
|
||||
|
||||
Returns:
|
||||
str: The meeting token.
|
||||
|
||||
Raises:
|
||||
Exception: If no room URL is specified or if no Daily API key is specified.
|
||||
Exception: If there is an error creating the meeting token.
|
||||
"""
|
||||
if not room_url:
|
||||
raise Exception(
|
||||
"No Daily room specified. You must specify a Daily room in order a token to be generated.")
|
||||
|
||||
if not daily_api_key:
|
||||
raise Exception(
|
||||
"No Daily API key specified. set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")
|
||||
|
||||
expiration: float = time.time() + 60 * 60
|
||||
room_name = get_name_from_url(room_url)
|
||||
|
||||
res: requests.Response = requests.post(
|
||||
f"https://{daily_api_path}/meeting-tokens",
|
||||
headers={
|
||||
"Authorization": f"Bearer {daily_api_key}"},
|
||||
json={
|
||||
"properties": {
|
||||
"room_name": room_name,
|
||||
"is_owner": True, # Owner tokens required for transcription
|
||||
"exp": expiration}},
|
||||
)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(
|
||||
f"Failed to create meeting token: {res.status_code} {res.text}")
|
||||
|
||||
token: str = res.json()["token"]
|
||||
|
||||
return token
|
||||
@@ -7,7 +7,7 @@ from PIL import Image
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
@@ -64,6 +64,8 @@ class TalkingAnimation(FrameProcessor):
|
||||
self._is_talking = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
if not self._is_talking:
|
||||
await self.push_frame(talking_frame)
|
||||
@@ -117,7 +119,7 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -149,7 +151,7 @@ async def main(room_url: str, token):
|
||||
assistant_response,
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, allow_interruptions=True)
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
await task.queue_frame(quiet_frame)
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
|
||||
@@ -37,6 +37,8 @@ Adds pictures to our story (really fast!) Prompting is quite key for style consi
|
||||
**Install requirements**
|
||||
|
||||
```shell
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@@ -52,7 +54,7 @@ This project uses a custom frontend, which needs to built. Note: this is done au
|
||||
|
||||
```shell
|
||||
cd frontend/
|
||||
npm install / yarn
|
||||
npm install
|
||||
npm run build
|
||||
```
|
||||
|
||||
@@ -68,12 +70,7 @@ If you'd like to run a custom domain or port:
|
||||
|
||||
`python src/server.py --host somehost --p 7777`
|
||||
|
||||
➡️ Open the host URL in your browser
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Whilst working on the frontend code, please `yarn run dev`
|
||||
> and open the NextJS hosted service vs. the Python server.
|
||||
> (Usually localhost:3000.)
|
||||
➡️ Open the host URL in your browser `http://localhost:7860`
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -899,11 +899,11 @@ brace-expansion@^2.0.1:
|
||||
balanced-match "^1.0.0"
|
||||
|
||||
braces@^3.0.2, braces@~3.0.2:
|
||||
version "3.0.2"
|
||||
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107"
|
||||
integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==
|
||||
version "3.0.3"
|
||||
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.3.tgz#490332f40919452272d55a8480adc0c441358789"
|
||||
integrity "sha1-SQMy9AkZRSJy1VqEgK3AxEE1h4k= sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA=="
|
||||
dependencies:
|
||||
fill-range "^7.0.1"
|
||||
fill-range "^7.1.1"
|
||||
|
||||
browserslist@^4.23.0:
|
||||
version "4.23.0"
|
||||
@@ -1551,10 +1551,10 @@ file-entry-cache@^6.0.1:
|
||||
dependencies:
|
||||
flat-cache "^3.0.4"
|
||||
|
||||
fill-range@^7.0.1:
|
||||
version "7.0.1"
|
||||
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40"
|
||||
integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==
|
||||
fill-range@^7.1.1:
|
||||
version "7.1.1"
|
||||
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.1.1.tgz#44265d3cac07e3ea7dc247516380643754a05292"
|
||||
integrity "sha1-RCZdPKwH4+p9wkdRY4BkN1SgUpI= sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg=="
|
||||
dependencies:
|
||||
to-regex-range "^5.0.1"
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ async def main(room_url, token=None):
|
||||
|
||||
llm_service = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo"
|
||||
model="gpt-4o"
|
||||
)
|
||||
|
||||
tts_service = ElevenLabsTTSService(
|
||||
|
||||
@@ -52,6 +52,8 @@ class StoryImageProcessor(FrameProcessor):
|
||||
self._fal_service = fal_service
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StoryImageFrame):
|
||||
try:
|
||||
async with timeout(7):
|
||||
@@ -86,6 +88,8 @@ class StoryProcessor(FrameProcessor):
|
||||
self._story = story
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Send an app message to the UI
|
||||
await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
|
||||
|
||||
@@ -40,6 +40,8 @@ class TranslationProcessor(FrameProcessor):
|
||||
self._language = language
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
context = [
|
||||
{
|
||||
@@ -65,6 +67,8 @@ class TranslationSubtitles(FrameProcessor):
|
||||
# subtitles.
|
||||
#
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
message = {
|
||||
"language": self._language,
|
||||
@@ -97,7 +101,8 @@ async def main(room_url: str, token):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4-turbo-preview"
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o"
|
||||
)
|
||||
|
||||
sa = SentenceAggregator()
|
||||
|
||||
161
examples/twilio-chatbot/.gitignore
vendored
Normal file
161
examples/twilio-chatbot/.gitignore
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
runpod.toml
|
||||
20
examples/twilio-chatbot/Dockerfile
Normal file
20
examples/twilio-chatbot/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-bullseye
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /twilio-chatbot
|
||||
|
||||
# Copy the requirements file into the container
|
||||
COPY requirements.txt .
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy the current directory contents into the container
|
||||
COPY . .
|
||||
|
||||
# Expose the desired port
|
||||
EXPOSE 8765
|
||||
|
||||
# Run the application
|
||||
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8765"]
|
||||
82
examples/twilio-chatbot/README.md
Normal file
82
examples/twilio-chatbot/README.md
Normal file
@@ -0,0 +1,82 @@
|
||||
# Twilio Chatbot
|
||||
|
||||
This project is a FastAPI-based chatbot that integrates with Twilio to handle WebSocket connections and provide real-time communication. The project includes endpoints for starting a call and handling WebSocket connections.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Requirements](#requirements)
|
||||
- [Installation](#installation)
|
||||
- [Configure Twilio URLs](#configure-twilio-urls)
|
||||
- [Running the Application](#running-the-application)
|
||||
- [Usage](#usage)
|
||||
|
||||
## Features
|
||||
|
||||
- **FastAPI**: A modern, fast (high-performance), web framework for building APIs with Python 3.6+.
|
||||
- **WebSocket Support**: Real-time communication using WebSockets.
|
||||
- **CORS Middleware**: Allowing cross-origin requests for testing.
|
||||
- **Dockerized**: Easily deployable using Docker.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.10
|
||||
- Docker (for containerized deployment)
|
||||
- ngrok (for tunneling)
|
||||
- Twilio Account
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Set up a virtual environment** (optional but recommended):
|
||||
```sh
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows, use `venv\Scripts\activate`
|
||||
```
|
||||
|
||||
2. **Install dependencies**:
|
||||
```sh
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. **Create .env**:
|
||||
create .env based on env.example
|
||||
|
||||
4. **Install ngrok**:
|
||||
Follow the instructions on the [ngrok website](https://ngrok.com/download) to download and install ngrok.
|
||||
|
||||
## Configure Twilio URLs
|
||||
|
||||
1. **Update the Twilio Webhook**:
|
||||
Copy the ngrok URL and update your Twilio phone number webhook URL to `http://<ngrok_url>/start_call`.
|
||||
|
||||
2. **Update the streams.xml**:
|
||||
Copy the ngrok URL and update templates/streams.xml with `wss://<ngrok_url>/ws`.
|
||||
|
||||
## Running the Application
|
||||
|
||||
### Using Python
|
||||
|
||||
1. **Run the FastAPI application**:
|
||||
```sh
|
||||
python server.py
|
||||
```
|
||||
|
||||
2. **Start ngrok**:
|
||||
In a new terminal, start ngrok to tunnel the local server:
|
||||
```sh
|
||||
ngrok http 8765
|
||||
```
|
||||
### Using Docker
|
||||
|
||||
1. **Build the Docker image**:
|
||||
```sh
|
||||
docker build -t twilio-chatbot .
|
||||
```
|
||||
|
||||
2. **Run the Docker container**:
|
||||
```sh
|
||||
docker run -it --rm -p 8765:8765 twilio-chatbot
|
||||
```
|
||||
## Usage
|
||||
|
||||
To start a call, simply make a call to your Twilio phone number. The webhook URL will direct the call to your FastAPI application, which will handle it accordingly.
|
||||
90
examples/twilio-chatbot/bot.py
Normal file
90
examples/twilio-chatbot/bot.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator
|
||||
)
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketTransport, FastAPIWebsocketParams
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.serializers.twilio import TwilioFrameSerializer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def run_bot(websocket_client, stream_sid):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket_client,
|
||||
params=FastAPIWebsocketParams(
|
||||
audio_out_enabled=True,
|
||||
add_wav_header=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True,
|
||||
serializer=TwilioFrameSerializer(stream_sid)
|
||||
)
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv('DEEPGRAM_API_KEY'))
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in an audio call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Websocket input from client
|
||||
stt, # Speech-To-Text
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # Text-To-Speech
|
||||
transport.output(), # Websocket output to client
|
||||
tma_out # LLM responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
await task.queue_frames([EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
|
||||
await runner.run(task)
|
||||
4
examples/twilio-chatbot/env.example
Normal file
4
examples/twilio-chatbot/env.example
Normal file
@@ -0,0 +1,4 @@
|
||||
OPENAI_API_KEY=
|
||||
DEEPGRAM_API_KEY=
|
||||
ELEVENLABS_API_KEY=
|
||||
ELEVENLABS_VOICE_ID=
|
||||
5
examples/twilio-chatbot/requirements.txt
Normal file
5
examples/twilio-chatbot/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
pipecat-ai[daily,openai,silero,deepgram]
|
||||
fastapi
|
||||
uvicorn
|
||||
python-dotenv
|
||||
loguru
|
||||
41
examples/twilio-chatbot/server.py
Normal file
41
examples/twilio-chatbot/server.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import json
|
||||
|
||||
import uvicorn
|
||||
|
||||
from fastapi import FastAPI, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from starlette.responses import HTMLResponse
|
||||
|
||||
from bot import run_bot
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # Allow all origins for testing
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.post('/start_call')
|
||||
async def start_call():
|
||||
print("POST TwiML")
|
||||
return HTMLResponse(content=open("templates/streams.xml").read(), media_type="application/xml")
|
||||
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
await websocket.accept()
|
||||
start_data = websocket.iter_text()
|
||||
await start_data.__anext__()
|
||||
call_data = json.loads(await start_data.__anext__())
|
||||
print(call_data, flush=True)
|
||||
stream_sid = call_data['start']['streamSid']
|
||||
print("WebSocket connection accepted")
|
||||
await run_bot(websocket, stream_sid)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run(app, host="0.0.0.0", port=8765)
|
||||
7
examples/twilio-chatbot/templates/streams.xml
Normal file
7
examples/twilio-chatbot/templates/streams.xml
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Connect>
|
||||
<Stream url="wss://<your server url>/ws"></Stream>
|
||||
</Connect>
|
||||
<Pause length="40"/>
|
||||
</Response>
|
||||
27
examples/websocket-server/README.md
Normal file
27
examples/websocket-server/README.md
Normal file
@@ -0,0 +1,27 @@
|
||||
# Websocket Server
|
||||
|
||||
This is an example that shows how to use `WebsocketServerTransport` to communicate with a web client.
|
||||
|
||||
## Get started
|
||||
|
||||
```python
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Run the bot
|
||||
|
||||
```bash
|
||||
python bot.py
|
||||
```
|
||||
|
||||
## Run the HTTP server
|
||||
|
||||
This will host the static web client:
|
||||
|
||||
```bash
|
||||
python -m http.server
|
||||
```
|
||||
|
||||
Then, visit `http://localhost:8000` in your browser to start a session.
|
||||
93
examples/websocket-server/bot.py
Normal file
93
examples/websocket-server/bot.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator
|
||||
)
|
||||
from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.network.websocket_server import WebsocketServerParams, WebsocketServerTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = WebsocketServerTransport(
|
||||
params=WebsocketServerParams(
|
||||
audio_out_enabled=True,
|
||||
add_wav_header=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True
|
||||
)
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Websocket input from client
|
||||
stt, # Speech-To-Text
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # Text-To-Speech
|
||||
transport.output(), # Websocket output to client
|
||||
tma_out # LLM responses
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
43
examples/websocket-server/frames.proto
Normal file
43
examples/websocket-server/frames.proto
Normal file
@@ -0,0 +1,43 @@
|
||||
//
|
||||
// Copyright (c) 2024, Daily
|
||||
//
|
||||
// SPDX-License-Identifier: BSD 2-Clause License
|
||||
//
|
||||
|
||||
// Generate frames_pb2.py with:
|
||||
//
|
||||
// python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package pipecat;
|
||||
|
||||
message TextFrame {
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
string text = 3;
|
||||
}
|
||||
|
||||
message AudioRawFrame {
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
bytes audio = 3;
|
||||
uint32 sample_rate = 4;
|
||||
uint32 num_channels = 5;
|
||||
}
|
||||
|
||||
message TranscriptionFrame {
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
string text = 3;
|
||||
string user_id = 4;
|
||||
string timestamp = 5;
|
||||
}
|
||||
|
||||
message Frame {
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioRawFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
}
|
||||
205
examples/websocket-server/index.html
Normal file
205
examples/websocket-server/index.html
Normal file
@@ -0,0 +1,205 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<script src="https://cdn.jsdelivr.net/npm/protobufjs@7.X.X/dist/protobuf.min.js"></script>
|
||||
<title>Pipecat WebSocket Client Example</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>Pipecat WebSocket Client Example</h1>
|
||||
<h3><div id="progressText">Loading, wait...</div></h2>
|
||||
<button id="startAudioBtn">Start Audio</button>
|
||||
<button id="stopAudioBtn">Stop Audio</button>
|
||||
<script>
|
||||
const SAMPLE_RATE = 16000;
|
||||
const NUM_CHANNELS = 1;
|
||||
const PLAY_TIME_RESET_THRESHOLD_MS = 1.0;
|
||||
|
||||
// The protobuf type. We will load it later.
|
||||
let Frame = null;
|
||||
|
||||
// The websocket connection.
|
||||
let ws = null;
|
||||
|
||||
// The audio context
|
||||
let audioContext = null;
|
||||
|
||||
// The audio context media stream source
|
||||
let source = null;
|
||||
|
||||
// The microphone stream from getUserMedia. SHould be sampled to the
|
||||
// proper sample rate.
|
||||
let microphoneStream = null;
|
||||
|
||||
// Script processor to get data from microphone.
|
||||
let scriptProcessor = null;
|
||||
|
||||
// AudioContext play time.
|
||||
let playTime = 0;
|
||||
|
||||
// Last time we received a websocket message.
|
||||
let lastMessageTime = 0;
|
||||
|
||||
// Whether we should be playing audio.
|
||||
let isPlaying = false;
|
||||
|
||||
let startBtn = document.getElementById('startAudioBtn');
|
||||
let stopBtn = document.getElementById('stopAudioBtn');
|
||||
|
||||
const proto = protobuf.load("frames.proto", (err, root) => {
|
||||
if (err) {
|
||||
throw err;
|
||||
}
|
||||
Frame = root.lookupType("pipecat.Frame");
|
||||
const progressText = document.getElementById("progressText");
|
||||
progressText.textContent = "We are ready! Make sure to run the server and then click `Start Audio`.";
|
||||
|
||||
startBtn.disabled = false;
|
||||
stopBtn.disabled = true;
|
||||
});
|
||||
|
||||
function initWebSocket() {
|
||||
ws = new WebSocket('ws://localhost:8765');
|
||||
|
||||
ws.addEventListener('open', () => console.log('WebSocket connection established.'));
|
||||
ws.addEventListener('message', handleWebSocketMessage);
|
||||
ws.addEventListener('close', (event) => {
|
||||
console.log("WebSocket connection closed.", event.code, event.reason);
|
||||
stopAudio(false);
|
||||
});
|
||||
ws.addEventListener('error', (event) => console.error('WebSocket error:', event));
|
||||
}
|
||||
|
||||
async function handleWebSocketMessage(event) {
|
||||
const arrayBuffer = await event.data.arrayBuffer();
|
||||
if (isPlaying) {
|
||||
enqueueAudioFromProto(arrayBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
function enqueueAudioFromProto(arrayBuffer) {
|
||||
const parsedFrame = Frame.decode(new Uint8Array(arrayBuffer));
|
||||
if (!parsedFrame?.audio) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reset play time if it's been a while we haven't played anything.
|
||||
const diffTime = audioContext.currentTime - lastMessageTime;
|
||||
if ((playTime == 0) || (diffTime > PLAY_TIME_RESET_THRESHOLD_MS)) {
|
||||
playTime = audioContext.currentTime;
|
||||
}
|
||||
lastMessageTime = audioContext.currentTime;
|
||||
|
||||
// We should be able to use parsedFrame.audio.audio.buffer but for
|
||||
// some reason that contains all the bytes from the protobuf message.
|
||||
const audioVector = Array.from(parsedFrame.audio.audio);
|
||||
const audioArray = new Uint8Array(audioVector);
|
||||
|
||||
audioContext.decodeAudioData(audioArray.buffer, function(buffer) {
|
||||
const source = new AudioBufferSourceNode(audioContext);
|
||||
source.buffer = buffer;
|
||||
source.start(playTime);
|
||||
source.connect(audioContext.destination);
|
||||
playTime = playTime + buffer.duration;
|
||||
});
|
||||
}
|
||||
|
||||
function convertFloat32ToS16PCM(float32Array) {
|
||||
let int16Array = new Int16Array(float32Array.length);
|
||||
|
||||
for (let i = 0; i < float32Array.length; i++) {
|
||||
let clampedValue = Math.max(-1, Math.min(1, float32Array[i]));
|
||||
int16Array[i] = clampedValue < 0 ? clampedValue * 32768 : clampedValue * 32767;
|
||||
}
|
||||
return int16Array;
|
||||
}
|
||||
|
||||
function startAudioBtnHandler() {
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
alert('getUserMedia is not supported in your browser.');
|
||||
return;
|
||||
}
|
||||
|
||||
startBtn.disabled = true;
|
||||
stopBtn.disabled = false;
|
||||
|
||||
audioContext = new (window.AudioContext || window.webkitAudioContext)({
|
||||
latencyHint: "interactive",
|
||||
sampleRate: SAMPLE_RATE
|
||||
});
|
||||
|
||||
isPlaying = true;
|
||||
|
||||
initWebSocket();
|
||||
|
||||
navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
sampleRate: SAMPLE_RATE,
|
||||
channelCount: NUM_CHANNELS,
|
||||
autoGainControl: true,
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
}
|
||||
}).then((stream) => {
|
||||
microphoneStream = stream;
|
||||
// 512 is closest thing to 200ms.
|
||||
scriptProcessor = audioContext.createScriptProcessor(512, 1, 1);
|
||||
source = audioContext.createMediaStreamSource(stream);
|
||||
source.connect(scriptProcessor);
|
||||
scriptProcessor.connect(audioContext.destination);
|
||||
|
||||
scriptProcessor.onaudioprocess = (event) => {
|
||||
if (!ws) {
|
||||
return;
|
||||
}
|
||||
|
||||
const audioData = event.inputBuffer.getChannelData(0);
|
||||
const pcmS16Array = convertFloat32ToS16PCM(audioData);
|
||||
const pcmByteArray = new Uint8Array(pcmS16Array.buffer);
|
||||
const frame = Frame.create({
|
||||
audio: {
|
||||
audio: Array.from(pcmByteArray),
|
||||
sampleRate: SAMPLE_RATE,
|
||||
numChannels: NUM_CHANNELS
|
||||
}
|
||||
});
|
||||
const encodedFrame = new Uint8Array(Frame.encode(frame).finish());
|
||||
ws.send(encodedFrame);
|
||||
};
|
||||
}).catch((error) => console.error('Error accessing microphone:', error));
|
||||
}
|
||||
|
||||
function stopAudio(closeWebsocket) {
|
||||
playTime = 0;
|
||||
isPlaying = false;
|
||||
startBtn.disabled = false;
|
||||
stopBtn.disabled = true;
|
||||
|
||||
if (ws && closeWebsocket) {
|
||||
ws.close();
|
||||
ws = null;
|
||||
}
|
||||
|
||||
if (scriptProcessor) {
|
||||
scriptProcessor.disconnect();
|
||||
}
|
||||
if (source) {
|
||||
source.disconnect();
|
||||
}
|
||||
}
|
||||
|
||||
function stopAudioBtnHandler() {
|
||||
stopAudio(true);
|
||||
}
|
||||
|
||||
startBtn.addEventListener('click', startAudioBtnHandler);
|
||||
stopBtn.addEventListener('click', stopAudioBtnHandler);
|
||||
startBtn.disabled = true;
|
||||
stopBtn.disabled = true;
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
2
examples/websocket-server/requirements.txt
Normal file
2
examples/websocket-server/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
python-dotenv
|
||||
pipecat-ai[openai,silero,websocket,whisper]
|
||||
@@ -4,24 +4,39 @@
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiofiles==24.1.0
|
||||
# via deepgram-sdk
|
||||
aiohttp==3.9.5
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# langchain
|
||||
# langchain-community
|
||||
# pipecat-ai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
annotated-types==0.6.0
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anthropic==0.25.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
# via
|
||||
# openpipe
|
||||
# pipecat-ai (pyproject.toml)
|
||||
anyio==4.4.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
# openai
|
||||
# starlette
|
||||
# watchfiles
|
||||
async-timeout==4.0.3
|
||||
# via aiohttp
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
av==12.0.0
|
||||
# via
|
||||
# aiohttp
|
||||
# openpipe
|
||||
av==12.1.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.37.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
@@ -29,7 +44,9 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
certifi==2024.2.2
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
@@ -37,26 +54,45 @@ certifi==2024.2.2
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via flask
|
||||
# via
|
||||
# flask
|
||||
# typer
|
||||
# uvicorn
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.3.0
|
||||
ctranslate2==4.3.1
|
||||
# via faster-whisper
|
||||
daily-python==0.7.4
|
||||
daily-python==0.10.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
# deepgram-sdk
|
||||
# langchain-community
|
||||
deepgram-sdk==3.2.7
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
dnspython==2.6.1
|
||||
# via email-validator
|
||||
einops==0.8.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
email-validator==2.2.0
|
||||
# via fastapi
|
||||
exceptiongroup==1.2.1
|
||||
# via anyio
|
||||
# via
|
||||
# anyio
|
||||
# pytest
|
||||
fal-client==0.4.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
fastapi==0.111.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
fastapi-cli==0.0.4
|
||||
# via fastapi
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.14.0
|
||||
filelock==3.15.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
@@ -75,20 +111,22 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.5.0
|
||||
fsspec==2024.6.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
google-api-core[grpc]==2.19.1
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
# google-generativeai
|
||||
google-api-python-client==2.129.0
|
||||
google-api-python-client==2.134.0
|
||||
# via google-generativeai
|
||||
google-auth==2.29.0
|
||||
google-auth==2.30.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
@@ -99,11 +137,13 @@ google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.0
|
||||
googleapis-common-protos==1.63.2
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.63.0
|
||||
greenlet==3.0.3
|
||||
# via sqlalchemy
|
||||
grpcio==1.64.1
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
@@ -111,21 +151,29 @@ grpcio==1.63.0
|
||||
grpcio-status==1.62.2
|
||||
# via google-api-core
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
# via
|
||||
# httpcore
|
||||
# uvicorn
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
httptools==0.6.1
|
||||
# via uvicorn
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# fal-client
|
||||
# fastapi
|
||||
# openai
|
||||
# openpipe
|
||||
httpx-sse==0.4.0
|
||||
# via fal-client
|
||||
huggingface-hub==0.23.0
|
||||
huggingface-hub==0.23.4
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
@@ -136,34 +184,75 @@ humanfriendly==10.0
|
||||
idna==3.7
|
||||
# via
|
||||
# anyio
|
||||
# email-validator
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
itsdangerous==2.2.0
|
||||
# via flask
|
||||
jinja2==3.1.4
|
||||
# via
|
||||
# fastapi
|
||||
# flask
|
||||
# torch
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.5
|
||||
# via
|
||||
# langchain-community
|
||||
# pipecat-ai (pyproject.toml)
|
||||
langchain-community==0.2.5
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-core==0.2.9
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-openai
|
||||
# langchain-text-splitters
|
||||
langchain-openai==0.1.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-text-splitters==0.2.1
|
||||
# via langchain
|
||||
langsmith==0.1.82
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
loguru==0.7.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
# werkzeug
|
||||
marshmallow==3.21.3
|
||||
# via dataclasses-json
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
mypy-extensions==1.0.0
|
||||
# via typing-inspect
|
||||
networkx==3.3
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# langchain
|
||||
# langchain-community
|
||||
# onnxruntime
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pyloudnorm
|
||||
# scipy
|
||||
# torchvision
|
||||
# transformers
|
||||
nvidia-cublas-cu12==12.1.3.1
|
||||
@@ -191,7 +280,7 @@ nvidia-cusparse-cu12==12.1.0.106
|
||||
# torch
|
||||
nvidia-nccl-cu12==2.20.5
|
||||
# via torch
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
nvidia-nvjitlink-cu12==12.5.40
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
@@ -200,17 +289,31 @@ nvidia-nvtx-cu12==12.1.105
|
||||
onnxruntime==1.18.0
|
||||
# via faster-whisper
|
||||
openai==1.26.0
|
||||
# via
|
||||
# langchain-openai
|
||||
# openpipe
|
||||
# pipecat-ai (pyproject.toml)
|
||||
openpipe==4.14.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
packaging==24.0
|
||||
orjson==3.10.5
|
||||
# via
|
||||
# fastapi
|
||||
# langsmith
|
||||
packaging==24.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
# onnxruntime
|
||||
# pytest
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# torchvision
|
||||
proto-plus==1.23.0
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
proto-plus==1.24.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
@@ -222,6 +325,7 @@ protobuf==4.25.3
|
||||
# googleapis-common-protos
|
||||
# grpcio-status
|
||||
# onnxruntime
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# proto-plus
|
||||
# pyht
|
||||
pyasn1==0.6.0
|
||||
@@ -232,49 +336,99 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pydantic==2.7.1
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# anthropic
|
||||
# fastapi
|
||||
# google-generativeai
|
||||
# langchain
|
||||
# langchain-core
|
||||
# langsmith
|
||||
# openai
|
||||
pydantic-core==2.18.2
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyht==0.0.28
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyloudnorm==0.1.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyparsing==3.1.2
|
||||
# via httplib2
|
||||
pytest==8.2.2
|
||||
# via pytest-asyncio
|
||||
pytest-asyncio==0.23.7
|
||||
# via cartesia
|
||||
python-dateutil==2.9.0.post0
|
||||
# via openpipe
|
||||
python-dotenv==1.0.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# uvicorn
|
||||
python-multipart==0.0.9
|
||||
# via fastapi
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
# timm
|
||||
# transformers
|
||||
# uvicorn
|
||||
regex==2024.5.15
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
# via
|
||||
# tiktoken
|
||||
# transformers
|
||||
requests==2.32.3
|
||||
# via
|
||||
# cartesia
|
||||
# google-api-core
|
||||
# huggingface-hub
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langsmith
|
||||
# pyht
|
||||
# tiktoken
|
||||
# transformers
|
||||
rich==13.7.1
|
||||
# via typer
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.14.0
|
||||
# via pyloudnorm
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
sympy==1.12
|
||||
sqlalchemy==2.0.31
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
starlette==0.37.2
|
||||
# via fastapi
|
||||
sympy==1.12.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tenacity==8.4.2
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
tiktoken==0.7.0
|
||||
# via langchain-openai
|
||||
timm==0.9.16
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
tokenizers==0.19.1
|
||||
@@ -282,15 +436,17 @@ tokenizers==0.19.1
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
# transformers
|
||||
torch==2.3.0
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
torch==2.3.1
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
# torchvision
|
||||
torchaudio==2.3.0
|
||||
torchaudio==2.3.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
torchvision==0.18.0
|
||||
torchvision==0.18.1
|
||||
# via timm
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
@@ -300,25 +456,49 @@ tqdm==4.66.4
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
triton==2.3.0
|
||||
triton==2.3.1
|
||||
# via torch
|
||||
typing-extensions==4.11.0
|
||||
typer==0.12.3
|
||||
# via fastapi-cli
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# deepgram-sdk
|
||||
# fastapi
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspect
|
||||
# uvicorn
|
||||
typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
ujson==5.10.0
|
||||
# via fastapi
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==2.2.1
|
||||
urllib3==2.2.2
|
||||
# via requests
|
||||
uvicorn[standard]==0.30.1
|
||||
# via fastapi
|
||||
uvloop==0.19.0
|
||||
# via uvicorn
|
||||
verboselogs==1.7
|
||||
# via deepgram-sdk
|
||||
watchfiles==0.22.0
|
||||
# via uvicorn
|
||||
websockets==12.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# uvicorn
|
||||
werkzeug==3.0.3
|
||||
# via flask
|
||||
yarl==1.9.4
|
||||
|
||||
@@ -4,24 +4,39 @@
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiofiles==24.1.0
|
||||
# via deepgram-sdk
|
||||
aiohttp==3.9.5
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# langchain
|
||||
# langchain-community
|
||||
# pipecat-ai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
annotated-types==0.6.0
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anthropic==0.25.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
# via
|
||||
# openpipe
|
||||
# pipecat-ai (pyproject.toml)
|
||||
anyio==4.4.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
# openai
|
||||
# starlette
|
||||
# watchfiles
|
||||
async-timeout==4.0.3
|
||||
# via aiohttp
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
av==12.0.0
|
||||
# via
|
||||
# aiohttp
|
||||
# openpipe
|
||||
av==12.1.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.37.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
@@ -29,7 +44,9 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
certifi==2024.2.2
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
@@ -37,26 +54,45 @@ certifi==2024.2.2
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via flask
|
||||
# via
|
||||
# flask
|
||||
# typer
|
||||
# uvicorn
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.2.1
|
||||
ctranslate2==4.3.1
|
||||
# via faster-whisper
|
||||
daily-python==0.7.4
|
||||
daily-python==0.10.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
# deepgram-sdk
|
||||
# langchain-community
|
||||
deepgram-sdk==3.2.7
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
dnspython==2.6.1
|
||||
# via email-validator
|
||||
einops==0.8.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
email-validator==2.2.0
|
||||
# via fastapi
|
||||
exceptiongroup==1.2.1
|
||||
# via anyio
|
||||
# via
|
||||
# anyio
|
||||
# pytest
|
||||
fal-client==0.4.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
fastapi==0.111.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
fastapi-cli==0.0.4
|
||||
# via fastapi
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.14.0
|
||||
filelock==3.15.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
@@ -74,20 +110,22 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.5.0
|
||||
fsspec==2024.6.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-ai-generativelanguage==0.6.3
|
||||
future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
google-api-core[grpc]==2.19.1
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
# google-generativeai
|
||||
google-api-python-client==2.129.0
|
||||
google-api-python-client==2.134.0
|
||||
# via google-generativeai
|
||||
google-auth==2.29.0
|
||||
google-auth==2.30.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
@@ -96,13 +134,13 @@ google-auth==2.29.0
|
||||
# google-generativeai
|
||||
google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.3
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.0
|
||||
googleapis-common-protos==1.63.2
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.63.0
|
||||
grpcio==1.64.1
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
@@ -110,21 +148,29 @@ grpcio==1.63.0
|
||||
grpcio-status==1.62.2
|
||||
# via google-api-core
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
# via
|
||||
# httpcore
|
||||
# uvicorn
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
httptools==0.6.1
|
||||
# via uvicorn
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# fal-client
|
||||
# fastapi
|
||||
# openai
|
||||
# openpipe
|
||||
httpx-sse==0.4.0
|
||||
# via fal-client
|
||||
huggingface-hub==0.23.0
|
||||
huggingface-hub==0.23.4
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
@@ -135,50 +181,105 @@ humanfriendly==10.0
|
||||
idna==3.7
|
||||
# via
|
||||
# anyio
|
||||
# email-validator
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
itsdangerous==2.2.0
|
||||
# via flask
|
||||
jinja2==3.1.4
|
||||
# via
|
||||
# fastapi
|
||||
# flask
|
||||
# torch
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.5
|
||||
# via
|
||||
# langchain-community
|
||||
# pipecat-ai (pyproject.toml)
|
||||
langchain-community==0.2.5
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-core==0.2.9
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-openai
|
||||
# langchain-text-splitters
|
||||
langchain-openai==0.1.10
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-text-splitters==0.2.1
|
||||
# via langchain
|
||||
langsmith==0.1.82
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
loguru==0.7.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
# werkzeug
|
||||
marshmallow==3.21.3
|
||||
# via dataclasses-json
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
mypy-extensions==1.0.0
|
||||
# via typing-inspect
|
||||
networkx==3.3
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# langchain
|
||||
# langchain-community
|
||||
# onnxruntime
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pyloudnorm
|
||||
# scipy
|
||||
# torchvision
|
||||
# transformers
|
||||
onnxruntime==1.17.3
|
||||
onnxruntime==1.18.0
|
||||
# via faster-whisper
|
||||
openai==1.26.0
|
||||
# via
|
||||
# langchain-openai
|
||||
# openpipe
|
||||
# pipecat-ai (pyproject.toml)
|
||||
openpipe==4.14.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
packaging==24.0
|
||||
orjson==3.10.5
|
||||
# via
|
||||
# fastapi
|
||||
# langsmith
|
||||
packaging==24.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
# onnxruntime
|
||||
# pytest
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# torchvision
|
||||
proto-plus==1.23.0
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
proto-plus==1.24.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
@@ -190,6 +291,7 @@ protobuf==4.25.3
|
||||
# googleapis-common-protos
|
||||
# grpcio-status
|
||||
# onnxruntime
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# proto-plus
|
||||
# pyht
|
||||
pyasn1==0.6.0
|
||||
@@ -200,49 +302,99 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pydantic==2.7.1
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# anthropic
|
||||
# fastapi
|
||||
# google-generativeai
|
||||
# langchain
|
||||
# langchain-core
|
||||
# langsmith
|
||||
# openai
|
||||
pydantic-core==2.18.2
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyht==0.0.28
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyloudnorm==0.1.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyparsing==3.1.2
|
||||
# via httplib2
|
||||
pytest==8.2.2
|
||||
# via pytest-asyncio
|
||||
pytest-asyncio==0.23.7
|
||||
# via cartesia
|
||||
python-dateutil==2.9.0.post0
|
||||
# via openpipe
|
||||
python-dotenv==1.0.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# uvicorn
|
||||
python-multipart==0.0.9
|
||||
# via fastapi
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
# timm
|
||||
# transformers
|
||||
# uvicorn
|
||||
regex==2024.5.15
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
# via
|
||||
# tiktoken
|
||||
# transformers
|
||||
requests==2.32.3
|
||||
# via
|
||||
# cartesia
|
||||
# google-api-core
|
||||
# huggingface-hub
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langsmith
|
||||
# pyht
|
||||
# tiktoken
|
||||
# transformers
|
||||
rich==13.7.1
|
||||
# via typer
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.14.0
|
||||
# via pyloudnorm
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
sympy==1.12
|
||||
sqlalchemy==2.0.31
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
starlette==0.37.2
|
||||
# via fastapi
|
||||
sympy==1.12.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tenacity==8.4.2
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
tiktoken==0.7.0
|
||||
# via langchain-openai
|
||||
timm==0.9.16
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
tokenizers==0.19.1
|
||||
@@ -250,15 +402,17 @@ tokenizers==0.19.1
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
# transformers
|
||||
torch==2.3.0
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
torch==2.3.1
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
# torchvision
|
||||
torchaudio==2.3.0
|
||||
torchaudio==2.3.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
torchvision==0.18.0
|
||||
torchvision==0.18.1
|
||||
# via timm
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
@@ -268,23 +422,47 @@ tqdm==4.66.4
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
typing-extensions==4.11.0
|
||||
typer==0.12.3
|
||||
# via fastapi-cli
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# deepgram-sdk
|
||||
# fastapi
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspect
|
||||
# uvicorn
|
||||
typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
ujson==5.10.0
|
||||
# via fastapi
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==2.2.1
|
||||
urllib3==2.2.2
|
||||
# via requests
|
||||
uvicorn[standard]==0.30.1
|
||||
# via fastapi
|
||||
uvloop==0.19.0
|
||||
# via uvicorn
|
||||
verboselogs==1.7
|
||||
# via deepgram-sdk
|
||||
watchfiles==0.22.0
|
||||
# via uvicorn
|
||||
websockets==12.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# deepgram-sdk
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# uvicorn
|
||||
werkzeug==3.0.3
|
||||
# via flask
|
||||
yarl==1.9.4
|
||||
|
||||
@@ -24,7 +24,9 @@ dependencies = [
|
||||
"numpy~=1.26.4",
|
||||
"loguru~=0.7.0",
|
||||
"Pillow~=10.3.0",
|
||||
"typing-extensions~=4.11.0",
|
||||
"protobuf~=4.25.3",
|
||||
"pyloudnorm~=0.1.1",
|
||||
"typing-extensions~=4.12.1",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -34,17 +36,21 @@ Website = "https://pipecat.ai"
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic~=0.25.7" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
|
||||
daily = [ "daily-python~=0.7.4" ]
|
||||
cartesia = [ "cartesia~=1.0.0" ]
|
||||
daily = [ "daily-python~=0.10.1" ]
|
||||
deepgram = [ "deepgram-sdk~=3.2.7" ]
|
||||
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
|
||||
fal = [ "fal-client~=0.4.0" ]
|
||||
google = [ "google-generativeai~=0.5.3" ]
|
||||
fireworks = [ "openai~=1.26.0" ]
|
||||
langchain = [ "langchain~=0.2.1", "langchain-community~=0.2.1", "langchain-openai~=0.1.8" ]
|
||||
local = [ "pyaudio~=0.2.0" ]
|
||||
moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ]
|
||||
openai = [ "openai~=1.26.0" ]
|
||||
openpipe = [ "openpipe~=4.14.0" ]
|
||||
playht = [ "pyht~=0.0.28" ]
|
||||
silero = [ "torch~=2.3.0", "torchaudio~=2.3.0" ]
|
||||
websocket = [ "websockets~=12.0" ]
|
||||
websocket = [ "websockets~=12.0", "fastapi~=0.111.0" ]
|
||||
whisper = [ "faster-whisper~=1.0.2" ]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
||||
@@ -4,28 +4,40 @@
|
||||
// SPDX-License-Identifier: BSD 2-Clause License
|
||||
//
|
||||
|
||||
// Generate frames_pb2.py with:
|
||||
//
|
||||
// python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package pipecat_proto;
|
||||
package pipecat;
|
||||
|
||||
message TextFrame {
|
||||
string text = 1;
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
string text = 3;
|
||||
}
|
||||
|
||||
message AudioFrame {
|
||||
bytes data = 1;
|
||||
message AudioRawFrame {
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
bytes audio = 3;
|
||||
uint32 sample_rate = 4;
|
||||
uint32 num_channels = 5;
|
||||
}
|
||||
|
||||
message TranscriptionFrame {
|
||||
string text = 1;
|
||||
string participantId = 2;
|
||||
string timestamp = 3;
|
||||
uint64 id = 1;
|
||||
string name = 2;
|
||||
string text = 3;
|
||||
string user_id = 4;
|
||||
string timestamp = 5;
|
||||
}
|
||||
|
||||
message Frame {
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
oneof frame {
|
||||
TextFrame text = 1;
|
||||
AudioRawFrame audio = 2;
|
||||
TranscriptionFrame transcription = 3;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from typing import Any, List, Tuple
|
||||
from typing import Any, List, Mapping, Tuple
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
@@ -188,6 +188,8 @@ class SystemFrame(Frame):
|
||||
class StartFrame(SystemFrame):
|
||||
"""This is the first frame that should be pushed down a pipeline."""
|
||||
allow_interruptions: bool = False
|
||||
enable_metrics: bool = False
|
||||
report_only_initial_ttfb: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -238,6 +240,13 @@ class StopInterruptionFrame(SystemFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class MetricsFrame(SystemFrame):
|
||||
"""Emitted by processor that can compute metrics like latencies.
|
||||
"""
|
||||
ttfb: Mapping[str, float]
|
||||
|
||||
|
||||
#
|
||||
# Control frames
|
||||
#
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user