Compare commits
170 Commits
v0.0.10
...
cb/test-cl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ff1b2961d8 | ||
|
|
ba42cffcc2 | ||
|
|
9778d86607 | ||
|
|
19caf750fd | ||
|
|
296611714f | ||
|
|
4c3d19cc8b | ||
|
|
a3ba07c7a3 | ||
|
|
a1579808b2 | ||
|
|
aecb9f5816 | ||
|
|
a5d42a526c | ||
|
|
a9472f8116 | ||
|
|
d5f106ae19 | ||
|
|
920745345a | ||
|
|
c444004eec | ||
|
|
72cf7896d7 | ||
|
|
31af5f8177 | ||
|
|
6a68d9a57e | ||
|
|
39f41ab25e | ||
|
|
624cc1e987 | ||
|
|
08a15e5cdd | ||
|
|
4cd4787e4d | ||
|
|
65afee2808 | ||
|
|
00ece864ec | ||
|
|
6d6d9bea5a | ||
|
|
7c213f8533 | ||
|
|
3685c19b2d | ||
|
|
650a2b4da4 | ||
|
|
afea6f38f6 | ||
|
|
c45d428551 | ||
|
|
4e594aa9b0 | ||
|
|
32f91c5f31 | ||
|
|
a32ece897a | ||
|
|
88f6436aaa | ||
|
|
fac43cea06 | ||
|
|
a9e6aeed54 | ||
|
|
fa9f49f5bb | ||
|
|
2a6183aba5 | ||
|
|
b1a622971b | ||
|
|
5b72faccb4 | ||
|
|
c8732544c7 | ||
|
|
d4219b16b8 | ||
|
|
0c33432f64 | ||
|
|
95bd58cced | ||
|
|
8d7d1a7e24 | ||
|
|
3768cb2f2c | ||
|
|
d4b2741608 | ||
|
|
aef2152dcc | ||
|
|
d0b0221b97 | ||
|
|
b4758cd989 | ||
|
|
681250f114 | ||
|
|
fd13d3c50e | ||
|
|
674b8bb0cd | ||
|
|
5d9a962146 | ||
|
|
e130aada72 | ||
|
|
76709a9a39 | ||
|
|
acd2d55b84 | ||
|
|
fcec0eb812 | ||
|
|
e9965347b5 | ||
|
|
5a83f75e0d | ||
|
|
91c706a201 | ||
|
|
34384881bc | ||
|
|
71ba28753e | ||
|
|
32d2f0db66 | ||
|
|
e1169a4e82 | ||
|
|
0e5711e62d | ||
|
|
0ddfa3de5b | ||
|
|
661aa79b7c | ||
|
|
2c32cc2f27 | ||
|
|
d7bb0bc5cb | ||
|
|
d5644c3ab9 | ||
|
|
09ab8e3efd | ||
|
|
2f683529ec | ||
|
|
6ac012a82b | ||
|
|
075194cb54 | ||
|
|
269f070051 | ||
|
|
3342c9d7c2 | ||
|
|
b468b2f926 | ||
|
|
af1c7d0023 | ||
|
|
34670eef79 | ||
|
|
979739c1b7 | ||
|
|
83ed6870b9 | ||
|
|
57a568986a | ||
|
|
e828e26b5b | ||
|
|
825738440e | ||
|
|
147bd1a075 | ||
|
|
209e97f372 | ||
|
|
47f8627432 | ||
|
|
cc6713837a | ||
|
|
728fe0ad88 | ||
|
|
dbba45349f | ||
|
|
40ccf46b4b | ||
|
|
077bb9f20a | ||
|
|
e4c990c677 | ||
|
|
1c8b9d813a | ||
|
|
83812f2671 | ||
|
|
4053c33899 | ||
|
|
03978b63bc | ||
|
|
bf036be6b8 | ||
|
|
7ffb10d7f5 | ||
|
|
66377954cb | ||
|
|
e507686cef | ||
|
|
e5ddaf14f4 | ||
|
|
cf597a2f6b | ||
|
|
d83f0aabca | ||
|
|
b337e984b3 | ||
|
|
6366ee072e | ||
|
|
c3bfcbd562 | ||
|
|
c0d5054798 | ||
|
|
810dc30d3d | ||
|
|
36dd4933e9 | ||
|
|
435fffe1b0 | ||
|
|
2b8f1c4cda | ||
|
|
0e8c7a9b28 | ||
|
|
3e13678f23 | ||
|
|
455ec4f1fd | ||
|
|
8dc81042c3 | ||
|
|
c77db79447 | ||
|
|
de65028061 | ||
|
|
d66a795413 | ||
|
|
34762bf604 | ||
|
|
57121338b1 | ||
|
|
a5d246ec0c | ||
|
|
f2cefeeedc | ||
|
|
537e72a05f | ||
|
|
efa5a061d7 | ||
|
|
0bef44c2ff | ||
|
|
f62fe059b1 | ||
|
|
f432e2b17e | ||
|
|
8c877d7d8e | ||
|
|
dc9377fb92 | ||
|
|
7384b63b1d | ||
|
|
ba6ecf541f | ||
|
|
94e5709d58 | ||
|
|
add8d3cbaf | ||
|
|
1a42188bce | ||
|
|
0da427e127 | ||
|
|
9447b32f3e | ||
|
|
af10adb7fe | ||
|
|
129acf886f | ||
|
|
9af3e1efac | ||
|
|
9e22a8b4ff | ||
|
|
28da747f19 | ||
|
|
3d6783ddb0 | ||
|
|
349fc526d7 | ||
|
|
acf6dc0a30 | ||
|
|
3563e66ff6 | ||
|
|
8965ff27ec | ||
|
|
86feb1e104 | ||
|
|
f6257a86d3 | ||
|
|
bd04ea8aca | ||
|
|
754c1c6775 | ||
|
|
0b01eb5a11 | ||
|
|
6247b9df39 | ||
|
|
bd5344c892 | ||
|
|
e4fe54cd7f | ||
|
|
97f9e9b042 | ||
|
|
3668eb1606 | ||
|
|
e23addcc02 | ||
|
|
5147f4086e | ||
|
|
fb3c2de83f | ||
|
|
107817317c | ||
|
|
663ff3417c | ||
|
|
2b19d6bbac | ||
|
|
7c41246e55 | ||
|
|
11aa9dc803 | ||
|
|
922cdefee5 | ||
|
|
e018d5b47a | ||
|
|
20c679988c | ||
|
|
a344101cff | ||
|
|
2cefc40a77 |
2
.github/workflows/publish_test.yaml
vendored
2
.github/workflows/publish_test.yaml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
|
||||
403
CHANGELOG.md
Normal file
403
CHANGELOG.md
Normal file
@@ -0,0 +1,403 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to **pipecat** will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Added Cartesia TTS support (https://cartesia.ai/)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed SileroVAD frame processor.
|
||||
|
||||
- Fixed an issue where `camera_out_enabled` would cause high CPU usage if
|
||||
no image was provided.
|
||||
|
||||
|
||||
## [0.0.24] - 2024-05-29
|
||||
|
||||
### Added
|
||||
|
||||
- Exposed `on_dialin_ready` for Daily transport SIP endpoint handling. This
|
||||
notifies when the Daily room SIP endpoints are ready. This allows integrating
|
||||
with third-party services like Twilio.
|
||||
|
||||
- Exposed Daily transport `on_app_message` event.
|
||||
|
||||
- Added Daily transport `on_call_state_updated` event.
|
||||
|
||||
- Added Daily transport `start_recording()`, `stop_recording` and
|
||||
`stop_dialout`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Added `PipelineParams`. This replaces the `allow_interruptions` argument in
|
||||
`PipelineTask` and will allow additional parameters in the future.
|
||||
|
||||
- Fixed Deepgram Aura TTS base_url and added ErrorFrame reporting.
|
||||
|
||||
- GoogleLLMService `api_key` argument is now mandatory.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Daily transport `dialin-ready` doesn't block anymore and it now handles
|
||||
timeouts.
|
||||
|
||||
- Fixed AzureLLMService.
|
||||
|
||||
## [0.0.23] - 2024-05-23
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue handling Daily transport `dialin-ready` event.
|
||||
|
||||
## [0.0.22] - 2024-05-23
|
||||
|
||||
### Added
|
||||
|
||||
- Added Daily transport `start_dialout()` to be able to make phone or SIP calls.
|
||||
See https://reference-python.daily.co/api_reference.html#daily.CallClient.start_dialout
|
||||
|
||||
- Added Daily transport support for dial-in use cases.
|
||||
|
||||
- Added Daily transport events: `on_dialout_connected`, `on_dialout_stopped`,
|
||||
`on_dialout_error` and `on_dialout_warning`. See
|
||||
https://reference-python.daily.co/api_reference.html#daily.EventHandler
|
||||
|
||||
## [0.0.21] - 2024-05-22
|
||||
|
||||
### Added
|
||||
|
||||
- Added vision support to Anthropic service.
|
||||
|
||||
- Added `WakeCheckFilter` which allows you to pass information downstream only
|
||||
if you say a certain phrase/word.
|
||||
|
||||
### Changed
|
||||
|
||||
- `Filter` has been renamed to `FrameFilter` and it's now under
|
||||
`processors/filters`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed Anthropic service to use new frame types.
|
||||
|
||||
- Fixed an issue in `LLMUserResponseAggregator` and `UserResponseAggregator`
|
||||
that would cause frames after a brief pause to not be pushed to the LLM.
|
||||
|
||||
- Clear the audio output buffer if we are interrupted.
|
||||
|
||||
- Re-add exponential smoothing after volume calculation. This makes sure the
|
||||
volume value being used doesn't fluctuate so much.
|
||||
|
||||
## [0.0.20] - 2024-05-22
|
||||
|
||||
### Added
|
||||
|
||||
- In order to improve interruptions we now compute a loudness level using
|
||||
[pyloudnorm](https://github.com/csteinmetz1/pyloudnorm). The audio coming
|
||||
from WebRTC transports (e.g. Daily) has an Automatic Gain Control (AGC) algorithm
|
||||
applied to the signal, however we don't do that on our local PyAudio
|
||||
signals. This means that currently incoming audio from PyAudio is kind of
|
||||
broken. We will fix it in future releases.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `StartInterruptionFrame` would cause
|
||||
`LLMUserResponseAggregator` to push the accumulated text causing the LLM
|
||||
to respond in the wrong task. The `StartInterruptionFrame` should not trigger any
|
||||
new LLM response because that would be spoken in a different task.
|
||||
|
||||
- Fixed an issue where tasks and threads could be paused because the executor
|
||||
didn't have more tasks available. This was causing issues when cancelling and
|
||||
recreating tasks during interruptions.
|
||||
|
||||
## [0.0.19] - 2024-05-20
|
||||
|
||||
### Changed
|
||||
|
||||
- `LLMUserResponseAggregator` and `LLMAssistantResponseAggregator` internal
|
||||
messages are now exposed through the `messages` property.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `LLMAssistantResponseAggregator` was not accumulating the
|
||||
full response but short sentences instead. If there's an interruption we only
|
||||
accumulate what the bot has spoken until now in a long response as well.
|
||||
|
||||
## [0.0.18] - 2024-05-20
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `DailyOutputTransport` where transport messages were not
|
||||
being sent.
|
||||
|
||||
## [0.0.17] - 2024-05-19
|
||||
|
||||
### Added
|
||||
|
||||
- Added `google.generativeai` model support, including vision. This new `google`
|
||||
service defaults to using `gemini-1.5-flash-latest`. Example in
|
||||
`examples/foundational/12a-describe-video-gemini-flash.py`.
|
||||
|
||||
- Added vision support to `openai` service. Example in
|
||||
`examples/foundational/12a-describe-video-gemini-flash.py`.
|
||||
|
||||
- Added initial interruptions support. The assistant contexts (or aggregators)
|
||||
should now be placed after the output transport. This way, only the completed
|
||||
spoken context is added to the assistant context.
|
||||
|
||||
- Added `VADParams` so you can control voice confidence level and others.
|
||||
|
||||
- `VADAnalyzer` now uses an exponential smoothed volume to improve speech
|
||||
detection. This is useful when voice confidence is high (because there's
|
||||
someone talking near you) but volume is low.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where TTSService was not pushing TextFrames downstream.
|
||||
|
||||
- Fixed issues with Ctrl-C program termination.
|
||||
|
||||
- Fixed an issue that was causing `StopTaskFrame` to actually not exit the
|
||||
`PipelineTask`.
|
||||
|
||||
## [0.0.16] - 2024-05-16
|
||||
|
||||
### Fixed
|
||||
|
||||
- `DailyTransport`: don't publish camera and audio tracks if not enabled.
|
||||
|
||||
- Fixed an issue in `BaseInputTransport` that was causing frames pushed
|
||||
downstream to not be pushed in the right order.
|
||||
|
||||
## [0.0.15] - 2024-05-15
|
||||
|
||||
### Fixed
|
||||
|
||||
- Quick hot fix for receiving `DailyTransportMessage`.
|
||||
|
||||
## [0.0.14] - 2024-05-15
|
||||
|
||||
### Added
|
||||
|
||||
- Added `DailyTransport` event `on_participant_left`.
|
||||
|
||||
- Added support for receiving `DailyTransportMessage`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Images are now resized to the size of the output camera. This was causing
|
||||
images to not be displayed.
|
||||
|
||||
- Fixed an issue in `DailyTransport` that would not allow the input processor to
|
||||
shutdown if no participant ever joined the room.
|
||||
|
||||
- Fixed base transports start and stop. In some situations processors would halt
|
||||
or not shutdown properly.
|
||||
|
||||
## [0.0.13] - 2024-05-14
|
||||
|
||||
### Changed
|
||||
|
||||
- `MoondreamService` argument `model_id` is now `model`.
|
||||
|
||||
- `VADAnalyzer` arguments have been renamed for more clarity.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with `DailyInputTransport` and `DailyOutputTransport` that
|
||||
could cause some threads to not start properly.
|
||||
|
||||
- Fixed `STTService`. Add `max_silence_secs` and `max_buffer_secs` to handle
|
||||
better what's being passed to the STT service. Also add exponential smoothing
|
||||
to the RMS.
|
||||
|
||||
- Fixed `WhisperSTTService`. Add `no_speech_prob` to avoid garbage output text.
|
||||
|
||||
## [0.0.12] - 2024-05-14
|
||||
|
||||
### Added
|
||||
|
||||
- Added `DailyTranscriptionSettings` to be able to specify transcription
|
||||
settings much easier (e.g. language).
|
||||
|
||||
### Other
|
||||
|
||||
- Updated `simple-chatbot` with Spanish.
|
||||
|
||||
- Add missing dependencies in some of the examples.
|
||||
|
||||
## [0.0.11] - 2024-05-13
|
||||
|
||||
### Added
|
||||
|
||||
- Allow stopping pipeline tasks with new `StopTaskFrame`.
|
||||
|
||||
### Changed
|
||||
|
||||
- TTS, STT and image generation service now use `AsyncGenerator`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- `DailyTransport`: allow registering for participant transcriptions even if
|
||||
input transport is not initialized yet.
|
||||
|
||||
### Other
|
||||
|
||||
- Updated `storytelling-chatbot`.
|
||||
|
||||
## [0.0.10] - 2024-05-13
|
||||
|
||||
### Added
|
||||
|
||||
- Added Intel GPU support to `MoondreamService`.
|
||||
|
||||
- Added support for sending transport messages (e.g. to communicate with an app
|
||||
at the other end of the transport).
|
||||
|
||||
- Added `FrameProcessor.push_error()` to easily send an `ErrorFrame` upstream.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed Azure services (TTS and image generation).
|
||||
|
||||
### Other
|
||||
|
||||
- Updated `simple-chatbot`, `moondream-chatbot` and `translation-chatbot`
|
||||
examples.
|
||||
|
||||
## [0.0.9] - 2024-05-12
|
||||
|
||||
### Changed
|
||||
|
||||
Many things have changed in this version. Many of the main ideas such as frames,
|
||||
processors, services and transports are still there but some things have changed
|
||||
a bit.
|
||||
|
||||
- `Frame`s describe the basic units for processing. For example, text, image or
|
||||
audio frames. Or control frames to indicate a user has started or stopped
|
||||
speaking.
|
||||
|
||||
- `FrameProcessor`s process frames (e.g. they convert a `TextFrame` to an
|
||||
`ImageRawFrame`) and push new frames downstream or upstream to their linked
|
||||
peers.
|
||||
|
||||
- `FrameProcessor`s can be linked together. The easiest way is to use the
|
||||
`Pipeline` which is a container for processors. Linking processors allows
|
||||
frames to travel upstream or downstream easily.
|
||||
|
||||
- `Transport`s are a way to send or receive frames. There can be local
|
||||
transports (e.g. local audio or native apps), network transports
|
||||
(e.g. websocket) or service transports (e.g. https://daily.co).
|
||||
|
||||
- `Pipeline`s are just a processor container for other processors.
|
||||
|
||||
- A `PipelineTask` knows how to run a pipeline.
|
||||
|
||||
- A `PipelineRunner` can run one or more tasks and it is also used, for example,
|
||||
to capture Ctrl-C from the user.
|
||||
|
||||
## [0.0.8] - 2024-04-11
|
||||
|
||||
### Added
|
||||
|
||||
- Added `FireworksLLMService`.
|
||||
|
||||
- Added `InterimTranscriptionFrame` and enable interim results in
|
||||
`DailyTransport` transcriptions.
|
||||
|
||||
### Changed
|
||||
|
||||
- `FalImageGenService` now uses new `fal_client` package.
|
||||
|
||||
### Fixed
|
||||
|
||||
- `FalImageGenService`: use `asyncio.to_thread` to not block main loop when
|
||||
generating images.
|
||||
|
||||
- Allow `TranscriptionFrame` after an end frame (transcriptions can be delayed
|
||||
and received after `UserStoppedSpeakingFrame`).
|
||||
|
||||
## [0.0.7] - 2024-04-10
|
||||
|
||||
### Added
|
||||
|
||||
- Add `use_cpu` argument to `MoondreamService`.
|
||||
|
||||
## [0.0.6] - 2024-04-10
|
||||
|
||||
### Added
|
||||
|
||||
- Added `FalImageGenService.InputParams`.
|
||||
|
||||
- Added `URLImageFrame` and `UserImageFrame`.
|
||||
|
||||
- Added `UserImageRequestFrame` and allow requesting an image from a participant.
|
||||
|
||||
- Added base `VisionService` and `MoondreamService`
|
||||
|
||||
### Changed
|
||||
|
||||
- Don't pass `image_size` to `ImageGenService`, images should have their own size.
|
||||
|
||||
- `ImageFrame` now receives a tuple `(width,height)` to specify the size.
|
||||
|
||||
- `on_first_other_participant_joined` now gets a participant argument.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Check if camera, speaker and microphone are enabled before writing to them.
|
||||
|
||||
### Performance
|
||||
|
||||
- `DailyTransport` only subscribes to the desired participant video track.
|
||||
|
||||
## [0.0.5] - 2024-04-06
|
||||
|
||||
### Changed
|
||||
|
||||
- Use `camera_bitrate` and `camera_framerate`.
|
||||
|
||||
- Increase `camera_framerate` to 30 by default.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed `LocalTransport.read_audio_frames`.
|
||||
|
||||
## [0.0.4] - 2024-04-04
|
||||
|
||||
### Added
|
||||
|
||||
- Added project optional dependencies `[silero,openai,...]`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Moved transports to their own directory.
|
||||
|
||||
- Use `OPENAI_API_KEY` instead of `OPENAI_CHATGPT_API_KEY`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Don't write to microphone/speaker if not enabled.
|
||||
|
||||
### Other
|
||||
|
||||
- Added live translation example.
|
||||
|
||||
- Fix foundational examples.
|
||||
|
||||
## [0.0.3] - 2024-03-13
|
||||
|
||||
### Other
|
||||
|
||||
- Added `storybot` and `chatbot` examples.
|
||||
|
||||
## [0.0.2] - 2024-03-12
|
||||
|
||||
Initial public release.
|
||||
62
CHANGELOG.md.template
Normal file
62
CHANGELOG.md.template
Normal file
@@ -0,0 +1,62 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to the **<project name>** SDK will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
Please make sure to add your changes to the appropriate categories:
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
<!-- for new functionality -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Changed
|
||||
|
||||
<!-- for changed functionality -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Deprecated
|
||||
|
||||
<!-- for soon-to-be removed functionality -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Removed
|
||||
|
||||
<!-- for removed functionality -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Fixed
|
||||
|
||||
<!-- for fixed bugs -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Performance
|
||||
|
||||
<!-- for performance-relevant changes -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Security
|
||||
|
||||
<!-- for security-relevant changes -->
|
||||
|
||||
- n/a
|
||||
|
||||
### Other
|
||||
|
||||
<!-- for everything else -->
|
||||
|
||||
- n/a
|
||||
|
||||
## [0.1.0] - YYYY-MM-DD
|
||||
|
||||
Initial release.
|
||||
14
README.md
14
README.md
@@ -1,5 +1,5 @@
|
||||
<div align="center">
|
||||
<img alt="pipecat" width="300px" height="auto" src="pipecat.png">
|
||||
<img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
|
||||
</div>
|
||||
|
||||
# Pipecat
|
||||
@@ -12,11 +12,11 @@
|
||||
Take a look at some example apps:
|
||||
|
||||
<p float="left">
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="examples/simple-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="examples/storytelling-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="280" /></a>
|
||||
<br/>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="examples/translation-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="examples/moondream-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
|
||||
</p>
|
||||
|
||||
## Getting started with voice agents
|
||||
@@ -39,7 +39,7 @@ pip install "pipecat-ai[option,...]"
|
||||
|
||||
Your project may or may not need these, so they're made available as optional requirements. Here is a list:
|
||||
|
||||
- **AI services**: `anthropic`, `azure`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
|
||||
- **Transports**: `local`, `websocket`, `daily`
|
||||
|
||||
## Code examples
|
||||
@@ -218,4 +218,4 @@ Install the
|
||||
|
||||
➡️ [Join our Discord](https://discord.gg/pipecat)
|
||||
|
||||
➡️ [Reach us on Twitter](https://x.com/pipecat_ai)
|
||||
➡️ [Reach us on X](https://x.com/pipecat_ai)
|
||||
|
||||
@@ -13,12 +13,12 @@ from dataclasses import dataclass
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AppFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
ImageRawFrame,
|
||||
TextFrame,
|
||||
EndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -64,7 +64,7 @@ class MonthPrepender(FrameProcessor):
|
||||
elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame):
|
||||
await self.push_frame(TextFrame(f"{self.most_recent_month}: {frame.text}"))
|
||||
self.prepend_to_next_text_frame = False
|
||||
elif isinstance(frame, LLMResponseStartFrame):
|
||||
elif isinstance(frame, LLMFullResponseStartFrame):
|
||||
self.prepend_to_next_text_frame = True
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
@@ -105,7 +105,7 @@ async def main(room_url):
|
||||
|
||||
gated_aggregator = GatedAggregator(
|
||||
gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame),
|
||||
gate_close_fn=lambda frame: isinstance(frame, LLMResponseStartFrame),
|
||||
gate_close_fn=lambda frame: isinstance(frame, LLMFullResponseStartFrame),
|
||||
start_open=False
|
||||
)
|
||||
|
||||
@@ -114,14 +114,14 @@ async def main(room_url):
|
||||
llm_full_response_aggregator = LLMFullResponseAggregator()
|
||||
|
||||
pipeline = Pipeline([
|
||||
llm,
|
||||
sentence_aggregator,
|
||||
ParallelTask(
|
||||
[month_prepender, tts],
|
||||
[llm_full_response_aggregator, imagegen]
|
||||
llm, # LLM
|
||||
sentence_aggregator, # Aggregates LLM output into full sentences
|
||||
ParallelTask( # Run pipelines in parallel aggregating the result
|
||||
[month_prepender, tts], # Create "Month: sentence" and output audio
|
||||
[llm_full_response_aggregator, imagegen] # Aggregate full LLM response
|
||||
),
|
||||
gated_aggregator,
|
||||
transport.output()
|
||||
gated_aggregator, # Queues everything until an image is available
|
||||
transport.output() # Transport output
|
||||
])
|
||||
|
||||
frames = []
|
||||
|
||||
@@ -98,9 +98,13 @@ async def main():
|
||||
|
||||
image_grabber = ImageGrabber()
|
||||
|
||||
pipeline = Pipeline([llm, aggregator, description,
|
||||
ParallelPipeline([tts, audio_grabber],
|
||||
[imagegen, image_grabber])])
|
||||
pipeline = Pipeline([
|
||||
llm,
|
||||
aggregator,
|
||||
description,
|
||||
ParallelPipeline([tts, audio_grabber],
|
||||
[imagegen, image_grabber])
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
await task.queue_frame(LLMMessagesFrame(messages))
|
||||
|
||||
@@ -21,7 +21,7 @@ from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVAD
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -41,14 +41,13 @@ async def main(room_url: str, token):
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_in_enabled=True, # This is so Silero VAD can get audio data
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
vad = SileroVAD()
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
@@ -57,22 +56,32 @@ async def main(room_url: str, token):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
model="gpt-4o")
|
||||
|
||||
fl_in = FrameLogger("Inner")
|
||||
fl_out = FrameLogger("Outer")
|
||||
fl = FrameLogger("!!! after LLM", "red")
|
||||
fltts = FrameLogger("@@@ out of tts", "green")
|
||||
flend = FrameLogger("### out of the end", "magenta")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([fl_in, transport.input(), vad, tma_in, llm,
|
||||
fl_out, tts, tma_out, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
tma_in,
|
||||
llm,
|
||||
fl,
|
||||
tts,
|
||||
fltts,
|
||||
transport.output(),
|
||||
tma_out,
|
||||
flend
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
|
||||
@@ -15,14 +15,15 @@ from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from runner import configure
|
||||
@@ -66,7 +67,9 @@ async def main(room_url: str, token):
|
||||
audio_out_enabled=True,
|
||||
camera_out_width=1024,
|
||||
camera_out_height=1024,
|
||||
transcription_enabled=True
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
@@ -83,20 +86,27 @@ async def main(room_url: str, token):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so it should not contain special characters. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(messages)
|
||||
tma_out = LLMAssistantContextAggregator(messages)
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
|
||||
os.path.join(os.path.dirname(__file__), "assets", "waiting.png"),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([transport.input(), image_sync_aggregator,
|
||||
tma_in, llm, tma_out, tts, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
image_sync_aggregator,
|
||||
tma_in,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
tma_out
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
|
||||
@@ -1,26 +1,34 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from pipecat.pipeline.aggregators import (
|
||||
LLMAssistantResponseAggregator,
|
||||
LLMUserResponseAggregator,
|
||||
)
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.services.ai_services import FrameLogger
|
||||
from pipecat.transports.daily_transport import DailyTransport
|
||||
from pipecat.services.open_ai_services import OpenAILLMService
|
||||
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("pipecat")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
@@ -29,12 +37,12 @@ async def main(room_url: str, token):
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
vad_enabled=True,
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
@@ -47,27 +55,38 @@ async def main(room_url: str, token):
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
await transport.say("Hi, I'm listening!", tts)
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
|
||||
async def run_conversation():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
pipeline = Pipeline([
|
||||
transport.input(), # Transport user input
|
||||
tma_in, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
tma_out # Assistant spoken responses
|
||||
])
|
||||
|
||||
await transport.run_interruptible_pipeline(
|
||||
pipeline,
|
||||
post_processor=LLMAssistantResponseAggregator(messages),
|
||||
pre_processor=LLMUserResponseAggregator(messages),
|
||||
)
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
|
||||
await asyncio.gather(transport.run(), run_conversation())
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
95
examples/foundational/07a-interruptible-anthropic.py
Normal file
95
examples/foundational/07a-interruptible-anthropic.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run an interruptible voice bot in a Daily room using Anthropic and ElevenLabs.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        llm = AnthropicLLMService(
            api_key=os.getenv("ANTHROPIC_API_KEY"),
            model="claude-3-opus-20240229",
        )

        # todo: think more about how to handle system prompts in a more general way. OpenAI,
        # Google, and Anthropic all have slightly different approaches to providing a system
        # prompt.
        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative, helpful, and brief way. Say hello.",
        }
        messages = [system_prompt]

        # Both aggregators are handed the very same message list object.
        user_aggregator = LLMUserResponseAggregator(messages)
        assistant_aggregator = LLMAssistantResponseAggregator(messages)

        stages = [
            transport.input(),     # frames coming from the room
            user_aggregator,       # user side of the conversation
            llm,                   # language model
            tts,                   # speech synthesis
            transport.output(),    # frames going back to the room
            assistant_aggregator,  # assistant side of the conversation
        ]
        task = PipelineTask(Pipeline(stages), PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Start the conversation by queueing the message list on the task.
            await task.queue_frames([LLMMessagesFrame(messages)])

        await PipelineRunner().run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # configure() yields the two values main() takes: room URL, then token.
    room_url, room_token = configure()
    asyncio.run(main(room_url, room_token))
|
||||
94
examples/foundational/07c-interruptible-deepgram.py
Normal file
94
examples/foundational/07c-interruptible-deepgram.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.deepgram import DeepgramTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run an interruptible voice bot in a Daily room using OpenAI and Deepgram TTS.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        tts = DeepgramTTSService(
            aiohttp_session=session,
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-helios-en",
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview",
        )

        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        # Both aggregators are handed the very same message list object.
        user_aggregator = LLMUserResponseAggregator(messages)
        assistant_aggregator = LLMAssistantResponseAggregator(messages)

        stages = [
            transport.input(),     # frames coming from the room
            user_aggregator,       # user side of the conversation
            llm,                   # language model
            tts,                   # speech synthesis
            transport.output(),    # frames going back to the room
            assistant_aggregator,  # assistant side of the conversation
        ]
        task = PipelineTask(Pipeline(stages), PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Add an introduction request, then queue the whole message list
            # on the task to start the conversation.
            introduction = {"role": "system", "content": "Please introduce yourself to the user."}
            messages.append(introduction)
            await task.queue_frames([LLMMessagesFrame(messages)])

        await PipelineRunner().run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Unpack the (url, token) pair from configure() and run main() to completion.
    daily_url, daily_token = configure()
    asyncio.run(main(daily_url, daily_token))
|
||||
93
examples/foundational/07d-interruptible-cartesia.py
Normal file
93
examples/foundational/07d-interruptible-cartesia.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run an interruptible voice bot in a Daily room using OpenAI and Cartesia TTS.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    # The HTTP session stays open while the bot runs; nothing in this
    # function passes it to a service, so it is not bound to a name.
    async with aiohttp.ClientSession():
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Respond bot", daily_params)

        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_name="Barbershop Man",
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4o",
        )

        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        # Both aggregators are handed the very same message list object.
        user_aggregator = LLMUserResponseAggregator(messages)
        assistant_aggregator = LLMAssistantResponseAggregator(messages)

        stages = [
            transport.input(),     # frames coming from the room
            user_aggregator,       # user side of the conversation
            llm,                   # language model
            tts,                   # speech synthesis
            transport.output(),    # frames going back to the room
            assistant_aggregator,  # assistant side of the conversation
        ]
        task = PipelineTask(Pipeline(stages), PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Add an introduction request, then queue the whole message list
            # on the task to start the conversation.
            introduction = {"role": "system", "content": "Please introduce yourself to the user."}
            messages.append(introduction)
            await task.queue_frames([LLMMessagesFrame(messages)])

        await PipelineRunner().run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: take the URL/token pair from configure() and run the bot.
    bot_url, bot_token = configure()
    asyncio.run(main(bot_url, bot_token))
|
||||
@@ -3,14 +3,14 @@ import aiohttp
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from pipecat.pipeline.aggregators import SentenceAggregator
|
||||
from pipecat.processors.aggregators import SentenceAggregator
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
|
||||
from pipecat.transports.daily_transport import DailyTransport
|
||||
from pipecat.services.azure_ai_services import AzureLLMService, AzureTTSService
|
||||
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
|
||||
from pipecat.services.fal_ai_services import FalImageGenService
|
||||
from pipecat.pipeline.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
|
||||
from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
|
||||
|
||||
from runner import configure
|
||||
|
||||
|
||||
@@ -7,12 +7,9 @@
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
|
||||
from pipecat.processors.filter import Filter
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.transports.services.daily import DailyTransport, DailyParams
|
||||
|
||||
from runner import configure
|
||||
@@ -42,13 +39,7 @@ async def main(room_url, token):
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
transport.capture_participant_video(participant["id"])
|
||||
|
||||
# The ParallelPipeline is not really necessary here but it shows how you
|
||||
# would process audio and video concurrently in parallel pipelines.
|
||||
pipeline = Pipeline([transport.input(),
|
||||
ParallelPipeline(
|
||||
[Filter([AudioRawFrame])],
|
||||
[Filter([ImageRawFrame])]),
|
||||
transport.output()])
|
||||
pipeline = Pipeline([transport.input(), transport.output()])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
|
||||
94
examples/foundational/10-wake-phrase.py
Normal file
94
examples/foundational/10-wake-phrase.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantResponseAggregator, LLMUserResponseAggregator)
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a voice bot in a Daily room that sits behind a "hey robot" wake-phrase filter.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(room_url, token, "Robot", daily_params)

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4o",
        )

        system_prompt = {
            "role": "system",
            "content": "You are a helpful assistant. Respond to what the user said in a creative and helpful way. Keep your responses brief.",
        }
        messages = [system_prompt]

        wake_filter = WakeCheckFilter(["hey robot", "hey, robot"])
        # Both aggregators are handed the very same message list object.
        user_aggregator = LLMUserResponseAggregator(messages)
        assistant_aggregator = LLMAssistantResponseAggregator(messages)

        stages = [
            transport.input(),     # frames coming from the room
            wake_filter,           # drops speech not addressed to the robot
            user_aggregator,       # user side of the conversation
            llm,                   # language model
            tts,                   # speech synthesis
            transport.output(),    # frames going back to the room
            assistant_aggregator,  # assistant side of the conversation
        ]
        task = PipelineTask(Pipeline(stages), PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Tell the new participant which phrase to use.
            await tts.say("Hi! If you want to talk to me, just say 'Hey Robot'.")

        await PipelineRunner().run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # configure() returns the room URL and token, in that order, for main().
    target_url, target_token = configure()
    asyncio.run(main(target_url, target_token))
|
||||
@@ -1,181 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
SystemFrame,
|
||||
TextFrame,
|
||||
ImageRawFrame,
|
||||
SpriteFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMUserContextAggregator,
|
||||
LLMAssistantContextAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
# Raw image frames keyed by sprite file name, extension included
# (e.g. "sc-listen-1.png").
sprites = {}
image_files = [
    "sc-default.png",
    "sc-talk.png",
    "sc-listen-1.png",
    "sc-think-1.png",
    "sc-think-2.png",
    "sc-think-3.png",
    "sc-think-4.png",
]

script_dir = os.path.dirname(__file__)

for file in image_files:
    # Sprites live in the "assets" directory next to this script.
    full_path = os.path.join(script_dir, "assets", file)
    # Open the image and copy its bytes into a frame; the file is closed
    # again as soon as the frame has been built.
    with Image.open(full_path) as img:
        sprites[file] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = sprites["sc-listen-1.png"]

# When the bot is talking, build an animation from two sprites: 30 frames,
# each picked at random from the pair.
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for _ in range(30)]
talking_frame = SpriteFrame(talking)

# TODO: Support "thinking" as soon as we get a valid transcript, while LLM
# is processing
thinking_list = [
    sprites["sc-think-1.png"],
    sprites["sc-think-2.png"],
    sprites["sc-think-3.png"],
    sprites["sc-think-4.png"],
]
thinking_frame = SpriteFrame(thinking_list)
|
||||
|
||||
|
||||
class NameCheckFilter(FrameProcessor):
    """Let transcribed sentences through only when they mention a configured name.

    Transcription text is buffered until the buffer ends with ".", "?" or "!".
    A finished sentence containing any of the names (case-sensitive substring
    match) is pushed on as a TextFrame; any other finished sentence is
    dropped. System frames and all non-transcription frames are forwarded
    unchanged.
    """

    def __init__(self, names: list[str]):
        super().__init__()
        self._names = names
        self._sentence = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # TODO: split up transcription by participant
        if isinstance(frame, SystemFrame) or not isinstance(frame, TranscriptionFrame):
            # Nothing to filter: pass the frame along as-is.
            await self.push_frame(frame, direction)
            return

        self._sentence += frame.text
        if not self._sentence.endswith((".", "?", "!")):
            # Sentence is not finished yet; keep buffering.
            return

        mentions_name = any(name in self._sentence for name in self._names)
        if mentions_name:
            await self.push_frame(TextFrame(self._sentence))
        # Matched or not, the finished sentence is cleared from the buffer.
        self._sentence = ""
|
||||
|
||||
|
||||
class ImageSyncAggregator(FrameProcessor):
    """Wrap every frame between the talking animation and the quiet image."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        # Emit, in order: the talking sprite, the frame itself, the quiet
        # sprite. `direction` is not passed on; push_frame is called with its
        # default direction for all three.
        for outgoing in (talking_frame, frame, quiet_frame):
            await self.push_frame(outgoing)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the "Santa Cat" sprite-animated voice bot in a Daily room.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_out_enabled=True,
            camera_out_enabled=True,
            camera_out_width=720,
            camera_out_height=1280,
            camera_out_framerate=10,
            transcription_enabled=True,
        )
        transport = DailyTransport(room_url, token, "Santa Cat", daily_params)

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview",
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        image_sync = ImageSyncAggregator()

        system_prompt = {
            "role": "system",
            "content": "You are Santa Cat, a cat that lives in Santa's workshop at the North Pole. You should be clever, and a bit sarcastic. You should also tell jokes every once in a while. Your responses should only be a few sentences long.",
        }
        messages = [system_prompt]

        # Both aggregators are handed the very same message list object.
        user_context = LLMUserContextAggregator(messages)
        assistant_context = LLMAssistantContextAggregator(messages)
        name_filter = NameCheckFilter(["Santa Cat", "Santa"])

        stages = [
            transport.input(),
            image_sync,
            name_filter,
            user_context,
            llm,
            assistant_context,
            tts,
            transport.output(),
        ]
        pipeline = Pipeline(stages)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Greet first, then start capturing the participant's transcription.
            await tts.say("Hi! If you want to talk to me, just say 'hey Santa Cat'.")
            transport.capture_participant_transcription(participant["id"])

        async def starting_image():
            # Send the quiet (listening) sprite through the transport.
            await transport.send_image(quiet_frame)

        runner = PipelineRunner()
        task = PipelineTask(pipeline)

        # Run the pipeline and send the initial image concurrently.
        await asyncio.gather(runner.run(task), starting_image())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Take the room URL and token from configure(), then drive main() with asyncio.
    cat_url, cat_token = configure()
    asyncio.run(main(cat_url, cat_token))
|
||||
@@ -13,21 +13,22 @@ import wave
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
AudioRawFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMMessagesFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMUserContextAggregator,
|
||||
LLMAssistantContextAggregator,
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -59,7 +60,7 @@ for file in sound_files:
|
||||
class OutboundSoundEffectWrapper(FrameProcessor):
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
if isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self.push_frame(sounds["ding1.wav"])
|
||||
# In case anything else downstream needs it
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -84,7 +85,12 @@ async def main(room_url: str, token):
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(audio_out_enabled=True, transcription_enabled=True)
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
@@ -104,15 +110,25 @@ async def main(room_url: str, token):
|
||||
},
|
||||
]
|
||||
|
||||
tma_in = LLMUserContextAggregator(messages)
|
||||
tma_out = LLMAssistantContextAggregator(messages)
|
||||
tma_in = LLMUserResponseAggregator(messages)
|
||||
tma_out = LLMAssistantResponseAggregator(messages)
|
||||
out_sound = OutboundSoundEffectWrapper()
|
||||
in_sound = InboundSoundEffectWrapper()
|
||||
fl = FrameLogger("LLM Out")
|
||||
fl2 = FrameLogger("Transcription In")
|
||||
|
||||
pipeline = Pipeline([transport.input(), tma_in, in_sound, fl2, llm,
|
||||
tma_out, fl, tts, out_sound, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
tma_in,
|
||||
in_sound,
|
||||
fl2,
|
||||
llm,
|
||||
fl,
|
||||
tts,
|
||||
out_sound,
|
||||
transport.output(),
|
||||
tma_out
|
||||
])
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
|
||||
@@ -19,7 +19,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.moondream import MoondreamService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVAD
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -54,14 +54,13 @@ async def main(room_url: str, token):
|
||||
token,
|
||||
"Describe participant video",
|
||||
DailyParams(
|
||||
audio_in_enabled=True, # This is so Silero VAD can get audio data
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
vad = SileroVAD()
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
@@ -90,8 +89,15 @@ async def main(room_url: str, token):
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
image_requester.set_participant_id(participant["id"])
|
||||
|
||||
pipeline = Pipeline([transport.input(), vad, user_response, image_requester,
|
||||
vision_aggregator, moondream, tts, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
user_response,
|
||||
image_requester,
|
||||
vision_aggregator,
|
||||
moondream,
|
||||
tts,
|
||||
transport.output()
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
|
||||
106
examples/foundational/12a-describe-video-gemini-flash.py
Normal file
106
examples/foundational/12a-describe-video-gemini-flash.py
Normal file
@@ -0,0 +1,106 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.google import GoogleLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
    """Request a participant image whenever a text frame passes through.

    While a participant id is set, each TextFrame causes a
    UserImageRequestFrame for that participant to be pushed upstream. The
    incoming frame itself is always forwarded in its original direction.
    """

    def __init__(self, participant_id: str | None = None):
        super().__init__()
        self._participant_id = participant_id

    def set_participant_id(self, participant_id: str):
        """Choose the participant whose image subsequent requests ask for."""
        self._participant_id = participant_id

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        target = self._participant_id
        if target and isinstance(frame, TextFrame):
            image_request = UserImageRequestFrame(target)
            await self.push_frame(image_request, FrameDirection.UPSTREAM)
        # The original frame continues on its way regardless.
        await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run a Daily bot that describes participant video using Gemini and ElevenLabs.

    Args:
        room_url: Room URL handed to DailyTransport.
        token: Token handed to DailyTransport together with the room URL.
    """
    async with aiohttp.ClientSession() as session:
        daily_params = DailyParams(
            audio_in_enabled=True,  # This is so Silero VAD can get audio data
            audio_out_enabled=True,
            transcription_enabled=True,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        )
        transport = DailyTransport(
            room_url, token, "Describe participant video", daily_params)

        user_response = UserResponseAggregator()
        image_requester = UserImageRequester()
        vision_aggregator = VisionImageFrameAggregator()

        google = GoogleLLMService(
            model="gemini-1.5-flash-latest",
            api_key=os.getenv("GOOGLE_API_KEY"),
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await tts.say("Hi there! Feel free to ask me what I see.")
            # Start video and transcription capture for the participant and
            # point the image requester at them.
            joined_id = participant["id"]
            transport.capture_participant_video(joined_id, framerate=0)
            transport.capture_participant_transcription(joined_id)
            image_requester.set_participant_id(joined_id)

        stages = [
            transport.input(),
            user_response,
            image_requester,
            vision_aggregator,
            google,
            tts,
            transport.output(),
        ]
        task = PipelineTask(Pipeline(stages))

        await PipelineRunner().run(task)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: configure() provides the URL and token that main() is called with.
    video_url, video_token = configure()
    asyncio.run(main(video_url, video_token))
|
||||
106
examples/foundational/12b-describe-video-gpt-4o.py
Normal file
106
examples/foundational/12b-describe-video-gpt-4o.py
Normal file
@@ -0,0 +1,106 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
    """Request a participant image whenever a text frame passes through.

    While a participant id is configured, every ``TextFrame`` triggers a
    ``UserImageRequestFrame`` for that participant, pushed upstream. The
    incoming frame is always pushed on in its original direction.
    """

    def __init__(self, participant_id: str | None = None):
        super().__init__()
        self._participant_id = participant_id

    def set_participant_id(self, participant_id: str):
        """Set the participant whose image will be requested."""
        self._participant_id = participant_id

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Emit an image request for text frames, then pass the frame along."""
        wants_image = isinstance(frame, TextFrame) and self._participant_id
        if wants_image:
            request = UserImageRequestFrame(self._participant_id)
            await self.push_frame(request, FrameDirection.UPSTREAM)
        await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the "describe participant video" bot backed by GPT-4o.

    Frames flow: transport input -> user response aggregator -> image
    requester -> vision aggregator -> OpenAI LLM -> ElevenLabs TTS ->
    transport output.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Describe participant video",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        user_response = UserResponseAggregator()
        image_requester = UserImageRequester()
        vision_aggregator = VisionImageFrameAggregator()

        openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Greet first, then start capturing from the participant and tell
            # the image requester whose video to ask for.
            await tts.say("Hi there! Feel free to ask me what I see.")
            participant_id = participant["id"]
            transport.capture_participant_video(participant_id, framerate=0)
            transport.capture_participant_transcription(participant_id)
            image_requester.set_participant_id(participant_id)

        stages = [
            transport.input(),
            user_response,
            image_requester,
            vision_aggregator,
            openai,
            tts,
            transport.output(),
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline)
        runner = PipelineRunner()
        await runner.run(task)
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the room URL and meeting token, then run the bot until it exits.
    room_url, room_token = configure()
    asyncio.run(main(room_url, room_token))
|
||||
106
examples/foundational/12c-describe-video-anthropic.py
Normal file
106
examples/foundational/12c-describe-video-anthropic.py
Normal file
@@ -0,0 +1,106 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.anthropic import AnthropicLLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
    """Request a participant image whenever a text frame passes through.

    While a participant id is configured, every ``TextFrame`` triggers a
    ``UserImageRequestFrame`` for that participant, pushed upstream. The
    incoming frame is always pushed on in its original direction.
    """

    def __init__(self, participant_id: str | None = None):
        super().__init__()
        self._participant_id = participant_id

    def set_participant_id(self, participant_id: str):
        """Set the participant whose image will be requested."""
        self._participant_id = participant_id

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Emit an image request for text frames, then pass the frame along."""
        wants_image = isinstance(frame, TextFrame) and self._participant_id
        if wants_image:
            request = UserImageRequestFrame(self._participant_id)
            await self.push_frame(request, FrameDirection.UPSTREAM)
        await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the "describe participant video" bot backed by Anthropic.

    Frames flow: transport input -> user response aggregator -> image
    requester -> vision aggregator -> Anthropic LLM -> ElevenLabs TTS ->
    transport output.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Describe participant video",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        user_response = UserResponseAggregator()
        image_requester = UserImageRequester()
        vision_aggregator = VisionImageFrameAggregator()

        anthropic = AnthropicLLMService(
            api_key=os.getenv("ANTHROPIC_API_KEY"),
            model="claude-3-sonnet-20240229",
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Greet first, then start capturing from the participant and tell
            # the image requester whose video to ask for.
            await tts.say("Hi there! Feel free to ask me what I see.")
            participant_id = participant["id"]
            transport.capture_participant_video(participant_id, framerate=0)
            transport.capture_participant_transcription(participant_id)
            image_requester.set_participant_id(participant_id)

        stages = [
            transport.input(),
            user_response,
            image_requester,
            vision_aggregator,
            anthropic,
            tts,
            transport.output(),
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline)
        runner = PipelineRunner()
        await runner.run(task)
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the room URL and meeting token, then run the bot until it exits.
    room_url, room_token = configure()
    asyncio.run(main(room_url, room_token))
|
||||
145
examples/foundational/14-function-calling.py
Normal file
145
examples/foundational/14-function-calling.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from pipecat.services.openai import OpenAILLMContext
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from openai.types.chat import (
|
||||
ChatCompletionToolParam,
|
||||
)
|
||||
from pipecat.frames.frames import (
|
||||
TextFrame
|
||||
)
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def start_fetch_weather(llm):
    """Push a short "Let me think." text frame through the LLM service.

    Registered in ``main`` as the ``start_callback`` of the
    ``get_current_weather`` function.
    """
    filler = TextFrame("Let me think.")
    await llm.push_frame(filler)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(llm, args):
    """Return a canned weather report; both arguments are ignored."""
    weather = {"conditions": "nice", "temperature": "75"}
    return weather
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the function-calling demo bot in the given Daily room.

    An OpenAI LLM service is given one tool, ``get_current_weather``, whose
    handler is ``fetch_weather_from_api`` and whose start callback is
    ``start_fetch_weather``. The LLM output is spoken through ElevenLabs TTS.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_API_KEY"),
            model="gpt-4-turbo-preview",
        )
        llm.register_function(
            "get_current_weather",
            fetch_weather_from_api,
            start_callback=start_fetch_weather,
        )

        fl_in = FrameLogger("Inner")
        fl_out = FrameLogger("Outer")

        # JSON Schema describing the arguments of the weather tool.
        weather_parameters = {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "format": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "description": "The temperature unit to use. Infer this from the users location.",
                },
            },
            "required": ["location", "format"],
        }
        weather_tool = ChatCompletionToolParam(
            type="function",
            function={
                "name": "get_current_weather",
                "description": "Get the current weather",
                "parameters": weather_parameters,
            },
        )
        tools = [weather_tool]

        system_prompt = {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        }
        messages = [system_prompt]

        # One shared context feeds both the user and the assistant aggregator.
        context = OpenAILLMContext(messages, tools)
        tma_in = LLMUserContextAggregator(context)
        tma_out = LLMAssistantContextAggregator(context)

        stages = [
            fl_in,
            transport.input(),
            tma_in,
            llm,
            fl_out,
            tts,
            transport.output(),
            tma_out,
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            await tts.say("Hi! Ask me about the weather in San Francisco.")

        runner = PipelineRunner()
        await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the room URL and meeting token, then run the bot until it exits.
    room_url, room_token = configure()
    asyncio.run(main(room_url, room_token))
|
||||
@@ -2,8 +2,8 @@ import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from pipecat.pipeline.frame_processor import FrameProcessor
|
||||
from pipecat.pipeline.frames import TextFrame, TranscriptionFrame
|
||||
from pipeline.processors.frame_processor import FrameProcessor
|
||||
from pipecat.frames.frames import TextFrame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.services.elevenlabs_ai_services import ElevenLabsTTSService
|
||||
from pipecat.transports.websocket_transport import WebsocketTransport
|
||||
|
||||
@@ -29,7 +29,7 @@ from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.moondream import MoondreamService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVAD
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -66,7 +66,7 @@ talking_frame = SpriteFrame(images=sprites)
|
||||
class TalkingAnimation(FrameProcessor):
|
||||
"""
|
||||
This class starts a talking animation when it receives an first AudioFrame,
|
||||
and then returns to a "quiet" sprite when it sees a LLMResponseEndFrame.
|
||||
and then returns to a "quiet" sprite when it sees a TTSStoppedFrame.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -127,17 +127,16 @@ async def main(room_url: str, token):
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_width=1024,
|
||||
camera_out_height=576,
|
||||
transcription_enabled=True
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
)
|
||||
)
|
||||
|
||||
vad = SileroVAD()
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
@@ -163,17 +162,23 @@ async def main(room_url: str, token):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You are Chatbot, a friendly, helpful robot. Let the user know that you are capable of chatting or describing what you see. Your goal is to demonstrate your capabilities in a succinct way. Reply with only '{user_request_answer}' if the user asks you to describe what you see. Your output will be converted to audio so never include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
|
||||
"content": f"You are Chatbot, a friendly, helpful robot. Let the user know that you are capable of chatting or describing what you see. Your goal is to demonstrate your capabilities in a succinct way. Reply with only '{user_request_answer}' if the user asks you to describe what you see. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
|
||||
},
|
||||
]
|
||||
|
||||
ura = LLMUserResponseAggregator(messages)
|
||||
|
||||
pipeline = Pipeline([transport.input(), vad, ura, llm,
|
||||
ParallelPipeline(
|
||||
[sa, ir, va, moondream],
|
||||
[tf, imgf]),
|
||||
tts, ta, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
ura,
|
||||
llm,
|
||||
ParallelPipeline(
|
||||
[sa, ir, va, moondream],
|
||||
[tf, imgf]),
|
||||
tts,
|
||||
ta,
|
||||
transport.output()
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
await task.queue_frame(quiet_frame)
|
||||
|
||||
16
examples/patient-intake/Dockerfile
Normal file
16
examples/patient-intake/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM python:3.10-bullseye

# Create the application directory layout.
RUN mkdir /app
RUN mkdir /app/assets
RUN mkdir /app/utils

# Copy the application sources, dependency list and supporting files.
COPY *.py /app/
COPY requirements.txt /app/
COPY assets/* /app/assets/
COPY utils/* /app/utils/

WORKDIR /app
RUN pip3 install -r requirements.txt

# Port published by the README's `docker run -p 7860:7860` example.
EXPOSE 7860

CMD ["python3", "server.py"]
|
||||
37
examples/patient-intake/README.md
Normal file
37
examples/patient-intake/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Simple Chatbot
|
||||
|
||||
<img src="image.png" width="420px">
|
||||
|
||||
This app connects you to a chatbot powered by GPT-4, complete with animations generated by Stable Video Diffusion.
|
||||
|
||||
See a video of it in action: https://x.com/kwindla/status/1778628911817183509
|
||||
|
||||
And a quick video walkthrough of the code: https://www.loom.com/share/13df1967161f4d24ade054e7f8753416
|
||||
|
||||
ℹ️ The first time, things might take extra time to get started since the VAD (Voice Activity Detection) model needs to be downloaded.
|
||||
|
||||
## Get started
|
||||
|
||||
```bash
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
|
||||
cp env.example .env # and add your credentials
|
||||
|
||||
```
|
||||
|
||||
## Run the server
|
||||
|
||||
```bash
|
||||
python server.py
|
||||
```
|
||||
|
||||
Then, visit `http://localhost:7860/start` in your browser to start a chatbot session.
|
||||
|
||||
## Build and test the Docker image
|
||||
|
||||
```
|
||||
docker build -t chatbot .
|
||||
docker run --env-file .env -p 7860:7860 chatbot
|
||||
```
|
||||
BIN
examples/patient-intake/assets/clack-short-quiet.wav
Normal file
BIN
examples/patient-intake/assets/clack-short-quiet.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/clack-short.wav
Normal file
BIN
examples/patient-intake/assets/clack-short.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/clack.wav
Normal file
BIN
examples/patient-intake/assets/clack.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding.wav
Normal file
BIN
examples/patient-intake/assets/ding.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding2.wav
Normal file
BIN
examples/patient-intake/assets/ding2.wav
Normal file
Binary file not shown.
BIN
examples/patient-intake/assets/ding3.wav
Normal file
BIN
examples/patient-intake/assets/ding3.wav
Normal file
Binary file not shown.
359
examples/patient-intake/bot.py
Normal file
359
examples/patient-intake/bot.py
Normal file
@@ -0,0 +1,359 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import wave
|
||||
from typing import List
|
||||
|
||||
from openai._types import NotGiven, NOT_GIVEN
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionToolParam,
|
||||
)
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator, LLMAssistantContextAggregator
|
||||
from pipecat.processors.logger import FrameLogger
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMMessagesFrame,
|
||||
AudioRawFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.ai_services import AIService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTranscriptionSettings, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame
|
||||
|
||||
from runner import configure
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
# Pre-load the sound effects once at import time so they can later be pushed
# into the pipeline as ready-made audio frames. `sounds` is keyed by the file
# name *including* its extension (e.g. "ding2.wav").
sounds = {}
sound_files = [
    "clack-short.wav",
    "clack.wav",
    "clack-short-quiet.wav",
    "ding.wav",
    "ding2.wav",
]

script_dir = os.path.dirname(__file__)

for file in sound_files:
    # Build the full path to the sound file
    full_path = os.path.join(script_dir, "assets", file)
    # Open the sound and wrap its raw frames, sample rate and channel count
    # in an AudioRawFrame
    with wave.open(full_path) as audio_file:
        sounds[file] = AudioRawFrame(
            audio_file.readframes(-1),
            audio_file.getframerate(),
            audio_file.getnchannels(),
        )
|
||||
|
||||
|
||||
class IntakeProcessor:
    """Drive a step-by-step patient-intake conversation through LLM tools.

    The conversation is a fixed sequence of steps. At each step the shared
    LLM context exposes exactly one tool, and a system message tells the
    model what to ask next:

    verify_birthday -> list_prescriptions -> list_allergies ->
    list_conditions -> list_visit_reasons -> goodbye

    Naming note: each ``start_<step>`` method is registered (in ``main``) as
    the ``start_callback`` of the ``list_<step>`` function, so it runs when
    the model *calls* that function and moves the conversation on to the
    *next* step.
    """

    def __init__(
        self,
        context: OpenAILLMContext,
        llm: AIService,
        tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        *args,
        **kwargs,
    ):
        """Seed the context with the system prompt and the first tool.

        Args:
            context: Shared LLM context; this constructor adds a system
                message to it and replaces its tool list.
            llm: LLM service, stored on the instance.
            tools: Accepted for interface compatibility; not used here.
            *args: Forwarded to ``super().__init__``.
            **kwargs: Forwarded to ``super().__init__``.
        """
        super().__init__(*args, **kwargs)
        self._context: OpenAILLMContext = context
        self._llm = llm
        print("Initializing context from IntakeProcessor")
        self._context.add_message({"role": "system", "content": "You are Jessica, an agent for a company called Tri-County Health Services. Your job is to collect important information from the user before their doctor visit. You're talking to Chad Bailey. You should address the user by their first name and be polite and professional. You're not a medical professional, so you shouldn't provide any advice. Keep your responses short. Your job is to collect information to give to a doctor. Don't make assumptions about what values to plug into functions. Ask for clarification if a user response is ambiguous. Start by introducing yourself. Then, ask the user to confirm their identity by telling you their birthday, including the year. When they answer with their birthday, call the verify_birthday function."})
        self._context.set_tools([
            {
                "type": "function",
                "function": {
                    "name": "verify_birthday",
                    "description": "Use this function to verify the user has provided their correct birthday.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "birthday": {
                                "type": "string",
                                "description": "The user's birthdate, including the year. The user can provide it in any format, but convert it to YYYY-MM-DD format to call this function.",
                            }},
                    },
                },
            }])
        # Create an allowlist of functions that the LLM can call.
        # NOTE: nothing in this class consults the list.
        self._functions = [
            "verify_birthday",
            "list_prescriptions",
            "list_allergies",
            "list_conditions",
            "list_visit_reasons",
        ]

    async def verify_birthday(self, llm, args):
        """Check the supplied birthday and advance to the prescriptions step.

        Args:
            llm: LLM service; used to push the confirmation sound downstream.
            args: Function-call arguments from the model. ``birthday`` is
                expected in YYYY-MM-DD format.

        Returns:
            A one-element list holding the system message that should
            re-prompt the model: the next-step instruction on success, or a
            request to ask for the birthday again on failure.
        """
        # The tool schema does not mark "birthday" as required, so the model
        # may omit it; a missing value is treated like a wrong answer instead
        # of raising KeyError.
        if args.get("birthday") == "1983-01-01":
            self._context.set_tools(
                [
                    {
                        "type": "function",
                        "function": {
                            "name": "list_prescriptions",
                            "description": "Once the user has provided a list of their prescription medications, call this function.",
                            "parameters": {
                                "type": "object",
                                "properties": {
                                    "prescriptions": {
                                        "type": "array",
                                        "items": {
                                            "type": "object",
                                            "properties": {
                                                "medication": {
                                                    "type": "string",
                                                    "description": "The medication's name",
                                                },
                                                "dosage": {
                                                    "type": "string",
                                                    "description": "The prescription's dosage",
                                                },
                                            },
                                        },
                                    }},
                            },
                        },
                    }])
            # It's a bit weird to push this to the LLM, but it gets it into the pipeline
            await llm.push_frame(sounds["ding2.wav"], FrameDirection.DOWNSTREAM)
            # We don't need the function call in the context, so just return a new
            # system message and let the framework re-prompt
            return [{"role": "system", "content": "Next, thank the user for confirming their identity, then ask the user to list their current prescriptions. Each prescription needs to have a medication name and a dosage. Do not call the list_prescriptions function with any unknown dosages."}]
        else:
            # The user provided an incorrect birthday; ask them to try again
            return [{"role": "system", "content": "The user provided an incorrect birthday. Ask them for their birthday again. When they answer, call the verify_birthday function."}]

    async def start_prescriptions(self, llm):
        """Start callback for ``list_prescriptions``: move on to allergies."""
        print("!!! doing start prescriptions")
        # Move on to allergies
        self._context.set_tools(
            [
                {
                    "type": "function",
                    "function": {
                        "name": "list_allergies",
                        "description": "Once the user has provided a list of their allergies, call this function.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "allergies": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "name": {
                                                "type": "string",
                                                "description": "What the user is allergic to",
                                            }},
                                    },
                                }},
                        },
                    },
                }])
        self._context.add_message(
            {
                "role": "system",
                "content": "Next, ask the user if they have any allergies. Once they have listed their allergies or confirmed they don't have any, call the list_allergies function."})
        print("!!! about to await llm process frame in start prescrpitions")
        # Feed the updated context straight back into the LLM service.
        await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)
        print("!!! past await process frame in start prescriptions")

    async def start_allergies(self, llm):
        """Start callback for ``list_allergies``: move on to conditions."""
        print("!!! doing start allergies")
        # Move on to conditions
        self._context.set_tools(
            [
                {
                    "type": "function",
                    "function": {
                        "name": "list_conditions",
                        "description": "Once the user has provided a list of their medical conditions, call this function.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "conditions": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "name": {
                                                "type": "string",
                                                "description": "The user's medical condition",
                                            }},
                                    },
                                }},
                        },
                    },
                },
            ])
        self._context.add_message(
            {
                "role": "system",
                "content": "Now ask the user if they have any medical conditions the doctor should know about. Once they've answered the question, call the list_conditions function."})
        await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)

    async def start_conditions(self, llm):
        """Start callback for ``list_conditions``: move on to visit reasons."""
        print("!!! doing start conditions")
        # Move on to visit reasons
        self._context.set_tools(
            [
                {
                    "type": "function",
                    "function": {
                        "name": "list_visit_reasons",
                        "description": "Once the user has provided a list of the reasons they are visiting a doctor today, call this function.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "visit_reasons": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "name": {
                                                "type": "string",
                                                "description": "The user's reason for visiting the doctor",
                                            }},
                                    },
                                }},
                        },
                    },
                }])
        self._context.add_message(
            {"role": "system", "content": "Finally, ask the user the reason for their doctor visit today. Once they answer, call the list_visit_reasons function."})
        await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)

    async def start_visit_reasons(self, llm):
        """Start callback for ``list_visit_reasons``: wrap up the call."""
        print("!!! doing start visit reasons")
        # move to finish call
        self._context.set_tools([])
        self._context.add_message({"role": "system",
                                   "content": "Now, thank the user and end the conversation."})
        await llm.process_frame(OpenAILLMContextFrame(self._context), FrameDirection.DOWNSTREAM)

    async def save_data(self, llm, args):
        """Log the arguments collected by a ``list_*`` function call."""
        logger.info(f"!!! Saving data: {args}")
        # Since this is supposed to be "async", returning None from the callback
        # will prevent adding anything to context or re-prompting
        return None
|
||||
|
||||
|
||||
async def main(room_url: str, token):
    """Run the patient-intake bot in the given Daily room.

    Frames flow: transport input -> user context aggregator -> OpenAI LLM ->
    frame logger -> ElevenLabs TTS -> transport output -> assistant context
    aggregator. An ``IntakeProcessor`` supplies the LLM function handlers.

    Args:
        room_url: URL of the Daily room to join.
        token: Meeting token passed to the Daily transport.
    """
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Chatbot",
            DailyParams(
                audio_out_enabled=True,
                camera_out_enabled=True,
                camera_out_width=1024,
                camera_out_height=576,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                transcription_enabled=True,
                #
                # Spanish
                #
                # transcription_settings=DailyTranscriptionSettings(
                #     language="es",
                #     tier="nova",
                #     model="2-general"
                # )
            ),
        )

        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            #
            # English
            #
            voice_id="pNInz6obpgDQGcFmaJgB",
            #
            # Spanish
            #
            # model="eleven_multilingual_v2",
            # voice_id="gD1IexrzCvsXPHUuT0s3",
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        # The context starts empty; IntakeProcessor adds the system prompt
        # and the first tool when it is constructed.
        messages = []
        context = OpenAILLMContext(messages=messages)
        user_context = LLMUserContextAggregator(context)
        assistant_context = LLMAssistantContextAggregator(context)
        intake = IntakeProcessor(context, llm)

        llm.register_function("verify_birthday", intake.verify_birthday)
        # Every list_* function shares save_data as its handler and uses its
        # own start callback.
        step_callbacks = (
            ("list_prescriptions", intake.start_prescriptions),
            ("list_allergies", intake.start_allergies),
            ("list_conditions", intake.start_conditions),
            ("list_visit_reasons", intake.start_visit_reasons),
        )
        for function_name, on_start in step_callbacks:
            llm.register_function(function_name, intake.save_data, start_callback=on_start)

        fl = FrameLogger("LLM Output")

        stages = [
            transport.input(),
            user_context,
            llm,
            fl,
            tts,
            transport.output(),
            assistant_context,
        ]
        pipeline = Pipeline(stages)

        task = PipelineTask(pipeline, allow_interruptions=False)

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
            print(f"Context is: {context}")
            # Queue the prepared context to kick off the conversation.
            opening = OpenAILLMContextFrame(context)
            await task.queue_frames([opening])

        runner = PipelineRunner()
        await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Resolve the room URL and meeting token, then run the bot until it exits.
    room_url, room_token = configure()
    asyncio.run(main(room_url, room_token))
|
||||
4
examples/patient-intake/env.example
Normal file
4
examples/patient-intake/env.example
Normal file
@@ -0,0 +1,4 @@
|
||||
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (for joining the bot to the same room repeatedly for local dev)
|
||||
DAILY_API_KEY=7df...
|
||||
OPENAI_API_KEY=sk-PL...
|
||||
ELEVENLABS_API_KEY=aeb...
|
||||
BIN
examples/patient-intake/image.png
Normal file
BIN
examples/patient-intake/image.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 733 KiB |
5
examples/patient-intake/requirements.txt
Normal file
5
examples/patient-intake/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
python-dotenv
|
||||
requests
|
||||
fastapi[all]
|
||||
uvicorn
|
||||
pipecat-ai[daily,openai,silero]
|
||||
58
examples/patient-intake/runner.py
Normal file
58
examples/patient-intake/runner.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
|
||||
def configure():
    """Resolve the Daily room URL and create an owner meeting token for it.

    The room URL is taken from ``-u/--url`` or, failing that, from the
    ``DAILY_SAMPLE_ROOM_URL`` environment variable. The API key is taken from
    ``-k/--apikey`` or ``DAILY_API_KEY``. A meeting token that expires one
    hour from now is then requested from the Daily REST API.

    Returns:
        tuple: ``(url, token)`` -- the room URL and the newly created token.

    Raises:
        Exception: If no room URL or no API key could be determined, or if the
            token request does not answer with HTTP 200.
        requests.RequestException: If the HTTP request itself fails or times
            out (propagated unchanged from ``requests``).
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    import urllib.parse

    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=False,
        help="URL of the Daily room to join")
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=False,
        help="Daily API Key (needed to create an owner token for the room)",
    )

    # parse_known_args() tolerates flags this parser does not declare, so the
    # calling script may define additional options of its own.
    args, _unknown = parser.parse_known_args()

    url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
    key = args.apikey or os.getenv("DAILY_API_KEY")

    if not url:
        raise Exception(
            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")

    if not key:
        raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")

    # Create a meeting token for the given room with an expiration 1 hour in
    # the future. The room name is the URL path without its surrounding
    # slashes, so a trailing "/" in the URL does not end up in the name.
    room_name: str = urllib.parse.urlparse(url).path.strip("/")
    expiration: float = time.time() + 60 * 60

    res: requests.Response = requests.post(
        "https://api.daily.co/v1/meeting-tokens",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "properties": {
                "room_name": room_name,
                "is_owner": True,
                "exp": expiration,
            }
        },
        # Without a timeout a stalled connection would block start-up forever.
        timeout=10,
    )

    if res.status_code != 200:
        raise Exception(
            f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    return (url, token)
|
||||
124
examples/patient-intake/server.py
Normal file
124
examples/patient-intake/server.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
import atexit
|
||||
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, RedirectResponse
|
||||
|
||||
from utils.daily_helpers import create_room as _create_room, get_token
|
||||
|
||||
MAX_BOTS_PER_ROOM = 1

# Bot sub-process dict for status reporting and concurrency control.
# Keyed by the child's pid; each value is a (subprocess.Popen, room_url) tuple.
bot_procs = {}


def cleanup():
    """Terminate and reap every bot subprocess tracked in ``bot_procs``.

    Registered with ``atexit`` below as a safety net so that bots do not
    outlive the server process.
    """
    # Values are (Popen, room_url) tuples, so unpack before touching the
    # process handle; calling terminate() on the tuple itself would raise.
    for proc, _room_url in bot_procs.values():
        proc.terminate()
        proc.wait()


atexit.register(cleanup)
|
||||
|
||||
|
||||
# Application object that the route decorators in this module attach to.
app = FastAPI()

# CORS is configured fully open: every origin, method and header is accepted
# and credentials are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
|
||||
|
||||
|
||||
@app.get("/start")
async def start_agent(request: Request):
    """Create a Daily room, start a bot subprocess for it and redirect there.

    Args:
        request: The incoming request (not inspected by this handler).

    Returns:
        RedirectResponse: A redirect to the URL of the newly created room.

    Raises:
        HTTPException: With status 500 when no room URL was obtained, the
            per-room bot limit is reached, no token was obtained, or the bot
            subprocess could not be started.
    """
    print("!!! Creating room")
    room_url, _room_name = _create_room()
    print(f"!!! Room URL: {room_url}")
    # Ensure the room property is present
    if not room_url:
        raise HTTPException(
            status_code=500,
            detail="Missing 'room' property in request data. Cannot start agent without a target room!")

    # Check if there is already an existing process running in this room.
    # bot_procs values are (Popen, room_url) tuples; poll() is None while the
    # child is still alive.
    num_bots_in_room = sum(
        1
        for proc, proc_room in bot_procs.values()
        if proc_room == room_url and proc.poll() is None)
    if num_bots_in_room >= MAX_BOTS_PER_ROOM:
        raise HTTPException(
            status_code=500, detail=f"Max bot limited reach for room: {room_url}")

    # Get the token for the room
    token = get_token(room_url)

    if not token:
        raise HTTPException(
            status_code=500, detail=f"Failed to get token for room: {room_url}")

    # Spawn a new agent, and join the user session
    # Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
    try:
        # Pass the arguments as a list and skip the shell, so the room URL and
        # token are never interpreted as shell syntax.
        proc = subprocess.Popen(
            ["python3", "-m", "bot", "-u", room_url, "-t", token],
            bufsize=1,
            cwd=os.path.dirname(os.path.abspath(__file__)),
        )
        bot_procs[proc.pid] = (proc, room_url)
    except Exception as e:
        # Chain the original error so the traceback keeps the real cause.
        raise HTTPException(
            status_code=500, detail=f"Failed to start subprocess: {e}") from e

    return RedirectResponse(room_url)
|
||||
|
||||
|
||||
@app.get("/status/{pid}")
def get_status(pid: int):
    """Report whether the bot subprocess with the given pid is still running.

    Args:
        pid: Process id under which the bot was stored in ``bot_procs``.

    Returns:
        JSONResponse: ``{"bot_id": pid, "status": "running" | "finished"}``.

    Raises:
        HTTPException: With status 404 when no bot is tracked under ``pid``.
    """
    entry = bot_procs.get(pid)

    # Unknown pid: nothing was ever recorded under this key.
    if not entry:
        raise HTTPException(
            status_code=404, detail=f"Bot with process id: {pid} not found")

    # The first element of the stored tuple is the process handle; poll()
    # yields None for as long as the child has not exited.
    child = entry[0]
    status = "running" if child.poll() is None else "finished"

    return JSONResponse({"bot_id": pid, "status": status})
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn

    # Defaults come from the environment; the command-line flags below can
    # override them.
    default_host = os.getenv("HOST", "0.0.0.0")
    default_port = int(os.getenv("FAST_API_PORT", "7860"))

    parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server")
    parser.add_argument("--host", type=str, default=default_host, help="Host address")
    parser.add_argument("--port", type=int, default=default_port, help="Port number")
    parser.add_argument("--reload", action="store_true", help="Reload code on change")
    config = parser.parse_args()

    print(f"to join a test room, visit http://localhost:{config.port}/start")

    # The app is passed as an import string ("module:attribute").
    uvicorn.run(
        "server:app",
        host=config.host,
        port=config.port,
        reload=config.reload,
    )
|
||||
109
examples/patient-intake/utils/daily_helpers.py
Normal file
109
examples/patient-intake/utils/daily_helpers.py
Normal file
@@ -0,0 +1,109 @@
|
||||
|
||||
import urllib.parse
|
||||
import os
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
from dotenv import load_dotenv
|
||||
# Load variables from a .env file (if any) before the settings below are read.
load_dotenv()

# Host and path prefix of the Daily REST API, without the scheme. An unset or
# empty DAILY_API_URL falls back to the public endpoint.
daily_api_path = os.environ.get("DAILY_API_URL") or "api.daily.co/v1"
# API key sent as the Bearer token; None when DAILY_API_KEY is not set.
daily_api_key = os.environ.get("DAILY_API_KEY")
|
||||
|
||||
|
||||
def create_room() -> tuple[str, str]:
    """
    Helper function to create a Daily room.
    # See: https://docs.daily.co/reference/rest-api/rooms

    The room is requested with a one hour expiry, chat and emoji reactions
    enabled, ejection at expiry, and the prejoin UI disabled.

    Returns:
        tuple: A tuple containing the room URL and room name.

    Raises:
        Exception: If no Daily API key is specified, if the request to create
            the room fails, or if the response does not contain the room URL
            or room name.
        requests.RequestException: If the HTTP request itself fails or times
            out (propagated unchanged from ``requests``).
    """
    # Fail early with a clear message (same check as get_token) instead of
    # sending "Bearer None" to the API.
    if not daily_api_key:
        raise Exception(
            "No Daily API key specified. set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")

    room_props = {
        "exp": time.time() + 60 * 60,  # 1 hour
        "enable_chat": True,
        "enable_emoji_reactions": True,
        "eject_at_room_exp": True,
        "enable_prejoin_ui": False,  # Important for the bot to be able to join headlessly
    }
    res = requests.post(
        f"https://{daily_api_path}/rooms",
        headers={"Authorization": f"Bearer {daily_api_key}"},
        json={"properties": room_props},
        # Without a timeout a stalled connection would block the caller forever.
        timeout=10,
    )
    if res.status_code != 200:
        raise Exception(f"Unable to create room: {res.text}")

    data = res.json()
    room_url: str = data.get("url")
    room_name: str = data.get("name")
    if room_url is None or room_name is None:
        raise Exception("Missing room URL or room name in response")

    return room_url, room_name
|
||||
|
||||
|
||||
def get_name_from_url(room_url: str) -> str:
    """
    Extracts the name from a given room URL.

    The name is the path component of the URL with its leading and trailing
    slashes removed; query string and fragment are ignored.

    Args:
        room_url (str): The URL of the room.

    Returns:
        str: The extracted name from the room URL (empty if the URL has no path).
    """
    # strip("/") instead of slicing off the first character, so a URL written
    # with a trailing slash still yields the bare room name.
    return urllib.parse.urlparse(room_url).path.strip("/")
|
||||
|
||||
|
||||
def get_token(room_url: str) -> str:
    """
    Retrieves a meeting token for the specified Daily room URL.
    # See: https://docs.daily.co/reference/rest-api/meeting-tokens

    The token is requested as an owner token that expires one hour from now.

    Args:
        room_url (str): The URL of the Daily room.

    Returns:
        str: The meeting token.

    Raises:
        Exception: If no room URL is specified or if no Daily API key is specified.
        Exception: If there is an error creating the meeting token.
        requests.RequestException: If the HTTP request itself fails or times
            out (propagated unchanged from ``requests``).
    """
    if not room_url:
        raise Exception(
            "No Daily room specified. You must specify a Daily room in order a token to be generated.")

    if not daily_api_key:
        raise Exception(
            "No Daily API key specified. set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")

    expiration: float = time.time() + 60 * 60
    room_name = get_name_from_url(room_url)

    res: requests.Response = requests.post(
        f"https://{daily_api_path}/meeting-tokens",
        headers={"Authorization": f"Bearer {daily_api_key}"},
        json={
            "properties": {
                "room_name": room_name,
                "is_owner": True,  # Owner tokens required for transcription
                "exp": expiration,
            }
        },
        # Without a timeout a stalled connection would block the caller forever.
        timeout=10,
    )

    if res.status_code != 200:
        raise Exception(
            f"Failed to create meeting token: {res.status_code} {res.text}")

    token: str = res.json()["token"]

    return token
|
||||
@@ -7,8 +7,8 @@ from PIL import Image
|
||||
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserResponseAggregator
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
ImageRawFrame,
|
||||
@@ -20,8 +20,8 @@ from pipecat.frames.frames import (
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.vad.silero import SileroVAD
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTranscriptionSettings, DailyTransport
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -56,7 +56,7 @@ talking_frame = SpriteFrame(images=sprites)
|
||||
class TalkingAnimation(FrameProcessor):
|
||||
"""
|
||||
This class starts a talking animation when it receives the first AudioFrame,
|
||||
and then returns to a "quiet" sprite when it sees a LLMResponseEndFrame.
|
||||
and then returns to a "quiet" sprite when it sees a TTSStoppedFrame.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -82,21 +82,37 @@ async def main(room_url: str, token):
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_width=1024,
|
||||
camera_out_height=576,
|
||||
transcription_enabled=True
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
transcription_enabled=True,
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# transcription_settings=DailyTranscriptionSettings(
|
||||
# language="es",
|
||||
# tier="nova",
|
||||
# model="2-general"
|
||||
# )
|
||||
)
|
||||
)
|
||||
|
||||
vad = SileroVAD()
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
#
|
||||
# English
|
||||
#
|
||||
voice_id="pNInz6obpgDQGcFmaJgB",
|
||||
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# model="eleven_multilingual_v2",
|
||||
# voice_id="gD1IexrzCvsXPHUuT0s3",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
@@ -106,18 +122,34 @@ async def main(room_url: str, token):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
#
|
||||
# English
|
||||
#
|
||||
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
|
||||
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
|
||||
},
|
||||
]
|
||||
|
||||
user_response = LLMUserResponseAggregator()
|
||||
assistant_response = LLMAssistantResponseAggregator()
|
||||
|
||||
ta = TalkingAnimation()
|
||||
|
||||
pipeline = Pipeline([transport.input(), vad, user_response,
|
||||
llm, tts, ta, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
user_response,
|
||||
llm,
|
||||
tts,
|
||||
ta,
|
||||
transport.output(),
|
||||
assistant_response,
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))
|
||||
await task.queue_frame(quiet_frame)
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
|
||||
@@ -2,4 +2,4 @@ python-dotenv
|
||||
requests
|
||||
fastapi[all]
|
||||
uvicorn
|
||||
pipecat-ai[daily,openai]
|
||||
pipecat-ai[daily,openai,silero]
|
||||
|
||||
@@ -37,6 +37,8 @@ Adds pictures to our story (really fast!) Prompting is quite key for style consi
|
||||
**Install requirements**
|
||||
|
||||
```shell
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@@ -52,7 +54,7 @@ This project uses a custom frontend, which needs to built. Note: this is done au
|
||||
|
||||
```shell
|
||||
cd frontend/
|
||||
npm install / yarn
|
||||
npm install
|
||||
npm run build
|
||||
```
|
||||
|
||||
@@ -68,12 +70,7 @@ If you'd like to run a custom domain or port:
|
||||
|
||||
`python src/server.py --host somehost --p 7777`
|
||||
|
||||
➡️ Open the host URL in your browser
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Whilst working on the frontend code, please `yarn run dev`
|
||||
> and open the NextJS hosted service vs. the Python server.
|
||||
> (Usually localhost:3000.)
|
||||
➡️ Open the host URL in your browser `http://localhost:7860`
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
ELEVENLABS_API_KEY=
|
||||
ELEVENLABS_VOICE_ID=
|
||||
FAL_KEY=
|
||||
DAILY_API_URL=api.daily.co/v1
|
||||
DAILY_API_KEY=
|
||||
OPENAI_API_KEY=
|
||||
DAILY_API_KEY=7df...
|
||||
ELEVENLABS_API_KEY=aeb...
|
||||
ELEVENLABS_VOICE_ID=7S...
|
||||
FAL_KEY=8c...
|
||||
OPENAI_API_KEY=sk-PL...
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
dailyai[daily,openai,fal]==0.0.8
|
||||
async_timeout
|
||||
fastapi
|
||||
uvicorn
|
||||
requests
|
||||
python-dotenv
|
||||
python-dotenv
|
||||
pipecat-ai[daily,openai,fal]
|
||||
|
||||
@@ -1,37 +1,31 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frames import (
|
||||
AudioFrame,
|
||||
ImageFrame,
|
||||
EndPipeFrame,
|
||||
LLMMessagesFrame,
|
||||
SendAppMessageFrame
|
||||
)
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMUserResponseAggregator,
|
||||
LLMAssistantResponseAggregator,
|
||||
)
|
||||
from dailyai.transports.daily_transport import DailyTransport
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
|
||||
from pipecat.frames.frames import LLMMessagesFrame, StopTaskFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.fal import FalImageGenService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
|
||||
|
||||
from processors import StoryProcessor, StoryImageProcessor
|
||||
from prompts import LLM_BASE_PROMPT, LLM_INTRO_PROMPT, CUE_USER_TURN
|
||||
from utils.helpers import load_sounds, load_images
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"[STORYBOT] %(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
sounds = load_sounds(["listening.wav"])
|
||||
images = load_images(["book1.png", "book2.png"])
|
||||
@@ -46,16 +40,14 @@ async def main(room_url, token=None):
|
||||
room_url,
|
||||
token,
|
||||
"Storytelling Bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
vad_enabled=True,
|
||||
camera_framerate=30,
|
||||
camera_bitrate=680000,
|
||||
camera_enabled=True,
|
||||
camera_width=768,
|
||||
camera_height=768,
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
camera_out_enabled=True,
|
||||
camera_out_width=768,
|
||||
camera_out_height=768,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
)
|
||||
)
|
||||
|
||||
logger.debug("Transport created for room:" + room_url)
|
||||
@@ -103,68 +95,54 @@ async def main(room_url, token=None):
|
||||
|
||||
# -------------- Story Loop ------------- #
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
# The intro pipeline is used to start
|
||||
# the story (as per LLM_INTRO_PROMPT)
|
||||
intro_pipeline = Pipeline([llm_service, tts_service, transport.output()])
|
||||
|
||||
intro_task = PipelineTask(intro_pipeline)
|
||||
|
||||
logger.debug("Waiting for participant...")
|
||||
|
||||
start_storytime_event = asyncio.Event()
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport, participant):
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
logger.debug("Participant joined, storytime commence!")
|
||||
start_storytime_event.set()
|
||||
|
||||
# The storytime coroutine will wait for the start_storytime_event
|
||||
# to be set before starting the storytime pipeline
|
||||
async def storytime():
|
||||
await start_storytime_event.wait()
|
||||
|
||||
# The intro pipeline is used to start
|
||||
# the story (as per LLM_INTRO_PROMPT)
|
||||
intro_pipeline = Pipeline(processors=[
|
||||
llm_service,
|
||||
tts_service,
|
||||
], sink=transport.send_queue)
|
||||
|
||||
await intro_pipeline.queue_frames(
|
||||
transport.capture_participant_transcription(participant["id"])
|
||||
await intro_task.queue_frames(
|
||||
[
|
||||
ImageFrame(images['book1'], (768, 768)),
|
||||
images['book1'],
|
||||
LLMMessagesFrame([LLM_INTRO_PROMPT]),
|
||||
SendAppMessageFrame(CUE_USER_TURN, None),
|
||||
AudioFrame(sounds["listening"]),
|
||||
ImageFrame(images['book2'], (768, 768)),
|
||||
EndPipeFrame(),
|
||||
DailyTransportMessageFrame(CUE_USER_TURN),
|
||||
sounds["listening"],
|
||||
images['book2'],
|
||||
StopTaskFrame()
|
||||
]
|
||||
)
|
||||
|
||||
# We start the pipeline as soon as the user joins
|
||||
await intro_pipeline.run_pipeline()
|
||||
# We run the intro pipeline. This will start the transport. The intro
|
||||
# task will exit after StopTaskFrame is processed.
|
||||
await runner.run(intro_task)
|
||||
|
||||
# The main story pipeline is used to continue the
|
||||
# story based on user input
|
||||
pipeline = Pipeline(processors=[
|
||||
user_responses,
|
||||
llm_service,
|
||||
story_processor,
|
||||
image_processor,
|
||||
tts_service,
|
||||
llm_responses,
|
||||
])
|
||||
# The main story pipeline is used to continue the story based on user
|
||||
# input.
|
||||
main_pipeline = Pipeline([
|
||||
transport.input(),
|
||||
user_responses,
|
||||
llm_service,
|
||||
story_processor,
|
||||
image_processor,
|
||||
tts_service,
|
||||
transport.output(),
|
||||
llm_responses
|
||||
])
|
||||
|
||||
await transport.run_pipeline(pipeline)
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
try:
|
||||
await asyncio.gather(transport.run(), storytime())
|
||||
except (asyncio.CancelledError, KeyboardInterrupt):
|
||||
transport.stop()
|
||||
|
||||
logger.debug("Pipeline finished. Exiting.")
|
||||
main_task = PipelineTask(main_pipeline)
|
||||
|
||||
await runner.run(main_task)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Daily Storyteller Bot")
|
||||
parser = argparse.ArgumentParser(description="Daily Storyteller Bot")
|
||||
parser.add_argument("-u", type=str, help="Room URL")
|
||||
parser.add_argument("-t", type=str, help="Token")
|
||||
config = parser.parse_args()
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
from typing import AsyncGenerator
|
||||
import re
|
||||
|
||||
from dailyai.pipeline.frames import TextFrame, Frame, AudioFrame
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.pipeline.frames import (
|
||||
from async_timeout import timeout
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
TextFrame,
|
||||
SendAppMessageFrame,
|
||||
LLMResponseEndFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
UserStoppedSpeakingFrame)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transports.services.daily import DailyTransportMessageFrame
|
||||
|
||||
from utils.helpers import load_sounds
|
||||
from prompts import IMAGE_GEN_PROMPT, CUE_USER_TURN, CUE_ASSISTANT_TURN
|
||||
import asyncio
|
||||
|
||||
sounds = load_sounds(["talking.wav", "listening.wav", "ding.wav"])
|
||||
|
||||
@@ -42,7 +40,7 @@ class StoryImageProcessor(FrameProcessor):
|
||||
Processor for image prompt frames that will be sent to the FAL service.
|
||||
|
||||
This processor is responsible for consuming frames of type `StoryImageFrame`.
|
||||
It processes the by passing it to the FAL service
|
||||
It processes them by passing it to the FAL service.
|
||||
The processed frames are then yielded back.
|
||||
|
||||
Attributes:
|
||||
@@ -50,25 +48,26 @@ class StoryImageProcessor(FrameProcessor):
|
||||
"""
|
||||
|
||||
def __init__(self, fal_service):
|
||||
super().__init__()
|
||||
self._fal_service = fal_service
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, StoryImageFrame):
|
||||
try:
|
||||
async with asyncio.timeout(7):
|
||||
async for i in self._fal_service.process_frame(TextFrame(IMAGE_GEN_PROMPT % frame.text)):
|
||||
yield i
|
||||
async with timeout(7):
|
||||
async for i in self._fal_service.run_image_gen(IMAGE_GEN_PROMPT % frame.text):
|
||||
await self.push_frame(i)
|
||||
except TimeoutError:
|
||||
pass
|
||||
pass
|
||||
else:
|
||||
yield frame
|
||||
await self.push_frame(frame)
|
||||
|
||||
|
||||
class StoryProcessor(FrameProcessor):
|
||||
"""
|
||||
Primary frame processor. It takes the frames generated by the LLM
|
||||
and processes them into image prompts and story pages (sentences.)
|
||||
and processes them into image prompts and story pages (sentences).
|
||||
For a clearer picture of how this works, reference prompts.py
|
||||
|
||||
Attributes:
|
||||
@@ -81,15 +80,16 @@ class StoryProcessor(FrameProcessor):
|
||||
"""
|
||||
|
||||
def __init__(self, messages, story):
|
||||
super().__init__()
|
||||
self._messages = messages
|
||||
self._text = ""
|
||||
self._story = story
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Send an app message to the UI
|
||||
yield SendAppMessageFrame(CUE_ASSISTANT_TURN, None)
|
||||
yield AudioFrame(sounds["talking"])
|
||||
await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
|
||||
await self.push_frame(sounds["talking"])
|
||||
|
||||
elif isinstance(frame, TextFrame):
|
||||
# We want to look for sentence breaks in the text
|
||||
@@ -111,7 +111,7 @@ class StoryProcessor(FrameProcessor):
|
||||
# Remove the image prompt from the text
|
||||
self._text = re.sub(r"<.*?>", '', self._text, count=1)
|
||||
# Process the image prompt frame
|
||||
yield StoryImageFrame(image_prompt)
|
||||
await self.push_frame(StoryImageFrame(image_prompt))
|
||||
|
||||
# STORY PAGE
|
||||
# Looking for: [break] in the LLM response
|
||||
@@ -126,23 +126,23 @@ class StoryProcessor(FrameProcessor):
|
||||
if len(self._text) > 2:
|
||||
# Append the sentence to the story
|
||||
self._story.append(self._text)
|
||||
yield StoryPageFrame(self._text)
|
||||
await self.push_frame(StoryPageFrame(self._text))
|
||||
# Assert that it's the LLMs turn, until we're finished
|
||||
yield SendAppMessageFrame(CUE_ASSISTANT_TURN, None)
|
||||
await self.push_frame(DailyTransportMessageFrame(CUE_ASSISTANT_TURN))
|
||||
# Clear the buffer
|
||||
self._text = ""
|
||||
|
||||
# End of LLM response
|
||||
# End of a full LLM response
|
||||
# Driven by the prompt, the LLM should have asked the user for input
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
# We use a different frame type, as to avoid image generation ingest
|
||||
yield StoryPromptFrame(self._text)
|
||||
await self.push_frame(StoryPromptFrame(self._text))
|
||||
self._text = ""
|
||||
yield frame
|
||||
await self.push_frame(frame)
|
||||
# Send an app message to the UI
|
||||
yield SendAppMessageFrame(CUE_USER_TURN, None)
|
||||
yield AudioFrame(sounds["listening"])
|
||||
await self.push_frame(DailyTransportMessageFrame(CUE_USER_TURN))
|
||||
await self.push_frame(sounds["listening"])
|
||||
|
||||
# Anything that is not a TextFrame pass through
|
||||
else:
|
||||
yield frame
|
||||
await self.push_frame(frame)
|
||||
|
||||
@@ -3,7 +3,7 @@ LLM_INTRO_PROMPT = {
|
||||
"content": "You are a creative storyteller who loves to tell whimsical, fantastical stories. \
|
||||
Your goal is to craft an engaging and fun story. \
|
||||
Start by asking the user what kind of story they'd like to hear. Don't provide any examples. \
|
||||
Keep your reponse to only a few sentences."
|
||||
Keep your response to only a few sentences."
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from typing import Optional
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
|
||||
from utils.daily_helpers import create_room as _create_room, get_token, get_name_from_url
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
daily_api_path = os.getenv("DAILY_API_URL")
|
||||
daily_api_path = os.getenv("DAILY_API_URL") or "api.daily.co/v1"
|
||||
daily_api_key = os.getenv("DAILY_API_KEY")
|
||||
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@ import os
|
||||
import wave
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
|
||||
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
|
||||
@@ -14,7 +16,7 @@ def load_images(image_files):
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the image and convert it to bytes
|
||||
with Image.open(full_path) as img:
|
||||
images[filename] = img.tobytes()
|
||||
images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
|
||||
return images
|
||||
|
||||
|
||||
@@ -28,6 +30,8 @@ def load_sounds(sound_files):
|
||||
filename = os.path.splitext(os.path.basename(full_path))[0]
|
||||
# Open the sound and convert it to bytes
|
||||
with wave.open(full_path) as audio_file:
|
||||
sounds[filename] = audio_file.readframes(-1)
|
||||
sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1),
|
||||
sample_rate=audio_file.getframerate(),
|
||||
num_channels=audio_file.getnchannels())
|
||||
|
||||
return sounds
|
||||
|
||||
@@ -3,7 +3,7 @@ import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.frames.frames import Frame, InterimTranscriptionFrame, LLMMessagesFrame, TextFrame, TranscriptionFrame, TransportMessageFrame
|
||||
from pipecat.frames.frames import Frame, LLMMessagesFrame, TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
@@ -12,7 +12,7 @@ from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.azure import AzureTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTranscriptionSettings, DailyTransport, DailyTransportMessageFrame
|
||||
|
||||
from runner import configure
|
||||
|
||||
@@ -84,7 +84,9 @@ async def main(room_url: str, token):
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
transcription_interim_results=False,
|
||||
transcription_settings=DailyTranscriptionSettings(extra={
|
||||
"interim_results": False
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
@@ -103,7 +105,16 @@ async def main(room_url: str, token):
|
||||
lfra = LLMFullResponseAggregator()
|
||||
ts = TranslationSubtitles("spanish")
|
||||
|
||||
pipeline = Pipeline([transport.input(), sa, tp, llm, lfra, ts, tts, transport.output()])
|
||||
pipeline = Pipeline([
|
||||
transport.input(),
|
||||
sa,
|
||||
tp,
|
||||
llm,
|
||||
lfra,
|
||||
ts,
|
||||
tts,
|
||||
transport.output()
|
||||
])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiohttp==3.9.5
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
annotated-types==0.6.0
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anthropic==0.25.8
|
||||
# via pipecat (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
anthropic==0.25.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
anyio==4.4.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
@@ -24,9 +24,11 @@ attrs==23.2.0
|
||||
av==12.0.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.37.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# httpcore
|
||||
@@ -40,20 +42,20 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.2.1
|
||||
# via faster-whisper
|
||||
daily-python==0.7.4
|
||||
# via pipecat (pyproject.toml)
|
||||
daily-python==0.9.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
einops==0.8.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
exceptiongroup==1.2.1
|
||||
# via anyio
|
||||
fal-client==0.4.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.14.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
@@ -64,25 +66,60 @@ filelock==3.14.0
|
||||
flask==3.0.3
|
||||
# via
|
||||
# flask-cors
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
flask-cors==4.0.1
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
grpcio==1.63.0
|
||||
# via pyht
|
||||
future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
# google-generativeai
|
||||
google-api-python-client==2.131.0
|
||||
# via google-generativeai
|
||||
google-auth==2.29.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
# google-generativeai
|
||||
google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.64.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
# pyht
|
||||
grpcio-status==1.62.2
|
||||
# via google-api-core
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
@@ -90,7 +127,7 @@ httpx==0.27.0
|
||||
# openai
|
||||
httpx-sse==0.4.0
|
||||
# via fal-client
|
||||
huggingface-hub==0.23.0
|
||||
huggingface-hub==0.23.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
@@ -111,7 +148,7 @@ jinja2==3.1.4
|
||||
# flask
|
||||
# torch
|
||||
loguru==0.7.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
@@ -128,7 +165,9 @@ numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# onnxruntime
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pyloudnorm
|
||||
# scipy
|
||||
# torchvision
|
||||
# transformers
|
||||
nvidia-cublas-cu12==12.1.3.1
|
||||
@@ -156,16 +195,16 @@ nvidia-cusparse-cu12==12.1.0.106
|
||||
# torch
|
||||
nvidia-nccl-cu12==2.20.5
|
||||
# via torch
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
nvidia-nvjitlink-cu12==12.5.40
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
nvidia-nvtx-cu12==12.1.105
|
||||
# via torch
|
||||
onnxruntime==1.17.3
|
||||
onnxruntime==1.18.0
|
||||
# via faster-whisper
|
||||
openai==1.26.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
packaging==24.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
@@ -173,41 +212,67 @@ packaging==24.0
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
# via
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# torchvision
|
||||
proto-plus==1.23.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
protobuf==4.25.3
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
# google-generativeai
|
||||
# googleapis-common-protos
|
||||
# grpcio-status
|
||||
# onnxruntime
|
||||
# proto-plus
|
||||
# pyht
|
||||
pyasn1==0.6.0
|
||||
# via
|
||||
# pyasn1-modules
|
||||
# rsa
|
||||
pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat (pyproject.toml)
|
||||
pydantic==2.7.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pydantic==2.7.2
|
||||
# via
|
||||
# anthropic
|
||||
# google-generativeai
|
||||
# openai
|
||||
pydantic-core==2.18.2
|
||||
pydantic-core==2.18.3
|
||||
# via pydantic
|
||||
pyht==0.0.28
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyloudnorm==0.1.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyparsing==3.1.2
|
||||
# via httplib2
|
||||
python-dotenv==1.0.1
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# timm
|
||||
# transformers
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
requests==2.32.2
|
||||
# via
|
||||
# google-api-core
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# transformers
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
# via pyloudnorm
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
@@ -219,7 +284,7 @@ sympy==1.12
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==0.9.16
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
tokenizers==0.19.1
|
||||
# via
|
||||
# anthropic
|
||||
@@ -227,37 +292,41 @@ tokenizers==0.19.1
|
||||
# transformers
|
||||
torch==2.3.0
|
||||
# via
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
# torchvision
|
||||
torchaudio==2.3.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
torchvision==0.18.0
|
||||
# via timm
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
triton==2.3.0
|
||||
# via torch
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# torch
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
websockets==12.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
werkzeug==3.0.3
|
||||
# via flask
|
||||
yarl==1.9.4
|
||||
|
||||
@@ -5,14 +5,16 @@
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiohttp==3.9.5
|
||||
# via pipecat (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# pipecat-ai (pyproject.toml)
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
annotated-types==0.6.0
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anthropic==0.25.8
|
||||
# via pipecat (pyproject.toml)
|
||||
anyio==4.3.0
|
||||
anthropic==0.25.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
anyio==4.4.0
|
||||
# via
|
||||
# anthropic
|
||||
# httpx
|
||||
@@ -21,17 +23,23 @@ async-timeout==4.0.3
|
||||
# via aiohttp
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
av==12.0.0
|
||||
av==12.1.0
|
||||
# via faster-whisper
|
||||
azure-cognitiveservices-speech==1.37.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
cartesia==0.1.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.16.0
|
||||
# via sounddevice
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
@@ -40,20 +48,22 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.2.1
|
||||
# via faster-whisper
|
||||
daily-python==0.7.4
|
||||
# via pipecat (pyproject.toml)
|
||||
daily-python==0.9.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
distro==1.9.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
einops==0.8.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
exceptiongroup==1.2.1
|
||||
# via anyio
|
||||
# via
|
||||
# anyio
|
||||
# pytest
|
||||
fal-client==0.4.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.14.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
@@ -63,33 +73,69 @@ filelock==3.14.0
|
||||
flask==3.0.3
|
||||
# via
|
||||
# flask-cors
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
flask-cors==4.0.1
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
grpcio==1.63.0
|
||||
# via pyht
|
||||
future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
# google-generativeai
|
||||
google-api-python-client==2.131.0
|
||||
# via google-generativeai
|
||||
google-auth==2.29.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
# google-generativeai
|
||||
google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.64.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
# pyht
|
||||
grpcio-status==1.62.2
|
||||
# via google-api-core
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# anthropic
|
||||
# cartesia
|
||||
# fal-client
|
||||
# openai
|
||||
httpx-sse==0.4.0
|
||||
# via fal-client
|
||||
huggingface-hub==0.23.0
|
||||
huggingface-hub==0.23.2
|
||||
# via
|
||||
# faster-whisper
|
||||
# timm
|
||||
@@ -103,6 +149,8 @@ idna==3.7
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
itsdangerous==2.2.0
|
||||
# via flask
|
||||
jinja2==3.1.4
|
||||
@@ -110,7 +158,7 @@ jinja2==3.1.4
|
||||
# flask
|
||||
# torch
|
||||
loguru==0.7.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
markupsafe==2.1.5
|
||||
# via
|
||||
# jinja2
|
||||
@@ -127,103 +175,151 @@ numpy==1.26.4
|
||||
# via
|
||||
# ctranslate2
|
||||
# onnxruntime
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pyloudnorm
|
||||
# scipy
|
||||
# torchvision
|
||||
# transformers
|
||||
onnxruntime==1.17.3
|
||||
onnxruntime==1.18.0
|
||||
# via faster-whisper
|
||||
openai==1.26.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
packaging==24.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# onnxruntime
|
||||
# pytest
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
# via
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# torchvision
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
proto-plus==1.23.0
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
protobuf==4.25.3
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-core
|
||||
# google-generativeai
|
||||
# googleapis-common-protos
|
||||
# grpcio-status
|
||||
# onnxruntime
|
||||
# proto-plus
|
||||
# pyht
|
||||
pyasn1==0.6.0
|
||||
# via
|
||||
# pyasn1-modules
|
||||
# rsa
|
||||
pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat (pyproject.toml)
|
||||
pydantic==2.7.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==2.7.2
|
||||
# via
|
||||
# anthropic
|
||||
# google-generativeai
|
||||
# openai
|
||||
pydantic-core==2.18.2
|
||||
pydantic-core==2.18.3
|
||||
# via pydantic
|
||||
pyht==0.0.28
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyloudnorm==0.1.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyparsing==3.1.2
|
||||
# via httplib2
|
||||
pytest==8.2.1
|
||||
# via pytest-asyncio
|
||||
pytest-asyncio==0.23.7
|
||||
# via cartesia
|
||||
python-dotenv==1.0.1
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# ctranslate2
|
||||
# huggingface-hub
|
||||
# timm
|
||||
# transformers
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via transformers
|
||||
requests==2.31.0
|
||||
requests==2.32.3
|
||||
# via
|
||||
# cartesia
|
||||
# google-api-core
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
# transformers
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
# via pyloudnorm
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
sympy==1.12
|
||||
sounddevice==0.4.7
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
sympy==1.12.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==0.9.16
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
tokenizers==0.19.1
|
||||
# via
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
# transformers
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
torch==2.3.0
|
||||
# via
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# timm
|
||||
# torchaudio
|
||||
# torchvision
|
||||
torchaudio==2.3.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
torchvision==0.18.0
|
||||
# via timm
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
# via pipecat (pyproject.toml)
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# google-generativeai
|
||||
# huggingface-hub
|
||||
# openai
|
||||
# pipecat (pyproject.toml)
|
||||
# pipecat-ai (pyproject.toml)
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# torch
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==2.2.1
|
||||
# via requests
|
||||
websockets==12.0
|
||||
# via pipecat (pyproject.toml)
|
||||
# via
|
||||
# cartesia
|
||||
# pipecat-ai (pyproject.toml)
|
||||
werkzeug==3.0.3
|
||||
# via flask
|
||||
yarl==1.9.4
|
||||
|
||||
@@ -24,6 +24,7 @@ dependencies = [
|
||||
"numpy~=1.26.4",
|
||||
"loguru~=0.7.0",
|
||||
"Pillow~=10.3.0",
|
||||
"pyloudnorm~=0.1.1",
|
||||
"typing-extensions~=4.11.0",
|
||||
]
|
||||
|
||||
@@ -34,9 +35,11 @@ Website = "https://pipecat.ai"
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic~=0.25.7" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
|
||||
daily = [ "daily-python~=0.7.4" ]
|
||||
cartesia = [ "numpy~=1.26.0", "sounddevice", "cartesia" ]
|
||||
daily = [ "daily-python~=0.9.0" ]
|
||||
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
|
||||
fal = [ "fal-client~=0.4.0" ]
|
||||
google = [ "google-generativeai~=0.5.3" ]
|
||||
fireworks = [ "openai~=1.26.0" ]
|
||||
local = [ "pyaudio~=0.2.0" ]
|
||||
moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ]
|
||||
|
||||
@@ -55,7 +55,7 @@ class ImageRawFrame(DataFrame):
|
||||
"""
|
||||
image: bytes
|
||||
size: Tuple[int, int]
|
||||
format: str
|
||||
format: str | None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(size: {self.size}, format: {self.format})"
|
||||
@@ -119,7 +119,7 @@ class TextFrame(DataFrame):
|
||||
text: str
|
||||
|
||||
def __str__(self):
|
||||
return f'{self.name}: "{self.text}"'
|
||||
return f"{self.name}(text: {self.text})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -132,7 +132,7 @@ class TranscriptionFrame(TextFrame):
|
||||
timestamp: str
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(user: {self.user_id}, timestamp: {self.timestamp})"
|
||||
return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -143,7 +143,7 @@ class InterimTranscriptionFrame(TextFrame):
|
||||
timestamp: str
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(user: {self.user_id}, timestamp: {self.timestamp})"
|
||||
return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -187,7 +187,7 @@ class SystemFrame(Frame):
|
||||
@dataclass
|
||||
class StartFrame(SystemFrame):
|
||||
"""This is the first frame that should be pushed down a pipeline."""
|
||||
pass
|
||||
allow_interruptions: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -205,6 +205,39 @@ class ErrorFrame(SystemFrame):
|
||||
def __str__(self):
|
||||
return f"{self.name}(error: {self.error})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class StopTaskFrame(SystemFrame):
|
||||
"""Indicates that a pipeline task should be stopped. This should inform the
|
||||
pipeline processors that they should stop pushing frames but that they
|
||||
should be kept in a running state.
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class StartInterruptionFrame(SystemFrame):
|
||||
"""Emitted by VAD to indicate that a user has started speaking (i.e. is
|
||||
interrupting). This is similar to UserStartedSpeakingFrame except that it
|
||||
should be pushed concurrently with other frames (so the order is not
|
||||
guaranteed).
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class StopInterruptionFrame(SystemFrame):
|
||||
"""Emitted by VAD to indicate that a user has stopped speaking (i.e. no more
|
||||
interruptions). This is similar to UserStoppedSpeakingFrame except that it
|
||||
should be pushed concurrently with other frames (so the order is not
|
||||
guaranteed).
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
#
|
||||
# Control frames
|
||||
#
|
||||
@@ -227,6 +260,20 @@ class EndFrame(ControlFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMFullResponseStartFrame(ControlFrame):
|
||||
"""Used to indicate the beginning of a full LLM response. Following
|
||||
LLMResponseStartFrame, TextFrame and LLMResponseEndFrame for each sentence
|
||||
until an LLMFullResponseEndFrame."""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMFullResponseEndFrame(ControlFrame):
|
||||
"""Indicates the end of a full LLM response."""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMResponseStartFrame(ControlFrame):
|
||||
"""Used to indicate the beginning of an LLM response. Following TextFrames
|
||||
@@ -260,7 +307,7 @@ class UserStoppedSpeakingFrame(ControlFrame):
|
||||
@dataclass
|
||||
class TTSStartedFrame(ControlFrame):
|
||||
"""Used to indicate the beginning of a TTS response. Following
|
||||
AudioRawFrames are part of the TTS response until an TTSEndFrame. These
|
||||
AudioRawFrames are part of the TTS response until a TTSStoppedFrame. These
|
||||
frames can be used for aggregating audio frames in a transport to optimize
|
||||
the size of frames sent to the session, without needing to control this in
|
||||
the TTS service.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import List
|
||||
from pipecat.pipeline.frames import EndFrame, EndPipeFrame
|
||||
from pipecat.frames.frames import EndFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
|
||||
|
||||
@@ -16,8 +16,7 @@ class SequentialMergePipeline(Pipeline):
|
||||
while True:
|
||||
frame = await pipeline.sink.get()
|
||||
if isinstance(
|
||||
frame, EndFrame) or isinstance(
|
||||
frame, EndPipeFrame):
|
||||
frame, EndFrame):
|
||||
break
|
||||
await self.sink.put(frame)
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ class ParallelPipeline(FrameProcessor):
|
||||
if not isinstance(processors, list):
|
||||
raise TypeError(f"ParallelPipeline argument {processors} is not a list")
|
||||
|
||||
# We add a source at before the pipeline and a sink after.
|
||||
# We will add a source before the pipeline and a sink after.
|
||||
source = Source(self._up_queue)
|
||||
sink = Sink(self._down_queue)
|
||||
self._sources.append(source)
|
||||
|
||||
@@ -18,7 +18,6 @@ class PipelineRunner:
|
||||
def __init__(self, name: str | None = None, handle_sigint: bool = True):
|
||||
self.id: int = obj_id()
|
||||
self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}"
|
||||
self._loop: asyncio.AbstractEventLoop = asyncio.get_running_loop()
|
||||
|
||||
self._tasks = {}
|
||||
self._running = True
|
||||
@@ -47,7 +46,8 @@ class PipelineRunner:
|
||||
return self._running
|
||||
|
||||
def _setup_sigint(self):
|
||||
self._loop.add_signal_handler(
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.add_signal_handler(
|
||||
signal.SIGINT,
|
||||
lambda *args: asyncio.create_task(self._sigint_handler())
|
||||
)
|
||||
|
||||
@@ -8,13 +8,19 @@ import asyncio
|
||||
|
||||
from typing import AsyncIterable, Iterable
|
||||
|
||||
from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame, StopTaskFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.utils import obj_count, obj_id
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PipelineParams(BaseModel):
|
||||
allow_interruptions: bool = False
|
||||
|
||||
|
||||
class Source(FrameProcessor):
|
||||
|
||||
def __init__(self, up_queue: asyncio.Queue):
|
||||
@@ -31,13 +37,14 @@ class Source(FrameProcessor):
|
||||
|
||||
class PipelineTask:
|
||||
|
||||
def __init__(self, pipeline: FrameProcessor):
|
||||
def __init__(self, pipeline: FrameProcessor, params: PipelineParams = PipelineParams()):
|
||||
self.id: int = obj_id()
|
||||
self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"
|
||||
|
||||
self._pipeline = pipeline
|
||||
self._params = params
|
||||
|
||||
self._task_queue = asyncio.Queue()
|
||||
self._down_queue = asyncio.Queue()
|
||||
self._up_queue = asyncio.Queue()
|
||||
|
||||
self._source = Source(self._up_queue)
|
||||
@@ -49,13 +56,20 @@ class PipelineTask:
|
||||
|
||||
async def cancel(self):
|
||||
logger.debug(f"Canceling pipeline task {self}")
|
||||
await self.queue_frame(CancelFrame())
|
||||
# Make sure everything is cleaned up downstream. This is sent
|
||||
# out-of-band from the main streaming task which is what we want since
|
||||
# we want to cancel right away.
|
||||
await self._source.process_frame(CancelFrame(), FrameDirection.DOWNSTREAM)
|
||||
self._process_down_task.cancel()
|
||||
self._process_up_task.cancel()
|
||||
|
||||
async def run(self):
|
||||
await asyncio.gather(self._process_task_queue(), self._process_up_queue())
|
||||
self._process_up_task = asyncio.create_task(self._process_up_queue())
|
||||
self._process_down_task = asyncio.create_task(self._process_down_queue())
|
||||
await asyncio.gather(self._process_up_task, self._process_down_task)
|
||||
|
||||
async def queue_frame(self, frame: Frame):
|
||||
await self._task_queue.put(frame)
|
||||
await self._down_queue.put(frame)
|
||||
|
||||
async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]):
|
||||
if isinstance(frames, AsyncIterable):
|
||||
@@ -67,27 +81,37 @@ class PipelineTask:
|
||||
else:
|
||||
raise Exception("Frames must be an iterable or async iterable")
|
||||
|
||||
async def _process_task_queue(self):
|
||||
await self._source.process_frame(StartFrame(), FrameDirection.DOWNSTREAM)
|
||||
async def _process_down_queue(self):
|
||||
await self._source.process_frame(
|
||||
StartFrame(allow_interruptions=self._params.allow_interruptions), FrameDirection.DOWNSTREAM)
|
||||
running = True
|
||||
should_cleanup = True
|
||||
while running:
|
||||
frame = await self._task_queue.get()
|
||||
await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
self._task_queue.task_done()
|
||||
running = not (isinstance(frame, CancelFrame) or isinstance(frame, EndFrame))
|
||||
# We just enqueue None to terminate the task.
|
||||
await self._up_queue.put(None)
|
||||
try:
|
||||
frame = await self._down_queue.get()
|
||||
await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
running = not (isinstance(frame, StopTaskFrame) or isinstance(frame, EndFrame))
|
||||
should_cleanup = not isinstance(frame, StopTaskFrame)
|
||||
self._down_queue.task_done()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
# Cleanup only if we need to.
|
||||
if should_cleanup:
|
||||
await self._source.cleanup()
|
||||
await self._pipeline.cleanup()
|
||||
# Cancel the upstream task so it terminates; it exits on CancelledError.
|
||||
self._process_up_task.cancel()
|
||||
|
||||
async def _process_up_queue(self):
|
||||
running = True
|
||||
while running:
|
||||
frame = await self._up_queue.get()
|
||||
if frame:
|
||||
while True:
|
||||
try:
|
||||
frame = await self._up_queue.get()
|
||||
if isinstance(frame, ErrorFrame):
|
||||
logger.error(f"Error running app: {frame.error}")
|
||||
await self.queue_frame(CancelFrame())
|
||||
self._up_queue.task_done()
|
||||
running = frame is not None
|
||||
self._up_queue.task_done()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
@@ -17,7 +17,7 @@ class GatedAggregator(FrameProcessor):
|
||||
Yields gate-opening frame before any accumulated frames, then ensuing frames
|
||||
until and not including the gate-closed frame.
|
||||
|
||||
>>> from pipecat.pipeline.frames import ImageFrame
|
||||
>>> from pipecat.frames.frames import ImageFrame
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from pipecat.frames.frames import Frame, InterimTranscriptionFrame, LLMMessagesFrame, TextFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class LLMContextAggregator(FrameProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
role: str,
|
||||
complete_sentences=True,
|
||||
pass_through=True,
|
||||
):
|
||||
super().__init__()
|
||||
self._messages = messages
|
||||
self._role = role
|
||||
self._sentence = ""
|
||||
self._complete_sentences = complete_sentences
|
||||
self._pass_through = pass_through
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
# We don't do anything with non-text frames, pass it along to next in
|
||||
# the pipeline.
|
||||
if not isinstance(frame, TextFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# If we get interim results, we ignore them.
|
||||
if isinstance(frame, InterimTranscriptionFrame):
|
||||
return
|
||||
|
||||
# The common case for "pass through" is receiving frames from the LLM that we'll
|
||||
# use to update the "assistant" LLM messages, but also passing the text frames
|
||||
# along to a TTS service to be spoken to the user.
|
||||
if self._pass_through:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
# TODO: split up transcription by participant
|
||||
if self._complete_sentences:
|
||||
# type: ignore -- the linter thinks this isn't a TextFrame, even
|
||||
# though we check it above
|
||||
self._sentence += frame.text
|
||||
if self._sentence.endswith((".", "?", "!")):
|
||||
self._messages.append(
|
||||
{"role": self._role, "content": self._sentence})
|
||||
self._sentence = ""
|
||||
await self.push_frame(LLMMessagesFrame(self._messages))
|
||||
else:
|
||||
# type: ignore -- the linter thinks this isn't a TextFrame, even
|
||||
# though we check it above
|
||||
self._messages.append({"role": self._role, "content": frame.text})
|
||||
await self.push_frame(LLMMessagesFrame(self._messages))
|
||||
|
||||
|
||||
class LLMUserContextAggregator(LLMContextAggregator):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
complete_sentences=True):
|
||||
super().__init__(
|
||||
messages,
|
||||
"user",
|
||||
complete_sentences,
|
||||
pass_through=False)
|
||||
|
||||
|
||||
class LLMAssistantContextAggregator(LLMContextAggregator):
|
||||
def __init__(
|
||||
self,
|
||||
messages: list[dict],
|
||||
complete_sentences=True):
|
||||
super().__init__(
|
||||
messages,
|
||||
"assistant",
|
||||
complete_sentences,
|
||||
pass_through=True,
|
||||
)
|
||||
@@ -6,15 +6,20 @@
|
||||
|
||||
from typing import List
|
||||
|
||||
from pipecat.services.openai import OpenAILLMContextFrame, OpenAILLMContext
|
||||
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
StartInterruptionFrame,
|
||||
TranscriptionFrame,
|
||||
TextFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame)
|
||||
|
||||
@@ -29,7 +34,8 @@ class LLMResponseAggregator(FrameProcessor):
|
||||
start_frame,
|
||||
end_frame,
|
||||
accumulator_frame: TextFrame,
|
||||
interim_accumulator_frame: TextFrame | None = None
|
||||
interim_accumulator_frame: TextFrame | None = None,
|
||||
handle_interruptions: bool = False
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -39,12 +45,18 @@ class LLMResponseAggregator(FrameProcessor):
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._interim_accumulator_frame = interim_accumulator_frame
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
self._handle_interruptions = handle_interruptions
|
||||
|
||||
self._aggregation = ""
|
||||
self._aggregating = False
|
||||
# Reset our accumulator state.
|
||||
self._reset()
|
||||
|
||||
@property
|
||||
def messages(self):
|
||||
return self._messages
|
||||
|
||||
@property
|
||||
def role(self):
|
||||
return self._role
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
@@ -70,10 +82,14 @@ class LLMResponseAggregator(FrameProcessor):
|
||||
send_aggregation = False
|
||||
|
||||
if isinstance(frame, self._start_frame):
|
||||
self._seen_start_frame = True
|
||||
self._aggregation = ""
|
||||
self._aggregating = True
|
||||
self._seen_start_frame = True
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self._seen_end_frame = True
|
||||
self._seen_start_frame = False
|
||||
|
||||
# We might have received the end frame but we might still be
|
||||
# aggregating (i.e. we have seen interim results but not the final
|
||||
@@ -95,6 +111,11 @@ class LLMResponseAggregator(FrameProcessor):
|
||||
self._seen_interim_results = False
|
||||
elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
|
||||
self._seen_interim_results = True
|
||||
elif self._handle_interruptions and isinstance(frame, StartInterruptionFrame):
|
||||
await self._push_aggregation()
|
||||
# Reset anyways
|
||||
self._reset()
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -104,14 +125,20 @@ class LLMResponseAggregator(FrameProcessor):
|
||||
async def _push_aggregation(self):
|
||||
if len(self._aggregation) > 0:
|
||||
self._messages.append({"role": self._role, "content": self._aggregation})
|
||||
|
||||
# Reset the aggregation. Reset it before pushing it down, otherwise
|
||||
# if the task gets cancelled we won't be able to clear things up.
|
||||
self._aggregation = ""
|
||||
|
||||
frame = LLMMessagesFrame(self._messages)
|
||||
await self.push_frame(frame)
|
||||
|
||||
# Reset
|
||||
self._aggregation = ""
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
def _reset(self):
|
||||
self._aggregation = ""
|
||||
self._aggregating = False
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
|
||||
|
||||
class LLMAssistantResponseAggregator(LLMResponseAggregator):
|
||||
@@ -119,9 +146,10 @@ class LLMAssistantResponseAggregator(LLMResponseAggregator):
|
||||
super().__init__(
|
||||
messages=messages,
|
||||
role="assistant",
|
||||
start_frame=LLMResponseStartFrame,
|
||||
end_frame=LLMResponseEndFrame,
|
||||
accumulator_frame=TextFrame
|
||||
start_frame=LLMFullResponseStartFrame,
|
||||
end_frame=LLMFullResponseEndFrame,
|
||||
accumulator_frame=TextFrame,
|
||||
handle_interruptions=True
|
||||
)
|
||||
|
||||
|
||||
@@ -181,9 +209,50 @@ class LLMFullResponseAggregator(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, TextFrame):
|
||||
self._aggregation += frame.text
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self.push_frame(TextFrame(self._aggregation))
|
||||
await self.push_frame(frame)
|
||||
self._aggregation = ""
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class LLMContextAggregator(LLMResponseAggregator):
|
||||
def __init__(self, *, context: OpenAILLMContext, **kwargs):
|
||||
|
||||
self._context = context
|
||||
super().__init__(**kwargs)
|
||||
|
||||
async def _push_aggregation(self):
|
||||
if len(self._aggregation) > 0:
|
||||
self._context.add_message({"role": self._role, "content": self._aggregation})
|
||||
frame = OpenAILLMContextFrame(self._context)
|
||||
await self.push_frame(frame)
|
||||
|
||||
# Reset our accumulator state.
|
||||
self._reset()
|
||||
|
||||
|
||||
class LLMAssistantContextAggregator(LLMContextAggregator):
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
messages=[],
|
||||
context=context,
|
||||
role="assistant",
|
||||
start_frame=LLMResponseStartFrame,
|
||||
end_frame=LLMResponseEndFrame,
|
||||
accumulator_frame=TextFrame
|
||||
)
|
||||
|
||||
|
||||
class LLMUserContextAggregator(LLMContextAggregator):
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
messages=[],
|
||||
context=context,
|
||||
role="user",
|
||||
start_frame=UserStartedSpeakingFrame,
|
||||
end_frame=UserStoppedSpeakingFrame,
|
||||
accumulator_frame=TranscriptionFrame,
|
||||
interim_accumulator_frame=InterimTranscriptionFrame
|
||||
)
|
||||
|
||||
@@ -5,29 +5,34 @@
|
||||
#
|
||||
|
||||
from dataclasses import dataclass
|
||||
import io
|
||||
import json
|
||||
|
||||
from typing import AsyncGenerator, Callable, List
|
||||
from typing import List
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import Frame, VisionImageRawFrame
|
||||
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionRole,
|
||||
ChatCompletionToolParam,
|
||||
ChatCompletionToolChoiceOptionParam,
|
||||
ChatCompletionMessageParam
|
||||
)
|
||||
|
||||
# JSON custom encoder to handle bytes arrays so that we can log contexts
|
||||
# with images to the console.
|
||||
|
||||
|
||||
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that tolerates in-memory byte buffers.

    ``io.BytesIO`` values are rendered as the hex of their first eight bytes
    followed by ``...``; every other unsupported object is handed to the
    stock ``json.JSONEncoder.default``.
    """

    def default(self, obj):
        # Anything that is not a byte buffer keeps the standard behaviour.
        if not isinstance(obj, io.BytesIO):
            return super().default(obj)

        # Only a short prefix is emitted so the encoded output stays small.
        prefix = obj.getbuffer()[0:8]
        return f"{prefix.hex()}..."
|
||||
|
||||
|
||||
class OpenAILLMContext:
|
||||
|
||||
@@ -42,7 +47,7 @@ class OpenAILLMContext:
|
||||
self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice
|
||||
self.tools: List[ChatCompletionToolParam] | NotGiven = tools
|
||||
|
||||
@ staticmethod
|
||||
@staticmethod
|
||||
def from_messages(messages: List[dict]) -> "OpenAILLMContext":
|
||||
context = OpenAILLMContext()
|
||||
for message in messages:
|
||||
@@ -53,12 +58,40 @@ class OpenAILLMContext:
|
||||
})
|
||||
return context
|
||||
|
||||
@staticmethod
def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext":
    """
    Build a context holding a single user message for a vision frame.

    This deliberately deviates from the OpenAI messages shape: OpenAI
    expects base64-encoded images, but other vision models may not. The
    JPEG-encoded image is therefore stored as a bytes buffer under "data"
    and any base64 encoding is left to the LLM service.
    """
    context = OpenAILLMContext()

    # Encode the raw frame pixels as JPEG into an in-memory buffer.
    jpeg_buffer = io.BytesIO()
    image = Image.frombytes(frame.format, frame.size, frame.image)
    image.save(jpeg_buffer, format="JPEG")

    message = {
        "content": frame.text,
        "role": "user",
        "data": jpeg_buffer,
        "mime_type": "image/jpeg"
    }
    context.add_message(message)
    return context
|
||||
|
||||
def add_message(self, message: ChatCompletionMessageParam):
|
||||
self.messages.append(message)
|
||||
|
||||
def get_messages(self) -> List[ChatCompletionMessageParam]:
|
||||
return self.messages
|
||||
|
||||
def get_messages_json(self) -> str:
|
||||
return json.dumps(self.messages, cls=CustomEncoder)
|
||||
|
||||
def set_tool_choice(
|
||||
self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven
|
||||
):
|
||||
@@ -71,100 +104,6 @@ class OpenAILLMContext:
|
||||
self.tools = tools
|
||||
|
||||
|
||||
class OpenAIContextAggregator(FrameProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
aggregator: Callable[[Frame, str | None], str | None],
|
||||
role: ChatCompletionRole,
|
||||
start_frame: type,
|
||||
end_frame: type,
|
||||
accumulator_frame: type,
|
||||
pass_through=True,
|
||||
):
|
||||
if not (
|
||||
issubclass(start_frame, Frame)
|
||||
and issubclass(end_frame, Frame)
|
||||
and issubclass(accumulator_frame, Frame)
|
||||
):
|
||||
raise TypeError(
|
||||
"start_frame, end_frame and accumulator_frame must be instances of Frame"
|
||||
)
|
||||
|
||||
self._context: OpenAILLMContext = context
|
||||
self._aggregator: Callable[[Frame, str | None], None] = aggregator
|
||||
self._role: ChatCompletionRole = role
|
||||
self._start_frame = start_frame
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._pass_through = pass_through
|
||||
|
||||
self._aggregating = False
|
||||
self._aggregation = None
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, self._start_frame):
|
||||
self._aggregating = True
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self._aggregating = False
|
||||
if self._aggregation:
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": self._role,
|
||||
"content": self._aggregation,
|
||||
"name": self._role,
|
||||
} # type: ignore
|
||||
)
|
||||
self._aggregation = None
|
||||
yield OpenAILLMContextFrame(self._context)
|
||||
elif isinstance(frame, self._accumulator_frame) and self._aggregating:
|
||||
self._aggregation = self._aggregator(frame, self._aggregation)
|
||||
if self._pass_through:
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
def string_aggregator(
|
||||
self,
|
||||
frame: Frame,
|
||||
aggregation: str | None) -> str | None:
|
||||
if not isinstance(frame, TextFrame):
|
||||
raise TypeError(
|
||||
"Frame must be a TextFrame instance to be aggregated by a string aggregator."
|
||||
)
|
||||
if not aggregation:
|
||||
aggregation = ""
|
||||
return " ".join([aggregation, frame.text])
|
||||
|
||||
|
||||
class OpenAIUserContextAggregator(OpenAIContextAggregator):
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
context=context,
|
||||
aggregator=self.string_aggregator,
|
||||
role="user",
|
||||
start_frame=UserStartedSpeakingFrame,
|
||||
end_frame=UserStoppedSpeakingFrame,
|
||||
accumulator_frame=TranscriptionFrame,
|
||||
pass_through=False,
|
||||
)
|
||||
|
||||
|
||||
class OpenAIAssistantContextAggregator(OpenAIContextAggregator):
|
||||
|
||||
def __init__(self, context: OpenAILLMContext):
|
||||
super().__init__(
|
||||
context,
|
||||
aggregator=self.string_aggregator,
|
||||
role="assistant",
|
||||
start_frame=LLMResponseStartFrame,
|
||||
end_frame=LLMResponseEndFrame,
|
||||
accumulator_frame=TextFrame,
|
||||
pass_through=True,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpenAILLMContextFrame(Frame):
|
||||
"""Like an LLMMessagesFrame, but with extra context specific to the OpenAI
|
||||
|
||||
@@ -8,6 +8,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartInterruptionFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
@@ -56,12 +57,9 @@ class ResponseAggregator(FrameProcessor):
|
||||
self._end_frame = end_frame
|
||||
self._accumulator_frame = accumulator_frame
|
||||
self._interim_accumulator_frame = interim_accumulator_frame
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
|
||||
self._aggregation = ""
|
||||
self._aggregating = False
|
||||
# Reset our accumulator state.
|
||||
self._reset()
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
@@ -87,10 +85,13 @@ class ResponseAggregator(FrameProcessor):
|
||||
send_aggregation = False
|
||||
|
||||
if isinstance(frame, self._start_frame):
|
||||
self._seen_start_frame = True
|
||||
self._aggregating = True
|
||||
self._seen_start_frame = True
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
elif isinstance(frame, self._end_frame):
|
||||
self._seen_end_frame = True
|
||||
self._seen_start_frame = False
|
||||
|
||||
# We might have received the end frame but we might still be
|
||||
# aggregating (i.e. we have seen interim results but not the final
|
||||
@@ -120,13 +121,23 @@ class ResponseAggregator(FrameProcessor):
|
||||
|
||||
async def _push_aggregation(self):
|
||||
if len(self._aggregation) > 0:
|
||||
await self.push_frame(TextFrame(self._aggregation.strip()))
|
||||
frame = TextFrame(self._aggregation.strip())
|
||||
|
||||
# Reset
|
||||
# Reset the aggregation. Reset it before pushing it down, otherwise
|
||||
# if the task gets cancelled we won't be able to clear things up.
|
||||
self._aggregation = ""
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
|
||||
await self.push_frame(frame)
|
||||
|
||||
# Reset our accumulator state.
|
||||
self._reset()
|
||||
|
||||
def _reset(self):
|
||||
self._aggregation = ""
|
||||
self._aggregating = False
|
||||
self._seen_start_frame = False
|
||||
self._seen_end_frame = False
|
||||
self._seen_interim_results = False
|
||||
|
||||
|
||||
class UserResponseAggregator(ResponseAggregator):
|
||||
|
||||
@@ -12,7 +12,7 @@ class VisionImageFrameAggregator(FrameProcessor):
|
||||
"""This aggregator waits for a consecutive TextFrame and an
|
||||
ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
|
||||
|
||||
>>> from pipecat.pipeline.frames import ImageFrame
|
||||
>>> from pipecat.frames.frames import ImageFrame
|
||||
|
||||
>>> async def print_frames(aggregator, frame):
|
||||
... async for frame in aggregator.process_frame(frame):
|
||||
|
||||
@@ -10,7 +10,7 @@ from pipecat.frames.frames import AppFrame, ControlFrame, Frame, SystemFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class Filter(FrameProcessor):
|
||||
class FrameFilter(FrameProcessor):
|
||||
|
||||
def __init__(self, types: List[type]):
|
||||
super().__init__()
|
||||
84
src/pipecat/processors/filters/wake_check_filter.py
Normal file
84
src/pipecat/processors/filters/wake_check_filter.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import re
|
||||
import time
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class WakeCheckFilter(FrameProcessor):
    """
    This filter looks for wake phrases in the transcription frames and only passes through frames
    after a wake phrase has been detected. It also has a keepalive timeout to allow for a brief
    period of continued conversation after a wake phrase has been detected.

    State is tracked per participant, keyed by the ``user_id`` of each transcription frame.
    Non-transcription frames are always forwarded unchanged.
    """
    class WakeState(Enum):
        IDLE = 1
        AWAKE = 2

    class ParticipantState:
        def __init__(self, participant_id: str):
            self.participant_id = participant_id
            self.state = WakeCheckFilter.WakeState.IDLE
            # Timestamp (time.time()) of the last frame passed while awake.
            self.wake_timer = 0.0
            # Transcribed text collected while waiting for a wake phrase.
            self.accumulator = ""

    def __init__(self, wake_phrases: list[str], keepalive_timeout: float = 3):
        """
        Args:
            wake_phrases: phrases that wake the filter up. Matching ignores case and
                allows any amount of whitespace (including none) between the words.
            keepalive_timeout: seconds after the last passed frame during which
                further transcriptions are passed without a new wake phrase.
        """
        super().__init__()
        self._participant_states = {}
        self._keepalive_timeout = keepalive_timeout
        self._wake_patterns = []
        for name in wake_phrases:
            pattern = re.compile(r'\b' + r'\s*'.join(re.escape(word)
                                 for word in name.split()) + r'\b', re.IGNORECASE)
            self._wake_patterns.append(pattern)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Pass transcriptions only once a wake phrase was heard; forward everything else."""
        try:
            if isinstance(frame, TranscriptionFrame):
                p = self._participant_states.get(frame.user_id)
                if p is None:
                    p = WakeCheckFilter.ParticipantState(frame.user_id)
                    self._participant_states[frame.user_id] = p

                # If we have been AWAKE within the last keepalive_timeout seconds, pass
                # the frame through
                if p.state == WakeCheckFilter.WakeState.AWAKE:
                    if time.time() - p.wake_timer < self._keepalive_timeout:
                        logger.debug(
                            f"Wake phrase keepalive timeout has not expired. Pushing {frame}")
                        p.wake_timer = time.time()
                        await self.push_frame(frame)
                        return
                    else:
                        p.state = WakeCheckFilter.WakeState.IDLE

                # Join chunks with a space. Without it the last word of the previous
                # chunk fuses with the first word of this one ("ok" + "hey robot" ->
                # "okhey robot") and the leading \b of the wake pattern can never match.
                # Phrases split across chunks still match because the pattern allows
                # arbitrary whitespace between words.
                if p.accumulator:
                    p.accumulator = f"{p.accumulator} {frame.text}"
                else:
                    p.accumulator = frame.text

                for pattern in self._wake_patterns:
                    match = pattern.search(p.accumulator)
                    if match:
                        logger.debug(f"Wake phrase triggered: {match.group()}")
                        # Found the wake word. Discard from the accumulator up to the start of the match
                        # and modify the frame in place.
                        p.state = WakeCheckFilter.WakeState.AWAKE
                        p.wake_timer = time.time()
                        frame.text = p.accumulator[match.start():]
                        p.accumulator = ""
                        await self.push_frame(frame)
                        # The frame has been emitted and the accumulator consumed;
                        # the remaining patterns have nothing left to look at.
                        break
            else:
                await self.push_frame(frame, direction)
        except Exception as e:
            error_msg = f"Error in wake word filter: {e}"
            logger.error(error_msg)
            await self.push_error(ErrorFrame(error_msg))
|
||||
@@ -6,17 +6,22 @@
|
||||
|
||||
from pipecat.frames.frames import Frame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from loguru import logger
|
||||
from typing import Optional
|
||||
logger = logger.opt(ansi=True)
|
||||
|
||||
|
||||
class FrameLogger(FrameProcessor):
|
||||
def __init__(self, prefix="Frame"):
|
||||
def __init__(self, prefix="Frame", color: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._prefix = prefix
|
||||
self._color = color
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
match direction:
|
||||
case FrameDirection.UPSTREAM:
|
||||
print(f"< {self._prefix}: {frame}")
|
||||
case FrameDirection.DOWNSTREAM:
|
||||
print(f"> {self._prefix}: {frame}")
|
||||
dir = "<" if direction is FrameDirection.UPSTREAM else ">"
|
||||
msg = f"{dir} {self._prefix}: {frame}"
|
||||
if self._color:
|
||||
msg = f"<{self._color}>{msg}</>"
|
||||
logger.debug(msg)
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
|
||||
|
||||
def maybe_split_audio_frame(frame: AudioRawFrame, largest_write_size: int) -> List[AudioRawFrame]:
    """Subdivide large audio frames to enable interruption.

    A frame whose audio is no longer than ``largest_write_size`` is returned
    as-is in a one-element list. Otherwise the audio is cut into consecutive
    slices of at most ``largest_write_size`` items, each wrapped in a new
    ``AudioRawFrame`` carrying the original sample rate and channel count.
    """
    total = len(frame.audio)
    if total <= largest_write_size:
        return [frame]

    return [
        AudioRawFrame(
            audio=frame.audio[offset: offset + largest_write_size],
            sample_rate=frame.sample_rate,
            num_channels=frame.num_channels)
        for offset in range(0, total, largest_write_size)
    ]
|
||||
@@ -1,6 +1,6 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from pipecat.pipeline.frames import Frame
|
||||
from pipecat.frames.frames import Frame
|
||||
|
||||
|
||||
class FrameSerializer:
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
import dataclasses
|
||||
from typing import Text
|
||||
from pipecat.pipeline.frames import AudioFrame, Frame, TextFrame, TranscriptionFrame
|
||||
import pipecat.pipeline.protobufs.frames_pb2 as frame_protos
|
||||
from pipecat.frames.frames import AudioRawFrame, Frame, TextFrame, TranscriptionFrame
|
||||
import pipecat.frames.protobufs.frames_pb2 as frame_protos
|
||||
from pipecat.serializers.abstract_frame_serializer import FrameSerializer
|
||||
|
||||
|
||||
class ProtobufFrameSerializer(FrameSerializer):
|
||||
SERIALIZABLE_TYPES = {
|
||||
TextFrame: "text",
|
||||
AudioFrame: "audio",
|
||||
AudioRawFrame: "audio",
|
||||
TranscriptionFrame: "transcription"
|
||||
}
|
||||
|
||||
|
||||
@@ -4,28 +4,39 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import array
|
||||
import io
|
||||
import math
|
||||
import wave
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import BinaryIO
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TextFrame,
|
||||
VisionImageRawFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.audio import calculate_audio_volume
|
||||
from pipecat.utils.utils import exp_smoothing
|
||||
|
||||
|
||||
class AIService(FrameProcessor):
    """Base frame processor for AI services.

    Provides a helper that drains an async generator of frames and pushes
    each of them from this processor.
    """

    def __init__(self):
        super().__init__()

    async def process_generator(self, generator: AsyncGenerator[Frame, None]):
        """Push every frame yielded by ``generator``.

        ``ErrorFrame`` instances go through ``push_error``; all other frames
        go through ``push_frame``.
        """
        async for produced in generator:
            push = self.push_error if isinstance(produced, ErrorFrame) else self.push_frame
            await push(produced)
|
||||
|
||||
|
||||
class LLMService(AIService):
|
||||
"""This class is a no-op but serves as a base class for LLM services."""
|
||||
@@ -42,7 +53,7 @@ class TTSService(AIService):
|
||||
|
||||
# Converts the text to audio.
|
||||
@abstractmethod
|
||||
async def run_tts(self, text: str):
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
pass
|
||||
|
||||
async def say(self, text: str):
|
||||
@@ -55,18 +66,26 @@ class TTSService(AIService):
|
||||
else:
|
||||
self._current_sentence += frame.text
|
||||
if self._current_sentence.strip().endswith((".", "?", "!")):
|
||||
text = self._current_sentence
|
||||
text = self._current_sentence.strip()
|
||||
self._current_sentence = ""
|
||||
|
||||
if text:
|
||||
await self.run_tts(text)
|
||||
await self._push_tts_frames(text)
|
||||
|
||||
async def _push_tts_frames(self, text: str):
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.process_generator(self.run_tts(text))
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
# We send the original text after the audio. This way, if we are
|
||||
# interrupted, the text is not added to the assistant context.
|
||||
await self.push_frame(TextFrame(text))
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, TextFrame):
|
||||
await self._process_text_frame(frame)
|
||||
elif isinstance(frame, EndFrame):
|
||||
if self._current_sentence:
|
||||
await self.run_tts(self._current_sentence)
|
||||
await self._push_tts_frames(self._current_sentence)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -76,64 +95,74 @@ class STTService(AIService):
|
||||
"""STTService is a base class for speech-to-text services."""
|
||||
|
||||
def __init__(self,
|
||||
min_rms: int = 400,
|
||||
max_silence_frames: int = 3,
|
||||
min_volume: float = 0.6,
|
||||
max_silence_secs: float = 0.3,
|
||||
max_buffer_secs: float = 1.5,
|
||||
sample_rate: int = 16000,
|
||||
num_channels: int = 1):
|
||||
super().__init__()
|
||||
self._min_rms = min_rms
|
||||
self._max_silence_frames = max_silence_frames
|
||||
self._min_volume = min_volume
|
||||
self._max_silence_secs = max_silence_secs
|
||||
self._max_buffer_secs = max_buffer_secs
|
||||
self._sample_rate = sample_rate
|
||||
self._num_channels = num_channels
|
||||
self._current_silence_frames = 0
|
||||
(self._content, self._wave) = self._new_wave()
|
||||
self._silence_num_frames = 0
|
||||
# Volume exponential smoothing
|
||||
self._smoothing_factor = 0.4
|
||||
self._prev_volume = 1 - self._smoothing_factor
|
||||
|
||||
@abstractmethod
|
||||
async def run_stt(self, audio: BinaryIO):
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Returns transcript as a string"""
|
||||
pass
|
||||
|
||||
def _new_wave(self):
|
||||
content = io.BufferedRandom(io.BytesIO())
|
||||
content = io.BytesIO()
|
||||
ww = wave.open(content, "wb")
|
||||
ww.setsampwidth(2)
|
||||
ww.setnchannels(self._num_channels)
|
||||
ww.setframerate(self._sample_rate)
|
||||
return (content, ww)
|
||||
|
||||
def _get_volume(self, audio: bytes) -> float:
|
||||
# https://docs.python.org/3/library/array.html
|
||||
audio_array = array.array('h', audio)
|
||||
squares = [sample**2 for sample in audio_array]
|
||||
mean = sum(squares) / len(audio_array)
|
||||
rms = math.sqrt(mean)
|
||||
return rms
|
||||
def _get_smoothed_volume(self, frame: AudioRawFrame) -> float:
|
||||
volume = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
|
||||
async def _append_audio(self, frame: AudioRawFrame):
|
||||
# Try to filter out empty background noise
|
||||
volume = self._get_smoothed_volume(frame)
|
||||
if volume >= self._min_volume:
|
||||
# If volume is high enough, write new data to wave file
|
||||
self._wave.writeframes(frame.audio)
|
||||
self._silence_num_frames = 0
|
||||
else:
|
||||
self._silence_num_frames += frame.num_frames
|
||||
self._prev_volume = volume
|
||||
|
||||
# If buffer is not empty and we have enough data or there's been a long
|
||||
# silence, transcribe the audio gathered so far.
|
||||
silence_secs = self._silence_num_frames / self._sample_rate
|
||||
buffer_secs = self._wave.getnframes() / self._sample_rate
|
||||
if self._content.tell() > 0 and (
|
||||
buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs):
|
||||
self._silence_num_frames = 0
|
||||
self._wave.close()
|
||||
self._content.seek(0)
|
||||
await self.process_generator(self.run_stt(self._content.read()))
|
||||
(self._content, self._wave) = self._new_wave()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Processes a frame of audio data, either buffering or transcribing it."""
|
||||
if not isinstance(frame, AudioRawFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
audio = frame.audio
|
||||
|
||||
# Try to filter out empty background noise
|
||||
# (Very rudimentary approach, can be improved)
|
||||
rms = self._get_volume(audio)
|
||||
if rms >= self._min_rms:
|
||||
# If volume is high enough, write new data to wave file
|
||||
self._wave.writeframes(audio)
|
||||
|
||||
# If buffer is not empty and we detect a 3-frame pause in speech,
|
||||
# transcribe the audio gathered so far.
|
||||
if self._content.tell() > 0 and self._current_silence_frames > self._max_silence_frames:
|
||||
self._current_silence_frames = 0
|
||||
if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
|
||||
self._wave.close()
|
||||
self._content.seek(0)
|
||||
await self.run_stt(self._content)
|
||||
(self._content, self._wave) = self._new_wave()
|
||||
# If we get this far, this is a frame of silence
|
||||
self._current_silence_frames += 1
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
# In this service we accumulate audio internally and at the end we
|
||||
# push a TextFrame. We don't really want to push audio frames down.
|
||||
await self._append_audio(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class ImageGenService(AIService):
|
||||
@@ -142,13 +171,14 @@ class ImageGenService(AIService):
|
||||
super().__init__()
|
||||
|
||||
# Renders the image. Returns an Image object.
|
||||
@abstractmethod
|
||||
async def run_image_gen(self, prompt: str):
|
||||
@ abstractmethod
|
||||
async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, TextFrame):
|
||||
await self.run_image_gen(frame.text)
|
||||
await self.push_frame(frame, direction)
|
||||
await self.process_generator(self.run_image_gen(frame.text))
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -160,12 +190,12 @@ class VisionService(AIService):
|
||||
super().__init__()
|
||||
self._describe_text = None
|
||||
|
||||
@abstractmethod
|
||||
async def run_vision(self, frame: VisionImageRawFrame):
|
||||
@ abstractmethod
|
||||
async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, VisionImageRawFrame):
|
||||
await self.run_vision(frame)
|
||||
await self.process_generator(self.run_vision(frame))
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -4,9 +4,24 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from pipecat.frames.frames import Frame, LLMMessagesFrame, TextFrame
|
||||
import os
|
||||
import asyncio
|
||||
import time
|
||||
import base64
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
TextFrame,
|
||||
VisionImageRawFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMResponseStartFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMFullResponseEndFrame
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import LLMService
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -20,18 +35,98 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class AnthropicLLMService(LLMService):
|
||||
"""This class implements inference with Anthropic's AI models
|
||||
|
||||
This service translates internally from OpenAILLMContext to the messages format
|
||||
expected by the Anthropic Python SDK. We are using the OpenAILLMContext as a lingua
|
||||
franca for all LLM services, so that it is easy to switch between different LLMs.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key,
|
||||
model="claude-3-opus-20240229",
|
||||
max_tokens=1024):
|
||||
api_key: str,
|
||||
model: str = "claude-3-opus-20240229",
|
||||
max_tokens: int = 1024):
|
||||
super().__init__()
|
||||
self.client = AsyncAnthropic(api_key=api_key)
|
||||
self.model = model
|
||||
self.max_tokens = max_tokens
|
||||
self._client = AsyncAnthropic(api_key=api_key)
|
||||
self._model = model
|
||||
self._max_tokens = max_tokens
|
||||
|
||||
def _get_messages_from_openai_context(
|
||||
self, context: OpenAILLMContext):
|
||||
openai_messages = context.get_messages()
|
||||
anthropic_messages = []
|
||||
|
||||
for message in openai_messages:
|
||||
role = message["role"]
|
||||
text = message["content"]
|
||||
if role == "system":
|
||||
role = "user"
|
||||
if message.get("mime_type") == "image/jpeg":
|
||||
# vision frame
|
||||
encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
|
||||
anthropic_messages.append({
|
||||
"role": role,
|
||||
"content": [{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": message.get("mime_type"),
|
||||
"data": encoded_image,
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": text
|
||||
}]
|
||||
})
|
||||
else:
|
||||
# text frame
|
||||
anthropic_messages.append({"role": role, "content": content})
|
||||
|
||||
return anthropic_messages
|
||||
|
||||
async def _process_context(self, context: OpenAILLMContext):
    """Run one streamed completion for ``context`` and push its text downstream.

    The whole exchange is bracketed by LLMFullResponseStartFrame and
    LLMFullResponseEndFrame; the end frame is pushed even when the request
    fails. Each "content_block_delta" event is emitted as its own
    LLMResponseStartFrame / TextFrame / LLMResponseEndFrame triple.
    Exceptions are logged and not re-raised.
    """
    await self.push_frame(LLMFullResponseStartFrame())
    try:
        logger.debug(f"Generating chat: {context.get_messages_json()}")

        anthropic_messages = self._get_messages_from_openai_context(context)

        request_started = time.time()
        stream = await self._client.messages.create(
            messages=anthropic_messages,
            model=self._model,
            max_tokens=self._max_tokens,
            stream=True)
        logger.debug(f"Anthropic LLM TTFB: {time.time() - request_started}")

        async for event in stream:
            # Only text deltas produce output frames.
            if event.type != "content_block_delta":
                continue
            await self.push_frame(LLMResponseStartFrame())
            await self.push_frame(TextFrame(event.delta.text))
            await self.push_frame(LLMResponseEndFrame())

    except Exception as e:
        logger.error(f"Exception: {e}")
    finally:
        await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
    """Run the LLM for frames that carry a context; forward everything else.

    An OpenAILLMContextFrame supplies its own context. LLMMessagesFrame and
    VisionImageRawFrame are converted with ``OpenAILLMContext.from_messages``
    and ``OpenAILLMContext.from_image_frame``. Any other frame is pushed on
    unchanged in ``direction``.
    """
    if isinstance(frame, OpenAILLMContextFrame):
        context = frame.context
    elif isinstance(frame, LLMMessagesFrame):
        context = OpenAILLMContext.from_messages(frame.messages)
    elif isinstance(frame, VisionImageRawFrame):
        context = OpenAILLMContext.from_image_frame(frame)
    else:
        # Not an LLM input: pass it along and stop here.
        await self.push_frame(frame, direction)
        return

    # A falsy context (e.g. None on the frame) is silently skipped.
    if context:
        await self._process_context(context)
|
||||
|
||||
async def x_process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, LLMMessagesFrame):
|
||||
stream = await self.client.messages.create(
|
||||
max_tokens=self.max_tokens,
|
||||
|
||||
@@ -9,10 +9,12 @@ import asyncio
|
||||
import io
|
||||
|
||||
from PIL import Image
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from numpy import str_
|
||||
from openai import AsyncAzureOpenAI
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, URLImageRawFrame
|
||||
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame, URLImageRawFrame
|
||||
from pipecat.services.ai_services import TTSService, ImageGenService
|
||||
from pipecat.services.openai import BaseOpenAILLMService
|
||||
|
||||
@@ -34,8 +36,8 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class AzureTTSService(TTSService):
|
||||
def __init__(self, *, api_key, region, voice="en-US-SaraNeural"):
|
||||
super().__init__()
|
||||
def __init__(self, *, api_key: str, region: str, voice="en-US-SaraNeural", **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.speech_config = SpeechConfig(subscription=api_key, region=region)
|
||||
self.speech_synthesizer = SpeechSynthesizer(
|
||||
@@ -43,8 +45,8 @@ class AzureTTSService(TTSService):
|
||||
)
|
||||
self._voice = voice
|
||||
|
||||
async def run_tts(self, text: str):
|
||||
logger.debug(f"Transcribing text: {text}")
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: {text}")
|
||||
|
||||
ssml = (
|
||||
"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
|
||||
@@ -60,7 +62,7 @@ class AzureTTSService(TTSService):
|
||||
|
||||
if result.reason == ResultReason.SynthesizingAudioCompleted:
|
||||
# Azure always sends a 44-byte header. Strip it off.
|
||||
await self.push_frame(AudioRawFrame(audio=result.audio_data[44:], sample_rate=16000, num_channels=1))
|
||||
yield AudioRawFrame(audio=result.audio_data[44:], sample_rate=16000, num_channels=1)
|
||||
elif result.reason == ResultReason.Canceled:
|
||||
cancellation_details = result.cancellation_details
|
||||
logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
|
||||
@@ -72,17 +74,18 @@ class AzureLLMService(BaseOpenAILLMService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key,
|
||||
endpoint,
|
||||
api_version="2023-12-01-preview",
|
||||
model):
|
||||
super().__init__(api_key=api_key, model=model)
|
||||
api_key: str,
|
||||
endpoint: str,
|
||||
model: str,
|
||||
api_version: str = "2023-12-01-preview"):
|
||||
# Initialize variables before calling parent __init__() because that
|
||||
# will call create_client() and we need those values there.
|
||||
self._endpoint = endpoint
|
||||
self._api_version = api_version
|
||||
self._model: str = model
|
||||
super().__init__(api_key=api_key, model=model)
|
||||
|
||||
def create_client(self, api_key=None, base_url=None):
|
||||
self._client = AsyncAzureOpenAI(
|
||||
return AsyncAzureOpenAI(
|
||||
api_key=api_key,
|
||||
azure_endpoint=self._endpoint,
|
||||
api_version=self._api_version,
|
||||
@@ -94,12 +97,12 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_version="2023-06-01-preview",
|
||||
image_size: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
endpoint,
|
||||
model,
|
||||
image_size: str,
|
||||
api_key: str,
|
||||
endpoint: str,
|
||||
model: str,
|
||||
api_version="2023-06-01-preview",
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -110,7 +113,7 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self._image_size = image_size
|
||||
|
||||
async def run_image_gen(self, prompt: str):
|
||||
async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
|
||||
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
|
||||
|
||||
headers = {
|
||||
@@ -136,7 +139,7 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
attempts_left -= 1
|
||||
if attempts_left == 0:
|
||||
logger.error("Image generation timed out")
|
||||
await self.push_error(ErrorFrame("Image generation timed out"))
|
||||
yield ErrorFrame("Image generation timed out")
|
||||
return
|
||||
|
||||
await asyncio.sleep(1)
|
||||
@@ -149,7 +152,7 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
image_url = json_response["result"]["data"][0]["url"] if json_response else None
|
||||
if not image_url:
|
||||
logger.error("Image generation failed")
|
||||
await self.push_error(ErrorFrame("Image generation failed"))
|
||||
yield ErrorFrame("Image generation failed")
|
||||
return
|
||||
|
||||
# Load the image from the url
|
||||
@@ -161,4 +164,4 @@ class AzureImageGenServiceREST(ImageGenService):
|
||||
image=image.tobytes(),
|
||||
size=image.size,
|
||||
format=image.format)
|
||||
await self.push_frame(frame)
|
||||
yield frame
|
||||
|
||||
56
src/pipecat/services/cartesia.py
Normal file
56
src/pipecat/services/cartesia.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from cartesia.tts import AsyncCartesiaTTS
|
||||
|
||||
import time
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class CartesiaTTSService(TTSService):
    """Text-to-speech service that streams audio from Cartesia.

    The Cartesia client and the voice embedding for ``voice_name`` are created
    lazily on the first ``run_tts()`` call and reused afterwards.
    """

    def __init__(
            self,
            *,
            api_key: str,
            voice_name: str,
            **kwargs):
        """Store the credentials and voice name; no client is created yet.

        Args:
            api_key: Key passed to ``AsyncCartesiaTTS``.
            voice_name: Key looked up in the mapping returned by ``get_voices()``.
            **kwargs: Forwarded to ``TTSService.__init__``.
        """
        super().__init__(**kwargs)

        self._api_key = api_key
        self._voice_name = voice_name

        # All three are populated together on the first successful run_tts().
        self._client = None
        self._voice_id = None
        self._voice = None

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech for ``text`` and yield it as 16 kHz mono audio frames.

        Exceptions are logged and end the generator; nothing is raised.
        """
        logger.debug(f"Generating TTS: [{text}]")

        try:
            if self._client is None:
                # Resolve everything into locals first and publish to `self`
                # only once all lookups succeeded. Previously the client was
                # stored before the voice lookup, so one failed lookup left
                # `_client` set without `_voice` and broke every later call.
                # NOTE(review): get_voices()/get_voice_embedding() are called
                # without await, as before - confirm they are synchronous.
                client = AsyncCartesiaTTS(api_key=self._api_key)
                voices = client.get_voices()
                voice_id = voices[self._voice_name]["id"]
                voice = client.get_voice_embedding(voice_id=voice_id)

                self._voice_id = voice_id
                self._voice = voice
                self._client = client

            chunk_generator = await self._client.generate(
                transcript=text, voice=self._voice, stream=True,
                model_id="upbeat-moon", data_rtype='array', output_format='pcm_16000',
                # a chunk_time of 0.1 seems to be the default. there are small audio pops/gaps which
                # we need to debug
                chunk_time=0.1
            )

            async for chunk in chunk_generator:
                yield AudioRawFrame(chunk['audio'], 16000, 1)
        except Exception as e:
            logger.error(f"Exception {e}")
|
||||
@@ -4,7 +4,11 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
import aiohttp
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
@@ -15,22 +19,34 @@ class DeepgramTTSService(TTSService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
aiohttp_session,
|
||||
api_key,
|
||||
voice="alpha-asteria-en-v2"):
|
||||
super().__init__()
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key: str,
|
||||
voice: str = "aura-helios-en",
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._voice = voice
|
||||
self._api_key = api_key
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
async def run_tts(self, text: str):
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
    """Synthesize ``text`` through Deepgram's speak endpoint.

    Yields:
        One AudioRawFrame (16 kHz, mono) per chunk read from the response
        body, or a single ErrorFrame when the HTTP status is not 200.
        Exceptions are logged and end the generator; nothing is raised.
    """
    logger.info(f"Running Deepgram TTS for {text}")
    base_url = "https://api.deepgram.com/v1/speak"
    # The query string must be one unbroken token. An earlier revision had
    # spaces around `=`/`&` and a line break inside the replacement field,
    # which produced a malformed URL.
    request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
    headers = {"authorization": f"token {self._api_key}"}
    body = {"text": text}

    try:
        async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
            if r.status != 200:
                # Separate name so the `text` argument is not shadowed.
                error_text = await r.text()
                logger.error(f"Error getting audio (status: {r.status}, error: {error_text})")
                yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {error_text})")
                return

            # NOTE(review): iterating `r.content` directly is kept as-is;
            # confirm the chunk boundaries it produces suit 16-bit PCM.
            async for data in r.content:
                yield AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
    except Exception as e:
        logger.error(f"Exception {e}")
|
||||
|
||||
@@ -6,7 +6,9 @@
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, TTSStartedFrame, TTSStoppedFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
@@ -15,22 +17,22 @@ from loguru import logger
|
||||
class ElevenLabsTTSService(TTSService):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
model: str = "eleven_turbo_v2",
|
||||
):
|
||||
super().__init__()
|
||||
self,
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
model: str = "eleven_turbo_v2",
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self._model = model
|
||||
|
||||
async def run_tts(self, text: str):
|
||||
logger.debug(f"Transcribing text: {text}")
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
|
||||
|
||||
@@ -47,12 +49,12 @@ class ElevenLabsTTSService(TTSService):
|
||||
|
||||
async with self._aiohttp_session.post(url, json=payload, headers=headers, params=querystring) as r:
|
||||
if r.status != 200:
|
||||
logger.error(f"Audio fetch status code: {r.status}, error: {r.text}")
|
||||
text = await r.text()
|
||||
logger.error(f"Error getting audio (status: {r.status}, error: {text})")
|
||||
yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
|
||||
return
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
async for chunk in r.content:
|
||||
if len(chunk) > 0:
|
||||
frame = AudioRawFrame(chunk, 16000, 1)
|
||||
await self.push_frame(frame)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield frame
|
||||
|
||||
@@ -9,11 +9,10 @@ import io
|
||||
import os
|
||||
|
||||
from PIL import Image
|
||||
from numpy import result_type
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Union, Dict
|
||||
from typing import AsyncGenerator, Optional, Union, Dict
|
||||
|
||||
from pipecat.frames.frames import URLImageRawFrame
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame
|
||||
from pipecat.services.ai_services import ImageGenService
|
||||
|
||||
from loguru import logger
|
||||
@@ -52,7 +51,7 @@ class FalImageGenService(ImageGenService):
|
||||
if key:
|
||||
os.environ["FAL_KEY"] = key
|
||||
|
||||
async def run_image_gen(self, prompt: str):
|
||||
async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating image from prompt: {prompt}")
|
||||
|
||||
response = await fal_client.run_async(
|
||||
@@ -64,6 +63,7 @@ class FalImageGenService(ImageGenService):
|
||||
|
||||
if not image_url:
|
||||
logger.error("Image generation failed")
|
||||
yield ErrorFrame("Image generation failed")
|
||||
return
|
||||
|
||||
logger.debug(f"Image generated at: {image_url}")
|
||||
@@ -80,4 +80,4 @@ class FalImageGenService(ImageGenService):
|
||||
image=image.tobytes(),
|
||||
size=image.size,
|
||||
format=image.format)
|
||||
await self.push_frame(frame)
|
||||
yield frame
|
||||
|
||||
@@ -19,6 +19,6 @@ except ModuleNotFoundError as e:
|
||||
|
||||
class FireworksLLMService(BaseOpenAILLMService):
|
||||
def __init__(self,
|
||||
model="accounts/fireworks/models/firefunction-v1",
|
||||
base_url="https://api.fireworks.ai/inference/v1"):
|
||||
model: str = "accounts/fireworks/models/firefunction-v1",
|
||||
base_url: str = "https://api.fireworks.ai/inference/v1"):
|
||||
super().__init__(model, base_url)
|
||||
|
||||
120
src/pipecat/services/google.py
Normal file
120
src/pipecat/services/google.py
Normal file
@@ -0,0 +1,120 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from typing import List
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
TextFrame,
|
||||
VisionImageRawFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMResponseStartFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMFullResponseEndFrame
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import LLMService
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import google.generativeai as gai
|
||||
import google.ai.generativelanguage as glm
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GoogleLLMService(LLMService):
    """This class implements inference with Google's AI models

    This service translates internally from OpenAILLMContext to the messages format
    expected by the Google AI model. We are using the OpenAILLMContext as a lingua
    franca for all LLM services, so that it is easy to switch between different LLMs.
    """

    def __init__(self, api_key: str, model: str = "gemini-1.5-flash-latest", **kwargs):
        """Configure the Google client library and create the model handle.

        Args:
            api_key: Passed to ``gai.configure``.
            model: Model name passed to ``gai.GenerativeModel``.
            **kwargs: Forwarded to ``LLMService.__init__``.
        """
        super().__init__(**kwargs)
        gai.configure(api_key=api_key)
        self._client = gai.GenerativeModel(model)

    def _get_messages_from_openai_context(
            self, context: OpenAILLMContext) -> List[glm.Content]:
        """Translate the context's OpenAI-style messages into Google messages.

        Roles are remapped (``system`` -> ``user``, ``assistant`` -> ``model``).
        Each message becomes ``{"role": ..., "parts": [...]}`` with a text part,
        plus an inline-data part when the message carries a ``mime_type``.
        """
        google_messages = []

        for message in context.get_messages():
            role = message["role"]
            content = message["content"]
            if role == "system":
                role = "user"
            elif role == "assistant":
                role = "model"

            parts = [glm.Part(text=content)]
            if "mime_type" in message:
                parts.append(
                    glm.Part(inline_data=glm.Blob(
                        mime_type=message["mime_type"],
                        data=message["data"].getvalue()
                    )))
            google_messages.append({"role": role, "parts": parts})

        return google_messages

    async def _async_generator_wrapper(self, sync_generator):
        """Expose a synchronous iterable as an async generator.

        Each ``next()`` call runs in a worker thread, so waiting for the next
        item does not block the event loop. A private sentinel marks
        exhaustion because StopIteration cannot be propagated through a future.
        """
        iterator = iter(sync_generator)
        exhausted = object()
        while True:
            item = await asyncio.to_thread(next, iterator, exhausted)
            if item is exhausted:
                break
            yield item

    async def _process_context(self, context: OpenAILLMContext):
        """Stream one completion for ``context`` and push it as frames.

        An LLMFullResponseStartFrame is pushed first, then an
        LLMResponseStartFrame / TextFrame / LLMResponseEndFrame triple per
        streamed chunk. Exceptions are logged rather than raised, and an
        LLMFullResponseEndFrame is always pushed last.
        """
        await self.push_frame(LLMFullResponseStartFrame())
        try:
            logger.debug(f"Generating chat: {context.get_messages_json()}")

            messages = self._get_messages_from_openai_context(context)

            start_time = time.time()
            # generate_content() is a blocking call; run it off the event loop.
            response = await asyncio.to_thread(
                self._client.generate_content, messages, stream=True)
            logger.debug(f"Google LLM TTFB: {time.time() - start_time}")

            async for chunk in self._async_generator_wrapper(response):
                try:
                    text = chunk.text
                    await self.push_frame(LLMResponseStartFrame())
                    await self.push_frame(TextFrame(text))
                    await self.push_frame(LLMResponseEndFrame())
                except Exception as e:
                    # Google LLMs seem to flag safety issues a lot!
                    # NOTE(review): 3 is presumably the SAFETY finish reason,
                    # going by the log message below - confirm against the enum.
                    if chunk.candidates[0].finish_reason == 3:
                        logger.debug(
                            f"LLM refused to generate content for safety reasons - {messages}.")
                    else:
                        logger.error(f"Error {e}")

        except Exception as e:
            logger.error(f"Exception: {e}")
        finally:
            await self.push_frame(LLMFullResponseEndFrame())

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Run the LLM for frames that carry a context; forward everything else.

        An OpenAILLMContextFrame supplies its own context. LLMMessagesFrame and
        VisionImageRawFrame are converted through ``OpenAILLMContext``. Any
        other frame is pushed on unchanged in ``direction``.
        """
        context = None

        if isinstance(frame, OpenAILLMContextFrame):
            context: OpenAILLMContext = frame.context
        elif isinstance(frame, LLMMessagesFrame):
            context = OpenAILLMContext.from_messages(frame.messages)
        elif isinstance(frame, VisionImageRawFrame):
            context = OpenAILLMContext.from_image_frame(frame)
        else:
            await self.push_frame(frame, direction)

        if context:
            await self._process_context(context)
|
||||
@@ -6,11 +6,13 @@
|
||||
|
||||
import asyncio
|
||||
|
||||
from pipecat.frames.frames import TextFrame, VisionImageRawFrame
|
||||
from pipecat.services.ai_services import VisionService
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TextFrame, VisionImageRawFrame
|
||||
from pipecat.services.ai_services import VisionService
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
@@ -44,7 +46,7 @@ def detect_device():
|
||||
class MoondreamService(VisionService):
|
||||
def __init__(
|
||||
self,
|
||||
model_id="vikhyatk/moondream2",
|
||||
model="vikhyatk/moondream2",
|
||||
revision="2024-04-02",
|
||||
use_cpu=False
|
||||
):
|
||||
@@ -56,26 +58,27 @@ class MoondreamService(VisionService):
|
||||
device = torch.device("cpu")
|
||||
dtype = torch.float32
|
||||
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
|
||||
|
||||
logger.debug("Loading Moondream model...")
|
||||
|
||||
self._model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, trust_remote_code=True, revision=revision
|
||||
model, trust_remote_code=True, revision=revision
|
||||
).to(device=device, dtype=dtype)
|
||||
self._model.eval()
|
||||
|
||||
logger.debug("Loaded Moondream model")
|
||||
|
||||
async def run_vision(self, frame: VisionImageRawFrame):
|
||||
async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
|
||||
if not self._model:
|
||||
logger.error("Moondream model not available")
|
||||
yield ErrorFrame("Moondream model not available")
|
||||
return
|
||||
|
||||
logger.debug(f"Analyzing image: {frame}")
|
||||
|
||||
def get_image_description(frame: VisionImageRawFrame):
|
||||
image = Image.frombytes(frame.format, (frame.size[0], frame.size[1]), frame.image)
|
||||
image = Image.frombytes(frame.format, frame.size, frame.image)
|
||||
image_embeds = self._model.encode_image(image)
|
||||
description = self._model.answer_question(
|
||||
image_embeds=image_embeds,
|
||||
@@ -85,4 +88,4 @@ class MoondreamService(VisionService):
|
||||
|
||||
description = await asyncio.to_thread(get_image_description, frame)
|
||||
|
||||
await self.push_frame(TextFrame(text=description))
|
||||
yield TextFrame(text=description)
|
||||
|
||||
@@ -9,5 +9,5 @@ from pipecat.services.openai import BaseOpenAILLMService
|
||||
|
||||
class OLLamaLLMService(BaseOpenAILLMService):
|
||||
|
||||
def __init__(self, model="llama2", base_url="http://localhost:11434/v1"):
|
||||
def __init__(self, model: str = "llama2", base_url: str = "http://localhost:11434/v1"):
|
||||
super().__init__(model=model, base_url=base_url, api_key="ollama")
|
||||
|
||||
@@ -8,22 +8,33 @@ import io
|
||||
import json
|
||||
import time
|
||||
import aiohttp
|
||||
import base64
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from typing import List, Literal
|
||||
from typing import AsyncGenerator, List, Literal
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame,
|
||||
URLImageRawFrame
|
||||
URLImageRawFrame,
|
||||
VisionImageRawFrame
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import LLMService, ImageGenService
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionSystemMessageParam,
|
||||
ChatCompletionFunctionMessageParam,
|
||||
ChatCompletionToolParam,
|
||||
ChatCompletionUserMessageParam,
|
||||
)
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
@@ -41,6 +52,10 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class OpenAIUnhandledFunctionException(Exception):
    """Raised when the LLM calls a function that has no registered callback.

    Derives from ``Exception`` rather than ``BaseException``, which is not
    meant to be subclassed directly by user code, so ordinary
    ``except Exception`` handlers can see it.
    """
|
||||
|
||||
|
||||
class BaseOpenAILLMService(LLMService):
|
||||
"""This is the base for all services that use the AsyncOpenAI client.
|
||||
|
||||
@@ -54,17 +69,42 @@ class BaseOpenAILLMService(LLMService):
|
||||
def __init__(self, model: str, api_key=None, base_url=None):
|
||||
super().__init__()
|
||||
self._model: str = model
|
||||
self.create_client(api_key=api_key, base_url=base_url)
|
||||
self._client = self.create_client(api_key=api_key, base_url=base_url)
|
||||
self._callbacks = {}
|
||||
self._start_callbacks = {}
|
||||
|
||||
def create_client(self, api_key=None, base_url=None):
|
||||
self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
||||
return AsyncOpenAI(api_key=api_key, base_url=base_url)
|
||||
|
||||
# TODO-CB: callback function type
|
||||
def register_function(self, function_name, callback, start_callback=None):
    """Register ``callback`` as the handler for ``function_name``.

    Args:
        function_name: Name the LLM uses when calling the function.
        callback: Stored in ``self._callbacks`` under ``function_name``.
        start_callback: Optional; when truthy, stored in
            ``self._start_callbacks`` under the same name.
    """
    self._callbacks[function_name] = callback
    if not start_callback:
        return
    self._start_callbacks[function_name] = start_callback
|
||||
|
||||
def unregister_function(self, function_name):
    """Remove the handler, and any start callback, for ``function_name``.

    Raises:
        KeyError: If no callback is registered under ``function_name``.
    """
    del self._callbacks[function_name]
    # A start callback is optional at registration time. Indexing
    # `_start_callbacks` directly raised KeyError for functions that were
    # registered without one.
    self._start_callbacks.pop(function_name, None)
|
||||
|
||||
async def _stream_chat_completions(
|
||||
self, context: OpenAILLMContext
|
||||
) -> AsyncStream[ChatCompletionChunk]:
|
||||
logger.debug(f"Generating chat: {context.get_messages_json()}")
|
||||
|
||||
messages: List[ChatCompletionMessageParam] = context.get_messages()
|
||||
messages_for_log = json.dumps(messages)
|
||||
logger.debug(f"Generating chat: {messages_for_log}")
|
||||
|
||||
# base64 encode any images
|
||||
for message in messages:
|
||||
if message.get("mime_type") == "image/jpeg":
|
||||
encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
|
||||
text = message["content"]
|
||||
message["content"] = [
|
||||
{"type": "text", "text": text},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
|
||||
]
|
||||
del message["data"]
|
||||
del message["mime_type"]
|
||||
|
||||
start_time = time.time()
|
||||
chunks: AsyncStream[ChatCompletionChunk] = (
|
||||
@@ -82,10 +122,6 @@ class BaseOpenAILLMService(LLMService):
|
||||
return chunks
|
||||
|
||||
async def _chat_completions(self, messages) -> str | None:
|
||||
messages_for_log = json.dumps(messages)
|
||||
|
||||
logger.debug(f"Generating chat: {messages_for_log}")
|
||||
|
||||
response: ChatCompletion = await self._client.chat.completions.create(
|
||||
model=self._model, stream=False, messages=messages
|
||||
)
|
||||
@@ -97,8 +133,7 @@ class BaseOpenAILLMService(LLMService):
|
||||
async def _process_context(self, context: OpenAILLMContext):
|
||||
function_name = ""
|
||||
arguments = ""
|
||||
|
||||
await self.push_frame(LLMResponseStartFrame())
|
||||
tool_call_id = ""
|
||||
|
||||
chunk_stream: AsyncStream[ChatCompletionChunk] = (
|
||||
await self._stream_chat_completions(context)
|
||||
@@ -123,21 +158,77 @@ class BaseOpenAILLMService(LLMService):
|
||||
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.function and tool_call.function.name:
|
||||
function_name += tool_call.function.name
|
||||
# yield LLMFunctionStartFrame(function_name=tool_call.function.name)
|
||||
tool_call_id = tool_call.id
|
||||
# only send a function start frame if we're not handling the function call
|
||||
if function_name in self._callbacks.keys():
|
||||
if function_name in self._start_callbacks.keys():
|
||||
await self._start_callbacks[function_name](self)
|
||||
if tool_call.function and tool_call.function.arguments:
|
||||
# Keep iterating through the response to collect all the argument fragments and
|
||||
# yield a complete LLMFunctionCallFrame after run_llm_async
|
||||
# completes
|
||||
# Keep iterating through the response to collect all the argument fragments
|
||||
arguments += tool_call.function.arguments
|
||||
elif chunk.choices[0].delta.content:
|
||||
await self.push_frame(LLMResponseStartFrame())
|
||||
await self.push_frame(TextFrame(chunk.choices[0].delta.content))
|
||||
await self.push_frame(LLMResponseEndFrame())
|
||||
|
||||
# if we got a function name and arguments, yield the frame with all the info so
|
||||
# frame consumers can take action based on the function call.
|
||||
# if function_name and arguments:
|
||||
# yield LLMFunctionCallFrame(function_name=function_name, arguments=arguments)
|
||||
# if we got a function name and arguments, check to see if it's a function with
|
||||
# a registered handler. If so, run the registered callback, save the result to
|
||||
# the context, and re-prompt to get a chat answer. If we don't have a registered
|
||||
# handler, raise an exception.
|
||||
if function_name and arguments:
|
||||
if function_name in self._callbacks.keys():
|
||||
await self._handle_function_call(context, tool_call_id, function_name, arguments)
|
||||
|
||||
await self.push_frame(LLMResponseEndFrame())
|
||||
else:
|
||||
raise OpenAIUnhandledFunctionException(
|
||||
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.")
|
||||
|
||||
async def _handle_function_call(
        self,
        context,
        tool_call_id,
        function_name,
        arguments
):
    """Run the registered callback for a tool call and act on its result.

    ``arguments`` (a JSON string) is decoded and the callback registered
    under ``function_name`` is awaited with ``(self, decoded_arguments)``.
    What happens next depends on the type of the callback's result:

    * ``str`` or ``dict``: an assistant ``tool_calls`` message and a ``tool``
      result message (dicts are JSON-encoded) are appended to ``context``,
      then the context is processed again.
    * ``list``: every item is appended to ``context`` as a message, then the
      context is processed again.
    * ``None``: nothing further happens.

    Raises:
        TypeError: If the callback returns any other type.
    """
    arguments = json.loads(arguments)
    result = await self._callbacks[function_name](self, arguments)
    # Re-encode so the recorded tool call carries the arguments as a JSON string.
    arguments = json.dumps(arguments)
    if isinstance(result, (str, dict)):
        # Handle it in "full magic mode"
        tool_call = ChatCompletionFunctionMessageParam({
            "role": "assistant",
            "tool_calls": [
                {
                    "id": tool_call_id,
                    "function": {
                        "arguments": arguments,
                        "name": function_name
                    },
                    "type": "function"
                }
            ]

        })
        context.add_message(tool_call)
        if isinstance(result, dict):
            result = json.dumps(result)
        tool_result = ChatCompletionToolParam({
            "tool_call_id": tool_call_id,
            "role": "tool",
            "content": result
        })
        context.add_message(tool_result)
        # re-prompt to get a human answer
        await self._process_context(context)
    elif isinstance(result, list):
        # reduced magic
        for msg in result:
            context.add_message(msg)
        await self._process_context(context)
    elif result is None:
        pass
    else:
        # TypeError instead of a bare BaseException: it is still caught by
        # `except BaseException`, and also by ordinary `except Exception`.
        raise TypeError(f"Unknown return type from function callback: {type(result)}")
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
context = None
|
||||
@@ -145,11 +236,15 @@ class BaseOpenAILLMService(LLMService):
|
||||
context: OpenAILLMContext = frame.context
|
||||
elif isinstance(frame, LLMMessagesFrame):
|
||||
context = OpenAILLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, VisionImageRawFrame):
|
||||
context = OpenAILLMContext.from_image_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
if context:
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self._process_context(context)
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
|
||||
class OpenAILLMService(BaseOpenAILLMService):
|
||||
@@ -174,7 +269,7 @@ class OpenAIImageGenService(ImageGenService):
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
async def run_image_gen(self, prompt: str):
|
||||
async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating image from prompt: {prompt}")
|
||||
|
||||
image = await self._client.images.generate(
|
||||
@@ -187,11 +282,13 @@ class OpenAIImageGenService(ImageGenService):
|
||||
image_url = image.data[0].url
|
||||
|
||||
if not image_url:
|
||||
logger.error(f"no image provided in response: {image}")
|
||||
logger.error(f"No image provided in response: {image}")
|
||||
yield ErrorFrame("Image generation failed")
|
||||
return
|
||||
|
||||
# Load the image from the url
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format)
|
||||
await self.push_frame(frame)
|
||||
yield frame
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
import io
|
||||
import struct
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, Frame
|
||||
from pipecat.services.ai_services import TTSService
|
||||
|
||||
from loguru import logger
|
||||
@@ -25,8 +27,8 @@ except ModuleNotFoundError as e:
|
||||
|
||||
class PlayHTAIService(TTSService):
|
||||
|
||||
def __init__(self, *, api_key, user_id, voice_url):
|
||||
super().__init__()
|
||||
def __init__(self, *, api_key: str, user_id: str, voice_url: str, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._user_id = user_id
|
||||
self._speech_key = api_key
|
||||
@@ -44,7 +46,7 @@ class PlayHTAIService(TTSService):
|
||||
def __del__(self):
|
||||
self._client.close()
|
||||
|
||||
async def run_tts(self, text: str):
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
b = bytearray()
|
||||
in_header = True
|
||||
for chunk in self._client.tts(text, self._options):
|
||||
@@ -69,4 +71,4 @@ class PlayHTAIService(TTSService):
|
||||
else:
|
||||
if len(chunk):
|
||||
frame = AudioRawFrame(chunk, 16000, 1)
|
||||
await self.push_frame(frame)
|
||||
yield frame
|
||||
|
||||
@@ -10,9 +10,11 @@ import asyncio
|
||||
import time
|
||||
|
||||
from enum import Enum
|
||||
from typing import BinaryIO
|
||||
from typing_extensions import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import TranscriptionFrame
|
||||
import numpy as np
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.services.ai_services import STTService
|
||||
|
||||
from loguru import logger
|
||||
@@ -39,14 +41,18 @@ class Model(Enum):
|
||||
class WhisperSTTService(STTService):
|
||||
"""Class to transcribe audio with a locally-downloaded Whisper model"""
|
||||
|
||||
def __init__(self, model_name: Model = Model.DISTIL_MEDIUM_EN,
|
||||
def __init__(self,
|
||||
model: Model = Model.DISTIL_MEDIUM_EN,
|
||||
device: str = "auto",
|
||||
compute_type: str = "default"):
|
||||
compute_type: str = "default",
|
||||
no_speech_prob: float = 0.1,
|
||||
**kwargs):
|
||||
|
||||
super().__init__()
|
||||
super().__init__(**kwargs)
|
||||
self._device: str = device
|
||||
self._compute_type = compute_type
|
||||
self._model_name: Model = model_name
|
||||
self._model_name: Model = model
|
||||
self._no_speech_prob = no_speech_prob
|
||||
self._model: WhisperModel | None = None
|
||||
self._load()
|
||||
|
||||
@@ -60,15 +66,21 @@ class WhisperSTTService(STTService):
|
||||
compute_type=self._compute_type)
|
||||
logger.debug("Loaded Whisper model")
|
||||
|
||||
async def run_stt(self, audio: BinaryIO):
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Transcribes given audio using Whisper"""
|
||||
if not self._model:
|
||||
yield ErrorFrame("Whisper model not available")
|
||||
logger.error("Whisper model not available")
|
||||
return
|
||||
|
||||
segments, _ = await asyncio.to_thread(self._model.transcribe, audio)
|
||||
# Divide by 32768 because we have signed 16-bit data.
|
||||
audio_float = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
|
||||
segments, _ = await asyncio.to_thread(self._model.transcribe, audio_float)
|
||||
text: str = ""
|
||||
for segment in segments:
|
||||
text += f"{segment.text} "
|
||||
if segment.no_speech_prob < self._no_speech_prob:
|
||||
text += f"{segment.text} "
|
||||
|
||||
await self.push_frame(TranscriptionFrame(text, "", int(time.time_ns() / 1000000)))
|
||||
if text:
|
||||
yield TranscriptionFrame(text, "", int(time.time_ns() / 1000000))
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
|
||||
import asyncio
|
||||
import queue
|
||||
import threading
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.frames.frames import (
|
||||
@@ -15,6 +16,8 @@ from pipecat.frames.frames import (
|
||||
StartFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame)
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
@@ -30,26 +33,50 @@ class BaseInputTransport(FrameProcessor):
|
||||
|
||||
self._params = params
|
||||
|
||||
self._running = True
|
||||
self._running = False
|
||||
self._allow_interruptions = False
|
||||
|
||||
# Start media threads.
|
||||
self._in_executor = ThreadPoolExecutor(max_workers=5)
|
||||
|
||||
# Create audio input queue if needed.
|
||||
if self._params.audio_in_enabled or self._params.vad_enabled:
|
||||
self._audio_in_queue = queue.Queue()
|
||||
self._audio_in_thread = threading.Thread(target=self._audio_in_thread_handler)
|
||||
self._audio_out_thread = threading.Thread(target=self._audio_out_thread_handler)
|
||||
|
||||
self._stopped_event = asyncio.Event()
|
||||
# Create push frame task. This is the task that will push frames in
|
||||
# order. We also guarantee that all frames are pushed in the same task.
|
||||
self._create_push_task()
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
# Make sure we have the latest params. Note that this transport might
|
||||
# have been started on another task that might not need interruptions,
|
||||
# for example.
|
||||
self._allow_interruptions = frame.allow_interruptions
|
||||
|
||||
if self._running:
|
||||
return
|
||||
|
||||
self._running = True
|
||||
|
||||
async def start(self):
|
||||
if self._params.audio_in_enabled or self._params.vad_enabled:
|
||||
self._audio_in_thread.start()
|
||||
self._audio_out_thread.start()
|
||||
loop = self.get_event_loop()
|
||||
self._audio_in_thread = loop.run_in_executor(
|
||||
self._in_executor, self._audio_in_thread_handler)
|
||||
self._audio_out_thread = loop.run_in_executor(
|
||||
self._in_executor, self._audio_out_thread_handler)
|
||||
|
||||
async def stop(self):
|
||||
if not self._running:
|
||||
return
|
||||
|
||||
# This will exit all threads.
|
||||
self._running = False
|
||||
|
||||
self._stopped_event.set()
|
||||
# Wait for the threads to finish.
|
||||
if self._params.audio_in_enabled or self._params.vad_enabled:
|
||||
await self._audio_in_thread
|
||||
await self._audio_out_thread
|
||||
|
||||
self._push_frame_task.cancel()
|
||||
|
||||
def vad_analyze(self, audio_frames: bytes) -> VADState:
|
||||
pass
|
||||
@@ -62,24 +89,62 @@ class BaseInputTransport(FrameProcessor):
|
||||
#
|
||||
|
||||
async def cleanup(self):
|
||||
if self._params.audio_in_enabled or self._params.vad_enabled:
|
||||
self._audio_in_thread.join()
|
||||
self._audio_out_thread.join()
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, StartFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
await self.start()
|
||||
elif isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
if isinstance(frame, CancelFrame):
|
||||
await self.stop()
|
||||
else:
|
||||
# We don't queue a CancelFrame since we want to stop ASAP.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, StartFrame):
|
||||
self._allow_interruption = frame.allow_interruptions
|
||||
await self.start(frame)
|
||||
await self._internal_push_frame(frame, direction)
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self.stop()
|
||||
await self._internal_push_frame(frame, direction)
|
||||
else:
|
||||
await self._internal_push_frame(frame, direction)
|
||||
|
||||
# If we are finishing, wait here until we have stopped, otherwise we
|
||||
# might close things too early upstream.
|
||||
if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
|
||||
await self._stopped_event.wait()
|
||||
#
|
||||
# Push frames task
|
||||
#
|
||||
|
||||
def _create_push_task(self):
    """Create a fresh push queue and the task that drains it.

    The queue is assigned before the task is created so that
    ``_push_frame_task_handler`` can never begin running without
    ``self._push_queue`` in place (this matters if the event loop starts
    tasks eagerly). Any previously assigned queue is replaced, so entries
    still waiting in it are not read by the new task.
    """
    self._push_queue = asyncio.Queue()
    loop = self.get_event_loop()
    self._push_frame_task = loop.create_task(self._push_frame_task_handler())
|
||||
|
||||
async def _internal_push_frame(
        self,
        frame: Frame | None,
        direction: FrameDirection | None = FrameDirection.DOWNSTREAM):
    """Queue a frame for the push task instead of pushing it directly.

    The (frame, direction) pair is placed on ``self._push_queue``; the push
    task handler later takes it from there and calls ``push_frame``.
    """
    queued_item = (frame, direction)
    await self._push_queue.put(queued_item)
|
||||
|
||||
async def _push_frame_task_handler(self):
    """Forward queued (frame, direction) pairs to ``push_frame`` until cancelled.

    The loop has no exit condition of its own: it ends only when the task
    is cancelled, and the cancellation is absorbed so the coroutine
    returns normally.
    """
    try:
        while True:
            frame, direction = await self._push_queue.get()
            await self.push_frame(frame, direction)
    except asyncio.CancelledError:
        # Cancellation is the normal way to stop this task.
        return
|
||||
|
||||
#
|
||||
# Handle interruptions
|
||||
#
|
||||
|
||||
async def _handle_interruptions(self, frame: Frame):
    """Emit interruption frames for user speaking events, then queue the frame.

    When interruptions are allowed, a UserStartedSpeakingFrame resets the
    push task and pushes a StartInterruptionFrame directly, and a
    UserStoppedSpeakingFrame pushes a StopInterruptionFrame directly. The
    incoming frame itself goes through the push queue.
    """
    # NOTE(review): indentation is not visible in this listing; the nesting
    # below (final queued push executed for every frame) is assumed - confirm.
    if self._allow_interruptions:
        # Make sure we notify about interruptions quickly out-of-band
        if isinstance(frame, UserStartedSpeakingFrame):
            logger.debug("User started speaking")
            # Cancel the current push task and build a new task and queue,
            # so frames still waiting in the old queue are not pushed.
            self._push_frame_task.cancel()
            self._create_push_task()
            # Pushed directly (not through the queue) so it is not delayed.
            await self.push_frame(StartInterruptionFrame())
        elif isinstance(frame, UserStoppedSpeakingFrame):
            logger.debug("User stopped speaking")
            await self.push_frame(StopInterruptionFrame())
    await self._internal_push_frame(frame)
|
||||
|
||||
#
|
||||
# Audio input
|
||||
@@ -93,11 +158,13 @@ class BaseInputTransport(FrameProcessor):
|
||||
frame = UserStartedSpeakingFrame()
|
||||
elif new_vad_state == VADState.QUIET:
|
||||
frame = UserStoppedSpeakingFrame()
|
||||
|
||||
if frame:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.push_frame(frame), self.get_event_loop())
|
||||
self._handle_interruptions(frame), self.get_event_loop())
|
||||
future.result()
|
||||
vad_state = new_vad_state
|
||||
|
||||
vad_state = new_vad_state
|
||||
return vad_state
|
||||
|
||||
def _audio_in_thread_handler(self):
|
||||
@@ -133,8 +200,10 @@ class BaseInputTransport(FrameProcessor):
|
||||
# Push audio downstream if passthrough.
|
||||
if audio_passthrough:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.push_frame(frame), self.get_event_loop())
|
||||
self._internal_push_frame(frame), self.get_event_loop())
|
||||
future.result()
|
||||
|
||||
self._audio_in_queue.task_done()
|
||||
except queue.Empty:
|
||||
pass
|
||||
except BaseException as e:
|
||||
|
||||
@@ -8,9 +8,12 @@
|
||||
import asyncio
|
||||
import itertools
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import threading
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from PIL import Image
|
||||
from typing import List
|
||||
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
@@ -22,6 +25,8 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
Frame,
|
||||
ImageRawFrame,
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
TransportMessageFrame)
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
|
||||
@@ -35,27 +40,50 @@ class BaseOutputTransport(FrameProcessor):
|
||||
|
||||
self._params = params
|
||||
|
||||
self._running = True
|
||||
self._running = False
|
||||
self._allow_interruptions = False
|
||||
|
||||
self._out_executor = ThreadPoolExecutor(max_workers=5)
|
||||
|
||||
# These are the images that we should send to the camera at our desired
|
||||
# framerate.
|
||||
self._camera_images = None
|
||||
|
||||
# Start media threads.
|
||||
# Create media threads queues.
|
||||
if self._params.camera_out_enabled:
|
||||
self._camera_out_queue = queue.Queue()
|
||||
self._camera_out_thread = threading.Thread(target=self._camera_out_thread_handler)
|
||||
self._camera_out_thread.start()
|
||||
|
||||
self._sink_queue = queue.Queue()
|
||||
self._sink_thread = threading.Thread(target=self._sink_thread_handler)
|
||||
|
||||
self._stopped_event = asyncio.Event()
|
||||
self._is_interrupted = threading.Event()
|
||||
|
||||
async def start(self):
|
||||
self._sink_thread.start()
|
||||
async def start(self, frame: StartFrame):
    """Start the output worker threads and the push task.

    The interruption setting is refreshed from ``frame`` on every call;
    everything else runs only the first time, guarded by ``self._running``.
    """
    # Make sure we have the latest params. Note that this transport might
    # have been started on another task that might not need interruptions,
    # for example.
    self._allow_interruptions = frame.allow_interruptions

    # Already started: nothing else to do.
    if self._running:
        return

    self._running = True

    loop = self.get_event_loop()

    # The handlers run in the transport's thread pool; the returned futures
    # are kept in the *_thread attributes.
    if self._params.camera_out_enabled:
        self._camera_out_thread = loop.run_in_executor(
            self._out_executor, self._camera_out_thread_handler)

    self._sink_thread = loop.run_in_executor(self._out_executor, self._sink_thread_handler)

    # Create push frame task. This is the task that will push frames in
    # order. We also guarantee that all frames are pushed in the same task.
    self._create_push_task()
|
||||
|
||||
async def stop(self):
|
||||
if not self._running:
|
||||
return
|
||||
|
||||
# This will exit all threads.
|
||||
self._running = False
|
||||
|
||||
@@ -75,64 +103,115 @@ class BaseOutputTransport(FrameProcessor):
|
||||
#
|
||||
|
||||
async def cleanup(self):
|
||||
# Wait on the threads to finish.
|
||||
if self._params.camera_out_enabled:
|
||||
self._camera_out_thread.join()
|
||||
await self._camera_out_thread
|
||||
|
||||
self._sink_thread.join()
|
||||
await self._sink_thread
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
#
|
||||
# Out-of-band frames like (CancelFrame or StartInterruptionFrame) are
|
||||
# pushed immediately. Other frames require order so they are put in the
|
||||
# sink queue.
|
||||
#
|
||||
if isinstance(frame, StartFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
await self.start()
|
||||
await self.start(frame)
|
||||
self._sink_queue.put(frame)
|
||||
# EndFrame is managed in the queue handler.
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
await self.stop()
|
||||
elif self._frame_managed_by_sink(frame):
|
||||
self._sink_queue.put(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, StartInterruptionFrame) or isinstance(frame, StopInterruptionFrame):
|
||||
await self._handle_interruptions(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
self._sink_queue.put(frame)
|
||||
|
||||
# If we are finishing, wait here until we have stopped, otherwise we might
|
||||
# close things too early upstream.
|
||||
# close things too early upstream. We need this event because we don't
|
||||
# know when the internal threads will finish.
|
||||
if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
|
||||
await self._stopped_event.wait()
|
||||
|
||||
def _frame_managed_by_sink(self, frame: Frame):
|
||||
return (isinstance(frame, AudioRawFrame)
|
||||
or isinstance(frame, ImageRawFrame)
|
||||
or isinstance(frame, SpriteFrame)
|
||||
or isinstance(frame, TransportMessageFrame)
|
||||
or isinstance(frame, CancelFrame)
|
||||
or isinstance(frame, EndFrame))
|
||||
async def _handle_interruptions(self, frame: Frame):
    """Update the interrupted state from interruption start/stop frames.

    Does nothing unless interruptions are allowed. A StartInterruptionFrame
    sets the interrupted flag and replaces the push task (and its queue);
    a StopInterruptionFrame clears the flag.
    """
    if self._allow_interruptions:
        if isinstance(frame, StartInterruptionFrame):
            self._is_interrupted.set()
            # Restart the push task so queued frames are discarded.
            self._push_frame_task.cancel()
            self._create_push_task()
        elif isinstance(frame, StopInterruptionFrame):
            self._is_interrupted.clear()
|
||||
|
||||
def _sink_thread_handler(self):
|
||||
buffer = bytearray()
|
||||
# 10ms bytes
|
||||
bytes_size_10ms = int(self._params.audio_out_sample_rate / 100) * \
|
||||
self._params.audio_out_channels * 2
|
||||
|
||||
# We will send at least 100ms bytes.
|
||||
smallest_write_size = bytes_size_10ms * 10
|
||||
|
||||
# Audio accumulation buffer
|
||||
buffer = bytearray()
|
||||
while self._running:
|
||||
try:
|
||||
frame = self._sink_queue.get(timeout=1)
|
||||
if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
|
||||
if not self._is_interrupted.is_set():
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
if self._params.audio_out_enabled:
|
||||
buffer.extend(frame.audio)
|
||||
buffer = self._send_audio_truncated(buffer, smallest_write_size)
|
||||
elif isinstance(frame, ImageRawFrame) and self._params.camera_out_enabled:
|
||||
self._set_camera_image(frame)
|
||||
elif isinstance(frame, SpriteFrame) and self._params.camera_out_enabled:
|
||||
self._set_camera_images(frame.images)
|
||||
elif isinstance(frame, TransportMessageFrame):
|
||||
self.send_message(frame)
|
||||
else:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._internal_push_frame(frame), self.get_event_loop())
|
||||
future.result()
|
||||
else:
|
||||
# If we get interrupted just clear the output buffer.
|
||||
buffer = bytearray()
|
||||
|
||||
if isinstance(frame, EndFrame):
|
||||
# Send all remaining audio before stopping (multiple of 10ms of audio).
|
||||
self._send_audio_truncated(buffer, bytes_size_10ms)
|
||||
future = asyncio.run_coroutine_threadsafe(self.stop(), self.get_event_loop())
|
||||
future.result()
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
if self._params.audio_out_enabled:
|
||||
buffer.extend(frame.audio)
|
||||
buffer = self._send_audio_truncated(buffer, bytes_size_10ms)
|
||||
elif isinstance(frame, ImageRawFrame) and self._params.camera_out_enabled:
|
||||
self._set_camera_image(frame)
|
||||
elif isinstance(frame, SpriteFrame) and self._params.camera_out_enabled:
|
||||
self._set_camera_images(frame.images)
|
||||
elif isinstance(frame, TransportMessageFrame):
|
||||
self.send_message(frame)
|
||||
|
||||
self._sink_queue.task_done()
|
||||
except queue.Empty:
|
||||
pass
|
||||
except BaseException as e:
|
||||
logger.error(f"Error processing sink queue: {e}")
|
||||
|
||||
#
|
||||
# Push frames task
|
||||
#
|
||||
|
||||
def _create_push_task(self):
    """Create a fresh push queue and the task that drains it.

    The queue is assigned before the task is created so that
    ``_push_frame_task_handler`` can never begin running without
    ``self._push_queue`` in place (this matters if the event loop starts
    tasks eagerly). Any previously assigned queue is replaced, so entries
    still waiting in it are not read by the new task.
    """
    self._push_queue = asyncio.Queue()
    loop = self.get_event_loop()
    self._push_frame_task = loop.create_task(self._push_frame_task_handler())
|
||||
|
||||
async def _internal_push_frame(
        self,
        frame: Frame | None,
        direction: FrameDirection | None = FrameDirection.DOWNSTREAM):
    """Queue a frame for the push task instead of pushing it directly.

    The (frame, direction) pair is placed on ``self._push_queue``; the push
    task handler later takes it from there and calls ``push_frame``.
    """
    queued_item = (frame, direction)
    await self._push_queue.put(queued_item)
|
||||
|
||||
async def _push_frame_task_handler(self):
    """Forward queued (frame, direction) pairs to ``push_frame`` until cancelled.

    The loop has no exit condition of its own: it ends only when the task
    is cancelled, and the cancellation is absorbed so the coroutine
    returns normally.
    """
    try:
        while True:
            frame, direction = await self._push_queue.get()
            await self.push_frame(frame, direction)
    except asyncio.CancelledError:
        # Cancellation is the normal way to stop this task.
        return
|
||||
|
||||
#
|
||||
# Camera out
|
||||
#
|
||||
@@ -140,15 +219,17 @@ class BaseOutputTransport(FrameProcessor):
|
||||
async def send_image(self, frame: ImageRawFrame | SpriteFrame):
|
||||
await self.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
def _draw_image(self, image: ImageRawFrame):
|
||||
def _draw_image(self, frame: ImageRawFrame):
|
||||
desired_size = (self._params.camera_out_width, self._params.camera_out_height)
|
||||
|
||||
if image.size != desired_size:
|
||||
if frame.size != desired_size:
|
||||
image = Image.frombytes(frame.format, frame.size, frame.image)
|
||||
resized_image = image.resize(desired_size)
|
||||
logger.warning(
|
||||
f"{image} does not have the expected size {desired_size}, ignoring")
|
||||
return
|
||||
f"{frame} does not have the expected size {desired_size}, resizing")
|
||||
frame = ImageRawFrame(resized_image.tobytes(), resized_image.size, resized_image.format)
|
||||
|
||||
self.write_frame_to_camera(image)
|
||||
self.write_frame_to_camera(frame)
|
||||
|
||||
def _set_camera_image(self, image: ImageRawFrame):
|
||||
if self._params.camera_out_is_live:
|
||||
@@ -165,10 +246,13 @@ class BaseOutputTransport(FrameProcessor):
|
||||
if self._params.camera_out_is_live:
|
||||
image = self._camera_out_queue.get(timeout=1)
|
||||
self._draw_image(image)
|
||||
self._camera_out_queue.task_done()
|
||||
elif self._camera_images:
|
||||
image = next(self._camera_images)
|
||||
self._draw_image(image)
|
||||
time.sleep(1.0 / self._params.camera_out_framerate)
|
||||
else:
|
||||
time.sleep(1.0 / self._params.camera_out_framerate)
|
||||
except queue.Empty:
|
||||
pass
|
||||
except Exception as e:
|
||||
|
||||
@@ -6,12 +6,16 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from pydantic import ConfigDict
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer
|
||||
|
||||
|
||||
class TransportParams(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
camera_out_enabled: bool = False
|
||||
camera_out_is_live: bool = False
|
||||
camera_out_width: int = 1024
|
||||
@@ -27,6 +31,7 @@ class TransportParams(BaseModel):
|
||||
audio_in_channels: int = 1
|
||||
vad_enabled: bool = False
|
||||
vad_audio_passthrough: bool = False
|
||||
vad_analyzer: VADAnalyzer | None = None
|
||||
|
||||
|
||||
class BaseTransport(ABC):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import asyncio
|
||||
|
||||
from pipecat.frames.frames import StartFrame
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
@@ -37,6 +38,14 @@ class LocalAudioInputTransport(BaseInputTransport):
|
||||
def read_raw_audio_frames(self, frame_count: int) -> bytes:
|
||||
return self._in_stream.read(frame_count, exception_on_overflow=False)
|
||||
|
||||
async def start(self, frame: StartFrame):
    """Run the base-class start, then start the input audio stream."""
    await super().start(frame)
    self._in_stream.start_stream()
|
||||
|
||||
async def stop(self):
    """Run the base-class stop, then stop the input audio stream."""
    await super().stop()
    self._in_stream.stop_stream()
|
||||
|
||||
async def cleanup(self):
|
||||
# This is not very pretty (taken from PyAudio docs).
|
||||
while self._in_stream.is_active():
|
||||
@@ -60,6 +69,14 @@ class LocalAudioOutputTransport(BaseOutputTransport):
|
||||
def write_raw_audio_frames(self, frames: bytes):
|
||||
self._out_stream.write(frames)
|
||||
|
||||
async def start(self, frame: StartFrame):
    """Run the base-class start, then start the output audio stream."""
    await super().start(frame)
    self._out_stream.start_stream()
|
||||
|
||||
async def stop(self):
    """Run the base-class stop, then stop the output audio stream."""
    await super().stop()
    self._out_stream.stop_stream()
|
||||
|
||||
async def cleanup(self):
|
||||
# This is not very pretty (taken from PyAudio docs).
|
||||
while self._out_stream.is_active():
|
||||
|
||||
@@ -9,7 +9,7 @@ import asyncio
|
||||
import numpy as np
|
||||
import tkinter as tk
|
||||
|
||||
from pipecat.frames.frames import ImageRawFrame
|
||||
from pipecat.frames.frames import ImageRawFrame, StartFrame
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
@@ -48,6 +48,14 @@ class TkInputTransport(BaseInputTransport):
|
||||
def read_raw_audio_frames(self, frame_count: int) -> bytes:
|
||||
return self._in_stream.read(frame_count, exception_on_overflow=False)
|
||||
|
||||
async def start(self, frame: StartFrame):
    """Run the base-class start, then start the input audio stream."""
    await super().start(frame)
    self._in_stream.start_stream()
|
||||
|
||||
async def stop(self):
    """Run the base-class stop, then stop the input audio stream."""
    await super().stop()
    self._in_stream.stop_stream()
|
||||
|
||||
async def cleanup(self):
|
||||
# This is not very pretty (taken from PyAudio docs).
|
||||
while self._in_stream.is_active():
|
||||
@@ -79,7 +87,15 @@ class TkOutputTransport(BaseOutputTransport):
|
||||
self._out_stream.write(frames)
|
||||
|
||||
def write_frame_to_camera(self, frame: ImageRawFrame):
    """Schedule ``_write_frame_to_tk(frame)`` to run on the event loop.

    The base output transport draws camera images from a handler started
    with ``run_in_executor``, so this method can be called from a thread
    other than the event loop's. ``call_soon`` may only be used from the
    loop's own thread; ``call_soon_threadsafe`` is the variant meant for
    callers on other threads.
    """
    self.get_event_loop().call_soon_threadsafe(self._write_frame_to_tk, frame)
|
||||
|
||||
async def start(self, frame: StartFrame):
    """Run the base-class start, then start the output audio stream."""
    await super().start(frame)
    self._out_stream.start_stream()
|
||||
|
||||
async def stop(self):
    """Run the base-class stop, then stop the output audio stream."""
    await super().stop()
    self._out_stream.stop_stream()
|
||||
|
||||
async def cleanup(self):
|
||||
# This is not very pretty (taken from PyAudio docs).
|
||||
@@ -89,7 +105,7 @@ class TkOutputTransport(BaseOutputTransport):
|
||||
|
||||
await super().cleanup()
|
||||
|
||||
async def _write_frame_to_tk(self, frame: ImageRawFrame):
|
||||
def _write_frame_to_tk(self, frame: ImageRawFrame):
|
||||
width = frame.size[0]
|
||||
height = frame.size[1]
|
||||
data = f"P6 {width} {height} 255 ".encode() + frame.image
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import inspect
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import types
|
||||
|
||||
@@ -16,8 +17,8 @@ from functools import partial
|
||||
from typing import Any, Callable, Mapping
|
||||
|
||||
from daily import (
|
||||
CallClient,
|
||||
Daily,
|
||||
CallClient,
|
||||
EventHandler,
|
||||
VirtualCameraDevice,
|
||||
VirtualMicrophoneDevice,
|
||||
@@ -30,6 +31,7 @@ from pipecat.frames.frames import (
|
||||
ImageRawFrame,
|
||||
InterimTranscriptionFrame,
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageFrame,
|
||||
UserImageRawFrame,
|
||||
@@ -38,7 +40,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -60,8 +62,8 @@ class DailyTransportMessageFrame(TransportMessageFrame):
|
||||
|
||||
class WebRTCVADAnalyzer(VADAnalyzer):
|
||||
|
||||
def __init__(self, sample_rate=16000, num_channels=1):
|
||||
super().__init__(sample_rate, num_channels)
|
||||
def __init__(self, sample_rate=16000, num_channels=1, params: VADParams = VADParams()):
|
||||
super().__init__(sample_rate, num_channels, params)
|
||||
|
||||
self._webrtc_vad = Daily.create_native_vad(
|
||||
reset_period_ms=VAD_RESET_PERIOD_MS,
|
||||
@@ -80,32 +82,50 @@ class WebRTCVADAnalyzer(VADAnalyzer):
|
||||
return confidence
|
||||
|
||||
|
||||
class DailyParams(TransportParams):
|
||||
transcription_enabled: bool = False
|
||||
transcription_settings: Mapping[str, Any] = {
|
||||
"language": "en",
|
||||
"tier": "nova",
|
||||
"model": "2-conversationalai",
|
||||
"profanity_filter": True,
|
||||
"redact": False,
|
||||
"endpointing": True,
|
||||
"punctuate": True,
|
||||
"includeRawResponse": True,
|
||||
"extra": {
|
||||
"interim_results": True,
|
||||
}
|
||||
class DailyDialinSettings(BaseModel):
    """Dial-in settings model; used as the type of ``DailyParams.dialin_settings``."""
    # NOTE(review): presumably these identify the incoming call to handle -
    # confirm against the code that consumes them.
    call_id: str = ""
    call_domain: str = ""
|
||||
|
||||
|
||||
class DailyTranscriptionSettings(BaseModel):
    """Transcription options; dumped to a dict and passed to ``start_transcription``."""
    # NOTE(review): field names and values are presumably dictated by the
    # transcription backend - confirm before renaming (e.g. includeRawResponse).
    language: str = "en"
    tier: str = "nova"
    model: str = "2-conversationalai"
    profanity_filter: bool = True
    redact: bool = False
    endpointing: bool = True
    punctuate: bool = True
    includeRawResponse: bool = True
    extra: Mapping[str, Any] = {
        "interim_results": True
    }
|
||||
|
||||
|
||||
class DailyParams(TransportParams):
    """Transport parameters extended with Daily-specific options."""
    # NOTE(review): api_url / api_key are presumably for Daily REST calls;
    # their use is not visible here - confirm.
    api_url: str = "https://api.daily.co"
    api_key: str = ""
    # Optional dial-in configuration; None when not set.
    dialin_settings: DailyDialinSettings | None = None
    # When enabled (and a token is available), transcription is started on
    # join using transcription_settings.
    transcription_enabled: bool = False
    transcription_settings: DailyTranscriptionSettings = DailyTranscriptionSettings()
|
||||
|
||||
|
||||
class DailyCallbacks(BaseModel):
|
||||
on_joined: Callable[[Mapping[str, Any]], None]
|
||||
on_left: Callable[[], None]
|
||||
on_participant_joined: Callable[[Mapping[str, Any]], None]
|
||||
on_first_participant_joined: Callable[[Mapping[str, Any]], None]
|
||||
on_error: Callable[[str], None]
|
||||
on_app_message: Callable[[Any, str], None]
|
||||
on_call_state_updated: Callable[[str], None]
|
||||
on_dialin_ready: Callable[[str], None]
|
||||
on_dialout_connected: Callable[[Any], None]
|
||||
on_dialout_stopped: Callable[[Any], None]
|
||||
on_dialout_error: Callable[[Any], None]
|
||||
on_dialout_warning: Callable[[Any], None]
|
||||
on_first_participant_joined: Callable[[Mapping[str, Any]], None]
|
||||
on_participant_joined: Callable[[Mapping[str, Any]], None]
|
||||
on_participant_left: Callable[[Mapping[str, Any], str], None]
|
||||
|
||||
|
||||
class DailySession(EventHandler):
|
||||
class DailyTransportClient(EventHandler):
|
||||
|
||||
_daily_initialized: bool = False
|
||||
|
||||
@@ -142,6 +162,8 @@ class DailySession(EventHandler):
|
||||
self._leaving = False
|
||||
self._sync_response = {k: queue.Queue() for k in ["join", "leave"]}
|
||||
|
||||
self._executor = ThreadPoolExecutor(max_workers=5)
|
||||
|
||||
self._client: CallClient = CallClient(event_handler=self)
|
||||
|
||||
self._camera: VirtualCameraDevice = Daily.create_camera_device(
|
||||
@@ -157,30 +179,25 @@ class DailySession(EventHandler):
|
||||
"speaker", sample_rate=self._params.audio_in_sample_rate, channels=self._params.audio_in_channels)
|
||||
Daily.select_speaker_device("speaker")
|
||||
|
||||
self._vad_analyzer = None
|
||||
if self._params.vad_enabled:
|
||||
self._vad_analyzer = WebRTCVADAnalyzer(
|
||||
sample_rate=self._params.audio_in_sample_rate,
|
||||
num_channels=self._params.audio_in_channels)
|
||||
|
||||
@ property
|
||||
@property
|
||||
def participant_id(self) -> str:
|
||||
return self._participant_id
|
||||
|
||||
def set_callbacks(self, callbacks: DailyCallbacks):
|
||||
self._callbacks = callbacks
|
||||
|
||||
def vad_analyze(self, audio_frames: bytes) -> VADState:
|
||||
state = VADState.QUIET
|
||||
if self._vad_analyzer:
|
||||
state = self._vad_analyzer.analyze_audio(audio_frames)
|
||||
return state
|
||||
|
||||
def send_message(self, frame: DailyTransportMessageFrame):
|
||||
self._client.send_app_message(frame.message, frame.participant_id)
|
||||
|
||||
def read_raw_audio_frames(self, frame_count: int) -> bytes:
|
||||
return self._speaker.read_frames(frame_count)
|
||||
if self._other_participant_has_joined:
|
||||
return self._speaker.read_frames(frame_count)
|
||||
else:
|
||||
# If no one has ever joined the meeting `read_frames()` would block,
|
||||
# instead we just wait a bit. daily-python should probably return
|
||||
# silence instead.
|
||||
time.sleep(0.01)
|
||||
return b''
|
||||
|
||||
def write_raw_audio_frames(self, frames: bytes):
|
||||
self._mic.write_frames(frames)
|
||||
@@ -196,7 +213,7 @@ class DailySession(EventHandler):
|
||||
self._joining = True
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
await loop.run_in_executor(None, self._join)
|
||||
await loop.run_in_executor(self._executor, self._join)
|
||||
|
||||
def _join(self):
|
||||
logger.info(f"Joining {self._room_url}")
|
||||
@@ -219,13 +236,13 @@ class DailySession(EventHandler):
|
||||
client_settings={
|
||||
"inputs": {
|
||||
"camera": {
|
||||
"isEnabled": True,
|
||||
"isEnabled": self._params.camera_out_enabled,
|
||||
"settings": {
|
||||
"deviceId": "camera",
|
||||
},
|
||||
},
|
||||
"microphone": {
|
||||
"isEnabled": True,
|
||||
"isEnabled": self._params.audio_out_enabled,
|
||||
"settings": {
|
||||
"deviceId": "mic",
|
||||
"customConstraints": {
|
||||
@@ -265,13 +282,15 @@ class DailySession(EventHandler):
|
||||
if self._token and self._params.transcription_enabled:
|
||||
logger.info(
|
||||
f"Enabling transcription with settings {self._params.transcription_settings}")
|
||||
self._client.start_transcription(self._params.transcription_settings)
|
||||
self._client.start_transcription(
|
||||
self._params.transcription_settings.model_dump())
|
||||
|
||||
self._callbacks.on_joined(data["participants"]["local"])
|
||||
else:
|
||||
error_msg = f"Error joining {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
self._callbacks.on_error(error_msg)
|
||||
self._sync_response["join"].task_done()
|
||||
except queue.Empty:
|
||||
error_msg = f"Time out joining {self._room_url}"
|
||||
logger.error(error_msg)
|
||||
@@ -286,7 +305,7 @@ class DailySession(EventHandler):
|
||||
self._leaving = True
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
await loop.run_in_executor(None, self._leave)
|
||||
await loop.run_in_executor(self._executor, self._leave)
|
||||
|
||||
def _leave(self):
|
||||
logger.info(f"Leaving {self._room_url}")
|
||||
@@ -309,6 +328,7 @@ class DailySession(EventHandler):
|
||||
error_msg = f"Error leaving {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
self._callbacks.on_error(error_msg)
|
||||
self._sync_response["leave"].task_done()
|
||||
except queue.Empty:
|
||||
error_msg = f"Time out leaving {self._room_url}"
|
||||
logger.error(error_msg)
|
||||
@@ -316,13 +336,25 @@ class DailySession(EventHandler):
|
||||
|
||||
async def cleanup(self):
|
||||
loop = asyncio.get_running_loop()
|
||||
await loop.run_in_executor(None, self._cleanup)
|
||||
await loop.run_in_executor(self._executor, self._cleanup)
|
||||
|
||||
def _cleanup(self):
    """Release the underlying call client, if one is held, and drop the reference."""
    client = self._client
    if not client:
        # Nothing to release (never created, or already cleaned up).
        return
    client.release()
    self._client = None
|
||||
|
||||
def start_dialout(self, settings):
|
||||
self._client.start_dialout(settings)
|
||||
|
||||
def stop_dialout(self, participant_id):
|
||||
self._client.stop_dialout(participant_id)
|
||||
|
||||
def start_recording(self, streaming_settings, stream_id, force_new):
|
||||
self._client.start_recording(streaming_settings, stream_id, force_new)
|
||||
|
||||
def stop_recording(self, stream_id):
|
||||
self._client.stop_recording(stream_id)
|
||||
|
||||
def capture_participant_transcription(self, participant_id: str, callback: Callable):
|
||||
if not self._params.transcription_enabled:
|
||||
return
|
||||
@@ -356,6 +388,27 @@ class DailySession(EventHandler):
|
||||
# Daily (EventHandler)
|
||||
#
|
||||
|
||||
def on_app_message(self, message: Any, sender: str):
|
||||
self._callbacks.on_app_message(message, sender)
|
||||
|
||||
def on_call_state_updated(self, state: str):
|
||||
self._callbacks.on_call_state_updated(state)
|
||||
|
||||
def on_dialin_ready(self, sip_endpoint: str):
|
||||
self._callbacks.on_dialin_ready(sip_endpoint)
|
||||
|
||||
def on_dialout_connected(self, data: Any):
|
||||
self._callbacks.on_dialout_connected(data)
|
||||
|
||||
def on_dialout_stopped(self, data: Any):
|
||||
self._callbacks.on_dialout_stopped(data)
|
||||
|
||||
def on_dialout_error(self, data: Any):
|
||||
self._callbacks.on_dialout_error(data)
|
||||
|
||||
def on_dialout_warning(self, data: Any):
|
||||
self._callbacks.on_dialout_warning(data)
|
||||
|
||||
def on_participant_joined(self, participant):
|
||||
id = participant["id"]
|
||||
logger.info(f"Participant joined {id}")
|
||||
@@ -366,6 +419,12 @@ class DailySession(EventHandler):
|
||||
|
||||
self._callbacks.on_participant_joined(participant)
|
||||
|
||||
def on_participant_left(self, participant, reason):
    """Log that a participant left and forward the event to the registered callbacks.

    Args:
        participant: Mapping describing the participant; its ``"id"`` entry is logged.
        reason: Passed through unchanged to the ``on_participant_left`` callback.
    """
    # Named ``participant_id`` rather than ``id`` so the builtin is not shadowed.
    participant_id = participant["id"]
    logger.info(f"Participant left {participant_id}")

    self._callbacks.on_participant_left(participant, reason)
|
||||
|
||||
def on_transcription_message(self, message: Mapping[str, Any]):
|
||||
participant_id = ""
|
||||
if "participantId" in message:
|
||||
@@ -384,6 +443,7 @@ class DailySession(EventHandler):
|
||||
def on_transcription_stopped(self, stopped_by, stopped_by_error):
|
||||
logger.debug("Transcription stopped")
|
||||
|
||||
#
|
||||
# Daily (CallClient callbacks)
|
||||
#
|
||||
|
||||
@@ -403,36 +463,54 @@ class DailySession(EventHandler):
|
||||
|
||||
class DailyInputTransport(BaseInputTransport):
|
||||
|
||||
def __init__(self, session: DailySession, params: DailyParams):
|
||||
def __init__(self, client: DailyTransportClient, params: DailyParams):
|
||||
super().__init__(params)
|
||||
|
||||
self._session = session
|
||||
self._client = client
|
||||
|
||||
self._video_renderers = {}
|
||||
self._camera_in_queue = queue.Queue()
|
||||
self._camera_in_thread = threading.Thread(target=self._camera_in_thread_handler)
|
||||
self._camera_in_thread.start()
|
||||
|
||||
async def start(self):
|
||||
await self._session.join()
|
||||
await super().start()
|
||||
self._vad_analyzer = params.vad_analyzer
|
||||
if params.vad_enabled and not params.vad_analyzer:
|
||||
self._vad_analyzer = WebRTCVADAnalyzer(
|
||||
sample_rate=self._params.audio_in_sample_rate,
|
||||
num_channels=self._params.audio_in_channels)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
if self._running:
|
||||
return
|
||||
# Join the room.
|
||||
await self._client.join()
|
||||
# This will set _running=True
|
||||
await super().start(frame)
|
||||
# Create camera in thread (runs if _running is true).
|
||||
loop = asyncio.get_running_loop()
|
||||
self._camera_in_thread = loop.run_in_executor(
|
||||
self._in_executor, self._camera_in_thread_handler)
|
||||
|
||||
async def stop(self):
|
||||
await self._session.leave()
|
||||
if not self._running:
|
||||
return
|
||||
# Leave the room.
|
||||
await self._client.leave()
|
||||
# This will set _running=False
|
||||
await super().stop()
|
||||
# The thread will stop.
|
||||
await self._camera_in_thread
|
||||
|
||||
async def cleanup(self):
|
||||
self._camera_in_thread.join()
|
||||
|
||||
await self._session.cleanup()
|
||||
|
||||
await super().cleanup()
|
||||
await self._client.cleanup()
|
||||
|
||||
def vad_analyze(self, audio_frames: bytes) -> VADState:
|
||||
return self._session.vad_analyze(audio_frames)
|
||||
state = VADState.QUIET
|
||||
if self._vad_analyzer:
|
||||
state = self._vad_analyzer.analyze_audio(audio_frames)
|
||||
return state
|
||||
|
||||
def read_raw_audio_frames(self, frame_count: int) -> bytes:
|
||||
return self._session.read_raw_audio_frames(frame_count)
|
||||
return self._client.read_raw_audio_frames(frame_count)
|
||||
|
||||
#
|
||||
# FrameProcessor
|
||||
@@ -445,24 +523,18 @@ class DailyInputTransport(BaseInputTransport):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
#
|
||||
# Transcription
|
||||
# Frames
|
||||
#
|
||||
|
||||
def capture_participant_transcription(self, participant_id: str):
|
||||
self._session.capture_participant_transcription(
|
||||
participant_id,
|
||||
self._on_transcription_message
|
||||
)
|
||||
def push_transcription_frame(self, frame: TranscriptionFrame | InterimTranscriptionFrame):
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._internal_push_frame(frame), self.get_event_loop())
|
||||
future.result()
|
||||
|
||||
def _on_transcription_message(self, participant_id, message):
|
||||
text = message["text"]
|
||||
timestamp = message["timestamp"]
|
||||
is_final = message["rawResponse"]["is_final"]
|
||||
if is_final:
|
||||
frame = TranscriptionFrame(text, participant_id, timestamp)
|
||||
else:
|
||||
frame = InterimTranscriptionFrame(text, participant_id, timestamp)
|
||||
future = asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop())
|
||||
def push_app_message(self, message: Any, sender: str):
|
||||
frame = DailyTransportMessageFrame(message=message, participant_id=sender)
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._internal_push_frame(frame), self.get_event_loop())
|
||||
future.result()
|
||||
|
||||
#
|
||||
@@ -481,7 +553,7 @@ class DailyInputTransport(BaseInputTransport):
|
||||
"render_next_frame": False,
|
||||
}
|
||||
|
||||
self._session.capture_participant_video(
|
||||
self._client.capture_participant_video(
|
||||
participant_id,
|
||||
self._on_participant_video_frame,
|
||||
framerate,
|
||||
@@ -522,8 +594,9 @@ class DailyInputTransport(BaseInputTransport):
|
||||
try:
|
||||
frame = self._camera_in_queue.get(timeout=1)
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.push_frame(frame), self.get_event_loop())
|
||||
self._internal_push_frame(frame), self.get_event_loop())
|
||||
future.result()
|
||||
self._camera_in_queue.task_done()
|
||||
except queue.Empty:
|
||||
pass
|
||||
except BaseException as e:
|
||||
@@ -532,28 +605,39 @@ class DailyInputTransport(BaseInputTransport):
|
||||
|
||||
class DailyOutputTransport(BaseOutputTransport):
|
||||
|
||||
def __init__(self, session: DailySession, params: DailyParams):
|
||||
def __init__(self, client: DailyTransportClient, params: DailyParams):
|
||||
super().__init__(params)
|
||||
|
||||
self._session = session
|
||||
self._client = client
|
||||
|
||||
async def start(self):
|
||||
await self._session.join()
|
||||
await super().start()
|
||||
async def start(self, frame: StartFrame):
|
||||
if self._running:
|
||||
return
|
||||
# This will set _running=True
|
||||
await super().start(frame)
|
||||
# Join the room.
|
||||
await self._client.join()
|
||||
|
||||
async def stop(self):
|
||||
await self._session.leave()
|
||||
if not self._running:
|
||||
return
|
||||
# This will set _running=False
|
||||
await super().stop()
|
||||
# Leave the room.
|
||||
await self._client.leave()
|
||||
|
||||
async def cleanup(self):
|
||||
await self._session.cleanup()
|
||||
await super().cleanup()
|
||||
await self._client.cleanup()
|
||||
|
||||
def send_message(self, frame: DailyTransportMessageFrame):
|
||||
self._client.send_message(frame)
|
||||
|
||||
def write_raw_audio_frames(self, frames: bytes):
|
||||
self._session.write_raw_audio_frames(frames)
|
||||
self._client.write_raw_audio_frames(frames)
|
||||
|
||||
def write_frame_to_camera(self, frame: ImageRawFrame):
|
||||
self._session.write_frame_to_camera(frame)
|
||||
self._client.write_frame_to_camera(frame)
|
||||
|
||||
|
||||
class DailyTransport(BaseTransport):
|
||||
@@ -562,13 +646,21 @@ class DailyTransport(BaseTransport):
|
||||
callbacks = DailyCallbacks(
|
||||
on_joined=self._on_joined,
|
||||
on_left=self._on_left,
|
||||
on_error=self._on_error,
|
||||
on_app_message=self._on_app_message,
|
||||
on_call_state_updated=self._on_call_state_updated,
|
||||
on_dialin_ready=self._on_dialin_ready,
|
||||
on_dialout_connected=self._on_dialout_connected,
|
||||
on_dialout_stopped=self._on_dialout_stopped,
|
||||
on_dialout_error=self._on_dialout_error,
|
||||
on_dialout_warning=self._on_dialout_warning,
|
||||
on_first_participant_joined=self._on_first_participant_joined,
|
||||
on_participant_joined=self._on_participant_joined,
|
||||
on_error=self._on_error,
|
||||
on_participant_left=self._on_participant_left,
|
||||
)
|
||||
self._params = params
|
||||
|
||||
self._session = DailySession(room_url, token, bot_name, params, callbacks)
|
||||
self._client = DailyTransportClient(room_url, token, bot_name, params, callbacks)
|
||||
self._input: DailyInputTransport | None = None
|
||||
self._output: DailyOutputTransport | None = None
|
||||
self._loop = asyncio.get_running_loop()
|
||||
@@ -579,8 +671,16 @@ class DailyTransport(BaseTransport):
|
||||
# these handlers.
|
||||
self._register_event_handler("on_joined")
|
||||
self._register_event_handler("on_left")
|
||||
self._register_event_handler("on_participant_joined")
|
||||
self._register_event_handler("on_app_message")
|
||||
self._register_event_handler("on_call_state_updated")
|
||||
self._register_event_handler("on_dialin_ready")
|
||||
self._register_event_handler("on_dialout_connected")
|
||||
self._register_event_handler("on_dialout_stopped")
|
||||
self._register_event_handler("on_dialout_error")
|
||||
self._register_event_handler("on_dialout_warning")
|
||||
self._register_event_handler("on_first_participant_joined")
|
||||
self._register_event_handler("on_participant_joined")
|
||||
self._register_event_handler("on_participant_left")
|
||||
|
||||
#
|
||||
# BaseTransport
|
||||
@@ -588,12 +688,12 @@ class DailyTransport(BaseTransport):
|
||||
|
||||
def input(self) -> FrameProcessor:
|
||||
if not self._input:
|
||||
self._input = DailyInputTransport(self._session, self._params)
|
||||
self._input = DailyInputTransport(self._client, self._params)
|
||||
return self._input
|
||||
|
||||
def output(self) -> FrameProcessor:
|
||||
if not self._output:
|
||||
self._output = DailyOutputTransport(self._session, self._params)
|
||||
self._output = DailyOutputTransport(self._client, self._params)
|
||||
return self._output
|
||||
|
||||
#
|
||||
@@ -602,7 +702,7 @@ class DailyTransport(BaseTransport):
|
||||
|
||||
@property
|
||||
def participant_id(self) -> str:
|
||||
return self._session.participant_id
|
||||
return self._client.participant_id
|
||||
|
||||
async def send_image(self, frame: ImageRawFrame | SpriteFrame):
|
||||
if self._output:
|
||||
@@ -612,9 +712,23 @@ class DailyTransport(BaseTransport):
|
||||
if self._output:
|
||||
await self._output.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
def start_dialout(self, settings=None):
|
||||
self._client.start_dialout(settings)
|
||||
|
||||
def stop_dialout(self, participant_id):
|
||||
self._client.stop_dialout(participant_id)
|
||||
|
||||
def start_recording(self, streaming_settings=None, stream_id=None, force_new=None):
|
||||
self._client.start_recording(streaming_settings, stream_id, force_new)
|
||||
|
||||
def stop_recording(self, stream_id=None):
|
||||
self._client.stop_recording(stream_id)
|
||||
|
||||
def capture_participant_transcription(self, participant_id: str):
|
||||
if self._input:
|
||||
self._input.capture_participant_transcription(participant_id)
|
||||
self._client.capture_participant_transcription(
|
||||
participant_id,
|
||||
self._on_transcription_message
|
||||
)
|
||||
|
||||
def capture_participant_video(
|
||||
self,
|
||||
@@ -634,15 +748,87 @@ class DailyTransport(BaseTransport):
|
||||
|
||||
def _on_error(self, error):
|
||||
# TODO(aleix): Report error to input/output transports. The one managing
|
||||
# the session should report the error.
|
||||
# the client should report the error.
|
||||
pass
|
||||
|
||||
def _on_app_message(self, message: Any, sender: str):
|
||||
if self._input:
|
||||
self._input.push_app_message(message, sender)
|
||||
self.on_app_message(message, sender)
|
||||
|
||||
def _on_call_state_updated(self, state: str):
|
||||
self.on_call_state_updated(state)
|
||||
|
||||
async def _handle_dialin_ready(self, sip_endpoint: str):
    """Report the SIP endpoint of a ready dial-in to the ``pinlessCallUpdate`` REST endpoint.

    Does nothing when no ``dialin_settings`` are configured. Otherwise it POSTs
    the call id, call domain and ``sip_endpoint`` to
    ``{api_url}/dialin/pinlessCallUpdate``. HTTP failures, timeouts and other
    ordinary errors are logged rather than raised.

    Args:
        sip_endpoint: SIP URI sent as the ``sipUri`` form field.
    """
    if not self._params.dialin_settings:
        return

    async with aiohttp.ClientSession() as session:
        headers = {
            "Authorization": f"Bearer {self._params.api_key}",
            "Content-Type": "application/x-www-form-urlencoded"
        }
        data = {
            "callId": self._params.dialin_settings.call_id,
            "callDomain": self._params.dialin_settings.call_domain,
            "sipUri": sip_endpoint
        }

        url = f"{self._params.api_url}/dialin/pinlessCallUpdate"

        # Explicit ClientTimeout object instead of a bare number of seconds.
        timeout = aiohttp.ClientTimeout(total=10)

        try:
            async with session.post(url, headers=headers, data=data, timeout=timeout) as r:
                if r.status != 200:
                    text = await r.text()
                    logger.error(
                        f"Unable to handle dialin-ready event (status: {r.status}, error: {text})")
                    return

                logger.debug("Event dialin-ready was handled successfully")
        except asyncio.TimeoutError:
            logger.error(f"Timeout handling dialin-ready event ({url})")
        except Exception as e:
            # Catch Exception, not BaseException, so that task cancellation
            # (asyncio.CancelledError), KeyboardInterrupt and SystemExit still
            # propagate instead of being logged and swallowed.
            logger.error(f"Error handling dialin-ready event ({url}): {e}")
|
||||
|
||||
def _on_dialin_ready(self, sip_endpoint):
|
||||
if self._params.dialin_settings:
|
||||
asyncio.run_coroutine_threadsafe(self._handle_dialin_ready(sip_endpoint), self._loop)
|
||||
self.on_dialin_ready(sip_endpoint)
|
||||
|
||||
def _on_dialout_connected(self, data):
|
||||
self.on_dialout_connected(data)
|
||||
|
||||
def _on_dialout_stopped(self, data):
|
||||
self.on_dialout_stopped(data)
|
||||
|
||||
def _on_dialout_error(self, data):
|
||||
self.on_dialout_error(data)
|
||||
|
||||
def _on_dialout_warning(self, data):
|
||||
self.on_dialout_warning(data)
|
||||
|
||||
def _on_participant_joined(self, participant):
|
||||
self.on_participant_joined(participant)
|
||||
|
||||
def _on_participant_left(self, participant, reason):
|
||||
self.on_participant_left(participant, reason)
|
||||
|
||||
def _on_first_participant_joined(self, participant):
|
||||
self.on_first_participant_joined(participant)
|
||||
|
||||
def _on_transcription_message(self, participant_id, message):
    """Turn a transcription message into a frame and hand it to the input transport.

    Messages whose ``rawResponse.is_final`` is truthy become a
    ``TranscriptionFrame`` (and are logged at debug level); all others become
    an ``InterimTranscriptionFrame``. The frame is dropped when no input
    transport has been created.
    """
    text, timestamp = message["text"], message["timestamp"]

    if not message["rawResponse"]["is_final"]:
        frame = InterimTranscriptionFrame(text, participant_id, timestamp)
    else:
        frame = TranscriptionFrame(text, participant_id, timestamp)
        logger.debug(f"Transcription (from: {participant_id}): [{text}]")

    if not self._input:
        return
    self._input.push_transcription_frame(frame)
|
||||
|
||||
#
|
||||
# Decorators (event handlers)
|
||||
#
|
||||
@@ -653,12 +839,36 @@ class DailyTransport(BaseTransport):
|
||||
def on_left(self):
|
||||
pass
|
||||
|
||||
def on_participant_joined(self, participant):
|
||||
def on_app_message(self, message, sender):
|
||||
pass
|
||||
|
||||
def on_call_state_updated(self, state):
|
||||
pass
|
||||
|
||||
def on_dialin_ready(self, sip_endpoint):
|
||||
pass
|
||||
|
||||
def on_dialout_connected(self, data):
|
||||
pass
|
||||
|
||||
def on_dialout_stopped(self, data):
|
||||
pass
|
||||
|
||||
def on_dialout_error(self, data):
|
||||
pass
|
||||
|
||||
def on_dialout_warning(self, data):
|
||||
pass
|
||||
|
||||
def on_first_participant_joined(self, participant):
|
||||
pass
|
||||
|
||||
def on_participant_joined(self, participant):
|
||||
pass
|
||||
|
||||
def on_participant_left(self, participant, reason):
|
||||
pass
|
||||
|
||||
def event_handler(self, event_name: str):
|
||||
def decorator(handler):
|
||||
self._add_event_handler(event_name, handler)
|
||||
@@ -698,46 +908,5 @@ class DailyTransport(BaseTransport):
|
||||
logger.error(f"Exception in event handler {event_name}: {e}")
|
||||
raise e
|
||||
|
||||
# def send_app_message(self, message: Any, participant_id: str | None):
|
||||
# self.client.send_app_message(message, participant_id)
|
||||
|
||||
# def process_interrupt_handler(self, signum, frame):
|
||||
# self._post_run()
|
||||
# if callable(self.original_sigint_handler):
|
||||
# self.original_sigint_handler(signum, frame)
|
||||
|
||||
# def _post_run(self):
|
||||
# self.client.leave()
|
||||
# self.client.release()
|
||||
|
||||
# def on_first_other_participant_joined(self, participant):
|
||||
# pass
|
||||
|
||||
# def call_joined(self, join_data, client_error):
|
||||
# # self._logger.info(f"Call_joined: {join_data}, {client_error}")
|
||||
# pass
|
||||
|
||||
# def dialout(self, number):
|
||||
# self.client.start_dialout({"phoneNumber": number})
|
||||
|
||||
# def start_recording(self):
|
||||
# self.client.start_recording()
|
||||
|
||||
# def on_error(self, error):
|
||||
# self._logger.error(f"on_error: {error}")
|
||||
|
||||
# def on_participant_joined(self, participant):
|
||||
# if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
||||
# self._other_participant_has_joined = True
|
||||
# self.on_first_other_participant_joined(participant)
|
||||
|
||||
# def on_participant_left(self, participant, reason):
|
||||
# if len(self.client.participants()) < self._min_others_count + 1:
|
||||
# self._stop_threads.set()
|
||||
|
||||
# def on_app_message(self, message: Any, sender: str):
|
||||
# if self._loop:
|
||||
# frame = ReceivedAppMessageFrame(message, sender)
|
||||
# asyncio.run_coroutine_threadsafe(
|
||||
# self.receive_queue.put(frame), self._loop
|
||||
# )
|
||||
|
||||
33
src/pipecat/utils/audio.py
Normal file
33
src/pipecat/utils/audio.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import pyloudnorm as pyln
|
||||
|
||||
|
||||
def normalize_value(value, min_value, max_value):
    """Linearly map ``value`` from ``[min_value, max_value]`` onto ``[0, 1]``.

    Results outside the unit interval are clamped, so values at or below
    ``min_value`` give 0 and values at or above ``max_value`` give 1.

    Args:
        value: The number to normalize.
        min_value: Input that maps to 0.
        max_value: Input that maps to 1.

    Returns:
        The clamped, normalized value.
    """
    span = max_value - min_value
    if span == 0:
        # Degenerate range: dividing would raise ZeroDivisionError. Treat the
        # range as a step located at min_value instead.
        return 1 if value > min_value else 0

    normalized = (value - min_value) / span
    return max(0, min(1, normalized))
|
||||
|
||||
|
||||
def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
    """Estimate the volume of a chunk of audio as a number between 0 and 1.

    The buffer is read as signed 16-bit samples, its integrated loudness is
    measured with a meter whose block size equals the buffer's duration, and
    the loudness is normalized from the range -20..80 onto 0..1.

    Args:
        audio: Raw audio bytes, interpreted as int16 samples.
        sample_rate: Sample rate of ``audio`` in Hz.

    Returns:
        The normalized volume; 0.0 for an empty buffer.
    """
    audio_np = np.frombuffer(audio, dtype=np.int16)
    if audio_np.size == 0:
        # No samples means a zero-length block, which the loudness meter
        # cannot work with; report silence instead.
        return 0.0
    audio_float = audio_np.astype(np.float64)

    # Measure the whole buffer as a single block (duration in seconds).
    block_size = audio_np.size / sample_rate
    meter = pyln.Meter(sample_rate, block_size=block_size)
    loudness = meter.integrated_loudness(audio_float)

    # Loudness goes from -20 to 80 (more or less), where -20 is quiet and 80 is
    # loud.
    loudness = normalize_value(loudness, -20, 80)

    return loudness
|
||||
|
||||
|
||||
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
    """Move the previous smoothed value toward ``value`` by a fraction ``factor``.

    A ``factor`` of 0 keeps ``prev_value`` unchanged; larger factors give the
    new sample more weight.
    """
    step = factor * (value - prev_value)
    return prev_value + step
|
||||
41
src/pipecat/utils/test_frame_processor.py
Normal file
41
src/pipecat/utils/test_frame_processor.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import List
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
|
||||
|
||||
class TestException(BaseException):
    """Raised by the test frame processor when a frame does not match what the test expects."""
|
||||
|
||||
|
||||
class TestFrameProcessor(FrameProcessor):
    """Frame processor that checks incoming frames against an expected sequence.

    ``test_frames`` is a list whose entries are either a frame class (exactly
    one frame of that type is expected next) or a list of frame classes (that
    pattern is expected to repeat, in order, until a frame matching the entry
    *after* the list arrives). Matched entries are popped from ``test_frames``
    in place; any mismatch raises ``TestException``.
    """

    def __init__(self, test_frames):
        self.test_frames = test_frames
        # Position inside the list pattern currently being matched.
        self._list_counter = 0
        super().__init__()

    async def process_frame(self, frame, direction):
        """Validate ``frame`` against the next expectation; ``direction`` is not used."""
        # Check for an exhausted list before indexing, so that an unexpected
        # extra frame raises TestException rather than IndexError.
        if not self.test_frames or not self.test_frames[0]:
            raise TestException(f"Oops, got an extra frame, {frame}")

        expected = self.test_frames[0]

        if isinstance(expected, list):
            # We need to consume frames until we see the next frame type after
            # this list. A list that is the last entry has no exit frame and
            # simply keeps matching its pattern.
            has_exit_frame = len(self.test_frames) > 1
            if has_exit_frame and isinstance(frame, self.test_frames[1]):
                print(f"TestFrameProcessor got expected list exit frame: {frame}")
                # Drop both the list and the exit frame that terminated it.
                del self.test_frames[:2]
                # Reset so a later list pattern starts from its first element.
                self._list_counter = 0
                return

            fl_el = expected[self._list_counter % len(expected)]
            if not isinstance(frame, fl_el):
                raise TestException(f"Inside a list, expected {fl_el} but got {frame}")
            print(f"TestFrameProcessor got expected list frame: {frame}")
            self._list_counter += 1
            return

        if not isinstance(frame, expected):
            raise TestException(f"Expected {self.test_frames[0]}, but got {frame}")
        print(f"TestFrameProcessor got expected frame: {frame}")
        self.test_frames.pop(0)
|
||||
@@ -29,3 +29,7 @@ def obj_count(obj) -> int:
|
||||
else:
|
||||
_COUNTS[name] += 1
|
||||
return _COUNTS[name]
|
||||
|
||||
|
||||
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
    """Return ``prev_value`` nudged toward ``value`` by the fraction ``factor``.

    With ``factor`` equal to 0 the previous value is returned unchanged.
    """
    delta = value - prev_value
    return prev_value + factor * delta
|
||||
|
||||
@@ -8,7 +8,7 @@ import numpy as np
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, Frame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
from pipecat.vad.vad_analyzer import VADAnalyzer, VADParams, VADState
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -26,24 +26,10 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module(s): {e}")
|
||||
|
||||
|
||||
# Provided by Alexander Veysov
|
||||
def int2float(sound):
|
||||
try:
|
||||
abs_max = np.abs(sound).max()
|
||||
sound = sound.astype("float32")
|
||||
if abs_max > 0:
|
||||
sound *= 1 / 32768
|
||||
sound = sound.squeeze() # depends on the use case
|
||||
return sound
|
||||
except ValueError:
|
||||
return sound
|
||||
class SileroVADAnalyzer(VADAnalyzer):
|
||||
|
||||
|
||||
class SileroVAD(FrameProcessor, VADAnalyzer):
|
||||
|
||||
def __init__(self, sample_rate=16000, audio_passthrough=False):
|
||||
FrameProcessor.__init__(self)
|
||||
VADAnalyzer.__init__(self, sample_rate=sample_rate, num_channels=1)
|
||||
def __init__(self, sample_rate=16000, params: VADParams = VADParams()):
|
||||
super().__init__(sample_rate=sample_rate, num_channels=1, params=params)
|
||||
|
||||
logger.debug("Loading Silero VAD model...")
|
||||
|
||||
@@ -51,9 +37,6 @@ class SileroVAD(FrameProcessor, VADAnalyzer):
|
||||
repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
|
||||
)
|
||||
|
||||
self._processor_vad_state: VADState = VADState.QUIET
|
||||
self._audio_passthrough = audio_passthrough
|
||||
|
||||
logger.debug("Loaded Silero VAD")
|
||||
|
||||
#
|
||||
@@ -66,7 +49,8 @@ class SileroVAD(FrameProcessor, VADAnalyzer):
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
try:
|
||||
audio_int16 = np.frombuffer(buffer, np.int16)
|
||||
audio_float32 = int2float(audio_int16)
|
||||
# Divide by 32768 because we have signed 16-bit data.
|
||||
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
new_confidence = self._model(torch.from_numpy(audio_float32), self.sample_rate).item()
|
||||
return new_confidence
|
||||
except BaseException as e:
|
||||
@@ -74,6 +58,21 @@ class SileroVAD(FrameProcessor, VADAnalyzer):
|
||||
logger.error(f"Error analyzing audio with Silero VAD: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
class SileroVAD(FrameProcessor):
|
||||
|
||||
def __init__(
        self,
        sample_rate: int = 16000,
        vad_params: VADParams | None = None,
        audio_passthrough: bool = False):
    """Create the processor and the Silero analyzer it delegates to.

    Args:
        sample_rate: Sample rate passed to the ``SileroVADAnalyzer``.
        vad_params: Analyzer parameters; a fresh ``VADParams()`` is used when omitted.
        audio_passthrough: Stored as ``_audio_passthrough`` for use by the processor.
    """
    super().__init__()

    # Build the default per instance. A ``VADParams()`` default in the
    # signature is evaluated once and would be shared by every processor.
    if vad_params is None:
        vad_params = VADParams()

    self._vad_analyzer = SileroVADAnalyzer(sample_rate=sample_rate, params=vad_params)
    self._audio_passthrough = audio_passthrough

    self._processor_vad_state: VADState = VADState.QUIET
|
||||
|
||||
#
|
||||
# FrameProcessor
|
||||
#
|
||||
@@ -89,7 +88,7 @@ class SileroVAD(FrameProcessor, VADAnalyzer):
|
||||
async def _analyze_audio(self, frame: AudioRawFrame):
|
||||
# Check VAD and push event if necessary. We just care about changes
|
||||
# from QUIET to SPEAKING and vice versa.
|
||||
new_vad_state = self.analyze_audio(frame.audio)
|
||||
new_vad_state = self._vad_analyzer.analyze_audio(frame.audio)
|
||||
if new_vad_state != self._processor_vad_state and new_vad_state != VADState.STARTING and new_vad_state != VADState.STOPPING:
|
||||
new_frame = None
|
||||
|
||||
|
||||
@@ -7,6 +7,10 @@
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
|
||||
from pydantic.main import BaseModel
|
||||
|
||||
from pipecat.utils.audio import calculate_audio_volume, exp_smoothing
|
||||
|
||||
|
||||
class VADState(Enum):
|
||||
QUIET = 1
|
||||
@@ -15,32 +19,36 @@ class VADState(Enum):
|
||||
STOPPING = 4
|
||||
|
||||
|
||||
class VADParams(BaseModel):
|
||||
confidence: float = 0.6
|
||||
start_secs: float = 0.2
|
||||
stop_secs: float = 0.8
|
||||
min_volume: float = 0.6
|
||||
|
||||
|
||||
class VADAnalyzer:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate,
|
||||
num_channels,
|
||||
vad_confidence=0.5,
|
||||
vad_start_s=0.2,
|
||||
vad_stop_s=0.8):
|
||||
def __init__(self, sample_rate: int, num_channels: int, params: VADParams):
|
||||
self._sample_rate = sample_rate
|
||||
self._vad_confidence = vad_confidence
|
||||
self._vad_start_s = vad_start_s
|
||||
self._vad_stop_s = vad_stop_s
|
||||
self._num_channels = num_channels
|
||||
self._params = params
|
||||
self._vad_frames = self.num_frames_required()
|
||||
self._vad_frames_num_bytes = self._vad_frames * num_channels * 2
|
||||
|
||||
vad_frame_s = self._vad_frames / self._sample_rate
|
||||
vad_frames_per_sec = self._vad_frames / self._sample_rate
|
||||
|
||||
self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
|
||||
self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
|
||||
self._vad_start_frames = round(self._params.start_secs / vad_frames_per_sec)
|
||||
self._vad_stop_frames = round(self._params.stop_secs / vad_frames_per_sec)
|
||||
self._vad_starting_count = 0
|
||||
self._vad_stopping_count = 0
|
||||
self._vad_state: VADState = VADState.QUIET
|
||||
|
||||
self._vad_buffer = b""
|
||||
|
||||
# Volume exponential smoothing
|
||||
self._smoothing_factor = 0.4
|
||||
self._prev_volume = 1 - self._smoothing_factor
|
||||
|
||||
@property
|
||||
def sample_rate(self):
|
||||
return self._sample_rate
|
||||
@@ -53,6 +61,10 @@ class VADAnalyzer:
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
pass
|
||||
|
||||
def _get_smoothed_volume(self, audio: bytes) -> float:
    """Compute the volume of ``audio`` and smooth it against the previous volume.

    This method does not update ``_prev_volume`` itself.
    """
    raw_volume = calculate_audio_volume(audio, self._sample_rate)
    smoothed = exp_smoothing(raw_volume, self._prev_volume, self._smoothing_factor)
    return smoothed
|
||||
|
||||
def analyze_audio(self, buffer) -> VADState:
|
||||
self._vad_buffer += buffer
|
||||
|
||||
@@ -64,7 +76,11 @@ class VADAnalyzer:
|
||||
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
||||
|
||||
confidence = self.voice_confidence(audio_frames)
|
||||
speaking = confidence >= self._vad_confidence
|
||||
|
||||
volume = self._get_smoothed_volume(audio_frames)
|
||||
self._prev_volume = volume
|
||||
|
||||
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
||||
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import asyncio
|
||||
import os
|
||||
from pipecat.pipeline.openai_frames import OpenAILLMContextFrame
|
||||
from pipecat.services.azure_ai_services import AzureLLMService
|
||||
from pipecat.services.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
|
||||
from pipecat.services.azure import AzureLLMService
|
||||
from pipecat.services.openai import OpenAILLMContext
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionSystemMessageParam,
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
import asyncio
|
||||
from pipecat.pipeline.openai_frames import OpenAILLMContextFrame
|
||||
from pipecat.services.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame, OpenAILLMContext
|
||||
|
||||
from openai.types.chat import (
|
||||
ChatCompletionSystemMessageParam,
|
||||
)
|
||||
from pipecat.services.ollama_ai_services import OLLamaLLMService
|
||||
from pipecat.services.ollama import OLLamaLLMService
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def test_chat():
|
||||
|
||||
@@ -1,51 +1,75 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pipecat.pipeline.openai_frames import OpenAILLMContextFrame
|
||||
from pipecat.services.openai_llm_context import OpenAILLMContext
|
||||
from typing import List
|
||||
|
||||
|
||||
from pipecat.services.openai import OpenAILLMContextFrame, OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.frames.frames import (
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
TextFrame
|
||||
)
|
||||
from pipecat.utils.test_frame_processor import TestFrameProcessor
|
||||
from openai.types.chat import (
|
||||
ChatCompletionSystemMessageParam,
|
||||
ChatCompletionToolParam,
|
||||
ChatCompletionUserMessageParam,
|
||||
)
|
||||
|
||||
from pipecat.services.openai_api_llm_service import BaseOpenAILLMService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
|
||||
tools = [
|
||||
ChatCompletionToolParam(
|
||||
type="function",
|
||||
function={
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"celsius",
|
||||
"fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
"required": [
|
||||
"location",
|
||||
"format"],
|
||||
},
|
||||
})]
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def test_functions():
|
||||
tools = [
|
||||
ChatCompletionToolParam(
|
||||
type="function",
|
||||
function={
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"celsius",
|
||||
"fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
"required": [
|
||||
"location",
|
||||
"format"],
|
||||
},
|
||||
})]
|
||||
async def test_simple_functions():
|
||||
|
||||
async def get_weather_from_api(llm, args):
|
||||
return json.dumps({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
llm = BaseOpenAILLMService(
|
||||
llm = OpenAILLMService(
|
||||
api_key=api_key or "",
|
||||
model="gpt-4-1106-preview",
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", get_weather_from_api)
|
||||
t = TestFrameProcessor([
|
||||
LLMFullResponseStartFrame,
|
||||
[LLMResponseStartFrame, TextFrame, LLMResponseEndFrame],
|
||||
LLMFullResponseEndFrame
|
||||
])
|
||||
llm.link(t)
|
||||
|
||||
context = OpenAILLMContext(tools=tools)
|
||||
system_message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam(
|
||||
content="Ask the user to ask for a weather report", name="system", role="system"
|
||||
@@ -58,26 +82,64 @@ if __name__ == "__main__":
|
||||
context.add_message(system_message)
|
||||
context.add_message(user_message)
|
||||
frame = OpenAILLMContextFrame(context)
|
||||
async for s in llm.process_frame(frame):
|
||||
print(s)
|
||||
await llm.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def test_advanced_functions():
|
||||
|
||||
async def get_weather_from_api(llm, args):
|
||||
return [{"role": "system", "content": "The user has asked for live weather. Respond by telling them we don't currently support live weather for that area, but it's coming soon."}]
|
||||
|
||||
async def test_chat():
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
llm = BaseOpenAILLMService(
|
||||
llm = OpenAILLMService(
|
||||
api_key=api_key or "",
|
||||
model="gpt-4-1106-preview",
|
||||
)
|
||||
|
||||
llm.register_function("get_current_weather", get_weather_from_api)
|
||||
t = TestFrameProcessor([
|
||||
LLMFullResponseStartFrame,
|
||||
[LLMResponseStartFrame, TextFrame, LLMResponseEndFrame],
|
||||
LLMFullResponseEndFrame
|
||||
])
|
||||
llm.link(t)
|
||||
|
||||
context = OpenAILLMContext(tools=tools)
|
||||
system_message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam(
|
||||
content="Ask the user to ask for a weather report", name="system", role="system"
|
||||
)
|
||||
user_message: ChatCompletionUserMessageParam = ChatCompletionUserMessageParam(
|
||||
content="Could you tell me the weather for Boulder, Colorado",
|
||||
name="user",
|
||||
role="user",
|
||||
)
|
||||
context.add_message(system_message)
|
||||
context.add_message(user_message)
|
||||
frame = OpenAILLMContextFrame(context)
|
||||
await llm.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def test_chat():
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
t = TestFrameProcessor([
|
||||
LLMFullResponseStartFrame,
|
||||
[LLMResponseStartFrame, TextFrame, LLMResponseEndFrame],
|
||||
LLMFullResponseEndFrame
|
||||
])
|
||||
llm = OpenAILLMService(
|
||||
api_key=api_key or "",
|
||||
model="gpt-4o",
|
||||
)
|
||||
llm.link(t)
|
||||
context = OpenAILLMContext()
|
||||
message: ChatCompletionSystemMessageParam = ChatCompletionSystemMessageParam(
|
||||
content="Please tell the world hello.", name="system", role="system")
|
||||
context.add_message(message)
|
||||
frame = OpenAILLMContextFrame(context)
|
||||
async for s in llm.process_frame(frame):
|
||||
print(s)
|
||||
await llm.process_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
async def run_tests():
|
||||
await test_functions()
|
||||
await test_simple_functions()
|
||||
await test_advanced_functions()
|
||||
await test_chat()
|
||||
|
||||
asyncio.run(run_tests())
|
||||
|
||||
@@ -3,16 +3,15 @@ import doctest
|
||||
import functools
|
||||
import unittest
|
||||
|
||||
from pipecat.pipeline.aggregators import (
|
||||
GatedAggregator,
|
||||
ParallelPipeline,
|
||||
SentenceAggregator,
|
||||
StatelessTextTransformer,
|
||||
)
|
||||
from pipecat.pipeline.frames import (
|
||||
AudioFrame,
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.text_transformer import StatelessTextTransformer
|
||||
from pipecat.processors.aggregators.gated import GatedAggregator
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
EndFrame,
|
||||
ImageFrame,
|
||||
ImageRawFrame,
|
||||
LLMResponseEndFrame,
|
||||
LLMResponseStartFrame,
|
||||
Frame,
|
||||
@@ -46,26 +45,26 @@ class TestDailyFrameAggregators(unittest.IsolatedAsyncioTestCase):
|
||||
async def test_gated_accumulator(self):
|
||||
gated_aggregator = GatedAggregator(
|
||||
gate_open_fn=lambda frame: isinstance(
|
||||
frame, ImageFrame), gate_close_fn=lambda frame: isinstance(
|
||||
frame, ImageRawFrame), gate_close_fn=lambda frame: isinstance(
|
||||
frame, LLMResponseStartFrame), start_open=False, )
|
||||
|
||||
frames = [
|
||||
LLMResponseStartFrame(),
|
||||
TextFrame("Hello, "),
|
||||
TextFrame("world."),
|
||||
AudioFrame(b"hello"),
|
||||
ImageFrame(b"image", (0, 0)),
|
||||
AudioFrame(b"world"),
|
||||
AudioRawFrame(b"hello", 1, 1),
|
||||
ImageRawFrame(b"image", (0, 0)),
|
||||
AudioRawFrame(b"world", 1, 1),
|
||||
LLMResponseEndFrame(),
|
||||
]
|
||||
|
||||
expected_output_frames = [
|
||||
ImageFrame(b"image", (0, 0)),
|
||||
ImageRawFrame(b"image", (0, 0)),
|
||||
LLMResponseStartFrame(),
|
||||
TextFrame("Hello, "),
|
||||
TextFrame("world."),
|
||||
AudioFrame(b"hello"),
|
||||
AudioFrame(b"world"),
|
||||
AudioRawFrame(b"hello", 1, 1),
|
||||
AudioRawFrame(b"world", 1, 1),
|
||||
LLMResponseEndFrame(),
|
||||
]
|
||||
for frame in frames:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user