Compare commits
283 Commits
aleix/audi
...
hush/hidde
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d175e5e5fc | ||
|
|
6eed6ff779 | ||
|
|
1375211610 | ||
|
|
4e9369a702 | ||
|
|
f9e8748a96 | ||
|
|
20d6bf267a | ||
|
|
b573f9dab2 | ||
|
|
dbc76389d8 | ||
|
|
c27f838444 | ||
|
|
ce84485e26 | ||
|
|
6cf254e2f9 | ||
|
|
02b63c28a5 | ||
|
|
57c6ce7ffa | ||
|
|
2f3272ea2f | ||
|
|
f5c2d57e4b | ||
|
|
baa878272d | ||
|
|
093285868e | ||
|
|
6c9d058ec2 | ||
|
|
5df7be6892 | ||
|
|
2deca816ae | ||
|
|
b8d2fceced | ||
|
|
7596d71460 | ||
|
|
096067b097 | ||
|
|
ec09505f6b | ||
|
|
251ea756c8 | ||
|
|
8f6544efe2 | ||
|
|
6045a8ad8c | ||
|
|
b184d62634 | ||
|
|
1a8d512abb | ||
|
|
a62be8ea32 | ||
|
|
c230d94ff0 | ||
|
|
e7b02773f5 | ||
|
|
ed83248a6b | ||
|
|
af8b4901d4 | ||
|
|
64c8230960 | ||
|
|
bf664534cc | ||
|
|
274a04e535 | ||
|
|
cb81f3d50e | ||
|
|
30a3b24287 | ||
|
|
8aacf71956 | ||
|
|
72d503d3a3 | ||
|
|
453a904290 | ||
|
|
368bff4fb4 | ||
|
|
4ae045d704 | ||
|
|
8c71939425 | ||
|
|
a437c2d365 | ||
|
|
a1784e3237 | ||
|
|
abee0f853c | ||
|
|
e9d358ed17 | ||
|
|
c5d54d06bb | ||
|
|
c16eed7ca2 | ||
|
|
76388a10b5 | ||
|
|
38bcc033a2 | ||
|
|
5af563cd91 | ||
|
|
3de271161c | ||
|
|
c19f9bc43a | ||
|
|
ef85d245ed | ||
|
|
25749bd4c0 | ||
|
|
e19c5464fe | ||
|
|
5c2ea3b804 | ||
|
|
c27348d470 | ||
|
|
de5f9c9217 | ||
|
|
f9086ee3a2 | ||
|
|
43298a9026 | ||
|
|
d80e228c6f | ||
|
|
2902362886 | ||
|
|
1cd303ad7f | ||
|
|
f590a476e7 | ||
|
|
e71cb3ba68 | ||
|
|
510a9af2e5 | ||
|
|
5328f84df4 | ||
|
|
18817fd81b | ||
|
|
4bcc536fd2 | ||
|
|
1ab2ddd317 | ||
|
|
09aa168840 | ||
|
|
05753fb207 | ||
|
|
715e3f8543 | ||
|
|
9c9d4b35a4 | ||
|
|
2ee935f784 | ||
|
|
58aedc88a4 | ||
|
|
0e60385871 | ||
|
|
a4188f7986 | ||
|
|
c7cbfe7a4f | ||
|
|
f1c9f5040b | ||
|
|
79e51051c7 | ||
|
|
a63d0da528 | ||
|
|
4fd8df208f | ||
|
|
44d3bd30fa | ||
|
|
6e6e932370 | ||
|
|
baccf50417 | ||
|
|
7b1071b30d | ||
|
|
bd7ca94196 | ||
|
|
1ec1aa76e9 | ||
|
|
77c369c3c7 | ||
|
|
9171d4b040 | ||
|
|
e02b95fca5 | ||
|
|
d45a07b5e5 | ||
|
|
0cdcfcee8d | ||
|
|
324546b4e7 | ||
|
|
c8ee67a636 | ||
|
|
b87c57c951 | ||
|
|
721f662bbe | ||
|
|
fccd48bfff | ||
|
|
5310d903ec | ||
|
|
8cbce555e4 | ||
|
|
f6112713e8 | ||
|
|
cc637f4dea | ||
|
|
7f76a14c54 | ||
|
|
58675f4d5a | ||
|
|
d50e6db312 | ||
|
|
de74284a8e | ||
|
|
4c9a295b28 | ||
|
|
0968f36d3e | ||
|
|
fd570b0377 | ||
|
|
68ea5ee570 | ||
|
|
f891140a74 | ||
|
|
5ed2d7ac2b | ||
|
|
b713527da0 | ||
|
|
224d2cedc8 | ||
|
|
55cfea776f | ||
|
|
d7a2078e0b | ||
|
|
a3e540eb32 | ||
|
|
e01c20be84 | ||
|
|
ce3ca418c2 | ||
|
|
15b9a5faf6 | ||
|
|
3afa30894f | ||
|
|
0ecfa827e6 | ||
|
|
e1b0db75eb | ||
|
|
b0c773189f | ||
|
|
3064326834 | ||
|
|
c67e50fe34 | ||
|
|
9d45e3eca1 | ||
|
|
43a24d15f6 | ||
|
|
cafbda1668 | ||
|
|
86c26fd64c | ||
|
|
0c20668008 | ||
|
|
92df8dc43c | ||
|
|
9d5f5844b8 | ||
|
|
2cf31884d0 | ||
|
|
19354c6f2d | ||
|
|
0b2079ad41 | ||
|
|
5f18c3af70 | ||
|
|
0a40285d43 | ||
|
|
5b1c328541 | ||
|
|
37929533af | ||
|
|
3b92113680 | ||
|
|
46b52cb9bb | ||
|
|
f0bcc9d9ba | ||
|
|
1cac028bfe | ||
|
|
4956886819 | ||
|
|
c720cfc7c7 | ||
|
|
8fcef5628f | ||
|
|
c4a72802f0 | ||
|
|
917394803c | ||
|
|
01040ddcdd | ||
|
|
7947497f7e | ||
|
|
539ca5856f | ||
|
|
89c801f82c | ||
|
|
3de4f22d34 | ||
|
|
0e4d2be98c | ||
|
|
d8ce108ccd | ||
|
|
d123cd4b2b | ||
|
|
4d34aa7cd6 | ||
|
|
b860e94582 | ||
|
|
9d653e3788 | ||
|
|
9e518cf2ba | ||
|
|
2856372ad6 | ||
|
|
efbf574613 | ||
|
|
c018eb2f0e | ||
|
|
d7bfe54b7c | ||
|
|
137282b7a9 | ||
|
|
769f8c8f34 | ||
|
|
8b8a37ae7c | ||
|
|
56e2b006f5 | ||
|
|
79cca05e43 | ||
|
|
166c8e8e82 | ||
|
|
9b64d2c325 | ||
|
|
03e3e9fae9 | ||
|
|
65234ae41a | ||
|
|
3828df8cf9 | ||
|
|
9cbe85bf99 | ||
|
|
7bf805b829 | ||
|
|
990ee436e1 | ||
|
|
1cd42066a6 | ||
|
|
ba43558049 | ||
|
|
951c8d34da | ||
|
|
ac61139243 | ||
|
|
5b8f1fe3e3 | ||
|
|
0aa197e4a4 | ||
|
|
f04e058c96 | ||
|
|
6ef2ae12b7 | ||
|
|
fe6bbdaefe | ||
|
|
cc66fddca9 | ||
|
|
04b70ddf13 | ||
|
|
bb3bb8d9c6 | ||
|
|
f80f62c7d1 | ||
|
|
2007ae4317 | ||
|
|
a1e5a1eff4 | ||
|
|
691999b402 | ||
|
|
33f3a4cea1 | ||
|
|
ab1d2dbe6a | ||
|
|
f622b281d0 | ||
|
|
fb12bf9b4c | ||
|
|
27af50087e | ||
|
|
03502bed52 | ||
|
|
27c7e2d150 | ||
|
|
e81d387971 | ||
|
|
ef1ade3a71 | ||
|
|
4f032f5b96 | ||
|
|
72cb967780 | ||
|
|
357934a644 | ||
|
|
327973657f | ||
|
|
d2730e6741 | ||
|
|
eb5ecab104 | ||
|
|
202055a9b8 | ||
|
|
7034a9e3fd | ||
|
|
8f7ed12262 | ||
|
|
96b5320ef9 | ||
|
|
d5cd742237 | ||
|
|
1f1da8942d | ||
|
|
7953e1e9d9 | ||
|
|
d6f7ecc0a3 | ||
|
|
3eed316049 | ||
|
|
851cf079c3 | ||
|
|
dfb0da32a9 | ||
|
|
f450da57e5 | ||
|
|
2ec6b6c995 | ||
|
|
53b769a8ec | ||
|
|
4f9adc173a | ||
|
|
dc4a58877e | ||
|
|
a6243a6fe7 | ||
|
|
cf5f1b541a | ||
|
|
70e6c48233 | ||
|
|
51f7d14d0a | ||
|
|
4853d5d1fc | ||
|
|
076a8938f0 | ||
|
|
5a3457ba33 | ||
|
|
2fc224384d | ||
|
|
a4e6ea5a3f | ||
|
|
d3c211f293 | ||
|
|
20047c369e | ||
|
|
dd1ff237a8 | ||
|
|
39d80d0b0e | ||
|
|
7a48316534 | ||
|
|
031a93ac46 | ||
|
|
ea6cc1aa95 | ||
|
|
365260ec44 | ||
|
|
2eb244c80a | ||
|
|
aee3011d61 | ||
|
|
40496e7b0f | ||
|
|
6b24f89fa7 | ||
|
|
2097800042 | ||
|
|
6739318e68 | ||
|
|
d0bd563d42 | ||
|
|
74280829fc | ||
|
|
3fde8880f2 | ||
|
|
98d39e0d38 | ||
|
|
c9cebb5ffe | ||
|
|
f52ac6e99c | ||
|
|
787a6b1c6a | ||
|
|
d00a91074e | ||
|
|
4e11497a38 | ||
|
|
0443d5202a | ||
|
|
633c25cb13 | ||
|
|
d07f45132f | ||
|
|
a51280afa6 | ||
|
|
be14eb2460 | ||
|
|
e26dbffcbe | ||
|
|
59992fd24a | ||
|
|
455362ccaf | ||
|
|
16c0e2460b | ||
|
|
92246f7125 | ||
|
|
7737335ec9 | ||
|
|
5cc9b7e0d1 | ||
|
|
8c6a441064 | ||
|
|
fddc058ce2 | ||
|
|
89750086c5 | ||
|
|
e69406c7e2 | ||
|
|
878ae42d84 | ||
|
|
fae2d272d5 | ||
|
|
03a067d3e6 | ||
|
|
c94c51d44f | ||
|
|
3da711ba8b |
6
.github/workflows/format.yaml
vendored
6
.github/workflows/format.yaml
vendored
@@ -17,7 +17,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
ruff-format:
|
||||
name: "Formatting checker"
|
||||
name: "Code quality checks"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
@@ -39,8 +39,8 @@ jobs:
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
ruff format --diff
|
||||
- name: Ruff import linter
|
||||
- name: Ruff linter (all rules)
|
||||
id: ruff-check
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
ruff check --select I
|
||||
ruff check
|
||||
|
||||
2
.github/workflows/publish.yaml
vendored
2
.github/workflows/publish.yaml
vendored
@@ -5,7 +5,7 @@ on:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: "what git ref to build"
|
||||
description: "what git tag to build (e.g. v0.0.74)"
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
|
||||
204
CHANGELOG.md
204
CHANGELOG.md
@@ -9,8 +9,156 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added call hang-up error handling in `TwilioFrameSerializer`, which handles
|
||||
the case where the user has hung up before the `TwilioFrameSerializer` hangs
|
||||
up the call.
|
||||
|
||||
### Changed
|
||||
|
||||
- The `UserIdleProcessor` now handles the scenario where function calls take
|
||||
longer than the idle timeout duration. This allows you to use the
|
||||
`UserIdleProcessor` in conjunction with function calls that take a while to
|
||||
return a result.
|
||||
|
||||
### Performance
|
||||
|
||||
- Remove unncessary push task in each `FrameProcessor`.
|
||||
|
||||
## [0.0.74] - 2025-07-03
|
||||
|
||||
### Added
|
||||
|
||||
- Added a new STT service, `SpeechmaticsSTTService`. This service provides
|
||||
real-time speech-to-text transcription using the Speechmatics API. It supports
|
||||
partial and final transcriptions, multiple languages, various audio formats,
|
||||
and speaker diarization.
|
||||
|
||||
- Added `normalize` and `model_id` to `FishAudioTTSService`.
|
||||
|
||||
- Added `http_options` argument to `GoogleLLMService`.
|
||||
|
||||
- Added `run_llm` field to `LLMMessagesAppendFrame` and `LLMMessagesUpdateFrame`
|
||||
frames. If true, a context frame will be pushed triggering the LLM to respond.
|
||||
|
||||
- Added a new `SOXRStreamAudioResampler` for processing audio in chunks or
|
||||
streams. If you write your own processor and need to use an audio resampler,
|
||||
use the new `create_stream_resampler()`.
|
||||
|
||||
- Added new `DailyParams.audio_in_user_tracks` to allow receiving one track per
|
||||
user (default) or a single track from the room (all participants mixed).
|
||||
|
||||
- Added support for providing "direct" functions, which don't need an
|
||||
accompanying `FunctionSchema` or function definition dict. Instead, metadata
|
||||
(i.e. `name`, `description`, `properties`, and `required`) are automatically
|
||||
extracted from a combination of the function signature and docstring.
|
||||
|
||||
Usage:
|
||||
|
||||
```python
|
||||
# "Direct" function
|
||||
# `params` must be the first parameter
|
||||
async def do_something(params: FunctionCallParams, foo: int, bar: str = ""):
|
||||
"""
|
||||
Do something interesting.
|
||||
|
||||
Args:
|
||||
foo (int): The foo to do something interesting with.
|
||||
bar (string): The bar to do something interesting with.
|
||||
"""
|
||||
|
||||
result = await process(foo, bar)
|
||||
await params.result_callback({"result": result})
|
||||
|
||||
# ...
|
||||
|
||||
llm.register_direct_function(do_something)
|
||||
|
||||
# ...
|
||||
|
||||
tools = ToolsSchema(standard_tools=[do_something])
|
||||
```
|
||||
|
||||
- `user_id` is now populated in the `TranscriptionFrame` and
|
||||
`InterimTranscriptionFrame` when using a transport that provides a `user_id`,
|
||||
like `DailyTransport` or `LiveKitTransport`.
|
||||
|
||||
- Added `watchdog_coroutine()`. This is a watchdog helper for couroutines. So,
|
||||
if you have a coroutine that is waiting for a result and that takes a long
|
||||
time, you will need to wrap it with `watchdog_coroutine()` so the watchdog
|
||||
timers are reset regularly.
|
||||
|
||||
- Added `session_token` parameter to `AWSNovaSonicLLMService`.
|
||||
|
||||
- Added Gemini Multimodal Live File API for uploading, fetching, listing, and
|
||||
deleting files. See `26f-gemini-multimodal-live-files-api.py` for example usage.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated all the services to use the new `SOXRStreamAudioResampler`, ensuring smooth
|
||||
transitions and eliminating clicks.
|
||||
|
||||
- Upgraded `daily-python` to 0.19.4.
|
||||
|
||||
- Updated `google` optional dependency to use `google-genai` version `1.24.0`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where audio would get stuck in the queue when an interrupt occurs
|
||||
during Azure TTS synthesis.
|
||||
|
||||
- Fixed a race condition that occurs in Python 3.10+ where the task could miss
|
||||
the `CancelledError` and continue running indefinitely, freezing the pipeline.
|
||||
|
||||
- Fixed a `AWSNovaSonicLLMService` issue introduced in 0.0.72.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- In `FishAudioTTSService`, deprecated `model` and replaced with
|
||||
`reference_id`. This change is to better align with Fish Audio's variable
|
||||
naming and to reduce confusion about what functionality the variable
|
||||
controls.
|
||||
|
||||
## [0.0.73] - 2025-06-26
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue introduced in 0.0.72 that would cause `ElevenLabsTTSService`,
|
||||
`GladiaSTTService`, `NeuphonicTTSService` and `OpenAIRealtimeBetaLLMService`
|
||||
to throw an error.
|
||||
|
||||
## [0.0.72] - 2025-06-26
|
||||
|
||||
### Added
|
||||
|
||||
- Added logging and improved error handling to help diagnose and prevent potential
|
||||
Pipeline freezes.
|
||||
|
||||
- Added `WatchdogQueue`, `WatchdogPriorityQueue`, `WatchdogEvent` and
|
||||
`WatchdogAsyncIterator`. These helper utilities reset watchdog timers
|
||||
appropriately before they expire. When watchdog timers are disabled, the
|
||||
utilities behave as standard counterparts without side effects.
|
||||
|
||||
- Introduce task watchdog timers. Watchdog timers are used to detect if a
|
||||
Pipecat task is taking longer than expected (by default 5 seconds). Watchdog
|
||||
timers are disabled by default and can be enabled globally by passing
|
||||
`enable_watchdog_timers` argument to `PipelineTask` constructor. It is
|
||||
possible to change the default watchdog timer timeout by using the
|
||||
`watchdog_timeout` argument. You can also log how long it takes to reset the
|
||||
watchdog timers which is done with the `enable_watchdog_logging`. You can
|
||||
control all these settings per each frame processor or even per task. That is,
|
||||
you can set `enable_watchdog_timers`, `enable_watchdog_logging` and
|
||||
`watchdog_timeout` when creating any frame processor through their constructor
|
||||
arguments or when you create a task with `FrameProcessor.create_task()`. Note
|
||||
that watchdog timers only work with Pipecat tasks and will not work if you use
|
||||
`asycio.create_task()` or similar.
|
||||
|
||||
- Added `lexicon_names` parameter to `AWSPollyTTSService.InputParams`.
|
||||
|
||||
- Added reconnection logic and audio buffer management to `GladiaSTTService`.
|
||||
|
||||
- The `TurnTrackingObserver` now ends a turn upon observing an `EndFrame` or
|
||||
`CancelFrame`.
|
||||
|
||||
- Added Polish support to `AWSTranscribeSTTService`.
|
||||
|
||||
- Added new frames `FrameProcessorPauseFrame` and `FrameProcessorResumeFrame`
|
||||
@@ -27,8 +175,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
`LLMAssistantContextAggregator` that exposes whether a function call is in
|
||||
progress.
|
||||
|
||||
- Added `SambaNovaLLMService` which provides llm api integration with an
|
||||
OpenAI-compatible interface.
|
||||
|
||||
- Added `SambaNovaTTSService` which provides speech-to-text functionality using
|
||||
SambaNovas's (whisper) API.
|
||||
|
||||
- Add fundational examples for function calling and transcription
|
||||
`14s-function-calling-sambanova.py`, `13g-sambanova-transcription.py`
|
||||
|
||||
### Changed
|
||||
|
||||
- `HeartbeatFrame`s are now control frames. This will make it easier to detect
|
||||
pipeline freezes. Previously, heartbeat frames were system frames which meant
|
||||
they were not get queued with other frames, making it difficult to detect
|
||||
pipeline stalls.
|
||||
|
||||
- Updated `OpenAIRealtimeBetaLLMService` to accept `language` in the
|
||||
`InputAudioTranscription` class for all models.
|
||||
|
||||
- Updated the default model for `OpenAIRealtimeBetaLLMService` to
|
||||
`gpt-4o-realtime-preview-2025-06-03`.
|
||||
|
||||
- The `PipelineParams` arg `allow_interruptions` now defaults to `True`.
|
||||
|
||||
- `TavusTransport` and `TavusVideoService` now send audio to Tavus using WebRTC
|
||||
@@ -37,21 +205,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
- Upgraded `daily-python` to 0.19.3.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `AudioBufferProcessor` parameter `user_continuos_stream` is deprecated.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an `AudioBufferProcessor` issue that was causing crackling on the audio
|
||||
stream with lower sample rate (due to upsampling the other stream). We now
|
||||
record with the lowest sample rate to avoid upsampling.
|
||||
- Fixed an issue that would cause heartbeat frames to be sent before processors
|
||||
were started.
|
||||
|
||||
- Fixed an event loop blocking issue when using `SentryMetrics`.
|
||||
|
||||
- Fixed an issue in `FastAPIWebsocketClient` to ensure proper disconnection
|
||||
when the websocket is already closed.
|
||||
|
||||
- Fixed an issue where the `UserStoppedSpeakingFrame` was not received if the
|
||||
transport was not receiving new audio frames.
|
||||
|
||||
- Fixed an edge case where if the user interrupted the bot but no new aggregation
|
||||
was received, the bot would not resume speaking.
|
||||
|
||||
- Fixed an issue with `TelnyxFrameSerializer` where it would throw an exception
|
||||
when the user hung up the call.
|
||||
|
||||
- Fixed an issue with `ElevenLabsTTSService` where the context was not being
|
||||
closed.
|
||||
|
||||
- Fixed function calling in `AWSNovaSonicLLMService`.
|
||||
|
||||
- Fixed an issue that would cause multiple `PipelineTask.on_idle_timeout`
|
||||
events to be triggered repeatedly.
|
||||
|
||||
- Fixed an `AudioBufferProcessor` issue that was causing user and bot speech to
|
||||
not be synchronized during recordings.
|
||||
- Fixed an issue that was causing user and bot speech to not be synchronized
|
||||
during recordings.
|
||||
|
||||
- Fixed an issue where voice settings weren't applied to ElevenLabsTTSService.
|
||||
|
||||
@@ -63,6 +245,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
- Fixed an issue where `GoogleLLMService`'s TTFB value was incorrect.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `AudioBufferProcessor` parameter `user_continuos_stream` is deprecated.
|
||||
|
||||
### Other
|
||||
|
||||
- Rename `14e-function-calling-gemini.py` to `14e-function-calling-google.py`.
|
||||
|
||||
150
CONTRIBUTING.md
150
CONTRIBUTING.md
@@ -41,36 +41,150 @@ We use Ruff for code linting and formatting. Please ensure your code passes all
|
||||
|
||||
We follow Google-style docstrings with these specific conventions:
|
||||
|
||||
- Class docstrings should fully document all parameters used in `__init__`
|
||||
- We don't require separate docstrings for `__init__` methods when parameters are documented in the class docstring
|
||||
- Property methods should have docstrings explaining their purpose and return value
|
||||
**Regular Classes:**
|
||||
|
||||
Example of correctly documented class:
|
||||
- Class docstring describes the class purpose and key functionality
|
||||
- `__init__` method has its own docstring with complete `Args:` section documenting all parameters
|
||||
- All public methods must have docstrings with `Args:` and `Returns:` sections as appropriate
|
||||
|
||||
**Dataclasses:**
|
||||
|
||||
- Class docstring describes the purpose and documents all fields in a `Parameters:` section
|
||||
- No `__init__` docstring (auto-generated)
|
||||
|
||||
**Properties:**
|
||||
|
||||
- Must have docstrings with `Returns:` section
|
||||
|
||||
**Abstract Methods:**
|
||||
|
||||
- Must have docstrings explaining what subclasses should implement
|
||||
|
||||
**`__init__.py` Files:**
|
||||
|
||||
- **Skip docstrings** for pure import/re-export modules
|
||||
- **Add brief docstrings** for top-level packages or those with initialization logic
|
||||
|
||||
**Enums:**
|
||||
|
||||
- Class docstring describes the enumeration purpose
|
||||
- Use `Parameters:` section to document each enum value and its meaning
|
||||
- No `__init__` docstring (Enums don't have custom constructors)
|
||||
|
||||
**Code Examples in Docstrings:**
|
||||
|
||||
- Use `Examples:` as a section header for multiple examples
|
||||
- Use descriptive text followed by double colons (`::`) for each example
|
||||
- **Always include a blank line after the `::"`**
|
||||
- Indent all code consistently within each block
|
||||
- Separate multiple examples with blank lines for readability
|
||||
|
||||
**Lists and Bullets in Docstrings:**
|
||||
|
||||
- Use dashes (`-`) for bullet points, not asterisks (`*`)
|
||||
- **Add a blank line before bullet lists** when they follow a colon
|
||||
- Use section headers like "Supported features:" or "Behavior:" before lists
|
||||
- For complex nested information, consider using paragraph format instead
|
||||
|
||||
**Deprecations:**
|
||||
|
||||
- Use `warnings.warn()` in code for runtime deprecation warnings
|
||||
- Add `.. deprecated::` directive in docstrings for documentation visibility
|
||||
- Include version information and describe current status
|
||||
- Describe parameters in present tense, use directive to indicate deprecation status
|
||||
|
||||
#### Examples:
|
||||
|
||||
```python
|
||||
class MyClass:
|
||||
"""Class description.
|
||||
# Regular class
|
||||
class MyService(BaseService):
|
||||
"""Description of what the service does.
|
||||
|
||||
Additional details about the class.
|
||||
Provides detailed explanation of the service's functionality,
|
||||
key features, and usage patterns.
|
||||
|
||||
Args:
|
||||
param1: Description of first parameter.
|
||||
param2: Description of second parameter.
|
||||
Supported features:
|
||||
|
||||
- Feature one with detailed explanation
|
||||
- Feature two with additional context
|
||||
- Feature three for advanced use cases
|
||||
"""
|
||||
|
||||
def __init__(self, param1, param2):
|
||||
# No docstring required here as parameters are documented above
|
||||
self.param1 = param1
|
||||
self.param2 = param2
|
||||
def __init__(self, param1: str, old_param: str = None, **kwargs):
|
||||
"""Initialize the service.
|
||||
|
||||
Args:
|
||||
param1: Description of param1.
|
||||
old_param: Controls legacy behavior.
|
||||
|
||||
.. deprecated:: 1.2.0
|
||||
This parameter no longer has any effect and will be removed in version 2.0.
|
||||
|
||||
**kwargs: Additional arguments passed to parent.
|
||||
"""
|
||||
if old_param is not None:
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"Parameter 'old_param' is deprecated and will be removed in version 2.0.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
def some_property(self) -> str:
|
||||
"""Get the formatted property value.
|
||||
def sample_rate(self) -> int:
|
||||
"""Get the current sample rate.
|
||||
|
||||
Returns:
|
||||
A string representation of the property.
|
||||
The sample rate in Hz.
|
||||
"""
|
||||
return f"Property: {self.param1}"
|
||||
return self._sample_rate
|
||||
|
||||
async def process_data(self, data: str) -> bool:
|
||||
"""Process the provided data.
|
||||
|
||||
Args:
|
||||
data: The data to process.
|
||||
|
||||
Returns:
|
||||
True if processing succeeded.
|
||||
"""
|
||||
pass
|
||||
|
||||
# Dataclass with code examples
|
||||
@dataclass
|
||||
class MessageFrame:
|
||||
"""Frame containing messages in OpenAI format.
|
||||
|
||||
Supports both simple and content list message formats.
|
||||
|
||||
Example::
|
||||
|
||||
[
|
||||
{"role": "user", "content": "Hello"},
|
||||
{"role": "assistant", "content": "Hi there!"}
|
||||
]
|
||||
|
||||
Parameters:
|
||||
messages: List of messages in OpenAI format.
|
||||
"""
|
||||
|
||||
messages: List[dict]
|
||||
|
||||
# Enum class
|
||||
class Status(Enum):
|
||||
"""Status codes for processing operations.
|
||||
|
||||
Parameters:
|
||||
PENDING: Operation is queued but not started.
|
||||
RUNNING: Operation is currently in progress.
|
||||
COMPLETED: Operation finished successfully.
|
||||
FAILED: Operation encountered an error.
|
||||
"""
|
||||
|
||||
PENDING = "pending"
|
||||
RUNNING = "running"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
```
|
||||
|
||||
# Contributor Covenant Code of Conduct
|
||||
|
||||
26
README.md
26
README.md
@@ -51,19 +51,19 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
build~=1.2.2
|
||||
coverage~=7.6.12
|
||||
coverage~=7.9.1
|
||||
grpcio-tools~=1.67.1
|
||||
pip-tools~=7.4.1
|
||||
pre-commit~=4.0.1
|
||||
pyright~=1.1.400
|
||||
pytest~=8.3.4
|
||||
pytest-asyncio~=0.25.3
|
||||
pre-commit~=4.2.0
|
||||
pyright~=1.1.402
|
||||
pytest~=8.4.1
|
||||
pytest-asyncio~=1.0.0
|
||||
pytest-aiohttp==1.1.0
|
||||
ruff~=0.11.13
|
||||
setuptools~=70.0.0
|
||||
setuptools_scm~=8.1.0
|
||||
python-dotenv~=1.0.1
|
||||
ruff~=0.12.1
|
||||
setuptools~=78.1.1
|
||||
setuptools_scm~=8.3.1
|
||||
python-dotenv~=1.1.1
|
||||
|
||||
190
docs/api/conf.py
190
docs/api/conf.py
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Configure logging
|
||||
@@ -13,7 +14,8 @@ sys.path.insert(0, str(project_root / "src"))
|
||||
|
||||
# Project information
|
||||
project = "pipecat-ai"
|
||||
copyright = "2024, Daily"
|
||||
current_year = datetime.now().year
|
||||
copyright = f"2024-{current_year}, Daily" if current_year > 2024 else "2024, Daily"
|
||||
author = "Daily"
|
||||
|
||||
# General configuration
|
||||
@@ -24,19 +26,20 @@ extensions = [
|
||||
"sphinx.ext.intersphinx",
|
||||
]
|
||||
|
||||
suppress_warnings = [
|
||||
"autodoc.mocked_object",
|
||||
]
|
||||
|
||||
# Napoleon settings
|
||||
napoleon_google_docstring = True
|
||||
napoleon_numpy_docstring = False
|
||||
napoleon_include_init_with_doc = True
|
||||
|
||||
# AutoDoc settings
|
||||
autodoc_default_options = {
|
||||
"members": True,
|
||||
"member-order": "bysource",
|
||||
"special-members": "__init__",
|
||||
"undoc-members": True,
|
||||
"exclude-members": "__weakref__",
|
||||
"no-index": True,
|
||||
"undoc-members": False,
|
||||
"exclude-members": "__weakref__,model_config",
|
||||
"show-inheritance": True,
|
||||
}
|
||||
|
||||
@@ -71,7 +74,6 @@ autodoc_mock_imports = [
|
||||
"langchain",
|
||||
"lmnt",
|
||||
"noisereduce",
|
||||
"openai",
|
||||
"openpipe",
|
||||
"simli",
|
||||
"soundfile",
|
||||
@@ -81,10 +83,6 @@ autodoc_mock_imports = [
|
||||
"tkinter",
|
||||
"daily",
|
||||
"daily_python",
|
||||
"pydantic.BaseModel",
|
||||
"pydantic.Field",
|
||||
"pydantic._internal._model_construction",
|
||||
"pydantic._internal._fields",
|
||||
# Moondream dependencies
|
||||
"torch",
|
||||
"transformers",
|
||||
@@ -145,85 +143,76 @@ autodoc_mock_imports = [
|
||||
"transformers.AutoFeatureExtractor",
|
||||
# Also add specific classes that are imported
|
||||
"AutoFeatureExtractor",
|
||||
# Sentry dependencies
|
||||
"sentry_sdk",
|
||||
# AWS Nova Sonic dependencies
|
||||
"aws_sdk_bedrock_runtime",
|
||||
"aws_sdk_bedrock_runtime.client",
|
||||
"aws_sdk_bedrock_runtime.config",
|
||||
"aws_sdk_bedrock_runtime.models",
|
||||
"smithy_aws_core",
|
||||
"smithy_aws_core.credentials_resolvers",
|
||||
"smithy_aws_core.credentials_resolvers.static",
|
||||
"smithy_aws_core.identity",
|
||||
"smithy_core",
|
||||
"smithy_core.aio",
|
||||
"smithy_core.aio.eventstream",
|
||||
# MCP dependencies (you may already have these)
|
||||
"mcp",
|
||||
"mcp.client",
|
||||
"mcp.client.session_group",
|
||||
"mcp.client.sse",
|
||||
"mcp.client.stdio",
|
||||
"mcp.ClientSession",
|
||||
"mcp.StdioServerParameters",
|
||||
# gstreamer
|
||||
"gi",
|
||||
"gi.require_version",
|
||||
"gi.repository",
|
||||
# Protobuf mocks
|
||||
"pipecat.frames.protobufs.frames_pb2",
|
||||
"pipecat.serializers.protobuf",
|
||||
"google.protobuf",
|
||||
"google.protobuf.descriptor",
|
||||
"google.protobuf.descriptor_pool",
|
||||
"google.protobuf.runtime_version",
|
||||
"google.protobuf.symbol_database",
|
||||
"google.protobuf.internal.builder",
|
||||
]
|
||||
|
||||
# HTML output settings
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
html_static_path = ["_static"]
|
||||
autodoc_typehints = "description"
|
||||
autodoc_typehints = "signature" # Show type hints in the signature only, not in the docstring
|
||||
html_show_sphinx = False
|
||||
|
||||
|
||||
def verify_modules():
|
||||
"""Verify that required modules are available."""
|
||||
required_modules = {
|
||||
"services": [
|
||||
"assemblyai",
|
||||
"aws",
|
||||
"cartesia",
|
||||
"deepgram",
|
||||
"google",
|
||||
"lmnt",
|
||||
"riva",
|
||||
"simli",
|
||||
],
|
||||
"serializers": ["livekit"],
|
||||
"vad": ["silero", "vad_analyzer"],
|
||||
"transports": {
|
||||
"services": ["daily", "livekit"],
|
||||
"local": ["audio", "tk"],
|
||||
"network": ["fastapi_websocket", "websocket_server"],
|
||||
},
|
||||
}
|
||||
def import_core_modules():
|
||||
"""Import core pipecat modules for autodoc to discover."""
|
||||
core_modules = [
|
||||
"pipecat",
|
||||
"pipecat.frames",
|
||||
"pipecat.pipeline",
|
||||
"pipecat.processors",
|
||||
"pipecat.services",
|
||||
"pipecat.transports",
|
||||
"pipecat.audio",
|
||||
"pipecat.adapters",
|
||||
"pipecat.clocks",
|
||||
"pipecat.metrics",
|
||||
"pipecat.observers",
|
||||
"pipecat.serializers",
|
||||
"pipecat.sync",
|
||||
"pipecat.transcriptions",
|
||||
"pipecat.utils",
|
||||
]
|
||||
|
||||
# Skip importing modules that are in autodoc_mock_imports
|
||||
skipped_modules = set(autodoc_mock_imports)
|
||||
|
||||
missing = []
|
||||
for category, modules in required_modules.items():
|
||||
if isinstance(modules, dict):
|
||||
# Handle nested structure
|
||||
for subcategory, submodules in modules.items():
|
||||
for module in submodules:
|
||||
# Check if module is in autodoc_mock_imports
|
||||
if (
|
||||
f"pipecat.{category}.{subcategory}.{module}" in skipped_modules
|
||||
or module in skipped_modules
|
||||
):
|
||||
logger.info(
|
||||
f"Skipping import of mocked module: pipecat.{category}.{subcategory}.{module}"
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
__import__(f"pipecat.{category}.{subcategory}.{module}")
|
||||
logger.info(
|
||||
f"Successfully imported pipecat.{category}.{subcategory}.{module}"
|
||||
)
|
||||
except (ImportError, TypeError, NameError) as e:
|
||||
missing.append(f"pipecat.{category}.{subcategory}.{module}")
|
||||
logger.warning(
|
||||
f"Optional module not available: pipecat.{category}.{subcategory}.{module} - {str(e)}"
|
||||
)
|
||||
else:
|
||||
# Handle flat structure
|
||||
for module in modules:
|
||||
# Check if module is in autodoc_mock_imports
|
||||
if f"pipecat.{category}.{module}" in skipped_modules or module in skipped_modules:
|
||||
logger.info(f"Skipping import of mocked module: pipecat.{category}.{module}")
|
||||
continue
|
||||
|
||||
try:
|
||||
__import__(f"pipecat.{category}.{module}")
|
||||
logger.info(f"Successfully imported pipecat.{category}.{module}")
|
||||
except (ImportError, TypeError, NameError) as e:
|
||||
missing.append(f"pipecat.{category}.{module}")
|
||||
logger.warning(
|
||||
f"Optional module not available: pipecat.{category}.{module} - {str(e)}"
|
||||
)
|
||||
|
||||
if missing:
|
||||
logger.warning(f"Some optional modules are not available: {missing}")
|
||||
for module_name in core_modules:
|
||||
try:
|
||||
__import__(module_name)
|
||||
logger.info(f"Successfully imported {module_name}")
|
||||
except ImportError as e:
|
||||
logger.warning(f"Failed to import {module_name}: {e}")
|
||||
|
||||
|
||||
def clean_title(title: str) -> str:
|
||||
@@ -235,36 +224,7 @@ def clean_title(title: str) -> str:
|
||||
parts = title.split(".")
|
||||
title = parts[-1]
|
||||
|
||||
# Special cases for service names and common acronyms
|
||||
special_cases = {
|
||||
"ai": "AI",
|
||||
"aws": "AWS",
|
||||
"api": "API",
|
||||
"vad": "VAD",
|
||||
"assemblyai": "AssemblyAI",
|
||||
"deepgram": "Deepgram",
|
||||
"elevenlabs": "ElevenLabs",
|
||||
"openai": "OpenAI",
|
||||
"openpipe": "OpenPipe",
|
||||
"playht": "PlayHT",
|
||||
"xtts": "XTTS",
|
||||
"lmnt": "LMNT",
|
||||
}
|
||||
|
||||
# Check if the entire title is a special case
|
||||
if title.lower() in special_cases:
|
||||
return special_cases[title.lower()]
|
||||
|
||||
# Otherwise, capitalize each word
|
||||
words = title.split("_")
|
||||
cleaned_words = []
|
||||
for word in words:
|
||||
if word.lower() in special_cases:
|
||||
cleaned_words.append(special_cases[word.lower()])
|
||||
else:
|
||||
cleaned_words.append(word.capitalize())
|
||||
|
||||
return " ".join(cleaned_words)
|
||||
return title
|
||||
|
||||
|
||||
def setup(app):
|
||||
@@ -289,9 +249,8 @@ def setup(app):
|
||||
|
||||
excludes = [
|
||||
str(project_root / "src/pipecat/pipeline/to_be_updated"),
|
||||
str(project_root / "src/pipecat/processors/gstreamer"),
|
||||
str(project_root / "src/pipecat/services/to_be_updated"),
|
||||
str(project_root / "src/pipecat/vad"), # deprecated
|
||||
str(project_root / "src/pipecat/examples"),
|
||||
str(project_root / "src/pipecat/tests"),
|
||||
"**/test_*.py",
|
||||
"**/tests/*.py",
|
||||
]
|
||||
@@ -332,5 +291,4 @@ def setup(app):
|
||||
logger.error(f"Error generating API documentation: {e}", exc_info=True)
|
||||
|
||||
|
||||
# Run module verification
|
||||
verify_modules()
|
||||
import_core_modules()
|
||||
|
||||
@@ -1,57 +1,17 @@
|
||||
Pipecat API Reference Docs
|
||||
==========================
|
||||
Pipecat API Reference
|
||||
=====================
|
||||
|
||||
Welcome to Pipecat's API reference documentation!
|
||||
Welcome to the Pipecat API reference.
|
||||
|
||||
Pipecat is an open source framework for building voice and multimodal assistants.
|
||||
It provides a flexible pipeline architecture for connecting various AI services,
|
||||
audio processing, and transport layers.
|
||||
Use the navigation on the left to browse modules, or search using the search box.
|
||||
|
||||
**New to Pipecat?** Check out the `main documentation <https://docs.pipecat.ai>`_ for tutorials, guides, and client SDK information.
|
||||
|
||||
Quick Links
|
||||
-----------
|
||||
|
||||
* `GitHub Repository <https://github.com/pipecat-ai/pipecat>`_
|
||||
* `Website <https://pipecat.ai>`_
|
||||
|
||||
API Reference
|
||||
-------------
|
||||
|
||||
Core Components
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
* :mod:`Frames <pipecat.frames>`
|
||||
* :mod:`Processors <pipecat.processors>`
|
||||
* :mod:`Pipeline <pipecat.pipeline>`
|
||||
|
||||
Audio Processing
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
* :mod:`Audio <pipecat.audio>`
|
||||
|
||||
Services
|
||||
~~~~~~~~
|
||||
|
||||
* :mod:`Services <pipecat.services>`
|
||||
|
||||
Transport & Serialization
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* :mod:`Transports <pipecat.transports>`
|
||||
* :mod:`Local <pipecat.transports.local>`
|
||||
* :mod:`Network <pipecat.transports.network>`
|
||||
* :mod:`Services <pipecat.transports.services>`
|
||||
* :mod:`Serializers <pipecat.serializers>`
|
||||
|
||||
Utilities
|
||||
~~~~~~~~~
|
||||
|
||||
* :mod:`Adapters <pipecat.adapters>`
|
||||
* :mod:`Clocks <pipecat.clocks>`
|
||||
* :mod:`Metrics <pipecat.metrics>`
|
||||
* :mod:`Observers <pipecat.observers>`
|
||||
* :mod:`Sync <pipecat.sync>`
|
||||
* :mod:`Transcriptions <pipecat.transcriptions>`
|
||||
* :mod:`Utils <pipecat.utils>`
|
||||
* `Join our Community <https://discord.gg/pipecat>`_
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
@@ -71,11 +31,4 @@ Utilities
|
||||
Sync <api/pipecat.sync>
|
||||
Transcriptions <api/pipecat.transcriptions>
|
||||
Transports <api/pipecat.transports>
|
||||
Utils <api/pipecat.utils>
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
Utils <api/pipecat.utils>
|
||||
@@ -42,9 +42,11 @@ pipecat-ai[openai]
|
||||
pipecat-ai[qwen]
|
||||
pipecat-ai[remote-smart-turn]
|
||||
# pipecat-ai[riva] # Mocked
|
||||
pipecat-ai[sambanova]
|
||||
pipecat-ai[silero]
|
||||
pipecat-ai[simli]
|
||||
pipecat-ai[soundfile]
|
||||
pipecat-ai[speechmatics]
|
||||
pipecat-ai[tavus]
|
||||
pipecat-ai[together]
|
||||
# pipecat-ai[ultravox] # Mocked
|
||||
|
||||
@@ -107,4 +107,14 @@ MINIMAX_API_KEY=...
|
||||
MINIMAX_GROUP_ID=...
|
||||
|
||||
# Sarvam AI
|
||||
SARVAM_API_KEY=...
|
||||
SARVAM_API_KEY=...
|
||||
|
||||
# Speechmatics
|
||||
SPEECHMATICS_API_KEY=...
|
||||
|
||||
|
||||
# SambaNova
|
||||
SAMBANOVA_API_KEY=...
|
||||
|
||||
# Sentry
|
||||
SENTRY_DSN=...
|
||||
|
||||
@@ -4364,9 +4364,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
||||
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
|
||||
"version": "1.1.12",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
|
||||
"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0",
|
||||
"concat-map": "0.0.1"
|
||||
@@ -6081,9 +6081,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/glob/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
|
||||
"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
}
|
||||
|
||||
@@ -2,4 +2,4 @@ aiofiles
|
||||
python-dotenv
|
||||
fastapi[all]
|
||||
uvicorn
|
||||
pipecat-ai[daily,deepgram,openai,silero,cartesia]
|
||||
pipecat-ai[daily,deepgram,openai,silero,cartesia,soundfile]
|
||||
|
||||
@@ -215,10 +215,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/env": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.26.tgz",
|
||||
"integrity": "sha512-vO//GJ/YBco+H7xdQhzJxF7ub3SUwft76jwaeOyVVQFHCi5DCnkP16WHB+JBylo4vOKPoZBlR94Z8xBxNBdNJA==",
|
||||
"license": "MIT"
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.30.tgz",
|
||||
"integrity": "sha512-KBiBKrDY6kxTQWGzKjQB7QirL3PiiOkV7KW98leHFjtVRKtft76Ra5qSA/SL75xT44dp6hOcqiiJ6iievLOYug=="
|
||||
},
|
||||
"node_modules/@next/eslint-plugin-next": {
|
||||
"version": "14.2.25",
|
||||
@@ -231,13 +230,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-arm64": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.26.tgz",
|
||||
"integrity": "sha512-zDJY8gsKEseGAxG+C2hTMT0w9Nk9N1Sk1qV7vXYz9MEiyRoF5ogQX2+vplyUMIfygnjn9/A04I6yrUTRTuRiyQ==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.30.tgz",
|
||||
"integrity": "sha512-EAqfOTb3bTGh9+ewpO/jC59uACadRHM6TSA9DdxJB/6gxOpyV+zrbqeXiFTDy9uV6bmipFDkfpAskeaDcO+7/g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
@@ -247,13 +245,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-x64": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.26.tgz",
|
||||
"integrity": "sha512-U0adH5ryLfmTDkahLwG9sUQG2L0a9rYux8crQeC92rPhi3jGQEY47nByQHrVrt3prZigadwj/2HZ1LUUimuSbg==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.30.tgz",
|
||||
"integrity": "sha512-TyO7Wz1IKE2kGv8dwQ0bmPL3s44EKVencOqwIY69myoS3rdpO1NPg5xPM5ymKu7nfX4oYJrpMxv8G9iqLsnL4A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
@@ -263,13 +260,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-gnu": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.26.tgz",
|
||||
"integrity": "sha512-SINMl1I7UhfHGM7SoRiw0AbwnLEMUnJ/3XXVmhyptzriHbWvPPbbm0OEVG24uUKhuS1t0nvN/DBvm5kz6ZIqpg==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.30.tgz",
|
||||
"integrity": "sha512-I5lg1fgPJ7I5dk6mr3qCH1hJYKJu1FsfKSiTKoYwcuUf53HWTrEkwmMI0t5ojFKeA6Vu+SfT2zVy5NS0QLXV4Q==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -279,13 +275,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-musl": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.26.tgz",
|
||||
"integrity": "sha512-s6JaezoyJK2DxrwHWxLWtJKlqKqTdi/zaYigDXUJ/gmx/72CrzdVZfMvUc6VqnZ7YEvRijvYo+0o4Z9DencduA==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.30.tgz",
|
||||
"integrity": "sha512-8GkNA+sLclQyxgzCDs2/2GSwBc92QLMrmYAmoP2xehe5MUKBLB2cgo34Yu242L1siSkwQkiV4YLdCnjwc/Micw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -295,13 +290,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-gnu": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.26.tgz",
|
||||
"integrity": "sha512-FEXeUQi8/pLr/XI0hKbe0tgbLmHFRhgXOUiPScz2hk0hSmbGiU8aUqVslj/6C6KA38RzXnWoJXo4FMo6aBxjzg==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.30.tgz",
|
||||
"integrity": "sha512-8Ly7okjssLuBoe8qaRCcjGtcMsv79hwzn/63wNeIkzJVFVX06h5S737XNr7DZwlsbTBDOyI6qbL2BJB5n6TV/w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -311,13 +305,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-musl": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.26.tgz",
|
||||
"integrity": "sha512-BUsomaO4d2DuXhXhgQCVt2jjX4B4/Thts8nDoIruEJkhE5ifeQFtvW5c9JkdOtYvE5p2G0hcwQ0UbRaQmQwaVg==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.30.tgz",
|
||||
"integrity": "sha512-dBmV1lLNeX4mR7uI7KNVHsGQU+OgTG5RGFPi3tBJpsKPvOPtg9poyav/BYWrB3GPQL4dW5YGGgalwZ79WukbKQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
@@ -327,13 +320,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-arm64-msvc": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.26.tgz",
|
||||
"integrity": "sha512-5auwsMVzT7wbB2CZXQxDctpWbdEnEW/e66DyXO1DcgHxIyhP06awu+rHKshZE+lPLIGiwtjo7bsyeuubewwxMw==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-6MMHi2Qc1Gkq+4YLXAgbYslE1f9zMGBikKMdmQRHXjkGPot1JY3n5/Qrbg40Uvbi8//wYnydPnyvNhI1DMUW1g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
@@ -343,13 +335,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-ia32-msvc": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.26.tgz",
|
||||
"integrity": "sha512-GQWg/Vbz9zUGi9X80lOeGsz1rMH/MtFO/XqigDznhhhTfDlDoynCM6982mPCbSlxJ/aveZcKtTlwfAjwhyxDpg==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-pVZMnFok5qEX4RT59mK2hEVtJX+XFfak+/rjHpyFh7juiT52r177bfFKhnlafm0UOSldhXjj32b+LZIOdswGTg==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
@@ -359,13 +350,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-x64-msvc": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.26.tgz",
|
||||
"integrity": "sha512-2rdB3T1/Gp7bv1eQTTm9d1Y1sv9UuJ2LAwOE0Pe2prHKe32UNscj7YS13fRB37d0GAiGNR+Y7ZcW8YjDI8Ns0w==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-4KCo8hMZXMjpTzs3HOqOGYYwAXymXIy7PEPAXNEcEOyKqkjiDlECumrWziy+JEF0Oi4ILHGxzgQ3YiMGG2t/Lg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
@@ -620,11 +610,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
|
||||
"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
}
|
||||
@@ -1224,11 +1213,10 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
||||
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
|
||||
"version": "1.1.12",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
|
||||
"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0",
|
||||
"concat-map": "0.0.1"
|
||||
@@ -2614,11 +2602,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/glob/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
|
||||
"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
}
|
||||
@@ -3613,12 +3600,11 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/next": {
|
||||
"version": "14.2.26",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-14.2.26.tgz",
|
||||
"integrity": "sha512-b81XSLihMwCfwiUVRRja3LphLo4uBBMZEzBBWMaISbKTwOmq3wPknIETy/8000tr7Gq4WmbuFYPS7jOYIf+ZJw==",
|
||||
"license": "MIT",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-14.2.30.tgz",
|
||||
"integrity": "sha512-+COdu6HQrHHFQ1S/8BBsCag61jZacmvbuL2avHvQFbWa2Ox7bE+d8FyNgxRLjXQ5wtPyQwEmk85js/AuaG2Sbg==",
|
||||
"dependencies": {
|
||||
"@next/env": "14.2.26",
|
||||
"@next/env": "14.2.30",
|
||||
"@swc/helpers": "0.5.5",
|
||||
"busboy": "1.6.0",
|
||||
"caniuse-lite": "^1.0.30001579",
|
||||
@@ -3633,15 +3619,15 @@
|
||||
"node": ">=18.17.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@next/swc-darwin-arm64": "14.2.26",
|
||||
"@next/swc-darwin-x64": "14.2.26",
|
||||
"@next/swc-linux-arm64-gnu": "14.2.26",
|
||||
"@next/swc-linux-arm64-musl": "14.2.26",
|
||||
"@next/swc-linux-x64-gnu": "14.2.26",
|
||||
"@next/swc-linux-x64-musl": "14.2.26",
|
||||
"@next/swc-win32-arm64-msvc": "14.2.26",
|
||||
"@next/swc-win32-ia32-msvc": "14.2.26",
|
||||
"@next/swc-win32-x64-msvc": "14.2.26"
|
||||
"@next/swc-darwin-arm64": "14.2.30",
|
||||
"@next/swc-darwin-x64": "14.2.30",
|
||||
"@next/swc-linux-arm64-gnu": "14.2.30",
|
||||
"@next/swc-linux-arm64-musl": "14.2.30",
|
||||
"@next/swc-linux-x64-gnu": "14.2.30",
|
||||
"@next/swc-linux-x64-musl": "14.2.30",
|
||||
"@next/swc-win32-arm64-msvc": "14.2.30",
|
||||
"@next/swc-win32-ia32-msvc": "14.2.30",
|
||||
"@next/swc-win32-x64-msvc": "14.2.30"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@opentelemetry/api": "^1.1.0",
|
||||
|
||||
153
examples/foundational/07a-interruptible-speechmatics.py
Normal file
153
examples/foundational/07a-interruptible-speechmatics.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
"""Run example using Speechmatics STT.
|
||||
|
||||
This example will use diarization within our STT service and output the words spoken by
|
||||
each individual speaker and wrap them with XML tags for the LLM to process. Note the
|
||||
instructions in the system context for the LLM. This greatly improves the conversation
|
||||
experience by allowing the LLM to understand who is speaking in a multi-party call.
|
||||
|
||||
If you do not wish to use diarization, then set the `enable_speaker_diarization` parameter
|
||||
to `False` or omit it altogether. The `text_format` will only be used if diarization is enabled.
|
||||
|
||||
By default, this example will use our ENHANCED operating point, which is optimized for
|
||||
high accuracy. You can change this by setting the `operating_point` parameter to a different
|
||||
value.
|
||||
|
||||
For more information on operating points, see the Speechmatics documentation:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
"""
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
language=Language.EN,
|
||||
enable_speaker_diarization=True,
|
||||
text_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies."
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
@@ -35,7 +35,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
|
||||
@@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),)
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
|
||||
@@ -8,8 +8,8 @@ import argparse
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import google.ai.generativelanguage as glm
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import Content, Part
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
@@ -164,9 +164,7 @@ class TanscriptionContextFixup(FrameProcessor):
|
||||
and last_part.inline_data
|
||||
and last_part.inline_data.mime_type == "audio/wav"
|
||||
):
|
||||
self._context.messages[-2] = glm.Content(
|
||||
role="user", parts=[glm.Part(text=self._transcript)]
|
||||
)
|
||||
self._context.messages[-2] = Content(role="user", parts=[Part(text=self._transcript)])
|
||||
|
||||
def add_transcript_back_to_inference_output(self):
|
||||
if not self._transcript:
|
||||
@@ -216,7 +214,12 @@ transport_params = {
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
voice_id="en-US-Chirp3-HD-Charon",
|
||||
|
||||
108
examples/foundational/13g-sambanova-transcription.py
Normal file
108
examples/foundational/13g-sambanova-transcription.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame, UserStoppedSpeakingFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.sambanova.stt import SambaNovaSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
STOP_SECS = 2.0
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
"""Measures transcription latency.
|
||||
|
||||
Uses the (intentionally) long STOP_SECS parameter to give the transcription time to finish,
|
||||
then outputs the timing between when the VAD first classified audio input as not-speech and
|
||||
the delivery of the last transcription frame.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._last_transcription_time = time.time()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
logger.debug(
|
||||
f"Transcription latency: {(STOP_SECS - (time.time() - self._last_transcription_time)):.2f}"
|
||||
)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
self._last_transcription_time = time.time()
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=STOP_SECS)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=STOP_SECS)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=STOP_SECS)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SambaNovaSTTService(
|
||||
model="Whisper-Large-v3",
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"),
|
||||
)
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
pipeline = Pipeline([transport.input(), stt, tl])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
89
examples/foundational/13h-speechmatics-transcription.py
Normal file
89
examples/foundational/13h-speechmatics-transcription.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(audio_in_enabled=True),
|
||||
"twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True),
|
||||
"webrtc": lambda: TransportParams(audio_in_enabled=True),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
"""Run example using Speechmatics STT.
|
||||
|
||||
This example will use diarization within our STT service and output the words spoken by
|
||||
each individual speaker and wrap them with XML tags.
|
||||
|
||||
If you do not wish to use diarization, then set the `enable_speaker_diarization` parameter
|
||||
to `False` or omit it altogether. The `text_format` will only be used if diarization is enabled.
|
||||
|
||||
By default, this example will use our ENHANCED operating point, which is optimized for
|
||||
high accuracy. You can change this by setting the `operating_point` parameter to a different
|
||||
value.
|
||||
|
||||
For more information on operating points, see the Speechmatics documentation:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
"""
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
language=Language.EN,
|
||||
enable_speaker_diarization=True,
|
||||
text_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
)
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
pipeline = Pipeline([transport.input(), stt, tl])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
@@ -42,7 +42,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
|
||||
152
examples/foundational/14s-function-calling-sambanova.py
Normal file
152
examples/foundational/14s-function-calling-sambanova.py
Normal file
@@ -0,0 +1,152 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserAggregatorParams
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.sambanova.llm import SambaNovaLLMService
|
||||
from pipecat.services.sambanova.stt import SambaNovaSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SambaNovaSTTService(
|
||||
model="Whisper-Large-v3",
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = SambaNovaLLMService(
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"),
|
||||
model="Llama-4-Maverick-17B-128E-Instruct",
|
||||
)
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(
|
||||
context, user_params=LLMUserAggregatorParams(aggregation_timeout=0.05)
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
146
examples/foundational/14t-function-calling-direct.py
Normal file
146
examples/foundational/14t-function-calling-direct.py
Normal file
@@ -0,0 +1,146 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def get_current_weather(params: FunctionCallParams, location: str, format: str):
|
||||
"""
|
||||
Get the current weather.
|
||||
|
||||
Args:
|
||||
location (str): The city and state, e.g. "San Francisco, CA".
|
||||
format (str): The temperature unit to use. Must be either "celsius" or "fahrenheit". Infer this from the user's location.
|
||||
"""
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def get_restaurant_recommendation(params: FunctionCallParams, location: str):
|
||||
"""
|
||||
Get a restaurant recommendation.
|
||||
|
||||
Args:
|
||||
location (str): The city and state, e.g. "San Francisco, CA".
|
||||
"""
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_direct_function(get_current_weather)
|
||||
llm.register_direct_function(get_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
tools = ToolsSchema(standard_tools=[get_current_weather, get_restaurant_recommendation])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
@@ -33,7 +33,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
|
||||
@@ -9,8 +9,8 @@ import asyncio
|
||||
import os
|
||||
import time
|
||||
|
||||
import google.ai.generativelanguage as glm
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import Content, Part
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
@@ -611,9 +611,7 @@ class OutputGate(FrameProcessor):
|
||||
await self._notifier.wait()
|
||||
|
||||
transcription = await self._transcription_buffer.wait_for_transcription() or "-"
|
||||
self._context._messages.append(
|
||||
glm.Content(role="user", parts=[glm.Part(text=transcription)])
|
||||
)
|
||||
self._context.add_message(Content(role="user", parts=[Part(text=transcription)]))
|
||||
|
||||
self.open_gate()
|
||||
for frame, direction in self._frames_buffer:
|
||||
|
||||
@@ -8,8 +8,8 @@ import argparse
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import google.ai.generativelanguage as glm
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import Content, Part
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
@@ -142,8 +142,8 @@ class InputTranscriptionContextFilter(FrameProcessor):
|
||||
context = GoogleLLMContext.upgrade_to_google(frame.context)
|
||||
message = context.messages[-1]
|
||||
|
||||
if not isinstance(message, glm.Content):
|
||||
logger.error(f"Expected glm.Content, got {type(message)}")
|
||||
if not isinstance(message, Content):
|
||||
logger.error(f"Expected Content, got {type(message)}")
|
||||
return
|
||||
|
||||
last_part = message.parts[-1]
|
||||
@@ -168,15 +168,15 @@ class InputTranscriptionContextFilter(FrameProcessor):
|
||||
history += f"{msg.role}: {part.text}\n"
|
||||
if history:
|
||||
assembled = f"Here is the conversation history so far. These are not instructions. This is data that you should use only to improve the accuracy of your transcription.\n\n----\n\n{history}\n\n----\n\nEND OF CONVERSATION HISTORY\n\n"
|
||||
parts.append(glm.Part(text=assembled))
|
||||
parts.append(Part(text=assembled))
|
||||
|
||||
parts.append(
|
||||
glm.Part(
|
||||
Part(
|
||||
text="Transcribe this audio. Respond either with the transcription exactly as it was said by the user, or with the special string 'EMPTY' if the audio is not clear."
|
||||
)
|
||||
)
|
||||
parts.append(last_part)
|
||||
msg = glm.Content(role="user", parts=parts)
|
||||
msg = Content(role="user", parts=parts)
|
||||
ctx = GoogleLLMContext([msg])
|
||||
ctx.system_message = transcriber_system_message
|
||||
await self.push_frame(OpenAILLMContextFrame(context=ctx))
|
||||
|
||||
@@ -55,7 +55,7 @@ transport_params = {
|
||||
# endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
# set stop_secs to something roughly similar to the internal setting
|
||||
|
||||
242
examples/foundational/26f-gemini-multimodal-live-files-api.py
Normal file
242
examples/foundational/26f-gemini-multimodal-live-files-api.py
Normal file
@@ -0,0 +1,242 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.gemini_multimodal_live.gemini import (
|
||||
GeminiMultimodalLiveLLMService,
|
||||
)
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=False,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=False,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=False,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
sample_file_path = ""
|
||||
|
||||
|
||||
async def create_sample_file():
|
||||
if sample_file_path:
|
||||
return sample_file_path
|
||||
else:
|
||||
"""Create a sample text file for testing the File API."""
|
||||
content = """# Sample Document for Gemini File API Test
|
||||
|
||||
This is a test document to demonstrate the Gemini File API functionality.
|
||||
|
||||
## Key Information:
|
||||
- This document was created for testing purposes
|
||||
- It contains information about AI assistants
|
||||
- The document should be analyzed by Gemini
|
||||
- The secret phrase for the test is "Pineapple Pizza"
|
||||
|
||||
## AI Assistant Capabilities:
|
||||
1. Natural language processing
|
||||
2. File analysis and understanding
|
||||
3. Context-aware conversations
|
||||
4. Multi-modal interactions
|
||||
|
||||
## Conclusion:
|
||||
This document serves as a test case for the Gemini File API integration with Pipecat.
|
||||
The AI should be able to reference and discuss the contents of this file.
|
||||
"""
|
||||
|
||||
# Create a temporary file
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||
f.write(content)
|
||||
return f.name
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting File API bot")
|
||||
|
||||
# Create a sample file to upload
|
||||
sample_file_path = await create_sample_file()
|
||||
logger.info(f"Created sample file: {sample_file_path}")
|
||||
|
||||
system_instruction = """
|
||||
You are a helpful AI assistant with access to a document that has been uploaded for analysis.
|
||||
|
||||
The document contains test information.
|
||||
You should be able to:
|
||||
- Reference and discuss the contents of the uploaded document
|
||||
- Answer questions about what's in the document
|
||||
- Use the information from the document in our conversation
|
||||
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Be friendly and demonstrate your ability to work with the uploaded file.
|
||||
"""
|
||||
|
||||
# Initialize Gemini service with File API support
|
||||
llm = GeminiMultimodalLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
system_instruction=system_instruction,
|
||||
voice_id="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
transcribe_user_audio=True,
|
||||
)
|
||||
|
||||
# Upload the sample file to Gemini File API
|
||||
logger.info("Uploading file to Gemini File API...")
|
||||
file_info = None
|
||||
try:
|
||||
file_info = await llm.file_api.upload_file(
|
||||
sample_file_path, display_name="Sample Test Document"
|
||||
)
|
||||
logger.info(f"File uploaded successfully: {file_info['file']['name']}")
|
||||
|
||||
# Get file URI and mime type
|
||||
file_uri = file_info["file"]["uri"]
|
||||
mime_type = "text/plain"
|
||||
|
||||
# Create context with file reference
|
||||
context = OpenAILLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Greet the user and let them know you have access to a document they can ask you about. Mention that you can discuss its contents.",
|
||||
},
|
||||
{
|
||||
"type": "file_data",
|
||||
"file_data": {"mime_type": mime_type, "file_uri": file_uri},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
logger.info("File reference added to conversation context")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error uploading file: {e}")
|
||||
# Continue with a basic context if file upload fails
|
||||
context = OpenAILLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Greet the user and explain that there was an issue with file upload, but you're ready to help with other tasks.",
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Create context aggregator
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Build the pipeline
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
|
||||
# Configure the pipeline task
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Handle client connection event
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation using standard context frame
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
# Handle client disconnection events
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
@transport.event_handler("on_client_closed")
|
||||
async def on_client_closed(transport, client):
|
||||
logger.info(f"Client closed connection")
|
||||
await task.cancel()
|
||||
|
||||
# Run the pipeline
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
await runner.run(task)
|
||||
|
||||
# Clean up: delete the uploaded file and temporary file
|
||||
if file_info:
|
||||
try:
|
||||
await llm.file_api.delete_file(file_info["file"]["name"])
|
||||
logger.info("Cleaned up uploaded file from Gemini")
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning up file: {e}")
|
||||
|
||||
# Remove temporary file
|
||||
try:
|
||||
os.unlink(sample_file_path)
|
||||
logger.info("Cleaned up temporary file")
|
||||
except Exception as e:
|
||||
logger.error(f"Error removing temporary file: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
upload_example_file = input("""
|
||||
|
||||
Please pass in a TEXT filepath to test upload.
|
||||
NOTE: Files are stored on Google's servers for 48 hours.
|
||||
|
||||
Press Enter to use a default test file.
|
||||
|
||||
text filepath : """)
|
||||
if upload_example_file:
|
||||
print(f"Uploading file: {upload_example_file}")
|
||||
sample_file_path = upload_example_file.strip()
|
||||
else:
|
||||
print(f"Using default file")
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
@@ -27,7 +27,6 @@ from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
aiohttp_session = aiohttp.ClientSession()
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
@@ -38,7 +37,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=FalSmartTurnAnalyzer(
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp_session
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp.ClientSession()
|
||||
),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
@@ -46,7 +45,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=FalSmartTurnAnalyzer(
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp_session
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp.ClientSession()
|
||||
),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
@@ -54,7 +53,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=FalSmartTurnAnalyzer(
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp_session
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=aiohttp.ClientSession()
|
||||
),
|
||||
),
|
||||
}
|
||||
@@ -118,8 +117,6 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
await aiohttp_session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp.client.session_group import SseServerParameters
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
@@ -63,7 +64,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
|
||||
try:
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
mcp = MCPClient(server_params=os.getenv("MCP_RUN_SSE_URL"))
|
||||
mcp = MCPClient(server_params=SseServerParameters(url=os.getenv("MCP_RUN_SSE_URL")))
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
@@ -15,6 +15,7 @@ import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp import StdioServerParameters
|
||||
from mcp.client.session_group import SseServerParameters
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
@@ -149,7 +150,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
# ie. "https://www.mcp.run/api/mcp/sse?..."
|
||||
# ensure the profile has a tool or few installed
|
||||
mcp_run = MCPClient(server_params=os.getenv("MCP_RUN_SSE_URL"))
|
||||
mcp_run = MCPClient(server_params=SseServerParameters(url=os.getenv("MCP_RUN_SSE_URL")))
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp.run")
|
||||
logger.exception("error trace:")
|
||||
|
||||
133
examples/foundational/39c-mcp-run-http.py
Normal file
133
examples/foundational/39c-mcp-run-http.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp.client.session_group import StreamableHttpParameters
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.mcp_service import MCPClient
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash")
|
||||
|
||||
try:
|
||||
# Github MCP docs: https://github.com/github/github-mcp-server
|
||||
# Enable Github Copilot on your GitHub account. Free tier is ok. (https://github.com/settings/copilot)
|
||||
# Generate a personal access token. It must be a Fine-grained token, classic tokens are not supported. (https://github.com/settings/personal-access-tokens)
|
||||
# Set permissions you want to use (eg. "all repositories", "profile: read/write", etc)
|
||||
mcp = MCPClient(
|
||||
server_params=StreamableHttpParameters(
|
||||
url="https://api.githubcopilot.com/mcp/",
|
||||
headers={"Authorization": f"Bearer {os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')}"},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to answer questions about the user's GitHub repositories and account.
|
||||
You have access to a number of tools provided by Github. Use any and all tools to help users.
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Don't overexplain what you are doing.
|
||||
Just respond with short sentences when you are carrying out tool calls.
|
||||
"""
|
||||
|
||||
messages = [{"role": "system", "content": system}]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User spoken responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses and tool context
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.examples.run import main
|
||||
|
||||
main(run_example, transport_params=transport_params)
|
||||
@@ -102,6 +102,7 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
|
||||
access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
|
||||
region=os.getenv("AWS_REGION"), # as of 2025-05-06, us-east-1 is the only supported region
|
||||
session_token=os.getenv("AWS_SESSION_TOKEN"),
|
||||
voice_id="tiffany", # matthew, tiffany, amy
|
||||
# you could choose to pass instruction here rather than via context
|
||||
# system_instruction=system_instruction
|
||||
|
||||
@@ -10,8 +10,8 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.interruptions.min_words_interruption_strategy import MinWordsInterruptionStrategy
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import MinWordsInterruptionStrategy
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
|
||||
59
examples/freeze-test/README.md
Normal file
59
examples/freeze-test/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Freeze Test Client
|
||||
|
||||
The purpose of this example is to create an environment for testing the bot and try to create freezing conditions.
|
||||
|
||||
### Approach 1: Server-Side Testing with `SimulateFreezeInput`
|
||||
|
||||
- Utilize only the bot `freeze_test_bot.py` with the `SimulateFreezeInput` processor. This input continuously injects frames, simulating user speech interruptions at random intervals.
|
||||
- This approach excludes the use of input transport and speech-to-text (STT) functionalities.
|
||||
|
||||
### Approach 2: Server-Side with TypeScript Client
|
||||
|
||||
- Combine server-side operations with a TypeScript client.
|
||||
- The client initially records a segment of audio, e.g., 5–10 seconds long. It can be anything.
|
||||
- After that, it replays this recorded audio to the server at random intervals, mimicking user input interruptions.
|
||||
- This helps testing interruptions in the pipeline as if real users were interacting with the bot.
|
||||
|
||||
## Setup
|
||||
|
||||
Follow these steps to set up and run the Freeze Test Client:
|
||||
|
||||
1. **Run the Bot Server**
|
||||
- Set up and activate your virtual environment:
|
||||
```bash
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
- Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
- Create your `.env` file and set your env vars:
|
||||
```bash
|
||||
cp env.example .env
|
||||
```
|
||||
|
||||
- Run the server:
|
||||
```bash
|
||||
python freeze_test_bot.py
|
||||
```
|
||||
|
||||
2. **Navigate to the Client Directory**
|
||||
```bash
|
||||
cd client
|
||||
```
|
||||
|
||||
3. **Install Dependencies**
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
4. **Run the Client Application**
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
5. **Access the Client in Your Browser**
|
||||
Visit [http://localhost:5173](http://localhost:5173) to interact with the Freeze Test Client.
|
||||
43
examples/freeze-test/client/index.html
Normal file
43
examples/freeze-test/client/index.html
Normal file
@@ -0,0 +1,43 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>AI Chatbot</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="status-bar">
|
||||
<div class="status">
|
||||
Transport: <span id="connection-status">Disconnected</span>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="connect-btn">Connect</button>
|
||||
<button id="disconnect-btn" disabled>Disconnect</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="status-bar">
|
||||
<div class="status">
|
||||
Playing audio: <span id="play-audio-status"></span>
|
||||
</div>
|
||||
<div class="controls">
|
||||
<button id="play-btn">Start</button>
|
||||
<button id="stop-btn" disabled>Stop</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<audio id="bot-audio" autoplay></audio>
|
||||
|
||||
<div class="debug-panel">
|
||||
<h3>Debug Info</h3>
|
||||
<div id="debug-log"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="module" src="/src/app.ts"></script>
|
||||
<link rel="stylesheet" href="/src/style.css">
|
||||
</body>
|
||||
|
||||
</html>
|
||||
1770
examples/freeze-test/client/package-lock.json
generated
Normal file
1770
examples/freeze-test/client/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
26
examples/freeze-test/client/package.json
Normal file
26
examples/freeze-test/client/package.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"name": "client",
|
||||
"version": "1.0.0",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc && vite build",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"description": "",
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.15.30",
|
||||
"@types/protobufjs": "^6.0.0",
|
||||
"@vitejs/plugin-react-swc": "^3.10.1",
|
||||
"typescript": "^5.8.3",
|
||||
"vite": "^6.3.5"
|
||||
},
|
||||
"dependencies": {
|
||||
"@pipecat-ai/client-js": "^0.4.0",
|
||||
"@pipecat-ai/websocket-transport": "^0.4.1",
|
||||
"protobufjs": "^7.4.0"
|
||||
}
|
||||
}
|
||||
338
examples/freeze-test/client/src/app.ts
Normal file
338
examples/freeze-test/client/src/app.ts
Normal file
@@ -0,0 +1,338 @@
|
||||
/**
|
||||
* Copyright (c) 2024–2025, Daily
|
||||
*
|
||||
* SPDX-License-Identifier: BSD 2-Clause License
|
||||
*/
|
||||
|
||||
/**
|
||||
* RTVI Client Implementation
|
||||
*
|
||||
* This client connects to an RTVI-compatible bot server using WebSocket.
|
||||
*
|
||||
* Requirements:
|
||||
* - A running RTVI bot server (defaults to http://localhost:7860)
|
||||
*/
|
||||
|
||||
import {
|
||||
RTVIClient,
|
||||
RTVIClientOptions,
|
||||
RTVIEvent,
|
||||
} from '@pipecat-ai/client-js';
|
||||
import {
|
||||
ProtobufFrameSerializer,
|
||||
WebSocketTransport
|
||||
} from "@pipecat-ai/websocket-transport";
|
||||
|
||||
class RecordingSerializer extends ProtobufFrameSerializer {
|
||||
|
||||
private lastTimestamp: number | null = null;
|
||||
private recordingAudioToSend: boolean = false;
|
||||
private _recordedAudio: { data: ArrayBuffer; delay: number }[] = [];
|
||||
|
||||
public startRecording() {
|
||||
this.recordingAudioToSend = true;
|
||||
this._recordedAudio = [];
|
||||
this.lastTimestamp = null;
|
||||
}
|
||||
|
||||
public stopRecording() {
|
||||
this.recordingAudioToSend = false;
|
||||
}
|
||||
|
||||
// @ts-ignore
|
||||
serializeAudio(data: ArrayBuffer, sampleRate: number, numChannels: number): Uint8Array | null {
|
||||
if (this.recordingAudioToSend) {
|
||||
const now = Date.now();
|
||||
// Compute delay since last packet
|
||||
const delay = this.lastTimestamp ? now - this.lastTimestamp : 0;
|
||||
this.lastTimestamp = now;
|
||||
// Save audio chunk and delay
|
||||
this._recordedAudio.push({ data, delay });
|
||||
return null;
|
||||
} else {
|
||||
return super.serializeAudio(data, sampleRate, numChannels);
|
||||
}
|
||||
}
|
||||
|
||||
public get recordedAudio() {
|
||||
return this._recordedAudio
|
||||
}
|
||||
}
|
||||
|
||||
class WebsocketClientApp {
|
||||
private ENABLE_RECORDING_MODE = false
|
||||
private RECORDING_TIME_MS = 10000
|
||||
|
||||
private rtviClient: RTVIClient | null = null;
|
||||
private connectBtn: HTMLButtonElement | null = null;
|
||||
private disconnectBtn: HTMLButtonElement | null = null;
|
||||
private statusSpan: HTMLElement | null = null;
|
||||
private debugLog: HTMLElement | null = null;
|
||||
private botAudio: HTMLAudioElement;
|
||||
|
||||
private declare websocketTransport: WebSocketTransport;
|
||||
private sendRecordedAudio: boolean = false
|
||||
private declare recordingSerializer: RecordingSerializer;
|
||||
|
||||
private playBtn: HTMLButtonElement | null = null;
|
||||
private stopBtn: HTMLButtonElement | null = null;
|
||||
|
||||
constructor() {
|
||||
this.botAudio = document.createElement('audio');
|
||||
this.botAudio.autoplay = true;
|
||||
//this.botAudio.playsInline = true;
|
||||
document.body.appendChild(this.botAudio);
|
||||
|
||||
this.setupDOMElements();
|
||||
this.setupEventListeners();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up references to DOM elements and create necessary media elements
|
||||
*/
|
||||
private setupDOMElements(): void {
|
||||
this.connectBtn = document.getElementById('connect-btn') as HTMLButtonElement;
|
||||
this.disconnectBtn = document.getElementById('disconnect-btn') as HTMLButtonElement;
|
||||
this.statusSpan = document.getElementById('connection-status');
|
||||
this.debugLog = document.getElementById('debug-log');
|
||||
this.playBtn = document.getElementById('play-btn') as HTMLButtonElement;
|
||||
this.stopBtn = document.getElementById('stop-btn') as HTMLButtonElement;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up event listeners for connect/disconnect buttons
|
||||
*/
|
||||
private setupEventListeners(): void {
|
||||
this.connectBtn?.addEventListener('click', () => this.connect());
|
||||
this.disconnectBtn?.addEventListener('click', () => this.disconnect());
|
||||
this.playBtn?.addEventListener('click', () => this.startSendingRecordedAudio());
|
||||
this.stopBtn?.addEventListener('click', () => this.stopSendingRecordedAudio());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a timestamped message to the debug log
|
||||
*/
|
||||
private log(message: string): void {
|
||||
if (!this.debugLog) return;
|
||||
const entry = document.createElement('div');
|
||||
entry.textContent = `${new Date().toISOString()} - ${message}`;
|
||||
if (message.startsWith('User: ')) {
|
||||
entry.style.color = '#2196F3';
|
||||
} else if (message.startsWith('Bot: ')) {
|
||||
entry.style.color = '#4CAF50';
|
||||
}
|
||||
this.debugLog.appendChild(entry);
|
||||
this.debugLog.scrollTop = this.debugLog.scrollHeight;
|
||||
console.log(message);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the connection status display
|
||||
*/
|
||||
private updateStatus(status: string): void {
|
||||
if (this.statusSpan) {
|
||||
this.statusSpan.textContent = status;
|
||||
}
|
||||
this.log(`Status: ${status}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for available media tracks and set them up if present
|
||||
* This is called when the bot is ready or when the transport state changes to ready
|
||||
*/
|
||||
setupMediaTracks() {
|
||||
if (!this.rtviClient) return;
|
||||
const tracks = this.rtviClient.tracks();
|
||||
if (tracks.bot?.audio) {
|
||||
this.setupAudioTrack(tracks.bot.audio);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up listeners for track events (start/stop)
|
||||
* This handles new tracks being added during the session
|
||||
*/
|
||||
setupTrackListeners() {
|
||||
if (!this.rtviClient) return;
|
||||
|
||||
// Listen for new tracks starting
|
||||
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
|
||||
// Only handle non-local (bot) tracks
|
||||
if (!participant?.local && track.kind === 'audio') {
|
||||
this.setupAudioTrack(track);
|
||||
}
|
||||
});
|
||||
|
||||
// Listen for tracks stopping
|
||||
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
|
||||
this.log(`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up an audio track for playback
|
||||
* Handles both initial setup and track updates
|
||||
*/
|
||||
private setupAudioTrack(track: MediaStreamTrack): void {
|
||||
this.log('Setting up audio track');
|
||||
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
|
||||
const oldTrack = this.botAudio.srcObject.getAudioTracks()[0];
|
||||
if (oldTrack?.id === track.id) return;
|
||||
}
|
||||
this.botAudio.srcObject = new MediaStream([track]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize and connect to the bot
|
||||
* This sets up the RTVI client, initializes devices, and establishes the connection
|
||||
*/
|
||||
public async connect(): Promise<void> {
|
||||
try {
|
||||
const startTime = Date.now();
|
||||
|
||||
this.recordingSerializer = new RecordingSerializer()
|
||||
const transport = this.ENABLE_RECORDING_MODE ?
|
||||
new WebSocketTransport({
|
||||
serializer: this.recordingSerializer,
|
||||
recorderSampleRate: 8000,
|
||||
playerSampleRate:8000
|
||||
}) :
|
||||
new WebSocketTransport({
|
||||
serializer: new ProtobufFrameSerializer(),
|
||||
recorderSampleRate: 8000,
|
||||
playerSampleRate:8000
|
||||
});
|
||||
this.websocketTransport = transport
|
||||
|
||||
const RTVIConfig: RTVIClientOptions = {
|
||||
transport,
|
||||
params: {
|
||||
// The baseURL and endpoint of your bot server that the client will connect to
|
||||
baseUrl: 'http://localhost:7860',
|
||||
endpoints: { connect: '/connect' },
|
||||
},
|
||||
enableMic: true,
|
||||
enableCam: false,
|
||||
callbacks: {
|
||||
onConnected: () => {
|
||||
this.updateStatus('Connected');
|
||||
if (this.connectBtn) this.connectBtn.disabled = true;
|
||||
if (this.disconnectBtn) this.disconnectBtn.disabled = false;
|
||||
},
|
||||
onDisconnected: () => {
|
||||
this.updateStatus('Disconnected');
|
||||
if (this.connectBtn) this.connectBtn.disabled = false;
|
||||
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
|
||||
this.log('Client disconnected');
|
||||
},
|
||||
onBotReady: (data) => {
|
||||
this.log(`Bot ready: ${JSON.stringify(data)}`);
|
||||
this.setupMediaTracks();
|
||||
},
|
||||
onUserTranscript: (data) => {
|
||||
if (data.final) {
|
||||
this.log(`User: ${data.text}`);
|
||||
}
|
||||
},
|
||||
onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
|
||||
onMessageError: (error) => console.error('Message error:', error),
|
||||
onError: (error) => console.error('Error:', error),
|
||||
},
|
||||
}
|
||||
this.rtviClient = new RTVIClient(RTVIConfig);
|
||||
this.setupTrackListeners();
|
||||
|
||||
this.log('Initializing devices...');
|
||||
await this.rtviClient.initDevices();
|
||||
|
||||
this.log('Connecting to bot...');
|
||||
await this.rtviClient.connect();
|
||||
|
||||
const timeTaken = Date.now() - startTime;
|
||||
this.log(`Connection complete, timeTaken: ${timeTaken}`);
|
||||
|
||||
if (this.ENABLE_RECORDING_MODE) {
|
||||
this.log(`Starting to recording the next ${(this.RECORDING_TIME_MS/1000)}s of audio`);
|
||||
this.recordingSerializer.startRecording()
|
||||
await this.sleep(this.RECORDING_TIME_MS)
|
||||
this.recordingSerializer.stopRecording()
|
||||
this.log("Recording stopped");
|
||||
this.rtviClient.enableMic(false)
|
||||
this.startSendingRecordedAudio()
|
||||
}
|
||||
} catch (error) {
|
||||
this.log(`Error connecting: ${(error as Error).message}`);
|
||||
this.updateStatus('Error');
|
||||
// Clean up if there's an error
|
||||
if (this.rtviClient) {
|
||||
try {
|
||||
await this.rtviClient.disconnect();
|
||||
} catch (disconnectError) {
|
||||
this.log(`Error during disconnect: ${disconnectError}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Disconnect from the bot and clean up media resources
|
||||
*/
|
||||
public async disconnect(): Promise<void> {
|
||||
if (this.rtviClient) {
|
||||
try {
|
||||
this.stopSendingRecordedAudio()
|
||||
await this.rtviClient.disconnect();
|
||||
this.rtviClient = null;
|
||||
if (this.botAudio.srcObject && "getAudioTracks" in this.botAudio.srcObject) {
|
||||
this.botAudio.srcObject.getAudioTracks().forEach((track) => track.stop());
|
||||
this.botAudio.srcObject = null;
|
||||
}
|
||||
} catch (error) {
|
||||
this.log(`Error disconnecting: ${(error as Error).message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private startSendingRecordedAudio() {
|
||||
this.sendRecordedAudio = true
|
||||
if (this.playBtn) this.playBtn.disabled = true;
|
||||
if (this.stopBtn) this.stopBtn.disabled = false;
|
||||
void this.replayAudio()
|
||||
}
|
||||
|
||||
private stopSendingRecordedAudio() {
|
||||
if (this.stopBtn) this.stopBtn.disabled = true;
|
||||
if (this.playBtn) this.playBtn.disabled = false;
|
||||
this.sendRecordedAudio = false
|
||||
}
|
||||
|
||||
private async replayAudio() {
|
||||
if (this.sendRecordedAudio) {
|
||||
this.log("Sending recorded audio")
|
||||
for (const chunk of this.recordingSerializer.recordedAudio) {
|
||||
await this.sleep(chunk.delay);
|
||||
this.websocketTransport.handleUserAudioStream(chunk.data);
|
||||
}
|
||||
const randomDelay = 1000 + Math.random() * (10000 - 500);
|
||||
await this.sleep(randomDelay);
|
||||
|
||||
void this.replayAudio()
|
||||
}
|
||||
}
|
||||
|
||||
private sleep(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
WebsocketClientApp: typeof WebsocketClientApp;
|
||||
}
|
||||
}
|
||||
|
||||
window.addEventListener('DOMContentLoaded', () => {
|
||||
window.WebsocketClientApp = WebsocketClientApp;
|
||||
new WebsocketClientApp();
|
||||
});
|
||||
98
examples/freeze-test/client/src/style.css
Normal file
98
examples/freeze-test/client/src/style.css
Normal file
@@ -0,0 +1,98 @@
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
font-family: Arial, sans-serif;
|
||||
background-color: #f0f0f0;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.status-bar {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 10px;
|
||||
background-color: #fff;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.controls button {
|
||||
padding: 8px 16px;
|
||||
margin-left: 10px;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
#connect-btn {
|
||||
background-color: #4caf50;
|
||||
color: white;
|
||||
}
|
||||
|
||||
#disconnect-btn {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
button:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
background-color: #fff;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.bot-container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
#bot-video-container {
|
||||
width: 640px;
|
||||
height: 360px;
|
||||
background-color: #e0e0e0;
|
||||
border-radius: 8px;
|
||||
margin: 20px auto;
|
||||
overflow: hidden;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
#bot-video-container video {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
}
|
||||
|
||||
.debug-panel {
|
||||
background-color: #fff;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.debug-panel h3 {
|
||||
margin: 0 0 10px 0;
|
||||
font-size: 16px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
#debug-log {
|
||||
height: 500px;
|
||||
overflow-y: auto;
|
||||
background-color: #f8f8f8;
|
||||
padding: 10px;
|
||||
border-radius: 4px;
|
||||
font-family: monospace;
|
||||
font-size: 12px;
|
||||
line-height: 1.4;
|
||||
}
|
||||
111
examples/freeze-test/client/tsconfig.json
Normal file
111
examples/freeze-test/client/tsconfig.json
Normal file
@@ -0,0 +1,111 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
/* Visit https://aka.ms/tsconfig to read more about this file */
|
||||
|
||||
/* Projects */
|
||||
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
|
||||
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
|
||||
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
|
||||
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
|
||||
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
|
||||
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||
|
||||
/* Language and Environment */
|
||||
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
||||
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
|
||||
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
|
||||
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
|
||||
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
|
||||
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
|
||||
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
|
||||
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
|
||||
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||
|
||||
/* Modules */
|
||||
"module": "commonjs", /* Specify what module code is generated. */
|
||||
// "rootDir": "./", /* Specify the root folder within your source files. */
|
||||
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
||||
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
|
||||
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
|
||||
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
|
||||
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
|
||||
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
|
||||
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
|
||||
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
|
||||
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
|
||||
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
|
||||
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
|
||||
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
|
||||
// "resolveJsonModule": true, /* Enable importing .json files. */
|
||||
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
|
||||
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
|
||||
|
||||
/* JavaScript Support */
|
||||
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
|
||||
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
|
||||
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
|
||||
|
||||
/* Emit */
|
||||
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
|
||||
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
|
||||
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
|
||||
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
|
||||
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
|
||||
// "noEmit": true, /* Disable emitting files from a compilation. */
|
||||
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
|
||||
// "outDir": "./", /* Specify an output folder for all emitted files. */
|
||||
// "removeComments": true, /* Disable emitting comments. */
|
||||
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
|
||||
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
|
||||
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
|
||||
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
|
||||
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
|
||||
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
|
||||
// "newLine": "crlf", /* Set the newline character for emitting files. */
|
||||
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
|
||||
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
|
||||
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
|
||||
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
|
||||
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
|
||||
|
||||
/* Interop Constraints */
|
||||
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
|
||||
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
|
||||
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
|
||||
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
|
||||
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
|
||||
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
|
||||
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
|
||||
|
||||
/* Type Checking */
|
||||
"strict": true, /* Enable all strict type-checking options. */
|
||||
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
|
||||
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
|
||||
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
|
||||
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
|
||||
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
|
||||
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
|
||||
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
|
||||
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
|
||||
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
|
||||
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
|
||||
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
|
||||
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
|
||||
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
|
||||
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
|
||||
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
|
||||
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
|
||||
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
|
||||
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
|
||||
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
|
||||
|
||||
/* Completeness */
|
||||
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
|
||||
"skipLibCheck": true /* Skip type checking all .d.ts files. */
|
||||
}
|
||||
}
|
||||
15
examples/freeze-test/client/vite.config.js
Normal file
15
examples/freeze-test/client/vite.config.js
Normal file
@@ -0,0 +1,15 @@
|
||||
import { defineConfig } from 'vite';
|
||||
import react from '@vitejs/plugin-react-swc';
|
||||
|
||||
export default defineConfig({
|
||||
plugins: [react()],
|
||||
server: {
|
||||
proxy: {
|
||||
// Proxy /api requests to the backend server
|
||||
'/connect': {
|
||||
target: 'http://0.0.0.0:7860', // Replace with your backend URL
|
||||
changeOrigin: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
4
examples/freeze-test/env.example
Normal file
4
examples/freeze-test/env.example
Normal file
@@ -0,0 +1,4 @@
|
||||
SENTRY_DSN=
|
||||
DEEPGRAM_API_KEY=
|
||||
CARTESIA_API_KEY=
|
||||
OPENAI_API_KEY=
|
||||
359
examples/freeze-test/freeze_test_bot.py
Normal file
359
examples/freeze-test/freeze_test_bot.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any, Dict
|
||||
|
||||
import sentry_sdk
|
||||
import uvicorn
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI, Request, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import RedirectResponse
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMMessagesFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
StopFrame,
|
||||
StopInterruptionFrame,
|
||||
TranscriptionFrame,
|
||||
TTSSpeakFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIProcessor
|
||||
from pipecat.processors.metrics.sentry import SentryMetrics
|
||||
from pipecat.processors.user_idle_processor import UserIdleProcessor
|
||||
from pipecat.serializers.protobuf import ProtobufFrameSerializer
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.network.fastapi_websocket import (
|
||||
FastAPIWebsocketParams,
|
||||
FastAPIWebsocketTransport,
|
||||
)
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Handles FastAPI startup and shutdown."""
|
||||
yield # Run app
|
||||
|
||||
|
||||
# Initialize FastAPI app with lifespan manager
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
# Configure CORS to allow requests from any origin
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
class SimulateFreezeInput(FrameProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
# Whether we have seen a StartFrame already.
|
||||
self._initialized = False
|
||||
self._send_frames_task = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
if isinstance(frame, StartFrame):
|
||||
# Push StartFrame before start(), because we want StartFrame to be
|
||||
# processed by every processor before any other frame is processed.
|
||||
await self.push_frame(frame, direction)
|
||||
await self._start(frame)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
logger.info("SimulateFreezeInput: Received cancel frame")
|
||||
await self._stop()
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, EndFrame):
|
||||
logger.info("SimulateFreezeInput: Received end frame")
|
||||
await self.push_frame(frame, direction)
|
||||
await self._stop()
|
||||
elif isinstance(frame, StopFrame):
|
||||
logger.info("SimulateFreezeInput: Received stop frame")
|
||||
await self.push_frame(frame, direction)
|
||||
await self._stop()
|
||||
|
||||
async def _start(self, frame: StartFrame):
|
||||
if self._initialized:
|
||||
return
|
||||
logger.info(f"Starting SimulateFreezeInput")
|
||||
self._initialized = True
|
||||
if not self._send_frames_task:
|
||||
self._send_frames_task = self.create_task(self._send_frames())
|
||||
|
||||
async def _stop(self):
|
||||
logger.info(f"Stopping SimulateFreezeInput")
|
||||
self._initialized = False
|
||||
if self._send_frames_task:
|
||||
await self.cancel_task(self._send_frames_task)
|
||||
self._send_frames_task = None
|
||||
|
||||
async def _send_user_text(self, text: str):
|
||||
self.reset_watchdog()
|
||||
# Emulation as if the user has spoken and the stt transcribed
|
||||
await self.push_frame(UserStartedSpeakingFrame())
|
||||
await self.push_frame(StartInterruptionFrame())
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
text,
|
||||
"",
|
||||
time_now_iso8601(),
|
||||
)
|
||||
)
|
||||
# Need to wait before sending the UserStoppedSpeakingFrame,
|
||||
# otherwise TranscriptionFrame will be processed
|
||||
# later than the UserStoppedSpeakingFrame
|
||||
await asyncio.sleep(0.1)
|
||||
await self.push_frame(UserStoppedSpeakingFrame())
|
||||
await self.push_frame(StopInterruptionFrame())
|
||||
|
||||
async def _send_frames(self):
|
||||
try:
|
||||
i = 0
|
||||
while True:
|
||||
logger.debug("SimulateFreezeInput _send_frames")
|
||||
await self._send_user_text("Tell me a brief history of Brazil!")
|
||||
await asyncio.sleep(3)
|
||||
await self._send_user_text("and who has discovered it")
|
||||
i += 1
|
||||
if i >= 20:
|
||||
break
|
||||
# sleeping 1s before interrupting
|
||||
wait_time = random.uniform(1, 10)
|
||||
await asyncio.sleep(wait_time)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception receiving data: {e.__class__.__name__} ({e})")
|
||||
|
||||
|
||||
async def run_example(websocket_client):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create a transport using the WebRTC connection
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket_client,
|
||||
params=FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
add_wav_header=False,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
serializer=ProtobufFrameSerializer(),
|
||||
),
|
||||
)
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=os.getenv("SENTRY_DSN"),
|
||||
traces_sample_rate=1.0,
|
||||
)
|
||||
|
||||
freeze = SimulateFreezeInput()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
async def handle_user_idle(user_idle: UserIdleProcessor, retry_count: int) -> bool:
|
||||
if retry_count == 1:
|
||||
# First attempt: Add a gentle prompt to the conversation
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "The user has been quiet. Politely and briefly ask if they're still there.",
|
||||
}
|
||||
)
|
||||
await user_idle.push_frame(LLMMessagesFrame(messages))
|
||||
return True
|
||||
elif retry_count == 2:
|
||||
# Second attempt: More direct prompt
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "The user is still inactive. Ask if they'd like to continue our conversation.",
|
||||
}
|
||||
)
|
||||
await user_idle.push_frame(LLMMessagesFrame(messages))
|
||||
return True
|
||||
else:
|
||||
# Third attempt: End the conversation
|
||||
await user_idle.push_frame(
|
||||
TTSSpeakFrame("It seems like you're busy right now. Have a nice day!")
|
||||
)
|
||||
await task.queue_frame(EndFrame())
|
||||
return False
|
||||
|
||||
user_idle = UserIdleProcessor(callback=handle_user_idle, timeout=10.0)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
ParallelPipeline(
|
||||
[
|
||||
freeze,
|
||||
],
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
],
|
||||
),
|
||||
user_idle,
|
||||
rtvi,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
report_only_initial_ttfb=True,
|
||||
audio_in_sample_rate=8000,
|
||||
audio_out_sample_rate=8000,
|
||||
),
|
||||
idle_timeout_secs=120,
|
||||
observers=[
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
InterimTranscriptionFrame: None,
|
||||
TranscriptionFrame: None,
|
||||
# TTSTextFrame: None,
|
||||
# LLMTextFrame: None,
|
||||
OpenAILLMContextFrame: None,
|
||||
LLMFullResponseEndFrame: None,
|
||||
UserStartedSpeakingFrame: None,
|
||||
UserStoppedSpeakingFrame: None,
|
||||
StartInterruptionFrame: None,
|
||||
StopInterruptionFrame: None,
|
||||
},
|
||||
exclude_fields={
|
||||
"result",
|
||||
"metadata",
|
||||
"audio",
|
||||
"image",
|
||||
"images",
|
||||
},
|
||||
),
|
||||
],
|
||||
enable_watchdog_timers=True,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
logger.info(f"Client ready")
|
||||
await rtvi.set_bot_ready()
|
||||
# Kick off the conversation.
|
||||
# messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
# await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
@app.get("/", include_in_schema=False)
|
||||
async def root_redirect():
|
||||
return RedirectResponse(url="/client/")
|
||||
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
await websocket.accept()
|
||||
print("WebSocket connection accepted")
|
||||
try:
|
||||
await run_example(websocket)
|
||||
except Exception as e:
|
||||
print(f"Exception in run_bot: {e}")
|
||||
|
||||
|
||||
@app.post("/connect")
|
||||
async def bot_connect(request: Request) -> Dict[Any, Any]:
|
||||
server_mode = os.getenv("WEBSOCKET_SERVER", "fast_api")
|
||||
if server_mode == "websocket_server":
|
||||
ws_url = "ws://localhost:8765"
|
||||
else:
|
||||
ws_url = "ws://localhost:7860/ws"
|
||||
return {"ws_url": ws_url}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
|
||||
parser.add_argument(
|
||||
"--host", default="localhost", help="Host for HTTP server (default: localhost)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port", type=int, default=7860, help="Port for HTTP server (default: 7860)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
4
examples/freeze-test/requirements.txt
Normal file
4
examples/freeze-test/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
python-dotenv
|
||||
fastapi[all]
|
||||
uvicorn
|
||||
pipecat-ai[silero,websocket,openai, deepgram, cartesia, sentry]
|
||||
@@ -143,6 +143,7 @@ async def main():
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=576,
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from pipecat.utils.tracing.setup import setup_tracing
|
||||
|
||||
@@ -61,7 +62,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
python-dotenv
|
||||
pipecat-ai[webrtc,silero,cartesia,deepgram,openai,tracing]
|
||||
pipecat-ai[daily,webrtc,silero,cartesia,deepgram,openai,tracing]
|
||||
pipecat-ai-small-webrtc-prebuilt
|
||||
opentelemetry-exporter-otlp-proto-grpc
|
||||
@@ -26,7 +26,7 @@ Create a `.env` file with your API keys to enable tracing:
|
||||
```
|
||||
ENABLE_TRACING=true
|
||||
# OTLP endpoint for Langfuse
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=http://cloud.langfuse.com/api/public/otel
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel
|
||||
OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic%20<base64_encoded_api_key>
|
||||
# Set to any value to enable console output for debugging
|
||||
# OTEL_CONSOLE_EXPORT=true
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
from pipecat.utils.tracing.setup import setup_tracing
|
||||
|
||||
@@ -58,7 +59,7 @@ transport_params = {
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
"twilio": lambda: TransportParams(
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
python-dotenv
|
||||
pipecat-ai[webrtc,silero,cartesia,deepgram,openai,tracing]
|
||||
pipecat-ai[daily,webrtc,silero,cartesia,deepgram,openai,tracing]
|
||||
pipecat-ai-small-webrtc-prebuilt
|
||||
opentelemetry-exporter-otlp-proto-http
|
||||
@@ -1,4 +1,4 @@
|
||||
pipecat-ai[daily,elevenlabs,openai,silero]
|
||||
pipecat-ai[daily,cartesia,openai,silero]
|
||||
fastapi==0.115.6
|
||||
uvicorn
|
||||
python-dotenv
|
||||
|
||||
@@ -49,7 +49,7 @@ async def main():
|
||||
|
||||
# Initialize Sentry
|
||||
sentry_sdk.init(
|
||||
dsn="your-project-dsn",
|
||||
dsn=os.getenv("SENTRY_DSN"),
|
||||
traces_sample_rate=1.0,
|
||||
)
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
90031FC22C616EE900408370 /* SimpleChatbotUITests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90031FC12C616EE900408370 /* SimpleChatbotUITests.swift */; };
|
||||
90031FC42C616EE900408370 /* SimpleChatbotUITestsLaunchTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90031FC32C616EE900408370 /* SimpleChatbotUITestsLaunchTests.swift */; };
|
||||
90031FDC2C6D5DD700408370 /* ToastModifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90031FDB2C6D5DD700408370 /* ToastModifier.swift */; };
|
||||
907C98842D37E6AF0079441F /* PipecatClientIOSDaily in Frameworks */ = {isa = PBXBuildFile; productRef = 907C98832D37E6AF0079441F /* PipecatClientIOSDaily */; };
|
||||
90ABB98E2C735ED6000D9CC7 /* MeetingView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB98D2C735ED6000D9CC7 /* MeetingView.swift */; };
|
||||
90ABB9902C736A8B000D9CC7 /* WaveformView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB98F2C736A8B000D9CC7 /* WaveformView.swift */; };
|
||||
90ABB9932C73820D000D9CC7 /* MicrophoneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB9922C73820D000D9CC7 /* MicrophoneView.swift */; };
|
||||
@@ -25,6 +24,8 @@
|
||||
90ABB9A32C74E1CE000D9CC7 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB9A22C74E1CE000D9CC7 /* SettingsView.swift */; };
|
||||
90ABB9A62C74EA8A000D9CC7 /* SettingsPreference.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB9A52C74EA8A000D9CC7 /* SettingsPreference.swift */; };
|
||||
90ABB9A82C74EAB1000D9CC7 /* SettingsManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90ABB9A72C74EAB1000D9CC7 /* SettingsManager.swift */; };
|
||||
90CC98B02E158093003C2706 /* PipecatClientIOSDaily in Frameworks */ = {isa = PBXBuildFile; productRef = 90CC98AF2E158093003C2706 /* PipecatClientIOSDaily */; };
|
||||
90CC98B62E15820B003C2706 /* PipecatClientIOSDaily in Frameworks */ = {isa = PBXBuildFile; productRef = 90CC98B52E15820B003C2706 /* PipecatClientIOSDaily */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXContainerItemProxy section */
|
||||
@@ -73,7 +74,8 @@
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
907C98842D37E6AF0079441F /* PipecatClientIOSDaily in Frameworks */,
|
||||
90CC98B62E15820B003C2706 /* PipecatClientIOSDaily in Frameworks */,
|
||||
90CC98B02E158093003C2706 /* PipecatClientIOSDaily in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
@@ -218,7 +220,8 @@
|
||||
);
|
||||
name = SimpleChatbot;
|
||||
packageProductDependencies = (
|
||||
907C98832D37E6AF0079441F /* PipecatClientIOSDaily */,
|
||||
90CC98AF2E158093003C2706 /* PipecatClientIOSDaily */,
|
||||
90CC98B52E15820B003C2706 /* PipecatClientIOSDaily */,
|
||||
);
|
||||
productName = SimpleChatbot;
|
||||
productReference = 90031FA32C616EE700408370 /* SimpleChatbot.app */;
|
||||
@@ -293,7 +296,7 @@
|
||||
);
|
||||
mainGroup = 90031F9A2C616EE700408370;
|
||||
packageReferences = (
|
||||
907C98822D37E6AF0079441F /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */,
|
||||
90CC98B42E15820B003C2706 /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */,
|
||||
);
|
||||
productRefGroup = 90031FA42C616EE700408370 /* Products */;
|
||||
projectDirPath = "";
|
||||
@@ -682,20 +685,24 @@
|
||||
/* End XCConfigurationList section */
|
||||
|
||||
/* Begin XCRemoteSwiftPackageReference section */
|
||||
907C98822D37E6AF0079441F /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */ = {
|
||||
90CC98B42E15820B003C2706 /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */ = {
|
||||
isa = XCRemoteSwiftPackageReference;
|
||||
repositoryURL = "https://github.com/pipecat-ai/pipecat-client-ios-daily/";
|
||||
requirement = {
|
||||
kind = upToNextMajorVersion;
|
||||
minimumVersion = 0.3.2;
|
||||
minimumVersion = 0.3.6;
|
||||
};
|
||||
};
|
||||
/* End XCRemoteSwiftPackageReference section */
|
||||
|
||||
/* Begin XCSwiftPackageProductDependency section */
|
||||
907C98832D37E6AF0079441F /* PipecatClientIOSDaily */ = {
|
||||
90CC98AF2E158093003C2706 /* PipecatClientIOSDaily */ = {
|
||||
isa = XCSwiftPackageProductDependency;
|
||||
package = 907C98822D37E6AF0079441F /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */;
|
||||
productName = PipecatClientIOSDaily;
|
||||
};
|
||||
90CC98B52E15820B003C2706 /* PipecatClientIOSDaily */ = {
|
||||
isa = XCSwiftPackageProductDependency;
|
||||
package = 90CC98B42E15820B003C2706 /* XCRemoteSwiftPackageReference "pipecat-client-ios-daily" */;
|
||||
productName = PipecatClientIOSDaily;
|
||||
};
|
||||
/* End XCSwiftPackageProductDependency section */
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
{
|
||||
"originHash" : "cc17f08b06def9570d775e9c6f7a8dc10d1588b98127e977c47d052abac659b7",
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "daily-client-ios",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/daily-co/daily-client-ios.git",
|
||||
"state" : {
|
||||
"revision" : "15804ce495780da3ec2d05ab99736315f7bfbd24",
|
||||
"version" : "0.28.0"
|
||||
"revision" : "431938db25e5807120e89e2dc5bab1c076729f59",
|
||||
"version" : "0.31.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -14,8 +15,8 @@
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/pipecat-ai/pipecat-client-ios.git",
|
||||
"state" : {
|
||||
"revision" : "c679512e367002a1a67da85d503fec72d9b17191",
|
||||
"version" : "0.3.2"
|
||||
"revision" : "f92b5e68e56a8311f7d8ead68a7a5674843cbc40",
|
||||
"version" : "0.3.6"
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -23,10 +24,10 @@
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/pipecat-ai/pipecat-client-ios-daily/",
|
||||
"state" : {
|
||||
"revision" : "a337fe6642c52376d2f90eafcb965f5be772ce72",
|
||||
"version" : "0.3.2"
|
||||
"revision" : "8f494da903192c22c367ecf9e51248c9b651fbc6",
|
||||
"version" : "0.3.6"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 2
|
||||
"version" : 3
|
||||
}
|
||||
|
||||
@@ -78,10 +78,11 @@ class CallContainerModel: ObservableObject {
|
||||
self.saveCredentials(backendURL: baseUrl)
|
||||
}
|
||||
|
||||
@MainActor
|
||||
func disconnect() {
|
||||
self.rtviClientIOS?.disconnect(completion: nil)
|
||||
self.rtviClientIOS?.release()
|
||||
Task { @MainActor in
|
||||
try await self.rtviClientIOS?.disconnect()
|
||||
self.rtviClientIOS?.release()
|
||||
}
|
||||
}
|
||||
|
||||
func showError(message: String) {
|
||||
|
||||
@@ -4,18 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Bot Implementation.
|
||||
|
||||
This module implements a chatbot using OpenAI's GPT-4 model for natural language
|
||||
processing. It includes:
|
||||
- Real-time audio/video interaction through Daily
|
||||
- Animated robot avatar
|
||||
- Text-to-speech using ElevenLabs
|
||||
- Support for both English and Spanish
|
||||
|
||||
The bot runs as part of a pipeline that processes audio/video frames and manages
|
||||
the conversation flow.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
@@ -24,150 +12,72 @@ import sys
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from runner import configure
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
OutputImageRawFrame,
|
||||
SpriteFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
from pipecat.transports.services.helpers.daily_rest import (
|
||||
DailyMeetingTokenParams,
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRESTHelper,
|
||||
DailyRoomParams,
|
||||
)
|
||||
|
||||
load_dotenv(override=True)
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
sprites = []
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
# Load sequential animation frames
|
||||
for i in range(1, 26):
|
||||
# Build the full path to the image file
|
||||
full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
|
||||
# Get the filename without the extension to use as the dictionary key
|
||||
# Open the image and convert it to bytes
|
||||
with Image.open(full_path) as img:
|
||||
sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
|
||||
|
||||
# Create a smooth animation by adding reversed frames
|
||||
flipped = sprites[::-1]
|
||||
sprites.extend(flipped)
|
||||
|
||||
# Define static and animated states
|
||||
quiet_frame = sprites[0] # Static frame for when bot is listening
|
||||
talking_frame = SpriteFrame(images=sprites) # Animation sequence for when bot is talking
|
||||
|
||||
|
||||
class TalkingAnimation(FrameProcessor):
|
||||
"""Manages the bot's visual animation states.
|
||||
|
||||
Switches between static (listening) and animated (talking) states based on
|
||||
the bot's current speaking status.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._is_talking = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames and update animation state.
|
||||
|
||||
Args:
|
||||
frame: The incoming frame to process
|
||||
direction: The direction of frame flow in the pipeline
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Switch to talking animation when bot starts speaking
|
||||
if isinstance(frame, BotStartedSpeakingFrame):
|
||||
if not self._is_talking:
|
||||
await self.push_frame(talking_frame)
|
||||
self._is_talking = True
|
||||
# Return to static frame when bot stops speaking
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self.push_frame(quiet_frame)
|
||||
self._is_talking = False
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
async def main():
|
||||
"""Main bot execution function.
|
||||
|
||||
Sets up and runs the bot pipeline including:
|
||||
- Daily video transport
|
||||
- Speech-to-text and text-to-speech services
|
||||
- Language model integration
|
||||
- Animation processing
|
||||
- RTVI event handling
|
||||
"""
|
||||
"""Main bot execution function."""
|
||||
async with aiohttp.ClientSession() as session:
|
||||
(room_url, token) = await configure(session)
|
||||
daily_rest_helper = DailyRESTHelper(
|
||||
daily_api_key=os.getenv("DAILY_API_KEY"),
|
||||
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
room = await daily_rest_helper.create_room(
|
||||
DailyRoomParams(properties={"enable_prejoin_ui": False})
|
||||
)
|
||||
|
||||
token_params = DailyMeetingTokenParams(
|
||||
properties=DailyMeetingTokenProperties(
|
||||
is_owner=True,
|
||||
permissions={
|
||||
"hasPresence": False, # Example: join as a hidden participant
|
||||
},
|
||||
start_video_off=True,
|
||||
start_audio_off=True,
|
||||
)
|
||||
)
|
||||
|
||||
token = await daily_rest_helper.get_token(room_url=room.url, params=token_params)
|
||||
|
||||
# Set up Daily transport with video/audio parameters
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
room.url,
|
||||
token,
|
||||
"Chatbot",
|
||||
DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=576,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
transcription_enabled=True,
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# transcription_settings=DailyTranscriptionSettings(
|
||||
# language="es",
|
||||
# tier="nova",
|
||||
# model="2-general"
|
||||
# )
|
||||
),
|
||||
)
|
||||
|
||||
# Initialize text-to-speech service
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
#
|
||||
# English
|
||||
#
|
||||
voice_id="pNInz6obpgDQGcFmaJgB",
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# model="eleven_multilingual_v2",
|
||||
# voice_id="gD1IexrzCvsXPHUuT0s3",
|
||||
)
|
||||
|
||||
# Initialize LLM service
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
#
|
||||
# English
|
||||
#
|
||||
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
|
||||
#
|
||||
# Spanish
|
||||
#
|
||||
# "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
|
||||
"content": "Summerize the conversation so far in a single sentence.",
|
||||
},
|
||||
]
|
||||
|
||||
@@ -176,8 +86,6 @@ async def main():
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
ta = TalkingAnimation()
|
||||
|
||||
#
|
||||
# RTVI events for Pipecat client UI
|
||||
#
|
||||
@@ -189,8 +97,6 @@ async def main():
|
||||
rtvi,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
ta,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
@@ -204,7 +110,6 @@ async def main():
|
||||
),
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
)
|
||||
await task.queue_frame(quiet_frame)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
|
||||
104
examples/storytelling-chatbot/client/package-lock.json
generated
104
examples/storytelling-chatbot/client/package-lock.json
generated
@@ -345,9 +345,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/env": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.28.tgz",
|
||||
"integrity": "sha512-PAmWhJfJQlP+kxZwCjrVd9QnR5x0R3u0mTXTiZDgSd4h5LdXmjxCCWbN9kq6hkZBOax8Rm3xDW5HagWyJuT37g=="
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.30.tgz",
|
||||
"integrity": "sha512-KBiBKrDY6kxTQWGzKjQB7QirL3PiiOkV7KW98leHFjtVRKtft76Ra5qSA/SL75xT44dp6hOcqiiJ6iievLOYug=="
|
||||
},
|
||||
"node_modules/@next/eslint-plugin-next": {
|
||||
"version": "14.1.4",
|
||||
@@ -359,9 +359,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-arm64": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.28.tgz",
|
||||
"integrity": "sha512-kzGChl9setxYWpk3H6fTZXXPFFjg7urptLq5o5ZgYezCrqlemKttwMT5iFyx/p1e/JeglTwDFRtb923gTJ3R1w==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.30.tgz",
|
||||
"integrity": "sha512-EAqfOTb3bTGh9+ewpO/jC59uACadRHM6TSA9DdxJB/6gxOpyV+zrbqeXiFTDy9uV6bmipFDkfpAskeaDcO+7/g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -374,9 +374,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-darwin-x64": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.28.tgz",
|
||||
"integrity": "sha512-z6FXYHDJlFOzVEOiiJ/4NG8aLCeayZdcRSMjPDysW297Up6r22xw6Ea9AOwQqbNsth8JNgIK8EkWz2IDwaLQcw==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.30.tgz",
|
||||
"integrity": "sha512-TyO7Wz1IKE2kGv8dwQ0bmPL3s44EKVencOqwIY69myoS3rdpO1NPg5xPM5ymKu7nfX4oYJrpMxv8G9iqLsnL4A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -389,9 +389,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-gnu": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.28.tgz",
|
||||
"integrity": "sha512-9ARHLEQXhAilNJ7rgQX8xs9aH3yJSj888ssSjJLeldiZKR4D7N08MfMqljk77fAwZsWwsrp8ohHsMvurvv9liQ==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.30.tgz",
|
||||
"integrity": "sha512-I5lg1fgPJ7I5dk6mr3qCH1hJYKJu1FsfKSiTKoYwcuUf53HWTrEkwmMI0t5ojFKeA6Vu+SfT2zVy5NS0QLXV4Q==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -404,9 +404,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-arm64-musl": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.28.tgz",
|
||||
"integrity": "sha512-p6gvatI1nX41KCizEe6JkF0FS/cEEF0u23vKDpl+WhPe/fCTBeGkEBh7iW2cUM0rvquPVwPWdiUR6Ebr/kQWxQ==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.30.tgz",
|
||||
"integrity": "sha512-8GkNA+sLclQyxgzCDs2/2GSwBc92QLMrmYAmoP2xehe5MUKBLB2cgo34Yu242L1siSkwQkiV4YLdCnjwc/Micw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -419,9 +419,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-gnu": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.28.tgz",
|
||||
"integrity": "sha512-nsiSnz2wO6GwMAX2o0iucONlVL7dNgKUqt/mDTATGO2NY59EO/ZKnKEr80BJFhuA5UC1KZOMblJHWZoqIJddpA==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.30.tgz",
|
||||
"integrity": "sha512-8Ly7okjssLuBoe8qaRCcjGtcMsv79hwzn/63wNeIkzJVFVX06h5S737XNr7DZwlsbTBDOyI6qbL2BJB5n6TV/w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -434,9 +434,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-linux-x64-musl": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.28.tgz",
|
||||
"integrity": "sha512-+IuGQKoI3abrXFqx7GtlvNOpeExUH1mTIqCrh1LGFf8DnlUcTmOOCApEnPJUSLrSbzOdsF2ho2KhnQoO0I1RDw==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.30.tgz",
|
||||
"integrity": "sha512-dBmV1lLNeX4mR7uI7KNVHsGQU+OgTG5RGFPi3tBJpsKPvOPtg9poyav/BYWrB3GPQL4dW5YGGgalwZ79WukbKQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -449,9 +449,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-arm64-msvc": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.28.tgz",
|
||||
"integrity": "sha512-l61WZ3nevt4BAnGksUVFKy2uJP5DPz2E0Ma/Oklvo3sGj9sw3q7vBWONFRgz+ICiHpW5mV+mBrkB3XEubMrKaA==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-6MMHi2Qc1Gkq+4YLXAgbYslE1f9zMGBikKMdmQRHXjkGPot1JY3n5/Qrbg40Uvbi8//wYnydPnyvNhI1DMUW1g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -464,9 +464,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-ia32-msvc": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.28.tgz",
|
||||
"integrity": "sha512-+Kcp1T3jHZnJ9v9VTJ/yf1t/xmtFAc/Sge4v7mVc1z+NYfYzisi8kJ9AsY8itbgq+WgEwMtOpiLLJsUy2qnXZw==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-pVZMnFok5qEX4RT59mK2hEVtJX+XFfak+/rjHpyFh7juiT52r177bfFKhnlafm0UOSldhXjj32b+LZIOdswGTg==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
@@ -479,9 +479,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@next/swc-win32-x64-msvc": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.28.tgz",
|
||||
"integrity": "sha512-1gCmpvyhz7DkB1srRItJTnmR2UwQPAUXXIg9r0/56g3O8etGmwlX68skKXJOp9EejW3hhv7nSQUJ2raFiz4MoA==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.30.tgz",
|
||||
"integrity": "sha512-4KCo8hMZXMjpTzs3HOqOGYYwAXymXIy7PEPAXNEcEOyKqkjiDlECumrWziy+JEF0Oi4ILHGxzgQ3YiMGG2t/Lg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -1317,9 +1317,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
|
||||
"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
@@ -1960,9 +1960,9 @@
|
||||
"integrity": "sha512-AlcaJBi/pqqJBIQ8U9Mcpc9i8Aqxn88Skv5d+xBX006BY5u8N3mGLHa5Lgppa7L/HfwgwLgZ6NYs+Ag6uUmJRA=="
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
|
||||
"integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
|
||||
"version": "1.1.12",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
|
||||
"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0",
|
||||
@@ -3391,9 +3391,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/glob/node_modules/brace-expansion": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
|
||||
"integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
|
||||
"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0"
|
||||
}
|
||||
@@ -4389,11 +4389,11 @@
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/next": {
|
||||
"version": "14.2.28",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-14.2.28.tgz",
|
||||
"integrity": "sha512-QLEIP/kYXynIxtcKB6vNjtWLVs3Y4Sb+EClTC/CSVzdLD1gIuItccpu/n1lhmduffI32iPGEK2cLLxxt28qgYA==",
|
||||
"version": "14.2.30",
|
||||
"resolved": "https://registry.npmjs.org/next/-/next-14.2.30.tgz",
|
||||
"integrity": "sha512-+COdu6HQrHHFQ1S/8BBsCag61jZacmvbuL2avHvQFbWa2Ox7bE+d8FyNgxRLjXQ5wtPyQwEmk85js/AuaG2Sbg==",
|
||||
"dependencies": {
|
||||
"@next/env": "14.2.28",
|
||||
"@next/env": "14.2.30",
|
||||
"@swc/helpers": "0.5.5",
|
||||
"busboy": "1.6.0",
|
||||
"caniuse-lite": "^1.0.30001579",
|
||||
@@ -4408,15 +4408,15 @@
|
||||
"node": ">=18.17.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@next/swc-darwin-arm64": "14.2.28",
|
||||
"@next/swc-darwin-x64": "14.2.28",
|
||||
"@next/swc-linux-arm64-gnu": "14.2.28",
|
||||
"@next/swc-linux-arm64-musl": "14.2.28",
|
||||
"@next/swc-linux-x64-gnu": "14.2.28",
|
||||
"@next/swc-linux-x64-musl": "14.2.28",
|
||||
"@next/swc-win32-arm64-msvc": "14.2.28",
|
||||
"@next/swc-win32-ia32-msvc": "14.2.28",
|
||||
"@next/swc-win32-x64-msvc": "14.2.28"
|
||||
"@next/swc-darwin-arm64": "14.2.30",
|
||||
"@next/swc-darwin-x64": "14.2.30",
|
||||
"@next/swc-linux-arm64-gnu": "14.2.30",
|
||||
"@next/swc-linux-arm64-musl": "14.2.30",
|
||||
"@next/swc-linux-x64-gnu": "14.2.30",
|
||||
"@next/swc-linux-x64-musl": "14.2.30",
|
||||
"@next/swc-win32-arm64-msvc": "14.2.30",
|
||||
"@next/swc-win32-ia32-msvc": "14.2.30",
|
||||
"@next/swc-win32-x64-msvc": "14.2.30"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@opentelemetry/api": "^1.1.0",
|
||||
|
||||
@@ -6,10 +6,10 @@ Basic implementation using the [Pipecat JavaScript SDK](https://docs.pipecat.ai/
|
||||
|
||||
1. Run the bot server. See the [server README](../README).
|
||||
|
||||
2. Navigate to the `client/javascript` directory:
|
||||
2. Navigate to the `client` directory:
|
||||
|
||||
```bash
|
||||
cd client/javascript
|
||||
cd client
|
||||
```
|
||||
|
||||
3. Install dependencies:
|
||||
|
||||
@@ -22,6 +22,7 @@ classifiers = [
|
||||
dependencies = [
|
||||
"aiohttp~=3.11.12",
|
||||
"audioop-lts~=0.2.1; python_version>='3.13'",
|
||||
"docstring_parser~=0.16",
|
||||
"loguru~=0.7.3",
|
||||
"Markdown~=3.7",
|
||||
"numpy~=1.26.4",
|
||||
@@ -31,7 +32,7 @@ dependencies = [
|
||||
"pyloudnorm~=0.1.1",
|
||||
"resampy~=0.4.3",
|
||||
"soxr~=0.5.0",
|
||||
"openai~=1.70.0"
|
||||
"openai~=1.70.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -47,14 +48,14 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "websockets~=13.1" ]
|
||||
cerebras = []
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.19.3" ]
|
||||
daily = [ "daily-python~=0.19.4" ]
|
||||
deepgram = [ "deepgram-sdk~=4.1.0" ]
|
||||
elevenlabs = [ "websockets~=13.1" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
fireworks = []
|
||||
fish = [ "ormsgpack~=1.7.0", "websockets~=13.1" ]
|
||||
gladia = [ "websockets~=13.1" ]
|
||||
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.14.0", "websockets~=13.1" ]
|
||||
google = [ "google-cloud-speech~=2.32.0", "google-cloud-texttospeech~=2.26.0", "google-genai~=1.24.0", "websockets~=13.1" ]
|
||||
grok = []
|
||||
groq = [ "groq~=0.23.0" ]
|
||||
gstreamer = [ "pygobject~=3.50.0" ]
|
||||
@@ -64,7 +65,7 @@ langchain = [ "langchain~=0.3.20", "langchain-community~=0.3.20", "langchain-ope
|
||||
livekit = [ "livekit~=0.22.0", "livekit-api~=0.8.2", "tenacity~=9.0.0" ]
|
||||
lmnt = [ "websockets~=13.1" ]
|
||||
local = [ "pyaudio~=0.2.14" ]
|
||||
mcp = [ "mcp[cli]~=1.6.0" ]
|
||||
mcp = [ "mcp[cli]~=1.9.4" ]
|
||||
mem0 = [ "mem0ai~=0.1.94" ]
|
||||
mlx-whisper = [ "mlx-whisper~=0.4.2" ]
|
||||
moondream = [ "einops~=0.8.0", "timm~=1.0.13", "transformers~=4.48.0" ]
|
||||
@@ -79,12 +80,14 @@ playht = [ "pyht~=0.1.12", "websockets~=13.1" ]
|
||||
qwen = []
|
||||
rime = [ "websockets~=13.1" ]
|
||||
riva = [ "nvidia-riva-client~=2.19.1" ]
|
||||
sambanova = []
|
||||
sentry = [ "sentry-sdk~=2.23.1" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch==2.5.0", "torchaudio==2.5.0" ]
|
||||
remote-smart-turn = []
|
||||
silero = [ "onnxruntime~=1.20.1" ]
|
||||
simli = [ "simli-ai~=0.1.10"]
|
||||
soundfile = [ "soundfile~=0.13.0" ]
|
||||
speechmatics = [ "speechmatics-rt>=0.3.1" ]
|
||||
tavus=[]
|
||||
together = []
|
||||
tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ]
|
||||
@@ -122,9 +125,21 @@ select = [
|
||||
"D", # Docstring rules
|
||||
"I", # Import rules
|
||||
]
|
||||
# We ignore D107 because class docstrings already document __init__ parameters
|
||||
# and our Sphinx configuration uses napoleon_include_init_with_doc=True
|
||||
ignore = ["D107"]
|
||||
ignore = [
|
||||
"D105", # Missing docstring in magic methods (__str__, __repr__, etc.)
|
||||
]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
# Skip docstring checks for non-source code
|
||||
"examples/**/*.py" = ["D"]
|
||||
"tests/**/*.py" = ["D"]
|
||||
"scripts/**/*.py" = ["D"]
|
||||
"docs/**/*.py" = ["D"]
|
||||
# Skip D104 (missing docstring in public package) for __init__.py files
|
||||
"**/__init__.py" = ["D104"]
|
||||
# Skip specific rules for generated protobuf files
|
||||
"**/*_pb2.py" = ["D"]
|
||||
"src/pipecat/services/__init__.py" = ["D"]
|
||||
|
||||
[tool.ruff.lint.pydocstyle]
|
||||
convention = "google"
|
||||
|
||||
@@ -49,7 +49,7 @@ python run-release-evals.py -p 07 -a -v
|
||||
You can also run evals for a single example (not part of the release set):
|
||||
|
||||
```sh
|
||||
python run-eval.py YOUR_EXAMPLE_SCRIPT -a -v
|
||||
python run-eval.py -p "A simple math addition" -a -v YOUR_EXAMPLE_SCRIPT
|
||||
```
|
||||
|
||||
Your script needs to follow any of the foundation examples pattern.
|
||||
|
||||
@@ -100,17 +100,18 @@ class EvalRunner:
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
await asyncio.wait(
|
||||
[
|
||||
asyncio.create_task(run_example_pipeline(script_path)),
|
||||
asyncio.create_task(run_eval_pipeline(self, example_file, prompt, eval)),
|
||||
],
|
||||
timeout=90,
|
||||
)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
tasks = [
|
||||
asyncio.create_task(run_example_pipeline(script_path)),
|
||||
asyncio.create_task(run_eval_pipeline(self, example_file, prompt, eval)),
|
||||
]
|
||||
_, pending = await asyncio.wait(tasks, timeout=90)
|
||||
if pending:
|
||||
logger.error(f"ERROR: Eval timeout expired, cancelling pending tasks...")
|
||||
for task in pending:
|
||||
task.cancel()
|
||||
await asyncio.gather(*pending, return_exceptions=True)
|
||||
except Exception as e:
|
||||
print(f"ERROR: Unable to run {example_file}: {e}")
|
||||
logger.error(f"ERROR: Unable to run {example_file}: {e}")
|
||||
|
||||
try:
|
||||
result = await asyncio.wait_for(self._queue.get(), timeout=1.0)
|
||||
@@ -134,6 +135,7 @@ class EvalRunner:
|
||||
async def save_audio(self, name: str, audio: bytes, sample_rate: int, num_channels: int):
|
||||
if len(audio) > 0:
|
||||
filename = self._recording_file_name(name)
|
||||
logger.debug(f"Saving {name} audio to {filename}")
|
||||
with io.BytesIO() as buffer:
|
||||
with wave.open(buffer, "wb") as wf:
|
||||
wf.setsampwidth(2)
|
||||
@@ -142,7 +144,6 @@ class EvalRunner:
|
||||
wf.writeframes(audio)
|
||||
async with aiofiles.open(filename, "wb") as file:
|
||||
await file.write(buffer.getvalue())
|
||||
logger.debug(f"Saving {name} audio to {filename}")
|
||||
else:
|
||||
logger.warning(f"There's no audio to save for {name}")
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ TESTS_07 = [
|
||||
# 07 series
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None),
|
||||
@@ -111,11 +112,16 @@ TESTS_26 = [
|
||||
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None),
|
||||
]
|
||||
|
||||
TESTS_40 = [
|
||||
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
*TESTS_07,
|
||||
*TESTS_14,
|
||||
*TESTS_19,
|
||||
*TESTS_26,
|
||||
*TESTS_40,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -2,4 +2,4 @@ ruff format src
|
||||
ruff format examples
|
||||
ruff format tests
|
||||
ruff format scripts
|
||||
ruff check --select I --fix
|
||||
ruff check --select I,D --fix
|
||||
@@ -1,3 +1,27 @@
|
||||
#!/bin/sh
|
||||
#!/bin/bash
|
||||
|
||||
NO_COLOR=1 ruff format --diff
|
||||
# Color codes for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo "🔍 Running pre-commit checks..."
|
||||
|
||||
# Change to project root (one level up from scripts/)
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
# Format check
|
||||
echo "📝 Checking code formatting..."
|
||||
if ! NO_COLOR=1 ruff format --diff --check; then
|
||||
echo -e "${RED}❌ Code formatting issues found. Run 'ruff format' to fix.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Lint check
|
||||
echo "🔍 Running linter..."
|
||||
if ! ruff check; then
|
||||
echo -e "${RED}❌ Linting issues found.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✅ All pre-commit checks passed!${NC}"
|
||||
@@ -1,3 +1,15 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base adapter for LLM provider integration.
|
||||
|
||||
This module provides the abstract base class for implementing LLM provider-specific
|
||||
adapters that handle tool format conversion and standardization.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Union, cast
|
||||
|
||||
@@ -7,12 +19,35 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class BaseLLMAdapter(ABC):
|
||||
"""Abstract base class for LLM provider adapters.
|
||||
|
||||
Provides a standard interface for converting between Pipecat's standardized
|
||||
tool schemas and provider-specific tool formats. Subclasses must implement
|
||||
provider-specific conversion logic.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Any]:
|
||||
"""Converts tools to the provider's format."""
|
||||
"""Convert tools schema to the provider's specific format.
|
||||
|
||||
Args:
|
||||
tools_schema: The standardized tools schema to convert.
|
||||
|
||||
Returns:
|
||||
List of tools in the provider's expected format.
|
||||
"""
|
||||
pass
|
||||
|
||||
def from_standard_tools(self, tools: Any) -> List[Any]:
|
||||
"""Convert tools from standard format to provider format.
|
||||
|
||||
Args:
|
||||
tools: Tools in standard format or provider-specific format.
|
||||
|
||||
Returns:
|
||||
List of tools converted to provider format, or original tools
|
||||
if not in standard format.
|
||||
"""
|
||||
if isinstance(tools, ToolsSchema):
|
||||
logger.debug(f"Retrieving the tools using the adapter: {type(self)}")
|
||||
return self.to_provider_tools_format(tools)
|
||||
|
||||
296
src/pipecat/adapters/schemas/direct_function.py
Normal file
296
src/pipecat/adapters/schemas/direct_function.py
Normal file
@@ -0,0 +1,296 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Direct function wrapper utilities for LLM function calling.
|
||||
|
||||
This module provides utilities for wrapping "direct" functions that handle LLM
|
||||
function calls. Direct functions have their metadata automatically extracted
|
||||
from function signatures and docstrings, allowing them to be used without
|
||||
accompanying configurations (as FunctionSchemas or in provider-specific
|
||||
formats).
|
||||
"""
|
||||
|
||||
import inspect
|
||||
import types
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Mapping,
|
||||
Protocol,
|
||||
Set,
|
||||
Tuple,
|
||||
Union,
|
||||
get_args,
|
||||
get_origin,
|
||||
get_type_hints,
|
||||
)
|
||||
|
||||
import docstring_parser
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
|
||||
|
||||
class DirectFunction(Protocol):
|
||||
"""Protocol for a "direct" function that handles LLM function calls.
|
||||
|
||||
"Direct" functions' metadata is automatically extracted from their function signature and
|
||||
docstrings, allowing them to be used without accompanying function configurations (as
|
||||
`FunctionSchema`s or in provider-specific formats).
|
||||
"""
|
||||
|
||||
async def __call__(self, params: "FunctionCallParams", **kwargs: Any) -> None:
|
||||
"""Execute the direct function.
|
||||
|
||||
Args:
|
||||
params: Function call parameters from the LLM service.
|
||||
**kwargs: Additional keyword arguments passed to the function.
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
class BaseDirectFunctionWrapper:
|
||||
"""Base class for a wrapper around a DirectFunction.
|
||||
|
||||
Provides functionality to:
|
||||
|
||||
- extract metadata from the function signature and docstring
|
||||
- use that metadata to generate a corresponding FunctionSchema
|
||||
"""
|
||||
|
||||
def __init__(self, function: Callable):
|
||||
"""Initialize the direct function wrapper.
|
||||
|
||||
Args:
|
||||
function: The function to wrap and extract metadata from.
|
||||
"""
|
||||
self.__class__.validate_function(function)
|
||||
self.function = function
|
||||
self._initialize_metadata()
|
||||
|
||||
@classmethod
|
||||
def special_first_param_name(cls) -> str:
|
||||
"""Get the name of the special first function parameter.
|
||||
|
||||
The special first parameter is ignored by metadata extraction as it's
|
||||
not relevant to the LLM (e.g., 'params' for FunctionCallParams).
|
||||
|
||||
Returns:
|
||||
The name of the special first parameter.
|
||||
"""
|
||||
raise NotImplementedError("Subclasses must define the special first parameter name.")
|
||||
|
||||
@classmethod
|
||||
def validate_function(cls, function: Callable) -> None:
|
||||
"""Validate that the function meets direct function requirements.
|
||||
|
||||
Args:
|
||||
function: The function to validate.
|
||||
|
||||
Raises:
|
||||
Exception: If function doesn't meet requirements (not async, missing
|
||||
parameters, incorrect first parameter name).
|
||||
"""
|
||||
if not inspect.iscoroutinefunction(function):
|
||||
raise Exception(f"Direct function {function.__name__} must be async")
|
||||
params = list(inspect.signature(function).parameters.items())
|
||||
special_first_param_name = cls.special_first_param_name()
|
||||
if len(params) == 0:
|
||||
raise Exception(
|
||||
f"Direct function {function.__name__} must have at least one parameter ({special_first_param_name})"
|
||||
)
|
||||
first_param_name = params[0][0]
|
||||
if first_param_name != special_first_param_name:
|
||||
raise Exception(
|
||||
f"Direct function {function.__name__} first parameter must be named '{special_first_param_name}'"
|
||||
)
|
||||
|
||||
def to_function_schema(self) -> FunctionSchema:
|
||||
"""Convert the wrapped function to a FunctionSchema.
|
||||
|
||||
Returns:
|
||||
A FunctionSchema instance with extracted metadata.
|
||||
"""
|
||||
return FunctionSchema(
|
||||
name=self.name,
|
||||
description=self.description,
|
||||
properties=self.properties,
|
||||
required=self.required,
|
||||
)
|
||||
|
||||
def _initialize_metadata(self):
|
||||
"""Initialize metadata from function signature and docstring."""
|
||||
# Get function name
|
||||
self.name = self.function.__name__
|
||||
|
||||
# Parse docstring for description and parameters
|
||||
docstring = docstring_parser.parse(inspect.getdoc(self.function))
|
||||
|
||||
# Get function description
|
||||
self.description = (docstring.description or "").strip()
|
||||
|
||||
# Get function parameters as JSON schemas, and the list of required parameters
|
||||
self.properties, self.required = self._get_parameters_as_jsonschema(
|
||||
self.function, docstring.params
|
||||
)
|
||||
|
||||
# TODO: maybe to better support things like enums, check if each type is a pydantic type and use its convert-to-jsonschema function
|
||||
def _get_parameters_as_jsonschema(
|
||||
self, func: Callable, docstring_params: List[docstring_parser.DocstringParam]
|
||||
) -> Tuple[Dict[str, Any], List[str]]:
|
||||
"""Get function parameters as a dictionary of JSON schemas and a list of required parameters.
|
||||
|
||||
Ignore the first parameter, as it's expected to be the "special" one.
|
||||
|
||||
Args:
|
||||
func: Function to get parameters from.
|
||||
docstring_params: List of parameters extracted from the function's docstring.
|
||||
|
||||
Returns:
|
||||
A tuple containing:
|
||||
|
||||
- A dictionary mapping each function parameter to its JSON schema
|
||||
- A list of required parameter names
|
||||
"""
|
||||
sig = inspect.signature(func)
|
||||
hints = get_type_hints(func)
|
||||
properties = {}
|
||||
required = []
|
||||
|
||||
for name, param in sig.parameters.items():
|
||||
# Ignore 'self' parameter
|
||||
if name == "self":
|
||||
continue
|
||||
|
||||
# Ignore the first parameter, which is expected to be the "special" one
|
||||
# (We have already validated that this is the case in validate_function())
|
||||
is_first_param = name == next(iter(sig.parameters))
|
||||
if is_first_param:
|
||||
continue
|
||||
|
||||
type_hint = hints.get(name)
|
||||
|
||||
# Convert type hint to JSON schema
|
||||
properties[name] = self._typehint_to_jsonschema(type_hint)
|
||||
|
||||
# Add whether the parameter is required
|
||||
# If the parameter has no default value, it's required
|
||||
if param.default is inspect.Parameter.empty:
|
||||
required.append(name)
|
||||
|
||||
# Add parameter description from docstring
|
||||
for doc_param in docstring_params:
|
||||
if doc_param.arg_name == name:
|
||||
properties[name]["description"] = doc_param.description or ""
|
||||
|
||||
return properties, required
|
||||
|
||||
def _typehint_to_jsonschema(self, type_hint: Any) -> Dict[str, Any]:
|
||||
"""Convert a Python type hint to a JSON Schema.
|
||||
|
||||
Args:
|
||||
type_hint: A Python type hint
|
||||
|
||||
Returns:
|
||||
A dictionary representing the JSON Schema
|
||||
"""
|
||||
if type_hint is None:
|
||||
return {}
|
||||
|
||||
# Handle basic types
|
||||
if type_hint is type(None):
|
||||
return {"type": "null"}
|
||||
if type_hint is str:
|
||||
return {"type": "string"}
|
||||
elif type_hint is int:
|
||||
return {"type": "integer"}
|
||||
elif type_hint is float:
|
||||
return {"type": "number"}
|
||||
elif type_hint is bool:
|
||||
return {"type": "boolean"}
|
||||
elif type_hint is dict or type_hint is Dict:
|
||||
return {"type": "object"}
|
||||
elif type_hint is list or type_hint is List:
|
||||
return {"type": "array"}
|
||||
|
||||
# Get origin and arguments for complex types
|
||||
origin = get_origin(type_hint)
|
||||
args = get_args(type_hint)
|
||||
|
||||
# Handle Optional/Union types
|
||||
if origin is Union or origin is types.UnionType:
|
||||
return {"anyOf": [self._typehint_to_jsonschema(arg) for arg in args]}
|
||||
|
||||
# Handle List, Tuple, Set with specific item types
|
||||
if origin in (list, List, tuple, Tuple, set, Set) and args:
|
||||
return {"type": "array", "items": self._typehint_to_jsonschema(args[0])}
|
||||
|
||||
# Handle Dict with specific key/value types
|
||||
if origin in (dict, Dict) and len(args) == 2:
|
||||
# For JSON Schema, keys must be strings
|
||||
return {"type": "object", "additionalProperties": self._typehint_to_jsonschema(args[1])}
|
||||
|
||||
# Handle TypedDict
|
||||
if hasattr(type_hint, "__annotations__"):
|
||||
properties = {}
|
||||
required = []
|
||||
|
||||
# NOTE: this does not yet support some fields being required and others not, which could happen when:
|
||||
# - the base class is a TypedDict with required fields (total=True or not specified) and the derived class has optional fields (total=False)
|
||||
# - Python 3.11+ NotRequired is used
|
||||
all_fields_required = getattr(type_hint, "__total__", True)
|
||||
|
||||
for field_name, field_type in get_type_hints(type_hint).items():
|
||||
properties[field_name] = self._typehint_to_jsonschema(field_type)
|
||||
if all_fields_required:
|
||||
required.append(field_name)
|
||||
|
||||
schema = {"type": "object", "properties": properties}
|
||||
|
||||
if required:
|
||||
schema["required"] = required
|
||||
|
||||
return schema
|
||||
|
||||
# Default to any type if we can't determine the specific schema
|
||||
return {}
|
||||
|
||||
|
||||
class DirectFunctionWrapper(BaseDirectFunctionWrapper):
|
||||
"""Wrapper around a DirectFunction for LLM function calling.
|
||||
|
||||
This class:
|
||||
|
||||
- Extracts metadata from the function signature and docstring
|
||||
- Generates a corresponding FunctionSchema
|
||||
- Helps with function invocation
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def special_first_param_name(cls) -> str:
|
||||
"""Get the special first parameter name for direct functions.
|
||||
|
||||
Returns:
|
||||
The string "params" which is expected as the first parameter.
|
||||
"""
|
||||
return "params"
|
||||
|
||||
async def invoke(self, args: Mapping[str, Any], params: "FunctionCallParams"):
|
||||
"""Invoke the wrapped function with the provided arguments.
|
||||
|
||||
Args:
|
||||
args: Arguments to pass to the function.
|
||||
params: Function call parameters from the LLM service.
|
||||
|
||||
Returns:
|
||||
The result of the function call.
|
||||
"""
|
||||
return await self.function(params=params, **args)
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Function schema utilities for AI tool definitions.
|
||||
|
||||
This module provides standardized function schema representation for defining
|
||||
tools and functions used with AI models, ensuring consistent formatting
|
||||
across different AI service providers.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
@@ -13,17 +20,19 @@ class FunctionSchema:
|
||||
Provides a structured way to define function tools used with AI models like OpenAI.
|
||||
This schema defines the function's name, description, parameter properties, and
|
||||
required parameters, following specifications required by AI service providers.
|
||||
|
||||
Args:
|
||||
name: Name of the function to be called.
|
||||
description: Description of what the function does.
|
||||
properties: Dictionary defining parameter types, descriptions, and constraints.
|
||||
required: List of property names that are required parameters.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, name: str, description: str, properties: Dict[str, Any], required: List[str]
|
||||
) -> None:
|
||||
"""Initialize the function schema.
|
||||
|
||||
Args:
|
||||
name: Name of the function to be called.
|
||||
description: Description of what the function does.
|
||||
properties: Dictionary defining parameter types, descriptions, and constraints.
|
||||
required: List of property names that are required parameters.
|
||||
"""
|
||||
self._name = name
|
||||
self._description = description
|
||||
self._properties = properties
|
||||
|
||||
@@ -4,40 +4,88 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Tools schema definitions for function calling adapters.
|
||||
|
||||
This module provides schemas for managing both standardized function tools
|
||||
and custom adapter-specific tools in the Pipecat framework.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pipecat.adapters.schemas.direct_function import DirectFunction, DirectFunctionWrapper
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
|
||||
|
||||
class AdapterType(Enum):
|
||||
"""Supported adapter types for custom tools.
|
||||
|
||||
Parameters:
|
||||
GEMINI: Google Gemini adapter - currently the only service supporting custom tools.
|
||||
"""
|
||||
|
||||
GEMINI = "gemini" # that is the only service where we are able to add custom tools for now
|
||||
|
||||
|
||||
class ToolsSchema:
|
||||
"""Schema for managing both standard and custom function calling tools.
|
||||
|
||||
This class provides a unified interface for handling standardized function
|
||||
schemas alongside custom tools that may not follow the standard format,
|
||||
such as adapter-specific search tools.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
standard_tools: List[FunctionSchema],
|
||||
standard_tools: List[FunctionSchema | DirectFunction],
|
||||
custom_tools: Optional[Dict[AdapterType, List[Dict[str, Any]]]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
A schema for tools that includes both standardized function schemas
|
||||
and custom tools that do not follow the FunctionSchema format.
|
||||
"""Initialize the tools schema.
|
||||
|
||||
:param standard_tools: List of tools following FunctionSchema.
|
||||
:param custom_tools: List of tools in a custom format (e.g., search_tool).
|
||||
Args:
|
||||
standard_tools: List of tools following the standardized FunctionSchema format.
|
||||
custom_tools: Dictionary mapping adapter types to their custom tool definitions.
|
||||
These tools may not follow the FunctionSchema format (e.g., search_tool).
|
||||
"""
|
||||
self._standard_tools = standard_tools
|
||||
|
||||
def _map_standard_tools(tools):
|
||||
schemas = []
|
||||
for tool in tools:
|
||||
if isinstance(tool, FunctionSchema):
|
||||
schemas.append(tool)
|
||||
elif callable(tool):
|
||||
wrapper = DirectFunctionWrapper(tool)
|
||||
schemas.append(wrapper.to_function_schema())
|
||||
else:
|
||||
raise TypeError(f"Unsupported tool type: {type(tool)}")
|
||||
return schemas
|
||||
|
||||
self._standard_tools = _map_standard_tools(standard_tools)
|
||||
self._custom_tools = custom_tools
|
||||
|
||||
@property
|
||||
def standard_tools(self) -> List[FunctionSchema]:
|
||||
"""Get the list of standard function schema tools.
|
||||
|
||||
Returns:
|
||||
List of tools following the FunctionSchema format.
|
||||
"""
|
||||
return self._standard_tools
|
||||
|
||||
@property
|
||||
def custom_tools(self) -> Dict[AdapterType, List[Dict[str, Any]]]:
|
||||
"""Get the custom tools dictionary.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping adapter types to their custom tool definitions.
|
||||
"""
|
||||
return self._custom_tools
|
||||
|
||||
@custom_tools.setter
|
||||
def custom_tools(self, value: Dict[AdapterType, List[Dict[str, Any]]]) -> None:
|
||||
"""Set the custom tools dictionary.
|
||||
|
||||
Args:
|
||||
value: Dictionary mapping adapter types to their custom tool definitions.
|
||||
"""
|
||||
self._custom_tools = value
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Anthropic LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
@@ -12,8 +14,22 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class AnthropicLLMAdapter(BaseLLMAdapter):
|
||||
"""Adapter for converting tool schemas to Anthropic's function-calling format.
|
||||
|
||||
This adapter handles the conversion of Pipecat's standard function schemas
|
||||
to the specific format required by Anthropic's Claude models for function calling.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _to_anthropic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a single function schema to Anthropic's format.
|
||||
|
||||
Args:
|
||||
function: The function schema to convert.
|
||||
|
||||
Returns:
|
||||
Dictionary containing the function definition in Anthropic's format.
|
||||
"""
|
||||
return {
|
||||
"name": function.name,
|
||||
"description": function.description,
|
||||
@@ -25,10 +41,13 @@ class AnthropicLLMAdapter(BaseLLMAdapter):
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Converts function schemas to Anthropic's function-calling format.
|
||||
"""Convert function schemas to Anthropic's function-calling format.
|
||||
|
||||
:return: Anthropic formatted function call definition.
|
||||
Args:
|
||||
tools_schema: The tools schema containing functions to convert.
|
||||
|
||||
Returns:
|
||||
List of function definitions formatted for Anthropic's API.
|
||||
"""
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_anthropic_function_format(func) for func in functions_schema]
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""AWS Nova Sonic LLM adapter for Pipecat."""
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List
|
||||
|
||||
@@ -12,8 +15,22 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class AWSNovaSonicLLMAdapter(BaseLLMAdapter):
|
||||
"""Adapter for AWS Nova Sonic language models.
|
||||
|
||||
Converts Pipecat's standard function schemas into AWS Nova Sonic's
|
||||
specific function-calling format, enabling tool use with Nova Sonic models.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _to_aws_nova_sonic_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to AWS Nova Sonic format.
|
||||
|
||||
Args:
|
||||
function: The function schema to convert.
|
||||
|
||||
Returns:
|
||||
Dictionary in AWS Nova Sonic function format with toolSpec structure.
|
||||
"""
|
||||
return {
|
||||
"toolSpec": {
|
||||
"name": function.name,
|
||||
@@ -31,10 +48,13 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter):
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Converts function schemas to AWS Nova Sonic function-calling format.
|
||||
"""Convert tools schema to AWS Nova Sonic function-calling format.
|
||||
|
||||
:return: AWS Nova Sonic formatted function call definition.
|
||||
Args:
|
||||
tools_schema: The tools schema containing function definitions to convert.
|
||||
|
||||
Returns:
|
||||
List of dictionaries in AWS Nova Sonic function format.
|
||||
"""
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_aws_nova_sonic_function_format(func) for func in functions_schema]
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""AWS Bedrock LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
@@ -12,8 +14,22 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class AWSBedrockLLMAdapter(BaseLLMAdapter):
|
||||
"""Adapter for AWS Bedrock LLM integration with Pipecat.
|
||||
|
||||
Provides conversion utilities for transforming Pipecat function schemas
|
||||
into AWS Bedrock's expected tool format for function calling capabilities.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _to_bedrock_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to Bedrock's tool format.
|
||||
|
||||
Args:
|
||||
function: The function schema to convert.
|
||||
|
||||
Returns:
|
||||
Dictionary formatted for Bedrock's tool specification.
|
||||
"""
|
||||
return {
|
||||
"toolSpec": {
|
||||
"name": function.name,
|
||||
@@ -29,10 +45,13 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter):
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Converts function schemas to Bedrock's function-calling format.
|
||||
"""Convert function schemas to Bedrock's function-calling format.
|
||||
|
||||
:return: Bedrock formatted function call definition.
|
||||
Args:
|
||||
tools_schema: The tools schema containing functions to convert.
|
||||
|
||||
Returns:
|
||||
List of Bedrock formatted function call definitions.
|
||||
"""
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_bedrock_function_format(func) for func in functions_schema]
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Gemini LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
@@ -11,12 +13,23 @@ from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
|
||||
|
||||
class GeminiLLMAdapter(BaseLLMAdapter):
|
||||
"""LLM adapter for Google's Gemini service.
|
||||
|
||||
Provides tool schema conversion functionality to transform standard tool
|
||||
definitions into Gemini's specific function-calling format for use with
|
||||
Gemini LLM models.
|
||||
"""
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Converts function schemas to Gemini's function-calling format.
|
||||
"""Convert tool schemas to Gemini's function-calling format.
|
||||
|
||||
:return: Gemini formatted function call definition.
|
||||
Args:
|
||||
tools_schema: The tools schema containing standard and custom tool definitions.
|
||||
|
||||
Returns:
|
||||
List of tool definitions formatted for Gemini's function-calling API.
|
||||
Includes both converted standard tools and any custom Gemini-specific tools.
|
||||
"""
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
formatted_standard_tools = [
|
||||
{"function_declarations": [func.to_default_dict() for func in functions_schema]}
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI LLM adapter for Pipecat."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
@@ -12,10 +15,22 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class OpenAILLMAdapter(BaseLLMAdapter):
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[ChatCompletionToolParam]:
|
||||
"""Converts function schemas to OpenAI's function-calling format.
|
||||
"""Adapter for converting tool schemas to OpenAI's format.
|
||||
|
||||
:return: OpenAI formatted function call definition.
|
||||
Provides conversion utilities for transforming Pipecat's standard tool
|
||||
schemas into the format expected by OpenAI's ChatCompletion API for
|
||||
function calling capabilities.
|
||||
"""
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[ChatCompletionToolParam]:
|
||||
"""Convert function schemas to OpenAI's function-calling format.
|
||||
|
||||
Args:
|
||||
tools_schema: The Pipecat tools schema to convert.
|
||||
|
||||
Returns:
|
||||
List of OpenAI formatted function call definitions ready for use
|
||||
with ChatCompletion API.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
@@ -11,8 +14,22 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
"""LLM adapter for OpenAI Realtime API function calling.
|
||||
|
||||
Converts Pipecat's tool schemas into the specific format required by
|
||||
OpenAI's Realtime API for function calling capabilities.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _to_openai_realtime_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
"""Convert a function schema to OpenAI Realtime format.
|
||||
|
||||
Args:
|
||||
function: The function schema to convert.
|
||||
|
||||
Returns:
|
||||
Dictionary in OpenAI Realtime function format.
|
||||
"""
|
||||
return {
|
||||
"type": "function",
|
||||
"name": function.name,
|
||||
@@ -25,10 +42,13 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
}
|
||||
|
||||
def to_provider_tools_format(self, tools_schema: ToolsSchema) -> List[Dict[str, Any]]:
|
||||
"""Converts function schemas to Openai Realtime function-calling format.
|
||||
"""Convert tool schemas to OpenAI Realtime function-calling format.
|
||||
|
||||
:return: Openai Realtime formatted function call definition.
|
||||
Args:
|
||||
tools_schema: The tools schema containing functions to convert.
|
||||
|
||||
Returns:
|
||||
List of function definitions in OpenAI Realtime format.
|
||||
"""
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_openai_realtime_function_format(func) for func in functions_schema]
|
||||
|
||||
@@ -4,44 +4,68 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base audio filter interface for input transport audio processing.
|
||||
|
||||
This module provides the abstract base class for implementing audio filters
|
||||
that process audio data before VAD and downstream processing in input transports.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from pipecat.frames.frames import FilterControlFrame
|
||||
|
||||
|
||||
class BaseAudioFilter(ABC):
|
||||
"""This is a base class for input transport audio filters. If an audio
|
||||
"""Base class for input transport audio filters.
|
||||
|
||||
This is a base class for input transport audio filters. If an audio
|
||||
filter is provided to the input transport it will be used to process audio
|
||||
before VAD and before pushing it downstream. There are control frames to
|
||||
update filter settings or to enable or disable the filter at runtime.
|
||||
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def start(self, sample_rate: int):
|
||||
"""This will be called from the input transport when the transport is
|
||||
"""Initialize the filter when the input transport starts.
|
||||
|
||||
This will be called from the input transport when the transport is
|
||||
started. It can be used to initialize the filter. The input transport
|
||||
sample rate is provided so the filter can adjust to that sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def stop(self):
|
||||
"""This will be called from the input transport when the transport is
|
||||
stopping.
|
||||
"""Clean up the filter when the input transport stops.
|
||||
|
||||
This will be called from the input transport when the transport is
|
||||
stopping.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""This will be called when the input transport receives a
|
||||
"""Process control frames for runtime filter configuration.
|
||||
|
||||
This will be called when the input transport receives a
|
||||
FilterControlFrame.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands or settings.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply the audio filter to the provided audio data.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Filtered audio data as bytes.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Koala noise suppression audio filter for Pipecat.
|
||||
|
||||
This module provides an audio filter implementation using PicoVoice's Koala
|
||||
Noise Suppression engine to reduce background noise in audio streams.
|
||||
"""
|
||||
|
||||
from typing import Sequence
|
||||
|
||||
import numpy as np
|
||||
@@ -21,12 +27,19 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class KoalaFilter(BaseAudioFilter):
|
||||
"""This is an audio filter that uses Koala Noise Suppression (from
|
||||
PicoVoice).
|
||||
"""Audio filter using Koala Noise Suppression from PicoVoice.
|
||||
|
||||
Provides real-time noise suppression for audio streams using PicoVoice's
|
||||
Koala engine. The filter buffers audio data to match Koala's required
|
||||
frame length and processes it in chunks.
|
||||
"""
|
||||
|
||||
def __init__(self, *, access_key: str) -> None:
|
||||
"""Initialize the Koala noise suppression filter.
|
||||
|
||||
Args:
|
||||
access_key: PicoVoice access key for Koala engine authentication.
|
||||
"""
|
||||
self._access_key = access_key
|
||||
|
||||
self._filtering = True
|
||||
@@ -36,6 +49,11 @@ class KoalaFilter(BaseAudioFilter):
|
||||
self._audio_buffer = bytearray()
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the filter with the transport's sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
self._sample_rate = sample_rate
|
||||
if self._sample_rate != self._koala.sample_rate:
|
||||
logger.warning(
|
||||
@@ -44,13 +62,30 @@ class KoalaFilter(BaseAudioFilter):
|
||||
self._koala_ready = False
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up the Koala engine when stopping."""
|
||||
self._koala.reset()
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""Process control frames to enable/disable filtering.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands.
|
||||
"""
|
||||
if isinstance(frame, FilterEnableFrame):
|
||||
self._filtering = frame.enable
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply Koala noise suppression to audio data.
|
||||
|
||||
Buffers incoming audio and processes it in chunks that match Koala's
|
||||
required frame length. Returns filtered audio data.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Noise-suppressed audio data as bytes.
|
||||
"""
|
||||
if not self._koala_ready or not self._filtering:
|
||||
return audio
|
||||
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Krisp noise reduction audio filter for Pipecat.
|
||||
|
||||
This module provides an audio filter implementation using Krisp's noise
|
||||
reduction technology to suppress background noise in audio streams.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
@@ -21,14 +27,27 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class KrispProcessorManager:
|
||||
"""
|
||||
Ensures that only one KrispAudioProcessor instance exists for the entire program.
|
||||
"""Singleton manager for KrispAudioProcessor instances.
|
||||
|
||||
Ensures that only one KrispAudioProcessor instance exists for the entire
|
||||
program.
|
||||
"""
|
||||
|
||||
_krisp_instance = None
|
||||
|
||||
@classmethod
|
||||
def get_processor(cls, sample_rate: int, sample_type: str, channels: int, model_path: str):
|
||||
"""Get or create a KrispAudioProcessor instance.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
sample_type: Audio sample type (e.g., "PCM_16").
|
||||
channels: Number of audio channels.
|
||||
model_path: Path to the Krisp model file.
|
||||
|
||||
Returns:
|
||||
Shared KrispAudioProcessor instance.
|
||||
"""
|
||||
if cls._krisp_instance is None:
|
||||
cls._krisp_instance = KrispAudioProcessor(
|
||||
sample_rate, sample_type, channels, model_path
|
||||
@@ -37,14 +56,26 @@ class KrispProcessorManager:
|
||||
|
||||
|
||||
class KrispFilter(BaseAudioFilter):
|
||||
"""Audio filter using Krisp noise reduction technology.
|
||||
|
||||
Provides real-time noise reduction for audio streams using Krisp's
|
||||
proprietary noise suppression algorithms. Requires a Krisp model file
|
||||
for operation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, sample_type: str = "PCM_16", channels: int = 1, model_path: str = None
|
||||
) -> None:
|
||||
"""Initializes the KrispAudioProcessor with customizable audio processing settings.
|
||||
"""Initialize the Krisp noise reduction filter.
|
||||
|
||||
:param sample_type: The type of audio sample, default is 'PCM_16'.
|
||||
:param channels: Number of audio channels, default is 1.
|
||||
:param model_path: Path to the Krisp model; defaults to environment variable KRISP_MODEL_PATH if not provided.
|
||||
Args:
|
||||
sample_type: The audio sample format. Defaults to "PCM_16".
|
||||
channels: Number of audio channels. Defaults to 1.
|
||||
model_path: Path to the Krisp model file. If None, uses KRISP_MODEL_PATH
|
||||
environment variable.
|
||||
|
||||
Raises:
|
||||
ValueError: If model_path is not provided and KRISP_MODEL_PATH is not set.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
@@ -63,19 +94,41 @@ class KrispFilter(BaseAudioFilter):
|
||||
self._krisp_processor = None
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the Krisp processor with the transport's sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
self._sample_rate = sample_rate
|
||||
self._krisp_processor = KrispProcessorManager.get_processor(
|
||||
self._sample_rate, self._sample_type, self._channels, self._model_path
|
||||
)
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up the Krisp processor when stopping."""
|
||||
self._krisp_processor = None
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""Process control frames to enable/disable filtering.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands.
|
||||
"""
|
||||
if isinstance(frame, FilterEnableFrame):
|
||||
self._filtering = frame.enable
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply Krisp noise reduction to audio data.
|
||||
|
||||
Converts audio to float32, applies Krisp noise reduction processing,
|
||||
and returns the filtered audio clipped to int16 range.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Noise-reduced audio data as bytes.
|
||||
"""
|
||||
if not self._filtering:
|
||||
return audio
|
||||
|
||||
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Noisereduce audio filter for Pipecat.
|
||||
|
||||
This module provides an audio filter implementation using the noisereduce
|
||||
library to reduce background noise in audio streams through spectral
|
||||
gating algorithms.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
@@ -21,21 +28,51 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class NoisereduceFilter(BaseAudioFilter):
|
||||
"""Audio filter using the noisereduce library for noise suppression.
|
||||
|
||||
Applies spectral gating noise reduction algorithms to suppress background
|
||||
noise in audio streams. Uses the noisereduce library's default noise
|
||||
reduction parameters.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the noisereduce filter."""
|
||||
self._filtering = True
|
||||
self._sample_rate = 0
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the filter with the transport's sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the input transport in Hz.
|
||||
"""
|
||||
self._sample_rate = sample_rate
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up the filter when stopping."""
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
|
||||
"""Process control frames to enable/disable filtering.
|
||||
|
||||
Args:
|
||||
frame: The control frame containing filter commands.
|
||||
"""
|
||||
if isinstance(frame, FilterEnableFrame):
|
||||
self._filtering = frame.enable
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
|
||||
"""Apply noise reduction to audio data using spectral gating.
|
||||
|
||||
Converts audio to float32, applies noisereduce processing, and returns
|
||||
the filtered audio clipped to int16 range.
|
||||
|
||||
Args:
|
||||
audio: Raw audio data as bytes to be filtered.
|
||||
|
||||
Returns:
|
||||
Noise-reduced audio data as bytes.
|
||||
"""
|
||||
if not self._filtering:
|
||||
return audio
|
||||
|
||||
|
||||
@@ -4,31 +4,51 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base interruption strategy for determining when users can interrupt bot speech."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseInterruptionStrategy(ABC):
|
||||
"""This is a base class for interruption strategies. Interruption strategies
|
||||
"""Base class for interruption strategies.
|
||||
|
||||
This is a base class for interruption strategies. Interruption strategies
|
||||
decide when the user can interrupt the bot while the bot is speaking. For
|
||||
example, there could be strategies based on audio volume or strategies based
|
||||
on the number of words the user spoke.
|
||||
|
||||
"""
|
||||
|
||||
async def append_audio(self, audio: bytes, sample_rate: int):
|
||||
"""Appends audio to the strategy. Not all strategies handle audio."""
|
||||
"""Append audio data to the strategy for analysis.
|
||||
|
||||
Not all strategies handle audio. Default implementation does nothing.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes to append.
|
||||
sample_rate: Sample rate of the audio data in Hz.
|
||||
"""
|
||||
pass
|
||||
|
||||
async def append_text(self, text: str):
|
||||
"""Appends text to the strategy. Not all strategies handle text."""
|
||||
"""Append text data to the strategy for analysis.
|
||||
|
||||
Not all strategies handle text. Default implementation does nothing.
|
||||
|
||||
Args:
|
||||
text: Text string to append for analysis.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def should_interrupt(self) -> bool:
|
||||
"""This is called when the user stops speaking and it's time to decide
|
||||
"""Determine if the user should interrupt the bot.
|
||||
|
||||
This is called when the user stops speaking and it's time to decide
|
||||
whether the user should interrupt the bot. The decision will be based on
|
||||
the aggregated audio and/or text.
|
||||
|
||||
Returns:
|
||||
True if the user should interrupt the bot, False otherwise.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@@ -4,31 +4,47 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Minimum words interruption strategy for word count-based interruptions."""
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
|
||||
|
||||
|
||||
class MinWordsInterruptionStrategy(BaseInterruptionStrategy):
|
||||
"""This is an interruption strategy based on a minimum number of words said
|
||||
"""Interruption strategy based on minimum number of words spoken.
|
||||
|
||||
This is an interruption strategy based on a minimum number of words said
|
||||
by the user. That is, the strategy will be true if the user has said at
|
||||
least that amount of words.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *, min_words: int):
|
||||
"""Initialize the minimum words interruption strategy.
|
||||
|
||||
Args:
|
||||
min_words: Minimum number of words required to trigger an interruption.
|
||||
"""
|
||||
super().__init__()
|
||||
self._min_words = min_words
|
||||
self._text = ""
|
||||
|
||||
async def append_text(self, text: str):
|
||||
"""Appends text for later analysis. Not all strategies need to handle
|
||||
text.
|
||||
"""Append text for word count analysis.
|
||||
|
||||
Args:
|
||||
text: Text string to append to the accumulated text.
|
||||
|
||||
Note: Not all strategies need to handle text.
|
||||
"""
|
||||
self._text += text
|
||||
|
||||
async def should_interrupt(self) -> bool:
|
||||
"""Check if the minimum word count has been reached.
|
||||
|
||||
Returns:
|
||||
True if the user has spoken at least the minimum number of words.
|
||||
"""
|
||||
word_count = len(self._text.split())
|
||||
interrupt = word_count >= self._min_words
|
||||
logger.debug(
|
||||
@@ -37,4 +53,5 @@ class MinWordsInterruptionStrategy(BaseInterruptionStrategy):
|
||||
return interrupt
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the accumulated text for the next analysis cycle."""
|
||||
self._text = ""
|
||||
|
||||
@@ -4,50 +4,73 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base audio mixer for output transport integration.
|
||||
|
||||
Provides the abstract base class for audio mixers that can be integrated with
|
||||
output transports to mix incoming audio with generated audio from the mixer.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from pipecat.frames.frames import MixerControlFrame
|
||||
|
||||
|
||||
class BaseAudioMixer(ABC):
|
||||
"""This is a base class for output transport audio mixers. If an audio mixer
|
||||
"""Base class for output transport audio mixers.
|
||||
|
||||
This is a base class for output transport audio mixers. If an audio mixer
|
||||
is provided to the output transport it will be used to mix the audio frames
|
||||
coming into to the transport with the audio generated from the mixer. There
|
||||
are control frames to update mixer settings or to enable or disable the
|
||||
mixer at runtime.
|
||||
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def start(self, sample_rate: int):
|
||||
"""This will be called from the output transport when the transport is
|
||||
"""Initialize the mixer when the output transport starts.
|
||||
|
||||
This will be called from the output transport when the transport is
|
||||
started. It can be used to initialize the mixer. The output transport
|
||||
sample rate is provided so the mixer can adjust to that sample rate.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the output transport in Hz.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def stop(self):
|
||||
"""This will be called from the output transport when the transport is
|
||||
stopping.
|
||||
"""Clean up the mixer when the output transport stops.
|
||||
|
||||
This will be called from the output transport when the transport is
|
||||
stopping.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def process_frame(self, frame: MixerControlFrame):
|
||||
"""This will be called when the output transport receives a
|
||||
"""Process mixer control frames from the transport.
|
||||
|
||||
This will be called when the output transport receives a
|
||||
MixerControlFrame.
|
||||
|
||||
Args:
|
||||
frame: The mixer control frame to process.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def mix(self, audio: bytes) -> bytes:
|
||||
"""This is called with the audio that is about to be sent from the
|
||||
"""Mix transport audio with mixer-generated audio.
|
||||
|
||||
This is called with the audio that is about to be sent from the
|
||||
output transport and that should be mixed with the mixer audio if the
|
||||
mixer is enabled.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes from the transport to mix.
|
||||
|
||||
Returns:
|
||||
Mixed audio bytes combining transport and mixer audio.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Soundfile-based audio mixer for file playback integration.
|
||||
|
||||
Provides an audio mixer that combines incoming audio with audio loaded from
|
||||
files using the soundfile library. Supports multiple audio formats and
|
||||
runtime configuration changes.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Dict, Mapping
|
||||
|
||||
@@ -24,7 +31,9 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class SoundfileMixer(BaseAudioMixer):
|
||||
"""This is an audio mixer that mixes incoming audio with audio from a
|
||||
"""Audio mixer that combines incoming audio with file-based audio.
|
||||
|
||||
This is an audio mixer that mixes incoming audio with audio from a
|
||||
file. It uses the soundfile library to load files so it supports multiple
|
||||
formats. The audio files need to only have one channel (mono) and it needs
|
||||
to match the sample rate of the output transport.
|
||||
@@ -33,7 +42,6 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
`MixerUpdateSettingsFrame` has the following settings available: `sound`
|
||||
(str) and `volume` (float) to be able to update to a different sound file or
|
||||
to change the volume at runtime.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -46,6 +54,16 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
loop: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the soundfile mixer.
|
||||
|
||||
Args:
|
||||
sound_files: Mapping of sound names to file paths for loading.
|
||||
default_sound: Name of the default sound to play initially.
|
||||
volume: Mixing volume level (0.0 to 1.0). Defaults to 0.4.
|
||||
mixing: Whether mixing is initially enabled. Defaults to True.
|
||||
loop: Whether to loop audio files when they end. Defaults to True.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._sound_files = sound_files
|
||||
self._volume = volume
|
||||
@@ -58,14 +76,28 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
self._loop = loop
|
||||
|
||||
async def start(self, sample_rate: int):
|
||||
"""Initialize the mixer and load all sound files.
|
||||
|
||||
Args:
|
||||
sample_rate: The sample rate of the output transport in Hz.
|
||||
"""
|
||||
self._sample_rate = sample_rate
|
||||
for sound_name, file_name in self._sound_files.items():
|
||||
await asyncio.to_thread(self._load_sound_file, sound_name, file_name)
|
||||
|
||||
async def stop(self):
|
||||
"""Clean up mixer resources.
|
||||
|
||||
Currently performs no cleanup as sound data is managed by garbage collection.
|
||||
"""
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: MixerControlFrame):
|
||||
"""Process mixer control frames to update settings or enable/disable mixing.
|
||||
|
||||
Args:
|
||||
frame: The mixer control frame to process.
|
||||
"""
|
||||
if isinstance(frame, MixerUpdateSettingsFrame):
|
||||
await self._update_settings(frame)
|
||||
elif isinstance(frame, MixerEnableFrame):
|
||||
@@ -73,12 +105,22 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
pass
|
||||
|
||||
async def mix(self, audio: bytes) -> bytes:
|
||||
"""Mix transport audio with the current sound file.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes from the transport to mix.
|
||||
|
||||
Returns:
|
||||
Mixed audio bytes combining transport and file audio.
|
||||
"""
|
||||
return self._mix_with_sound(audio)
|
||||
|
||||
async def _enable_mixing(self, enable: bool):
|
||||
"""Enable or disable audio mixing."""
|
||||
self._mixing = enable
|
||||
|
||||
async def _update_settings(self, frame: MixerUpdateSettingsFrame):
|
||||
"""Update mixer settings from a control frame."""
|
||||
for setting, value in frame.settings.items():
|
||||
match setting:
|
||||
case "sound":
|
||||
@@ -89,6 +131,11 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
await self._update_loop(value)
|
||||
|
||||
async def _change_sound(self, sound: str):
|
||||
"""Change the currently playing sound file.
|
||||
|
||||
Args:
|
||||
sound: Name of the sound file to switch to.
|
||||
"""
|
||||
if sound in self._sound_files:
|
||||
self._current_sound = sound
|
||||
self._sound_pos = 0
|
||||
@@ -96,12 +143,15 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
logger.error(f"Sound {sound} is not available")
|
||||
|
||||
async def _update_volume(self, volume: float):
|
||||
"""Update the mixing volume level."""
|
||||
self._volume = volume
|
||||
|
||||
async def _update_loop(self, loop: bool):
|
||||
"""Update the looping behavior."""
|
||||
self._loop = loop
|
||||
|
||||
def _load_sound_file(self, sound_name: str, file_name: str):
|
||||
"""Load an audio file into memory for mixing."""
|
||||
try:
|
||||
logger.debug(f"Loading mixer sound from {file_name}")
|
||||
sound, sample_rate = sf.read(file_name, dtype="int16")
|
||||
@@ -118,10 +168,7 @@ class SoundfileMixer(BaseAudioMixer):
|
||||
logger.error(f"Unable to open file {file_name}: {e}")
|
||||
|
||||
def _mix_with_sound(self, audio: bytes):
|
||||
"""Mixes raw audio frames with chunks of the same length from the sound
|
||||
file.
|
||||
|
||||
"""
|
||||
"""Mix raw audio frames with chunks of the same length from the sound file."""
|
||||
if not self._mixing or not self._current_sound in self._sounds:
|
||||
return audio
|
||||
|
||||
|
||||
@@ -4,27 +4,35 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base audio resampler interface for Pipecat.
|
||||
|
||||
This module defines the abstract base class for audio resampling implementations,
|
||||
providing a common interface for converting audio between different sample rates.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseAudioResampler(ABC):
|
||||
"""Abstract base class for audio resampling. This class defines an
|
||||
interface for audio resampling implementations.
|
||||
"""Abstract base class for audio resampling implementations.
|
||||
|
||||
This class defines the interface that all audio resampling implementations
|
||||
must follow, providing a standardized way to convert audio data between
|
||||
different sample rates.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""
|
||||
Resamples the given audio data to a different sample rate.
|
||||
"""Resamples the given audio data to a different sample rate.
|
||||
|
||||
This is an abstract method that must be implemented in subclasses.
|
||||
|
||||
Parameters:
|
||||
audio (bytes): The audio data to be resampled, represented as a byte string.
|
||||
in_rate (int): The original sample rate of the audio data (in Hz).
|
||||
out_rate (int): The desired sample rate for the resampled audio data (in Hz).
|
||||
Args:
|
||||
audio: The audio data to be resampled, as raw bytes.
|
||||
in_rate: The original sample rate of the audio data in Hz.
|
||||
out_rate: The desired sample rate for the output audio in Hz.
|
||||
|
||||
Returns:
|
||||
bytes: The resampled audio data as a byte string.
|
||||
The resampled audio data as raw bytes.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Resampy-based audio resampler implementation.
|
||||
|
||||
This module provides an audio resampler that uses the resampy library
|
||||
for high-quality audio sample rate conversion.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import resampy
|
||||
|
||||
@@ -11,12 +17,31 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
|
||||
|
||||
class ResampyResampler(BaseAudioResampler):
|
||||
"""Audio resampler implementation using the resampy library."""
|
||||
"""Audio resampler implementation using the resampy library.
|
||||
|
||||
This resampler uses the resampy library's Kaiser windowing filter
|
||||
for high-quality audio resampling with good performance characteristics.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize the resampy resampler.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments (currently unused).
|
||||
"""
|
||||
pass
|
||||
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""Resample audio data using resampy library.
|
||||
|
||||
Args:
|
||||
audio: Input audio data as raw bytes (16-bit signed integers).
|
||||
in_rate: Original sample rate in Hz.
|
||||
out_rate: Target sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
Resampled audio data as raw bytes (16-bit signed integers).
|
||||
"""
|
||||
if in_rate == out_rate:
|
||||
return audio
|
||||
audio_data = np.frombuffer(audio, dtype=np.int16)
|
||||
|
||||
@@ -4,6 +4,17 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""SoX-based audio resampler implementation.
|
||||
|
||||
This module provides an audio resampler that uses the SoX resampler library
|
||||
for very high-quality audio sample rate conversion.
|
||||
|
||||
When to use the SOXRAudioResampler:
|
||||
1. For batch processing of complete audio files
|
||||
2. When you have all the audio data available at once
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import soxr
|
||||
|
||||
@@ -11,12 +22,32 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
|
||||
|
||||
class SOXRAudioResampler(BaseAudioResampler):
|
||||
"""Audio resampler implementation using the SoX resampler library."""
|
||||
"""Audio resampler implementation using the SoX resampler library.
|
||||
|
||||
This resampler uses the SoX resampler library configured for very high
|
||||
quality (VHQ) resampling, providing excellent audio quality at the cost
|
||||
of additional computational overhead.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize the SoX audio resampler.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments (currently unused).
|
||||
"""
|
||||
pass
|
||||
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""Resample audio data using SoX resampler library.
|
||||
|
||||
Args:
|
||||
audio: Input audio data as raw bytes (16-bit signed integers).
|
||||
in_rate: Original sample rate in Hz.
|
||||
out_rate: Target sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
Resampled audio data as raw bytes (16-bit signed integers).
|
||||
"""
|
||||
if in_rate == out_rate:
|
||||
return audio
|
||||
audio_data = np.frombuffer(audio, dtype=np.int16)
|
||||
|
||||
101
src/pipecat/audio/resamplers/soxr_stream_resampler.py
Normal file
101
src/pipecat/audio/resamplers/soxr_stream_resampler.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""SoX-based audio resampler stream implementation.
|
||||
|
||||
This module provides an audio resampler that uses the SoX ResampleStream library
|
||||
for very high quality audio sample rate conversion.
|
||||
|
||||
When to use the SOXRStreamAudioResampler:
|
||||
1. For real-time processing scenarios
|
||||
2. When dealing with very long audio signals
|
||||
3. When processing audio in chunks or streams
|
||||
4. When you need to reuse the same resampler configuration multiple times, as it saves initialization overhead
|
||||
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import soxr
|
||||
|
||||
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
|
||||
CLEAR_STREAM_AFTER_SECS = 0.2
|
||||
|
||||
|
||||
class SOXRStreamAudioResampler(BaseAudioResampler):
|
||||
"""Audio resampler implementation using the SoX ResampleStream library.
|
||||
|
||||
This resampler uses the SoX ResampleStream library configured for very high
|
||||
quality (VHQ) resampling, providing excellent audio quality at the cost
|
||||
of additional computational overhead.
|
||||
It keeps an internal history which avoids clicks at chunk boundaries.
|
||||
|
||||
Notes:
|
||||
- Only supports mono audio (1 channel).
|
||||
- Input must be 16-bit signed PCM audio as raw bytes.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize the resampler.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments (currently unused).
|
||||
"""
|
||||
self._in_rate: float | None = None
|
||||
self._out_rate: float | None = None
|
||||
self._last_resample_time: float = 0
|
||||
self._soxr_stream: soxr.ResampleStream | None = None
|
||||
|
||||
def _initialize(self, in_rate: float, out_rate: float):
|
||||
self._in_rate = in_rate
|
||||
self._out_rate = out_rate
|
||||
self._last_resample_time = time.time()
|
||||
self._soxr_stream = soxr.ResampleStream(
|
||||
in_rate=in_rate, out_rate=out_rate, num_channels=1, quality="VHQ", dtype="int16"
|
||||
)
|
||||
|
||||
def _maybe_clear_internal_state(self):
|
||||
current_time = time.time()
|
||||
time_since_last_resample = current_time - self._last_resample_time
|
||||
# If more than CLEAR_STREAM_AFTER_SECS seconds have passed, clear the resampler state
|
||||
if time_since_last_resample > CLEAR_STREAM_AFTER_SECS:
|
||||
if self._soxr_stream:
|
||||
self._soxr_stream.clear()
|
||||
self._last_resample_time = current_time
|
||||
|
||||
def _maybe_initialize_sox_stream(self, in_rate: int, out_rate: int):
|
||||
if self._soxr_stream is None:
|
||||
self._initialize(in_rate, out_rate)
|
||||
else:
|
||||
self._maybe_clear_internal_state()
|
||||
|
||||
if self._in_rate != in_rate or self._out_rate != out_rate:
|
||||
raise ValueError(
|
||||
f"SOXRStreamAudioResampler cannot be reused with different sample rates: "
|
||||
f"expected {self._in_rate}->{self._out_rate}, got {in_rate}->{out_rate}"
|
||||
)
|
||||
|
||||
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
||||
"""Resample audio data using soxr.ResampleStream resampler library.
|
||||
|
||||
Args:
|
||||
audio: Input audio data as raw bytes (16-bit signed integers).
|
||||
in_rate: Original sample rate in Hz.
|
||||
out_rate: Target sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
Resampled audio data as raw bytes (16-bit signed integers).
|
||||
"""
|
||||
if in_rate == out_rate:
|
||||
return audio
|
||||
|
||||
self._maybe_initialize_sox_stream(in_rate, out_rate)
|
||||
audio_data = np.frombuffer(audio, dtype=np.int16)
|
||||
resampled_audio = self._soxr_stream.resample_chunk(audio_data)
|
||||
result = resampled_audio.astype(np.int16).tobytes()
|
||||
return result
|
||||
@@ -4,6 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base turn analyzer for determining end-of-turn in audio conversations.
|
||||
|
||||
This module provides the abstract base class and enumeration for analyzing
|
||||
when a user has finished speaking in a conversation.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Optional, Tuple
|
||||
@@ -12,6 +18,13 @@ from pipecat.metrics.metrics import MetricsData
|
||||
|
||||
|
||||
class EndOfTurnState(Enum):
|
||||
"""State enumeration for end-of-turn analysis results.
|
||||
|
||||
Parameters:
|
||||
COMPLETE: The user has finished their turn and stopped speaking.
|
||||
INCOMPLETE: The user is still speaking or may continue speaking.
|
||||
"""
|
||||
|
||||
COMPLETE = 1
|
||||
INCOMPLETE = 2
|
||||
|
||||
@@ -24,6 +37,12 @@ class BaseTurnAnalyzer(ABC):
|
||||
"""
|
||||
|
||||
def __init__(self, *, sample_rate: Optional[int] = None):
|
||||
"""Initialize the turn analyzer.
|
||||
|
||||
Args:
|
||||
sample_rate: Optional initial sample rate for audio processing.
|
||||
If provided, this will be used as the fixed sample rate.
|
||||
"""
|
||||
self._init_sample_rate = sample_rate
|
||||
self._sample_rate = 0
|
||||
|
||||
@@ -78,3 +97,8 @@ class BaseTurnAnalyzer(ABC):
|
||||
EndOfTurnState: The result of the end of turn analysis.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def clear(self):
|
||||
"""Reset the turn analyzer to its initial state."""
|
||||
pass
|
||||
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Smart turn analyzer base class using ML models for end-of-turn detection.
|
||||
|
||||
This module provides the base implementation for smart turn analyzers that use
|
||||
machine learning models to determine when a user has finished speaking, going
|
||||
beyond simple silence-based detection.
|
||||
"""
|
||||
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
@@ -23,6 +30,14 @@ USE_ONLY_LAST_VAD_SEGMENT = True
|
||||
|
||||
|
||||
class SmartTurnParams(BaseModel):
|
||||
"""Configuration parameters for smart turn analysis.
|
||||
|
||||
Parameters:
|
||||
stop_secs: Maximum silence duration in seconds before ending turn.
|
||||
pre_speech_ms: Milliseconds of audio to include before speech starts.
|
||||
max_duration_secs: Maximum duration in seconds for audio segments.
|
||||
"""
|
||||
|
||||
stop_secs: float = STOP_SECS
|
||||
pre_speech_ms: float = PRE_SPEECH_MS
|
||||
max_duration_secs: float = MAX_DURATION_SECONDS
|
||||
@@ -31,13 +46,28 @@ class SmartTurnParams(BaseModel):
|
||||
|
||||
|
||||
class SmartTurnTimeoutException(Exception):
|
||||
"""Exception raised when smart turn analysis times out."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
"""Base class for smart turn analyzers using ML models.
|
||||
|
||||
Provides common functionality for smart turn detection including audio
|
||||
buffering, speech tracking, and ML model integration. Subclasses must
|
||||
implement the specific model prediction logic.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, *, sample_rate: Optional[int] = None, params: Optional[SmartTurnParams] = None
|
||||
):
|
||||
"""Initialize the smart turn analyzer.
|
||||
|
||||
Args:
|
||||
sample_rate: Optional sample rate for audio processing.
|
||||
params: Configuration parameters for turn analysis behavior.
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate)
|
||||
self._params = params or SmartTurnParams()
|
||||
# Configuration
|
||||
@@ -50,9 +80,23 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
|
||||
@property
|
||||
def speech_triggered(self) -> bool:
|
||||
"""Check if speech has been detected and triggered analysis.
|
||||
|
||||
Returns:
|
||||
True if speech has been detected and turn analysis is active.
|
||||
"""
|
||||
return self._speech_triggered
|
||||
|
||||
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
|
||||
"""Append audio data for turn analysis.
|
||||
|
||||
Args:
|
||||
buffer: Raw audio data bytes to append for analysis.
|
||||
is_speech: Whether the audio buffer contains detected speech.
|
||||
|
||||
Returns:
|
||||
Current end-of-turn state after processing the audio.
|
||||
"""
|
||||
# Convert raw audio to float32 format and append to the buffer
|
||||
audio_int16 = np.frombuffer(buffer, dtype=np.int16)
|
||||
audio_float32 = np.frombuffer(audio_int16, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
@@ -92,13 +136,24 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
return state
|
||||
|
||||
async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
"""Analyze the current audio state to determine if turn has ended.
|
||||
|
||||
Returns:
|
||||
Tuple containing the end-of-turn state and optional metrics data
|
||||
from the ML model analysis.
|
||||
"""
|
||||
state, result = await self._process_speech_segment(self._audio_buffer)
|
||||
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
||||
self._clear(state)
|
||||
logger.debug(f"End of Turn result: {state}")
|
||||
return state, result
|
||||
|
||||
def clear(self):
|
||||
"""Reset the turn analyzer to its initial state."""
|
||||
self._clear(EndOfTurnState.COMPLETE)
|
||||
|
||||
def _clear(self, turn_state: EndOfTurnState):
|
||||
"""Clear internal state based on turn completion status."""
|
||||
# If the state is still incomplete, keep the _speech_triggered as True
|
||||
self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE
|
||||
self._audio_buffer = []
|
||||
@@ -108,6 +163,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
async def _process_speech_segment(
|
||||
self, audio_buffer
|
||||
) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
"""Process accumulated audio segment using ML model."""
|
||||
state = EndOfTurnState.INCOMPLETE
|
||||
|
||||
if not audio_buffer:
|
||||
@@ -185,14 +241,5 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
|
||||
@abstractmethod
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Abstract method to predict if a turn has ended based on audio.
|
||||
|
||||
Args:
|
||||
audio_array: Float32 numpy array of audio samples at 16kHz.
|
||||
|
||||
Returns:
|
||||
Dictionary with:
|
||||
- prediction: 1 if turn is complete, else 0
|
||||
- probability: Confidence of the prediction
|
||||
"""
|
||||
"""Predict end-of-turn using ML model from audio data."""
|
||||
pass
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Fal.ai smart turn analyzer implementation.
|
||||
|
||||
This module provides a smart turn analyzer that uses Fal.ai's hosted smart-turn model
|
||||
for end-of-turn detection in conversations.
|
||||
|
||||
Note: To learn more about the smart-turn model, visit:
|
||||
- https://fal.ai/models/fal-ai/smart-turn/playground
|
||||
- https://github.com/pipecat-ai/smart-turn
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
@@ -12,6 +22,12 @@ from pipecat.audio.turn.smart_turn.http_smart_turn import HttpSmartTurnAnalyzer
|
||||
|
||||
|
||||
class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
|
||||
"""Smart turn analyzer using Fal.ai's hosted smart-turn model.
|
||||
|
||||
Extends HttpSmartTurnAnalyzer to provide integration with Fal.ai's
|
||||
smart turn detection API endpoint with proper authentication.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
@@ -20,6 +36,14 @@ class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
|
||||
api_key: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Fal.ai smart turn analyzer.
|
||||
|
||||
Args:
|
||||
aiohttp_session: HTTP client session for making API requests.
|
||||
url: Fal.ai API endpoint URL for smart turn detection.
|
||||
api_key: API key for authenticating with Fal.ai service.
|
||||
**kwargs: Additional arguments passed to parent HttpSmartTurnAnalyzer.
|
||||
"""
|
||||
headers = {}
|
||||
if api_key:
|
||||
headers = {"Authorization": f"Key {api_key}"}
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""HTTP-based smart turn analyzer for remote ML inference.
|
||||
|
||||
This module provides a smart turn analyzer that sends audio data to remote
|
||||
HTTP endpoints for ML-based end-of-turn detection.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from typing import Any, Dict, Optional
|
||||
@@ -16,6 +22,12 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn, SmartTu
|
||||
|
||||
|
||||
class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
"""Smart turn analyzer using HTTP-based ML inference.
|
||||
|
||||
Sends audio data to remote HTTP endpoints for ML-based end-of-turn
|
||||
prediction. Handles serialization, HTTP communication, and error recovery.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
@@ -24,12 +36,21 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the HTTP smart turn analyzer.
|
||||
|
||||
Args:
|
||||
url: HTTP endpoint URL for the smart turn ML service.
|
||||
aiohttp_session: HTTP client session for making requests.
|
||||
headers: Optional HTTP headers to include in requests.
|
||||
**kwargs: Additional arguments passed to BaseSmartTurn.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._url = url
|
||||
self._headers = headers or {}
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
def _serialize_array(self, audio_array: np.ndarray) -> bytes:
|
||||
"""Serialize NumPy audio array to bytes for HTTP transmission."""
|
||||
logger.trace("Serializing NumPy array to bytes...")
|
||||
buffer = io.BytesIO()
|
||||
np.save(buffer, audio_array)
|
||||
@@ -38,6 +59,7 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
return serialized_bytes
|
||||
|
||||
async def _send_raw_request(self, data_bytes: bytes) -> Dict[str, Any]:
|
||||
"""Send raw audio data to the HTTP endpoint for prediction."""
|
||||
headers = {"Content-Type": "application/octet-stream"}
|
||||
headers.update(self._headers)
|
||||
|
||||
@@ -83,6 +105,7 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using remote HTTP ML service."""
|
||||
try:
|
||||
serialized_array = self._serialize_array(audio_array)
|
||||
return await self._send_raw_request(serialized_array)
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Local CoreML smart turn analyzer for on-device ML inference.
|
||||
|
||||
This module provides a smart turn analyzer that uses CoreML models for
|
||||
local end-of-turn detection without requiring network connectivity.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
@@ -25,7 +30,24 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
|
||||
"""Local smart turn analyzer using CoreML models.
|
||||
|
||||
Provides end-of-turn detection using locally-stored CoreML models,
|
||||
enabling offline operation without network dependencies. Optimized
|
||||
for Apple Silicon and other CoreML-compatible hardware.
|
||||
"""
|
||||
|
||||
def __init__(self, *, smart_turn_model_path: str, **kwargs):
|
||||
"""Initialize the local CoreML smart turn analyzer.
|
||||
|
||||
Args:
|
||||
smart_turn_model_path: Path to directory containing the CoreML model
|
||||
and feature extractor files.
|
||||
**kwargs: Additional arguments passed to BaseSmartTurn.
|
||||
|
||||
Raises:
|
||||
Exception: If smart_turn_model_path is not provided or model loading fails.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if not smart_turn_model_path:
|
||||
@@ -41,6 +63,7 @@ class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
|
||||
logger.debug("Loaded Local Smart Turn")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local CoreML model."""
|
||||
inputs = self._turn_processor(
|
||||
audio_array,
|
||||
sampling_rate=16000,
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Local PyTorch smart turn analyzer for on-device ML inference.
|
||||
|
||||
This module provides a smart turn analyzer that uses PyTorch models for
|
||||
local end-of-turn detection without requiring network connectivity.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
@@ -24,7 +29,21 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
||||
"""Local smart turn analyzer using PyTorch models.
|
||||
|
||||
Provides end-of-turn detection using locally-stored PyTorch models,
|
||||
enabling offline operation without network dependencies. Uses
|
||||
Wav2Vec2-BERT architecture for audio sequence classification.
|
||||
"""
|
||||
|
||||
def __init__(self, *, smart_turn_model_path: str, **kwargs):
|
||||
"""Initialize the local PyTorch smart turn analyzer.
|
||||
|
||||
Args:
|
||||
smart_turn_model_path: Path to directory containing the PyTorch model
|
||||
and feature extractor files. If empty, uses default HuggingFace model.
|
||||
**kwargs: Additional arguments passed to BaseSmartTurn.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if not smart_turn_model_path:
|
||||
@@ -46,6 +65,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
||||
logger.debug("Loaded Local Smart Turn")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local PyTorch model."""
|
||||
inputs = self._turn_processor(
|
||||
audio_array,
|
||||
sampling_rate=16000,
|
||||
|
||||
@@ -4,21 +4,87 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Audio utility functions for Pipecat.
|
||||
|
||||
This module provides common audio processing utilities including mixing,
|
||||
format conversion, volume calculation, and codec transformations for
|
||||
various audio formats used in Pipecat pipelines.
|
||||
"""
|
||||
|
||||
import audioop
|
||||
|
||||
import numpy as np
|
||||
import pyloudnorm as pyln
|
||||
import soxr
|
||||
|
||||
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
|
||||
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
|
||||
|
||||
|
||||
def create_default_resampler(**kwargs) -> BaseAudioResampler:
|
||||
"""Create a default audio resampler instance.
|
||||
|
||||
. deprecated:: 0.0.74
|
||||
This function is deprecated and will be removed in a future version.
|
||||
Use `create_stream_resampler` for real-time processing scenarios or
|
||||
`create_file_resampler` for batch processing of complete audio files.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
||||
|
||||
Returns:
|
||||
A configured SOXRAudioResampler instance.
|
||||
"""
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"`create_default_resampler` is deprecated. "
|
||||
"Use `create_stream_resampler` for real-time processing scenarios or "
|
||||
"`create_file_resampler` for batch processing of complete audio files.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return SOXRAudioResampler(**kwargs)
|
||||
|
||||
|
||||
def create_file_resampler(**kwargs) -> BaseAudioResampler:
|
||||
"""Create an audio resampler instance for batch processing of complete audio files.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
||||
|
||||
Returns:
|
||||
A configured SOXRAudioResampler instance.
|
||||
"""
|
||||
return SOXRAudioResampler(**kwargs)
|
||||
|
||||
|
||||
def create_stream_resampler(**kwargs) -> BaseAudioResampler:
|
||||
"""Create a stream audio resampler instance.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
||||
|
||||
Returns:
|
||||
A configured SOXRStreamAudioResampler instance.
|
||||
"""
|
||||
return SOXRStreamAudioResampler(**kwargs)
|
||||
|
||||
|
||||
def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
|
||||
"""Mix two audio streams together by adding their samples.
|
||||
|
||||
Both audio streams are assumed to be 16-bit signed integer PCM data.
|
||||
If the streams have different lengths, the shorter one is zero-padded
|
||||
to match the longer stream.
|
||||
|
||||
Args:
|
||||
audio1: First audio stream as raw bytes (16-bit signed integers).
|
||||
audio2: Second audio stream as raw bytes (16-bit signed integers).
|
||||
|
||||
Returns:
|
||||
Mixed audio data as raw bytes with samples clipped to 16-bit range.
|
||||
"""
|
||||
data1 = np.frombuffer(audio1, dtype=np.int16)
|
||||
data2 = np.frombuffer(audio2, dtype=np.int16)
|
||||
|
||||
@@ -37,6 +103,19 @@ def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
|
||||
|
||||
|
||||
def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
|
||||
"""Interleave left and right mono audio channels into stereo audio.
|
||||
|
||||
Takes two mono audio streams and combines them into a single stereo
|
||||
stream by interleaving the samples (L, R, L, R, ...). If the channels
|
||||
have different lengths, both are truncated to the shorter length.
|
||||
|
||||
Args:
|
||||
left_audio: Left channel audio as raw bytes (16-bit signed integers).
|
||||
right_audio: Right channel audio as raw bytes (16-bit signed integers).
|
||||
|
||||
Returns:
|
||||
Interleaved stereo audio data as raw bytes.
|
||||
"""
|
||||
left = np.frombuffer(left_audio, dtype=np.int16)
|
||||
right = np.frombuffer(right_audio, dtype=np.int16)
|
||||
|
||||
@@ -50,12 +129,34 @@ def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
|
||||
|
||||
|
||||
def normalize_value(value, min_value, max_value):
|
||||
"""Normalize a value to the range [0, 1] and clamp it to bounds.
|
||||
|
||||
Args:
|
||||
value: The value to normalize.
|
||||
min_value: The minimum value of the input range.
|
||||
max_value: The maximum value of the input range.
|
||||
|
||||
Returns:
|
||||
Normalized value clamped to the range [0, 1].
|
||||
"""
|
||||
normalized = (value - min_value) / (max_value - min_value)
|
||||
normalized_clamped = max(0, min(1, normalized))
|
||||
return normalized_clamped
|
||||
|
||||
|
||||
def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
|
||||
"""Calculate the loudness level of audio data using EBU R128 standard.
|
||||
|
||||
Uses the pyloudnorm library to calculate integrated loudness according
|
||||
to the EBU R128 recommendation, then normalizes the result to [0, 1].
|
||||
|
||||
Args:
|
||||
audio: Audio data as raw bytes (16-bit signed integers).
|
||||
sample_rate: Sample rate of the audio in Hz.
|
||||
|
||||
Returns:
|
||||
Normalized loudness value between 0 (quiet) and 1 (loud).
|
||||
"""
|
||||
audio_np = np.frombuffer(audio, dtype=np.int16)
|
||||
audio_float = audio_np.astype(np.float64)
|
||||
|
||||
@@ -71,12 +172,37 @@ def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
|
||||
|
||||
|
||||
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
|
||||
"""Apply exponential smoothing to a value.
|
||||
|
||||
Exponential smoothing is used to reduce noise in time-series data by
|
||||
giving more weight to recent values while still considering historical data.
|
||||
|
||||
Args:
|
||||
value: The new value to incorporate.
|
||||
prev_value: The previous smoothed value.
|
||||
factor: Smoothing factor between 0 and 1. Higher values give more
|
||||
weight to the new value.
|
||||
|
||||
Returns:
|
||||
The exponentially smoothed value.
|
||||
"""
|
||||
return prev_value + factor * (value - prev_value)
|
||||
|
||||
|
||||
async def ulaw_to_pcm(
|
||||
ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
||||
):
|
||||
"""Convert μ-law encoded audio to PCM and optionally resample.
|
||||
|
||||
Args:
|
||||
ulaw_bytes: μ-law encoded audio data as raw bytes.
|
||||
in_rate: Original sample rate of the μ-law audio in Hz.
|
||||
out_rate: Desired output sample rate in Hz.
|
||||
resampler: Audio resampler instance for rate conversion.
|
||||
|
||||
Returns:
|
||||
PCM audio data as raw bytes at the specified output rate.
|
||||
"""
|
||||
# Convert μ-law to PCM
|
||||
in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2)
|
||||
|
||||
@@ -87,6 +213,17 @@ async def ulaw_to_pcm(
|
||||
|
||||
|
||||
async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
"""Convert PCM audio to μ-law encoding and optionally resample.
|
||||
|
||||
Args:
|
||||
pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
|
||||
in_rate: Original sample rate of the PCM audio in Hz.
|
||||
out_rate: Desired output sample rate in Hz.
|
||||
resampler: Audio resampler instance for rate conversion.
|
||||
|
||||
Returns:
|
||||
μ-law encoded audio data as raw bytes at the specified output rate.
|
||||
"""
|
||||
# Resample
|
||||
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
|
||||
@@ -99,6 +236,17 @@ async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
|
||||
async def alaw_to_pcm(
|
||||
alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
||||
) -> bytes:
|
||||
"""Convert A-law encoded audio to PCM and optionally resample.
|
||||
|
||||
Args:
|
||||
alaw_bytes: A-law encoded audio data as raw bytes.
|
||||
in_rate: Original sample rate of the A-law audio in Hz.
|
||||
out_rate: Desired output sample rate in Hz.
|
||||
resampler: Audio resampler instance for rate conversion.
|
||||
|
||||
Returns:
|
||||
PCM audio data as raw bytes at the specified output rate.
|
||||
"""
|
||||
# Convert a-law to PCM
|
||||
in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2)
|
||||
|
||||
@@ -109,6 +257,17 @@ async def alaw_to_pcm(
|
||||
|
||||
|
||||
async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
||||
"""Convert PCM audio to A-law encoding and optionally resample.
|
||||
|
||||
Args:
|
||||
pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
|
||||
in_rate: Original sample rate of the PCM audio in Hz.
|
||||
out_rate: Desired output sample rate in Hz.
|
||||
resampler: Audio resampler instance for rate conversion.
|
||||
|
||||
Returns:
|
||||
A-law encoded audio data as raw bytes at the specified output rate.
|
||||
"""
|
||||
# Resample
|
||||
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
||||
|
||||
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Silero Voice Activity Detection (VAD) implementation for Pipecat.
|
||||
|
||||
This module provides a VAD analyzer based on the Silero VAD ONNX model,
|
||||
which can detect voice activity in audio streams with high accuracy.
|
||||
Supports 8kHz and 16kHz sample rates.
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
@@ -25,11 +32,20 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class SileroOnnxModel:
|
||||
"""ONNX runtime wrapper for the Silero VAD model.
|
||||
|
||||
Provides voice activity detection using the pre-trained Silero VAD model
|
||||
with ONNX runtime for efficient inference. Handles model state management
|
||||
and input validation for audio processing.
|
||||
"""
|
||||
|
||||
def __init__(self, path, force_onnx_cpu=True):
|
||||
import numpy as np
|
||||
|
||||
global np
|
||||
"""Initialize the Silero ONNX model.
|
||||
|
||||
Args:
|
||||
path: Path to the ONNX model file.
|
||||
force_onnx_cpu: Whether to force CPU execution provider.
|
||||
"""
|
||||
opts = onnxruntime.SessionOptions()
|
||||
opts.inter_op_num_threads = 1
|
||||
opts.intra_op_num_threads = 1
|
||||
@@ -45,6 +61,7 @@ class SileroOnnxModel:
|
||||
self.sample_rates = [8000, 16000]
|
||||
|
||||
def _validate_input(self, x, sr: int):
|
||||
"""Validate and preprocess input audio data."""
|
||||
if np.ndim(x) == 1:
|
||||
x = np.expand_dims(x, 0)
|
||||
if np.ndim(x) > 2:
|
||||
@@ -60,12 +77,18 @@ class SileroOnnxModel:
|
||||
return x, sr
|
||||
|
||||
def reset_states(self, batch_size=1):
|
||||
"""Reset the internal model states.
|
||||
|
||||
Args:
|
||||
batch_size: Batch size for state initialization. Defaults to 1.
|
||||
"""
|
||||
self._state = np.zeros((2, batch_size, 128), dtype="float32")
|
||||
self._context = np.zeros((batch_size, 0), dtype="float32")
|
||||
self._last_sr = 0
|
||||
self._last_batch_size = 0
|
||||
|
||||
def __call__(self, x, sr: int):
|
||||
"""Process audio input through the VAD model."""
|
||||
x, sr = self._validate_input(x, sr)
|
||||
num_samples = 512 if sr == 16000 else 256
|
||||
|
||||
@@ -105,7 +128,20 @@ class SileroOnnxModel:
|
||||
|
||||
|
||||
class SileroVADAnalyzer(VADAnalyzer):
|
||||
"""Voice Activity Detection analyzer using the Silero VAD model.
|
||||
|
||||
Implements VAD analysis using the pre-trained Silero ONNX model for
|
||||
accurate voice activity detection. Supports 8kHz and 16kHz sample rates
|
||||
with automatic model state management and periodic resets.
|
||||
"""
|
||||
|
||||
def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
|
||||
"""Initialize the Silero VAD analyzer.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate (8000 or 16000 Hz). If None, will be set later.
|
||||
params: VAD parameters for detection thresholds and timing.
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, params=params)
|
||||
|
||||
logger.debug("Loading Silero VAD model...")
|
||||
@@ -137,6 +173,14 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
#
|
||||
|
||||
def set_sample_rate(self, sample_rate: int):
|
||||
"""Set the sample rate for audio processing.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate (must be 8000 or 16000 Hz).
|
||||
|
||||
Raises:
|
||||
ValueError: If sample rate is not 8000 or 16000 Hz.
|
||||
"""
|
||||
if sample_rate != 16000 and sample_rate != 8000:
|
||||
raise ValueError(
|
||||
f"Silero VAD sample rate needs to be 16000 or 8000 (sample rate: {sample_rate})"
|
||||
@@ -145,9 +189,22 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
super().set_sample_rate(sample_rate)
|
||||
|
||||
def num_frames_required(self) -> int:
|
||||
"""Get the number of audio frames required for VAD analysis.
|
||||
|
||||
Returns:
|
||||
Number of frames required (512 for 16kHz, 256 for 8kHz).
|
||||
"""
|
||||
return 512 if self.sample_rate == 16000 else 256
|
||||
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
"""Calculate voice activity confidence for the given audio buffer.
|
||||
|
||||
Args:
|
||||
buffer: Audio buffer to analyze.
|
||||
|
||||
Returns:
|
||||
Voice confidence score between 0.0 and 1.0.
|
||||
"""
|
||||
try:
|
||||
audio_int16 = np.frombuffer(buffer, np.int16)
|
||||
# Divide by 32768 because we have signed 16-bit data.
|
||||
|
||||
@@ -4,6 +4,13 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Voice Activity Detection (VAD) analyzer base classes and utilities.
|
||||
|
||||
This module provides the abstract base class for VAD analyzers and associated
|
||||
data structures for voice activity detection in audio streams. Includes state
|
||||
management, parameter configuration, and audio analysis framework.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
@@ -20,6 +27,15 @@ VAD_MIN_VOLUME = 0.6
|
||||
|
||||
|
||||
class VADState(Enum):
|
||||
"""Voice Activity Detection states.
|
||||
|
||||
Parameters:
|
||||
QUIET: No voice activity detected.
|
||||
STARTING: Voice activity beginning, transitioning from quiet.
|
||||
SPEAKING: Active voice detected and confirmed.
|
||||
STOPPING: Voice activity ending, transitioning to quiet.
|
||||
"""
|
||||
|
||||
QUIET = 1
|
||||
STARTING = 2
|
||||
SPEAKING = 3
|
||||
@@ -27,6 +43,15 @@ class VADState(Enum):
|
||||
|
||||
|
||||
class VADParams(BaseModel):
|
||||
"""Configuration parameters for Voice Activity Detection.
|
||||
|
||||
Parameters:
|
||||
confidence: Minimum confidence threshold for voice detection.
|
||||
start_secs: Duration to wait before confirming voice start.
|
||||
stop_secs: Duration to wait before confirming voice stop.
|
||||
min_volume: Minimum audio volume threshold for voice detection.
|
||||
"""
|
||||
|
||||
confidence: float = VAD_CONFIDENCE
|
||||
start_secs: float = VAD_START_SECS
|
||||
stop_secs: float = VAD_STOP_SECS
|
||||
@@ -34,7 +59,20 @@ class VADParams(BaseModel):
|
||||
|
||||
|
||||
class VADAnalyzer(ABC):
|
||||
"""Abstract base class for Voice Activity Detection analyzers.
|
||||
|
||||
Provides the framework for implementing VAD analysis with configurable
|
||||
parameters, state management, and audio processing capabilities.
|
||||
Subclasses must implement the core voice confidence calculation.
|
||||
"""
|
||||
|
||||
def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
|
||||
"""Initialize the VAD analyzer.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz. If None, will be set later.
|
||||
params: VAD parameters for detection configuration.
|
||||
"""
|
||||
self._init_sample_rate = sample_rate
|
||||
self._sample_rate = 0
|
||||
self._params = params or VADParams()
|
||||
@@ -48,29 +86,67 @@ class VADAnalyzer(ABC):
|
||||
|
||||
@property
|
||||
def sample_rate(self) -> int:
|
||||
"""Get the current sample rate.
|
||||
|
||||
Returns:
|
||||
Current audio sample rate in Hz.
|
||||
"""
|
||||
return self._sample_rate
|
||||
|
||||
@property
|
||||
def num_channels(self) -> int:
|
||||
"""Get the number of audio channels.
|
||||
|
||||
Returns:
|
||||
Number of audio channels (always 1 for mono).
|
||||
"""
|
||||
return self._num_channels
|
||||
|
||||
@property
|
||||
def params(self) -> VADParams:
|
||||
"""Get the current VAD parameters.
|
||||
|
||||
Returns:
|
||||
Current VAD configuration parameters.
|
||||
"""
|
||||
return self._params
|
||||
|
||||
@abstractmethod
|
||||
def num_frames_required(self) -> int:
|
||||
"""Get the number of audio frames required for analysis.
|
||||
|
||||
Returns:
|
||||
Number of frames needed for VAD processing.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
"""Calculate voice activity confidence for the given audio buffer.
|
||||
|
||||
Args:
|
||||
buffer: Audio buffer to analyze.
|
||||
|
||||
Returns:
|
||||
Voice confidence score between 0.0 and 1.0.
|
||||
"""
|
||||
pass
|
||||
|
||||
def set_sample_rate(self, sample_rate: int):
|
||||
"""Set the sample rate for audio processing.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
"""
|
||||
self._sample_rate = self._init_sample_rate or sample_rate
|
||||
self.set_params(self._params)
|
||||
|
||||
def set_params(self, params: VADParams):
|
||||
"""Set VAD parameters and recalculate internal values.
|
||||
|
||||
Args:
|
||||
params: VAD parameters for detection configuration.
|
||||
"""
|
||||
logger.debug(f"Setting VAD params to: {params}")
|
||||
self._params = params
|
||||
self._vad_frames = self.num_frames_required()
|
||||
@@ -85,10 +161,22 @@ class VADAnalyzer(ABC):
|
||||
self._vad_state: VADState = VADState.QUIET
|
||||
|
||||
def _get_smoothed_volume(self, audio: bytes) -> float:
|
||||
"""Calculate smoothed audio volume using exponential smoothing."""
|
||||
volume = calculate_audio_volume(audio, self.sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
|
||||
def analyze_audio(self, buffer) -> VADState:
|
||||
"""Analyze audio buffer and return current VAD state.
|
||||
|
||||
Processes incoming audio data, maintains internal state, and determines
|
||||
voice activity status based on confidence and volume thresholds.
|
||||
|
||||
Args:
|
||||
buffer: Audio buffer to analyze.
|
||||
|
||||
Returns:
|
||||
Current VAD state after processing the buffer.
|
||||
"""
|
||||
self._vad_buffer += buffer
|
||||
|
||||
num_required_bytes = self._vad_frames_num_bytes
|
||||
|
||||
@@ -4,14 +4,33 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Base clock interface for Pipecat timing operations."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseClock(ABC):
|
||||
"""Abstract base class for clock implementations.
|
||||
|
||||
Provides a common interface for timing operations used in Pipecat
|
||||
for synchronization, scheduling, and time-based processing.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_time(self) -> int:
|
||||
"""Get the current time value.
|
||||
|
||||
Returns:
|
||||
The current time as an integer value. The specific unit and
|
||||
reference point depend on the concrete implementation.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def start(self):
|
||||
"""Start or initialize the clock.
|
||||
|
||||
Performs any necessary initialization or starts the timing mechanism.
|
||||
This method should be called before using get_time().
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -4,17 +4,42 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""System clock implementation for Pipecat."""
|
||||
|
||||
import time
|
||||
|
||||
from pipecat.clocks.base_clock import BaseClock
|
||||
|
||||
|
||||
class SystemClock(BaseClock):
|
||||
"""A monotonic clock implementation using system time.
|
||||
|
||||
Provides high-precision timing using the system's monotonic clock,
|
||||
which is not affected by system clock adjustments and is suitable
|
||||
for measuring elapsed time in real-time applications.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the system clock.
|
||||
|
||||
The clock starts in an uninitialized state and must be started
|
||||
explicitly using the start() method before time measurement begins.
|
||||
"""
|
||||
self._time = 0
|
||||
|
||||
def get_time(self) -> int:
|
||||
"""Get the elapsed time since the clock was started.
|
||||
|
||||
Returns:
|
||||
The elapsed time in nanoseconds since start() was called.
|
||||
Returns 0 if the clock has not been started yet.
|
||||
"""
|
||||
return time.monotonic_ns() - self._time if self._time > 0 else 0
|
||||
|
||||
def start(self):
|
||||
"""Start the clock and begin time measurement.
|
||||
|
||||
Records the current monotonic time as the reference point
|
||||
for all subsequent get_time() calls.
|
||||
"""
|
||||
self._time = time.monotonic_ns()
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Daily.co room configuration utilities for Pipecat examples."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from typing import Optional
|
||||
@@ -14,6 +16,17 @@ from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper
|
||||
|
||||
|
||||
async def configure(aiohttp_session: aiohttp.ClientSession):
|
||||
"""Configure Daily.co room URL and token from arguments or environment.
|
||||
|
||||
Args:
|
||||
aiohttp_session: HTTP session for making API requests.
|
||||
|
||||
Returns:
|
||||
Tuple containing the room URL and authentication token.
|
||||
|
||||
Raises:
|
||||
Exception: If room URL or API key are not provided.
|
||||
"""
|
||||
(url, token, _) = await configure_with_args(aiohttp_session)
|
||||
return (url, token)
|
||||
|
||||
@@ -21,6 +34,18 @@ async def configure(aiohttp_session: aiohttp.ClientSession):
|
||||
async def configure_with_args(
|
||||
aiohttp_session: aiohttp.ClientSession, parser: Optional[argparse.ArgumentParser] = None
|
||||
):
|
||||
"""Configure Daily.co room with command-line argument parsing.
|
||||
|
||||
Args:
|
||||
aiohttp_session: HTTP session for making API requests.
|
||||
parser: Optional argument parser. If None, creates a default one.
|
||||
|
||||
Returns:
|
||||
Tuple containing room URL, authentication token, and parsed arguments.
|
||||
|
||||
Raises:
|
||||
Exception: If room URL or API key are not provided via arguments or environment.
|
||||
"""
|
||||
if not parser:
|
||||
parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
|
||||
parser.add_argument(
|
||||
|
||||
@@ -4,10 +4,18 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Pipecat example runner with support for multiple transport types.
|
||||
|
||||
This module provides a unified interface for running Pipecat examples across
|
||||
different transport types including Daily.co, WebRTC, and Twilio. It handles
|
||||
setup, configuration, and lifecycle management for each transport type.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any, Callable, Dict, Mapping, Optional
|
||||
@@ -35,6 +43,15 @@ load_dotenv(override=True)
|
||||
|
||||
|
||||
def get_transport_client_id(transport: BaseTransport, client: Any) -> str:
|
||||
"""Get client identifier from transport-specific client object.
|
||||
|
||||
Args:
|
||||
transport: The transport instance.
|
||||
client: Transport-specific client object.
|
||||
|
||||
Returns:
|
||||
Client identifier string, empty if transport not supported.
|
||||
"""
|
||||
if isinstance(transport, SmallWebRTCTransport):
|
||||
return client.pc_id
|
||||
elif isinstance(transport, DailyTransport):
|
||||
@@ -46,6 +63,13 @@ def get_transport_client_id(transport: BaseTransport, client: Any) -> str:
|
||||
async def maybe_capture_participant_camera(
|
||||
transport: BaseTransport, client: Any, framerate: int = 0
|
||||
):
|
||||
"""Capture participant camera video if transport supports it.
|
||||
|
||||
Args:
|
||||
transport: The transport instance.
|
||||
client: Transport-specific client object.
|
||||
framerate: Video capture framerate. Defaults to 0 (auto).
|
||||
"""
|
||||
if isinstance(transport, DailyTransport):
|
||||
await transport.capture_participant_video(
|
||||
client["id"], framerate=framerate, video_source="camera"
|
||||
@@ -55,17 +79,84 @@ async def maybe_capture_participant_camera(
|
||||
async def maybe_capture_participant_screen(
|
||||
transport: BaseTransport, client: Any, framerate: int = 0
|
||||
):
|
||||
"""Capture participant screen video if transport supports it.
|
||||
|
||||
Args:
|
||||
transport: The transport instance.
|
||||
client: Transport-specific client object.
|
||||
framerate: Video capture framerate. Defaults to 0 (auto).
|
||||
"""
|
||||
if isinstance(transport, DailyTransport):
|
||||
await transport.capture_participant_video(
|
||||
client["id"], framerate=framerate, video_source="screenVideo"
|
||||
)
|
||||
|
||||
|
||||
def smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
"""Clean up ICE candidates in SDP text for SmallWebRTC.
|
||||
|
||||
Args:
|
||||
text: SDP text to clean up.
|
||||
pattern: Pattern to match for candidate filtering.
|
||||
|
||||
Returns:
|
||||
Cleaned SDP text with filtered ICE candidates.
|
||||
"""
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
if re.search("a=candidate", line):
|
||||
if re.search(pattern, line) and not re.search("raddr", line):
|
||||
result.append(line)
|
||||
else:
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
|
||||
|
||||
def smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||
"""Remove unsupported fingerprint algorithms from SDP text.
|
||||
|
||||
Args:
|
||||
text: SDP text to clean up.
|
||||
|
||||
Returns:
|
||||
SDP text with sha-384 and sha-512 fingerprints removed.
|
||||
"""
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
if not re.search("sha-384", line) and not re.search("sha-512", line):
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
|
||||
|
||||
def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
||||
"""Apply SDP modifications for SmallWebRTC compatibility.
|
||||
|
||||
Args:
|
||||
sdp: Original SDP string.
|
||||
host: Host address for ICE candidate filtering.
|
||||
|
||||
Returns:
|
||||
Modified SDP string with fingerprint and ICE candidate cleanup.
|
||||
"""
|
||||
sdp = smallwebrtc_sdp_cleanup_fingerprints(sdp)
|
||||
sdp = smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||
return sdp
|
||||
|
||||
|
||||
def run_example_daily(
|
||||
run_example: Callable,
|
||||
args: argparse.Namespace,
|
||||
transport_params: Mapping[str, Callable] = {},
|
||||
):
|
||||
"""Run example using Daily.co transport.
|
||||
|
||||
Args:
|
||||
run_example: The example function to run.
|
||||
args: Parsed command-line arguments.
|
||||
transport_params: Mapping of transport names to parameter factory functions.
|
||||
"""
|
||||
logger.info("Running example with DailyTransport...")
|
||||
|
||||
from pipecat.examples.daily_runner import configure
|
||||
@@ -87,6 +178,13 @@ def run_example_webrtc(
|
||||
args: argparse.Namespace,
|
||||
transport_params: Mapping[str, Callable] = {},
|
||||
):
|
||||
"""Run example using WebRTC transport with FastAPI server.
|
||||
|
||||
Args:
|
||||
run_example: The example function to run.
|
||||
args: Parsed command-line arguments.
|
||||
transport_params: Mapping of transport names to parameter factory functions.
|
||||
"""
|
||||
logger.info("Running example with SmallWebRTCTransport...")
|
||||
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
@@ -96,21 +194,25 @@ def run_example_webrtc(
|
||||
# Store connections by pc_id
|
||||
pcs_map: Dict[str, SmallWebRTCConnection] = {}
|
||||
|
||||
ice_servers = [
|
||||
IceServer(
|
||||
urls="stun:stun.l.google.com:19302",
|
||||
)
|
||||
]
|
||||
|
||||
# Mount the frontend at /
|
||||
app.mount("/client", SmallWebRTCPrebuiltUI)
|
||||
|
||||
@app.get("/", include_in_schema=False)
|
||||
async def root_redirect():
|
||||
"""Redirect root requests to client interface."""
|
||||
return RedirectResponse(url="/client/")
|
||||
|
||||
@app.post("/api/offer")
|
||||
async def offer(request: dict, background_tasks: BackgroundTasks):
|
||||
"""Handle WebRTC offer requests and manage peer connections.
|
||||
|
||||
Args:
|
||||
request: WebRTC offer request containing SDP and connection details.
|
||||
background_tasks: FastAPI background tasks for running examples.
|
||||
|
||||
Returns:
|
||||
WebRTC answer with connection details.
|
||||
"""
|
||||
pc_id = request.get("pc_id")
|
||||
|
||||
if pc_id and pc_id in pcs_map:
|
||||
@@ -122,11 +224,16 @@ def run_example_webrtc(
|
||||
restart_pc=request.get("restart_pc", False),
|
||||
)
|
||||
else:
|
||||
pipecat_connection = SmallWebRTCConnection(ice_servers)
|
||||
pipecat_connection = SmallWebRTCConnection()
|
||||
await pipecat_connection.initialize(sdp=request["sdp"], type=request["type"])
|
||||
|
||||
@pipecat_connection.event_handler("closed")
|
||||
async def handle_disconnected(webrtc_connection: SmallWebRTCConnection):
|
||||
"""Handle WebRTC connection closure and cleanup.
|
||||
|
||||
Args:
|
||||
webrtc_connection: The closed WebRTC connection.
|
||||
"""
|
||||
logger.info(f"Discarding peer connection for pc_id: {webrtc_connection.pc_id}")
|
||||
pcs_map.pop(webrtc_connection.pc_id, None)
|
||||
|
||||
@@ -136,6 +243,10 @@ def run_example_webrtc(
|
||||
background_tasks.add_task(run_example, transport, args, False)
|
||||
|
||||
answer = pipecat_connection.get_answer()
|
||||
|
||||
if args.esp32 and args.host:
|
||||
answer["sdp"] = smallwebrtc_sdp_munging(answer["sdp"], args.host)
|
||||
|
||||
# Updating the peer connection inside the map
|
||||
pcs_map[answer["pc_id"]] = pipecat_connection
|
||||
|
||||
@@ -143,6 +254,14 @@ def run_example_webrtc(
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Manage FastAPI application lifecycle and cleanup connections.
|
||||
|
||||
Args:
|
||||
app: The FastAPI application instance.
|
||||
|
||||
Yields:
|
||||
Control to the FastAPI application runtime.
|
||||
"""
|
||||
yield # Run app
|
||||
coros = [pc.disconnect() for pc in pcs_map.values()]
|
||||
await asyncio.gather(*coros)
|
||||
@@ -156,6 +275,13 @@ def run_example_twilio(
|
||||
args: argparse.Namespace,
|
||||
transport_params: Mapping[str, Callable] = {},
|
||||
):
|
||||
"""Run example using Twilio transport with FastAPI WebSocket server.
|
||||
|
||||
Args:
|
||||
run_example: The example function to run.
|
||||
args: Parsed command-line arguments.
|
||||
transport_params: Mapping of transport names to parameter factory functions.
|
||||
"""
|
||||
logger.info("Running example with FastAPIWebsocketTransport (Twilio)...")
|
||||
|
||||
app = FastAPI()
|
||||
@@ -170,6 +296,11 @@ def run_example_twilio(
|
||||
|
||||
@app.post("/")
|
||||
async def start_call():
|
||||
"""Handle Twilio webhook and return TwiML response.
|
||||
|
||||
Returns:
|
||||
TwiML XML response directing call to WebSocket stream.
|
||||
"""
|
||||
logger.debug("POST TwiML")
|
||||
|
||||
xml_content = f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||
@@ -184,6 +315,11 @@ def run_example_twilio(
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
"""Handle Twilio WebSocket connections for voice streaming.
|
||||
|
||||
Args:
|
||||
websocket: The WebSocket connection from Twilio.
|
||||
"""
|
||||
await websocket.accept()
|
||||
|
||||
logger.debug("WebSocket connection accepted")
|
||||
@@ -216,6 +352,13 @@ def run_main(
|
||||
args: argparse.Namespace,
|
||||
transport_params: Mapping[str, Callable] = {},
|
||||
):
|
||||
"""Run the example with the specified transport type.
|
||||
|
||||
Args:
|
||||
run_example: The example function to run.
|
||||
args: Parsed command-line arguments.
|
||||
transport_params: Mapping of transport names to parameter factory functions.
|
||||
"""
|
||||
if args.transport not in transport_params:
|
||||
logger.error(f"Transport '{args.transport}' not supported by this example")
|
||||
return
|
||||
@@ -235,6 +378,13 @@ def main(
|
||||
parser: Optional[argparse.ArgumentParser] = None,
|
||||
transport_params: Mapping[str, Callable] = {},
|
||||
):
|
||||
"""Main entry point for running Pipecat examples with transport selection.
|
||||
|
||||
Args:
|
||||
run_example: The example function to run.
|
||||
parser: Optional argument parser. If None, creates a default one.
|
||||
transport_params: Mapping of transport names to parameter factory functions.
|
||||
"""
|
||||
if not parser:
|
||||
parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
|
||||
parser.add_argument(
|
||||
@@ -254,9 +404,16 @@ def main(
|
||||
parser.add_argument(
|
||||
"--proxy", "-x", help="A public proxy host name (no protocol, e.g. proxy.example.com)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--esp32", action="store_true", default=False, help="Perform SDP munging for the ESP32"
|
||||
)
|
||||
parser.add_argument("--verbose", "-v", action="count", default=0)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.esp32 and args.host == "localhost":
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user