Compare commits
196 Commits
mb/cli
...
hush/realt
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6101ff9661 | ||
|
|
c20aa78648 | ||
|
|
38f27ad991 | ||
|
|
0c38585034 | ||
|
|
8a09bbbf0e | ||
|
|
fb737ff671 | ||
|
|
b7a4d7371c | ||
|
|
ef88d6a2ea | ||
|
|
5c1bd8cda2 | ||
|
|
a82158045a | ||
|
|
b1533ddfc4 | ||
|
|
0abc699f24 | ||
|
|
09018071e8 | ||
|
|
1c53a5fd01 | ||
|
|
05d4753d3e | ||
|
|
87131850bc | ||
|
|
af83f45a49 | ||
|
|
62e45f466a | ||
|
|
e85e93b9b1 | ||
|
|
074d3ff162 | ||
|
|
d680ec2e69 | ||
|
|
d905b21f72 | ||
|
|
6c5d84ca4c | ||
|
|
334167e3d7 | ||
|
|
e3531a5f25 | ||
|
|
343e97666a | ||
|
|
653e84321b | ||
|
|
3585f724c4 | ||
|
|
5fe597d355 | ||
|
|
67ab3773f6 | ||
|
|
c6e12b9358 | ||
|
|
0f5030bafa | ||
|
|
ed93e29850 | ||
|
|
7eb880c5e8 | ||
|
|
4fa0de6660 | ||
|
|
396c1bcc13 | ||
|
|
57f6ae9e50 | ||
|
|
2d03e51109 | ||
|
|
1e7143e5f3 | ||
|
|
f820c20fa2 | ||
|
|
83f395ff8f | ||
|
|
09a7e08cbf | ||
|
|
6f172bba8f | ||
|
|
1433df4de2 | ||
|
|
6ade5617fb | ||
|
|
685d440206 | ||
|
|
ac5734d0ed | ||
|
|
5e00133e64 | ||
|
|
42f0490414 | ||
|
|
19f046a338 | ||
|
|
ec95618b94 | ||
|
|
74fb6e7676 | ||
|
|
8fa6cbac51 | ||
|
|
a997655eac | ||
|
|
3b3a215155 | ||
|
|
e458d3edfe | ||
|
|
d7d409df60 | ||
|
|
5174b18176 | ||
|
|
9c5690d670 | ||
|
|
e0933e20d2 | ||
|
|
ce13155d26 | ||
|
|
817a485d94 | ||
|
|
b094418d1e | ||
|
|
08a1e09020 | ||
|
|
52b33e5106 | ||
|
|
5db0871a20 | ||
|
|
222c362fa4 | ||
|
|
9d509bb409 | ||
|
|
8d0e7e5e16 | ||
|
|
e7b8da7a83 | ||
|
|
35c48a45cf | ||
|
|
14a365aa16 | ||
|
|
779fc0419d | ||
|
|
057e0c3973 | ||
|
|
8a6abdd44b | ||
|
|
7872fa2e88 | ||
|
|
e86c546a1a | ||
|
|
abf34bcccf | ||
|
|
56eb633390 | ||
|
|
6299b9db87 | ||
|
|
bcffa590a3 | ||
|
|
8b739aa444 | ||
|
|
8f15980c67 | ||
|
|
89e9acf0e1 | ||
|
|
ddac24e6c9 | ||
|
|
d0f52feba3 | ||
|
|
8894db4290 | ||
|
|
1f96cdf970 | ||
|
|
0282033208 | ||
|
|
917ea27352 | ||
|
|
8c03df1463 | ||
|
|
15aa76efba | ||
|
|
8ac421f8fd | ||
|
|
75b3ea9c96 | ||
|
|
95be1510ac | ||
|
|
df19011080 | ||
|
|
e42cf78e79 | ||
|
|
0495de52b6 | ||
|
|
9bc02afd0d | ||
|
|
6140fdb2c9 | ||
|
|
b6a1886dae | ||
|
|
42d0a097c5 | ||
|
|
3761804146 | ||
|
|
46e97c57c2 | ||
|
|
19770b76b4 | ||
|
|
b34461bf93 | ||
|
|
bab0aaf585 | ||
|
|
61944d22ef | ||
|
|
47756319be | ||
|
|
5fa56df014 | ||
|
|
8a151235c3 | ||
|
|
ec42f8c24e | ||
|
|
29fd17b9ff | ||
|
|
3ea1e357f2 | ||
|
|
351ef617ae | ||
|
|
9dafb715c4 | ||
|
|
82d494d3d4 | ||
|
|
e893aaa620 | ||
|
|
65c17a698e | ||
|
|
615aae5b95 | ||
|
|
b0acbeffb9 | ||
|
|
2f1061f300 | ||
|
|
9307079af2 | ||
|
|
efa64642a4 | ||
|
|
ede6c32149 | ||
|
|
4050e8b7dc | ||
|
|
b0f5fc02c4 | ||
|
|
493d6bf91e | ||
|
|
aaebcae2e8 | ||
|
|
408264a0fd | ||
|
|
df8aa3e4b0 | ||
|
|
4d82a1260b | ||
|
|
f974c66e12 | ||
|
|
533372ed37 | ||
|
|
a9118eb2cd | ||
|
|
84ed2468e5 | ||
|
|
d82d855c20 | ||
|
|
412ff2a4a1 | ||
|
|
82ccc160fb | ||
|
|
9ef60bd468 | ||
|
|
f3c4bf08dd | ||
|
|
f2cfbee3c3 | ||
|
|
8b063116ab | ||
|
|
8096e62b34 | ||
|
|
20f4b0e8ff | ||
|
|
6feaf91789 | ||
|
|
91d3ae07b3 | ||
|
|
71841f71ef | ||
|
|
949b807023 | ||
|
|
4ad15f9a01 | ||
|
|
99d94fc625 | ||
|
|
a3d630c0d1 | ||
|
|
04b482c445 | ||
|
|
b2bce4916f | ||
|
|
60e9817f16 | ||
|
|
c655d0d313 | ||
|
|
ea6e146f2d | ||
|
|
ec890a834f | ||
|
|
5b921fc054 | ||
|
|
f1040100f4 | ||
|
|
54691ee781 | ||
|
|
49239a23c6 | ||
|
|
e0c43de13f | ||
|
|
cc4c96d099 | ||
|
|
788465cb04 | ||
|
|
db934eade0 | ||
|
|
0b8c966a11 | ||
|
|
5849485bc6 | ||
|
|
459af58540 | ||
|
|
576bd67e85 | ||
|
|
1e8629bf96 | ||
|
|
776a3526f9 | ||
|
|
2ced044418 | ||
|
|
151f187837 | ||
|
|
67afa718d0 | ||
|
|
52ab0eccc0 | ||
|
|
d1f1b68b71 | ||
|
|
a479c32665 | ||
|
|
9f66b0ba41 | ||
|
|
23385ca3d2 | ||
|
|
8b24bae9c5 | ||
|
|
0502ec6c44 | ||
|
|
81645910e0 | ||
|
|
d6ab4c41b0 | ||
|
|
2f92cb8781 | ||
|
|
fbf274374c | ||
|
|
427efecf5b | ||
|
|
b3e54546ac | ||
|
|
de46631bac | ||
|
|
abf0150261 | ||
|
|
5052da8ce6 | ||
|
|
d1d74c571c | ||
|
|
9acc36c58e | ||
|
|
1ecf6e05fe | ||
|
|
5cc1d8a024 | ||
|
|
1e31fc7f9b |
330
CHANGELOG.md
330
CHANGELOG.md
@@ -9,6 +9,307 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for loading external observers. You can now register custom
|
||||
pipeline observers by setting the `PIPECAT_OBSERVER_FILES` environment
|
||||
variable. This variable should contain a colon-separated list of Python files
|
||||
(e.g. `export PIPECAT_OBSERVER_FILES="observer1.py:observer2.py:..."`). Each
|
||||
file must define a function with the following signature:
|
||||
|
||||
```python
|
||||
async def create_observers(task: PipelineTask) -> Iterable[BaseObserver]:
|
||||
...
|
||||
```
|
||||
|
||||
- Added support for new sonic-3 languages in `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService`.
|
||||
|
||||
- `EndFrame` and `EndTaskFrame` have an optional `reason` field to indicate why
|
||||
the pipeline is being ended.
|
||||
|
||||
- `CancelFrame` and `CancelTaskFrame` have an optional `reason` field to
|
||||
indicate why the pipeline is being canceled. This can be also specified when
|
||||
you cancel a task with `PipelineTask.cancel(reason="cancellation your
|
||||
reason")`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed `GeminiLiveLLMService` session resumption after a connection timeout.
|
||||
|
||||
- `GeminiLiveLLMService` now properly supports context-provided system
|
||||
instruction and tools.
|
||||
|
||||
## [0.0.92] - 2025-10-31 🎃 "The Haunted Edition" 👻
|
||||
|
||||
### Added
|
||||
|
||||
- Added supprt for Sarvam Speech-to-Text service (`SarvamSTTService`) with streaming WebSocket
|
||||
support for `saarika` (STT) and `saaras` (STT-translate) models.
|
||||
|
||||
- Added a new `DeepgramHttpTTSService`, which delivers a meaningful reduction
|
||||
in latency when compared to the `DeepgramTTSService`.
|
||||
|
||||
- Add support for `speaking_rate` input parameter in `GoogleHttpTTSService`.
|
||||
|
||||
- Added `enable_speaker_diarization` and `enable_language_identification` to
|
||||
`SonioxSTTService`.
|
||||
|
||||
- Added `SpeechmaticsTTSService`, which uses Speechmatic's TTS API. Updated
|
||||
examples 07a\* to use the new TTS service.
|
||||
|
||||
- Added support for including images or audio to LLM context messages using
|
||||
`LLMContext.create_image_message()` or `LLMContext.create_image_url_message()`
|
||||
(not all LLMs support URLs) and `LLMContext.create_audio_message()`. For
|
||||
example, when creating `LLMMessagesAppendFrame`:
|
||||
|
||||
```python
|
||||
message = LLMContext.create_image_message(image=..., size= ...)
|
||||
await self.push_frame(LLMMessagesAppendFrame(messages=[message], run_llm=True))
|
||||
```
|
||||
|
||||
- New event handlers for the `DeepgramFluxSTTService`: `on_start_of_turn`,
|
||||
`on_turn_resumed`, `on_end_of_turn`, `on_eager_end_of_turn`, `on_update`.
|
||||
|
||||
- Added `generation_config` parameter support to `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService` for Cartesia Sonic-3 models. Includes a new
|
||||
`GenerationConfig` class with `volume` (0.5-2.0), `speed` (0.6-1.5),
|
||||
and `emotion` (60+ options) parameters for fine-grained speech generation
|
||||
control.
|
||||
|
||||
- Expanded support for univeral `LLMContext` to `OpenAIRealtimeLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
(Note that even though `OpenAIRealtimeLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Note: `TranscriptionFrame`s and `InterimTranscriptionFrame`s now go upstream
|
||||
from `OpenAIRealtimeLLMService`, so if you're using `TranscriptProcessor`,
|
||||
say, you'll want to adjust accordingly:
|
||||
|
||||
```python
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
|
||||
# BEFORE
|
||||
llm,
|
||||
transcript.user(),
|
||||
|
||||
# AFTER
|
||||
transcript.user(),
|
||||
llm,
|
||||
|
||||
transport.output(),
|
||||
transcript.assistant(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Also worth noting: whether or not you use the new context-setup pattern with
|
||||
`OpenAIRealtimeLLMService`, some types have changed under the hood:
|
||||
|
||||
```python
|
||||
## BEFORE:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: OpenAIContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: OpenAIRealtimeLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
|
||||
Also note that `RealtimeMessagesUpdateFrame` and
|
||||
`RealtimeFunctionCallResultFrame` have been deprecated, since they're no
|
||||
longer used by `OpenAIRealtimeLLMService`. OpenAI Realtime now works more
|
||||
like other LLM services in Pipecat, relying on updates to its context, pushed
|
||||
by context aggregators, to update its internal state. Listen for
|
||||
`LLMContextFrame`s for context updates.
|
||||
|
||||
Finally, `LLMTextFrame`s are no longer pushed from `OpenAIRealtimeLLMService`
|
||||
when it's configured with `output_modalities=['audio']`. If you need
|
||||
to process its output, listen for `TTSTextFrame`s instead.
|
||||
|
||||
- Expanded support for universal `LLMContext` to `GeminiLiveLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
(Note that even though `GeminiLiveLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Worth noting: whether or not you use the new context-setup pattern with
|
||||
`GeminiLiveLLMService`, some types have changed under the hood:
|
||||
|
||||
```python
|
||||
## BEFORE:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: GeminiLiveContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: GeminiLiveLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
|
||||
Also note that `LLMTextFrame`s are no longer pushed from `GeminiLiveLLMService`
|
||||
when it's configured with `modalities=GeminiModalities.AUDIO`. If you need
|
||||
to process its output, listen for `TTSTextFrame`s instead.
|
||||
|
||||
### Changed
|
||||
|
||||
- The development runner's `/start` endpoint now supports passing
|
||||
`dailyRoomProperties` and `dailyMeetingTokenProperties` in the request body
|
||||
when `createDailyRoom` is true. Properties are validated against the
|
||||
`DailyRoomProperties` and `DailyMeetingTokenProperties` types respectively
|
||||
and passed to Daily's room and token creation APIs.
|
||||
|
||||
- `UserImageRawFrame` new fields `append_to_context` and `text`. The
|
||||
`append_to_context` field indicates if this image and text should be added to
|
||||
the LLM context (by the LLM assistant aggregator). The `text` field, if set,
|
||||
might also guide the LLM or the vision service on how to analyze the image.
|
||||
|
||||
- `UserImageRequestFrame` new fiels `append_to_context` and `text`. Both fields
|
||||
will be used to set the same fields on the captured `UserImageRawFrame`.
|
||||
|
||||
- `UserImageRequestFrame` don't require function call name and ID anymore.
|
||||
|
||||
- Updated `MoondreamService` to process `UserImageRawFrame`.
|
||||
|
||||
- `VisionService` expects `UserImageRawFrame` in order to analyze images.
|
||||
|
||||
- `DailyTransport` triggers `on_error` event if transcription can't be started
|
||||
or stopped.
|
||||
|
||||
- `DailyTransport` updates: `start_dialout()` now returns two values:
|
||||
`session_id` and `error`. `start_recording()` now returns two values:
|
||||
`stream_id` and `error`.
|
||||
|
||||
- Updated `daily-python` to 0.21.0.
|
||||
|
||||
- `SimliVideoService` now accepts `api_key` and `face_id` parameters directly,
|
||||
with optional `params` for `max_session_length` and `max_idle_time`
|
||||
configuration, aligning with other Pipecat service patterns.
|
||||
|
||||
- Updated the default model to `sonic-3` for `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService`.
|
||||
|
||||
- `FunctionFilter` now has a `filter_system_frames` arg, which controls whether
|
||||
or not SystemFrames are filtered.
|
||||
|
||||
- Upgraded `aws_sdk_bedrock_runtime` to v0.1.1 to resolve potential CPU issues
|
||||
when running `AWSNovaSonicLLMService`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `expect_stripped_words` parameter of `LLMAssistantAggregatorParams` is
|
||||
ignored when used with the newer `LLMAssistantAggregator`, which now handles
|
||||
word spacing automatically.
|
||||
|
||||
- `LLMService.request_image_frame()` is deprecated, push a
|
||||
`UserImageRequestFrame` instead.
|
||||
|
||||
- `UserResponseAggregator` is deprecated and will be removed in a future version.
|
||||
|
||||
- The `send_transcription_frames` argument to `OpenAIRealtimeLLMService` is
|
||||
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||
handled by the user context aggregator. See "Added" section for details.
|
||||
|
||||
- Types in `pipecat.services.openai.realtime.context` and
|
||||
`pipecat.services.openai.realtime.frames` are deprecated, as they're no
|
||||
longer used by `OpenAIRealtimeLLMService`. See "Added" section for details.
|
||||
|
||||
- `SimliVideoService` `simli_config` parameter is deprecated. Use `api_key` and
|
||||
`face_id` parameters instead.
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed `enable_non_final_tokens` and `max_non_final_tokens_duration_ms` from
|
||||
`SonioxSTTService`.
|
||||
|
||||
- Removed the `aiohttp_session` arg from `SarvamTTSService` as it's no longer
|
||||
used.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `PipelineTask` issue that was causing an idle timeout for frames that
|
||||
were being generated but not reaching the end of the pipeline. Since the exact
|
||||
point when frames are discarded is unknown, we now monitor pipeline frames
|
||||
using an observer. If the observer detects frames are being generated, it will
|
||||
prevent the pipeline from being considered idle.
|
||||
|
||||
- Fixed an issue in `HumeTTSService` that was only using Octave 2, which does
|
||||
not support the `description` field. Now, if a description is provided, it
|
||||
switches to Octave 1.
|
||||
|
||||
- Fixed an issue where `DailyTransport` would timeout prematurely on join and on
|
||||
leave.
|
||||
|
||||
- Fixed an issue in the runner where starting a DailyTransport room via
|
||||
`/start` didn't support using the `DAILY_SAMPLE_ROOM_URL` env var.
|
||||
|
||||
- Fixed an issue in `ServiceSwitcher` where the `STTService`s would result in
|
||||
all STT services producing `TranscriptionFrame`s.
|
||||
|
||||
### Other
|
||||
|
||||
- Updated all vision 12-series foundational examples to load images from a file.
|
||||
|
||||
- Added 14-series video examples for different services. These new examples
|
||||
request an image from the user camera through a function call.
|
||||
|
||||
## [0.0.91] - 2025-10-21
|
||||
|
||||
### Added
|
||||
|
||||
- It is now possible to start a bot from the `/start` endpoint when using the
|
||||
runner Daily's transport. This follows the Pipecat Cloud format with
|
||||
`createDailyRoom` and `body` fields in the POST request body.
|
||||
|
||||
- Added an ellipsis character (`…`) to the end of sentence detection in the
|
||||
string utils.
|
||||
|
||||
- Expanded support for universal `LLMContext` to `AWSNovaSonicLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
@@ -19,7 +320,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
(Note that even though `AWSNovaSonicLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime.)
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Worth noting: whether or not you use the new context-setup pattern with
|
||||
`AWSNovaSonicLLMService`, some types have changed under the hood:
|
||||
@@ -38,9 +339,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
@@ -51,9 +349,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
|
||||
- Added support for `bulbul:v3` model in `SarvamTTSService` and
|
||||
@@ -85,6 +380,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Changed
|
||||
|
||||
- `RunnerArguments` now include the `body` field, so there's no need to add it
|
||||
to subclasses. Also, all `RunnerArguments` fields are now keyword-only.
|
||||
|
||||
- `CartesiaSTTService` now inherits from `WebsocketSTTService`.
|
||||
|
||||
- Package upgrades:
|
||||
@@ -101,13 +399,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||
handled by the user context aggregator. See "Added" section for details.
|
||||
|
||||
- Types in `pipecat.services.aws.nova_sonic.context` have been deprecated due
|
||||
to changes to support `LLMContext`. See "Changed" section for details.
|
||||
- Types in `pipecat.services.aws.nova_sonic.context` are deprecated, as they're
|
||||
no longer used by `AWSNovaSonicLLMService`. See "Added" section for
|
||||
details.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where the `RTVIProcessor` was sending duplicate
|
||||
`UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` messages.
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where both `temperature` and `top_p`
|
||||
were always sent together, causing conflicts with models like Claude Sonnet 4.5
|
||||
that don't allow both parameters simultaneously. The service now only includes
|
||||
inference parameters that are explicitly set, and `InputParams` defaults have
|
||||
been changed to `None` to rely on AWS Bedrock's built-in model defaults.
|
||||
|
||||
- Fixed an issue in `RivaSegmentedSTTService` where a runtime error occurred due
|
||||
to a mismatch in the _handle_transcription method's signature.
|
||||
to a mismatch in the `_handle_transcription` method's signature.
|
||||
|
||||
- Fixed multiple pipeline task cancellation issues. `asyncio.CancelledError` is
|
||||
now handled properly in `PipelineTask` making it possible to cancel an asyncio
|
||||
@@ -1131,6 +1439,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added `SonioxSTTService` using Soniox's STT websocket API.
|
||||
|
||||
- Added `enable_emulated_vad_interruptions` to `LLMUserAggregatorParams`.
|
||||
When user speech is emulated (e.g. when a transcription is received but
|
||||
VAD doesn't detect speech), this parameter controls whether the emulated
|
||||
|
||||
30
README.md
30
README.md
@@ -44,6 +44,10 @@ Looking to build structured conversations? Check out [Pipecat Flows](https://git
|
||||
|
||||
Want to build beautiful and engaging experiences? Checkout the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
|
||||
|
||||
### 🛠️ Create and deploy projects
|
||||
|
||||
Create a new project in under a minute with the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli). Then use the CLI to monitor and deploy your agent to production.
|
||||
|
||||
### 🔍 Debugging
|
||||
|
||||
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
@@ -68,19 +72,19 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
|
||||
200
env.example
200
env.example
@@ -4,6 +4,9 @@ AICOUSTICS_LICENSE_KEY=...
|
||||
# Anthropic
|
||||
ANTHROPIC_API_KEY=...
|
||||
|
||||
# Assembly AI
|
||||
ASSEMBLYAI_API_KEY=...
|
||||
|
||||
# Async
|
||||
ASYNCAI_API_KEY=...
|
||||
ASYNCAI_VOICE_ID=...
|
||||
@@ -21,12 +24,19 @@ AZURE_CHATGPT_API_KEY=...
|
||||
AZURE_CHATGPT_ENDPOINT=https://...
|
||||
AZURE_CHATGPT_MODEL=...
|
||||
|
||||
AZURE_REALTIME_API_KEY=...
|
||||
AZURE_REALTIME_BASE_URL=...
|
||||
|
||||
AZURE_DALLE_API_KEY=...
|
||||
AZURE_DALLE_ENDPOINT=https://...
|
||||
AZURE_DALLE_MODEL=...
|
||||
|
||||
# Cartesia
|
||||
CARTESIA_API_KEY=...
|
||||
CARTESIA_VOICE_ID=...
|
||||
|
||||
# Cerebras
|
||||
CEREBRAS_API_KEY=...
|
||||
|
||||
# Daily
|
||||
DAILY_API_KEY=...
|
||||
@@ -35,57 +45,48 @@ DAILY_SAMPLE_ROOM_URL=https://...
|
||||
# Deepgram
|
||||
DEEPGRAM_API_KEY=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
|
||||
# ElevenLabs
|
||||
ELEVENLABS_API_KEY=...
|
||||
ELEVENLABS_VOICE_ID=...
|
||||
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# Fal
|
||||
FAL_KEY=...
|
||||
|
||||
# Fireworks
|
||||
FIREWORKS_API_KEY=...
|
||||
|
||||
# Fish Audio
|
||||
FISH_API_KEY=...
|
||||
|
||||
# Gladia
|
||||
GLADIA_API_KEY=...
|
||||
GLADIA_REGION=...
|
||||
|
||||
# Google
|
||||
GOOGLE_API_KEY=...
|
||||
GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_CLOUD_LOCATION=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
|
||||
# Hume
|
||||
HUME_API_KEY=...
|
||||
HUME_VOICE_ID=...
|
||||
|
||||
# LMNT
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
|
||||
# Perplexity
|
||||
PERPLEXITY_API_KEY=...
|
||||
|
||||
# PlayHT
|
||||
PLAYHT_USER_ID=...
|
||||
PLAYHT_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
|
||||
# OpenPipe
|
||||
OPENPIPE_API_KEY=...
|
||||
|
||||
# Tavus
|
||||
TAVUS_API_KEY=...
|
||||
TAVUS_REPLICA_ID=...
|
||||
TAVUS_PERSONA_ID=...
|
||||
|
||||
# Simli
|
||||
SIMLI_API_KEY=...
|
||||
SIMLI_FACE_ID=...
|
||||
# Inworld
|
||||
INWORLD_API_KEY=...
|
||||
|
||||
# Krisp
|
||||
KRISP_MODEL_PATH=...
|
||||
@@ -93,77 +94,100 @@ KRISP_MODEL_PATH=...
|
||||
# Krisp Viva
|
||||
KRISP_VIVA_MODEL_PATH=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
# LiveKit
|
||||
LIVEKIT_API_KEY=...
|
||||
LIVEKIT_API_SECRET=...
|
||||
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Inworld
|
||||
INWORLD_API_KEY=...
|
||||
|
||||
# Together.ai
|
||||
TOGETHER_API_KEY=...
|
||||
|
||||
# Cerebras
|
||||
CEREBRAS_API_KEY=...
|
||||
|
||||
# Fish Audio
|
||||
FISH_API_KEY=...
|
||||
|
||||
# Assembly AI
|
||||
ASSEMBLYAI_API_KEY=...
|
||||
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY=...
|
||||
|
||||
# Piper
|
||||
PIPER_BASE_URL=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
|
||||
# Twilio
|
||||
TWILIO_ACCOUNT_SID=...
|
||||
TWILIO_AUTH_TOKEN=...
|
||||
# LMNT
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
|
||||
# MiniMax
|
||||
MINIMAX_API_KEY=...
|
||||
MINIMAX_GROUP_ID=...
|
||||
|
||||
# Sarvam AI
|
||||
SARVAM_API_KEY=...
|
||||
|
||||
# Soniox
|
||||
SONIOX_API_KEY=
|
||||
|
||||
# Speechmatics
|
||||
SPEECHMATICS_API_KEY=...
|
||||
|
||||
# SambaNova
|
||||
SAMBANOVA_API_KEY=...
|
||||
|
||||
# Sentry
|
||||
SENTRY_DSN=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
|
||||
# Mistral
|
||||
MISTRAL_API_KEY=...
|
||||
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
|
||||
# OpenPipe
|
||||
OPENPIPE_API_KEY=...
|
||||
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY=...
|
||||
|
||||
# Perplexity
|
||||
PERPLEXITY_API_KEY=...
|
||||
|
||||
# Picovoice Koala
|
||||
KOALA_ACCESS_KEY=...
|
||||
|
||||
# Piper
|
||||
PIPER_BASE_URL=...
|
||||
|
||||
# PlayHT
|
||||
PLAYHT_USER_ID=...
|
||||
PLAYHT_API_KEY=...
|
||||
|
||||
# Plivo
|
||||
PLIVO_AUTH_ID=...
|
||||
PLIVO_AUTH_TOKEN=...
|
||||
|
||||
# Qwen
|
||||
QWEN_API_KEY=...
|
||||
|
||||
# Rime
|
||||
RIME_API_KEY=...
|
||||
RIME_VOICE_ID=...
|
||||
|
||||
# SambaNova
|
||||
SAMBANOVA_API_KEY=...
|
||||
|
||||
# Sarvam AI
|
||||
SARVAM_API_KEY=...
|
||||
|
||||
# Sentry
|
||||
SENTRY_DSN=...
|
||||
|
||||
# Simli
|
||||
SIMLI_API_KEY=...
|
||||
SIMLI_FACE_ID=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
|
||||
# Soniox
|
||||
SONIOX_API_KEY=...
|
||||
|
||||
# Speechmatics
|
||||
SPEECHMATICS_API_KEY=...
|
||||
|
||||
# Tavus
|
||||
TAVUS_API_KEY=...
|
||||
TAVUS_REPLICA_ID=...
|
||||
|
||||
# Telnyx
|
||||
TELNYX_API_KEY=...
|
||||
TELNYX_ACCOUNT_SID=...
|
||||
|
||||
# Together.ai
|
||||
TOGETHER_API_KEY=...
|
||||
|
||||
# Twilio
|
||||
TWILIO_ACCOUNT_SID=...
|
||||
TWILIO_AUTH_TOKEN=...
|
||||
|
||||
# WhatsApp
|
||||
WHATSAPP_TOKEN=
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=
|
||||
WHATSAPP_PHONE_NUMBER_ID=
|
||||
WHATSAPP_APP_SECRET=
|
||||
WHATSAPP_TOKEN=...
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=...
|
||||
WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -20,10 +21,10 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.services.speechmatics.tts import SpeechmaticsTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -51,121 +52,127 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Speechmatics STT Service Example
|
||||
"""Speechmatics STT and TTS Service Example
|
||||
|
||||
This example demonstrates using Speechmatics Speech-to-Text service with speaker diarization and intelligent speaker management. Key features:
|
||||
This example demonstrates using Speechmatics Speech-to-Text and Text-to-Speech services
|
||||
with speaker diarization and intelligent speaker management. Key features:
|
||||
|
||||
1. Speaker Diarization
|
||||
1. Speaker Diarization (STT)
|
||||
- Automatically identifies and distinguishes between different speakers
|
||||
- First speaker is identified as 'S1', others get subsequent IDs
|
||||
- Uses `enable_diarization` parameter to manage speaker detection
|
||||
|
||||
2. Smart Speaker Control
|
||||
2. Smart Speaker Control (STT)
|
||||
- `focus_speakers` parameter lets you target specific speakers (e.g. ["S1"])
|
||||
- Other speakers will be wrapped in PASSIVE tags
|
||||
- Only processes speech from focused speakers
|
||||
- Words from all speakers are wrapped with XML tags for clear speaker identification
|
||||
- Other speakers' speech only sent when focused speaker is active
|
||||
|
||||
3. Voice Activity Detection
|
||||
3. Voice Activity Detection (STT)
|
||||
- Built-in VAD using `enable_vad` parameter
|
||||
- Remove `vad_analyzer` from `transport` config to use module's VAD
|
||||
- Emits speaker started/stopped events
|
||||
|
||||
4. Configuration Options
|
||||
4. Text-to-Speech (TTS)
|
||||
- Low latency streaming audio synthesis
|
||||
- Multiple voice options available including `sarah`, `theo`, and `megan`
|
||||
|
||||
5. Configuration Options
|
||||
- `operating_point` parameter defaults to `ENHANCED` for optimal accuracy
|
||||
- Configurable `end_of_utterance_silence_trigger` (default 0.5s)
|
||||
- Customizable speaker formatting
|
||||
- Additional diarization settings available
|
||||
|
||||
For detailed information about operating points and configuration:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
For detailed information:
|
||||
- STT: https://docs.speechmatics.com/rt-api-ref
|
||||
- TTS: https://docs.speechmatics.com/text-to-speech/quickstart
|
||||
"""
|
||||
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_vad=True,
|
||||
enable_diarization=True,
|
||||
focus_speakers=["S1"],
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_vad=True,
|
||||
enable_diarization=True,
|
||||
focus_speakers=["S1"],
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
tts = SpeechmaticsTTSService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
voice_id="sarah",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Sarah. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
await runner.run(task)
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -24,10 +25,10 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.services.speechmatics.tts import SpeechmaticsTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -61,100 +62,106 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Run example using Speechmatics STT.
|
||||
"""Run example using Speechmatics STT and TTS.
|
||||
|
||||
This example will use diarization within our STT service and output the words spoken by
|
||||
each individual speaker and wrap them with XML tags for the LLM to process. Note the
|
||||
instructions in the system context for the LLM. This greatly improves the conversation
|
||||
experience by allowing the LLM to understand who is speaking in a multi-party call.
|
||||
This example demonstrates a complete Speechmatics integration with both Speech-to-Text
|
||||
and Text-to-Speech services:
|
||||
|
||||
By default, this example will use our ENHANCED operating point, which is optimized for
|
||||
high accuracy. You can change this by setting the `operating_point` parameter to a different
|
||||
value.
|
||||
STT Features:
|
||||
- Diarization to identify and distinguish between different speakers
|
||||
- Words spoken by each speaker are wrapped with XML tags for LLM processing
|
||||
- System context instructions help the LLM understand multi-party conversations
|
||||
- ENHANCED operating point by default for optimal accuracy
|
||||
|
||||
For more information on operating points, see the Speechmatics documentation:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
TTS Features:
|
||||
- Low latency streaming audio synthesis
|
||||
- Multiple voice options available including `sarah`, `theo`, and `megan`
|
||||
|
||||
For more information:
|
||||
- STT: https://docs.speechmatics.com/rt-api-ref
|
||||
- TTS: https://docs.speechmatics.com/text-to-speech/quickstart
|
||||
"""
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_diarization=True,
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies."
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_diarization=True,
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
tts = SpeechmaticsTTSService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
voice_id="sarah",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Sarah. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies."
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
await runner.run(task)
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
@@ -101,6 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@stt.event_handler("on_update")
|
||||
async def on_deepgram_flux_update(stt, transcript):
|
||||
logger.debug(f"On deeggram flux update: {transcript}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
132
examples/foundational/07c-interruptible-deepgram-http.py
Normal file
132
examples/foundational/07c-interruptible-deepgram-http.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramHttpTTSService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
voice="aura-2-andromeda-en",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -67,8 +67,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
|
||||
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -22,8 +22,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamHttpTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -63,7 +63,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="saarika:v2.5",
|
||||
)
|
||||
|
||||
tts = SarvamHttpTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
|
||||
@@ -24,8 +24,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -62,7 +62,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="saarika:v2.5",
|
||||
)
|
||||
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import Tuple
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMContextFrame, TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.aggregators import SentenceAggregator
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.services.azure import AzureLLMService, AzureTTSService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.services.fal import FalImageGenService
|
||||
from pipecat.transports.daily.transport import DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("pipecat")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
(room_url, _) = await configure(session)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
None,
|
||||
"Respond bot",
|
||||
duration_minutes=10,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
)
|
||||
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
)
|
||||
tts1 = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
)
|
||||
tts2 = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id="jBpfuIE2acCO8z3wKNLl",
|
||||
)
|
||||
dalle = FalImageGenService(
|
||||
params=FalImageGenService.InputParams(image_size="1024x1024"),
|
||||
aiohttp_session=session,
|
||||
key=os.getenv("FAL_KEY"),
|
||||
)
|
||||
|
||||
bot1_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long.",
|
||||
},
|
||||
]
|
||||
bot2_messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich.",
|
||||
},
|
||||
]
|
||||
|
||||
async def get_text_and_audio(messages) -> Tuple[str, bytearray]:
|
||||
"""This function streams text from the LLM and uses the TTS service to convert
|
||||
that text to speech as it's received.
|
||||
"""
|
||||
source_queue = asyncio.Queue()
|
||||
sink_queue = asyncio.Queue()
|
||||
sentence_aggregator = SentenceAggregator()
|
||||
pipeline = Pipeline([llm, sentence_aggregator, tts1], source_queue, sink_queue)
|
||||
|
||||
await source_queue.put(LLMContextFrame(LLMContext(messages)))
|
||||
await source_queue.put(EndFrame())
|
||||
await pipeline.run_pipeline()
|
||||
|
||||
message = ""
|
||||
all_audio = bytearray()
|
||||
while sink_queue.qsize():
|
||||
frame = sink_queue.get_nowait()
|
||||
if isinstance(frame, TextFrame):
|
||||
message += frame.text
|
||||
elif isinstance(frame, AudioFrame):
|
||||
all_audio.extend(frame.audio)
|
||||
|
||||
return (message, all_audio)
|
||||
|
||||
async def get_bot1_statement():
|
||||
message, audio = await get_text_and_audio(bot1_messages)
|
||||
|
||||
bot1_messages.append({"role": "assistant", "content": message})
|
||||
bot2_messages.append({"role": "user", "content": message})
|
||||
|
||||
return audio
|
||||
|
||||
async def get_bot2_statement():
|
||||
message, audio = await get_text_and_audio(bot2_messages)
|
||||
|
||||
bot2_messages.append({"role": "assistant", "content": message})
|
||||
bot1_messages.append({"role": "user", "content": message})
|
||||
|
||||
return audio
|
||||
|
||||
async def argue():
|
||||
for i in range(100):
|
||||
print(f"In iteration {i}")
|
||||
|
||||
bot1_description = "A woman conservatively dressed as a librarian in a library surrounded by books, cartoon, serious, highly detailed"
|
||||
|
||||
(audio1, image_data1) = await asyncio.gather(
|
||||
get_bot1_statement(), dalle.run_image_gen(bot1_description)
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(image_data1[1], image_data1[2]),
|
||||
AudioFrame(audio1),
|
||||
]
|
||||
)
|
||||
|
||||
bot2_description = "A cat dressed in a hot dog costume, cartoon, bright colors, funny, highly detailed"
|
||||
|
||||
(audio2, image_data2) = await asyncio.gather(
|
||||
get_bot2_statement(), dalle.run_image_gen(bot2_description)
|
||||
)
|
||||
await transport.send_queue.put(
|
||||
[
|
||||
ImageFrame(image_data2[1], image_data2[2]),
|
||||
AudioFrame(audio2),
|
||||
]
|
||||
)
|
||||
|
||||
await asyncio.gather(transport.run(), argue())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -4,8 +4,9 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import io
|
||||
import os
|
||||
from typing import Optional
|
||||
import re
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
@@ -16,24 +17,17 @@ from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
LLMRunFrame,
|
||||
MetricsFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
@@ -43,46 +37,41 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
def format_metrics(metrics, indent=0):
|
||||
lines = []
|
||||
tab = "\t" * indent
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
for metric in metrics:
|
||||
lines.append(tab + type(metric).__name__)
|
||||
for field, value in vars(metric).items():
|
||||
if hasattr(value, "__dict__") and not isinstance(
|
||||
value, (str, int, float, bool, type(None))
|
||||
):
|
||||
lines.append(f"{tab}\t{field}={type(value).__name__}")
|
||||
for k, v in vars(value).items():
|
||||
lines.append(f"{tab}\t\t{k}={repr(v)}")
|
||||
else:
|
||||
lines.append(f"{tab}\t{field}={repr(value)}")
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
class MetricsFrameLogger(FrameProcessor):
|
||||
"""MetricsFrameLogger formats and logs all MetericsFrames"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
if isinstance(frame, MetricsFrame):
|
||||
logger.info(f"{frame.name}\n {format_metrics(frame.data)}")
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
# ALWAYS push all frames
|
||||
else:
|
||||
# SUPER IMPORTANT: always push every frame!
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
@@ -93,14 +82,13 @@ transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
video_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
@@ -110,33 +98,37 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# OpenAI GPT-4o for vision analysis
|
||||
openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
metrics_frame_processor = MetricsFrameLogger()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
openai,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
metrics_frame_processor, # pretty print metrics frames
|
||||
]
|
||||
)
|
||||
|
||||
@@ -152,15 +144,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
141
examples/foundational/12-describe-image-openai.py
Normal file
141
examples/foundational/12-describe-image-openai.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -1,180 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
moondream,
|
||||
tts,
|
||||
transport.output(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,36 +4,25 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -43,49 +32,6 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -93,14 +39,12 @@ transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
@@ -110,33 +54,34 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Anthropic for vision analysis
|
||||
anthropic = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
anthropic,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -151,16 +96,28 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
logger.info(f"Client connected")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
148
examples/foundational/12b-describe-image-aws.py
Normal file
148
examples/foundational/12b-describe-image-aws.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
# Note: usually, prefer providing latency="optimized" param.
|
||||
# Here we can't because AWS Bedrock doesn't support it for Claude 3.7,
|
||||
# which we need for image input.
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
141
examples/foundational/12c-describe-image-gemini-flash.py
Normal file
141
examples/foundational/12c-describe-image-gemini-flash.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
122
examples/foundational/12d-describe-image-moondream.py
Normal file
122
examples/foundational/12d-describe-image-moondream.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import UserImageRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
vision = MoondreamService()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
vision, # Vision
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Describe the image.
|
||||
image = Image.open(image_path)
|
||||
await task.queue_frames(
|
||||
[
|
||||
UserImageRawFrame(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,8 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -17,12 +15,13 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -39,34 +38,30 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Global variable to store the client ID
|
||||
client_id = ""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams):
|
||||
location = params.arguments["location"]
|
||||
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
|
||||
|
||||
|
||||
async def get_image(params: FunctionCallParams):
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request the image frame
|
||||
await params.llm.request_image_frame(
|
||||
user_id=client_id,
|
||||
function_name=params.function_name,
|
||||
tool_call_id=params.tool_call_id,
|
||||
text_content=question,
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
# Wait a short time for the frame to be processed
|
||||
await asyncio.sleep(0.5)
|
||||
await params.result_callback(None)
|
||||
|
||||
# Return a result to complete the function call
|
||||
await params.result_callback(
|
||||
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
|
||||
)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -100,70 +95,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
model="claude-3-7-sonnet-latest",
|
||||
params=AnthropicLLMService.InputParams(enable_prompt_caching=True),
|
||||
)
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_image", get_image)
|
||||
# Anthropic for vision analysis
|
||||
llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_weather",
|
||||
description="Get the current weather",
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"location": {
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
get_image_function = FunctionSchema(
|
||||
name="get_image",
|
||||
description="Get an image from the video stream.",
|
||||
properties={
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image.",
|
||||
}
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["question"],
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_image_function])
|
||||
|
||||
system_prompt = """\
|
||||
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
|
||||
|
||||
Your response will be turned into speech so use only simple words and punctuation.
|
||||
|
||||
You have access to two tools: get_weather and get_image.
|
||||
|
||||
You can respond to questions about the weather using the get_weather tool.
|
||||
|
||||
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
|
||||
indicate you should use the get_image tool are:
|
||||
- What do you see?
|
||||
- What's in the video?
|
||||
- Can you describe the video?
|
||||
- Tell me about what you see.
|
||||
- Tell me something interesting about what you see.
|
||||
- What's happening in the video?
|
||||
|
||||
If you need to use a tool, simply use the tool. Do not tell the user the tool you are using. Be brief and concise.
|
||||
"""
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": system_prompt,
|
||||
}
|
||||
],
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
{"role": "user", "content": "Start the conversation by introducing yourself."},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
@@ -173,11 +130,11 @@ If you need to use a tool, simply use the tool. Do not tell the user the tool yo
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User speech to text
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses and tool context
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -196,10 +153,16 @@ If you need to use a tool, simply use the tool. Do not tell the user the tool yo
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
global client_id
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -5,29 +5,23 @@
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -37,54 +31,37 @@ from pipecat.runner.utils import (
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
await params.result_callback(None)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
# Note: AWS Bedrock does not yet support the universal LLMContext
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -111,17 +88,15 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# AWS for vision analysis
|
||||
aws = AWSBedrockLLMService(
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
# Note: usually, prefer providing latency="optimized" param.
|
||||
@@ -129,22 +104,44 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# which we need for image input.
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
aws,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -165,10 +162,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
@@ -5,29 +5,23 @@
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -37,53 +31,37 @@ from pipecat.runner.utils import (
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
await params.result_callback(None)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -110,33 +88,53 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Google Gemini model for vision analysis
|
||||
google = GoogleLLMService(model="gemini-2.0-flash-001", api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# Google Gemini model for vision analysis
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
google,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -157,10 +155,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
190
examples/foundational/14d-function-calling-moondream-video.py
Normal file
190
examples/foundational/14d-function-calling-moondream-video.py
Normal file
@@ -0,0 +1,190 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image.
|
||||
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request a user image frame. In this case, we don't want the requested
|
||||
# image to be added to the context because we will process it with
|
||||
# Moondream.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
await params.result_callback(None)
|
||||
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
ParallelPipeline(
|
||||
[llm], # LLM
|
||||
[moondream],
|
||||
),
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -5,7 +5,6 @@
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -17,12 +16,13 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -39,34 +39,30 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Global variable to store the client ID
|
||||
client_id = ""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams):
|
||||
location = params.arguments["location"]
|
||||
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
|
||||
|
||||
|
||||
async def get_image(params: FunctionCallParams):
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request the image frame
|
||||
await params.llm.request_image_frame(
|
||||
user_id=client_id,
|
||||
function_name=params.function_name,
|
||||
tool_call_id=params.tool_call_id,
|
||||
text_content=question,
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
# Wait a short time for the frame to be processed
|
||||
await asyncio.sleep(0.5)
|
||||
await params.result_callback(None)
|
||||
|
||||
# Return a result to complete the function call
|
||||
await params.result_callback(
|
||||
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
|
||||
)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -101,58 +97,30 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_image", get_image)
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_weather",
|
||||
description="Get the current weather",
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"location": {
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
get_image_function = FunctionSchema(
|
||||
name="get_image",
|
||||
description="Get an image from the video stream.",
|
||||
properties={
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image.",
|
||||
}
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["question"],
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_image_function])
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
system_prompt = """\
|
||||
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
|
||||
|
||||
Your response will be turned into speech so use only simple words and punctuation.
|
||||
|
||||
You have access to two tools: get_weather and get_image.
|
||||
|
||||
You can respond to questions about the weather using the get_weather tool.
|
||||
|
||||
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
|
||||
indicate you should use the get_image tool are:
|
||||
- What do you see?
|
||||
- What's in the video?
|
||||
- Can you describe the video?
|
||||
- Tell me about what you see.
|
||||
- Tell me something interesting about what you see.
|
||||
- What's happening in the video?
|
||||
"""
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
@@ -160,13 +128,13 @@ indicate you should use the get_image tool are:
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -185,10 +153,15 @@ indicate you should use the get_image tool are:
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
global client_id
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -75,7 +75,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# text_filters=[MarkdownTextFilter()],
|
||||
)
|
||||
|
||||
llm = NimLLMService(api_key=os.getenv("NVIDIA_API_KEY"), model="meta/llama-3.3-70b-instruct")
|
||||
llm = NimLLMService(
|
||||
api_key=os.getenv("NVIDIA_API_KEY"),
|
||||
model="nvidia/llama-3.3-nemotron-super-49b-v1.5",
|
||||
# Recommended when turning thinking off
|
||||
params=NimLLMService.InputParams(temperature=0.0),
|
||||
)
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
@@ -102,6 +107,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
messages = [
|
||||
# Disable thinking by sending this message first
|
||||
# Check the model for the corresponding "no thinking" message
|
||||
{"role": "system", "content": "/no_think"},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
|
||||
@@ -79,8 +79,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-5-haiku-20241022-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8, latency="optimized"),
|
||||
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
|
||||
@@ -4,9 +4,39 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime API Example with Mem0 Memory Integration.
|
||||
|
||||
This example demonstrates how to use OpenAI's Realtime API with Pipecat
|
||||
for conversational AI with memory capabilities using Mem0.
|
||||
|
||||
The example:
|
||||
1. Sets up a real-time audio conversation using OpenAI's Realtime API
|
||||
2. Uses Mem0 to store and retrieve memories from conversations
|
||||
3. Creates personalized greetings based on previous interactions
|
||||
4. Demonstrates function calling capabilities
|
||||
5. Shows how to add tools dynamically at runtime
|
||||
|
||||
Example usage (run from pipecat root directory):
|
||||
$ pip install "pipecat-ai[daily,openai,mem0]"
|
||||
$ python examples/foundational/19-openai-realtime.py
|
||||
|
||||
Requirements:
|
||||
- OpenAI API key (for Realtime API)
|
||||
- Daily API key (for video/audio transport)
|
||||
- Mem0 API key (for cloud-based memory storage)
|
||||
- [Optional] Deepgram API key (for STT fallback)
|
||||
|
||||
Environment variables (set in .env or in your terminal using `export`):
|
||||
DAILY_SAMPLE_ROOM_URL=daily_sample_room_url
|
||||
DAILY_API_KEY=daily_api_key
|
||||
OPENAI_API_KEY=openai_api_key
|
||||
MEM0_API_KEY=mem0_api_key
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Union
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
@@ -14,16 +44,19 @@ from loguru import logger
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.frames.frames import LLMRunFrame, LLMSetToolsFrame, TranscriptionMessage
|
||||
from pipecat.observers.loggers.transcription_log_observer import TranscriptionLogObserver
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.mem0.memory import Mem0MemoryService
|
||||
from pipecat.services.openai.realtime.events import (
|
||||
AudioConfiguration,
|
||||
AudioInput,
|
||||
@@ -39,6 +72,64 @@ from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
try:
|
||||
from mem0 import Memory, MemoryClient # noqa: F401
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Mem0, you need to `pip install mem0ai`. Also, set the environment variable MEM0_API_KEY."
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
async def get_initial_greeting(
|
||||
memory_client: Union[MemoryClient, Memory], user_id: str, agent_id: str, run_id: str
|
||||
) -> str:
|
||||
"""Fetch all memories for the user and create a personalized greeting.
|
||||
|
||||
Returns:
|
||||
A personalized greeting based on user memories
|
||||
"""
|
||||
try:
|
||||
if isinstance(memory_client, Memory):
|
||||
filters = {"user_id": user_id, "agent_id": agent_id, "run_id": run_id}
|
||||
filters = {k: v for k, v in filters.items() if v is not None}
|
||||
memories = memory_client.get_all(**filters)
|
||||
else:
|
||||
# Create filters based on available IDs
|
||||
id_pairs = [("user_id", user_id), ("agent_id", agent_id), ("run_id", run_id)]
|
||||
clauses = [{name: value} for name, value in id_pairs if value is not None]
|
||||
filters = {"AND": clauses} if clauses else {}
|
||||
|
||||
# Get all memories for this user
|
||||
memories = memory_client.get_all(filters=filters, version="v2", output_format="v1.1")
|
||||
|
||||
if not memories or len(memories) == 0:
|
||||
logger.debug(f"!!! No memories found for this user. {memories}")
|
||||
return "Hello! It's nice to meet you. How can I help you today?"
|
||||
|
||||
# Create a personalized greeting based on memories
|
||||
greeting = "Hello! It's great to see you again. "
|
||||
|
||||
# Add some personalization based on memories (limit to 3 memories for brevity)
|
||||
if len(memories) > 0:
|
||||
greeting += "Based on our previous conversations, I remember: "
|
||||
for i, memory in enumerate(memories["results"][:3], 1):
|
||||
memory_content = memory.get("memory", "")
|
||||
# Keep memory references brief
|
||||
if len(memory_content) > 100:
|
||||
memory_content = memory_content[:97] + "..."
|
||||
greeting += f"{memory_content} "
|
||||
|
||||
greeting += "How can I help you today?"
|
||||
|
||||
logger.debug(f"Created personalized greeting from {len(memories)} memories")
|
||||
return greeting
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error retrieving initial memories from Mem0: {e}")
|
||||
return "Hello! How can I help you today?"
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
|
||||
@@ -52,6 +143,18 @@ async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
)
|
||||
|
||||
|
||||
async def get_news(params: FunctionCallParams):
|
||||
await params.result_callback(
|
||||
{
|
||||
"news": [
|
||||
"Massive UFO currently hovering above New York City",
|
||||
"Stock markets reach all-time highs",
|
||||
"Living dinosaur species discovered in the Amazon rainforest",
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
@@ -73,6 +176,13 @@ weather_function = FunctionSchema(
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
get_news_function = FunctionSchema(
|
||||
name="get_news",
|
||||
description="Get the current news.",
|
||||
properties={},
|
||||
required=[],
|
||||
)
|
||||
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
@@ -112,8 +222,62 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Note: You can pass the user_id as a parameter in API call
|
||||
USER_ID = "pipecat-realtime-user"
|
||||
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# =====================================================================
|
||||
# OPTION 1: Using Mem0 API (cloud-based approach)
|
||||
# This approach uses Mem0's cloud service for memory management
|
||||
# Requires: MEM0_API_KEY set in your environment
|
||||
# =====================================================================
|
||||
memory = Mem0MemoryService(
|
||||
api_key=os.getenv("MEM0_API_KEY"), # Your Mem0 API key
|
||||
user_id=USER_ID, # Unique identifier for the user
|
||||
agent_id="realtime-agent", # Optional identifier for the agent
|
||||
run_id="realtime-session", # Optional identifier for the run
|
||||
params=Mem0MemoryService.InputParams(
|
||||
search_limit=10,
|
||||
search_threshold=0.3,
|
||||
api_version="v2",
|
||||
system_prompt="Based on previous conversations, I recall: \n\n",
|
||||
add_as_system_message=True,
|
||||
position=1,
|
||||
),
|
||||
)
|
||||
|
||||
# =====================================================================
|
||||
# OPTION 2: Using Mem0 with local configuration (self-hosted approach)
|
||||
# This approach uses a local LLM configuration for memory management
|
||||
# Requires: Anthropic API key if using Claude model
|
||||
# =====================================================================
|
||||
# Uncomment the following code and comment out the previous memory initialization to use local config
|
||||
|
||||
# local_config = {
|
||||
# "llm": {
|
||||
# "provider": "anthropic",
|
||||
# "config": {
|
||||
# "model": "claude-3-5-sonnet-20240620",
|
||||
# "api_key": os.getenv("ANTHROPIC_API_KEY"), # Make sure to set this in your .env
|
||||
# }
|
||||
# },
|
||||
# "embedder": {
|
||||
# "provider": "openai",
|
||||
# "config": {
|
||||
# "model": "text-embedding-3-large"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
# # Initialize Mem0 memory service with local configuration
|
||||
# memory = Mem0MemoryService(
|
||||
# local_config=local_config, # Use local LLM for memory processing
|
||||
# user_id=USER_ID, # Unique identifier for the user
|
||||
# # agent_id="realtime-agent", # Optional identifier for the agent
|
||||
# # run_id="realtime-session", # Optional identifier for the run
|
||||
# )
|
||||
|
||||
session_properties = SessionProperties(
|
||||
audio=AudioConfiguration(
|
||||
input=AudioInput(
|
||||
@@ -127,7 +291,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
),
|
||||
# tools=tools,
|
||||
instructions="""You are a helpful and friendly AI.
|
||||
instructions="""You are a helpful and friendly AI with memory capabilities.
|
||||
|
||||
Act like a human, but remember that you aren't a human and that you can't do human
|
||||
things in the real world. Your voice and personality should be warm and engaging, with a lively and
|
||||
@@ -140,9 +304,8 @@ even if you're asked about them.
|
||||
You are participating in a voice conversation. Keep your responses concise, short, and to the point
|
||||
unless specifically asked to elaborate on a topic.
|
||||
|
||||
You have access to the following tools:
|
||||
- get_current_weather: Get the current weather for a given location.
|
||||
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
|
||||
You can remember things about the person you are talking to. If the user asks you to remember
|
||||
something, make sure to remember it. Greet the user by their name if you know about it.
|
||||
|
||||
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
|
||||
)
|
||||
@@ -157,25 +320,28 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# llm.register_function(None, fetch_weather_from_api)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("get_news", get_news)
|
||||
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
# We'll add the initial greeting message after getting memories
|
||||
context = LLMContext(
|
||||
[],
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(),
|
||||
transcript.user(), # LLM pushes TranscriptionFrames upstream
|
||||
memory, # Mem0 memory service
|
||||
llm, # LLM
|
||||
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # After the transcript output, to time with the audio output
|
||||
context_aggregator.assistant(),
|
||||
@@ -195,9 +361,28 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
# Get personalized greeting based on user memories
|
||||
greeting = await get_initial_greeting(
|
||||
memory_client=memory.memory_client,
|
||||
user_id=USER_ID,
|
||||
agent_id="realtime-agent",
|
||||
run_id="realtime-session",
|
||||
)
|
||||
|
||||
# Add the greeting as a user message to start the conversation
|
||||
context.add_message({"role": "user", "content": greeting})
|
||||
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
# Add a new tool at runtime after a delay.
|
||||
await asyncio.sleep(15)
|
||||
new_tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function, get_news_function]
|
||||
)
|
||||
await task.queue_frames([LLMSetToolsFrame(tools=new_tools)])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
@@ -18,7 +18,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
@@ -155,10 +157,10 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# Create a standard LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeBetaLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
# [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
|
||||
# [
|
||||
@@ -173,7 +175,7 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -18,7 +18,8 @@ from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -169,20 +170,20 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(),
|
||||
transcript.user(), # LLM pushes TranscriptionFrames upstream
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # After the transcript output, to time with the audio output
|
||||
context_aggregator.assistant(),
|
||||
|
||||
@@ -13,14 +13,15 @@ from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -69,11 +70,11 @@ async def save_conversation(params: FunctionCallParams):
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
|
||||
filename = f"{BASE_FILENAME}{timestamp}.json"
|
||||
logger.debug(
|
||||
f"writing conversation to {filename}\n{json.dumps(params.context.messages, indent=4)}"
|
||||
f"writing conversation to {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
)
|
||||
try:
|
||||
with open(filename, "w") as file:
|
||||
messages = params.context.get_messages_for_persistent_storage()
|
||||
messages = params.context.get_messages()
|
||||
# remove the last message, which is the instruction we just gave to save the conversation
|
||||
messages.pop()
|
||||
json.dump(messages, file, indent=2)
|
||||
@@ -90,6 +91,10 @@ async def load_conversation(params: FunctionCallParams):
|
||||
with open(filename, "r") as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
await params.llm.reset_conversation()
|
||||
# NOTE: we manually create a response here rather than relying
|
||||
# on the function callback to trigger one since we've reset the
|
||||
# conversation so the remote service doesn't know about the
|
||||
# in-progress tool call.
|
||||
await params.llm._create_response()
|
||||
except Exception as e:
|
||||
await params.result_callback({"success": False, "error": str(e)})
|
||||
@@ -97,14 +102,12 @@ async def load_conversation(params: FunctionCallParams):
|
||||
asyncio.create_task(_reset())
|
||||
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[
|
||||
FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
@@ -115,45 +118,33 @@ tools = [
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
"required": ["location", "format"],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "save_conversation",
|
||||
"description": "Save the current conversatione. Use this function to persist the current conversation to external storage.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": [],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_saved_conversation_filenames",
|
||||
"description": "Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": [],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "load_conversation",
|
||||
"description": "Load a conversation history. Use this function to load a conversation history into the current session.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
required=["location", "format"],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="save_conversation",
|
||||
description="Save the current conversatione. Use this function to persist the current conversation to external storage.",
|
||||
properties={},
|
||||
required=[],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="get_saved_conversation_filenames",
|
||||
description="Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.",
|
||||
properties={},
|
||||
required=[],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="load_conversation",
|
||||
description="Load a conversation history. Use this function to load a conversation history into the current session.",
|
||||
properties={
|
||||
"filename": {
|
||||
"type": "string",
|
||||
"description": "The filename of the conversation history to load.",
|
||||
}
|
||||
},
|
||||
"required": ["filename"],
|
||||
},
|
||||
},
|
||||
]
|
||||
required=["filename"],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -224,8 +215,8 @@ Remember, your responses should be short. Just one or two sentences, usually."""
|
||||
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
|
||||
llm.register_function("load_conversation", load_conversation)
|
||||
|
||||
context = OpenAILLMContext([], tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext([{"role": "user", "content": "Say hello!"}], tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -72,7 +74,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# inference_on_context_initialization=False,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -90,7 +92,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# },
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -139,10 +141,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
# You can provide the system instructions and tools in the context rather
|
||||
# than as arguments to GeminiLiveLLMService, but note that doing so will
|
||||
# trigger a (fast) reconnection when the GeminiLiveLLMService first
|
||||
# receives the context (i.e. when we send the LLMRunFrame below).
|
||||
context = LLMContext(
|
||||
[
|
||||
# {"role": "system", "content": system_instruction},
|
||||
{"role": "user", "content": "Say hello."},
|
||||
],
|
||||
# tools,
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -17,7 +17,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -65,7 +67,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# inference_on_context_initialization=False,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -73,7 +75,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
},
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,8 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -109,8 +110,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set up conversation context and management
|
||||
# The context_aggregator will automatically collect conversation context
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -90,7 +92,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -98,7 +100,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
}
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -129,7 +131,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mime_type = "text/plain"
|
||||
|
||||
# Create context with file reference
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -152,7 +154,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
except Exception as e:
|
||||
logger.error(f"Error uploading file: {e}")
|
||||
# Continue with a basic context if file upload fails
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -162,7 +164,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
# Create context aggregator
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Build the pipeline
|
||||
pipeline = Pipeline(
|
||||
|
||||
@@ -10,7 +10,9 @@ from pipecat.frames.frames import Frame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -124,8 +126,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
# Set up conversation context and management
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -9,21 +9,21 @@ import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import HttpOptions
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -139,10 +139,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext([{"role": "user", "content": "Say hello."}])
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -18,7 +18,9 @@ from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -62,7 +64,7 @@ You have three tools available to you:
|
||||
|
||||
After you've responded to the user three times, do two things, in order:
|
||||
1. Politely let them know that that's all the time you have today and say goodbye.
|
||||
2. Call the end_conversation tool to gracefully end the conversation.
|
||||
2. *WITHOUT WAITING FOR THE USER TO RESPOND*, call the end_conversation tool to gracefully end the conversation.
|
||||
"""
|
||||
|
||||
|
||||
@@ -152,10 +154,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("end_conversation", end_conversation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -9,7 +9,6 @@ import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from simli import SimliConfig
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
@@ -66,11 +65,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="a167e0f3-df7e-4d52-a9c3-f949145efdab",
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
|
||||
)
|
||||
|
||||
simli_ai = SimliVideoService(
|
||||
SimliConfig(os.getenv("SIMLI_API_KEY"), os.getenv("SIMLI_FACE_ID")),
|
||||
api_key=os.getenv("SIMLI_API_KEY"),
|
||||
face_id="cace3ef7-a4c4-425d-a8cf-a5358eb0c427",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini")
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
@@ -63,10 +64,12 @@ class UrlToImageProcessor(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def extract_url(self, text: str):
|
||||
pattern = r"!\[[^\]]*\]\((https?://[^)]+\.(png|jpg|jpeg|PNG|JPG|JPEG))\)"
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
data = json.loads(text)
|
||||
if "artObject" in data:
|
||||
return data["artObject"]["webImage"]["url"]
|
||||
if "artworks" in data and len(data["artworks"]):
|
||||
return data["artworks"][0]["webImage"]["url"]
|
||||
|
||||
return None
|
||||
|
||||
async def run_image_process(self, image_url: str):
|
||||
@@ -130,9 +133,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mcp = MCPClient(
|
||||
server_params=StdioServerParameters(
|
||||
command=shutil.which("npx"),
|
||||
args=["-y", "@programcomputer/nasa-mcp-server@latest"],
|
||||
# https://api.nasa.gov
|
||||
env={"NASA_API_KEY": os.getenv("NASA_API_KEY")},
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-rijksmuseum"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -141,15 +144,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
mcp_image = UrlToImageProcessor(aiohttp_session=session)
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to a number of tools provided by NASA MCP. Use any and all tools to help users.
|
||||
When asked for the astronomy picture of the day, PASS in NO date to the API.
|
||||
This ensures we get the latest picture available. If as specific date is asked for, you
|
||||
can pass in that date to the API.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
Don't overexplain what you are doing.
|
||||
@@ -206,14 +214,13 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY"):
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY environment variable for this example. See https://api.nasa.gov"
|
||||
f"Please set RIJKSMUSEUM_API_KEY environment variable for this example. See https://github.com/r-huijts/rijksmuseum-mcp and https://www.rijksmuseum.nl/en/register?redirectUrl=https://www.https://www.rijksmuseum.nl/en/rijksstudio/my/profile"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -79,7 +79,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
|
||||
@@ -132,9 +132,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to a number of tools provided by NASA MCP. Use any and all tools to help users.
|
||||
When asked for today's date, use 'https://www.datetoday.net/'.
|
||||
When asked for the astronomy picture of the day, use 'https://www.datetoday.net/', to get today's date.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
Don't overexplain what you are doing.
|
||||
@@ -147,13 +148,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mcp = MCPClient(
|
||||
server_params=StdioServerParameters(
|
||||
command=shutil.which("npx"),
|
||||
args=["-y", "@programcomputer/nasa-mcp-server@latest"],
|
||||
# https://api.nasa.gov
|
||||
env={"NASA_API_KEY": os.getenv("NASA_API_KEY")},
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-error setting up mcp"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up nasa mcp")
|
||||
logger.error(f"error setting up rijksmuseum mcp")
|
||||
logger.exception("error trace:")
|
||||
try:
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
@@ -164,8 +165,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp.run")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
run_tools = await mcp_run.register_tools(llm)
|
||||
tools = {}
|
||||
run_tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
run_tools = await mcp_run.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
all_standard_tools = run_tools.standard_tools + tools.standard_tools
|
||||
all_tools = ToolsSchema(standard_tools=all_standard_tools)
|
||||
@@ -219,9 +226,9 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY and MCP_RUN_SSE_URL environment variables. See https://api.nasa.gov and https://mcp.run"
|
||||
f"Please set RIJKSMUSEUM_API_KEY and MCP_RUN_SSE_URL environment variables. See https://github.com/r-huijts/rijksmuseum-mcp and https://mcp.run"
|
||||
)
|
||||
import sys
|
||||
|
||||
|
||||
@@ -85,7 +85,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
|
||||
@@ -15,7 +15,9 @@ from pipecat.frames.frames import Frame, InputImageRawFrame, LLMRunFrame, Output
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
@@ -108,8 +110,8 @@ async def run_bot(pipecat_transport):
|
||||
}
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# RTVI events for Pipecat client UI
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
153
examples/foundational/48-service-switcher.py
Normal file
153
examples/foundational/48-service-switcher.py
Normal file
@@ -0,0 +1,153 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ManuallySwitchServiceFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.service_switcher import ServiceSwitcher, ServiceSwitcherStrategyManual
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt_cartesia = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
stt_deepgram = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt_switcher = ServiceSwitcher(
|
||||
services=[stt_cartesia, stt_deepgram], strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
|
||||
tts_cartesia = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
|
||||
)
|
||||
tts_deepgram = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
tts_switcher = ServiceSwitcher(
|
||||
services=[tts_cartesia, tts_deepgram], strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
|
||||
llm_openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_google = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm_switcher = ServiceSwitcher(
|
||||
services=[llm_openai, llm_google], strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt_switcher,
|
||||
context_aggregator.user(), # User responses
|
||||
llm_switcher, # LLM
|
||||
tts_switcher, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
await asyncio.sleep(15)
|
||||
print(f"Switching to {stt_deepgram}")
|
||||
await task.queue_frames([ManuallySwitchServiceFrame(service=stt_deepgram)])
|
||||
await asyncio.sleep(15)
|
||||
print(f"Switching to {llm_google}")
|
||||
await task.queue_frames([ManuallySwitchServiceFrame(service=llm_google)])
|
||||
await asyncio.sleep(15)
|
||||
print(f"Switching to {tts_deepgram}")
|
||||
await task.queue_frames([ManuallySwitchServiceFrame(service=tts_deepgram)])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
BIN
examples/foundational/assets/cat.jpg
Normal file
BIN
examples/foundational/assets/cat.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 63 KiB |
@@ -73,13 +73,13 @@ Transform your local bot into a production-ready service. Pipecat Cloud handles
|
||||
|
||||
1. [Sign up for Pipecat Cloud](https://pipecat.daily.co/sign-up).
|
||||
|
||||
2. Install the Pipecat Cloud CLI:
|
||||
2. Install the Pipecat CLI:
|
||||
|
||||
```bash
|
||||
uv add pipecatcloud
|
||||
uv tool install pipecat-ai-cli
|
||||
```
|
||||
|
||||
> 💡 Tip: You can run the `pipecatcloud` CLI using the `pcc` alias.
|
||||
> 💡 Tip: You can run the `pipecat` CLI using the `pc` alias.
|
||||
|
||||
3. Set up Docker for building your bot image:
|
||||
|
||||
@@ -113,12 +113,22 @@ secret_set = "quickstart-secrets"
|
||||
|
||||
> 💡 Tip: [Set up `image_credentials`](https://docs.pipecat.ai/deployment/pipecat-cloud/fundamentals/secrets#image-pull-secrets) in your TOML file for authenticated image pulls
|
||||
|
||||
### Log in to Pipecat Cloud
|
||||
|
||||
To start using the CLI, authenticate to Pipecat Cloud:
|
||||
|
||||
```bash
|
||||
pipecat cloud auth login
|
||||
```
|
||||
|
||||
You'll be presented with a link that you can click to authenticate your client.
|
||||
|
||||
### Configure secrets
|
||||
|
||||
Upload your API keys to Pipecat Cloud's secure storage:
|
||||
|
||||
```bash
|
||||
uv run pcc secrets set quickstart-secrets --file .env
|
||||
pipecat cloud secrets set quickstart-secrets --file .env
|
||||
```
|
||||
|
||||
This creates a secret set called `quickstart-secrets` (matching your TOML file) and uploads all your API keys from `.env`.
|
||||
@@ -128,13 +138,13 @@ This creates a secret set called `quickstart-secrets` (matching your TOML file)
|
||||
Build your Docker image and push to Docker Hub:
|
||||
|
||||
```bash
|
||||
uv run pcc docker build-push
|
||||
pipecat cloud docker build-push
|
||||
```
|
||||
|
||||
Deploy to Pipecat Cloud:
|
||||
|
||||
```bash
|
||||
uv run pcc deploy
|
||||
pipecat cloud deploy
|
||||
```
|
||||
|
||||
### Connect to your agent
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
agent_name = "quickstart"
|
||||
image = "your_username/quickstart:0.1"
|
||||
secret_set = "quickstart-secrets"
|
||||
agent_profile = "agent-1x"
|
||||
|
||||
# RECOMMENDED: Set an image pull secret:
|
||||
# https://docs.pipecat.ai/deployment/pipecat-cloud/fundamentals/secrets#image-pull-secrets
|
||||
# image_credentials = "your_image_pull_secret"
|
||||
|
||||
[scaling]
|
||||
min_agents = 1
|
||||
|
||||
@@ -4,13 +4,14 @@ version = "0.1.0"
|
||||
description = "Quickstart example for building voice AI bots with Pipecat"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]>=0.0.86",
|
||||
"pipecatcloud>=0.2.4"
|
||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]",
|
||||
"pipecat-ai-cli"
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff~=0.12.1",
|
||||
"pyright>=1.1.404,<2",
|
||||
"ruff>=0.12.11,<1",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
|
||||
@@ -50,12 +50,12 @@ anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
aws = [ "aioboto3~=15.0.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.0; python_version>='3.12'" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.1.1; python_version>='3.12'" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
cerebras = []
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.20.0" ]
|
||||
daily = [ "daily-python~=0.21.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
@@ -93,7 +93,7 @@ rime = [ "pipecat-ai[websockets-base]" ]
|
||||
riva = [ "nvidia-riva-client~=2.21.1" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.117.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"]
|
||||
sambanova = []
|
||||
sarvam = [ "pipecat-ai[websockets-base]" ]
|
||||
sarvam = [ "sarvamai==0.1.21", "pipecat-ai[websockets-base]" ]
|
||||
sentry = [ "sentry-sdk>=2.28.0,<3" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ]
|
||||
local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1,<2" ]
|
||||
|
||||
@@ -10,9 +10,10 @@ import os
|
||||
import re
|
||||
import time
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import aiofiles
|
||||
from deepgram import LiveOptions
|
||||
@@ -53,6 +54,14 @@ EVAL_TIMEOUT_SECS = 120
|
||||
EvalPrompt = str | Tuple[str, ImageFile]
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvalConfig:
|
||||
prompt: EvalPrompt
|
||||
eval: str
|
||||
eval_speaks_first: bool = False
|
||||
runner_args_body: Optional[Any] = None
|
||||
|
||||
|
||||
class EvalRunner:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -93,9 +102,7 @@ class EvalRunner:
|
||||
async def run_eval(
|
||||
self,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
eval_config: EvalConfig,
|
||||
):
|
||||
if not re.match(self._pattern, example_file):
|
||||
return
|
||||
@@ -112,10 +119,8 @@ class EvalRunner:
|
||||
|
||||
try:
|
||||
tasks = [
|
||||
asyncio.create_task(run_example_pipeline(script_path)),
|
||||
asyncio.create_task(
|
||||
run_eval_pipeline(self, example_file, prompt, eval, user_speaks_first)
|
||||
),
|
||||
asyncio.create_task(run_example_pipeline(script_path, eval_config)),
|
||||
asyncio.create_task(run_eval_pipeline(self, example_file, eval_config)),
|
||||
]
|
||||
_, pending = await asyncio.wait(tasks, timeout=EVAL_TIMEOUT_SECS)
|
||||
if pending:
|
||||
@@ -177,7 +182,7 @@ class EvalRunner:
|
||||
return os.path.join(self._recordings_dir, f"{base_name}.wav")
|
||||
|
||||
|
||||
async def run_example_pipeline(script_path: Path):
|
||||
async def run_example_pipeline(script_path: Path, eval_config: EvalConfig):
|
||||
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL")
|
||||
|
||||
module = load_module_from_path(script_path)
|
||||
@@ -196,6 +201,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
|
||||
runner_args = RunnerArguments()
|
||||
runner_args.pipeline_idle_timeout_secs = PIPELINE_IDLE_TIMEOUT_SECS
|
||||
runner_args.body = eval_config.runner_args_body
|
||||
|
||||
await module.run_bot(transport, runner_args)
|
||||
|
||||
@@ -203,9 +209,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
async def run_eval_pipeline(
|
||||
eval_runner: EvalRunner,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
eval_config: EvalConfig,
|
||||
):
|
||||
logger.info(f"Starting eval bot")
|
||||
|
||||
@@ -262,17 +266,16 @@ async def run_eval_pipeline(
|
||||
# Load example prompt depending on image.
|
||||
example_prompt = ""
|
||||
example_image: Optional[ImageFile] = None
|
||||
if isinstance(prompt, str):
|
||||
example_prompt = prompt
|
||||
elif isinstance(prompt, tuple):
|
||||
example_prompt, example_image = prompt
|
||||
if isinstance(eval_config.prompt, str):
|
||||
example_prompt = eval_config.prompt
|
||||
elif isinstance(eval_config.prompt, tuple):
|
||||
example_prompt, example_image = eval_config.prompt
|
||||
|
||||
eval_prompt = f"The answer is correct if it matches: {eval}."
|
||||
common_system_prompt = (
|
||||
"The user might say things other than the answer and that's allowed. "
|
||||
f"You should only call the eval function with your assessment when the user actually answers the question. {eval_prompt}"
|
||||
f"You should only call the eval function when the user: {eval_config.eval}"
|
||||
)
|
||||
if user_speaks_first:
|
||||
if eval_config.eval_speaks_first:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
else:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}"
|
||||
@@ -330,9 +333,9 @@ async def run_eval_pipeline(
|
||||
|
||||
# Default behavior is for the bot to speak first
|
||||
# If the eval bot speaks first, we append the prompt to the messages
|
||||
if user_speaks_first:
|
||||
if eval_config.eval_speaks_first:
|
||||
messages.append(
|
||||
{"role": "user", "content": f"Start by saying this exactly: '{prompt}'"}
|
||||
{"role": "user", "content": f"Start by saying this exactly: '{eval_config.prompt}'"}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from eval import EvalRunner
|
||||
from eval import EvalConfig, EvalRunner
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from utils import check_env_variables
|
||||
@@ -24,189 +24,184 @@ ASSETS_DIR = SCRIPT_DIR / "assets"
|
||||
|
||||
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
|
||||
|
||||
# Speaking order constants
|
||||
USER_SPEAKS_FIRST = True
|
||||
BOT_SPEAKS_FIRST = False
|
||||
|
||||
# Math
|
||||
PROMPT_SIMPLE_MATH = "A simple math addition."
|
||||
EVAL_SIMPLE_MATH = "Correct math addition."
|
||||
|
||||
# Weather
|
||||
PROMPT_WEATHER = "What's the weather in San Francisco?"
|
||||
EVAL_WEATHER = (
|
||||
"Something specific about the current weather in San Francisco, including the degrees."
|
||||
EVAL_SIMPLE_MATH = EvalConfig(
|
||||
prompt="A simple math addition.",
|
||||
eval="The user answers the math addition correctly.",
|
||||
)
|
||||
|
||||
# Online search
|
||||
PROMPT_ONLINE_SEARCH = "What's the date right now in London?"
|
||||
EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y')}."
|
||||
EVAL_WEATHER = EvalConfig(
|
||||
prompt="What's the weather in San Francisco?",
|
||||
eval="The user says something specific about the current weather in San Francisco, including the degrees.",
|
||||
)
|
||||
|
||||
# Switch language
|
||||
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
|
||||
EVAL_SWITCH_LANGUAGE = "The user is now talking in Spanish."
|
||||
EVAL_ONLINE_SEARCH = EvalConfig(
|
||||
prompt="What's the date right now in London?",
|
||||
eval=f"The user says today is {datetime.now(timezone.utc).strftime('%B %d, %Y')} in London.",
|
||||
)
|
||||
|
||||
# Vision
|
||||
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
|
||||
EVAL_VISION = "A cat description."
|
||||
EVAL_SWITCH_LANGUAGE = EvalConfig(
|
||||
prompt="Say something in Spanish.",
|
||||
eval="The user talks in Spanish.",
|
||||
)
|
||||
|
||||
EVAL_VISION_CAMERA = EvalConfig(
|
||||
prompt=("Briefly describe what you see.", Image.open(ASSETS_DIR / "cat.jpg")),
|
||||
eval="The user provides a cat description.",
|
||||
)
|
||||
|
||||
|
||||
def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
|
||||
return EvalConfig(
|
||||
prompt="Briefly describe this image.",
|
||||
eval="The user provides a cat description.",
|
||||
eval_speaks_first=eval_speaks_first,
|
||||
runner_args_body={
|
||||
"image_path": ASSETS_DIR / "cat.jpg",
|
||||
"question": "Briefly describe this image.",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
EVAL_VOICEMAIL = EvalConfig(
|
||||
prompt="Please leave a message.",
|
||||
eval="The user leaves a voicemail message.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
EVAL_CONVERSATION = EvalConfig(
|
||||
prompt="Hello, this is Mark.",
|
||||
eval="The user replies with a greeting.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
# Voicemail
|
||||
PROMPT_VOICEMAIL = "Please leave a message after the beep."
|
||||
EVAL_VOICEMAIL = "Assess the conversation and determine if it is a voicemail."
|
||||
PROMPT_CONVERSATION = "Hello, this is Mark."
|
||||
EVAL_CONVERSATION = "A start of a conversation, not a voicemail."
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07d-interruptible-elevenlabs-http.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07m-interruptible-aws-strands.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07s-interruptible-google-audio-in.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07-interruptible.py", EVAL_SIMPLE_MATH),
|
||||
("07-interruptible-cartesia-http.py", EVAL_SIMPLE_MATH),
|
||||
("07a-interruptible-speechmatics.py", EVAL_SIMPLE_MATH),
|
||||
("07aa-interruptible-soniox.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
|
||||
("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
|
||||
("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
|
||||
("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
|
||||
("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),
|
||||
("07g-interruptible-openai.py", EVAL_SIMPLE_MATH),
|
||||
("07h-interruptible-openpipe.py", EVAL_SIMPLE_MATH),
|
||||
("07j-interruptible-gladia.py", EVAL_SIMPLE_MATH),
|
||||
("07k-interruptible-lmnt.py", EVAL_SIMPLE_MATH),
|
||||
("07l-interruptible-groq.py", EVAL_SIMPLE_MATH),
|
||||
("07m-interruptible-aws.py", EVAL_SIMPLE_MATH),
|
||||
("07m-interruptible-aws-strands.py", EVAL_WEATHER),
|
||||
("07n-interruptible-gemini.py", EVAL_SIMPLE_MATH),
|
||||
("07n-interruptible-google.py", EVAL_SIMPLE_MATH),
|
||||
("07o-interruptible-assemblyai.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime-http.py", EVAL_SIMPLE_MATH),
|
||||
("07r-interruptible-riva-nim.py", EVAL_SIMPLE_MATH),
|
||||
("07s-interruptible-google-audio-in.py", EVAL_SIMPLE_MATH),
|
||||
("07t-interruptible-fish.py", EVAL_SIMPLE_MATH),
|
||||
("07v-interruptible-neuphonic.py", EVAL_SIMPLE_MATH),
|
||||
("07v-interruptible-neuphonic-http.py", EVAL_SIMPLE_MATH),
|
||||
("07w-interruptible-fal.py", EVAL_SIMPLE_MATH),
|
||||
("07y-interruptible-minimax.py", EVAL_SIMPLE_MATH),
|
||||
("07z-interruptible-sarvam.py", EVAL_SIMPLE_MATH),
|
||||
("07ae-interruptible-hume.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a Krisp license.
|
||||
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07p-interruptible-krisp.py", EVAL_SIMPLE_MATH),
|
||||
# Needs GPU resources.
|
||||
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07u-interruptible-ultravox.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_12 = [
|
||||
("12-describe-video.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12-describe-image-openai.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12a-describe-image-anthropic.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12b-describe-image-aws.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12c-describe-image-gemini-flash.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12d-describe-image-moondream.py", EVAL_VISION_IMAGE()),
|
||||
]
|
||||
|
||||
TESTS_14 = [
|
||||
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14x-function-calling-openpipe.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14-function-calling.py", EVAL_WEATHER),
|
||||
("14a-function-calling-anthropic.py", EVAL_WEATHER),
|
||||
("14e-function-calling-google.py", EVAL_WEATHER),
|
||||
("14f-function-calling-groq.py", EVAL_WEATHER),
|
||||
("14g-function-calling-grok.py", EVAL_WEATHER),
|
||||
("14h-function-calling-azure.py", EVAL_WEATHER),
|
||||
("14i-function-calling-fireworks.py", EVAL_WEATHER),
|
||||
("14j-function-calling-nim.py", EVAL_WEATHER),
|
||||
("14k-function-calling-cerebras.py", EVAL_WEATHER),
|
||||
("14m-function-calling-openrouter.py", EVAL_WEATHER),
|
||||
("14n-function-calling-perplexity.py", EVAL_WEATHER),
|
||||
("14p-function-calling-gemini-vertex-ai.py", EVAL_WEATHER),
|
||||
("14q-function-calling-qwen.py", EVAL_WEATHER),
|
||||
("14r-function-calling-aws.py", EVAL_WEATHER),
|
||||
("14v-function-calling-openai.py", EVAL_WEATHER),
|
||||
("14w-function-calling-mistral.py", EVAL_WEATHER),
|
||||
("14x-function-calling-openpipe.py", EVAL_WEATHER),
|
||||
# Video
|
||||
("14d-function-calling-anthropic-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-aws-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-gemini-flash-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-moondream-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-openai-video.py", EVAL_VISION_CAMERA),
|
||||
# Currently not working.
|
||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14c-function-calling-together.py", EVAL_WEATHER),
|
||||
# ("14l-function-calling-deepseek.py", EVAL_WEATHER),
|
||||
# ("14o-function-calling-gemini-openai-format.py", EVAL_WEATHER),
|
||||
]
|
||||
|
||||
TESTS_15 = [
|
||||
("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE, BOT_SPEAKS_FIRST),
|
||||
("15a-switch-languages.py", EVAL_SWITCH_LANGUAGE),
|
||||
]
|
||||
|
||||
TESTS_19 = [
|
||||
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime.py", EVAL_WEATHER),
|
||||
("19-openai-realtime-beta.py", EVAL_WEATHER),
|
||||
# OpenAI Realtime not released on Azure yet
|
||||
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("19a-azure-realtime.py", EVAL_WEATHER),
|
||||
("19a-azure-realtime-beta.py", EVAL_WEATHER),
|
||||
("19b-openai-realtime-text.py", EVAL_WEATHER),
|
||||
("19b-openai-realtime-beta-text.py", EVAL_WEATHER),
|
||||
]
|
||||
|
||||
TESTS_21 = [
|
||||
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("21a-tavus-video-service.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_26 = [
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26a-gemini-live-transcription.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
(
|
||||
"26b-gemini-live-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26e-gemini-multimodal-google-search.py",
|
||||
PROMPT_ONLINE_SEARCH,
|
||||
EVAL_ONLINE_SEARCH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26-gemini-live.py", EVAL_SIMPLE_MATH),
|
||||
("26a-gemini-live-transcription.py", EVAL_SIMPLE_MATH),
|
||||
("26b-gemini-live-function-calling.py", EVAL_WEATHER),
|
||||
("26c-gemini-live-video.py", EVAL_SIMPLE_MATH),
|
||||
("26e-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
|
||||
("26h-gemini-live-vertex-function-calling.py", EVAL_WEATHER),
|
||||
# Currently not working.
|
||||
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26h-gemini-live-vertex-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
# ("26d-gemini-live-text.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_27 = [
|
||||
("27-simli-layer.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("27-simli-layer.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_40 = [
|
||||
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("40-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_43 = [
|
||||
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("43a-heygen-video-service.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_44 = [
|
||||
("44-voicemail-detection.py", PROMPT_VOICEMAIL, EVAL_VOICEMAIL, USER_SPEAKS_FIRST),
|
||||
("44-voicemail-detection.py", PROMPT_CONVERSATION, EVAL_CONVERSATION, USER_SPEAKS_FIRST),
|
||||
("44-voicemail-detection.py", EVAL_VOICEMAIL),
|
||||
("44-voicemail-detection.py", EVAL_CONVERSATION),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
@@ -244,9 +239,9 @@ async def main(args: argparse.Namespace):
|
||||
|
||||
# Parse test config: (test, prompt, eval, user_speaks_first)
|
||||
for test_config in TESTS:
|
||||
test, prompt, eval, user_speaks_first = test_config
|
||||
test, eval_config = test_config
|
||||
|
||||
await runner.run_eval(test, prompt, eval, user_speaks_first)
|
||||
await runner.run_eval(test, eval_config)
|
||||
|
||||
runner.print_results()
|
||||
|
||||
|
||||
@@ -22,9 +22,12 @@ class AdapterType(Enum):
|
||||
|
||||
Parameters:
|
||||
GEMINI: Google Gemini adapter - currently the only service supporting custom tools.
|
||||
SHIM: Backward compatibility shim for creating ToolsSchemas from lists of tools in
|
||||
any format, used by LLMContext.from_openai_context.
|
||||
"""
|
||||
|
||||
GEMINI = "gemini" # that is the only service where we are able to add custom tools for now
|
||||
SHIM = "shim" # for use as backward compatibility shim for creating ToolsSchemas from list of tools in any format
|
||||
|
||||
|
||||
class ToolsSchema:
|
||||
|
||||
@@ -110,7 +110,7 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
system = NOT_GIVEN
|
||||
messages = []
|
||||
|
||||
# first, map messages using self._from_universal_context_message(m)
|
||||
# First, map messages using self._from_universal_context_message(m)
|
||||
try:
|
||||
messages = [self._from_universal_context_message(m) for m in universal_context_messages]
|
||||
except Exception as e:
|
||||
@@ -245,13 +245,25 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
item["text"] = "(empty)"
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": item["image_url"]["url"].split(",")[1],
|
||||
}
|
||||
del item["image_url"]
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": item["image_url"]["url"].split(",")[1],
|
||||
}
|
||||
del item["image_url"]
|
||||
elif item["image_url"]["url"].startswith("http"):
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "url",
|
||||
"url": item["image_url"]["url"],
|
||||
}
|
||||
del item["image_url"]
|
||||
else:
|
||||
url = item["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
|
||||
# In the case where there's a single image in the list (like what
|
||||
# would result from a UserImageRawFrame), ensure that the image
|
||||
# comes before text, as recommended by Anthropic docs
|
||||
|
||||
@@ -16,7 +16,7 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||
|
||||
|
||||
@@ -210,4 +210,18 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
List of dictionaries in AWS Nova Sonic function format.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_aws_nova_sonic_function_format(func) for func in functions_schema]
|
||||
standard_tools = [
|
||||
self._to_aws_nova_sonic_function_format(func) for func in functions_schema
|
||||
]
|
||||
|
||||
# For backward compatibility, AWS Nova Sonic can still be used with
|
||||
# tools in dict format, even though it always uses `LLMContext` under
|
||||
# the hood (via `LLMContext.from_openai_context()`).
|
||||
# To support this behavior, we use "shimmed" custom tools here.
|
||||
# (We maintain this backward compatibility because users aren't
|
||||
# *knowingly* opting into the new `LLMContext`.)
|
||||
shimmed_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
shimmed_tools = tools_schema.custom_tools.get(AdapterType.SHIM, [])
|
||||
|
||||
return standard_tools + shimmed_tools
|
||||
|
||||
@@ -107,7 +107,7 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
system = None
|
||||
messages = []
|
||||
|
||||
# first, map messages using self._from_universal_context_message(m)
|
||||
# First, map messages using self._from_universal_context_message(m)
|
||||
try:
|
||||
messages = [self._from_universal_context_message(m) for m in universal_context_messages]
|
||||
except Exception as e:
|
||||
@@ -256,15 +256,22 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
new_content.append({"text": text_content})
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
new_item = {
|
||||
"image": {
|
||||
"format": "jpeg",
|
||||
"source": {
|
||||
"bytes": base64.b64decode(item["image_url"]["url"].split(",")[1])
|
||||
},
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
new_item = {
|
||||
"image": {
|
||||
"format": "jpeg",
|
||||
"source": {
|
||||
"bytes": base64.b64decode(
|
||||
item["image_url"]["url"].split(",")[1]
|
||||
)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
new_content.append(new_item)
|
||||
new_content.append(new_item)
|
||||
else:
|
||||
url = item["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
|
||||
# In the case where there's a single image in the list (like what
|
||||
# would result from a UserImageRawFrame), ensure that the image
|
||||
# comes before text
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
from openai import NotGiven
|
||||
@@ -24,13 +24,7 @@ from pipecat.processors.aggregators.llm_context import (
|
||||
)
|
||||
|
||||
try:
|
||||
from google.genai.types import (
|
||||
Blob,
|
||||
Content,
|
||||
FunctionCall,
|
||||
FunctionResponse,
|
||||
Part,
|
||||
)
|
||||
from google.genai.types import Blob, Content, FileData, FunctionCall, FunctionResponse, Part
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
@@ -133,6 +127,28 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
messages: List[Content]
|
||||
system_instruction: Optional[str] = None
|
||||
|
||||
@dataclass
|
||||
class MessageConversionResult:
|
||||
"""Result of converting a single universal context message to Google format.
|
||||
|
||||
Either content (a Google Content object) or a system instruction string
|
||||
is guaranteed to be set.
|
||||
|
||||
Also returns a tool call ID to name mapping for any tool calls
|
||||
discovered in the message.
|
||||
"""
|
||||
|
||||
content: Optional[Content] = None
|
||||
system_instruction: Optional[str] = None
|
||||
tool_call_id_to_name_mapping: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
@dataclass
|
||||
class MessageConversionParams:
|
||||
"""Parameters for converting a single universal context message to Google format."""
|
||||
|
||||
already_have_system_instruction: bool
|
||||
tool_call_id_to_name_mapping: Dict[str, str]
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self, universal_context_messages: List[LLMContextMessage]
|
||||
) -> ConvertedMessages:
|
||||
@@ -156,24 +172,26 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
"""
|
||||
system_instruction = None
|
||||
messages = []
|
||||
tool_call_id_to_name_mapping = {}
|
||||
|
||||
# Process each message, preserving Google-formatted messages and converting others
|
||||
for message in universal_context_messages:
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
# Assume that LLMSpecificMessage wraps a message in Google format
|
||||
messages.append(message.message)
|
||||
continue
|
||||
|
||||
# Convert standard format to Google format
|
||||
converted = self._from_standard_message(
|
||||
message, already_have_system_instruction=bool(system_instruction)
|
||||
result = self._from_universal_context_message(
|
||||
message,
|
||||
params=self.MessageConversionParams(
|
||||
already_have_system_instruction=bool(system_instruction),
|
||||
tool_call_id_to_name_mapping=tool_call_id_to_name_mapping,
|
||||
),
|
||||
)
|
||||
if isinstance(converted, Content):
|
||||
# Regular (non-system) message
|
||||
messages.append(converted)
|
||||
else:
|
||||
# System instruction
|
||||
system_instruction = converted
|
||||
# Each result is either a Content or a system instruction
|
||||
if result.content:
|
||||
messages.append(result.content)
|
||||
elif result.system_instruction:
|
||||
system_instruction = result.system_instruction
|
||||
|
||||
# Merge tool call ID to name mapping
|
||||
if result.tool_call_id_to_name_mapping:
|
||||
tool_call_id_to_name_mapping.update(result.tool_call_id_to_name_mapping)
|
||||
|
||||
# Check if we only have function-related messages (no regular text)
|
||||
has_regular_messages = any(
|
||||
@@ -193,9 +211,16 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
|
||||
return self.ConvertedMessages(messages=messages, system_instruction=system_instruction)
|
||||
|
||||
def _from_universal_context_message(
|
||||
self, message: LLMContextMessage, *, params: MessageConversionParams
|
||||
) -> MessageConversionResult:
|
||||
if isinstance(message, LLMSpecificMessage):
|
||||
return self.MessageConversionResult(content=message.message)
|
||||
return self._from_standard_message(message, params=params)
|
||||
|
||||
def _from_standard_message(
|
||||
self, message: LLMStandardMessage, already_have_system_instruction: bool
|
||||
) -> Content | str:
|
||||
self, message: LLMStandardMessage, *, params: MessageConversionParams
|
||||
) -> MessageConversionResult:
|
||||
"""Convert standard universal context message to Google Content object.
|
||||
|
||||
Handles conversion of text, images, and function calls to Google's
|
||||
@@ -205,10 +230,11 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
Args:
|
||||
message: Message in standard universal context format.
|
||||
already_have_system_instruction: Whether we already have a system instruction
|
||||
params: Parameters for conversion.
|
||||
|
||||
Returns:
|
||||
Content object with role and parts, or a plain string for system
|
||||
messages.
|
||||
MessageConversionResult containing either a Content object or a
|
||||
system instruction string.
|
||||
|
||||
Examples:
|
||||
Standard text message::
|
||||
@@ -242,38 +268,49 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
Converts to Google Content with::
|
||||
|
||||
Content(
|
||||
role="model",
|
||||
role="user",
|
||||
parts=[Part(function_call=FunctionCall(name="search", args={"query": "test"}))]
|
||||
)
|
||||
"""
|
||||
role = message["role"]
|
||||
content = message.get("content", [])
|
||||
|
||||
if role == "system":
|
||||
if already_have_system_instruction:
|
||||
if params.already_have_system_instruction:
|
||||
role = "user" # Convert system message to user role if we already have a system instruction
|
||||
else:
|
||||
# System instructions are returned as plain text
|
||||
system_instruction: str = None
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
# If content is a list, we assume it's a list of text parts, per the standard
|
||||
return " ".join(part["text"] for part in content if part.get("type") == "text")
|
||||
system_instruction = " ".join(
|
||||
part["text"] for part in content if part.get("type") == "text"
|
||||
)
|
||||
if system_instruction:
|
||||
return self.MessageConversionResult(system_instruction=system_instruction)
|
||||
elif role == "assistant":
|
||||
role = "model"
|
||||
|
||||
parts = []
|
||||
tool_call_id_to_name_mapping = {}
|
||||
|
||||
if message.get("tool_calls"):
|
||||
for tc in message["tool_calls"]:
|
||||
id = tc["id"]
|
||||
name = tc["function"]["name"]
|
||||
tool_call_id_to_name_mapping[id] = name
|
||||
parts.append(
|
||||
Part(
|
||||
function_call=FunctionCall(
|
||||
name=tc["function"]["name"],
|
||||
id=id,
|
||||
name=name,
|
||||
args=json.loads(tc["function"]["arguments"]),
|
||||
)
|
||||
)
|
||||
)
|
||||
elif role == "tool":
|
||||
role = "model"
|
||||
role = "user"
|
||||
try:
|
||||
response = json.loads(message["content"])
|
||||
if isinstance(response, dict):
|
||||
@@ -284,10 +321,18 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
# Response might not be JSON-deserializable.
|
||||
# This occurs with a UserImageFrame, for example, where we get a plain "COMPLETED" string.
|
||||
response_dict = {"value": message["content"]}
|
||||
|
||||
# Get function name from mapping using tool_call_id, or fallback
|
||||
tool_call_id = message.get("tool_call_id")
|
||||
function_name = "tool_call_result" # Default fallback
|
||||
if tool_call_id and tool_call_id in params.tool_call_id_to_name_mapping:
|
||||
function_name = params.tool_call_id_to_name_mapping[tool_call_id]
|
||||
|
||||
parts.append(
|
||||
Part(
|
||||
function_response=FunctionResponse(
|
||||
name="tool_call_result", # seems to work to hard-code the same name every time
|
||||
id=tool_call_id,
|
||||
name=function_name,
|
||||
response=response_dict,
|
||||
)
|
||||
)
|
||||
@@ -298,7 +343,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
for c in content:
|
||||
if c["type"] == "text":
|
||||
parts.append(Part(text=c["text"]))
|
||||
elif c["type"] == "image_url":
|
||||
elif c["type"] == "image_url" and c["image_url"]["url"].startswith("data:"):
|
||||
parts.append(
|
||||
Part(
|
||||
inline_data=Blob(
|
||||
@@ -307,9 +352,25 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
)
|
||||
)
|
||||
)
|
||||
elif c["type"] == "image_url":
|
||||
url = c["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
elif c["type"] == "input_audio":
|
||||
input_audio = c["input_audio"]
|
||||
audio_bytes = base64.b64decode(input_audio["data"])
|
||||
parts.append(Part(inline_data=Blob(mime_type="audio/wav", data=audio_bytes)))
|
||||
elif c["type"] == "file_data":
|
||||
file_data = c["file_data"]
|
||||
parts.append(
|
||||
Part(
|
||||
file_data=FileData(
|
||||
mime_type=file_data.get("mime_type"),
|
||||
file_uri=file_data.get("file_uri"),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
return Content(role=role, parts=parts)
|
||||
return self.MessageConversionResult(
|
||||
content=Content(role=role, parts=parts),
|
||||
tool_call_id_to_name_mapping=tool_call_id_to_name_mapping,
|
||||
)
|
||||
|
||||
@@ -6,12 +6,18 @@
|
||||
|
||||
"""OpenAI Realtime LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||
from pipecat.services.openai.realtime import events
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMInvocationParams(TypedDict):
|
||||
@@ -20,7 +26,9 @@ class OpenAIRealtimeLLMInvocationParams(TypedDict):
|
||||
This is a placeholder until support for universal LLMContext machinery is added for OpenAI Realtime.
|
||||
"""
|
||||
|
||||
pass
|
||||
system_instruction: Optional[str]
|
||||
messages: List[events.ConversationItem]
|
||||
tools: List[Dict[str, Any]]
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
@@ -33,7 +41,7 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
@property
|
||||
def id_for_llm_specific_messages(self) -> str:
|
||||
"""Get the identifier used in LLMSpecificMessage instances for OpenAI Realtime."""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
return "openai-realtime"
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> OpenAIRealtimeLLMInvocationParams:
|
||||
"""Get OpenAI Realtime-specific LLM invocation parameters from a universal LLM context.
|
||||
@@ -46,7 +54,13 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
Dictionary of parameters for invoking OpenAI Realtime's API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
messages = self._from_universal_context_messages(self.get_messages(context))
|
||||
return {
|
||||
"system_instruction": messages.system_instruction,
|
||||
"messages": messages.messages,
|
||||
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools) or [],
|
||||
}
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about OpenAI Realtime.
|
||||
@@ -61,7 +75,124 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI Realtime.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
# NOTE: this is the same as in OpenAIAdapter, as that's what it was
|
||||
# prior to a refactor. Worth noting that for OpenAI Realtime
|
||||
# specifically, not everything handled here is necessarily supported
|
||||
# (or supported yet).
|
||||
msgs = []
|
||||
for message in self.get_messages(context):
|
||||
msg = copy.deepcopy(message)
|
||||
if "content" in msg:
|
||||
if isinstance(msg["content"], list):
|
||||
for item in msg["content"]:
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:image/"):
|
||||
item["image_url"]["url"] = "data:image/..."
|
||||
if item["type"] == "input_audio":
|
||||
item["input_audio"]["data"] = "..."
|
||||
if "mime_type" in msg and msg["mime_type"].startswith("image/"):
|
||||
msg["data"] = "..."
|
||||
msgs.append(msg)
|
||||
return msgs
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
"""Container for OpenAI-formatted messages converted from universal context."""
|
||||
|
||||
messages: List[events.ConversationItem]
|
||||
system_instruction: Optional[str] = None
|
||||
|
||||
def _from_universal_context_messages(
|
||||
self, universal_context_messages: List[LLMContextMessage]
|
||||
) -> ConvertedMessages:
|
||||
# We can't load a long conversation history into the openai realtime api yet. (The API/model
|
||||
# forgets that it can do audio, if you do a series of `conversation.item.create` calls.) So
|
||||
# our general strategy until this is fixed is just to put everything into a first "user"
|
||||
# message as a single input.
|
||||
|
||||
if not universal_context_messages:
|
||||
return self.ConvertedMessages(messages=[])
|
||||
|
||||
messages = copy.deepcopy(universal_context_messages)
|
||||
system_instruction = None
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into session
|
||||
# "instructions"
|
||||
if messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
system_instruction = content[0].get("text")
|
||||
if not messages:
|
||||
return self.ConvertedMessages(messages=[], system_instruction=system_instruction)
|
||||
|
||||
# If we have just a single "user" item, we can just send it normally
|
||||
if len(messages) == 1 and messages[0].get("role") == "user":
|
||||
return self.ConvertedMessages(
|
||||
messages=[self._from_universal_context_message(messages[0])],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
# Otherwise, let's pack everything into a single "user" message with a bit of
|
||||
# explanation for the LLM
|
||||
intro_text = """
|
||||
This is a previously saved conversation. Please treat this conversation history as a
|
||||
starting point for the current conversation."""
|
||||
|
||||
trailing_text = """
|
||||
This is the end of the previously saved conversation. Please continue the conversation
|
||||
from here. If the last message is a user instruction or question, act on that instruction
|
||||
or answer the question. If the last message is an assistant response, simple say that you
|
||||
are ready to continue the conversation."""
|
||||
|
||||
return self.ConvertedMessages(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"type": "message",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "\n\n".join(
|
||||
[intro_text, json.dumps(messages, indent=2), trailing_text]
|
||||
),
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
def _from_universal_context_message(
|
||||
self, message: LLMContextMessage
|
||||
) -> events.ConversationItem:
|
||||
if message.get("role") == "user":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
return events.ConversationItem(
|
||||
role="user",
|
||||
type="message",
|
||||
content=[events.ItemContent(type="input_text", text=content)],
|
||||
)
|
||||
if message.get("role") == "assistant" and message.get("tool_calls"):
|
||||
tc = message.get("tool_calls")[0]
|
||||
return events.ConversationItem(
|
||||
type="function_call",
|
||||
call_id=tc["id"],
|
||||
name=tc["function"]["name"],
|
||||
arguments=tc["function"]["arguments"],
|
||||
)
|
||||
logger.error(f"Unhandled message type in _from_universal_context_message: {message}")
|
||||
|
||||
@staticmethod
|
||||
def _to_openai_realtime_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
@@ -94,4 +225,18 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
List of function definitions in OpenAI Realtime format.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_openai_realtime_function_format(func) for func in functions_schema]
|
||||
standard_tools = [
|
||||
self._to_openai_realtime_function_format(func) for func in functions_schema
|
||||
]
|
||||
|
||||
# For backward compatibility, OpenAI Realtime can still be used with
|
||||
# tools in dict format, even though it always uses `LLMContext` under
|
||||
# the hood (via `LLMContext.from_openai_context()`).
|
||||
# To support this behavior, we use "shimmed" custom tools here.
|
||||
# (We maintain this backward compatibility because users aren't
|
||||
# *knowingly* opting into the new `LLMContext`.)
|
||||
shimmed_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
shimmed_tools = tools_schema.custom_tools.get(AdapterType.SHIM, [])
|
||||
|
||||
return standard_tools + shimmed_tools
|
||||
|
||||
@@ -773,9 +773,15 @@ class CancelFrame(SystemFrame):
|
||||
|
||||
Indicates that a pipeline needs to stop right away without
|
||||
processing remaining queued frames.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1201,26 +1207,23 @@ class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
class UserImageRequestFrame(SystemFrame):
|
||||
"""Frame requesting an image from a specific user.
|
||||
|
||||
A frame to request an image from the given user. The frame might be
|
||||
generated by a function call in which case the corresponding fields will be
|
||||
properly set.
|
||||
A frame to request an image from the given user. The request might come with
|
||||
a text that can be later used to describe the requested image.
|
||||
|
||||
Parameters:
|
||||
user_id: Identifier of the user to request image from.
|
||||
context: Optional context for the image request.
|
||||
function_name: Name of function that generated this request (if any).
|
||||
tool_call_id: Tool call ID if generated by function call.
|
||||
text: An optional text associated to the image request.
|
||||
append_to_context: Whether the requested image should be appended to the LLM context.
|
||||
video_source: Specific video source to capture from.
|
||||
"""
|
||||
|
||||
user_id: str
|
||||
context: Optional[Any] = None
|
||||
function_name: Optional[str] = None
|
||||
tool_call_id: Optional[str] = None
|
||||
text: Optional[str] = None
|
||||
append_to_context: Optional[bool] = None
|
||||
video_source: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(user: {self.user_id}, video_source: {self.video_source}, function: {self.function_name}, request: {self.tool_call_id})"
|
||||
return f"{self.name}(user: {self.user_id}, text: {self.text}, append_to_context: {self.append_to_context}, {self.video_source})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1294,15 +1297,17 @@ class UserImageRawFrame(InputImageRawFrame):
|
||||
|
||||
Parameters:
|
||||
user_id: Identifier of the user who provided this image.
|
||||
request: The original image request frame if this is a response.
|
||||
text: An optional text associated to this image.
|
||||
append_to_context: Whether the requested image should be appended to the LLM context.
|
||||
"""
|
||||
|
||||
user_id: str = ""
|
||||
request: Optional[UserImageRequestFrame] = None
|
||||
text: Optional[str] = None
|
||||
append_to_context: Optional[bool] = None
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, user: {self.user_id}, source: {self.transport_source}, size: {self.size}, format: {self.format}, request: {self.request})"
|
||||
return f"{self.name}(pts: {pts}, user: {self.user_id}, source: {self.transport_source}, size: {self.size}, format: {self.format}, text: {self.text}, append_to_context: {self.append_to_context})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1367,9 +1372,15 @@ class EndTaskFrame(TaskFrame):
|
||||
This is used to notify the pipeline task that the pipeline should be
|
||||
closed nicely (flushing all the queued frames) by pushing an EndFrame
|
||||
downstream. This frame should be pushed upstream.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1379,9 +1390,15 @@ class CancelTaskFrame(TaskFrame):
|
||||
This is used to notify the pipeline task that the pipeline should be
|
||||
stopped immediately by pushing a CancelFrame downstream. This frame
|
||||
should be pushed upstream.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1452,9 +1469,15 @@ class EndFrame(ControlFrame):
|
||||
sending frames to its output channel(s) and close all its threads. Note,
|
||||
that this is a control frame, which means it will be received in the order it
|
||||
was sent.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -14,20 +14,41 @@ from pipecat.services.llm_service import LLMService
|
||||
|
||||
|
||||
class LLMSwitcher(ServiceSwitcher[StrategyType]):
|
||||
"""A pipeline that switches between different LLMs at runtime."""
|
||||
"""A pipeline that switches between different LLMs at runtime.
|
||||
|
||||
Example::
|
||||
|
||||
llm_switcher = LLMSwitcher(
|
||||
llms=[openai_llm, anthropic_llm],
|
||||
strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
"""
|
||||
|
||||
def __init__(self, llms: List[LLMService], strategy_type: Type[StrategyType]):
|
||||
"""Initialize the service switcher with a list of LLMs and a switching strategy."""
|
||||
"""Initialize the service switcher with a list of LLMs and a switching strategy.
|
||||
|
||||
Args:
|
||||
llms: List of LLM services to switch between.
|
||||
strategy_type: The strategy class to use for switching between LLMs.
|
||||
"""
|
||||
super().__init__(llms, strategy_type)
|
||||
|
||||
@property
|
||||
def llms(self) -> List[LLMService]:
|
||||
"""Get the list of LLMs managed by this switcher."""
|
||||
"""Get the list of LLMs managed by this switcher.
|
||||
|
||||
Returns:
|
||||
List of LLM services managed by this switcher.
|
||||
"""
|
||||
return self.services
|
||||
|
||||
@property
|
||||
def active_llm(self) -> Optional[LLMService]:
|
||||
"""Get the currently active LLM, if any."""
|
||||
"""Get the currently active LLM.
|
||||
|
||||
Returns:
|
||||
The currently active LLM service, or None if no LLM is active.
|
||||
"""
|
||||
return self.strategy.active_service
|
||||
|
||||
async def run_inference(self, context: LLMContext) -> Optional[str]:
|
||||
|
||||
@@ -21,10 +21,22 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ServiceSwitcherStrategy:
|
||||
"""Base class for service switching strategies."""
|
||||
"""Base class for service switching strategies.
|
||||
|
||||
Note:
|
||||
Strategy classes are instantiated internally by ServiceSwitcher.
|
||||
Developers should pass the strategy class (not an instance) to ServiceSwitcher.
|
||||
"""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor]):
|
||||
"""Initialize the service switcher strategy with a list of services."""
|
||||
"""Initialize the service switcher strategy with a list of services.
|
||||
|
||||
Note:
|
||||
This is called internally by ServiceSwitcher. Do not instantiate directly.
|
||||
|
||||
Args:
|
||||
services: List of frame processors to switch between.
|
||||
"""
|
||||
self.services = services
|
||||
self.active_service: Optional[FrameProcessor] = None
|
||||
|
||||
@@ -46,10 +58,24 @@ class ServiceSwitcherStrategyManual(ServiceSwitcherStrategy):
|
||||
|
||||
This strategy allows the user to manually select which service is active.
|
||||
The initial active service is the first one in the list.
|
||||
|
||||
Example::
|
||||
|
||||
stt_switcher = ServiceSwitcher(
|
||||
services=[stt_1, stt_2],
|
||||
strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
"""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor]):
|
||||
"""Initialize the manual service switcher strategy with a list of services."""
|
||||
"""Initialize the manual service switcher strategy with a list of services.
|
||||
|
||||
Note:
|
||||
This is called internally by ServiceSwitcher. Do not instantiate directly.
|
||||
|
||||
Args:
|
||||
services: List of frame processors to switch between.
|
||||
"""
|
||||
super().__init__(services)
|
||||
self.active_service = services[0] if services else None
|
||||
|
||||
@@ -85,7 +111,12 @@ class ServiceSwitcher(ParallelPipeline, Generic[StrategyType]):
|
||||
"""A pipeline that switches between different services at runtime."""
|
||||
|
||||
def __init__(self, services: List[FrameProcessor], strategy_type: Type[StrategyType]):
|
||||
"""Initialize the service switcher with a list of services and a switching strategy."""
|
||||
"""Initialize the service switcher with a list of services and a switching strategy.
|
||||
|
||||
Args:
|
||||
services: List of frame processors to switch between.
|
||||
strategy_type: The strategy class to use for switching between services.
|
||||
"""
|
||||
strategy = strategy_type(services)
|
||||
super().__init__(*self._make_pipeline_definitions(services, strategy))
|
||||
self.services = services
|
||||
@@ -100,14 +131,20 @@ class ServiceSwitcher(ParallelPipeline, Generic[StrategyType]):
|
||||
active_service: FrameProcessor,
|
||||
direction: FrameDirection,
|
||||
):
|
||||
"""Initialize the service switcher filter with a strategy and direction."""
|
||||
"""Initialize the service switcher filter with a strategy and direction.
|
||||
|
||||
Args:
|
||||
wrapped_service: The service that this filter wraps.
|
||||
active_service: The currently active service.
|
||||
direction: The direction of frame flow to filter.
|
||||
"""
|
||||
self._wrapped_service = wrapped_service
|
||||
self._active_service = active_service
|
||||
|
||||
async def filter(_: Frame) -> bool:
|
||||
return self._wrapped_service == self._active_service
|
||||
|
||||
super().__init__(filter, direction)
|
||||
self._wrapped_service = wrapped_service
|
||||
self._active_service = active_service
|
||||
super().__init__(filter, direction, filter_system_frames=True)
|
||||
|
||||
async def process_frame(self, frame, direction):
|
||||
"""Process a frame through the filter, handling special internal filter-updating frames."""
|
||||
|
||||
@@ -12,7 +12,9 @@ including heartbeats, idle detection, and observer integration.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import importlib.util
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
from loguru import logger
|
||||
@@ -39,7 +41,7 @@ from pipecat.frames.frames import (
|
||||
UserSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import ProcessingMetricsData, TTFBMetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
|
||||
from pipecat.pipeline.base_task import BasePipelineTask, PipelineTaskParams
|
||||
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
|
||||
@@ -57,6 +59,43 @@ IDLE_TIMEOUT_SECS = 300
|
||||
CANCEL_TIMEOUT_SECS = 20.0
|
||||
|
||||
|
||||
class IdleFrameObserver(BaseObserver):
|
||||
"""Idle timeout observer.
|
||||
|
||||
This observer waits for specific frames being generated in the pipeline. If
|
||||
the frames are generated the given asyncio event is set. If the event is not
|
||||
set it means the pipeline is probably idle.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *, idle_event: asyncio.Event, idle_timeout_frames: Tuple[Type[Frame], ...]):
|
||||
"""Initialize the observer.
|
||||
|
||||
Args:
|
||||
idle_event: The event to set if the idle timeout frames are being pushed.
|
||||
idle_timeout_frames: A tuple with the frames that should set the event when received
|
||||
"""
|
||||
super().__init__()
|
||||
self._idle_event = idle_event
|
||||
self._idle_timeout_frames = idle_timeout_frames
|
||||
self._processed_frames = set()
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Callback executed when a frame is pushed in the pipeline.
|
||||
|
||||
Args:
|
||||
data: The frame push event data.
|
||||
"""
|
||||
# Skip already processed frames
|
||||
if data.frame.id in self._processed_frames:
|
||||
return
|
||||
|
||||
self._processed_frames.add(data.frame.id)
|
||||
|
||||
if isinstance(data.frame, StartFrame) or isinstance(data.frame, self._idle_timeout_frames):
|
||||
self._idle_event.set()
|
||||
|
||||
|
||||
class PipelineParams(BaseModel):
|
||||
"""Configuration parameters for pipeline execution.
|
||||
|
||||
@@ -215,7 +254,6 @@ class PipelineTask(BasePipelineTask):
|
||||
self._conversation_id = conversation_id
|
||||
self._enable_tracing = enable_tracing and is_tracing_available()
|
||||
self._enable_turn_tracking = enable_turn_tracking
|
||||
self._idle_timeout_frames = idle_timeout_frames
|
||||
self._idle_timeout_secs = idle_timeout_secs
|
||||
if self._params.observers:
|
||||
import warnings
|
||||
@@ -250,16 +288,24 @@ class PipelineTask(BasePipelineTask):
|
||||
# This queue is the queue used to push frames to the pipeline.
|
||||
self._push_queue = asyncio.Queue()
|
||||
self._process_push_task: Optional[asyncio.Task] = None
|
||||
|
||||
# This is the heartbeat queue. When a heartbeat frame is received in the
|
||||
# down queue we add it to the heartbeat queue for processing.
|
||||
self._heartbeat_queue = asyncio.Queue()
|
||||
self._heartbeat_push_task: Optional[asyncio.Task] = None
|
||||
self._heartbeat_monitor_task: Optional[asyncio.Task] = None
|
||||
# This is the idle queue. When frames are received downstream they are
|
||||
# put in the queue. If no frame is received the pipeline is considered
|
||||
# idle.
|
||||
self._idle_queue = asyncio.Queue()
|
||||
|
||||
# This is the idle event. When selected frames are pushed from any
|
||||
# processor we consider the pipeline is not idle. We use an observer
|
||||
# which will be listening any part of the pipeline.
|
||||
self._idle_event = asyncio.Event()
|
||||
self._idle_monitor_task: Optional[asyncio.Task] = None
|
||||
if self._idle_timeout_secs:
|
||||
idle_frame_observer = IdleFrameObserver(
|
||||
idle_event=self._idle_event,
|
||||
idle_timeout_frames=idle_timeout_frames,
|
||||
)
|
||||
observers.append(idle_frame_observer)
|
||||
|
||||
# This event is used to indicate the StartFrame has been received at the
|
||||
# end of the pipeline.
|
||||
@@ -403,10 +449,14 @@ class PipelineTask(BasePipelineTask):
|
||||
logger.debug(f"Task {self} scheduled to stop when done")
|
||||
await self.queue_frame(EndFrame())
|
||||
|
||||
async def cancel(self):
|
||||
"""Request the running pipeline to cancel."""
|
||||
async def cancel(self, *, reason: Optional[str] = None):
|
||||
"""Request the running pipeline to cancel.
|
||||
|
||||
Args:
|
||||
reason: Optional reason to indicate why the pipeline is being cancelled.
|
||||
"""
|
||||
if not self._finished:
|
||||
await self._cancel()
|
||||
await self._cancel(reason=reason)
|
||||
|
||||
async def run(self, params: PipelineTaskParams):
|
||||
"""Start and manage the pipeline execution until completion or cancellation.
|
||||
@@ -470,12 +520,16 @@ class PipelineTask(BasePipelineTask):
|
||||
for frame in frames:
|
||||
await self.queue_frame(frame)
|
||||
|
||||
async def _cancel(self):
|
||||
"""Internal cancellation logic for the pipeline task."""
|
||||
async def _cancel(self, *, reason: Optional[str] = None):
|
||||
"""Internal cancellation logic for the pipeline task.
|
||||
|
||||
Args:
|
||||
reason: Optional reason to indicate why the pipeline is being cancelled.
|
||||
"""
|
||||
if not self._cancelled:
|
||||
logger.debug(f"Cancelling pipeline task {self}")
|
||||
self._cancelled = True
|
||||
await self.queue_frame(CancelFrame())
|
||||
await self.queue_frame(CancelFrame(reason=reason))
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create and start all pipeline processing tasks."""
|
||||
@@ -530,7 +584,7 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _maybe_cancel_idle_task(self):
|
||||
"""Cancel idle monitoring task if it is running."""
|
||||
if self._idle_timeout_secs and self._idle_monitor_task:
|
||||
if self._idle_monitor_task:
|
||||
await self._task_manager.cancel_task(self._idle_monitor_task)
|
||||
self._idle_monitor_task = None
|
||||
|
||||
@@ -590,6 +644,9 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _setup(self, params: PipelineTaskParams):
|
||||
"""Set up the pipeline task and all processors."""
|
||||
# Load additional observers.
|
||||
await self._load_observer_files()
|
||||
|
||||
mgr_params = TaskManagerParams(loop=params.loop)
|
||||
self._task_manager.setup(mgr_params)
|
||||
|
||||
@@ -673,11 +730,11 @@ class PipelineTask(BasePipelineTask):
|
||||
if isinstance(frame, EndTaskFrame):
|
||||
# Tell the task we should end nicely.
|
||||
logger.debug(f"{self}: received end task frame {frame}")
|
||||
await self.queue_frame(EndFrame())
|
||||
await self.queue_frame(EndFrame(reason=frame.reason))
|
||||
elif isinstance(frame, CancelTaskFrame):
|
||||
# Tell the task we should end right away.
|
||||
logger.debug(f"{self}: received cancel task frame {frame}")
|
||||
await self.queue_frame(CancelFrame())
|
||||
await self.queue_frame(CancelFrame(reason=frame.reason))
|
||||
elif isinstance(frame, StopTaskFrame):
|
||||
# Tell the task we should stop nicely.
|
||||
logger.debug(f"{self}: received stop task frame {frame}")
|
||||
@@ -706,10 +763,6 @@ class PipelineTask(BasePipelineTask):
|
||||
processors have handled the EndFrame and therefore we can exit the task
|
||||
cleanly.
|
||||
"""
|
||||
# Queue received frame to the idle queue so we can monitor idle
|
||||
# pipelines.
|
||||
await self._idle_queue.put(frame)
|
||||
|
||||
if isinstance(frame, self._reached_downstream_types):
|
||||
await self._call_event_handler("on_frame_reached_downstream", frame)
|
||||
|
||||
@@ -772,33 +825,10 @@ class PipelineTask(BasePipelineTask):
|
||||
Note: Heartbeats are excluded from idle detection.
|
||||
"""
|
||||
running = True
|
||||
last_frame_time = 0
|
||||
|
||||
while running:
|
||||
try:
|
||||
frame = await asyncio.wait_for(
|
||||
self._idle_queue.get(), timeout=self._idle_timeout_secs
|
||||
)
|
||||
|
||||
if isinstance(frame, StartFrame) or isinstance(frame, self._idle_timeout_frames):
|
||||
# If we find a StartFrame or one of the frames that prevents a
|
||||
# time out we update the time.
|
||||
last_frame_time = time.time()
|
||||
else:
|
||||
# If we find any other frame we check if the pipeline is
|
||||
# idle by checking the last time we received one of the
|
||||
# valid frames.
|
||||
diff_time = time.time() - last_frame_time
|
||||
if diff_time >= self._idle_timeout_secs:
|
||||
running = await self._idle_timeout_detected()
|
||||
# Reset `last_frame_time` so we don't trigger another
|
||||
# immediate idle timeout if we are not cancelling. For
|
||||
# example, we might want to force the bot to say goodbye
|
||||
# and then clean nicely with an `EndFrame`.
|
||||
last_frame_time = time.time()
|
||||
|
||||
self._idle_queue.task_done()
|
||||
|
||||
await asyncio.wait_for(self._idle_event.wait(), timeout=self._idle_timeout_secs)
|
||||
self._idle_event.clear()
|
||||
except asyncio.TimeoutError:
|
||||
running = await self._idle_timeout_detected()
|
||||
|
||||
@@ -810,7 +840,7 @@ class PipelineTask(BasePipelineTask):
|
||||
"""
|
||||
# If we are cancelling, just exit the task.
|
||||
if self._cancelled:
|
||||
return True
|
||||
return False
|
||||
|
||||
logger.warning("Idle timeout detected.")
|
||||
await self._call_event_handler("on_idle_timeout")
|
||||
@@ -820,6 +850,27 @@ class PipelineTask(BasePipelineTask):
|
||||
return False
|
||||
return True
|
||||
|
||||
async def _load_observer_files(self):
|
||||
observer_files = os.environ.get("PIPECAT_OBSERVER_FILES", "").split(":")
|
||||
for f in observer_files:
|
||||
try:
|
||||
path = Path(f).resolve()
|
||||
module_name = path.stem
|
||||
spec = importlib.util.spec_from_file_location(module_name, str(path))
|
||||
if spec:
|
||||
logger.debug(f"{self} loading observers from {path}")
|
||||
|
||||
# Load module.
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
|
||||
# Create observers.
|
||||
observers = await module.create_observers(self)
|
||||
for observer in observers:
|
||||
self.add_observer(observer)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error loading external observers from {f}: {e}")
|
||||
|
||||
def _print_dangling_tasks(self):
|
||||
"""Log any dangling tasks that haven't been properly cleaned up."""
|
||||
tasks = [t.get_name() for t in self._task_manager.current_tasks()]
|
||||
|
||||
@@ -129,7 +129,7 @@ class TaskObserver(BaseObserver):
|
||||
for proxy in self._proxies:
|
||||
await proxy.cleanup()
|
||||
|
||||
async def on_process_frame(self, data: FramePushed):
|
||||
async def on_process_frame(self, data: FrameProcessed):
|
||||
"""Queue frame data for all managed observers.
|
||||
|
||||
Args:
|
||||
@@ -189,7 +189,7 @@ class TaskObserver(BaseObserver):
|
||||
if isinstance(data, FramePushed):
|
||||
if on_push_frame_deprecated:
|
||||
await observer.on_push_frame(
|
||||
data.src, data.dst, data.frame, data.direction, data.timestamp
|
||||
data.source, data.destination, data.frame, data.direction, data.timestamp
|
||||
)
|
||||
else:
|
||||
await observer.on_push_frame(data)
|
||||
|
||||
@@ -15,8 +15,8 @@ service-specific adapter.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import copy
|
||||
import io
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
|
||||
|
||||
@@ -29,7 +29,7 @@ from openai.types.chat import (
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -83,9 +83,17 @@ class LLMContext:
|
||||
Returns:
|
||||
New LLMContext instance with converted messages and settings.
|
||||
"""
|
||||
# Convert tools to ToolsSchema if needed.
|
||||
# If the tools are already a ToolsSchema, this is a no-op.
|
||||
# Otherwise, we wrap them in a shim ToolsSchema.
|
||||
converted_tools = openai_context.tools
|
||||
if isinstance(converted_tools, list):
|
||||
converted_tools = ToolsSchema(
|
||||
standard_tools=[], custom_tools={AdapterType.SHIM: converted_tools}
|
||||
)
|
||||
return LLMContext(
|
||||
messages=openai_context.get_messages(),
|
||||
tools=openai_context.tools,
|
||||
tools=converted_tools,
|
||||
tool_choice=openai_context.tool_choice,
|
||||
)
|
||||
|
||||
@@ -106,6 +114,129 @@ class LLMContext:
|
||||
self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
|
||||
self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
|
||||
|
||||
@staticmethod
|
||||
def create_image_url_message(
|
||||
*,
|
||||
role: str = "user",
|
||||
url: str,
|
||||
text: Optional[str] = None,
|
||||
) -> LLMContextMessage:
|
||||
"""Create a context message containing an image URL.
|
||||
|
||||
Args:
|
||||
role: The role of this message (defaults to "user").
|
||||
url: The URL of the image.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
content = []
|
||||
if text:
|
||||
content.append({"type": "text", "text": text})
|
||||
|
||||
content.append({"type": "image_url", "image_url": {"url": url}})
|
||||
|
||||
return {"role": role, "content": content}
|
||||
|
||||
@staticmethod
|
||||
def create_image_message(
|
||||
*,
|
||||
role: str = "user",
|
||||
format: str,
|
||||
size: tuple[int, int],
|
||||
image: bytes,
|
||||
text: Optional[str] = None,
|
||||
) -> LLMContextMessage:
|
||||
"""Create a context message containing an image.
|
||||
|
||||
Args:
|
||||
role: The role of this message (defaults to "user").
|
||||
format: Image format (e.g., 'RGB', 'RGBA').
|
||||
size: Image dimensions as (width, height) tuple.
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
url = f"data:image/jpeg;base64,{encoded_image}"
|
||||
|
||||
return LLMContext.create_image_url_message(role=role, url=url, text=text)
|
||||
|
||||
@staticmethod
|
||||
def create_audio_message(
|
||||
*, role: str = "user", audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
) -> LLMContextMessage:
|
||||
"""Create a context message containing audio.
|
||||
|
||||
Args:
|
||||
role: The role of this message (defaults to "user").
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
|
||||
with io.BytesIO() as buffer:
|
||||
with wave.open(buffer, "wb") as wf:
|
||||
wf.setsampwidth(2)
|
||||
wf.setnchannels(num_channels)
|
||||
wf.setframerate(sample_rate)
|
||||
wf.writeframes(data)
|
||||
|
||||
encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
content.append(
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": encoded_audio, "format": "wav"},
|
||||
}
|
||||
)
|
||||
|
||||
return {"role": role, "content": content}
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
"""Get the current messages list.
|
||||
|
||||
NOTE: This is equivalent to calling `get_messages()` with no filter. If
|
||||
you want to filter out LLM-specific messages that don't pertain to your
|
||||
LLM, use `get_messages()` directly.
|
||||
|
||||
Returns:
|
||||
List of conversation messages.
|
||||
"""
|
||||
return self.get_messages()
|
||||
|
||||
def get_messages_for_persistent_storage(self) -> List[LLMContextMessage]:
|
||||
"""Get messages suitable for persistent storage.
|
||||
|
||||
NOTE: the only reason this method exists is because we're "silently"
|
||||
switching from OpenAILLMContext to LLMContext under the hood in some
|
||||
services and don't want to trip up users who may have been relying on
|
||||
this method, which is part of the public API of OpenAILLMContext but
|
||||
doesn't need to be for LLMContext.
|
||||
|
||||
.. deprecated::
|
||||
Use `get_messages()` instead.
|
||||
|
||||
Returns:
|
||||
List of conversation messages.
|
||||
"""
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"get_messages_for_persistent_storage() is deprecated, use get_messages() instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return self.get_messages()
|
||||
|
||||
def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
|
||||
"""Get the current messages list.
|
||||
|
||||
@@ -113,7 +244,8 @@ class LLMContext:
|
||||
llm_specific_filter: Optional filter to return LLM-specific
|
||||
messages for the given LLM, in addition to the standard
|
||||
messages. If messages end up being filtered, an error will be
|
||||
logged.
|
||||
logged; this is intended to catch accidental use of
|
||||
incompatible LLM-specific messages.
|
||||
|
||||
Returns:
|
||||
List of conversation messages.
|
||||
@@ -190,7 +322,7 @@ class LLMContext:
|
||||
self._tool_choice = tool_choice
|
||||
|
||||
def add_image_frame_message(
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None
|
||||
):
|
||||
"""Add a message containing an image frame.
|
||||
|
||||
@@ -200,17 +332,8 @@ class LLMContext:
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
content = []
|
||||
if text:
|
||||
content.append({"type": "text", "text": text})
|
||||
content.append(
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
message = LLMContext.create_image_message(format=format, size=size, image=image, text=text)
|
||||
self.add_message(message)
|
||||
|
||||
def add_audio_frames_message(
|
||||
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
@@ -221,66 +344,8 @@ class LLMContext:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
if not audio_frames:
|
||||
return
|
||||
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
data = bytes(
|
||||
self._create_wav_header(
|
||||
sample_rate,
|
||||
num_channels,
|
||||
16,
|
||||
len(data),
|
||||
)
|
||||
+ data
|
||||
)
|
||||
encoded_audio = base64.b64encode(data).decode("utf-8")
|
||||
content.append(
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": encoded_audio, "format": "wav"},
|
||||
}
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
|
||||
def _create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size):
|
||||
"""Create a WAV file header for audio data.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
num_channels: Number of audio channels.
|
||||
bits_per_sample: Bits per audio sample.
|
||||
data_size: Size of audio data in bytes.
|
||||
|
||||
Returns:
|
||||
WAV header as a bytearray.
|
||||
"""
|
||||
# RIFF chunk descriptor
|
||||
header = bytearray()
|
||||
header.extend(b"RIFF") # ChunkID
|
||||
header.extend((data_size + 36).to_bytes(4, "little")) # ChunkSize: total size - 8
|
||||
header.extend(b"WAVE") # Format
|
||||
# "fmt " sub-chunk
|
||||
header.extend(b"fmt ") # Subchunk1ID
|
||||
header.extend((16).to_bytes(4, "little")) # Subchunk1Size (16 for PCM)
|
||||
header.extend((1).to_bytes(2, "little")) # AudioFormat (1 for PCM)
|
||||
header.extend(num_channels.to_bytes(2, "little")) # NumChannels
|
||||
header.extend(sample_rate.to_bytes(4, "little")) # SampleRate
|
||||
# Calculate byte rate and block align
|
||||
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
|
||||
block_align = num_channels * (bits_per_sample // 8)
|
||||
header.extend(byte_rate.to_bytes(4, "little")) # ByteRate
|
||||
header.extend(block_align.to_bytes(2, "little")) # BlockAlign
|
||||
header.extend(bits_per_sample.to_bytes(2, "little")) # BitsPerSample
|
||||
# "data" sub-chunk
|
||||
header.extend(b"data") # Subchunk2ID
|
||||
header.extend(data_size.to_bytes(4, "little")) # Subchunk2Size
|
||||
return header
|
||||
message = LLMContext.create_audio_message(audio_frames=audio_frames, text=text)
|
||||
self.add_message(message)
|
||||
|
||||
@staticmethod
|
||||
def _normalize_and_validate_tools(tools: ToolsSchema | NotGiven) -> ToolsSchema | NotGiven:
|
||||
|
||||
@@ -89,7 +89,9 @@ class LLMAssistantAggregatorParams:
|
||||
|
||||
Parameters:
|
||||
expect_stripped_words: Whether to expect and handle stripped words
|
||||
in text frames by adding spaces between tokens.
|
||||
in text frames by adding spaces between tokens. This parameter is
|
||||
ignored when used with the newer LLMAssistantAggregator, which
|
||||
handles word spacing automatically.
|
||||
"""
|
||||
|
||||
expect_stripped_words: bool = True
|
||||
|
||||
@@ -13,6 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import warnings
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Literal, Optional, Set
|
||||
|
||||
@@ -65,6 +66,7 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -88,7 +90,7 @@ class LLMContextAggregator(FrameProcessor):
|
||||
self._context = context
|
||||
self._role = role
|
||||
|
||||
self._aggregation: str = ""
|
||||
self._aggregation: List[str] = []
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
@@ -168,13 +170,21 @@ class LLMContextAggregator(FrameProcessor):
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state."""
|
||||
self._aggregation = ""
|
||||
self._aggregation = []
|
||||
|
||||
@abstractmethod
|
||||
async def push_aggregation(self):
|
||||
"""Push the current aggregation downstream."""
|
||||
pass
|
||||
|
||||
def aggregation_string(self) -> str:
|
||||
"""Get the current aggregation as a string.
|
||||
|
||||
Returns:
|
||||
The concatenated aggregation string.
|
||||
"""
|
||||
return concatenate_aggregated_text(self._aggregation)
|
||||
|
||||
|
||||
class LLMUserAggregator(LLMContextAggregator):
|
||||
"""User LLM aggregator that processes speech-to-text transcriptions.
|
||||
@@ -212,8 +222,6 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
self._turn_params: Optional[SmartTurnParams] = None
|
||||
|
||||
if "aggregation_timeout" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
@@ -290,6 +298,12 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
await self._handle_llm_messages_update(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self.set_tools(frame.tools)
|
||||
# Push the LLMSetToolsFrame as well, since speech-to-speech LLM
|
||||
# services (like OpenAI Realtime) may need to know about tool
|
||||
# changes; unlike text-based LLM services they won't just "pick up
|
||||
# the change" on the next LLM run, as the LLM is continuously
|
||||
# running.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMSetToolChoiceFrame):
|
||||
self.set_tool_choice(frame.tool_choice)
|
||||
elif isinstance(frame, SpeechControlParamsFrame):
|
||||
@@ -301,7 +315,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
|
||||
async def _process_aggregation(self):
|
||||
"""Process the current aggregation and push it downstream."""
|
||||
aggregation = self._aggregation
|
||||
aggregation = self.aggregation_string()
|
||||
await self.reset()
|
||||
self._context.add_message({"role": self.role, "content": aggregation})
|
||||
frame = LLMContextFrame(self._context)
|
||||
@@ -349,7 +363,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
"""
|
||||
|
||||
async def should_interrupt(strategy: BaseInterruptionStrategy):
|
||||
await strategy.append_text(self._aggregation)
|
||||
await strategy.append_text(self.aggregation_string())
|
||||
return await strategy.should_interrupt()
|
||||
|
||||
return any([await should_interrupt(s) for s in self._interruption_strategies])
|
||||
@@ -419,7 +433,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
self._aggregation += f" {text}" if self._aggregation else text
|
||||
self._aggregation.append(text)
|
||||
# We just got a final result, so let's reset interim results.
|
||||
self._seen_interim_results = False
|
||||
# Reset aggregation timer.
|
||||
@@ -544,23 +558,31 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
Args:
|
||||
context: The OpenAI LLM context for conversation storage.
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments. Supports deprecated 'expect_stripped_words'.
|
||||
**kwargs: Additional arguments.
|
||||
"""
|
||||
super().__init__(context=context, role="assistant", **kwargs)
|
||||
self._params = params or LLMAssistantAggregatorParams()
|
||||
|
||||
if "expect_stripped_words" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter 'expect_stripped_words' is deprecated, use 'params' instead.",
|
||||
"Parameter 'expect_stripped_words' is deprecated. "
|
||||
"LLMAssistantAggregator now handles word spacing automatically.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._params.expect_stripped_words = kwargs["expect_stripped_words"]
|
||||
|
||||
if params and not params.expect_stripped_words:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"params.expect_stripped_words is deprecated. "
|
||||
"LLMAssistantAggregator now handles word spacing automatically.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._started = 0
|
||||
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
|
||||
self._context_updated_tasks: Set[asyncio.Task] = set()
|
||||
@@ -610,7 +632,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_function_call_result(frame)
|
||||
elif isinstance(frame, FunctionCallCancelFrame):
|
||||
await self._handle_function_call_cancel(frame)
|
||||
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
||||
elif isinstance(frame, UserImageRawFrame):
|
||||
await self._handle_user_image_frame(frame)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self.push_aggregation()
|
||||
@@ -623,7 +645,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
if not self._aggregation:
|
||||
return
|
||||
|
||||
aggregation = self._aggregation.strip()
|
||||
aggregation = self.aggregation_string()
|
||||
await self.reset()
|
||||
|
||||
if aggregation:
|
||||
@@ -761,27 +783,16 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
message["content"] = result
|
||||
|
||||
async def _handle_user_image_frame(self, frame: UserImageRawFrame):
|
||||
logger.debug(
|
||||
f"{self} UserImageRawFrame: [{frame.request.function_name}:{frame.request.tool_call_id}]"
|
||||
)
|
||||
|
||||
if frame.request.tool_call_id not in self._function_calls_in_progress:
|
||||
logger.warning(
|
||||
f"UserImageRawFrame tool_call_id [{frame.request.tool_call_id}] is not running"
|
||||
)
|
||||
if not frame.append_to_context:
|
||||
return
|
||||
|
||||
del self._function_calls_in_progress[frame.request.tool_call_id]
|
||||
logger.debug(f"{self} Appending UserImageRawFrame to LLM context (size: {frame.size})")
|
||||
|
||||
# Update context with the image frame
|
||||
self._update_function_call_result(
|
||||
frame.request.function_name, frame.request.tool_call_id, "COMPLETED"
|
||||
)
|
||||
self._context.add_image_frame_message(
|
||||
format=frame.format,
|
||||
size=frame.size,
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
text=frame.text,
|
||||
)
|
||||
|
||||
await self.push_aggregation()
|
||||
@@ -798,10 +809,11 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
if self._params.expect_stripped_words:
|
||||
self._aggregation += f" {frame.text}" if self._aggregation else frame.text
|
||||
else:
|
||||
self._aggregation += frame.text
|
||||
# Make sure we really have text (spaces count, too!)
|
||||
if len(frame.text) == 0:
|
||||
return
|
||||
|
||||
self._aggregation.append(frame.text)
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
|
||||
@@ -27,11 +27,24 @@ class UserResponseAggregator(LLMUserAggregator):
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize the user response aggregator.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
`UserResponseAggregator` is deprecated and will be removed in a future version.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional arguments passed to parent LLMUserAggregator.
|
||||
"""
|
||||
super().__init__(context=LLMContext(), **kwargs)
|
||||
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"`UserResponseAggregator` is deprecated and will be removed in a future version.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
async def push_aggregation(self):
|
||||
"""Push the aggregated user response as a TextFrame.
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ allowing for flexible frame filtering logic in processing pipelines.
|
||||
|
||||
from typing import Awaitable, Callable
|
||||
|
||||
from pipecat.frames.frames import EndFrame, Frame, SystemFrame
|
||||
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame, SystemFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ class FunctionFilter(FrameProcessor):
|
||||
self,
|
||||
filter: Callable[[Frame], Awaitable[bool]],
|
||||
direction: FrameDirection = FrameDirection.DOWNSTREAM,
|
||||
filter_system_frames: bool = False,
|
||||
):
|
||||
"""Initialize the function filter.
|
||||
|
||||
@@ -36,22 +37,32 @@ class FunctionFilter(FrameProcessor):
|
||||
frame should pass through, False otherwise.
|
||||
direction: The direction to apply filtering. Only frames moving in
|
||||
this direction will be filtered. Defaults to DOWNSTREAM.
|
||||
filter_system_frames: Whether to filter system frames. Defaults to False.
|
||||
"""
|
||||
super().__init__()
|
||||
self._filter = filter
|
||||
self._direction = direction
|
||||
self._filter_system_frames = filter_system_frames
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
#
|
||||
|
||||
# Ignore system frames, end frames and frames that are not following the
|
||||
# direction of this gate
|
||||
def _should_passthrough_frame(self, frame, direction):
|
||||
"""Check if a frame should pass through without filtering."""
|
||||
# Ignore system frames, end frames and frames that are not following the
|
||||
# direction of this gate
|
||||
return isinstance(frame, (SystemFrame, EndFrame)) or direction != self._direction
|
||||
# Always passthrough frames in the wrong direction
|
||||
if direction != self._direction:
|
||||
return True
|
||||
|
||||
# Always passthrough lifecycle frames
|
||||
if isinstance(frame, (StartFrame, EndFrame, CancelFrame)):
|
||||
return True
|
||||
|
||||
# If not filtering system frames, passthrough all other system frames
|
||||
if not self._filter_system_frames and isinstance(frame, SystemFrame):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process a frame through the filter.
|
||||
|
||||
@@ -1018,6 +1018,7 @@ class RTVIObserver(BaseObserver):
|
||||
|
||||
if (
|
||||
isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame))
|
||||
and (direction == FrameDirection.DOWNSTREAM)
|
||||
and self._params.user_speaking_enabled
|
||||
):
|
||||
await self._handle_interruptions(frame)
|
||||
|
||||
@@ -26,6 +26,7 @@ from pipecat.frames.frames import (
|
||||
TTSTextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -140,29 +141,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
Result: "Hello there how are you"
|
||||
"""
|
||||
if self._current_text_parts and self._aggregation_start_time:
|
||||
# Check specifically for space characters, previously isspace() was used
|
||||
# but that includes all whitespace characters (e.g. \n), not just spaces.
|
||||
has_leading_spaces = any(
|
||||
part and part[0] == " " for part in self._current_text_parts[1:]
|
||||
)
|
||||
has_trailing_spaces = any(
|
||||
part and part[-1] == " " for part in self._current_text_parts[:-1]
|
||||
)
|
||||
|
||||
# If there are embedded spaces in the fragments, use direct concatenation
|
||||
contains_spacing_between_fragments = has_leading_spaces or has_trailing_spaces
|
||||
|
||||
# Apply corresponding joining method
|
||||
if contains_spacing_between_fragments:
|
||||
# Fragments already have spacing - just concatenate
|
||||
content = "".join(self._current_text_parts)
|
||||
else:
|
||||
# Word-by-word fragments - join with spaces
|
||||
content = " ".join(self._current_text_parts)
|
||||
|
||||
# Clean up any excessive whitespace
|
||||
content = content.strip()
|
||||
|
||||
content = concatenate_aggregated_text(self._current_text_parts)
|
||||
if content:
|
||||
logger.trace(f"Emitting aggregated assistant message: {content}")
|
||||
message = TranscriptionMessage(
|
||||
|
||||
@@ -44,6 +44,8 @@ from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.transports.daily.utils import (
|
||||
DailyMeetingTokenParams,
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRESTHelper,
|
||||
DailyRoomParams,
|
||||
DailyRoomProperties,
|
||||
@@ -76,6 +78,7 @@ class DailyRoomConfig(BaseModel):
|
||||
async def configure(
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
room_exp_duration: Optional[float] = 2.0,
|
||||
token_exp_duration: Optional[float] = 2.0,
|
||||
sip_caller_phone: Optional[str] = None,
|
||||
@@ -83,6 +86,7 @@ async def configure(
|
||||
sip_num_endpoints: Optional[int] = 1,
|
||||
sip_codecs: Optional[Dict[str, List[str]]] = None,
|
||||
room_properties: Optional[DailyRoomProperties] = None,
|
||||
token_properties: Optional["DailyMeetingTokenProperties"] = None,
|
||||
) -> DailyRoomConfig:
|
||||
"""Configure Daily room URL and token with optional SIP capabilities.
|
||||
|
||||
@@ -92,6 +96,7 @@ async def configure(
|
||||
|
||||
Args:
|
||||
aiohttp_session: HTTP session for making API requests.
|
||||
api_key: Daily API key.
|
||||
room_exp_duration: Room expiration time in hours.
|
||||
token_exp_duration: Token expiration time in hours.
|
||||
sip_caller_phone: Phone number or identifier for SIP display name.
|
||||
@@ -104,6 +109,9 @@ async def configure(
|
||||
individual parameters. When provided, this overrides room_exp_duration and
|
||||
SIP-related parameters. If not provided, properties are built from the
|
||||
individual parameters as before.
|
||||
token_properties: Optional DailyMeetingTokenProperties to customize the meeting
|
||||
token. When provided, these properties are passed to the token creation API.
|
||||
Note that room_name, exp, and is_owner will be set automatically.
|
||||
|
||||
Returns:
|
||||
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
|
||||
@@ -129,7 +137,7 @@ async def configure(
|
||||
config = await configure(session, room_properties=custom_props)
|
||||
"""
|
||||
# Check for required API key
|
||||
api_key = os.getenv("DAILY_API_KEY")
|
||||
api_key = api_key or os.getenv("DAILY_API_KEY")
|
||||
if not api_key:
|
||||
raise Exception(
|
||||
"DAILY_API_KEY environment variable is required. "
|
||||
@@ -177,7 +185,10 @@ async def configure(
|
||||
|
||||
# Create token and return standard format
|
||||
expiry_time: float = token_exp_duration * 60 * 60
|
||||
token = await daily_rest_helper.get_token(room_url, expiry_time)
|
||||
token_params = None
|
||||
if token_properties:
|
||||
token_params = DailyMeetingTokenParams(properties=token_properties)
|
||||
token = await daily_rest_helper.get_token(room_url, expiry_time, params=token_params)
|
||||
return DailyRoomConfig(room_url=room_url, token=token)
|
||||
|
||||
# Create a new room
|
||||
@@ -219,7 +230,12 @@ async def configure(
|
||||
|
||||
# Create meeting token
|
||||
token_expiry_seconds = token_exp_duration * 60 * 60
|
||||
token = await daily_rest_helper.get_token(room_url, token_expiry_seconds)
|
||||
token_params = None
|
||||
if token_properties:
|
||||
token_params = DailyMeetingTokenParams(properties=token_properties)
|
||||
token = await daily_rest_helper.get_token(
|
||||
room_url, token_expiry_seconds, params=token_params
|
||||
)
|
||||
|
||||
if sip_enabled:
|
||||
# Return SIP configuration object
|
||||
|
||||
@@ -82,6 +82,7 @@ from loguru import logger
|
||||
|
||||
from pipecat.runner.types import (
|
||||
DailyRunnerArguments,
|
||||
RunnerArguments,
|
||||
SmallWebRTCRunnerArguments,
|
||||
WebSocketRunnerArguments,
|
||||
)
|
||||
@@ -309,7 +310,7 @@ def _setup_webrtc_routes(
|
||||
):
|
||||
"""Mimic Pipecat Cloud's proxy."""
|
||||
active_session = active_sessions.get(session_id)
|
||||
if not active_session:
|
||||
if active_session is None:
|
||||
return Response(content="Invalid or not-yet-ready session_id", status_code=404)
|
||||
|
||||
if path.endswith("api/offer"):
|
||||
@@ -529,9 +530,9 @@ def _setup_daily_routes(app: FastAPI):
|
||||
"""Set up Daily-specific routes."""
|
||||
|
||||
@app.get("/")
|
||||
async def start_agent():
|
||||
async def create_room_and_start_agent():
|
||||
"""Launch a Daily bot and redirect to room."""
|
||||
print("Starting bot with Daily transport")
|
||||
print("Starting bot with Daily transport and redirecting to Daily room")
|
||||
|
||||
import aiohttp
|
||||
|
||||
@@ -546,14 +547,15 @@ def _setup_daily_routes(app: FastAPI):
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
return RedirectResponse(room_url)
|
||||
|
||||
async def _handle_rtvi_request(request: Request):
|
||||
"""Common handler for both /start and /connect endpoints.
|
||||
@app.post("/start")
|
||||
async def start_agent(request: Request):
|
||||
"""Handler for /start endpoints.
|
||||
|
||||
Expects POST body like::
|
||||
|
||||
{
|
||||
"createDailyRoom": true,
|
||||
"dailyRoomProperties": { "start_video_off": true },
|
||||
"dailyMeetingTokenProperties": { "is_owner": true, "user_name": "Bot" },
|
||||
"body": { "custom_data": "value" }
|
||||
}
|
||||
"""
|
||||
@@ -567,45 +569,68 @@ def _setup_daily_routes(app: FastAPI):
|
||||
logger.error(f"Failed to parse request body: {e}")
|
||||
request_data = {}
|
||||
|
||||
# Extract the body data that should be passed to the bot
|
||||
# This mimics Pipecat Cloud's behavior
|
||||
bot_body = request_data.get("body", {})
|
||||
create_daily_room = request_data.get("createDailyRoom", False)
|
||||
body = request_data.get("body", {})
|
||||
daily_room_properties_dict = request_data.get("dailyRoomProperties", None)
|
||||
daily_token_properties_dict = request_data.get("dailyMeetingTokenProperties", None)
|
||||
|
||||
# Log the extracted body data for debugging
|
||||
if bot_body:
|
||||
logger.info(f"Extracted body data for bot: {bot_body}")
|
||||
bot_module = _get_bot_module()
|
||||
|
||||
existing_room_url = os.getenv("DAILY_SAMPLE_ROOM_URL")
|
||||
|
||||
result = None
|
||||
|
||||
# Configure room if:
|
||||
# 1. Explicitly requested via createDailyRoom in payload
|
||||
# 2. Using pre-configured room from DAILY_SAMPLE_ROOM_URL env var
|
||||
if create_daily_room or existing_room_url:
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.transports.daily.utils import (
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRoomProperties,
|
||||
)
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Parse dailyRoomProperties if provided
|
||||
room_properties = None
|
||||
if daily_room_properties_dict:
|
||||
try:
|
||||
room_properties = DailyRoomProperties(**daily_room_properties_dict)
|
||||
logger.debug(f"Using custom room properties: {room_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyRoomProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
# Parse dailyMeetingTokenProperties if provided
|
||||
token_properties = None
|
||||
if daily_token_properties_dict:
|
||||
try:
|
||||
token_properties = DailyMeetingTokenProperties(
|
||||
**daily_token_properties_dict
|
||||
)
|
||||
logger.debug(f"Using custom token properties: {token_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyMeetingTokenProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
room_url, token = await configure(
|
||||
session, room_properties=room_properties, token_properties=token_properties
|
||||
)
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token, body=body)
|
||||
result = {
|
||||
"dailyRoom": room_url,
|
||||
"dailyToken": token,
|
||||
"sessionId": str(uuid.uuid4()),
|
||||
}
|
||||
else:
|
||||
logger.debug("No body data provided in request")
|
||||
runner_args = RunnerArguments(body=body)
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
# Start the bot in the background
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
room_url, token = await configure(session)
|
||||
|
||||
# Start the bot in the background with extracted body data
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token, body=bot_body)
|
||||
asyncio.create_task(bot_module.bot(runner_args))
|
||||
# Match PCC /start endpoint response format:
|
||||
return {"dailyRoom": room_url, "dailyToken": token}
|
||||
|
||||
@app.post("/start")
|
||||
async def rtvi_start(request: Request):
|
||||
"""Launch a Daily bot and return connection info for RTVI clients."""
|
||||
return await _handle_rtvi_request(request)
|
||||
|
||||
@app.post("/connect")
|
||||
async def rtvi_connect(request: Request):
|
||||
"""Launch a Daily bot and return connection info for RTVI clients.
|
||||
|
||||
.. deprecated:: 0.0.78
|
||||
Use /start instead. This endpoint will be removed in a future version.
|
||||
"""
|
||||
logger.warning(
|
||||
"DEPRECATED: /connect endpoint is deprecated. Please use /start instead. "
|
||||
"This endpoint will be removed in a future version."
|
||||
)
|
||||
return await _handle_rtvi_request(request)
|
||||
return result
|
||||
|
||||
|
||||
def _setup_telephony_routes(app: FastAPI, *, transport_type: str, proxy: str):
|
||||
@@ -800,10 +825,6 @@ def main():
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
|
||||
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
|
||||
@@ -20,9 +20,11 @@ from fastapi import WebSocket
|
||||
class RunnerArguments:
|
||||
"""Base class for runner session arguments."""
|
||||
|
||||
handle_sigint: bool = field(init=False)
|
||||
handle_sigterm: bool = field(init=False)
|
||||
pipeline_idle_timeout_secs: int = field(init=False)
|
||||
# Use kw_only so subclasses don't need to worry about ordering.
|
||||
handle_sigint: bool = field(init=False, kw_only=True)
|
||||
handle_sigterm: bool = field(init=False, kw_only=True)
|
||||
pipeline_idle_timeout_secs: int = field(init=False, kw_only=True)
|
||||
body: Optional[Any] = field(default_factory=dict, kw_only=True)
|
||||
|
||||
def __post_init__(self):
|
||||
self.handle_sigint = False
|
||||
@@ -42,7 +44,6 @@ class DailyRunnerArguments(RunnerArguments):
|
||||
|
||||
room_url: str
|
||||
token: Optional[str] = None
|
||||
body: Optional[Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -55,7 +56,6 @@ class WebSocketRunnerArguments(RunnerArguments):
|
||||
"""
|
||||
|
||||
websocket: WebSocket
|
||||
body: Optional[Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -216,6 +216,7 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
"account_sid": start_data.get("account_sid"),
|
||||
"from": start_data.get("from", ""),
|
||||
"to": start_data.get("to", ""),
|
||||
"custom_parameters": start_data.get("custom_parameters", ""),
|
||||
}
|
||||
|
||||
else:
|
||||
|
||||
@@ -720,11 +720,11 @@ class AWSBedrockLLMService(LLMService):
|
||||
additional_model_request_fields: Additional model-specific parameters.
|
||||
"""
|
||||
|
||||
max_tokens: Optional[int] = Field(default_factory=lambda: 4096, ge=1)
|
||||
temperature: Optional[float] = Field(default_factory=lambda: 0.7, ge=0.0, le=1.0)
|
||||
top_p: Optional[float] = Field(default_factory=lambda: 0.999, ge=0.0, le=1.0)
|
||||
max_tokens: Optional[int] = Field(default=None, ge=1)
|
||||
temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
||||
top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
||||
stop_sequences: Optional[List[str]] = Field(default_factory=lambda: [])
|
||||
latency: Optional[str] = Field(default_factory=lambda: "standard")
|
||||
latency: Optional[str] = Field(default=None)
|
||||
additional_model_request_fields: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
@@ -801,6 +801,24 @@ class AWSBedrockLLMService(LLMService):
|
||||
"""
|
||||
return True
|
||||
|
||||
def _build_inference_config(self) -> Dict[str, Any]:
|
||||
"""Build inference config with only the parameters that are set.
|
||||
|
||||
This prevents conflicts with models (e.g., Claude Sonnet 4.5) that don't
|
||||
allow certain parameter combinations like temperature and top_p together.
|
||||
|
||||
Returns:
|
||||
Dictionary containing only the inference parameters that are not None.
|
||||
"""
|
||||
inference_config = {}
|
||||
if self._settings["max_tokens"] is not None:
|
||||
inference_config["maxTokens"] = self._settings["max_tokens"]
|
||||
if self._settings["temperature"] is not None:
|
||||
inference_config["temperature"] = self._settings["temperature"]
|
||||
if self._settings["top_p"] is not None:
|
||||
inference_config["topP"] = self._settings["top_p"]
|
||||
return inference_config
|
||||
|
||||
async def run_inference(self, context: LLMContext | OpenAILLMContext) -> Optional[str]:
|
||||
"""Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
|
||||
|
||||
@@ -826,16 +844,16 @@ class AWSBedrockLLMService(LLMService):
|
||||
model_id = self.model_name
|
||||
|
||||
# Prepare request parameters
|
||||
inference_config = self._build_inference_config()
|
||||
|
||||
request_params = {
|
||||
"modelId": model_id,
|
||||
"messages": messages,
|
||||
"inferenceConfig": {
|
||||
"maxTokens": 8192,
|
||||
"temperature": 0.7,
|
||||
"topP": 0.9,
|
||||
},
|
||||
}
|
||||
|
||||
if inference_config:
|
||||
request_params["inferenceConfig"] = inference_config
|
||||
|
||||
if system:
|
||||
request_params["system"] = system
|
||||
|
||||
@@ -974,21 +992,20 @@ class AWSBedrockLLMService(LLMService):
|
||||
tools = params_from_context["tools"]
|
||||
tool_choice = params_from_context["tool_choice"]
|
||||
|
||||
# Set up inference config
|
||||
inference_config = {
|
||||
"maxTokens": self._settings["max_tokens"],
|
||||
"temperature": self._settings["temperature"],
|
||||
"topP": self._settings["top_p"],
|
||||
}
|
||||
# Set up inference config - only include parameters that are set
|
||||
inference_config = self._build_inference_config()
|
||||
|
||||
# Prepare request parameters
|
||||
request_params = {
|
||||
"modelId": self.model_name,
|
||||
"messages": messages,
|
||||
"inferenceConfig": inference_config,
|
||||
"additionalModelRequestFields": self._settings["additional_model_request_fields"],
|
||||
}
|
||||
|
||||
# Only add inference config if it has parameters
|
||||
if inference_config:
|
||||
request_params["inferenceConfig"] = inference_config
|
||||
|
||||
# Add system message
|
||||
if system:
|
||||
request_params["system"] = system
|
||||
|
||||
@@ -10,7 +10,8 @@ This module provides specialized context aggregators and message handling for AW
|
||||
including conversation history management and role-specific message processing.
|
||||
|
||||
.. deprecated:: 0.0.91
|
||||
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||
AWS Nova Sonic no longer uses types from this module under the hood.
|
||||
It now uses `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
@@ -26,9 +27,6 @@ including conversation history management and role-specific message processing.
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
```
|
||||
|
||||
AFTER:
|
||||
@@ -42,9 +40,6 @@ including conversation history management and role-specific message processing.
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
"""
|
||||
|
||||
@@ -53,8 +48,10 @@ import warnings
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws.nova_sonic.context are deprecated. \n"
|
||||
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Types in pipecat.services.aws.nova_sonic.context (or "
|
||||
"pipecat.services.aws_nova_sonic.context) are deprecated. \n"
|
||||
"AWS Nova Sonic no longer uses types from this module under the hood. \n"
|
||||
"It now uses `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||
"BEFORE:\n"
|
||||
"```\n"
|
||||
@@ -67,8 +64,6 @@ with warnings.catch_warnings():
|
||||
"context: AWSNovaSonicLLMContext\n"
|
||||
"# or\n"
|
||||
"context: OpenAILLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```\n\n"
|
||||
"AFTER:\n"
|
||||
"```\n"
|
||||
@@ -79,9 +74,363 @@ with warnings.catch_warnings():
|
||||
"frame: LLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: LLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
DataFrame,
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.aws.nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
|
||||
|
||||
class Role(Enum):
|
||||
"""Roles supported in AWS Nova Sonic conversations.
|
||||
|
||||
Parameters:
|
||||
SYSTEM: System-level messages (not used in conversation history).
|
||||
USER: Messages sent by the user.
|
||||
ASSISTANT: Messages sent by the assistant.
|
||||
TOOL: Messages sent by tools (not used in conversation history).
|
||||
"""
|
||||
|
||||
SYSTEM = "SYSTEM"
|
||||
USER = "USER"
|
||||
ASSISTANT = "ASSISTANT"
|
||||
TOOL = "TOOL"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistoryMessage:
|
||||
"""A single message in AWS Nova Sonic conversation history.
|
||||
|
||||
Parameters:
|
||||
role: The role of the message sender (USER or ASSISTANT only).
|
||||
text: The text content of the message.
|
||||
"""
|
||||
|
||||
role: Role # only USER and ASSISTANT
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicConversationHistory:
|
||||
"""Complete conversation history for AWS Nova Sonic initialization.
|
||||
|
||||
Parameters:
|
||||
system_instruction: System-level instruction for the conversation.
|
||||
messages: List of conversation messages between user and assistant.
|
||||
"""
|
||||
|
||||
system_instruction: str = None
|
||||
messages: list[AWSNovaSonicConversationHistoryMessage] = field(default_factory=list)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMContext(OpenAILLMContext):
|
||||
"""Specialized LLM context for AWS Nova Sonic service.
|
||||
|
||||
Extends OpenAI context with Nova Sonic-specific message handling,
|
||||
conversation history management, and text buffering capabilities.
|
||||
"""
|
||||
|
||||
def __init__(self, messages=None, tools=None, **kwargs):
|
||||
"""Initialize AWS Nova Sonic LLM context.
|
||||
|
||||
Args:
|
||||
messages: Initial messages for the context.
|
||||
tools: Available tools for the context.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(messages=messages, tools=tools, **kwargs)
|
||||
self.__setup_local()
|
||||
|
||||
def __setup_local(self, system_instruction: str = ""):
|
||||
self._assistant_text = ""
|
||||
self._user_text = ""
|
||||
self._system_instruction = system_instruction
|
||||
|
||||
@staticmethod
|
||||
def upgrade_to_nova_sonic(
|
||||
obj: OpenAILLMContext, system_instruction: str
|
||||
) -> "AWSNovaSonicLLMContext":
|
||||
"""Upgrade an OpenAI context to AWS Nova Sonic context.
|
||||
|
||||
Args:
|
||||
obj: The OpenAI context to upgrade.
|
||||
system_instruction: System instruction for the context.
|
||||
|
||||
Returns:
|
||||
The upgraded AWS Nova Sonic context.
|
||||
"""
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AWSNovaSonicLLMContext):
|
||||
obj.__class__ = AWSNovaSonicLLMContext
|
||||
obj.__setup_local(system_instruction)
|
||||
return obj
|
||||
|
||||
# NOTE: this method has the side-effect of updating _system_instruction from messages
|
||||
def get_messages_for_initializing_history(self) -> AWSNovaSonicConversationHistory:
|
||||
"""Get conversation history for initializing AWS Nova Sonic session.
|
||||
|
||||
Processes stored messages and extracts system instruction and conversation
|
||||
history in the format expected by AWS Nova Sonic.
|
||||
|
||||
Returns:
|
||||
Formatted conversation history with system instruction and messages.
|
||||
"""
|
||||
history = AWSNovaSonicConversationHistory(system_instruction=self._system_instruction)
|
||||
|
||||
# Bail if there are no messages
|
||||
if not self.messages:
|
||||
return history
|
||||
|
||||
messages = copy.deepcopy(self.messages)
|
||||
|
||||
# If we have a "system" message as our first message, let's pull that out into "instruction"
|
||||
if messages[0].get("role") == "system":
|
||||
system = messages.pop(0)
|
||||
content = system.get("content")
|
||||
if isinstance(content, str):
|
||||
history.system_instruction = content
|
||||
elif isinstance(content, list):
|
||||
history.system_instruction = content[0].get("text")
|
||||
if history.system_instruction:
|
||||
self._system_instruction = history.system_instruction
|
||||
|
||||
# Process remaining messages to fill out conversation history.
|
||||
# Nova Sonic supports "user" and "assistant" messages in history.
|
||||
for message in messages:
|
||||
history_message = self.from_standard_message(message)
|
||||
if history_message:
|
||||
history.messages.append(history_message)
|
||||
|
||||
return history
|
||||
|
||||
def get_messages_for_persistent_storage(self):
|
||||
"""Get messages formatted for persistent storage.
|
||||
|
||||
Returns:
|
||||
List of messages including system instruction if present.
|
||||
"""
|
||||
messages = super().get_messages_for_persistent_storage()
|
||||
# If we have a system instruction and messages doesn't already contain it, add it
|
||||
if self._system_instruction and not (messages and messages[0].get("role") == "system"):
|
||||
messages.insert(0, {"role": "system", "content": self._system_instruction})
|
||||
return messages
|
||||
|
||||
def from_standard_message(self, message) -> AWSNovaSonicConversationHistoryMessage:
|
||||
"""Convert standard message format to Nova Sonic format.
|
||||
|
||||
Args:
|
||||
message: Standard message dictionary to convert.
|
||||
|
||||
Returns:
|
||||
Nova Sonic conversation history message, or None if not convertible.
|
||||
"""
|
||||
role = message.get("role")
|
||||
if message.get("role") == "user" or message.get("role") == "assistant":
|
||||
content = message.get("content")
|
||||
if isinstance(message.get("content"), list):
|
||||
content = ""
|
||||
for c in message.get("content"):
|
||||
if c.get("type") == "text":
|
||||
content += " " + c.get("text")
|
||||
else:
|
||||
logger.error(
|
||||
f"Unhandled content type in context message: {c.get('type')} - {message}"
|
||||
)
|
||||
# There won't be content if this is an assistant tool call entry.
|
||||
# We're ignoring those since they can't be loaded into AWS Nova Sonic conversation
|
||||
# history
|
||||
if content:
|
||||
return AWSNovaSonicConversationHistoryMessage(role=Role[role.upper()], text=content)
|
||||
# NOTE: we're ignoring messages with role "tool" since they can't be loaded into AWS Nova
|
||||
# Sonic conversation history
|
||||
|
||||
def buffer_user_text(self, text):
|
||||
"""Buffer user text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: User text to buffer.
|
||||
"""
|
||||
self._user_text += f" {text}" if self._user_text else text
|
||||
# logger.debug(f"User text buffered: {self._user_text}")
|
||||
|
||||
def flush_aggregated_user_text(self) -> str:
|
||||
"""Flush buffered user text to context as a complete message.
|
||||
|
||||
Returns:
|
||||
The flushed user text, or empty string if no text was buffered.
|
||||
"""
|
||||
if not self._user_text:
|
||||
return ""
|
||||
user_text = self._user_text
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": user_text}],
|
||||
}
|
||||
self._user_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (user): {self.get_messages_for_logging()}")
|
||||
return user_text
|
||||
|
||||
def buffer_assistant_text(self, text):
|
||||
"""Buffer assistant text for later flushing to context.
|
||||
|
||||
Args:
|
||||
text: Assistant text to buffer.
|
||||
"""
|
||||
self._assistant_text += text
|
||||
# logger.debug(f"Assistant text buffered: {self._assistant_text}")
|
||||
|
||||
def flush_aggregated_assistant_text(self):
|
||||
"""Flush buffered assistant text to context as a complete message."""
|
||||
if not self._assistant_text:
|
||||
return
|
||||
message = {
|
||||
"role": "assistant",
|
||||
"content": [{"type": "text", "text": self._assistant_text}],
|
||||
}
|
||||
self._assistant_text = ""
|
||||
self.add_message(message)
|
||||
# logger.debug(f"Context updated (assistant): {self.get_messages_for_logging()}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicMessagesUpdateFrame(DataFrame):
|
||||
"""Frame containing updated AWS Nova Sonic context.
|
||||
|
||||
Parameters:
|
||||
context: The updated AWS Nova Sonic LLM context.
|
||||
"""
|
||||
|
||||
context: AWSNovaSonicLLMContext
|
||||
|
||||
|
||||
class AWSNovaSonicUserContextAggregator(OpenAIUserContextAggregator):
|
||||
"""Context aggregator for user messages in AWS Nova Sonic conversations.
|
||||
|
||||
Extends the OpenAI user context aggregator to emit Nova Sonic-specific
|
||||
context update frames.
|
||||
"""
|
||||
|
||||
async def process_frame(
|
||||
self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
|
||||
):
|
||||
"""Process frames and emit Nova Sonic-specific context updates.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
# Parent does not push LLMMessagesUpdateFrame
|
||||
if isinstance(frame, LLMMessagesUpdateFrame):
|
||||
await self.push_frame(AWSNovaSonicMessagesUpdateFrame(context=self._context))
|
||||
|
||||
|
||||
class AWSNovaSonicAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
"""Context aggregator for assistant messages in AWS Nova Sonic conversations.
|
||||
|
||||
Provides specialized handling for assistant responses and function calls
|
||||
in AWS Nova Sonic context, with custom frame processing logic.
|
||||
"""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames with Nova Sonic-specific logic.
|
||||
|
||||
Args:
|
||||
frame: The frame to process.
|
||||
direction: The direction the frame is traveling.
|
||||
"""
|
||||
# HACK: For now, disable the context aggregator by making it just pass through all frames
|
||||
# that the parent handles (except the function call stuff, which we still need).
|
||||
# For an explanation of this hack, see
|
||||
# AWSNovaSonicLLMService._report_assistant_response_text_added.
|
||||
if isinstance(
|
||||
frame,
|
||||
(
|
||||
InterruptionFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
TextFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMMessagesUpdateFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
UserImageRawFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
),
|
||||
):
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
async def handle_function_call_result(self, frame: FunctionCallResultFrame):
|
||||
"""Handle function call results for AWS Nova Sonic.
|
||||
|
||||
Args:
|
||||
frame: The function call result frame to handle.
|
||||
"""
|
||||
await super().handle_function_call_result(frame)
|
||||
|
||||
# The standard function callback code path pushes the FunctionCallResultFrame from the LLM
|
||||
# itself, so we didn't have a chance to add the result to the AWS Nova Sonic server-side
|
||||
# context. Let's push a special frame to do that.
|
||||
await self.push_frame(
|
||||
AWSNovaSonicFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AWSNovaSonicContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for AWS Nova Sonic.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator.
|
||||
_assistant: The assistant context aggregator.
|
||||
"""
|
||||
|
||||
_user: AWSNovaSonicUserContextAggregator
|
||||
_assistant: AWSNovaSonicAssistantContextAggregator
|
||||
|
||||
def user(self) -> AWSNovaSonicUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
Returns:
|
||||
The user context aggregator instance.
|
||||
"""
|
||||
return self._user
|
||||
|
||||
def assistant(self) -> AWSNovaSonicAssistantContextAggregator:
|
||||
"""Get the assistant context aggregator.
|
||||
|
||||
Returns:
|
||||
The assistant context aggregator instance.
|
||||
"""
|
||||
return self._assistant
|
||||
|
||||
@@ -10,78 +10,12 @@ This module provides specialized context aggregators and message handling for AW
|
||||
including conversation history management and role-specific message processing.
|
||||
|
||||
.. deprecated:: 0.0.91
|
||||
AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`.
|
||||
AWS Nova Sonic no longer uses types from this module under the hood.
|
||||
It now uses `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
```
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.messages
|
||||
```
|
||||
|
||||
AFTER:
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
|
||||
# Reading messages from context
|
||||
messages = context.get_messages()
|
||||
```
|
||||
See deprecation warning in pipecat.services.aws.nova_sonic.context for more
|
||||
details.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.aws_nova_sonic.context are deprecated. \n"
|
||||
"AWS Nova Sonic now supports `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||
"BEFORE:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = OpenAILLMContext(messages, tools)\n"
|
||||
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: OpenAILLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: AWSNovaSonicLLMContext\n"
|
||||
"# or\n"
|
||||
"context: OpenAILLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```\n\n"
|
||||
"AFTER:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = LLMContext(messages, tools)\n"
|
||||
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: LLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: LLMContext\n\n"
|
||||
"# Reading messages from context\n"
|
||||
"messages = context.messages\n"
|
||||
"```",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
from pipecat.services.aws.nova_sonic.context import *
|
||||
|
||||
@@ -38,7 +38,7 @@ class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
Args:
|
||||
api_key: The API key for the Azure OpenAI service.
|
||||
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2025-04-01-preview&deployment=my-realtime-deployment"
|
||||
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||
"""
|
||||
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||
@@ -52,7 +52,7 @@ class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
logger.info(f"Connecting to {self.base_url}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
|
||||
@@ -48,6 +48,26 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GenerationConfig(BaseModel):
|
||||
"""Configuration for Cartesia Sonic-3 generation parameters.
|
||||
|
||||
Sonic-3 interprets these parameters as guidance to ensure natural speech.
|
||||
Test against your content for best results.
|
||||
|
||||
Parameters:
|
||||
volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
|
||||
speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
|
||||
emotion: Single emotion string to guide the emotional tone. Examples include neutral,
|
||||
angry, excited, content, sad, scared. Over 60 emotions are supported. For best
|
||||
results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
|
||||
and Marian.
|
||||
"""
|
||||
|
||||
volume: Optional[float] = None
|
||||
speed: Optional[float] = None
|
||||
emotion: Optional[str] = None
|
||||
|
||||
|
||||
def language_to_cartesia_language(language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Cartesia language code.
|
||||
|
||||
@@ -58,20 +78,47 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
|
||||
The corresponding Cartesia language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
Language.AR: "ar",
|
||||
Language.BG: "bg",
|
||||
Language.BN: "bn",
|
||||
Language.CS: "cs",
|
||||
Language.DA: "da",
|
||||
Language.DE: "de",
|
||||
Language.EN: "en",
|
||||
Language.EL: "el",
|
||||
Language.ES: "es",
|
||||
Language.FI: "fi",
|
||||
Language.FR: "fr",
|
||||
Language.GU: "gu",
|
||||
Language.HE: "he",
|
||||
Language.HI: "hi",
|
||||
Language.HR: "hr",
|
||||
Language.HU: "hu",
|
||||
Language.ID: "id",
|
||||
Language.IT: "it",
|
||||
Language.JA: "ja",
|
||||
Language.KA: "ka",
|
||||
Language.KN: "kn",
|
||||
Language.KO: "ko",
|
||||
Language.ML: "ml",
|
||||
Language.MR: "mr",
|
||||
Language.MS: "ms",
|
||||
Language.NL: "nl",
|
||||
Language.NO: "no",
|
||||
Language.PA: "pa",
|
||||
Language.PL: "pl",
|
||||
Language.PT: "pt",
|
||||
Language.RO: "ro",
|
||||
Language.RU: "ru",
|
||||
Language.SK: "sk",
|
||||
Language.SV: "sv",
|
||||
Language.TA: "ta",
|
||||
Language.TE: "te",
|
||||
Language.TH: "th",
|
||||
Language.TL: "tl",
|
||||
Language.TR: "tr",
|
||||
Language.UK: "uk",
|
||||
Language.VI: "vi",
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
@@ -101,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language to use for synthesis.
|
||||
speed: Voice speed control.
|
||||
emotion: List of emotion controls.
|
||||
speed: Voice speed control for non-Sonic-3 models (literal values).
|
||||
emotion: List of emotion controls for non-Sonic-3 models.
|
||||
|
||||
.. deprecated:: 0.0.68
|
||||
The `emotion` parameter is deprecated and will be removed in a future version.
|
||||
|
||||
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
||||
speed (numeric), and emotion (string) parameters.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
||||
emotion: Optional[List[str]] = []
|
||||
generation_config: Optional[GenerationConfig] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -119,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
voice_id: str,
|
||||
cartesia_version: str = "2025-04-16",
|
||||
url: str = "wss://api.cartesia.ai/tts/websocket",
|
||||
model: str = "sonic-2",
|
||||
model: str = "sonic-3",
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "pcm_s16le",
|
||||
container: str = "raw",
|
||||
@@ -135,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
cartesia_version: API version string for Cartesia service.
|
||||
url: WebSocket URL for Cartesia TTS API.
|
||||
model: TTS model to use (e.g., "sonic-2").
|
||||
model: TTS model to use (e.g., "sonic-3").
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
encoding: Audio encoding format.
|
||||
container: Audio container format.
|
||||
@@ -179,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
else "en",
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
"generation_config": params.generation_config,
|
||||
}
|
||||
self.set_model_name(model)
|
||||
self.set_voice(voice_id)
|
||||
@@ -297,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
if self._settings["speed"]:
|
||||
msg["speed"] = self._settings["speed"]
|
||||
|
||||
if self._settings["generation_config"]:
|
||||
msg["generation_config"] = self._settings["generation_config"].model_dump(
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
return json.dumps(msg)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
@@ -482,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language to use for synthesis.
|
||||
speed: Voice speed control.
|
||||
emotion: List of emotion controls.
|
||||
speed: Voice speed control for non-Sonic-3 models (literal values).
|
||||
emotion: List of emotion controls for non-Sonic-3 models.
|
||||
|
||||
.. deprecated:: 0.0.68
|
||||
The `emotion` parameter is deprecated and will be removed in a future version.
|
||||
|
||||
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
||||
speed (numeric), and emotion (string) parameters.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
||||
emotion: Optional[List[str]] = Field(default_factory=list)
|
||||
generation_config: Optional[GenerationConfig] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
model: str = "sonic-2",
|
||||
model: str = "sonic-3",
|
||||
base_url: str = "https://api.cartesia.ai",
|
||||
cartesia_version: str = "2024-11-13",
|
||||
sample_rate: Optional[int] = None,
|
||||
@@ -512,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
Args:
|
||||
api_key: Cartesia API key for authentication.
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
model: TTS model to use (e.g., "sonic-2").
|
||||
model: TTS model to use (e.g., "sonic-3").
|
||||
base_url: Base URL for Cartesia HTTP API.
|
||||
cartesia_version: API version string for Cartesia service.
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
@@ -539,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
else "en",
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
"generation_config": params.generation_config,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self.set_model_name(model)
|
||||
@@ -632,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
|
||||
if self._settings["speed"]:
|
||||
payload["speed"] = self._settings["speed"]
|
||||
|
||||
if self._settings["generation_config"]:
|
||||
payload["generation_config"] = self._settings["generation_config"].model_dump(
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
session = await self._client._get_session()
|
||||
|
||||
@@ -156,6 +156,12 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
self._language = Language.EN
|
||||
self._websocket_url = None
|
||||
self._receive_task = None
|
||||
# Flux event handlers
|
||||
self._register_event_handler("on_start_of_turn")
|
||||
self._register_event_handler("on_turn_resumed")
|
||||
self._register_event_handler("on_end_of_turn")
|
||||
self._register_event_handler("on_eager_end_of_turn")
|
||||
self._register_event_handler("on_update")
|
||||
|
||||
async def _connect(self):
|
||||
"""Connect to WebSocket and start background tasks.
|
||||
@@ -523,6 +529,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self.start_metrics()
|
||||
await self._call_event_handler("on_start_of_turn", transcript)
|
||||
if transcript:
|
||||
logger.trace(f"Start of turn transcript: {transcript}")
|
||||
|
||||
@@ -537,6 +544,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
event: The event type string for logging purposes.
|
||||
"""
|
||||
logger.trace(f"Received event TurnResumed: {event}")
|
||||
await self._call_event_handler("on_turn_resumed")
|
||||
|
||||
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EndOfTurn events from Deepgram Flux.
|
||||
@@ -571,6 +579,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self._call_event_handler("on_end_of_turn", transcript)
|
||||
|
||||
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EagerEndOfTurn events from Deepgram Flux.
|
||||
@@ -615,6 +624,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
await self._call_event_handler("on_eager_end_of_turn", transcript)
|
||||
|
||||
async def _handle_update(self, transcript: str):
|
||||
"""Handle Update events from Deepgram Flux.
|
||||
@@ -638,3 +648,4 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.stop_ttfb_metrics()
|
||||
await self._call_event_handler("on_update", transcript)
|
||||
|
||||
@@ -12,6 +12,7 @@ for generating speech from text using various voice models.
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -117,3 +118,114 @@ class DeepgramTTSService(TTSService):
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
yield ErrorFrame(f"Error getting audio: {str(e)}")
|
||||
|
||||
|
||||
class DeepgramHttpTTSService(TTSService):
|
||||
"""Deepgram HTTP text-to-speech service.
|
||||
|
||||
Provides text-to-speech synthesis using Deepgram's HTTP TTS API.
|
||||
Supports various voice models and audio encoding formats with
|
||||
configurable sample rates and quality settings.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice: str = "aura-2-helena-en",
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
base_url: str = "https://api.deepgram.com",
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "linear16",
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Deepgram TTS service.
|
||||
|
||||
Args:
|
||||
api_key: Deepgram API key for authentication.
|
||||
voice: Voice model to use for synthesis. Defaults to "aura-2-helena-en".
|
||||
aiohttp_session: Shared aiohttp session for HTTP requests with connection pooling.
|
||||
base_url: Custom base URL for Deepgram API. Defaults to "https://api.deepgram.com".
|
||||
sample_rate: Audio sample rate in Hz. If None, uses service default.
|
||||
encoding: Audio encoding format. Defaults to "linear16".
|
||||
**kwargs: Additional arguments passed to parent TTSService class.
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._session = aiohttp_session
|
||||
self._base_url = base_url
|
||||
self._settings = {
|
||||
"encoding": encoding,
|
||||
}
|
||||
self.set_voice(voice)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if the service can generate metrics.
|
||||
|
||||
Returns:
|
||||
True, as Deepgram TTS service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Deepgram's TTS API.
|
||||
|
||||
Args:
|
||||
text: The text to synthesize into speech.
|
||||
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech, plus start/stop frames.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
# Build URL with parameters
|
||||
url = f"{self._base_url}/v1/speak"
|
||||
|
||||
headers = {"Authorization": f"Token {self._api_key}", "Content-Type": "application/json"}
|
||||
|
||||
params = {
|
||||
"model": self._voice_id,
|
||||
"encoding": self._settings["encoding"],
|
||||
"sample_rate": self.sample_rate,
|
||||
"container": "none",
|
||||
}
|
||||
|
||||
payload = {
|
||||
"text": text,
|
||||
}
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
async with self._session.post(
|
||||
url, headers=headers, json=payload, params=params
|
||||
) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
raise Exception(f"HTTP {response.status}: {error_text}")
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
yield TTSStartedFrame()
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
first_chunk = True
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
if first_chunk:
|
||||
await self.stop_ttfb_metrics()
|
||||
first_chunk = False
|
||||
|
||||
if chunk:
|
||||
yield TTSAudioRawFrame(
|
||||
audio=chunk,
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
yield ErrorFrame(f"Error getting audio: {str(e)}")
|
||||
|
||||
@@ -17,6 +17,7 @@ import json
|
||||
import random
|
||||
import time
|
||||
import uuid
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
@@ -56,10 +57,12 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
@@ -219,6 +222,10 @@ class GeminiLiveContext(OpenAILLMContext):
|
||||
|
||||
Provides Gemini-specific context management including system instruction
|
||||
extraction and message format conversion for the Live API.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer uses `GeminiLiveContext` under the hood.
|
||||
It now uses `LLMContext`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@@ -231,6 +238,22 @@ class GeminiLiveContext(OpenAILLMContext):
|
||||
Returns:
|
||||
The upgraded Gemini context instance.
|
||||
"""
|
||||
# This warning is here rather than `__init__` since `upgrade()` was the
|
||||
# "main" way that GeminiLiveContext instances were created.
|
||||
# Almost no users should be seeing this message anyway, as
|
||||
# GeminiLiveContext instances were typically created under the hood:
|
||||
# the user would pass an OpenAILLMContext instance, which would be
|
||||
# upgraded without them necessarily knowing.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"GeminiLiveContext is deprecated. "
|
||||
"Gemini Live no longer uses GeminiLiveContext under the hood. "
|
||||
"It now uses LLMContext.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, GeminiLiveContext):
|
||||
logger.debug(f"Upgrading to Gemini Live Context: {obj}")
|
||||
obj.__class__ = GeminiLiveContext
|
||||
@@ -328,8 +351,28 @@ class GeminiLiveUserContextAggregator(OpenAIUserContextAggregator):
|
||||
|
||||
Extends OpenAI user aggregator to handle Gemini-specific message passing
|
||||
while maintaining compatibility with the standard aggregation pipeline.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer expects a `GeminiLiveUserContextAggregator`.
|
||||
It now expects a `LLMUserAggregator`.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""Initialize Gemini Live user context aggregator."""
|
||||
# Almost no users should be seeing this message, as
|
||||
# `GeminiLiveUserContextAggregator`` instances were typically created
|
||||
# under the hood, as part of `llm.create_context_aggregator()`.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"GeminiLiveUserContextAggregator is deprecated. "
|
||||
"Gemini Live no longer expects a GeminiLiveUserContextAggregator. "
|
||||
"It now expects a LLMUserAggregator.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
async def process_frame(self, frame, direction):
|
||||
"""Process incoming frames for user context aggregation.
|
||||
|
||||
@@ -349,8 +392,28 @@ class GeminiLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
Handles assistant response aggregation while filtering out LLMTextFrames
|
||||
to prevent duplicate context entries, as Gemini Live pushes both
|
||||
LLMTextFrames and TTSTextFrames.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer uses `GeminiLiveAssistantContextAggregator` under the hood.
|
||||
It now uses `LLMAssistantAggregator`.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""Initialize Gemini Live assistant context aggregator."""
|
||||
# Almost no users should be seeing this message, as
|
||||
# `GeminiLiveAssistantContextAggregator` instances were typically
|
||||
# created under the hood, as part of `llm.create_context_aggregator()`.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"GeminiLiveAssistantContextAggregator is deprecated. "
|
||||
"Gemini Live no longer uses GeminiLiveAssistantContextAggregator under the hood. "
|
||||
"It now uses LLMAssistantAggregator.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames for assistant context aggregation.
|
||||
|
||||
@@ -380,6 +443,10 @@ class GeminiLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
class GeminiLiveContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for Gemini Live.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
`GeminiLiveContextAggregatorPair` is deprecated.
|
||||
Use `LLMContextAggregatorPair` instead.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator instance.
|
||||
_assistant: The assistant context aggregator instance.
|
||||
@@ -388,6 +455,19 @@ class GeminiLiveContextAggregatorPair:
|
||||
_user: GeminiLiveUserContextAggregator
|
||||
_assistant: GeminiLiveAssistantContextAggregator
|
||||
|
||||
def __post_init__(self):
|
||||
# Almost no users should be seeing this message, as
|
||||
# `GeminiLiveContextAggregatorPair` instances were typically created
|
||||
# under the hood, with `llm.create_context_aggregator()`.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"GeminiLiveContextAggregatorPair is deprecated. "
|
||||
"Use LLMContextAggregatorPair instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def user(self) -> GeminiLiveUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
@@ -592,8 +672,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._voice_id = voice_id
|
||||
self._language_code = params.language
|
||||
|
||||
self._system_instruction = system_instruction
|
||||
self._tools = tools
|
||||
self._system_instruction_from_init = system_instruction
|
||||
self._tools_from_init = tools
|
||||
self._inference_on_context_initialization = inference_on_context_initialization
|
||||
self._needs_turn_complete_message = False
|
||||
|
||||
@@ -609,7 +689,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._run_llm_when_session_ready = False
|
||||
|
||||
self._user_is_speaking = False
|
||||
self._bot_is_speaking = False
|
||||
self._bot_is_responding = False
|
||||
self._user_audio_buffer = bytearray()
|
||||
self._user_transcription_buffer = ""
|
||||
self._last_transcription_sent = ""
|
||||
@@ -665,6 +745,9 @@ class GeminiLiveLLMService(LLMService):
|
||||
# Initialize the API client. Subclasses can override this if needed.
|
||||
self.create_client()
|
||||
|
||||
# Bookkeeping for tool calls
|
||||
self._completed_tool_calls = set()
|
||||
|
||||
def create_client(self):
|
||||
"""Create the Gemini API client instance. Subclasses can override this."""
|
||||
self._client = Client(api_key=self._api_key, http_options=self._http_options)
|
||||
@@ -787,9 +870,13 @@ class GeminiLiveLLMService(LLMService):
|
||||
#
|
||||
|
||||
async def _handle_interruption(self):
|
||||
await self._set_bot_is_speaking(False)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
if self._bot_is_responding:
|
||||
await self._set_bot_is_responding(False)
|
||||
if self._settings.get("modalities") == GeminiModalities.AUDIO:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
# Do not send LLMFullResponseEndFrame here - an interruption
|
||||
# already tells the assistant context aggregator that the response
|
||||
# is over.
|
||||
|
||||
async def _handle_user_started_speaking(self, frame):
|
||||
self._user_is_speaking = True
|
||||
@@ -807,7 +894,6 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
#
|
||||
# frame processing
|
||||
#
|
||||
# StartFrame, StopFrame, CancelFrame implemented in base class
|
||||
#
|
||||
|
||||
@@ -820,7 +906,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
"""
|
||||
# Defer EndFrame handling until after the bot turn is finished
|
||||
if isinstance(frame, EndFrame):
|
||||
if self._bot_is_speaking:
|
||||
if self._bot_is_responding:
|
||||
logger.debug("Deferring handling EndFrame until bot turn is finished")
|
||||
self._end_frame_pending_bot_turn_finished = frame
|
||||
return
|
||||
@@ -829,22 +915,13 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, OpenAILLMContextFrame):
|
||||
context: GeminiLiveContext = GeminiLiveContext.upgrade(frame.context)
|
||||
# For now, we'll only trigger inference here when either:
|
||||
# 1. We have not seen a context frame before
|
||||
# 2. The last message is a tool call result
|
||||
if not self._context:
|
||||
self._context = context
|
||||
if frame.context.tools:
|
||||
self._tools = frame.context.tools
|
||||
await self._create_initial_response()
|
||||
elif context.messages and context.messages[-1].get("role") == "tool":
|
||||
# Support just one tool call per context frame for now
|
||||
tool_result_message = context.messages[-1]
|
||||
await self._tool_result(tool_result_message)
|
||||
elif isinstance(frame, LLMContextFrame):
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Gemini Live.")
|
||||
elif isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
|
||||
context = (
|
||||
frame.context
|
||||
if isinstance(frame, LLMContextFrame)
|
||||
else LLMContext.from_openai_context(frame.context)
|
||||
)
|
||||
await self._handle_context(context)
|
||||
elif isinstance(frame, InputTextRawFrame):
|
||||
await self._send_user_text(frame.text)
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -883,13 +960,83 @@ class GeminiLiveLLMService(LLMService):
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _set_bot_is_speaking(self, speaking: bool):
|
||||
if self._bot_is_speaking == speaking:
|
||||
async def _handle_context(self, context: LLMContext):
|
||||
if not self._context:
|
||||
# We got our initial context
|
||||
self._context = context
|
||||
|
||||
# If context contains system instruction or tools, reconnect in
|
||||
# order to apply them.
|
||||
# (Context-provided system instruction and tools take precedence
|
||||
# over the ones provided at initialization time. Note that we could
|
||||
# do more sophisticated comparisons here, but for now this is
|
||||
# sufficient: we'll assume folks won't mean to provide these
|
||||
# settings both in the context and at initialization time. In a
|
||||
# future change, we could/should implement the ability to swap
|
||||
# these settings at any point).
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
params = adapter.get_llm_invocation_params(self._context)
|
||||
system_instruction = params["system_instruction"]
|
||||
tools = params["tools"]
|
||||
if system_instruction and self._system_instruction_from_init:
|
||||
logger.warning(
|
||||
"System instruction provided both at init time and in context; using context-provided value."
|
||||
)
|
||||
if tools and self._tools_from_init:
|
||||
logger.warning(
|
||||
"Tools provided both at init time and in context; using context-provided value."
|
||||
)
|
||||
if system_instruction or tools:
|
||||
await self._reconnect()
|
||||
|
||||
# Initialize our bookkeeping of already-completed tool calls in
|
||||
# the context
|
||||
await self._process_completed_function_calls(send_new_results=False)
|
||||
|
||||
# Create initial response if needed, based on conversation history
|
||||
# in context
|
||||
await self._create_initial_response()
|
||||
else:
|
||||
# We got an updated context.
|
||||
self._context = context
|
||||
|
||||
# Here we assume that the updated context will contain either:
|
||||
# - new messages (that the Gemini Live service, with its own
|
||||
# context management, is already aware of), or
|
||||
# - tool call results (that we need to tell the remote service
|
||||
# about).
|
||||
# (In the future, we could do more sophisticated diffing here,
|
||||
# which would enable the user to programmatically manipulate the
|
||||
# context).
|
||||
|
||||
# Send results for newly-completed function calls, if any.
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _process_completed_function_calls(self, send_new_results: bool):
|
||||
# Check for set of completed function calls in the context
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
messages = adapter.get_llm_invocation_params(self._context).get("messages", [])
|
||||
for message in messages:
|
||||
if message.parts:
|
||||
for part in message.parts:
|
||||
if part.function_response:
|
||||
tool_call_id = part.function_response.id
|
||||
tool_name = part.function_response.name
|
||||
if tool_call_id and tool_call_id not in self._completed_tool_calls:
|
||||
# Found a newly-completed function call - send the result to the service
|
||||
if send_new_results:
|
||||
await self._tool_result(
|
||||
tool_call_id, tool_name, part.function_response.response
|
||||
)
|
||||
self._completed_tool_calls.add(tool_call_id)
|
||||
|
||||
async def _set_bot_is_responding(self, responding: bool):
|
||||
if self._bot_is_responding == responding:
|
||||
return
|
||||
|
||||
self._bot_is_speaking = speaking
|
||||
self._bot_is_responding = responding
|
||||
|
||||
if not self._bot_is_speaking and self._end_frame_pending_bot_turn_finished:
|
||||
if not self._bot_is_responding and self._end_frame_pending_bot_turn_finished:
|
||||
await self.queue_frame(self._end_frame_pending_bot_turn_finished)
|
||||
self._end_frame_pending_bot_turn_finished = None
|
||||
|
||||
@@ -991,18 +1138,25 @@ class GeminiLiveLLMService(LLMService):
|
||||
automatic_activity_detection=vad_config
|
||||
)
|
||||
|
||||
# Add system instruction to configuration, if provided
|
||||
system_instruction = self._system_instruction or ""
|
||||
if self._context and hasattr(self._context, "extract_system_instructions"):
|
||||
system_instruction += "\n" + self._context.extract_system_instructions()
|
||||
# Add system instruction and tools to configuration, if provided.
|
||||
# These settings from the context take precedence over the ones
|
||||
# provided at initialization time.
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
system_instruction = None
|
||||
tools = None
|
||||
if self._context:
|
||||
params = adapter.get_llm_invocation_params(self._context)
|
||||
system_instruction = params["system_instruction"]
|
||||
tools = params["tools"]
|
||||
else:
|
||||
system_instruction = self._system_instruction_from_init
|
||||
tools = adapter.from_standard_tools(self._tools_from_init)
|
||||
if system_instruction:
|
||||
logger.debug(f"Setting system instruction: {system_instruction}")
|
||||
config.system_instruction = system_instruction
|
||||
|
||||
# Add tools to configuration, if provided
|
||||
if self._tools:
|
||||
logger.debug(f"Setting tools: {self._tools}")
|
||||
config.tools = self.get_llm_adapter().from_standard_tools(self._tools)
|
||||
if tools:
|
||||
logger.debug(f"Setting tools: {tools}")
|
||||
config.tools = tools
|
||||
|
||||
# Start the connection
|
||||
self._connection_task = self.create_task(self._connection_task_handler(config=config))
|
||||
@@ -1116,6 +1270,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
if self._session:
|
||||
await self._session.close()
|
||||
self._session = None
|
||||
self._completed_tool_calls = set()
|
||||
self._disconnecting = False
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error disconnecting: {e}")
|
||||
@@ -1195,7 +1350,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._run_llm_when_session_ready = True
|
||||
return
|
||||
|
||||
messages = self._context.get_messages_for_initializing_history()
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
messages = adapter.get_llm_invocation_params(self._context).get("messages", [])
|
||||
if not messages:
|
||||
return
|
||||
|
||||
@@ -1223,8 +1379,9 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
# Create a throwaway context just for the purpose of getting messages
|
||||
# in the right format
|
||||
context = GeminiLiveContext.upgrade(OpenAILLMContext(messages=messages_list))
|
||||
messages = context.get_messages_for_initializing_history()
|
||||
context = LLMContext(messages=messages_list)
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
messages = adapter.get_llm_invocation_params(context).get("messages", [])
|
||||
|
||||
if not messages:
|
||||
return
|
||||
@@ -1239,17 +1396,16 @@ class GeminiLiveLLMService(LLMService):
|
||||
await self._handle_send_error(e)
|
||||
|
||||
@traced_gemini_live(operation="llm_tool_result")
|
||||
async def _tool_result(self, tool_result_message):
|
||||
async def _tool_result(
|
||||
self, tool_call_id: str, tool_name: str, tool_result_message: Dict[str, Any]
|
||||
):
|
||||
"""Send tool result back to the API."""
|
||||
if self._disconnecting or not self._session:
|
||||
return
|
||||
|
||||
# For now we're shoving the name into the tool_call_id field, so this
|
||||
# will work until we revisit that.
|
||||
id = tool_result_message.get("tool_call_id")
|
||||
name = tool_result_message.get("tool_call_name")
|
||||
result = json.loads(tool_result_message.get("content") or "")
|
||||
response = FunctionResponse(name=name, id=id, response=result)
|
||||
response = FunctionResponse(name=tool_name, id=tool_call_id, response=tool_result_message)
|
||||
|
||||
try:
|
||||
await self._session.send_tool_response(function_responses=response)
|
||||
@@ -1277,7 +1433,10 @@ class GeminiLiveLLMService(LLMService):
|
||||
# part.text is added when `modalities` is set to TEXT; otherwise, it's None
|
||||
text = part.text
|
||||
if text:
|
||||
if not self._bot_text_buffer:
|
||||
if not self._bot_is_responding:
|
||||
# Update bot responding state and send service start frame
|
||||
# (AUDIO modality case)
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
self._bot_text_buffer += text
|
||||
@@ -1288,6 +1447,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
if msg.server_content and msg.server_content.grounding_metadata:
|
||||
self._accumulated_grounding_metadata = msg.server_content.grounding_metadata
|
||||
|
||||
# If we have no audio, stop here.
|
||||
# All logic below this point pertains to the AUDIO modality.
|
||||
inline_data = part.inline_data
|
||||
if not inline_data:
|
||||
return
|
||||
@@ -1313,8 +1474,10 @@ class GeminiLiveLLMService(LLMService):
|
||||
if not audio:
|
||||
return
|
||||
|
||||
if not self._bot_is_speaking:
|
||||
await self._set_bot_is_speaking(True)
|
||||
# Update bot responding state and send service start frames
|
||||
# (AUDIO modality case)
|
||||
if not self._bot_is_responding:
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
@@ -1354,7 +1517,6 @@ class GeminiLiveLLMService(LLMService):
|
||||
@traced_gemini_live(operation="llm_response")
|
||||
async def _handle_msg_turn_complete(self, message: LiveServerMessage):
|
||||
"""Handle the turn complete message."""
|
||||
await self._set_bot_is_speaking(False)
|
||||
text = self._bot_text_buffer
|
||||
|
||||
# Trace the complete LLM response (this will be handled by the decorator)
|
||||
@@ -1373,13 +1535,15 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._search_result_buffer = ""
|
||||
self._accumulated_grounding_metadata = None
|
||||
|
||||
# Only push the TTSStoppedFrame if the bot is outputting audio
|
||||
# when text is found, modalities is set to TEXT and no audio
|
||||
# is produced.
|
||||
if not text:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
if self._bot_is_responding:
|
||||
await self._set_bot_is_responding(False)
|
||||
if not text:
|
||||
# AUDIO modality case
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
else:
|
||||
# TEXT modality case
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
@traced_stt
|
||||
async def _handle_user_transcription(
|
||||
@@ -1442,8 +1606,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
return
|
||||
|
||||
# This is the output transcription text when modalities is set to AUDIO.
|
||||
# In this case, we push LLMTextFrame and TTSTextFrame to be handled by the
|
||||
# downstream assistant context aggregator.
|
||||
# In this case, we push TTSTextFrame to be handled by the downstream
|
||||
# assistant context aggregator.
|
||||
text = message.server_content.output_transcription.text
|
||||
|
||||
if not text:
|
||||
@@ -1458,7 +1622,17 @@ class GeminiLiveLLMService(LLMService):
|
||||
# Collect text for tracing
|
||||
self._llm_output_buffer += text
|
||||
|
||||
await self.push_frame(LLMTextFrame(text=text))
|
||||
# NOTE: Shoot. When using Vertex AI, output transcription messages
|
||||
# arrive *before* the model_turn messages with audio, so we need to
|
||||
# handle sending TTSStartedFrame and LLMFullResponseStartFrame here as
|
||||
# well. These messages also contain much *more* text (it looks further
|
||||
# ahead). That means that on an interruption our recorded context will
|
||||
# contain some text that was actually never spoken.
|
||||
if not self._bot_is_responding:
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
await self.push_frame(TTSTextFrame(text=text))
|
||||
|
||||
async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
|
||||
@@ -1543,13 +1717,17 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._session_resumption_handle = update.new_handle
|
||||
|
||||
async def _handle_send_error(self, error: Exception):
|
||||
# Ignore "expected" errors that may have occurred for messages that
|
||||
# were in-flight when a disconnection occurred.
|
||||
if self._disconnecting or not self._session:
|
||||
return
|
||||
|
||||
# In server-to-server contexts, a WebSocket error should be quite rare.
|
||||
# Given how hard it is to recover from a send-side error with proper
|
||||
# state management, and that exponential backoff for retries can have
|
||||
# cost/stability implications for a service cluster, let's just treat a
|
||||
# send-side error as fatal.
|
||||
if not self._disconnecting:
|
||||
await self.push_error(ErrorFrame(error=f"{self} Send error: {error}", fatal=True))
|
||||
await self.push_error(ErrorFrame(error=f"{self} Send error: {error}", fatal=True))
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
@@ -1557,26 +1735,26 @@ class GeminiLiveLLMService(LLMService):
|
||||
*,
|
||||
user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
|
||||
assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
|
||||
) -> GeminiLiveContextAggregatorPair:
|
||||
) -> LLMContextAggregatorPair:
|
||||
"""Create an instance of GeminiLiveContextAggregatorPair from an OpenAILLMContext.
|
||||
|
||||
Constructor keyword arguments for both the user and assistant aggregators can be provided.
|
||||
|
||||
NOTE: this method exists only for backward compatibility. New code
|
||||
should instead do:
|
||||
context = LLMContext(...)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
Args:
|
||||
context: The LLM context to use.
|
||||
user_params: User aggregator parameters. Defaults to LLMUserAggregatorParams().
|
||||
assistant_params: Assistant aggregator parameters. Defaults to LLMAssistantAggregatorParams().
|
||||
|
||||
Returns:
|
||||
GeminiLiveContextAggregatorPair: A pair of context
|
||||
aggregators, one for the user and one for the assistant,
|
||||
encapsulated in an GeminiLiveContextAggregatorPair.
|
||||
A pair of user and assistant context aggregators.
|
||||
"""
|
||||
context.set_llm_adapter(self.get_llm_adapter())
|
||||
|
||||
GeminiLiveContext.upgrade(context)
|
||||
user = GeminiLiveUserContextAggregator(context, params=user_params)
|
||||
|
||||
context = LLMContext.from_openai_context(context)
|
||||
assistant_params.expect_stripped_words = False
|
||||
assistant = GeminiLiveAssistantContextAggregator(context, params=assistant_params)
|
||||
return GeminiLiveContextAggregatorPair(_user=user, _assistant=assistant)
|
||||
return LLMContextAggregatorPair(
|
||||
context, user_params=user_params, assistant_params=assistant_params
|
||||
)
|
||||
|
||||
@@ -1034,6 +1034,23 @@ class GoogleLLMService(LLMService):
|
||||
if context:
|
||||
await self._process_context(context)
|
||||
|
||||
async def stop(self, frame):
|
||||
"""Override stop to gracefully close the client."""
|
||||
await super().stop(frame)
|
||||
await self._close_client()
|
||||
|
||||
async def cancel(self, frame):
|
||||
"""Override cancel to gracefully close the client."""
|
||||
await super().cancel(frame)
|
||||
await self._close_client()
|
||||
|
||||
async def _close_client(self):
|
||||
try:
|
||||
await self._client.aio.aclose()
|
||||
except Exception:
|
||||
# Do nothing - we're shutting down anyway
|
||||
pass
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
|
||||
@@ -22,7 +22,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
# Suppress gRPC fork warnings
|
||||
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
|
||||
|
||||
from typing import AsyncGenerator, List, Literal, Optional
|
||||
from typing import Any, AsyncGenerator, List, Literal, Mapping, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -248,7 +248,8 @@ class GoogleHttpTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
|
||||
rate: Speaking rate adjustment (e.g., "slow", "fast", "125%").
|
||||
rate: Speaking rate adjustment (e.g., "slow", "fast", "125%"). Used for SSML prosody tags (non-Chirp voices).
|
||||
speaking_rate: Speaking rate for AudioConfig (Chirp/Journey voices). Range [0.25, 2.0].
|
||||
volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
|
||||
emphasis: Emphasis level for the text.
|
||||
language: Language for synthesis. Defaults to English.
|
||||
@@ -258,6 +259,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
|
||||
pitch: Optional[str] = None
|
||||
rate: Optional[str] = None
|
||||
speaking_rate: Optional[float] = None
|
||||
volume: Optional[str] = None
|
||||
emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None
|
||||
language: Optional[Language] = Language.EN
|
||||
@@ -291,6 +293,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
self._settings = {
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"speaking_rate": params.speaking_rate,
|
||||
"volume": params.volume,
|
||||
"emphasis": params.emphasis,
|
||||
"language": self.language_to_service_language(params.language)
|
||||
@@ -360,6 +363,22 @@ class GoogleHttpTTSService(TTSService):
|
||||
"""
|
||||
return language_to_google_tts_language(language)
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
"""Override to handle speaking_rate updates for Chirp/Journey voices.
|
||||
|
||||
Args:
|
||||
settings: Dictionary of settings to update. Can include 'speaking_rate' (float)
|
||||
"""
|
||||
if "speaking_rate" in settings:
|
||||
rate_value = float(settings["speaking_rate"])
|
||||
if 0.25 <= rate_value <= 2.0:
|
||||
self._settings["speaking_rate"] = rate_value
|
||||
else:
|
||||
logger.warning(
|
||||
f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
|
||||
)
|
||||
await super()._update_settings(settings)
|
||||
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
ssml = "<speak>"
|
||||
|
||||
@@ -436,10 +455,17 @@ class GoogleHttpTTSService(TTSService):
|
||||
voice = texttospeech_v1.VoiceSelectionParams(
|
||||
language_code=self._settings["language"], name=self._voice_id
|
||||
)
|
||||
audio_config = texttospeech_v1.AudioConfig(
|
||||
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
|
||||
sample_rate_hertz=self.sample_rate,
|
||||
)
|
||||
# Build audio config with conditional speaking_rate
|
||||
audio_config_params = {
|
||||
"audio_encoding": texttospeech_v1.AudioEncoding.LINEAR16,
|
||||
"sample_rate_hertz": self.sample_rate,
|
||||
}
|
||||
|
||||
# For Chirp and Journey voices, include speaking_rate in AudioConfig
|
||||
if (is_chirp_voice or is_journey_voice) and self._settings["speaking_rate"] is not None:
|
||||
audio_config_params["speaking_rate"] = self._settings["speaking_rate"]
|
||||
|
||||
audio_config = texttospeech_v1.AudioConfig(**audio_config_params)
|
||||
|
||||
request = texttospeech_v1.SynthesizeSpeechRequest(
|
||||
input=synthesis_input, voice=voice, audio_config=audio_config
|
||||
@@ -500,7 +526,7 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language for synthesis. Defaults to English.
|
||||
speaking_rate: The speaking rate, in the range [0.25, 4.0].
|
||||
speaking_rate: The speaking rate, in the range [0.25, 2.0].
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
@@ -591,6 +617,22 @@ class GoogleTTSService(TTSService):
|
||||
"""
|
||||
return language_to_google_tts_language(language)
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
"""Override to handle speaking_rate updates for streaming API.
|
||||
|
||||
Args:
|
||||
settings: Dictionary of settings to update. Can include 'speaking_rate' (float)
|
||||
"""
|
||||
if "speaking_rate" in settings:
|
||||
rate_value = float(settings["speaking_rate"])
|
||||
if 0.25 <= rate_value <= 2.0:
|
||||
self._settings["speaking_rate"] = rate_value
|
||||
else:
|
||||
logger.warning(
|
||||
f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
|
||||
)
|
||||
await super()._update_settings(settings)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate streaming speech from text using Google's streaming API.
|
||||
|
||||
@@ -184,11 +184,15 @@ class HumeTTSService(TTSService):
|
||||
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
|
||||
# We buffer audio bytes before sending to prevent glitches.
|
||||
self._audio_bytes = b""
|
||||
|
||||
# Use version "2" by default if no description is provided
|
||||
# Version "1" is needed when description is used
|
||||
version = "1" if self._params.description is not None else "2"
|
||||
async for chunk in self._client.tts.synthesize_json_streaming(
|
||||
utterances=[utterance],
|
||||
format=pcm_fmt,
|
||||
instant_mode=True,
|
||||
version="2",
|
||||
version=version,
|
||||
):
|
||||
audio_b64 = getattr(chunk, "audio", None)
|
||||
if not audio_b64:
|
||||
|
||||
@@ -492,11 +492,19 @@ class LLMService(AIService):
|
||||
tool_call_id: Optional[str] = None,
|
||||
text_content: Optional[str] = None,
|
||||
video_source: Optional[str] = None,
|
||||
timeout: Optional[float] = 10.0,
|
||||
):
|
||||
"""Request an image from a user.
|
||||
|
||||
Pushes a UserImageRequestFrame upstream to request an image from the
|
||||
specified user.
|
||||
specified user. The user image can then be processed by the LLM.
|
||||
|
||||
Use this function from a function call if you want the LLM to process
|
||||
the image. If you expect the image to be processed by a vision service,
|
||||
you might want to push a UserImageRequestFrame upstream directly.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
This method is deprecated, push a `UserImageRequestFrame` instead.
|
||||
|
||||
Args:
|
||||
user_id: The ID of the user to request an image from.
|
||||
@@ -504,15 +512,19 @@ class LLMService(AIService):
|
||||
tool_call_id: Optional tool call ID associated with the request.
|
||||
text_content: Optional text content/context for the image request.
|
||||
video_source: Optional video source identifier.
|
||||
timeout: Optional timeout for the requested image to be added to the LLM context.
|
||||
|
||||
"""
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Method `request_image_frame()` is deprecated, push a `UserImageRequestFrame` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(
|
||||
user_id=user_id,
|
||||
function_name=function_name,
|
||||
tool_call_id=tool_call_id,
|
||||
context=text_content,
|
||||
video_source=video_source,
|
||||
),
|
||||
UserImageRequestFrame(user_id=user_id, text=text_content),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
|
||||
@@ -11,15 +11,17 @@ for image analysis and description generation.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TextFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TextFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.services.vision_service import VisionService
|
||||
|
||||
try:
|
||||
@@ -92,16 +94,16 @@ class MoondreamService(VisionService):
|
||||
trust_remote_code=True,
|
||||
revision=revision,
|
||||
device_map={"": device},
|
||||
torch_dtype=dtype,
|
||||
dtype=dtype,
|
||||
).eval()
|
||||
|
||||
logger.debug("Loaded Moondream model")
|
||||
|
||||
async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
|
||||
async def run_vision(self, frame: UserImageRawFrame) -> AsyncGenerator[Frame, None]:
|
||||
"""Analyze an image and generate a description.
|
||||
|
||||
Args:
|
||||
context: The context to process, containing image data.
|
||||
frame: The image frame to process.
|
||||
|
||||
Yields:
|
||||
Frame: TextFrame containing the generated image description, or ErrorFrame
|
||||
@@ -112,45 +114,14 @@ class MoondreamService(VisionService):
|
||||
yield ErrorFrame("Moondream model not available")
|
||||
return
|
||||
|
||||
image_bytes = None
|
||||
text = None
|
||||
try:
|
||||
messages = context.get_messages()
|
||||
last_message = messages[-1]
|
||||
last_message_content = last_message.get("content")
|
||||
logger.debug(f"Analyzing image (bytes length: {len(frame.image)})")
|
||||
|
||||
for item in last_message_content:
|
||||
if isinstance(item, dict):
|
||||
if (
|
||||
"image_url" in item
|
||||
and isinstance(item["image_url"], dict)
|
||||
and item["image_url"].get("url")
|
||||
):
|
||||
image_bytes = base64.b64decode(item["image_url"]["url"].split(",")[1])
|
||||
elif "text" in item and isinstance(item["text"], str):
|
||||
text = item["text"]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Exception during image extraction: {e}")
|
||||
yield ErrorFrame("Failed to extract image from context")
|
||||
return
|
||||
|
||||
if not image_bytes:
|
||||
logger.error("No image found in context")
|
||||
yield ErrorFrame("No image found in context")
|
||||
return
|
||||
|
||||
logger.debug(
|
||||
f"Analyzing image (bytes length: {len(image_bytes) if image_bytes else 'None'})"
|
||||
)
|
||||
|
||||
def get_image_description(bytes: bytes, text: Optional[str]) -> str:
|
||||
image_buffer = BytesIO(bytes)
|
||||
image = Image.open(image_buffer)
|
||||
def get_image_description(image_bytes: bytes, text: Optional[str]) -> str:
|
||||
image = Image.frombytes(frame.format, frame.size, image_bytes)
|
||||
image_embeds = self._model.encode_image(image)
|
||||
description = self._model.query(image_embeds, text)["answer"]
|
||||
return description
|
||||
|
||||
description = await asyncio.to_thread(get_image_description, image_bytes, text)
|
||||
description = await asyncio.to_thread(get_image_description, frame.image, frame.text)
|
||||
|
||||
yield TextFrame(text=description)
|
||||
|
||||
@@ -4,7 +4,85 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime LLM context and aggregator implementations."""
|
||||
"""OpenAI Realtime LLM context and aggregator implementations.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
OpenAI Realtime no longer uses types from this module under the hood.
|
||||
It now uses `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
```
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: OpenAIContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: OpenAIRealtimeLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
```
|
||||
|
||||
AFTER:
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai.realtime.llm (or "
|
||||
"pipecat.services.openai_realtime.llm) are deprecated. \n"
|
||||
"OpenAI Realtime no longer uses types from this module under the hood. \n"
|
||||
"It now uses `LLMContext` and `LLMContextAggregatorPair`. \n"
|
||||
"Using the new patterns should allow you to not need types from this module.\n\n"
|
||||
"BEFORE:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = OpenAILLMContext(messages, tools)\n"
|
||||
"context_aggregator = llm.create_context_aggregator(context)\n\n"
|
||||
"# Context aggregator type\n"
|
||||
"context_aggregator: OpenAIContextAggregatorPair\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: OpenAILLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: OpenAIRealtimeLLMContext\n"
|
||||
"# or\n"
|
||||
"context: OpenAILLMContext\n\n"
|
||||
"```\n\n"
|
||||
"AFTER:\n"
|
||||
"```\n"
|
||||
"# Setup\n"
|
||||
"context = LLMContext(messages, tools)\n"
|
||||
"context_aggregator = LLMContextAggregatorPair(context)\n\n"
|
||||
"# Context aggregator type\n"
|
||||
"context_aggregator: LLMContextAggregatorPair\n\n"
|
||||
"# Context frame type\n"
|
||||
"frame: LLMContextFrame\n\n"
|
||||
"# Context type\n"
|
||||
"context: LLMContext\n\n"
|
||||
"```\n",
|
||||
)
|
||||
|
||||
import copy
|
||||
import json
|
||||
|
||||
@@ -4,7 +4,28 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Custom frame types for OpenAI Realtime API integration."""
|
||||
"""Custom frame types for OpenAI Realtime API integration.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
OpenAI Realtime no longer uses types from this module under the hood.
|
||||
|
||||
It now works more like most LLM services in Pipecat, relying on updates to
|
||||
its context, pushed by context aggregators, to update its internal state.
|
||||
|
||||
Listen for `LLMContextFrame`s for context updates.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai.realtime.frames are deprecated. \n"
|
||||
"OpenAI Realtime no longer uses types from this module under the hood. \n\n"
|
||||
"It now works more like other LLM services in Pipecat, relying on updates to \n"
|
||||
"its context, pushed by context aggregators, to update its internal state.\n\n"
|
||||
"Listen for `LLMContextFrame`s for context updates.\n"
|
||||
)
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -14,7 +14,9 @@ from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter
|
||||
from pipecat.adapters.services.open_ai_realtime_adapter import (
|
||||
OpenAIRealtimeLLMAdapter,
|
||||
)
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
@@ -41,10 +43,12 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
@@ -57,12 +61,6 @@ from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_openai_realtime, traced_stt
|
||||
|
||||
from . import events
|
||||
from .context import (
|
||||
OpenAIRealtimeAssistantContextAggregator,
|
||||
OpenAIRealtimeLLMContext,
|
||||
OpenAIRealtimeUserContextAggregator,
|
||||
)
|
||||
from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
@@ -108,22 +106,39 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
base_url: str = "wss://api.openai.com/v1/realtime",
|
||||
session_properties: Optional[events.SessionProperties] = None,
|
||||
start_audio_paused: bool = False,
|
||||
send_transcription_frames: bool = True,
|
||||
send_transcription_frames: Optional[bool] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the OpenAI Realtime LLM service.
|
||||
|
||||
Args:
|
||||
api_key: OpenAI API key for authentication.
|
||||
model: OpenAI model name. Defaults to "gpt-4o-realtime-preview-2025-06-03".
|
||||
model: OpenAI model name. Defaults to "gpt-realtime".
|
||||
base_url: WebSocket base URL for the realtime API.
|
||||
Defaults to "wss://api.openai.com/v1/realtime".
|
||||
session_properties: Configuration properties for the realtime session.
|
||||
If None, uses default SessionProperties.
|
||||
start_audio_paused: Whether to start with audio input paused. Defaults to False.
|
||||
send_transcription_frames: Whether to emit transcription frames. Defaults to True.
|
||||
send_transcription_frames: Whether to emit transcription frames.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
This parameter is deprecated and will be removed in a future version.
|
||||
Transcription frames are always sent.
|
||||
|
||||
**kwargs: Additional arguments passed to parent LLMService.
|
||||
"""
|
||||
if send_transcription_frames is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"`send_transcription_frames` is deprecated and will be removed in a future version. "
|
||||
"Transcription frames are always sent.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
full_url = f"{base_url}?model={model}"
|
||||
super().__init__(base_url=full_url, **kwargs)
|
||||
|
||||
@@ -135,10 +150,11 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
session_properties or events.SessionProperties()
|
||||
)
|
||||
self._audio_input_paused = start_audio_paused
|
||||
self._send_transcription_frames = send_transcription_frames
|
||||
self._websocket = None
|
||||
self._receive_task = None
|
||||
self._context = None
|
||||
self._context: LLMContext = None
|
||||
|
||||
self._llm_needs_conversation_setup = True
|
||||
|
||||
self._disconnecting = False
|
||||
self._api_session_ready = False
|
||||
@@ -148,8 +164,8 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
self._current_audio_response = None
|
||||
|
||||
self._messages_added_manually = {}
|
||||
self._user_and_response_message_tuple = None
|
||||
self._pending_function_calls = {} # Track function calls by call_id
|
||||
self._completed_tool_calls = set()
|
||||
|
||||
self._register_event_handler("on_conversation_item_created")
|
||||
self._register_event_handler("on_conversation_item_updated")
|
||||
@@ -347,22 +363,13 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
pass
|
||||
elif isinstance(frame, OpenAILLMContextFrame):
|
||||
context: OpenAIRealtimeLLMContext = OpenAIRealtimeLLMContext.upgrade_to_realtime(
|
||||
elif isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
|
||||
context = (
|
||||
frame.context
|
||||
if isinstance(frame, LLMContextFrame)
|
||||
else LLMContext.from_openai_context(frame.context)
|
||||
)
|
||||
if not self._context:
|
||||
self._context = context
|
||||
elif frame.context is not self._context:
|
||||
# If the context has changed, reset the conversation
|
||||
self._context = context
|
||||
await self.reset_conversation()
|
||||
# Run the LLM at next opportunity
|
||||
await self._create_response()
|
||||
elif isinstance(frame, LLMContextFrame):
|
||||
raise NotImplementedError(
|
||||
"Universal LLMContext is not yet supported for OpenAI Realtime."
|
||||
)
|
||||
await self._handle_context(context)
|
||||
elif isinstance(frame, InputAudioRawFrame):
|
||||
if not self._audio_input_paused:
|
||||
await self._send_user_audio(frame)
|
||||
@@ -376,29 +383,33 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
await self._handle_bot_stopped_speaking()
|
||||
elif isinstance(frame, LLMMessagesAppendFrame):
|
||||
await self._handle_messages_append(frame)
|
||||
elif isinstance(frame, RealtimeMessagesUpdateFrame):
|
||||
self._context = frame.context
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
self._session_properties = events.SessionProperties(**frame.settings)
|
||||
await self._update_settings()
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
await self._update_settings()
|
||||
elif isinstance(frame, RealtimeFunctionCallResultFrame):
|
||||
await self._handle_function_call_result(frame.result_frame)
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _handle_context(self, context: LLMContext):
|
||||
if not self._context:
|
||||
# We got our initial context
|
||||
self._context = context
|
||||
# Initialize our bookkeeping of already-completed tool calls in
|
||||
# the context
|
||||
await self._process_completed_function_calls(send_new_results=False)
|
||||
# Run the LLM at next opportunity
|
||||
await self._create_response()
|
||||
else:
|
||||
# We got an updated context.
|
||||
# This may contain a new user message or tool call result.
|
||||
self._context = context
|
||||
# Send results for newly-completed function calls, if any.
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_messages_append(self, frame):
|
||||
logger.error("!!! NEED TO IMPLEMENT MESSAGES APPEND")
|
||||
|
||||
async def _handle_function_call_result(self, frame):
|
||||
item = events.ConversationItem(
|
||||
type="function_call_output",
|
||||
call_id=frame.tool_call_id,
|
||||
output=json.dumps(frame.result),
|
||||
)
|
||||
await self.send_client_event(events.ConversationItemCreateEvent(item=item))
|
||||
|
||||
#
|
||||
# websocket communication
|
||||
#
|
||||
@@ -439,16 +450,21 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task, timeout=1.0)
|
||||
self._receive_task = None
|
||||
self._completed_tool_calls = set()
|
||||
self._disconnecting = False
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error disconnecting: {e}")
|
||||
|
||||
async def _ws_send(self, realtime_message):
|
||||
try:
|
||||
if self._websocket:
|
||||
if not self._disconnecting and self._websocket:
|
||||
await self._websocket.send(json.dumps(realtime_message))
|
||||
except Exception as e:
|
||||
if self._disconnecting:
|
||||
if self._disconnecting or not self._websocket:
|
||||
# We're in the process of disconnecting.
|
||||
# (If not self._websocket, that could indicate that we
|
||||
# somehow *started* the websocket send attempt while we still
|
||||
# had a connection)
|
||||
return
|
||||
logger.error(f"Error sending message to websocket: {e}")
|
||||
# In server-to-server contexts, a WebSocket error should be quite rare. Given how hard
|
||||
@@ -459,13 +475,20 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
|
||||
async def _update_settings(self):
|
||||
settings = self._session_properties
|
||||
# tools given in the context override the tools in the session properties
|
||||
if self._context and self._context.tools:
|
||||
settings.tools = self._context.tools
|
||||
# instructions in the context come from an initial "system" message in the
|
||||
# messages list, and override instructions in the session properties
|
||||
if self._context and self._context._session_instructions:
|
||||
settings.instructions = self._context._session_instructions
|
||||
|
||||
if self._context:
|
||||
adapter: OpenAIRealtimeLLMAdapter = self.get_llm_adapter()
|
||||
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
|
||||
|
||||
# tools given in the context override the tools in the session properties
|
||||
if llm_invocation_params["tools"]:
|
||||
settings.tools = llm_invocation_params["tools"]
|
||||
|
||||
# instructions in the context come from an initial "system" message in the
|
||||
# messages list, and override instructions in the session properties
|
||||
if llm_invocation_params["system_instruction"]:
|
||||
settings.instructions = llm_invocation_params["system_instruction"]
|
||||
|
||||
await self.send_client_event(events.SessionUpdateEvent(session=settings))
|
||||
|
||||
#
|
||||
@@ -571,12 +594,7 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
del self._messages_added_manually[evt.item.id]
|
||||
return
|
||||
|
||||
if evt.item.role == "user":
|
||||
# We need to wait for completion of both user message and response message. Then we'll
|
||||
# add both to the context. User message is complete when we have a "transcript" field
|
||||
# that is not None. Response message is complete when we get a "response.done" event.
|
||||
self._user_and_response_message_tuple = (evt.item, {"done": False, "output": []})
|
||||
elif evt.item.role == "assistant":
|
||||
if evt.item.role == "assistant":
|
||||
self._current_assistant_response = evt.item
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
@@ -587,11 +605,11 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
# For now, no additional logic needed beyond the event handler call
|
||||
|
||||
async def _handle_evt_input_audio_transcription_delta(self, evt):
|
||||
if self._send_transcription_frames:
|
||||
await self.push_frame(
|
||||
# no way to get a language code?
|
||||
InterimTranscriptionFrame(evt.delta, "", time_now_iso8601(), result=evt)
|
||||
)
|
||||
await self.push_frame(
|
||||
# no way to get a language code?
|
||||
InterimTranscriptionFrame(evt.delta, "", time_now_iso8601(), result=evt),
|
||||
direction=FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
@traced_stt
|
||||
async def _handle_user_transcription(
|
||||
@@ -608,22 +626,12 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
"""
|
||||
await self._call_event_handler("on_conversation_item_updated", evt.item_id, None)
|
||||
|
||||
if self._send_transcription_frames:
|
||||
await self.push_frame(
|
||||
# no way to get a language code?
|
||||
TranscriptionFrame(evt.transcript, "", time_now_iso8601(), result=evt)
|
||||
)
|
||||
await self._handle_user_transcription(evt.transcript, True, Language.EN)
|
||||
pair = self._user_and_response_message_tuple
|
||||
if pair:
|
||||
user, assistant = pair
|
||||
user.content[0].transcript = evt.transcript
|
||||
if assistant["done"]:
|
||||
self._user_and_response_message_tuple = None
|
||||
self._context.add_user_content_item_as_message(user)
|
||||
else:
|
||||
# User message without preceding conversation.item.created. Bug?
|
||||
logger.warning(f"Transcript for unknown user message: {evt}")
|
||||
await self.push_frame(
|
||||
# no way to get a language code?
|
||||
TranscriptionFrame(evt.transcript, "", time_now_iso8601(), result=evt),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
await self._handle_user_transcription(evt.transcript, True, Language.EN)
|
||||
|
||||
async def _handle_conversation_item_retrieved(self, evt: events.ConversationItemRetrieved):
|
||||
futures = self._retrieve_conversation_item_futures.pop(evt.item.id, None)
|
||||
@@ -653,26 +661,17 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
# response content
|
||||
for item in evt.response.output:
|
||||
await self._call_event_handler("on_conversation_item_updated", item.id, item)
|
||||
pair = self._user_and_response_message_tuple
|
||||
if pair:
|
||||
user, assistant = pair
|
||||
assistant["done"] = True
|
||||
assistant["output"] = evt.response.output
|
||||
if user.content[0].transcript is not None:
|
||||
self._user_and_response_message_tuple = None
|
||||
self._context.add_user_content_item_as_message(user)
|
||||
else:
|
||||
# Response message without preceding user message (standalone response)
|
||||
# Function calls in this response were already processed immediately when arguments were complete
|
||||
logger.debug(f"Handling standalone response: {evt.response.id}")
|
||||
|
||||
async def _handle_evt_text_delta(self, evt):
|
||||
# We receive text deltas (as opposed to audio transcript deltas) when
|
||||
# the output modality is "text"
|
||||
if evt.delta:
|
||||
await self.push_frame(LLMTextFrame(evt.delta))
|
||||
|
||||
async def _handle_evt_audio_transcript_delta(self, evt):
|
||||
# We receive audio transcript deltas (as opposed to text deltas) when
|
||||
# the output modality is "audio" (the default)
|
||||
if evt.delta:
|
||||
await self.push_frame(LLMTextFrame(evt.delta))
|
||||
await self.push_frame(TTSTextFrame(evt.delta))
|
||||
|
||||
async def _handle_evt_function_call_arguments_done(self, evt):
|
||||
@@ -760,9 +759,11 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
"""
|
||||
logger.debug("Resetting conversation")
|
||||
await self._disconnect()
|
||||
if self._context:
|
||||
self._context.llm_needs_settings_update = True
|
||||
self._context.llm_needs_initial_messages = True
|
||||
|
||||
# Prepare to setup server-side conversation from local context again
|
||||
self._llm_needs_conversation_setup = True
|
||||
await self._process_completed_function_calls(send_new_results=False)
|
||||
|
||||
await self._connect()
|
||||
|
||||
@traced_openai_realtime(operation="llm_request")
|
||||
@@ -771,19 +772,29 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
self._run_llm_when_api_session_ready = True
|
||||
return
|
||||
|
||||
if self._context.llm_needs_initial_messages:
|
||||
messages = self._context.get_messages_for_initializing_history()
|
||||
adapter: OpenAIRealtimeLLMAdapter = self.get_llm_adapter()
|
||||
|
||||
# Configure the LLM for this session if needed
|
||||
if self._llm_needs_conversation_setup:
|
||||
logger.debug(
|
||||
f"Setting up conversation on OpenAI Realtime LLM service with initial messages: {adapter.get_messages_for_logging(self._context)}"
|
||||
)
|
||||
|
||||
# Send initial messages
|
||||
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
|
||||
messages = llm_invocation_params["messages"]
|
||||
for item in messages:
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
self._context.llm_needs_initial_messages = False
|
||||
|
||||
if self._context.llm_needs_settings_update:
|
||||
# Send new settings if needed
|
||||
await self._update_settings()
|
||||
self._context.llm_needs_settings_update = False
|
||||
|
||||
logger.debug(f"Creating response: {self._context.get_messages_for_logging()}")
|
||||
# We're done configuring the LLM for this session
|
||||
self._llm_needs_conversation_setup = False
|
||||
|
||||
logger.debug(f"Creating response")
|
||||
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
@@ -794,19 +805,50 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
)
|
||||
)
|
||||
|
||||
async def _process_completed_function_calls(self, send_new_results: bool):
|
||||
# Check for set of completed function calls in the context
|
||||
sent_new_result = False
|
||||
for message in self._context.get_messages():
|
||||
if message.get("role") and message.get("content") != "IN_PROGRESS":
|
||||
tool_call_id = message.get("tool_call_id")
|
||||
if tool_call_id and tool_call_id not in self._completed_tool_calls:
|
||||
# Found a newly-completed function call - send the result to the service
|
||||
if send_new_results:
|
||||
sent_new_result = True
|
||||
await self._send_tool_result(tool_call_id, message.get("content"))
|
||||
self._completed_tool_calls.add(tool_call_id)
|
||||
|
||||
# If we reported any new tool call results to the service, trigger
|
||||
# another response
|
||||
if sent_new_result:
|
||||
await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
payload = base64.b64encode(frame.audio).decode("utf-8")
|
||||
await self.send_client_event(events.InputAudioBufferAppendEvent(audio=payload))
|
||||
|
||||
async def _send_tool_result(self, tool_call_id: str, result: str):
|
||||
item = events.ConversationItem(
|
||||
type="function_call_output",
|
||||
call_id=tool_call_id,
|
||||
output=json.dumps(result),
|
||||
)
|
||||
await self.send_client_event(events.ConversationItemCreateEvent(item=item))
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
context: OpenAILLMContext,
|
||||
*,
|
||||
user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
|
||||
assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
|
||||
) -> OpenAIContextAggregatorPair:
|
||||
) -> LLMContextAggregatorPair:
|
||||
"""Create an instance of OpenAIContextAggregatorPair from an OpenAILLMContext.
|
||||
|
||||
NOTE: this method exists only for backward compatibility. New code
|
||||
should instead do:
|
||||
context = LLMContext(...)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
Constructor keyword arguments for both the user and assistant aggregators can be provided.
|
||||
|
||||
Args:
|
||||
@@ -819,11 +861,41 @@ class OpenAIRealtimeLLMService(LLMService):
|
||||
the user and one for the assistant, encapsulated in an
|
||||
OpenAIContextAggregatorPair.
|
||||
"""
|
||||
context.set_llm_adapter(self.get_llm_adapter())
|
||||
|
||||
OpenAIRealtimeLLMContext.upgrade_to_realtime(context)
|
||||
user = OpenAIRealtimeUserContextAggregator(context, params=user_params)
|
||||
# Log warning about transcription frame direction change in 0.0.92.
|
||||
# We're putting this warning here rather than in the constructor so
|
||||
# that it shows up for folks who haven't updated their code at all
|
||||
# since 0.0.92, gives them a way to acknowledge and dismiss the
|
||||
# warning, and encourages adoption of a new preferred pattern.
|
||||
logger.warning(
|
||||
"As of version 0.0.92, TranscriptionFrames and InterimTranscriptionFrames "
|
||||
"now go upstream from OpenAIRealtimeLLMService, so if you're using "
|
||||
"TranscriptProcessor, say, you'll want to adjust accordingly:\n\n"
|
||||
"pipeline = Pipeline(\n"
|
||||
" [\n"
|
||||
" transport.input(),\n"
|
||||
" context_aggregator.user(),\n\n"
|
||||
" # BEFORE\n"
|
||||
" llm,\n"
|
||||
" transcript.user(),\n\n"
|
||||
" # AFTER\n"
|
||||
" transcript.user(),\n"
|
||||
" llm,\n\n"
|
||||
" transport.output(),\n"
|
||||
" transcript.assistant(),\n"
|
||||
" context_aggregator.assistant(),\n"
|
||||
" ]\n"
|
||||
")\n\n"
|
||||
"Also, LLMTextFrames are no longer pushed from "
|
||||
"OpenAIRealtimeLLMService when it's configured with "
|
||||
"output_modalities=['audio']. Listen for TTSTextFrames instead.\n\n"
|
||||
"Once you've made the appropriate changes (if needed), you can "
|
||||
"dismiss this warning by updating to the new context-setup pattern:\n\n"
|
||||
" context = LLMContext(messages, tools)\n"
|
||||
" context_aggregator = LLMContextAggregatorPair(context)\n"
|
||||
)
|
||||
|
||||
context = LLMContext.from_openai_context(context)
|
||||
assistant_params.expect_stripped_words = False
|
||||
assistant = OpenAIRealtimeAssistantContextAggregator(context, params=assistant_params)
|
||||
return OpenAIContextAggregatorPair(_user=user, _assistant=assistant)
|
||||
return LLMContextAggregatorPair(
|
||||
context, user_params=user_params, assistant_params=assistant_params
|
||||
)
|
||||
|
||||
@@ -4,18 +4,15 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""OpenAI Realtime LLM context and aggregator implementations."""
|
||||
"""OpenAI Realtime LLM context and aggregator implementations.
|
||||
|
||||
import warnings
|
||||
.. deprecated:: 0.0.91
|
||||
OpenAI Realtime no longer uses types from this module under the hood.
|
||||
It now uses `LLMContext` and `LLMContextAggregatorPair`.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
See deprecation warning in pipecat.services.openai.realtime.context for
|
||||
more details.
|
||||
"""
|
||||
|
||||
from pipecat.services.openai.realtime.context import *
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Types in pipecat.services.openai_realtime.context are deprecated. "
|
||||
"Please use the equivalent types from "
|
||||
"pipecat.services.openai.realtime.context instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@@ -70,7 +70,7 @@ class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
logger.info(f"Connecting to {self.base_url}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
|
||||
468
src/pipecat/services/sarvam/stt.py
Normal file
468
src/pipecat/services/sarvam/stt.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""Sarvam AI Speech-to-Text service implementation.
|
||||
|
||||
This module provides a streaming Speech-to-Text service using Sarvam AI's WebSocket-based
|
||||
API. It supports real-time transcription with Voice Activity Detection (VAD) and
|
||||
can handle multiple audio formats for Indian language speech recognition.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
try:
|
||||
from sarvamai import AsyncSarvamAI
|
||||
from sarvamai.core.api_error import ApiError
|
||||
from sarvamai.core.events import EventType
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Sarvam, you need to `pip install pipecat-ai[sarvam]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def language_to_sarvam_language(language: Language) -> str:
|
||||
"""Convert a Language enum to Sarvam's language code format.
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
Returns:
|
||||
The Sarvam language code string.
|
||||
"""
|
||||
# Mapping of pipecat Language enum to Sarvam language codes
|
||||
SARVAM_LANGUAGES = {
|
||||
Language.BN_IN: "bn-IN",
|
||||
Language.GU_IN: "gu-IN",
|
||||
Language.HI_IN: "hi-IN",
|
||||
Language.KN_IN: "kn-IN",
|
||||
Language.ML_IN: "ml-IN",
|
||||
Language.MR_IN: "mr-IN",
|
||||
Language.TA_IN: "ta-IN",
|
||||
Language.TE_IN: "te-IN",
|
||||
Language.PA_IN: "pa-IN",
|
||||
Language.OR_IN: "od-IN",
|
||||
Language.EN_IN: "en-IN",
|
||||
Language.AS_IN: "as-IN",
|
||||
}
|
||||
|
||||
return SARVAM_LANGUAGES.get(
|
||||
language, "unknown"
|
||||
) # Default to unknown (Sarvam models auto-detect the language)
|
||||
|
||||
|
||||
class SarvamSTTService(STTService):
|
||||
"""Sarvam speech-to-text service.
|
||||
|
||||
Provides real-time speech recognition using Sarvam's WebSocket API.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for Sarvam STT service.
|
||||
|
||||
Parameters:
|
||||
language: Target language for transcription. Defaults to None (required for saarika models).
|
||||
prompt: Optional prompt to guide translation style/context for STT-Translate models.
|
||||
Only applicable to saaras (STT-Translate) models. Defaults to None.
|
||||
vad_signals: Enable VAD signals in response. Defaults to True.
|
||||
high_vad_sensitivity: Enable high VAD (Voice Activity Detection) sensitivity. Defaults to False.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = None
|
||||
prompt: Optional[str] = None
|
||||
vad_signals: bool = True
|
||||
high_vad_sensitivity: bool = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
model: str = "saarika:v2.5",
|
||||
sample_rate: Optional[int] = None,
|
||||
input_audio_codec: str = "wav",
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Sarvam STT service.
|
||||
|
||||
Args:
|
||||
api_key: Sarvam API key for authentication.
|
||||
model: Sarvam model to use for transcription.
|
||||
sample_rate: Audio sample rate. Defaults to 16000 if not specified.
|
||||
input_audio_codec: Audio codec/format of the input file. Defaults to "wav".
|
||||
params: Configuration parameters for Sarvam STT service.
|
||||
**kwargs: Additional arguments passed to the parent STTService.
|
||||
"""
|
||||
params = params or SarvamSTTService.InputParams()
|
||||
|
||||
# Validate that saaras models don't accept language parameter
|
||||
if "saaras" in model.lower():
|
||||
if params.language is not None:
|
||||
raise ValueError(
|
||||
f"Model '{model}' does not accept language parameter. "
|
||||
"STT-Translate models auto-detect language."
|
||||
)
|
||||
|
||||
# Validate that saarika models don't accept prompt parameter
|
||||
if "saarika" in model.lower():
|
||||
if params.prompt is not None:
|
||||
raise ValueError(
|
||||
f"Model '{model}' does not accept prompt parameter. "
|
||||
"Prompts are only supported for STT-Translate models"
|
||||
)
|
||||
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self.set_model_name(model)
|
||||
self._api_key = api_key
|
||||
self._language_code = params.language
|
||||
# For saarika models, default to "unknown" if language is not provided
|
||||
if params.language:
|
||||
self._language_string = language_to_sarvam_language(params.language)
|
||||
elif "saarika" in model.lower():
|
||||
self._language_string = "unknown"
|
||||
else:
|
||||
self._language_string = None
|
||||
self._prompt = params.prompt
|
||||
|
||||
# Store connection parameters
|
||||
self._vad_signals = params.vad_signals
|
||||
self._high_vad_sensitivity = params.high_vad_sensitivity
|
||||
self._input_audio_codec = input_audio_codec
|
||||
|
||||
# Initialize Sarvam SDK client
|
||||
self._sarvam_client = AsyncSarvamAI(api_subscription_key=api_key)
|
||||
self._websocket_context = None
|
||||
self._socket_client = None
|
||||
self._receive_task = None
|
||||
|
||||
def language_to_service_language(self, language: Language) -> str:
|
||||
"""Convert pipecat Language enum to Sarvam's language code.
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
Returns:
|
||||
The Sarvam language code string.
|
||||
"""
|
||||
return language_to_sarvam_language(language)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Sarvam service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
async def set_language(self, language: Language):
|
||||
"""Set the recognition language and reconnect.
|
||||
|
||||
Args:
|
||||
language: The language to use for speech recognition.
|
||||
"""
|
||||
# saaras models do not accept a language parameter
|
||||
if "saaras" in self.model_name.lower():
|
||||
raise ValueError(
|
||||
f"Model '{self.model_name}' (saaras) does not accept language parameter. "
|
||||
"saaras models auto-detect language."
|
||||
)
|
||||
|
||||
logger.info(f"Switching STT language to: [{language}]")
|
||||
self._language_code = language
|
||||
self._language_string = language_to_sarvam_language(language)
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
async def set_prompt(self, prompt: Optional[str]):
|
||||
"""Set the translation prompt and reconnect.
|
||||
|
||||
Args:
|
||||
prompt: Prompt text to guide translation style/context.
|
||||
Pass None to clear/disable prompt.
|
||||
Only applicable to STT-Translate models, not STT models.
|
||||
"""
|
||||
# saarika models do not accept prompt parameter
|
||||
if "saarika" in self.model_name.lower():
|
||||
if prompt is not None:
|
||||
raise ValueError(
|
||||
f"Model '{self.model_name}' does not accept prompt parameter. "
|
||||
"Prompts are only supported for STT-Translate models."
|
||||
)
|
||||
# If prompt is None and it's saarika, just silently return (no-op)
|
||||
return
|
||||
|
||||
logger.info("Updating STT-Translate prompt.")
|
||||
self._prompt = prompt
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the Sarvam STT service.
|
||||
|
||||
Args:
|
||||
frame: The start frame containing initialization parameters.
|
||||
"""
|
||||
await super().start(frame)
|
||||
await self._connect()
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the Sarvam STT service.
|
||||
|
||||
Args:
|
||||
frame: The end frame.
|
||||
"""
|
||||
await super().stop(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the Sarvam STT service.
|
||||
|
||||
Args:
|
||||
frame: The cancel frame.
|
||||
"""
|
||||
await super().cancel(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def run_stt(self, audio: bytes):
|
||||
"""Send audio data to Sarvam for transcription.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes to transcribe.
|
||||
|
||||
Yields:
|
||||
Frame: None (transcription results come via WebSocket callbacks).
|
||||
"""
|
||||
if not self._socket_client:
|
||||
logger.warning("WebSocket not connected, cannot process audio")
|
||||
yield None
|
||||
return
|
||||
|
||||
try:
|
||||
# Convert audio bytes to base64 for Sarvam API
|
||||
audio_base64 = base64.b64encode(audio).decode("utf-8")
|
||||
|
||||
# Convert input_audio_codec to encoding format (prepend "audio/" if needed)
|
||||
encoding = (
|
||||
self._input_audio_codec
|
||||
if self._input_audio_codec.startswith("audio/")
|
||||
else f"audio/{self._input_audio_codec}"
|
||||
)
|
||||
|
||||
# Build method arguments
|
||||
method_kwargs = {
|
||||
"audio": audio_base64,
|
||||
"encoding": encoding,
|
||||
"sample_rate": self.sample_rate,
|
||||
}
|
||||
|
||||
# Use appropriate method based on service type
|
||||
if "saarika" in self.model_name.lower():
|
||||
# STT service
|
||||
await self._socket_client.transcribe(**method_kwargs)
|
||||
else:
|
||||
# STT-Translate service - auto-detects input language and returns translated text
|
||||
await self._socket_client.translate(**method_kwargs)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending audio to Sarvam: {e}")
|
||||
await self.push_error(ErrorFrame(f"Failed to send audio: {e}"))
|
||||
|
||||
yield None
|
||||
|
||||
async def _connect(self):
|
||||
"""Connect to Sarvam WebSocket API using the SDK."""
|
||||
logger.debug("Connecting to Sarvam")
|
||||
|
||||
try:
|
||||
# Convert boolean parameters to string for SDK
|
||||
vad_signals_str = "true" if self._vad_signals else "false"
|
||||
high_vad_sensitivity_str = "true" if self._high_vad_sensitivity else "false"
|
||||
|
||||
# Build common connection parameters
|
||||
connect_kwargs = {
|
||||
"model": self.model_name,
|
||||
"vad_signals": vad_signals_str,
|
||||
"high_vad_sensitivity": high_vad_sensitivity_str,
|
||||
"input_audio_codec": self._input_audio_codec,
|
||||
"sample_rate": str(self.sample_rate),
|
||||
}
|
||||
|
||||
# Choose the appropriate service based on model
|
||||
if "saarika" in self.model_name.lower():
|
||||
# STT service - requires language_code
|
||||
connect_kwargs["language_code"] = self._language_string
|
||||
self._websocket_context = self._sarvam_client.speech_to_text_streaming.connect(
|
||||
**connect_kwargs
|
||||
)
|
||||
else:
|
||||
# STT-Translate service - auto-detects input language and returns translated text
|
||||
self._websocket_context = (
|
||||
self._sarvam_client.speech_to_text_translate_streaming.connect(**connect_kwargs)
|
||||
)
|
||||
|
||||
# Enter the async context manager
|
||||
self._socket_client = await self._websocket_context.__aenter__()
|
||||
|
||||
# Set prompt if provided (only for STT-Translate models, after connection)
|
||||
if self._prompt is not None and "saaras" in self.model_name.lower():
|
||||
await self._socket_client.set_prompt(self._prompt)
|
||||
|
||||
# Register event handler for incoming messages
|
||||
def _message_handler(message):
|
||||
"""Wrapper to handle async response handler."""
|
||||
# Use Pipecat's built-in task management
|
||||
self.create_task(self._handle_message(message))
|
||||
|
||||
self._socket_client.on(EventType.MESSAGE, _message_handler)
|
||||
|
||||
# Start receive task using Pipecat's task management
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
|
||||
logger.info("Connected to Sarvam successfully")
|
||||
|
||||
except ApiError as e:
|
||||
logger.error(f"Sarvam API error: {e}")
|
||||
await self.push_error(ErrorFrame(f"Sarvam API error: {e}"))
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Sarvam: {e}")
|
||||
self._socket_client = None
|
||||
self._websocket_context = None
|
||||
await self.push_error(ErrorFrame(f"Failed to connect to Sarvam: {e}"))
|
||||
|
||||
async def _disconnect(self):
|
||||
"""Disconnect from Sarvam WebSocket API using SDK."""
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
|
||||
if self._websocket_context and self._socket_client:
|
||||
try:
|
||||
# Exit the async context manager
|
||||
await self._websocket_context.__aexit__(None, None, None)
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing WebSocket connection: {e}")
|
||||
finally:
|
||||
logger.debug("Disconnected from Sarvam WebSocket")
|
||||
self._socket_client = None
|
||||
self._websocket_context = None
|
||||
|
||||
async def _receive_task_handler(self):
|
||||
"""Handle incoming messages from Sarvam WebSocket.
|
||||
|
||||
This task wraps the SDK's start_listening() method which processes
|
||||
messages via the registered event handler callback.
|
||||
"""
|
||||
if not self._socket_client:
|
||||
return
|
||||
|
||||
try:
|
||||
# Start listening for messages from the Sarvam SDK
|
||||
# Messages will be handled via the _message_handler callback
|
||||
await self._socket_client.start_listening()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Sarvam receive task: {e}")
|
||||
await self.push_error(ErrorFrame(f"Sarvam receive task error: {e}"))
|
||||
|
||||
async def _handle_message(self, message):
|
||||
"""Handle incoming WebSocket message from Sarvam SDK.
|
||||
|
||||
Processes transcription data and VAD events from the Sarvam service.
|
||||
|
||||
Args:
|
||||
message: The parsed response object from Sarvam WebSocket.
|
||||
"""
|
||||
logger.debug(f"Received response: {message}")
|
||||
|
||||
try:
|
||||
if message.type == "events":
|
||||
# VAD event
|
||||
signal = message.data.signal_type
|
||||
timestamp = message.data.occured_at
|
||||
logger.debug(f"VAD Signal: {signal}, Occurred at: {timestamp}")
|
||||
|
||||
if signal == "START_SPEECH":
|
||||
await self.start_metrics()
|
||||
logger.debug("User started speaking")
|
||||
await self._call_event_handler("on_speech_started")
|
||||
|
||||
elif message.type == "data":
|
||||
await self.stop_ttfb_metrics()
|
||||
transcript = message.data.transcript
|
||||
language_code = message.data.language_code
|
||||
# Prefer language from message (auto-detected for translate models). Fallback to configured.
|
||||
if language_code:
|
||||
language = self._map_language_code_to_enum(language_code)
|
||||
elif self._language_string:
|
||||
language = self._map_language_code_to_enum(self._language_string)
|
||||
else:
|
||||
language = Language.HI_IN
|
||||
|
||||
# Emit utterance end event
|
||||
await self._call_event_handler("on_utterance_end")
|
||||
|
||||
if transcript and transcript.strip():
|
||||
# Record tracing for this transcription event
|
||||
await self._handle_transcription(transcript, True, language)
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
transcript,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
language,
|
||||
result=(message.dict() if hasattr(message, "dict") else str(message)),
|
||||
)
|
||||
)
|
||||
|
||||
await self.stop_processing_metrics()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error handling Sarvam message: {e}")
|
||||
await self.push_error(ErrorFrame(f"Failed to handle message: {e}"))
|
||||
await self.stop_all_metrics()
|
||||
|
||||
@traced_stt
|
||||
async def _handle_transcription(
|
||||
self, transcript: str, is_final: bool, language: Optional[Language] = None
|
||||
):
|
||||
"""Handle a transcription result with tracing.
|
||||
|
||||
This method is decorated with @traced_stt for observability.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _map_language_code_to_enum(self, language_code: str) -> Language:
|
||||
"""Map Sarvam language code to pipecat Language enum."""
|
||||
mapping = {
|
||||
"bn-IN": Language.BN_IN,
|
||||
"gu-IN": Language.GU_IN,
|
||||
"hi-IN": Language.HI_IN,
|
||||
"kn-IN": Language.KN_IN,
|
||||
"ml-IN": Language.ML_IN,
|
||||
"mr-IN": Language.MR_IN,
|
||||
"ta-IN": Language.TA_IN,
|
||||
"te-IN": Language.TE_IN,
|
||||
"pa-IN": Language.PA_IN,
|
||||
"od-IN": Language.OR_IN,
|
||||
"en-US": Language.EN_US,
|
||||
"en-IN": Language.EN_IN,
|
||||
"as-IN": Language.AS_IN,
|
||||
}
|
||||
return mapping.get(language_code, Language.HI_IN)
|
||||
|
||||
async def start_metrics(self):
|
||||
"""Start TTFB and processing metrics collection."""
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
@@ -374,7 +374,6 @@ class SarvamTTSService(InterruptibleTTSService):
|
||||
model: str = "bulbul:v2",
|
||||
voice_id: str = "anushka",
|
||||
url: str = "wss://api.sarvam.ai/text-to-speech/ws",
|
||||
aiohttp_session: Optional[aiohttp.ClientSession] = None,
|
||||
aggregate_sentences: Optional[bool] = True,
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
@@ -388,11 +387,6 @@ class SarvamTTSService(InterruptibleTTSService):
|
||||
Supports "bulbul:v2", "bulbul:v3-beta" and "bulbul:v3".
|
||||
voice_id: Voice identifier for synthesis (default "anushka").
|
||||
url: WebSocket URL for connecting to the TTS backend (default production URL).
|
||||
aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
|
||||
|
||||
.. deprecated:: 0.0.81
|
||||
aiohttp_session is no longer used. This parameter will be removed in a future version.
|
||||
|
||||
aggregate_sentences: Whether to merge multiple sentences into one audio chunk (default True).
|
||||
sample_rate: Desired sample rate for the output audio in Hz (overrides default if set).
|
||||
params: Optional input parameters to override global configuration.
|
||||
@@ -413,16 +407,7 @@ class SarvamTTSService(InterruptibleTTSService):
|
||||
**kwargs,
|
||||
)
|
||||
params = params or SarvamTTSService.InputParams()
|
||||
if aiohttp_session is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"The 'aiohttp_session' parameter is deprecated and will be removed in a future version. ",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
# WebSocket endpoint URL
|
||||
self._websocket_url = f"{url}?model={model}"
|
||||
self._api_key = api_key
|
||||
|
||||
@@ -7,9 +7,12 @@
|
||||
"""Simli video service for real-time avatar generation."""
|
||||
|
||||
import asyncio
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
@@ -41,30 +44,103 @@ class SimliVideoService(FrameProcessor):
|
||||
audio resampling, video frame processing, and connection management.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Simli video configuration.
|
||||
|
||||
Parameters:
|
||||
max_session_length: Absolute maximum session duration in seconds.
|
||||
Avatar will disconnect after this time even if it's speaking.
|
||||
max_idle_time: Maximum duration in seconds the avatar is not speaking
|
||||
before the avatar disconnects.
|
||||
"""
|
||||
|
||||
max_session_length: Optional[int] = None
|
||||
max_idle_time: Optional[int] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simli_config: SimliConfig,
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
face_id: Optional[str] = None,
|
||||
simli_config: Optional[SimliConfig] = None,
|
||||
use_turn_server: bool = False,
|
||||
latency_interval: int = 0,
|
||||
simli_url: str = "https://api.simli.ai",
|
||||
is_trinity_avatar: bool = False,
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Simli video service.
|
||||
|
||||
Args:
|
||||
api_key: Simli API key for authentication.
|
||||
face_id: Simli Face ID. For Trinity avatars, specify "faceId/emotionId"
|
||||
to use a different emotion than the default.
|
||||
simli_config: Configuration object for Simli client settings.
|
||||
use_turn_server: Whether to use TURN server for connection. Defaults to False.
|
||||
latency_interval: Latency interval setting for sending health checks to check the latency to Simli Servers. Defaults to 0.
|
||||
simli_url: URL of the simli servers. Can be changed for custom deployments of enterprise users.
|
||||
is_trinity_avatar: boolean to tell simli client that this is a Trinity avatar which reduces latency when using Trinity.
|
||||
Use api_key and face_id instead.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
The 'simli_config' parameter is deprecated and will be removed in a future version.
|
||||
Please use 'api_key' and 'face_id' parameters instead.
|
||||
|
||||
use_turn_server: Whether to use TURN server for connection. Defaults to False.
|
||||
latency_interval: Latency interval setting for sending health checks to check
|
||||
the latency to Simli Servers. Defaults to 0.
|
||||
simli_url: URL of the simli servers. Can be changed for custom deployments
|
||||
of enterprise users.
|
||||
is_trinity_avatar: Boolean to tell simli client that this is a Trinity avatar
|
||||
which reduces latency when using Trinity.
|
||||
params: Additional input parameters for session configuration.
|
||||
**kwargs: Additional arguments passed to the parent FrameProcessor.
|
||||
"""
|
||||
super().__init__()
|
||||
super().__init__(**kwargs)
|
||||
|
||||
params = params or SimliVideoService.InputParams()
|
||||
|
||||
# Handle deprecated simli_config parameter
|
||||
if simli_config is not None:
|
||||
if api_key is not None or face_id is not None:
|
||||
raise ValueError(
|
||||
"Cannot specify both simli_config and api_key/face_id. "
|
||||
"Please use api_key and face_id (simli_config is deprecated)."
|
||||
)
|
||||
|
||||
warnings.warn(
|
||||
"The 'simli_config' parameter is deprecated and will be removed in a future version. "
|
||||
"Please use 'api_key' and 'face_id' parameters instead, with optional 'params' for "
|
||||
"max_session_length and max_idle_time configuration.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# Use the provided simli_config
|
||||
config = simli_config
|
||||
else:
|
||||
# Validate new parameters
|
||||
if api_key is None:
|
||||
raise ValueError("api_key is required")
|
||||
if face_id is None:
|
||||
raise ValueError("face_id is required")
|
||||
|
||||
# Build SimliConfig from new parameters
|
||||
# Only pass optional parameters if explicitly provided to use SimliConfig defaults
|
||||
config_kwargs = {
|
||||
"apiKey": api_key,
|
||||
"faceId": face_id,
|
||||
}
|
||||
if params.max_session_length is not None:
|
||||
config_kwargs["maxSessionLength"] = params.max_session_length
|
||||
if params.max_idle_time is not None:
|
||||
config_kwargs["maxIdleTime"] = params.max_idle_time
|
||||
|
||||
config = SimliConfig(**config_kwargs)
|
||||
|
||||
self._initialized = False
|
||||
simli_config.maxIdleTime += 5
|
||||
simli_config.maxSessionLength += 5
|
||||
# Add buffer time to session limits
|
||||
config.maxIdleTime += 5
|
||||
config.maxSessionLength += 5
|
||||
self._simli_client = SimliClient(
|
||||
simli_config,
|
||||
config,
|
||||
use_turn_server,
|
||||
latency_interval,
|
||||
simliURL=simli_url,
|
||||
|
||||
@@ -49,6 +49,33 @@ END_TOKEN = "<end>"
|
||||
FINALIZED_TOKEN = "<fin>"
|
||||
|
||||
|
||||
class SonioxContextGeneralItem(BaseModel):
|
||||
"""Represents a key-value pair for structured general context information."""
|
||||
|
||||
key: str
|
||||
value: str
|
||||
|
||||
|
||||
class SonioxContextTranslationTerm(BaseModel):
|
||||
"""Represents a custom translation mapping for ambiguous or domain-specific terms."""
|
||||
|
||||
source: str
|
||||
target: str
|
||||
|
||||
|
||||
class SonioxContextObject(BaseModel):
|
||||
"""Context object for models with context_version 2, for Soniox stt-rt-v3-preview and higher.
|
||||
|
||||
Learn more about context in the documentation:
|
||||
https://soniox.com/docs/stt/concepts/context
|
||||
"""
|
||||
|
||||
general: Optional[List[SonioxContextGeneralItem]] = None
|
||||
text: Optional[str] = None
|
||||
terms: Optional[List[str]] = None
|
||||
translation_terms: Optional[List[SonioxContextTranslationTerm]] = None
|
||||
|
||||
|
||||
class SonioxInputParams(BaseModel):
|
||||
"""Real-time transcription settings.
|
||||
|
||||
@@ -60,9 +87,9 @@ class SonioxInputParams(BaseModel):
|
||||
audio_format: Audio format to use for transcription.
|
||||
num_channels: Number of channels to use for transcription.
|
||||
language_hints: List of language hints to use for transcription.
|
||||
context: Customization for transcription.
|
||||
enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
|
||||
max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
|
||||
context: Customization for transcription. String for models with context_version 1 and ContextObject for models with context_version 2.
|
||||
enable_speaker_diarization: Whether to enable speaker diarization. Tokens are annotated with speaker IDs.
|
||||
enable_language_identification: Whether to enable language identification. Tokens are annotated with language IDs.
|
||||
client_reference_id: Client reference ID to use for transcription.
|
||||
"""
|
||||
|
||||
@@ -72,10 +99,10 @@ class SonioxInputParams(BaseModel):
|
||||
num_channels: Optional[int] = 1
|
||||
|
||||
language_hints: Optional[List[Language]] = None
|
||||
context: Optional[str] = None
|
||||
context: Optional[SonioxContextObject | str] = None
|
||||
|
||||
enable_non_final_tokens: Optional[bool] = True
|
||||
max_non_final_tokens_duration_ms: Optional[int] = None
|
||||
enable_speaker_diarization: Optional[bool] = False
|
||||
enable_language_identification: Optional[bool] = False
|
||||
|
||||
client_reference_id: Optional[str] = None
|
||||
|
||||
@@ -173,6 +200,10 @@ class SonioxSTTService(STTService):
|
||||
# Either one or the other is required.
|
||||
enable_endpoint_detection = not self._vad_force_turn_endpoint
|
||||
|
||||
context = self._params.context
|
||||
if isinstance(context, SonioxContextObject):
|
||||
context = context.model_dump()
|
||||
|
||||
# Send the initial configuration message.
|
||||
config = {
|
||||
"api_key": self._api_key,
|
||||
@@ -182,9 +213,9 @@ class SonioxSTTService(STTService):
|
||||
"enable_endpoint_detection": enable_endpoint_detection,
|
||||
"sample_rate": self.sample_rate,
|
||||
"language_hints": _prepare_language_hints(self._params.language_hints),
|
||||
"context": self._params.context,
|
||||
"enable_non_final_tokens": self._params.enable_non_final_tokens,
|
||||
"max_non_final_tokens_duration_ms": self._params.max_non_final_tokens_duration_ms,
|
||||
"context": context,
|
||||
"enable_speaker_diarization": self._params.enable_speaker_diarization,
|
||||
"enable_language_identification": self._params.enable_language_identification,
|
||||
"client_reference_id": self._params.client_reference_id,
|
||||
}
|
||||
|
||||
|
||||
189
src/pipecat/services/speechmatics/tts.py
Normal file
189
src/pipecat/services/speechmatics/tts.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Speechmatics TTS service integration."""
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
from speechmatics.rt import __version__
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Speechmatics, you need to `pip install pipecat-ai[speechmatics]`."
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class SpeechmaticsTTSService(TTSService):
|
||||
"""Speechmatics TTS service implementation.
|
||||
|
||||
This service provides text-to-speech synthesis using the Speechmatics HTTP API.
|
||||
It converts text to speech and returns raw PCM audio data for real-time playback.
|
||||
"""
|
||||
|
||||
SPEECHMATICS_SAMPLE_RATE = 16000
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Optional input parameters for Speechmatics TTS configuration."""
|
||||
|
||||
pass
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str = "https://preview.tts.speechmatics.com",
|
||||
voice_id: str = "sarah",
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
sample_rate: Optional[int] = SPEECHMATICS_SAMPLE_RATE,
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Speechmatics TTS service.
|
||||
|
||||
Args:
|
||||
api_key: Speechmatics API key for authentication.
|
||||
base_url: Base URL for Speechmatics TTS API.
|
||||
voice_id: Voice model to use for synthesis.
|
||||
aiohttp_session: Shared aiohttp session for HTTP requests.
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
params: Optional[InputParams]: Input parameters for the service.
|
||||
**kwargs: Additional arguments passed to TTSService.
|
||||
"""
|
||||
if sample_rate and sample_rate != self.SPEECHMATICS_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"Speechmatics TTS only supports {self.SPEECHMATICS_SAMPLE_RATE}Hz sample rate. "
|
||||
f"Current rate of {sample_rate}Hz may cause issues."
|
||||
)
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
# Service parameters
|
||||
self._api_key: str = api_key
|
||||
self._base_url: str = base_url
|
||||
self._session = aiohttp_session
|
||||
|
||||
# Check we have required attributes
|
||||
if not self._api_key:
|
||||
raise ValueError("Missing Speechmatics API key")
|
||||
|
||||
# Default parameters
|
||||
self._params = params or SpeechmaticsTTSService.InputParams()
|
||||
|
||||
# Set voice from constructor parameter
|
||||
self.set_voice(voice_id)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Speechmatics service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Speechmatics' HTTP API.
|
||||
|
||||
Args:
|
||||
text: The text to synthesize into speech.
|
||||
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self._api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
payload = {
|
||||
"text": text,
|
||||
}
|
||||
|
||||
url = _get_endpoint_url(self._base_url, self._voice_id, self.sample_rate)
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
async with self._session.post(url, json=payload, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
error_message = f"Speechmatics TTS error: HTTP {response.status}"
|
||||
logger.error(error_message)
|
||||
yield ErrorFrame(error=error_message)
|
||||
return
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
# Process the response in streaming chunks
|
||||
first_chunk = True
|
||||
buffer = b""
|
||||
|
||||
async for chunk in response.content.iter_any():
|
||||
if not chunk:
|
||||
continue
|
||||
if first_chunk:
|
||||
await self.stop_ttfb_metrics()
|
||||
first_chunk = False
|
||||
|
||||
buffer += chunk
|
||||
|
||||
# Emit all complete 2-byte int16 samples from buffer
|
||||
if len(buffer) >= 2:
|
||||
complete_samples = len(buffer) // 2
|
||||
complete_bytes = complete_samples * 2
|
||||
|
||||
audio_data = buffer[:complete_bytes]
|
||||
buffer = buffer[complete_bytes:] # Keep remaining bytes for next iteration
|
||||
|
||||
yield TTSAudioRawFrame(
|
||||
audio=audio_data,
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error generating TTS: {e}")
|
||||
yield ErrorFrame(error=f"Speechmatics TTS error: {str(e)}")
|
||||
finally:
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
|
||||
def _get_endpoint_url(base_url: str, voice: str, sample_rate: int) -> str:
|
||||
"""Format the TTS endpoint URL with voice, output format, and version params.
|
||||
|
||||
Args:
|
||||
base_url: The base URL for the TTS endpoint.
|
||||
voice: The voice model to use.
|
||||
sample_rate: The audio sample rate.
|
||||
|
||||
Returns:
|
||||
str: The formatted TTS endpoint URL.
|
||||
"""
|
||||
query_params = {}
|
||||
query_params["output_format"] = f"pcm_{sample_rate}"
|
||||
query_params["sm-app"] = f"pipecat/{__version__}"
|
||||
query = urlencode(query_params)
|
||||
|
||||
return f"{base_url}/generate/{voice}?{query}"
|
||||
@@ -14,8 +14,7 @@ visual content.
|
||||
from abc import abstractmethod
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import Frame, LLMContextFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.frames.frames import Frame, UserImageRawFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_service import AIService
|
||||
|
||||
@@ -38,15 +37,15 @@ class VisionService(AIService):
|
||||
self._describe_text = None
|
||||
|
||||
@abstractmethod
|
||||
async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
|
||||
"""Process the latest image in the context and generate results.
|
||||
async def run_vision(self, frame: UserImageRawFrame) -> AsyncGenerator[Frame, None]:
|
||||
"""Process the given vision image and generate results.
|
||||
|
||||
This method must be implemented by subclasses to provide actual computer
|
||||
vision functionality such as image description, object detection, or
|
||||
visual question answering.
|
||||
|
||||
Args:
|
||||
context: The context to process, containing image data.
|
||||
frame: The image frame to process.
|
||||
|
||||
Yields:
|
||||
Frame: Frames containing the vision analysis results, typically TextFrame
|
||||
@@ -57,7 +56,7 @@ class VisionService(AIService):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames, handling vision image frames for analysis.
|
||||
|
||||
Automatically processes VisionImageRawFrame objects by calling run_vision
|
||||
Automatically processes UserImageRawFrame objects by calling run_vision
|
||||
and handles metrics tracking. Other frames are passed through unchanged.
|
||||
|
||||
Args:
|
||||
@@ -66,9 +65,9 @@ class VisionService(AIService):
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, LLMContextFrame):
|
||||
if isinstance(frame, UserImageRawFrame) and frame.text:
|
||||
await self.start_processing_metrics()
|
||||
await self.process_generator(self.run_vision(frame.context))
|
||||
await self.process_generator(self.run_vision(frame))
|
||||
await self.stop_processing_metrics()
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Tuple
|
||||
from typing import Awaitable, Callable, List, Optional, Sequence, Tuple
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
|
||||
@@ -16,7 +16,7 @@ import time
|
||||
from concurrent.futures import CancelledError as FuturesCancelledError
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Awaitable, Callable, Dict, Mapping, Optional
|
||||
from typing import Any, Awaitable, Callable, Dict, Mapping, Optional, Tuple
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
@@ -419,6 +419,11 @@ class DailyAudioTrack:
|
||||
track: CustomAudioTrack
|
||||
|
||||
|
||||
# This is just a type alias for the errors returned by daily-python. Right now
|
||||
# they are just a string.
|
||||
CallClientError = str
|
||||
|
||||
|
||||
class DailyTransportClient(EventHandler):
|
||||
"""Core client for interacting with Daily's API.
|
||||
|
||||
@@ -553,14 +558,17 @@ class DailyTransportClient(EventHandler):
|
||||
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
) -> Optional[CallClientError]:
|
||||
"""Send an application message to participants.
|
||||
|
||||
Args:
|
||||
frame: The message frame to send.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
if not self._joined:
|
||||
return
|
||||
return "Unable to send messages before joining."
|
||||
|
||||
participant_id = None
|
||||
if isinstance(
|
||||
@@ -572,7 +580,7 @@ class DailyTransportClient(EventHandler):
|
||||
self._client.send_app_message(
|
||||
frame.message, participant_id, completion=completion_callback(future)
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def read_next_audio_frame(self) -> Optional[InputAudioRawFrame]:
|
||||
"""Reads the next 20ms audio frame from the virtual speaker."""
|
||||
@@ -744,32 +752,24 @@ class DailyTransportClient(EventHandler):
|
||||
|
||||
self._client.set_user_name(self._bot_name)
|
||||
|
||||
try:
|
||||
(data, error) = await self._join()
|
||||
(data, error) = await self._join()
|
||||
|
||||
if not error:
|
||||
self._joined = True
|
||||
self._joining = False
|
||||
# Increment leave counter if we successfully joined.
|
||||
self._leave_counter += 1
|
||||
|
||||
logger.info(f"Joined {self._room_url}")
|
||||
|
||||
if self._params.transcription_enabled:
|
||||
await self.start_transcription(self._params.transcription_settings)
|
||||
|
||||
await self._callbacks.on_joined(data)
|
||||
|
||||
self._joined_event.set()
|
||||
else:
|
||||
error_msg = f"Error joining {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
await self._callbacks.on_error(error_msg)
|
||||
except asyncio.TimeoutError:
|
||||
error_msg = f"Time out joining {self._room_url}"
|
||||
logger.error(error_msg)
|
||||
if not error:
|
||||
self._joined = True
|
||||
self._joining = False
|
||||
# Increment leave counter if we successfully joined.
|
||||
self._leave_counter += 1
|
||||
|
||||
logger.info(f"Joined {self._room_url}")
|
||||
|
||||
await self._callbacks.on_joined(data)
|
||||
|
||||
self._joined_event.set()
|
||||
else:
|
||||
error_msg = f"Error joining {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
await self._callbacks.on_error(error_msg)
|
||||
self._joining = False
|
||||
|
||||
async def _join(self):
|
||||
"""Execute the actual room join operation."""
|
||||
@@ -828,7 +828,7 @@ class DailyTransportClient(EventHandler):
|
||||
},
|
||||
)
|
||||
|
||||
return await asyncio.wait_for(future, timeout=10)
|
||||
return await future
|
||||
|
||||
async def leave(self):
|
||||
"""Leave the Daily room and cleanup resources."""
|
||||
@@ -847,24 +847,16 @@ class DailyTransportClient(EventHandler):
|
||||
# Call callback before leaving.
|
||||
await self._callbacks.on_before_leave()
|
||||
|
||||
if self._params.transcription_enabled:
|
||||
await self.stop_transcription()
|
||||
|
||||
# Remove any custom tracks, if any.
|
||||
for track_name, _ in self._custom_audio_tracks.items():
|
||||
await self.remove_custom_audio_track(track_name)
|
||||
|
||||
try:
|
||||
error = await self._leave()
|
||||
if not error:
|
||||
logger.info(f"Left {self._room_url}")
|
||||
await self._callbacks.on_left()
|
||||
else:
|
||||
error_msg = f"Error leaving {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
await self._callbacks.on_error(error_msg)
|
||||
except asyncio.TimeoutError:
|
||||
error_msg = f"Time out leaving {self._room_url}"
|
||||
error = await self._leave()
|
||||
if not error:
|
||||
logger.info(f"Left {self._room_url}")
|
||||
await self._callbacks.on_left()
|
||||
else:
|
||||
error_msg = f"Error leaving {self._room_url}: {error}"
|
||||
logger.error(error_msg)
|
||||
await self._callbacks.on_error(error_msg)
|
||||
|
||||
@@ -875,7 +867,7 @@ class DailyTransportClient(EventHandler):
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.leave(completion=completion_callback(future))
|
||||
return await asyncio.wait_for(future, timeout=10)
|
||||
return await future
|
||||
|
||||
def _cleanup(self):
|
||||
"""Cleanup the Daily client instance."""
|
||||
@@ -883,7 +875,7 @@ class DailyTransportClient(EventHandler):
|
||||
self._client.release()
|
||||
self._client = None
|
||||
|
||||
def participants(self):
|
||||
def participants(self) -> Mapping[str, Any]:
|
||||
"""Get current participants in the room.
|
||||
|
||||
Returns:
|
||||
@@ -891,7 +883,7 @@ class DailyTransportClient(EventHandler):
|
||||
"""
|
||||
return self._client.participants()
|
||||
|
||||
def participant_counts(self):
|
||||
def participant_counts(self) -> Mapping[str, Any]:
|
||||
"""Get participant count information.
|
||||
|
||||
Returns:
|
||||
@@ -899,165 +891,173 @@ class DailyTransportClient(EventHandler):
|
||||
"""
|
||||
return self._client.participant_counts()
|
||||
|
||||
async def start_dialout(self, settings):
|
||||
async def start_dialout(self, settings) -> Tuple[str, Optional[CallClientError]]:
|
||||
"""Start a dial-out call to a phone number.
|
||||
|
||||
Args:
|
||||
settings: Dial-out configuration settings.
|
||||
"""
|
||||
logger.debug(f"Starting dialout: settings={settings}")
|
||||
|
||||
Returns:
|
||||
session_id: Dail-out session ID.
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.start_dialout(settings, completion=completion_callback(future))
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to start dialout: {error}")
|
||||
return await future
|
||||
|
||||
async def stop_dialout(self, participant_id):
|
||||
async def stop_dialout(self, participant_id) -> Optional[CallClientError]:
|
||||
"""Stop a dial-out call for a specific participant.
|
||||
|
||||
Args:
|
||||
participant_id: ID of the participant to stop dial-out for.
|
||||
"""
|
||||
logger.debug(f"Stopping dialout: participant_id={participant_id}")
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.stop_dialout(participant_id, completion=completion_callback(future))
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to stop dialout: {error}")
|
||||
return await future
|
||||
|
||||
async def send_dtmf(self, settings):
|
||||
async def send_dtmf(self, settings) -> Optional[CallClientError]:
|
||||
"""Send DTMF tones during a call.
|
||||
|
||||
Args:
|
||||
settings: DTMF settings including tones and target session.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
session_id = settings.get("sessionId") or self._dial_out_session_id
|
||||
if not session_id:
|
||||
logger.error("Unable to send DTMF: 'sessionId' is not set")
|
||||
return
|
||||
return "Can't send DTMF if 'sessionId' is not set"
|
||||
|
||||
# Update 'sessionId' field.
|
||||
settings["sessionId"] = session_id
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.send_dtmf(settings, completion=completion_callback(future))
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def sip_call_transfer(self, settings):
|
||||
async def sip_call_transfer(self, settings) -> Optional[CallClientError]:
|
||||
"""Transfer a SIP call to another destination.
|
||||
|
||||
Args:
|
||||
settings: SIP call transfer settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
session_id = (
|
||||
settings.get("sessionId") or self._dial_out_session_id or self._dial_in_session_id
|
||||
)
|
||||
if not session_id:
|
||||
logger.error("Unable to transfer SIP call: 'sessionId' is not set")
|
||||
return
|
||||
return "Can't transfer SIP call if 'sessionId' is not set"
|
||||
|
||||
# Update 'sessionId' field.
|
||||
settings["sessionId"] = session_id
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.sip_call_transfer(settings, completion=completion_callback(future))
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def sip_refer(self, settings):
|
||||
async def sip_refer(self, settings) -> Optional[CallClientError]:
|
||||
"""Send a SIP REFER request.
|
||||
|
||||
Args:
|
||||
settings: SIP REFER settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.sip_refer(settings, completion=completion_callback(future))
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def start_recording(self, streaming_settings, stream_id, force_new):
|
||||
async def start_recording(
|
||||
self, streaming_settings, stream_id, force_new
|
||||
) -> Tuple[str, Optional[CallClientError]]:
|
||||
"""Start recording the call.
|
||||
|
||||
Args:
|
||||
streaming_settings: Recording configuration settings.
|
||||
stream_id: Unique identifier for the recording stream.
|
||||
force_new: Whether to force a new recording session.
|
||||
"""
|
||||
logger.debug(
|
||||
f"Starting recording: stream_id={stream_id} force_new={force_new} settings={streaming_settings}"
|
||||
)
|
||||
|
||||
Returns:
|
||||
stream_id: Unique identifier for the recording stream.
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.start_recording(
|
||||
streaming_settings, stream_id, force_new, completion=completion_callback(future)
|
||||
)
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to start recording: {error}")
|
||||
return await future
|
||||
|
||||
async def stop_recording(self, stream_id):
|
||||
async def stop_recording(self, stream_id) -> Optional[CallClientError]:
|
||||
"""Stop recording the call.
|
||||
|
||||
Args:
|
||||
stream_id: Unique identifier for the recording stream to stop.
|
||||
"""
|
||||
logger.debug(f"Stopping recording: stream_id={stream_id}")
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.stop_recording(stream_id, completion=completion_callback(future))
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to stop recording: {error}")
|
||||
return await future
|
||||
|
||||
async def start_transcription(self, settings):
|
||||
async def start_transcription(self, settings) -> Optional[CallClientError]:
|
||||
"""Start transcription for the call.
|
||||
|
||||
Args:
|
||||
settings: Transcription configuration settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
if not self._token:
|
||||
logger.warning("Transcription can't be started without a room token")
|
||||
return
|
||||
|
||||
logger.debug(f"Starting transcription: settings={settings}")
|
||||
return "Transcription can't be started without a room token"
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.start_transcription(
|
||||
settings=self._params.transcription_settings.model_dump(exclude_none=True),
|
||||
completion=completion_callback(future),
|
||||
)
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to start transcription: {error}")
|
||||
return await future
|
||||
|
||||
async def stop_transcription(self):
|
||||
"""Stop transcription for the call."""
|
||||
async def stop_transcription(self) -> Optional[CallClientError]:
|
||||
"""Stop transcription for the call.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
if not self._token:
|
||||
return
|
||||
|
||||
logger.debug(f"Stopping transcription")
|
||||
return "Transcription can't be stopped without a room token"
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.stop_transcription(completion=completion_callback(future))
|
||||
error = await future
|
||||
if error:
|
||||
logger.error(f"Unable to stop transcription: {error}")
|
||||
return await future
|
||||
|
||||
async def send_prebuilt_chat_message(self, message: str, user_name: Optional[str] = None):
|
||||
async def send_prebuilt_chat_message(
|
||||
self, message: str, user_name: Optional[str] = None
|
||||
) -> Optional[CallClientError]:
|
||||
"""Send a chat message to Daily's Prebuilt main room.
|
||||
|
||||
Args:
|
||||
message: The chat message to send.
|
||||
user_name: Optional user name that will appear as sender of the message.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
if not self._joined:
|
||||
return
|
||||
return "Can't send message if not joined"
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.send_prebuilt_chat_message(
|
||||
message, user_name=user_name, completion=completion_callback(future)
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def capture_participant_transcription(self, participant_id: str):
|
||||
"""Enable transcription capture for a specific participant.
|
||||
@@ -1177,38 +1177,51 @@ class DailyTransportClient(EventHandler):
|
||||
|
||||
return track
|
||||
|
||||
async def remove_custom_audio_track(self, track_name: str):
|
||||
async def remove_custom_audio_track(self, track_name: str) -> Optional[CallClientError]:
|
||||
"""Remove a custom audio track.
|
||||
|
||||
Args:
|
||||
track_name: Name of the custom audio track to remove.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.remove_custom_audio_track(
|
||||
track_name=track_name,
|
||||
completion=completion_callback(future),
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def update_transcription(self, participants=None, instance_id=None):
|
||||
async def update_transcription(
|
||||
self, participants=None, instance_id=None
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update transcription settings for specific participants.
|
||||
|
||||
Args:
|
||||
participants: List of participant IDs to enable transcription for.
|
||||
instance_id: Optional transcription instance ID.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.update_transcription(
|
||||
participants, instance_id, completion=completion_callback(future)
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def update_subscriptions(self, participant_settings=None, profile_settings=None):
|
||||
async def update_subscriptions(
|
||||
self, participant_settings=None, profile_settings=None
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update media subscription settings.
|
||||
|
||||
Args:
|
||||
participant_settings: Per-participant subscription settings.
|
||||
profile_settings: Global subscription profile settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.update_subscriptions(
|
||||
@@ -1216,32 +1229,42 @@ class DailyTransportClient(EventHandler):
|
||||
profile_settings=profile_settings,
|
||||
completion=completion_callback(future),
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def update_publishing(self, publishing_settings: Mapping[str, Any]):
|
||||
async def update_publishing(
|
||||
self, publishing_settings: Mapping[str, Any]
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update media publishing settings.
|
||||
|
||||
Args:
|
||||
publishing_settings: Publishing configuration settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.update_publishing(
|
||||
publishing_settings=publishing_settings,
|
||||
completion=completion_callback(future),
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
async def update_remote_participants(self, remote_participants: Mapping[str, Any]):
|
||||
async def update_remote_participants(
|
||||
self, remote_participants: Mapping[str, Any]
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update settings for remote participants.
|
||||
|
||||
Args:
|
||||
remote_participants: Remote participant configuration settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
future = self._get_event_loop().create_future()
|
||||
self._client.update_remote_participants(
|
||||
remote_participants=remote_participants, completion=completion_callback(future)
|
||||
)
|
||||
await future
|
||||
return await future
|
||||
|
||||
#
|
||||
#
|
||||
@@ -1816,10 +1839,11 @@ class DailyInputTransport(BaseInputTransport):
|
||||
if render_frame:
|
||||
frame = UserImageRawFrame(
|
||||
user_id=participant_id,
|
||||
request=request_frame,
|
||||
image=video_frame.buffer,
|
||||
size=(video_frame.width, video_frame.height),
|
||||
format=video_frame.color_format,
|
||||
text=request_frame.text if request_frame else None,
|
||||
append_to_context=request_frame.append_to_context if request_frame else None,
|
||||
)
|
||||
frame.transport_source = video_source
|
||||
await self.push_video_frame(frame)
|
||||
@@ -1932,7 +1956,9 @@ class DailyOutputTransport(BaseOutputTransport):
|
||||
Args:
|
||||
frame: The transport message frame to send.
|
||||
"""
|
||||
await self._client.send_message(frame)
|
||||
error = await self._client.send_message(frame)
|
||||
if error:
|
||||
logger.error(f"Unable to send message: {error}")
|
||||
|
||||
async def register_video_destination(self, destination: str):
|
||||
"""Register a video output destination.
|
||||
@@ -2176,7 +2202,7 @@ class DailyTransport(BaseTransport):
|
||||
if self._output:
|
||||
await self._output.queue_frame(frame, FrameDirection.DOWNSTREAM)
|
||||
|
||||
def participants(self):
|
||||
def participants(self) -> Mapping[str, Any]:
|
||||
"""Get current participants in the room.
|
||||
|
||||
Returns:
|
||||
@@ -2184,7 +2210,7 @@ class DailyTransport(BaseTransport):
|
||||
"""
|
||||
return self._client.participants()
|
||||
|
||||
def participant_counts(self):
|
||||
def participant_counts(self) -> Mapping[str, Any]:
|
||||
"""Get participant count information.
|
||||
|
||||
Returns:
|
||||
@@ -2192,76 +2218,155 @@ class DailyTransport(BaseTransport):
|
||||
"""
|
||||
return self._client.participant_counts()
|
||||
|
||||
async def start_dialout(self, settings=None):
|
||||
async def start_dialout(self, settings=None) -> Tuple[str, Optional[CallClientError]]:
|
||||
"""Start a dial-out call to a phone number.
|
||||
|
||||
Args:
|
||||
settings: Dial-out configuration settings.
|
||||
"""
|
||||
await self._client.start_dialout(settings)
|
||||
|
||||
async def stop_dialout(self, participant_id):
|
||||
Returns:
|
||||
session_id: Dail-out session ID.
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Starting dialout: settings={settings}")
|
||||
|
||||
session_id, error = await self._client.start_dialout(settings)
|
||||
if error:
|
||||
logger.error(f"Unable to start dialout: {error}")
|
||||
return session_id, error
|
||||
|
||||
async def stop_dialout(self, participant_id) -> Optional[CallClientError]:
|
||||
"""Stop a dial-out call for a specific participant.
|
||||
|
||||
Args:
|
||||
participant_id: ID of the participant to stop dial-out for.
|
||||
"""
|
||||
await self._client.stop_dialout(participant_id)
|
||||
|
||||
async def sip_call_transfer(self, settings):
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Stopping dialout: participant_id={participant_id}")
|
||||
|
||||
error = await self._client.stop_dialout(participant_id)
|
||||
if error:
|
||||
logger.error(f"Unable to stop dialout: {error}")
|
||||
return error
|
||||
|
||||
async def sip_call_transfer(self, settings) -> Optional[CallClientError]:
|
||||
"""Transfer a SIP call to another destination.
|
||||
|
||||
Args:
|
||||
settings: SIP call transfer settings.
|
||||
"""
|
||||
await self._client.sip_call_transfer(settings)
|
||||
|
||||
async def sip_refer(self, settings):
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Staring SIP call transfer: settings={settings}")
|
||||
|
||||
error = await self._client.sip_call_transfer(settings)
|
||||
if error:
|
||||
logger.error(f"Unable to transfer SIP call: {error}")
|
||||
return error
|
||||
|
||||
async def sip_refer(self, settings) -> Optional[CallClientError]:
|
||||
"""Send a SIP REFER request.
|
||||
|
||||
Args:
|
||||
settings: SIP REFER settings.
|
||||
"""
|
||||
await self._client.sip_refer(settings)
|
||||
|
||||
async def start_recording(self, streaming_settings=None, stream_id=None, force_new=None):
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Staring SIP REFER: settings={settings}")
|
||||
|
||||
error = await self._client.sip_refer(settings)
|
||||
if error:
|
||||
logger.error(f"Unable to perform SIP REFER: {error}")
|
||||
return error
|
||||
|
||||
async def start_recording(
|
||||
self, streaming_settings=None, stream_id=None, force_new=None
|
||||
) -> Tuple[str, Optional[CallClientError]]:
|
||||
"""Start recording the call.
|
||||
|
||||
Args:
|
||||
streaming_settings: Recording configuration settings.
|
||||
stream_id: Unique identifier for the recording stream.
|
||||
force_new: Whether to force a new recording session.
|
||||
"""
|
||||
await self._client.start_recording(streaming_settings, stream_id, force_new)
|
||||
|
||||
async def stop_recording(self, stream_id=None):
|
||||
Returns:
|
||||
stream_id: Unique identifier for the recording stream.
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(
|
||||
f"Starting recording: stream_id={stream_id} force_new={force_new} settings={streaming_settings}"
|
||||
)
|
||||
|
||||
r_id, error = await self._client.start_recording(streaming_settings, stream_id, force_new)
|
||||
if error:
|
||||
logger.error(f"Unable to start recording: {error}")
|
||||
return r_id, error
|
||||
|
||||
async def stop_recording(self, stream_id=None) -> Optional[CallClientError]:
|
||||
"""Stop recording the call.
|
||||
|
||||
Args:
|
||||
stream_id: Unique identifier for the recording stream to stop.
|
||||
"""
|
||||
await self._client.stop_recording(stream_id)
|
||||
|
||||
async def start_transcription(self, settings=None):
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Stopping recording: stream_id={stream_id}")
|
||||
|
||||
error = await self._client.stop_recording(stream_id)
|
||||
if error:
|
||||
logger.error(f"Unable to stop recording: {error}")
|
||||
return error
|
||||
|
||||
async def start_transcription(self, settings=None) -> Optional[CallClientError]:
|
||||
"""Start transcription for the call.
|
||||
|
||||
Args:
|
||||
settings: Transcription configuration settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
await self._client.start_transcription(settings)
|
||||
logger.debug(f"Starting transcription: settings={settings}")
|
||||
|
||||
async def stop_transcription(self):
|
||||
"""Stop transcription for the call."""
|
||||
await self._client.stop_transcription()
|
||||
error = await self._client.start_transcription(settings)
|
||||
if error:
|
||||
logger.error(f"Unable to start transcription: {error}")
|
||||
return error
|
||||
|
||||
async def send_prebuilt_chat_message(self, message: str, user_name: Optional[str] = None):
|
||||
async def stop_transcription(self) -> Optional[CallClientError]:
|
||||
"""Stop transcription for the call.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Stopping transcription")
|
||||
|
||||
error = await self._client.stop_transcription()
|
||||
if error:
|
||||
logger.error(f"Unable to stop transcription: {error}")
|
||||
return error
|
||||
|
||||
async def send_prebuilt_chat_message(
|
||||
self, message: str, user_name: Optional[str] = None
|
||||
) -> Optional[CallClientError]:
|
||||
"""Send a chat message to Daily's Prebuilt main room.
|
||||
|
||||
Args:
|
||||
message: The chat message to send.
|
||||
user_name: Optional user name that will appear as sender of the message.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
await self._client.send_prebuilt_chat_message(message, user_name)
|
||||
error = await self._client.send_prebuilt_chat_message(message, user_name)
|
||||
if error:
|
||||
logger.error(f"Unable to send prebuilt chat message: {error}")
|
||||
return error
|
||||
|
||||
async def capture_participant_transcription(self, participant_id: str):
|
||||
"""Enable transcription capture for a specific participant.
|
||||
@@ -2307,32 +2412,66 @@ class DailyTransport(BaseTransport):
|
||||
participant_id, framerate, video_source, color_format
|
||||
)
|
||||
|
||||
async def update_publishing(self, publishing_settings: Mapping[str, Any]):
|
||||
async def update_publishing(
|
||||
self, publishing_settings: Mapping[str, Any]
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update media publishing settings.
|
||||
|
||||
Args:
|
||||
publishing_settings: Publishing configuration settings.
|
||||
"""
|
||||
await self._client.update_publishing(publishing_settings=publishing_settings)
|
||||
|
||||
async def update_subscriptions(self, participant_settings=None, profile_settings=None):
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
logger.debug(f"Updating publishing settings: settings={publishing_settings}")
|
||||
|
||||
error = await self._client.update_publishing(publishing_settings=publishing_settings)
|
||||
if error:
|
||||
logger.error(f"Unable to update publishing settings: {error}")
|
||||
return error
|
||||
|
||||
async def update_subscriptions(
|
||||
self, participant_settings=None, profile_settings=None
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update media subscription settings.
|
||||
|
||||
Args:
|
||||
participant_settings: Per-participant subscription settings.
|
||||
profile_settings: Global subscription profile settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
await self._client.update_subscriptions(
|
||||
participant_settings=participant_settings, profile_settings=profile_settings
|
||||
logger.debug(
|
||||
f"Updating subscriptions: participant_settings={participant_settings} profile_settings={profile_settings}"
|
||||
)
|
||||
|
||||
async def update_remote_participants(self, remote_participants: Mapping[str, Any]):
|
||||
error = await self._client.update_subscriptions(
|
||||
participant_settings=participant_settings, profile_settings=profile_settings
|
||||
)
|
||||
if error:
|
||||
logger.error(f"Unable to update subscription settings: {error}")
|
||||
return error
|
||||
|
||||
async def update_remote_participants(
|
||||
self, remote_participants: Mapping[str, Any]
|
||||
) -> Optional[CallClientError]:
|
||||
"""Update settings for remote participants.
|
||||
|
||||
Args:
|
||||
remote_participants: Remote participant configuration settings.
|
||||
|
||||
Returns:
|
||||
error: An error description or None.
|
||||
"""
|
||||
await self._client.update_remote_participants(remote_participants=remote_participants)
|
||||
logger.debug(f"Updating remote participants: remote_participants={remote_participants}")
|
||||
|
||||
error = await self._client.update_remote_participants(
|
||||
remote_participants=remote_participants
|
||||
)
|
||||
if error:
|
||||
logger.error(f"Unable to update remote participants: {error}")
|
||||
return error
|
||||
|
||||
async def _on_active_speaker_changed(self, participant: Any):
|
||||
"""Handle active speaker change events."""
|
||||
@@ -2340,6 +2479,12 @@ class DailyTransport(BaseTransport):
|
||||
|
||||
async def _on_joined(self, data):
|
||||
"""Handle room joined events."""
|
||||
if self._params.transcription_enabled:
|
||||
# We report an error because we are starting transcription
|
||||
# internally and if it fails we need to know.
|
||||
error = await self.start_transcription(self._params.transcription_settings)
|
||||
if error:
|
||||
await self._on_error(f"Unable to start transcription: {error}")
|
||||
await self._call_event_handler("on_joined", data)
|
||||
|
||||
async def _on_left(self):
|
||||
@@ -2348,6 +2493,12 @@ class DailyTransport(BaseTransport):
|
||||
|
||||
async def _on_before_leave(self):
|
||||
"""Handle before leave room events."""
|
||||
if self._params.transcription_enabled:
|
||||
# We report an error because we are stopping transcription
|
||||
# internally and if it fails we need to know.
|
||||
error = await self.stop_transcription()
|
||||
if error:
|
||||
await self._on_error(f"Unable to stop transcription: {error}")
|
||||
await self._call_event_handler("on_before_leave")
|
||||
|
||||
async def _on_error(self, error):
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user