Compare commits
239 Commits
cb/test-se
...
v0.0.93
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
16e2d5b998 | ||
|
|
4cf9e1409e | ||
|
|
0ed430e7e2 | ||
|
|
342a8b121b | ||
|
|
5729722dcd | ||
|
|
38aac44a1e | ||
|
|
4f1468e0fa | ||
|
|
9b1192ca9b | ||
|
|
5e7f59a0b0 | ||
|
|
2ad4122b77 | ||
|
|
5950f734f5 | ||
|
|
8d0364b630 | ||
|
|
bfe031604a | ||
|
|
9bfde61183 | ||
|
|
cb40a39a01 | ||
|
|
03001f8047 | ||
|
|
10f1c314b6 | ||
|
|
4d1d6465fc | ||
|
|
359d220162 | ||
|
|
6feecf05f7 | ||
|
|
c3306bb4f2 | ||
|
|
07a4aae248 | ||
|
|
925a6cc2ef | ||
|
|
613ad74103 | ||
|
|
2ab6b71890 | ||
|
|
c2bd8d22a0 | ||
|
|
eda12f56e6 | ||
|
|
3daa1b7850 | ||
|
|
4c8c44ecc3 | ||
|
|
8c34e1efba | ||
|
|
f6916428b1 | ||
|
|
a14d00b806 | ||
|
|
927cf751c0 | ||
|
|
1fb6d6bd23 | ||
|
|
94a3306679 | ||
|
|
16bd1fe32d | ||
|
|
58b552171d | ||
|
|
4732a442d4 | ||
|
|
accdddce95 | ||
|
|
daf9da823c | ||
|
|
f6b6aa8766 | ||
|
|
935eb58951 | ||
|
|
9f2ddcc5f4 | ||
|
|
961e28517e | ||
|
|
34d6f3fa00 | ||
|
|
616abfd96c | ||
|
|
d7774ac599 | ||
|
|
c8c13ecee2 | ||
|
|
314acc104e | ||
|
|
1dfa59257d | ||
|
|
376dcc34f7 | ||
|
|
5ee8c56899 | ||
|
|
4397deddc7 | ||
|
|
13d6078ea0 | ||
|
|
61aec08794 | ||
|
|
0f69d4aea3 | ||
|
|
84ba628dfb | ||
|
|
9ce33f23b9 | ||
|
|
75245e1daa | ||
|
|
24365aeefe | ||
|
|
29ef0f419f | ||
|
|
a9d78bd956 | ||
|
|
e6f881bb08 | ||
|
|
bee4165ba4 | ||
|
|
e2f6ce1078 | ||
|
|
0184493711 | ||
|
|
eb3c4c59fc | ||
|
|
d844829538 | ||
|
|
11b101e8a6 | ||
|
|
3db5ab9f23 | ||
|
|
9a96e4060c | ||
|
|
d826279946 | ||
|
|
e4212fb3c0 | ||
|
|
234aae3091 | ||
|
|
c33b81bb92 | ||
|
|
a1c07039ee | ||
|
|
33be73692f | ||
|
|
f6d7b6ae5f | ||
|
|
2ee54c985f | ||
|
|
76c336644a | ||
|
|
dd8711dee1 | ||
|
|
c26c27fe21 | ||
|
|
159dbd078d | ||
|
|
c18ff999a5 | ||
|
|
80d127aaa4 | ||
|
|
bbc7d3e2fb | ||
|
|
3486d63ef6 | ||
|
|
842c4a3485 | ||
|
|
0b779a880b | ||
|
|
01f3421052 | ||
|
|
c20aa78648 | ||
|
|
38f27ad991 | ||
|
|
0c38585034 | ||
|
|
8a09bbbf0e | ||
|
|
fb737ff671 | ||
|
|
b7a4d7371c | ||
|
|
ef88d6a2ea | ||
|
|
5c1bd8cda2 | ||
|
|
a82158045a | ||
|
|
b1533ddfc4 | ||
|
|
0abc699f24 | ||
|
|
09018071e8 | ||
|
|
1c53a5fd01 | ||
|
|
05d4753d3e | ||
|
|
87131850bc | ||
|
|
af83f45a49 | ||
|
|
62e45f466a | ||
|
|
e85e93b9b1 | ||
|
|
074d3ff162 | ||
|
|
d680ec2e69 | ||
|
|
d905b21f72 | ||
|
|
6c5d84ca4c | ||
|
|
334167e3d7 | ||
|
|
e3531a5f25 | ||
|
|
343e97666a | ||
|
|
653e84321b | ||
|
|
3585f724c4 | ||
|
|
5fe597d355 | ||
|
|
67ab3773f6 | ||
|
|
c6e12b9358 | ||
|
|
0f5030bafa | ||
|
|
ed93e29850 | ||
|
|
7eb880c5e8 | ||
|
|
4fa0de6660 | ||
|
|
396c1bcc13 | ||
|
|
57f6ae9e50 | ||
|
|
2d03e51109 | ||
|
|
1e7143e5f3 | ||
|
|
f820c20fa2 | ||
|
|
83f395ff8f | ||
|
|
09a7e08cbf | ||
|
|
6f172bba8f | ||
|
|
1433df4de2 | ||
|
|
6ade5617fb | ||
|
|
685d440206 | ||
|
|
ac5734d0ed | ||
|
|
5e00133e64 | ||
|
|
42f0490414 | ||
|
|
19f046a338 | ||
|
|
ec95618b94 | ||
|
|
74fb6e7676 | ||
|
|
8fa6cbac51 | ||
|
|
a997655eac | ||
|
|
3b3a215155 | ||
|
|
e458d3edfe | ||
|
|
d7d409df60 | ||
|
|
5174b18176 | ||
|
|
9c5690d670 | ||
|
|
e0933e20d2 | ||
|
|
ce13155d26 | ||
|
|
817a485d94 | ||
|
|
b094418d1e | ||
|
|
08a1e09020 | ||
|
|
52b33e5106 | ||
|
|
5db0871a20 | ||
|
|
222c362fa4 | ||
|
|
9d509bb409 | ||
|
|
8d0e7e5e16 | ||
|
|
e7b8da7a83 | ||
|
|
35c48a45cf | ||
|
|
14a365aa16 | ||
|
|
779fc0419d | ||
|
|
057e0c3973 | ||
|
|
8a6abdd44b | ||
|
|
7872fa2e88 | ||
|
|
e86c546a1a | ||
|
|
abf34bcccf | ||
|
|
56eb633390 | ||
|
|
6299b9db87 | ||
|
|
bcffa590a3 | ||
|
|
8b739aa444 | ||
|
|
8f15980c67 | ||
|
|
89e9acf0e1 | ||
|
|
ddac24e6c9 | ||
|
|
d0f52feba3 | ||
|
|
8894db4290 | ||
|
|
1f96cdf970 | ||
|
|
0282033208 | ||
|
|
917ea27352 | ||
|
|
8c03df1463 | ||
|
|
15aa76efba | ||
|
|
8ac421f8fd | ||
|
|
75b3ea9c96 | ||
|
|
95be1510ac | ||
|
|
df19011080 | ||
|
|
e42cf78e79 | ||
|
|
0495de52b6 | ||
|
|
9bc02afd0d | ||
|
|
6140fdb2c9 | ||
|
|
b6a1886dae | ||
|
|
42d0a097c5 | ||
|
|
3761804146 | ||
|
|
46e97c57c2 | ||
|
|
19770b76b4 | ||
|
|
b34461bf93 | ||
|
|
bab0aaf585 | ||
|
|
61944d22ef | ||
|
|
47756319be | ||
|
|
5fa56df014 | ||
|
|
8a151235c3 | ||
|
|
ec42f8c24e | ||
|
|
29fd17b9ff | ||
|
|
3ea1e357f2 | ||
|
|
351ef617ae | ||
|
|
9dafb715c4 | ||
|
|
82d494d3d4 | ||
|
|
e893aaa620 | ||
|
|
65c17a698e | ||
|
|
615aae5b95 | ||
|
|
b0acbeffb9 | ||
|
|
2f1061f300 | ||
|
|
9307079af2 | ||
|
|
efa64642a4 | ||
|
|
ede6c32149 | ||
|
|
4050e8b7dc | ||
|
|
b0f5fc02c4 | ||
|
|
493d6bf91e | ||
|
|
aaebcae2e8 | ||
|
|
408264a0fd | ||
|
|
df8aa3e4b0 | ||
|
|
4d82a1260b | ||
|
|
f974c66e12 | ||
|
|
533372ed37 | ||
|
|
a9118eb2cd | ||
|
|
84ed2468e5 | ||
|
|
d82d855c20 | ||
|
|
412ff2a4a1 | ||
|
|
82ccc160fb | ||
|
|
9ef60bd468 | ||
|
|
06e86cc107 | ||
|
|
f3c4bf08dd | ||
|
|
f2cfbee3c3 | ||
|
|
8b063116ab | ||
|
|
8096e62b34 | ||
|
|
5052da8ce6 | ||
|
|
9acc36c58e | ||
|
|
1ecf6e05fe | ||
|
|
5cc1d8a024 | ||
|
|
1e31fc7f9b |
394
CHANGELOG.md
394
CHANGELOG.md
@@ -5,24 +5,407 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
## [0.0.93] - 2025-11-07
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for Sarvam Speech-to-Text service (`SarvamSTTService`) with
|
||||
streaming WebSocket support for `saarika` (STT) and `saaras` (STT-translate)
|
||||
models.
|
||||
|
||||
- Added support for passing in a `ToolsSchema` in lieu of a list of provider-
|
||||
specific dicts when initializing `OpenAIRealtimeLLMService` or when updating
|
||||
it using `LLMUpdateSettingsFrame`.
|
||||
|
||||
- Added `TransportParams.audio_out_silence_secs`, which specifies how many
|
||||
seconds of silence to output when an `EndFrame` reaches the output
|
||||
transport. This can help ensure that all audio data is fully delivered to
|
||||
clients.
|
||||
|
||||
- Added new `FrameProcessor.broadcast_frame()` method. This will push two
|
||||
instances of a given frame class, one upstream and the other downstream.
|
||||
|
||||
```python
|
||||
await self.broadcast_frame(UserSpeakingFrame)
|
||||
```
|
||||
|
||||
- Added `MetricsLogObserver` for logging performance metrics from `MetricsFrame`
|
||||
instances. Supports filtering via `include_metrics` parameter to control which
|
||||
metrics types are logged (TTFB, processing time, LLM token usage, TTS usage,
|
||||
smart turn metrics).
|
||||
|
||||
- Added `pronunciation_dictionary_locators` to `ElevenLabsTTSService` and
|
||||
`ElevenLabsHttpTTSService`.
|
||||
|
||||
- Added support for loading external observers. You can now register custom
|
||||
pipeline observers by setting the `PIPECAT_OBSERVER_FILES` environment
|
||||
variable. This variable should contain a colon-separated list of Python files
|
||||
(e.g. `export PIPECAT_OBSERVER_FILES="observer1.py:observer2.py:..."`). Each
|
||||
file must define a function with the following signature:
|
||||
|
||||
```python
|
||||
async def create_observers(task: PipelineTask) -> Iterable[BaseObserver]:
|
||||
...
|
||||
```
|
||||
|
||||
- Added support for new sonic-3 languages in `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService`.
|
||||
|
||||
- `EndFrame` and `EndTaskFrame` have an optional `reason` field to indicate why
|
||||
the pipeline is being ended.
|
||||
|
||||
- `CancelFrame` and `CancelTaskFrame` have an optional `reason` field to
|
||||
indicate why the pipeline is being canceled. This can also be specified when
|
||||
you cancel a task with `PipelineTask.cancel(reason="cancellation reason")`.
|
||||
|
||||
- Added `include_prob_metrics` parameter to Whisper STT services to enable access
|
||||
to probability metrics from transcription results.
|
||||
|
||||
- Added utility functions `extract_whisper_probability()`,
|
||||
`extract_openai_gpt4o_probability()`, and `extract_deepgram_probability()` to
|
||||
extract probability metrics from `TranscriptionFrame` objects for Whisper-based,
|
||||
OpenAI GPT-4o-transcribe, and Deepgram STT services respectively.
|
||||
|
||||
- Added `LLMSwitcher.register_direct_function()`. It works much like
|
||||
`LLMSwitcher.register_function()` in that it's a shorthand for registering
|
||||
functions on all LLMs in the switcher, but for direct functions.
|
||||
|
||||
- Added `LLMSwitcher.register_direct_function()`. It works much like
|
||||
`LLMSwitcher.register_function()` in that it's a shorthand for registering
|
||||
a function on all LLMs in the switcher, except this new method takes a direct
|
||||
function (a `FunctionSchema`-less function).
|
||||
|
||||
- Added `MCPClient.get_tools_schema()` and `MCPClient.register_tools_schema()`
|
||||
as a two-step alternative to `MCPClient.register_tools()`, to allow users to
|
||||
pass MCP tools to, say, `GeminiLiveLLMService` (as well as other
|
||||
speech-to-speech services) in the constructor.
|
||||
|
||||
- Added support for passing in an `LLMSwitcher` to `MCPClient.register_tools()`
|
||||
(as well as the new `MCPClient.register_tools_schema()`).
|
||||
|
||||
- Added `cpu_count` parameter to `LocalSmartTurnAnalyzerV3`. This is set to `1`
|
||||
by default for more predictable performance on low-CPU systems.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `simli-ai` to 0.1.25.
|
||||
|
||||
- Improved `concatenate_aggregated_text()` to handle one-word outputs from OpenAI
|
||||
Realtime and Gemini Live. Text fragments are now correctly concatenated
|
||||
without spaces when these patterns are detected.
|
||||
|
||||
- `STTMuteFilter` no longer sends `STTMuteFrame` to the STT service. The filter
|
||||
now blocks frames locally without instructing the STT service to stop
|
||||
processing audio. This prevents inactivity-related errors (such as 409 errors
|
||||
from Google STT) while maintaining the same muting behavior at the application
|
||||
level. Important: The STTMuteFilter should be placed _after_ the STT service
|
||||
itself.
|
||||
|
||||
- Improved `GoogleSTTService` error handling to properly catch gRPC `Aborted`
|
||||
exceptions (corresponding to 409 errors) caused by stream inactivity. These
|
||||
exceptions are now logged at DEBUG level instead of ERROR level, since they
|
||||
indicate expected behavior when no audio is sent for 10+ seconds (e.g., during
|
||||
long silences or when audio input is blocked). The service automatically
|
||||
reconnects when this occurs.
|
||||
|
||||
- Bumped the `fastapi` dependency's upperbound to `<0.122.0`.
|
||||
|
||||
- Updated the default model for `GoogleVertexLLMService` to `gemini-2.5-flash`.
|
||||
|
||||
- Updated the `GoogleVertexLLMService` to use the `GoogleLLMService` as a base
|
||||
class instead of the `OpenAILLMService`.
|
||||
|
||||
- Updated STT and TTS services to pass through unverified language codes with a
|
||||
warning instead of returning None. This allows developers to use newly
|
||||
supported languages before Pipecat's service classes are updated, while still
|
||||
providing guidance on verified languages.
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed `needs_mcp_alternate_schema()` from `LLMService`. The mechanism that
|
||||
relied on it went away.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Restore backwards compatibility for vision/image features (broken in 0.0.92)
|
||||
when using non-universal context and assistant aggregators.
|
||||
|
||||
- Fixed `DeepgramSTTService._disconnect()` to properly await `is_connected()`
|
||||
method call, which is an async coroutine in the Deepgram SDK.
|
||||
|
||||
- Fixed an issue where the `SmallWebRTCRequest` dataclass in runner would scrub
|
||||
arbitrary request data from client due to camelCase typing. This fixes data
|
||||
passthrough for JS clients where `APIRequest` is used.
|
||||
|
||||
- Fixed a bug in `GeminiLiveLLMService` where in some circumstances it wouldn't
|
||||
respond after a tool call.
|
||||
|
||||
- Fixed `GeminiLiveLLMService` session resumption after a connection timeout.
|
||||
|
||||
- `GeminiLiveLLMService` now properly supports context-provided system
|
||||
instruction and tools.
|
||||
|
||||
- Fixed `GoogleLLMService` token counting to avoid double-counting tokens when
|
||||
Gemini sends usage metadata across multiple streaming chunks.
|
||||
|
||||
## [0.0.92] - 2025-10-31 🎃 "The Haunted Edition" 👻
|
||||
|
||||
### Added
|
||||
|
||||
- Added a new `DeepgramHttpTTSService`, which delivers a meaningful reduction
|
||||
in latency when compared to the `DeepgramTTSService`.
|
||||
|
||||
- Add support for `speaking_rate` input parameter in `GoogleHttpTTSService`.
|
||||
|
||||
- Added `enable_speaker_diarization` and `enable_language_identification` to
|
||||
`SonioxSTTService`.
|
||||
|
||||
- Added `SpeechmaticsTTSService`, which uses Speechmatics' TTS API. Updated
|
||||
examples 07a\* to use the new TTS service.
|
||||
|
||||
- Added support for including images or audio to LLM context messages using
|
||||
`LLMContext.create_image_message()` or `LLMContext.create_image_url_message()`
|
||||
(not all LLMs support URLs) and `LLMContext.create_audio_message()`. For
|
||||
example, when creating `LLMMessagesAppendFrame`:
|
||||
|
||||
```python
|
||||
message = LLMContext.create_image_message(image=..., size= ...)
|
||||
await self.push_frame(LLMMessagesAppendFrame(messages=[message], run_llm=True))
|
||||
```
|
||||
|
||||
- New event handlers for the `DeepgramFluxSTTService`: `on_start_of_turn`,
|
||||
`on_turn_resumed`, `on_end_of_turn`, `on_eager_end_of_turn`, `on_update`.
|
||||
|
||||
- Added `generation_config` parameter support to `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService` for Cartesia Sonic-3 models. Includes a new
|
||||
`GenerationConfig` class with `volume` (0.5-2.0), `speed` (0.6-1.5),
|
||||
and `emotion` (60+ options) parameters for fine-grained speech generation
|
||||
control.
|
||||
|
||||
- Expanded support for universal `LLMContext` to `OpenAIRealtimeLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
(Note that even though `OpenAIRealtimeLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Note: `TranscriptionFrame`s and `InterimTranscriptionFrame`s now go upstream
|
||||
from `OpenAIRealtimeLLMService`, so if you're using `TranscriptProcessor`,
|
||||
say, you'll want to adjust accordingly:
|
||||
|
||||
```python
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
context_aggregator.user(),
|
||||
|
||||
# BEFORE
|
||||
llm,
|
||||
transcript.user(),
|
||||
|
||||
# AFTER
|
||||
transcript.user(),
|
||||
llm,
|
||||
|
||||
transport.output(),
|
||||
transcript.assistant(),
|
||||
context_aggregator.assistant(),
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Also worth noting: whether or not you use the new context-setup pattern with
|
||||
`OpenAIRealtimeLLMService`, some types have changed under the hood:
|
||||
|
||||
```python
|
||||
## BEFORE:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: OpenAIContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: OpenAIRealtimeLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
|
||||
Also note that `RealtimeMessagesUpdateFrame` and
|
||||
`RealtimeFunctionCallResultFrame` have been deprecated, since they're no
|
||||
longer used by `OpenAIRealtimeLLMService`. OpenAI Realtime now works more
|
||||
like other LLM services in Pipecat, relying on updates to its context, pushed
|
||||
by context aggregators, to update its internal state. Listen for
|
||||
`LLMContextFrame`s for context updates.
|
||||
|
||||
Finally, `LLMTextFrame`s are no longer pushed from `OpenAIRealtimeLLMService`
|
||||
when it's configured with `output_modalities=['audio']`. If you need
|
||||
to process its output, listen for `TTSTextFrame`s instead.
|
||||
|
||||
- Expanded support for universal `LLMContext` to `GeminiLiveLLMService`.
|
||||
As a reminder, the context-setup pattern when using `LLMContext` is:
|
||||
|
||||
```python
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
(Note that even though `GeminiLiveLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Worth noting: whether or not you use the new context-setup pattern with
|
||||
`GeminiLiveLLMService`, some types have changed under the hood:
|
||||
|
||||
```python
|
||||
## BEFORE:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: GeminiLiveContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: GeminiLiveLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
## AFTER:
|
||||
|
||||
# Context aggregator type
|
||||
context_aggregator: LLMContextAggregatorPair
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
|
||||
Also note that `LLMTextFrame`s are no longer pushed from `GeminiLiveLLMService`
|
||||
when it's configured with `modalities=GeminiModalities.AUDIO`. If you need
|
||||
to process its output, listen for `TTSTextFrame`s instead.
|
||||
|
||||
### Changed
|
||||
|
||||
- The development runner's `/start` endpoint now supports passing
|
||||
`dailyRoomProperties` and `dailyMeetingTokenProperties` in the request body
|
||||
when `createDailyRoom` is true. Properties are validated against the
|
||||
`DailyRoomProperties` and `DailyMeetingTokenProperties` types respectively
|
||||
and passed to Daily's room and token creation APIs.
|
||||
|
||||
- `UserImageRawFrame` new fields `append_to_context` and `text`. The
|
||||
`append_to_context` field indicates if this image and text should be added to
|
||||
the LLM context (by the LLM assistant aggregator). The `text` field, if set,
|
||||
might also guide the LLM or the vision service on how to analyze the image.
|
||||
|
||||
- `UserImageRequestFrame` has new fields `append_to_context` and `text`. Both fields
|
||||
will be used to set the same fields on the captured `UserImageRawFrame`.
|
||||
|
||||
- `UserImageRequestFrame` no longer requires a function call name and ID.
|
||||
|
||||
- Updated `MoondreamService` to process `UserImageRawFrame`.
|
||||
|
||||
- `VisionService` expects `UserImageRawFrame` in order to analyze images.
|
||||
|
||||
- `DailyTransport` triggers `on_error` event if transcription can't be started
|
||||
or stopped.
|
||||
|
||||
- `DailyTransport` updates: `start_dialout()` now returns two values:
|
||||
`session_id` and `error`. `start_recording()` now returns two values:
|
||||
`stream_id` and `error`.
|
||||
|
||||
- Updated `daily-python` to 0.21.0.
|
||||
|
||||
- `SimliVideoService` now accepts `api_key` and `face_id` parameters directly,
|
||||
with optional `params` for `max_session_length` and `max_idle_time`
|
||||
configuration, aligning with other Pipecat service patterns.
|
||||
|
||||
- Updated the default model to `sonic-3` for `CartesiaTTSService` and
|
||||
`CartesiaHttpTTSService`.
|
||||
|
||||
- `FunctionFilter` now has a `filter_system_frames` arg, which controls whether
|
||||
or not SystemFrames are filtered.
|
||||
|
||||
- Upgraded `aws_sdk_bedrock_runtime` to v0.1.1 to resolve potential CPU issues
|
||||
when running `AWSNovaSonicLLMService`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `expect_stripped_words` parameter of `LLMAssistantAggregatorParams` is
|
||||
ignored when used with the newer `LLMAssistantAggregator`, which now handles
|
||||
word spacing automatically.
|
||||
|
||||
- `LLMService.request_image_frame()` is deprecated, push a
|
||||
`UserImageRequestFrame` instead.
|
||||
|
||||
- `UserResponseAggregator` is deprecated and will be removed in a future version.
|
||||
|
||||
- The `send_transcription_frames` argument to `OpenAIRealtimeLLMService` is
|
||||
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||
handled by the user context aggregator. See "Added" section for details.
|
||||
|
||||
- Types in `pipecat.services.openai.realtime.context` and
|
||||
`pipecat.services.openai.realtime.frames` are deprecated, as they're no
|
||||
longer used by `OpenAIRealtimeLLMService`. See "Added" section for details.
|
||||
|
||||
- `SimliVideoService` `simli_config` parameter is deprecated. Use `api_key` and
|
||||
`face_id` parameters instead.
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed `enable_non_final_tokens` and `max_non_final_tokens_duration_ms` from
|
||||
`SonioxSTTService`.
|
||||
|
||||
- Removed the `aiohttp_session` arg from `SarvamTTSService` as it's no longer
|
||||
used.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `PipelineTask` issue that was causing an idle timeout for frames that
|
||||
were being generated but not reaching the end of the pipeline. Since the exact
|
||||
point when frames are discarded is unknown, we now monitor pipeline frames
|
||||
using an observer. If the observer detects frames are being generated, it will
|
||||
prevent the pipeline from being considered idle.
|
||||
|
||||
- Fixed an issue in `HumeTTSService` that was only using Octave 2, which does
|
||||
not support the `description` field. Now, if a description is provided, it
|
||||
switches to Octave 1.
|
||||
|
||||
- Fixed an issue where `DailyTransport` would timeout prematurely on join and on
|
||||
leave.
|
||||
|
||||
- Fixed an issue in the runner where starting a DailyTransport room via
|
||||
`/start` didn't support using the `DAILY_SAMPLE_ROOM_URL` env var.
|
||||
|
||||
- Fixed an issue in `ServiceSwitcher` where the `STTService`s would result in
|
||||
all STT services producing `TranscriptionFrame`s.
|
||||
|
||||
### Other
|
||||
|
||||
- Updated all vision 12-series foundational examples to load images from a file.
|
||||
|
||||
- Added 14-series video examples for different services. These new examples
|
||||
request an image from the user camera through a function call.
|
||||
|
||||
## [0.0.91] - 2025-10-21
|
||||
|
||||
### Added
|
||||
@@ -44,7 +427,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
(Note that even though `AWSNovaSonicLLMService` now supports the universal
|
||||
`LLMContext`, it is not meant to be swapped out for another LLM service at
|
||||
runtime.)
|
||||
runtime with `LLMSwitcher`.)
|
||||
|
||||
Worth noting: whether or not you use the new context-setup pattern with
|
||||
`AWSNovaSonicLLMService`, some types have changed under the hood:
|
||||
@@ -123,8 +506,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
deprecated. Transcription frames are now always sent. They go upstream, to be
|
||||
handled by the user context aggregator. See "Added" section for details.
|
||||
|
||||
- Types in `pipecat.services.aws.nova_sonic.context` have been deprecated due
|
||||
to changes to support `LLMContext`. See "Changed" section for details.
|
||||
- Types in `pipecat.services.aws.nova_sonic.context` are deprecated, as they're
|
||||
no longer used by `AWSNovaSonicLLMService`. See "Added" section for
|
||||
details.
|
||||
|
||||
### Fixed
|
||||
|
||||
@@ -1162,6 +1546,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added `SonioxSTTService` using Soniox's STT websocket API.
|
||||
|
||||
- Added `enable_emulated_vad_interruptions` to `LLMUserAggregatorParams`.
|
||||
When user speech is emulated (e.g. when a transcription is received but
|
||||
VAD doesn't detect speech), this parameter controls whether the emulated
|
||||
|
||||
26
README.md
26
README.md
@@ -72,19 +72,19 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
|
||||
@@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||
from pipecat.services.cartesia.tts import CartesiaHttpTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -59,7 +59,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
|
||||
tts = CartesiaHttpTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
|
||||
@@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -58,7 +58,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -20,10 +21,10 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.services.speechmatics.tts import SpeechmaticsTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -51,121 +52,127 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Speechmatics STT Service Example
|
||||
"""Speechmatics STT and TTS Service Example
|
||||
|
||||
This example demonstrates using Speechmatics Speech-to-Text service with speaker diarization and intelligent speaker management. Key features:
|
||||
This example demonstrates using Speechmatics Speech-to-Text and Text-to-Speech services
|
||||
with speaker diarization and intelligent speaker management. Key features:
|
||||
|
||||
1. Speaker Diarization
|
||||
1. Speaker Diarization (STT)
|
||||
- Automatically identifies and distinguishes between different speakers
|
||||
- First speaker is identified as 'S1', others get subsequent IDs
|
||||
- Uses `enable_diarization` parameter to manage speaker detection
|
||||
|
||||
2. Smart Speaker Control
|
||||
2. Smart Speaker Control (STT)
|
||||
- `focus_speakers` parameter lets you target specific speakers (e.g. ["S1"])
|
||||
- Other speakers will be wrapped in PASSIVE tags
|
||||
- Only processes speech from focused speakers
|
||||
- Words from all speakers are wrapped with XML tags for clear speaker identification
|
||||
- Other speakers' speech only sent when focused speaker is active
|
||||
|
||||
3. Voice Activity Detection
|
||||
3. Voice Activity Detection (STT)
|
||||
- Built-in VAD using `enable_vad` parameter
|
||||
- Remove `vad_analyzer` from `transport` config to use module's VAD
|
||||
- Emits speaker started/stopped events
|
||||
|
||||
4. Configuration Options
|
||||
4. Text-to-Speech (TTS)
|
||||
- Low latency streaming audio synthesis
|
||||
- Multiple voice options available including `sarah`, `theo`, and `megan`
|
||||
|
||||
5. Configuration Options
|
||||
- `operating_point` parameter defaults to `ENHANCED` for optimal accuracy
|
||||
- Configurable `end_of_utterance_silence_trigger` (default 0.5s)
|
||||
- Customizable speaker formatting
|
||||
- Additional diarization settings available
|
||||
|
||||
For detailed information about operating points and configuration:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
For detailed information:
|
||||
- STT: https://docs.speechmatics.com/rt-api-ref
|
||||
- TTS: https://docs.speechmatics.com/text-to-speech/quickstart
|
||||
"""
|
||||
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_vad=True,
|
||||
enable_diarization=True,
|
||||
focus_speakers=["S1"],
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_vad=True,
|
||||
enable_diarization=True,
|
||||
focus_speakers=["S1"],
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
tts = SpeechmaticsTTSService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
voice_id="sarah",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Sarah. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
await runner.run(task)
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -24,10 +25,10 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.services.speechmatics.tts import SpeechmaticsTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -61,100 +62,106 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Run example using Speechmatics STT.
|
||||
"""Run example using Speechmatics STT and TTS.
|
||||
|
||||
This example will use diarization within our STT service and output the words spoken by
|
||||
each individual speaker and wrap them with XML tags for the LLM to process. Note the
|
||||
instructions in the system context for the LLM. This greatly improves the conversation
|
||||
experience by allowing the LLM to understand who is speaking in a multi-party call.
|
||||
This example demonstrates a complete Speechmatics integration with both Speech-to-Text
|
||||
and Text-to-Speech services:
|
||||
|
||||
By default, this example will use our ENHANCED operating point, which is optimized for
|
||||
high accuracy. You can change this by setting the `operating_point` parameter to a different
|
||||
value.
|
||||
STT Features:
|
||||
- Diarization to identify and distinguish between different speakers
|
||||
- Words spoken by each speaker are wrapped with XML tags for LLM processing
|
||||
- System context instructions help the LLM understand multi-party conversations
|
||||
- ENHANCED operating point by default for optimal accuracy
|
||||
|
||||
For more information on operating points, see the Speechmatics documentation:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
TTS Features:
|
||||
- Low latency streaming audio synthesis
|
||||
- Multiple voice options available including `sarah`, `theo`, and `megan`
|
||||
|
||||
For more information:
|
||||
- STT: https://docs.speechmatics.com/rt-api-ref
|
||||
- TTS: https://docs.speechmatics.com/text-to-speech/quickstart
|
||||
"""
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_diarization=True,
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies."
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_diarization=True,
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
tts = SpeechmaticsTTSService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
voice_id="sarah",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Sarah. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies."
|
||||
),
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
await runner.run(task)
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
@@ -101,6 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@stt.event_handler("on_update")
async def on_deepgram_flux_update(stt, transcript):
    """Log the transcript delivered with the STT service's "on_update" event.

    Args:
        stt: The STT service that fired the event.
        transcript: The transcript payload passed to the handler.
    """
    # Fixed the "deeggram" misspelling so the log line is greppable.
    logger.debug(f"On Deepgram Flux update: {transcript}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
132
examples/foundational/07c-interruptible-deepgram-http.py
Normal file
132
examples/foundational/07c-interruptible-deepgram-http.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
def _voice_io_kwargs():
    """Return freshly built keyword arguments shared by every transport's params.

    The analyzers are created on each call, so nothing is instantiated until a
    transport factory below is actually invoked.
    """
    return {
        "audio_in_enabled": True,
        "audio_out_enabled": True,
        "vad_analyzer": SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        "turn_analyzer": LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    }


# Factories are stored instead of instances so that objects such as
# SileroVADAnalyzer are not created up front. A factory is only called once
# the desired transport has been selected.
transport_params = {
    "daily": lambda: DailyParams(**_voice_io_kwargs()),
    "twilio": lambda: FastAPIWebsocketParams(**_voice_io_kwargs()),
    "webrtc": lambda: TransportParams(**_voice_io_kwargs()),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a voice bot using Deepgram STT, Deepgram HTTP TTS and an OpenAI LLM.

    Builds the STT -> LLM -> TTS pipeline around the given transport, registers
    client connect/disconnect handlers, and runs the pipeline task until it
    finishes.

    Args:
        transport: Transport whose input/output processors bracket the pipeline.
        runner_args: Runner settings; provides the pipeline idle timeout and
            whether the runner should handle SIGINT.
    """
    logger.info("Starting bot")

    # The HTTP TTS service is given this session, so the whole pipeline run
    # stays inside the `async with` block while the session is open.
    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts = DeepgramHttpTTSService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-2-andromeda-en",
            aiohttp_session=session,
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = LLMContext(messages)
        context_aggregator = LLMContextAggregatorPair(context)

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info("Client connected")
            # Kick off the conversation.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info("Client disconnected")
            # Stop the pipeline once the client has gone away.
            await task.cancel()

        runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

        await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud.

    Creates the transport from the runner arguments and the module's
    `transport_params` table, then hands it to `run_bot`.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner is only loaded when this file is executed
    # as a script.
    from pipecat.runner.run import main as run_main

    run_main()
|
||||
@@ -61,8 +61,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = GoogleSTTService(
|
||||
params=GoogleSTTService.InputParams(languages=Language.EN_US),
|
||||
params=GoogleSTTService.InputParams(languages=Language.EN_US, model="chirp_3"),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
location="us",
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
|
||||
@@ -22,8 +22,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamHttpTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -63,7 +63,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="saarika:v2.5",
|
||||
)
|
||||
|
||||
tts = SarvamHttpTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
|
||||
@@ -24,8 +24,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -62,7 +62,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="saarika:v2.5",
|
||||
)
|
||||
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
|
||||
@@ -4,36 +4,25 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
@@ -43,49 +32,6 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -93,14 +39,12 @@ transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
@@ -110,33 +54,34 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# OpenAI GPT-4o for vision analysis
|
||||
openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
openai,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -151,16 +96,28 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
logger.info(f"Client connected")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
@@ -1,180 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
moondream,
|
||||
tts,
|
||||
transport.output(),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -4,36 +4,25 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -43,49 +32,6 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -93,14 +39,12 @@ transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
@@ -110,33 +54,34 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Anthropic for vision analysis
|
||||
anthropic = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
anthropic,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -151,16 +96,28 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
logger.info(f"Client connected")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
148
examples/foundational/12b-describe-image-aws.py
Normal file
148
examples/foundational/12b-describe-image-aws.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
# Note: usually, prefer providing latency="optimized" param.
|
||||
# Here we can't because AWS Bedrock doesn't support it for Claude 3.7,
|
||||
# which we need for image input.
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
messages.append(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
141
examples/foundational/12c-describe-image-gemini-flash.py
Normal file
141
examples/foundational/12c-describe-image-gemini-flash.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a voice bot that opens the conversation by describing an image.

    Builds a pipeline of Deepgram STT, a Google LLM and Cartesia TTS around the
    given transport. When a client connects, the image named in
    ``runner_args.body`` (falling back to ``assets/cat.jpg`` next to this
    script) is added to the LLM message list together with a question, and an
    ``LLMRunFrame`` is queued so the LLM responds.

    Args:
        transport: Transport providing the pipeline's input and output.
        runner_args: Runner arguments. ``body`` may carry ``image_path`` and
            ``question``; ``pipeline_idle_timeout_secs`` and ``handle_sigint``
            are forwarded to the task and the runner respectively.
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are also able to describe images.",
        },
    ]

    context = LLMContext(messages)
    context_aggregator = LLMContextAggregatorPair(context)

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,  # STT
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")

        # No request body supplied: fall back to the sample image and question.
        if not runner_args.body:
            script_dir = os.path.dirname(__file__)
            runner_args.body = {
                "image_path": os.path.join(script_dir, "assets", "cat.jpg"),
                "question": "Describe this image",
            }

        image_path = runner_args.body["image_path"]
        question = runner_args.body["question"]

        # Kick off the conversation.
        # Open the file in a context manager so its handle is released, and
        # convert to RGB so the raw bytes really match the "RGB" format
        # declared below (the source may be RGBA, palette or greyscale).
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        message = LLMContext.create_image_message(
            image=image.tobytes(),
            format="RGB",
            size=image.size,
            text=question,
        )
        # Append to the same list the context was constructed from.
        messages.append(message)
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Bot entry point, compatible with Pipecat Cloud.

    Creates the transport for ``runner_args`` from the ``transport_params``
    factories and then runs the bot on it.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner is only pulled in when this file is executed
    # directly as a script.
    from pipecat.runner.run import main as run_example

    run_example()
|
||||
122
examples/foundational/12d-describe-image-moondream.py
Normal file
122
examples/foundational/12d-describe-image-moondream.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import UserImageRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
def _output_transport_kwargs():
    """Build the keyword arguments shared by both transports of this example.

    Only called from the factories in ``transport_params``, so the VAD and
    smart-turn analyzers are constructed on demand rather than at import time.
    """
    return {
        "audio_out_enabled": True,
        "vad_analyzer": SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
        "turn_analyzer": LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
    }


# Each entry is a zero-argument factory instead of a ready-made params object,
# so objects such as SileroVADAnalyzer are not instantiated up front. The
# factory is called once the desired transport has been selected.
transport_params = {
    "daily": lambda: DailyParams(**_output_transport_kwargs()),
    "webrtc": lambda: TransportParams(**_output_transport_kwargs()),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a bot that describes an image aloud using Moondream.

    Builds a pipeline of the Moondream vision service, Cartesia TTS and the
    transport output. When a client connects, the image named in
    ``runner_args.body`` (falling back to ``assets/cat.jpg`` next to this
    script) is queued as a ``UserImageRawFrame`` together with a question.

    Args:
        transport: Transport providing the pipeline's output.
        runner_args: Runner arguments. ``body`` may carry ``image_path`` and
            ``question``; ``pipeline_idle_timeout_secs`` and ``handle_sigint``
            are forwarded to the task and the runner respectively.
    """
    logger.info("Starting bot")

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    vision = MoondreamService()

    pipeline = Pipeline(
        [
            vision,  # Vision
            tts,  # TTS
            transport.output(),  # Transport bot output
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")

        # No request body supplied: fall back to the sample image and question.
        if not runner_args.body:
            script_dir = os.path.dirname(__file__)
            runner_args.body = {
                "image_path": os.path.join(script_dir, "assets", "cat.jpg"),
                "question": "Describe this image",
            }

        image_path = runner_args.body["image_path"]
        question = runner_args.body["question"]

        # Describe the image.
        # Open the file in a context manager so its handle is released, and
        # convert to RGB so the raw bytes really match the "RGB" format
        # declared below (the source may be RGBA, palette or greyscale).
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        await task.queue_frames(
            [
                UserImageRawFrame(
                    image=image.tobytes(),
                    format="RGB",
                    size=image.size,
                    text=question,
                )
            ]
        )

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Bot entry point, compatible with Pipecat Cloud.

    Creates the transport for ``runner_args`` from the ``transport_params``
    factories and then runs the bot on it.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner is only pulled in when this file is executed
    # directly as a script.
    from pipecat.runner.run import main as run_example

    run_example()
|
||||
@@ -4,8 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -17,12 +15,13 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -39,34 +38,30 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Global variable to store the client ID
|
||||
client_id = ""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams):
|
||||
location = params.arguments["location"]
|
||||
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
|
||||
|
||||
|
||||
async def get_image(params: FunctionCallParams):
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request the image frame
|
||||
await params.llm.request_image_frame(
|
||||
user_id=client_id,
|
||||
function_name=params.function_name,
|
||||
tool_call_id=params.tool_call_id,
|
||||
text_content=question,
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
# Wait a short time for the frame to be processed
|
||||
await asyncio.sleep(0.5)
|
||||
await params.result_callback(None)
|
||||
|
||||
# Return a result to complete the function call
|
||||
await params.result_callback(
|
||||
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
|
||||
)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -100,70 +95,32 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
model="claude-3-7-sonnet-latest",
|
||||
params=AnthropicLLMService.InputParams(enable_prompt_caching=True),
|
||||
)
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_image", get_image)
|
||||
# Anthropic for vision analysis
|
||||
llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_weather",
|
||||
description="Get the current weather",
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"location": {
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
get_image_function = FunctionSchema(
|
||||
name="get_image",
|
||||
description="Get an image from the video stream.",
|
||||
properties={
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image.",
|
||||
}
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["question"],
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_image_function])
|
||||
|
||||
system_prompt = """\
|
||||
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
|
||||
|
||||
Your response will be turned into speech so use only simple words and punctuation.
|
||||
|
||||
You have access to two tools: get_weather and get_image.
|
||||
|
||||
You can respond to questions about the weather using the get_weather tool.
|
||||
|
||||
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
|
||||
indicate you should use the get_image tool are:
|
||||
- What do you see?
|
||||
- What's in the video?
|
||||
- Can you describe the video?
|
||||
- Tell me about what you see.
|
||||
- Tell me something interesting about what you see.
|
||||
- What's happening in the video?
|
||||
|
||||
If you need to use a tool, simply use the tool. Do not tell the user the tool you are using. Be brief and concise.
|
||||
"""
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": system_prompt,
|
||||
}
|
||||
],
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
{"role": "user", "content": "Start the conversation by introducing yourself."},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
@@ -173,11 +130,11 @@ If you need to use a tool, simply use the tool. Do not tell the user the tool yo
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User speech to text
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses and tool context
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -196,10 +153,16 @@ If you need to use a tool, simply use the tool. Do not tell the user the tool yo
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
global client_id
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -5,29 +5,23 @@
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -37,54 +31,37 @@ from pipecat.runner.utils import (
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
await params.result_callback(None)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
# Note: AWS Bedrock does not yet support the universal LLMContext
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -111,17 +88,15 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# AWS for vision analysis
|
||||
aws = AWSBedrockLLMService(
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
# Note: usually, prefer providing latency="optimized" param.
|
||||
@@ -129,22 +104,44 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# which we need for image input.
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
)
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
aws,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -165,10 +162,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
@@ -5,29 +5,23 @@
|
||||
#
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
TextFrame,
|
||||
TTSSpeakFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.user_response import UserResponseAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -37,53 +31,37 @@ from pipecat.runner.utils import (
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class UserImageRequester(FrameProcessor):
|
||||
"""Converts incoming text into requests for user images."""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
def __init__(self, participant_id: Optional[str] = None):
|
||||
super().__init__()
|
||||
self._participant_id = participant_id
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
def set_participant_id(self, participant_id: str):
|
||||
self._participant_id = participant_id
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
await params.result_callback(None)
|
||||
|
||||
if self._participant_id and isinstance(frame, TextFrame):
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(self._participant_id, context=frame.text),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class UserImageProcessor(FrameProcessor):
|
||||
"""Converts incoming user images into context frames."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserImageRawFrame):
|
||||
if frame.request and frame.request.context:
|
||||
context = LLMContext()
|
||||
context.add_image_frame_message(
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
size=frame.size,
|
||||
format=frame.format,
|
||||
)
|
||||
frame = LLMContextFrame(context)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -110,33 +88,53 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
user_response = UserResponseAggregator()
|
||||
|
||||
# Initialize the image requester without setting the participant ID yet
|
||||
image_requester = UserImageRequester()
|
||||
|
||||
image_processor = UserImageProcessor()
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Google Gemini model for vision analysis
|
||||
google = GoogleLLMService(model="gemini-2.0-flash-001", api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
# Google Gemini model for vision analysis
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_response,
|
||||
image_requester,
|
||||
image_processor,
|
||||
google,
|
||||
tts,
|
||||
transport.output(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -157,10 +155,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
image_requester.set_participant_id(client_id)
|
||||
|
||||
# Welcome message
|
||||
await task.queue_frame(TTSSpeakFrame("Hi there! Feel free to ask me about what I see."))
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
190
examples/foundational/14d-function-calling-moondream-video.py
Normal file
190
examples/foundational/14d-function-calling-moondream-video.py
Normal file
@@ -0,0 +1,190 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
get_transport_client_id,
|
||||
maybe_capture_participant_camera,
|
||||
)
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.moondream.vision import MoondreamService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image.
|
||||
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request a user image frame. In this case, we don't want the requested
|
||||
# image to be added to the context because we will process it with
|
||||
# Moondream.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
await params.result_callback(None)
|
||||
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
video_in_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# If you run into weird description, try with use_cpu=True
|
||||
moondream = MoondreamService()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
ParallelPipeline(
|
||||
[llm], # LLM
|
||||
[moondream],
|
||||
),
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
# Set the participant ID in the image requester
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -5,7 +5,6 @@
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -17,12 +16,13 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -39,34 +39,30 @@ from pipecat.transports.daily.transport import DailyParams
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Global variable to store the client ID
|
||||
client_id = ""
|
||||
async def fetch_user_image(params: FunctionCallParams):
|
||||
"""Fetch the user image and push it to the LLM.
|
||||
|
||||
|
||||
async def get_weather(params: FunctionCallParams):
|
||||
location = params.arguments["location"]
|
||||
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
|
||||
|
||||
|
||||
async def get_image(params: FunctionCallParams):
|
||||
When called, this function pushes a UserImageRequestFrame upstream to the
|
||||
transport. As a result, the transport will request the user image and push a
|
||||
UserImageRawFrame downstream which will be added to the context by the LLM
|
||||
assistant aggregator.
|
||||
"""
|
||||
user_id = params.arguments["user_id"]
|
||||
question = params.arguments["question"]
|
||||
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
|
||||
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
|
||||
|
||||
# Request the image frame
|
||||
await params.llm.request_image_frame(
|
||||
user_id=client_id,
|
||||
function_name=params.function_name,
|
||||
tool_call_id=params.tool_call_id,
|
||||
text_content=question,
|
||||
# Request a user image frame and indicate that it should be added to the
|
||||
# context.
|
||||
await params.llm.push_frame(
|
||||
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
|
||||
# Wait a short time for the frame to be processed
|
||||
await asyncio.sleep(0.5)
|
||||
await params.result_callback(None)
|
||||
|
||||
# Return a result to complete the function call
|
||||
await params.result_callback(
|
||||
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
|
||||
)
|
||||
# Instead of None, it's possible to also provide a tool call answer to
|
||||
# tell the LLM that we are grabbing the image to analyze.
|
||||
# await params.result_callback({"result": "Image is being captured."})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -101,58 +97,30 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_image", get_image)
|
||||
llm.register_function("fetch_user_image", fetch_user_image)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_weather",
|
||||
description="Get the current weather",
|
||||
fetch_image_function = FunctionSchema(
|
||||
name="fetch_user_image",
|
||||
description="Called when the user requests a description of their camera feed",
|
||||
properties={
|
||||
"location": {
|
||||
"user_id": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
"description": "The ID of the user to grab the image from",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
get_image_function = FunctionSchema(
|
||||
name="get_image",
|
||||
description="Get an image from the video stream.",
|
||||
properties={
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "The question that the user is asking about the image.",
|
||||
}
|
||||
"description": "The question that the user is asking about the image",
|
||||
},
|
||||
},
|
||||
required=["question"],
|
||||
required=["user_id", "question"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_image_function])
|
||||
tools = ToolsSchema(standard_tools=[fetch_image_function])
|
||||
|
||||
system_prompt = """\
|
||||
You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions.
|
||||
|
||||
Your response will be turned into speech so use only simple words and punctuation.
|
||||
|
||||
You have access to two tools: get_weather and get_image.
|
||||
|
||||
You can respond to questions about the weather using the get_weather tool.
|
||||
|
||||
You can answer questions about the user's video stream using the get_image tool. Some examples of phrases that \
|
||||
indicate you should use the get_image tool are:
|
||||
- What do you see?
|
||||
- What's in the video?
|
||||
- Can you describe the video?
|
||||
- Tell me about what you see.
|
||||
- Tell me something interesting about what you see.
|
||||
- What's happening in the video?
|
||||
"""
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way. You are able to describe images from the user camera.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
@@ -160,13 +128,13 @@ indicate you should use the get_image tool are:
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
context_aggregator.assistant(),
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -185,10 +153,15 @@ indicate you should use the get_image tool are:
|
||||
|
||||
await maybe_capture_participant_camera(transport, client)
|
||||
|
||||
global client_id
|
||||
client_id = get_transport_client_id(transport, client)
|
||||
|
||||
# Kick off the conversation.
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -75,7 +75,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# text_filters=[MarkdownTextFilter()],
|
||||
)
|
||||
|
||||
llm = NimLLMService(api_key=os.getenv("NVIDIA_API_KEY"), model="meta/llama-3.3-70b-instruct")
|
||||
llm = NimLLMService(
|
||||
api_key=os.getenv("NVIDIA_API_KEY"),
|
||||
model="nvidia/llama-3.3-nemotron-super-49b-v1.5",
|
||||
# Recommended when turning thinking off
|
||||
params=NimLLMService.InputParams(temperature=0.0),
|
||||
)
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
@@ -102,6 +107,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
messages = [
|
||||
# Disable thinking by sending this message first
|
||||
# Check the model for the corresponding "no thinking" message
|
||||
{"role": "system", "content": "/no_think"},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
|
||||
@@ -77,7 +77,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but try to be brief.",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
@@ -13,13 +14,21 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
TranscriptionMessage,
|
||||
)
|
||||
from pipecat.observers.loggers.transcription_log_observer import TranscriptionLogObserver
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -52,6 +61,18 @@ async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
)
|
||||
|
||||
|
||||
async def get_news(params: FunctionCallParams):
|
||||
await params.result_callback(
|
||||
{
|
||||
"news": [
|
||||
"Massive UFO currently hovering above New York City",
|
||||
"Stock markets reach all-time highs",
|
||||
"Living dinosaur species discovered in the Amazon rainforest",
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
@@ -73,6 +94,13 @@ weather_function = FunctionSchema(
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
get_news_function = FunctionSchema(
|
||||
name="get_news",
|
||||
description="Get the current news.",
|
||||
properties={},
|
||||
required=[],
|
||||
)
|
||||
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
@@ -126,6 +154,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
noise_reduction=InputAudioNoiseReduction(type="near_field"),
|
||||
)
|
||||
),
|
||||
# In this example we provide tools through the context, but you could
|
||||
# alternatively provide them here.
|
||||
# tools=tools,
|
||||
instructions="""You are a helpful and friendly AI.
|
||||
|
||||
@@ -140,10 +170,6 @@ even if you're asked about them.
|
||||
You are participating in a voice conversation. Keep your responses concise, short, and to the point
|
||||
unless specifically asked to elaborate on a topic.
|
||||
|
||||
You have access to the following tools:
|
||||
- get_current_weather: Get the current weather for a given location.
|
||||
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
|
||||
|
||||
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
|
||||
)
|
||||
|
||||
@@ -157,25 +183,26 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# llm.register_function(None, fetch_weather_from_api)
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("get_news", get_news)
|
||||
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(),
|
||||
transcript.user(), # LLM pushes TranscriptionFrames upstream
|
||||
llm, # LLM
|
||||
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # After the transcript output, to time with the audio output
|
||||
context_aggregator.assistant(),
|
||||
@@ -198,6 +225,22 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
# Add a new tool at runtime after a delay.
|
||||
await asyncio.sleep(15)
|
||||
new_tools = ToolsSchema(
|
||||
standard_tools=[weather_function, restaurant_function, get_news_function]
|
||||
)
|
||||
await task.queue_frames([LLMSetToolsFrame(tools=new_tools)])
|
||||
# Alternative pattern, useful if you're changing other session properties, too.
|
||||
# (Though note that tools in your LLMContext take precedence over those
|
||||
# in session properties, so if you have context-provided tools, prefer
|
||||
# LLMSetToolsFrame instead, as it updates your context. Ditto for
|
||||
# updating system instructions: send an LLMMessagesUpdateFrame with
|
||||
# context messages updated with your new desired system message.)
|
||||
# await task.queue_frames(
|
||||
# [LLMUpdateSettingsFrame(settings=SessionProperties(tools=new_tools).model_dump())]
|
||||
# )
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
@@ -18,7 +18,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
||||
@@ -155,10 +157,10 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# Create a standard LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeBetaLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
# [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
|
||||
# [
|
||||
@@ -173,7 +175,7 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -18,7 +18,8 @@ from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -169,20 +170,20 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
|
||||
# Create a standard OpenAI LLM context object using the normal messages format. The
|
||||
# OpenAIRealtimeLLMService will convert this internally to messages that the
|
||||
# openai WebSocket API can understand.
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello!"}],
|
||||
tools,
|
||||
)
|
||||
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(),
|
||||
transcript.user(), # LLM pushes TranscriptionFrames upstream
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
|
||||
transport.output(), # Transport bot output
|
||||
transcript.assistant(), # After the transcript output, to time with the audio output
|
||||
context_aggregator.assistant(),
|
||||
|
||||
@@ -13,14 +13,15 @@ from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -69,11 +70,11 @@ async def save_conversation(params: FunctionCallParams):
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
|
||||
filename = f"{BASE_FILENAME}{timestamp}.json"
|
||||
logger.debug(
|
||||
f"writing conversation to {filename}\n{json.dumps(params.context.messages, indent=4)}"
|
||||
f"writing conversation to {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
)
|
||||
try:
|
||||
with open(filename, "w") as file:
|
||||
messages = params.context.get_messages_for_persistent_storage()
|
||||
messages = params.context.get_messages()
|
||||
# remove the last message, which is the instruction we just gave to save the conversation
|
||||
messages.pop()
|
||||
json.dump(messages, file, indent=2)
|
||||
@@ -90,6 +91,10 @@ async def load_conversation(params: FunctionCallParams):
|
||||
with open(filename, "r") as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
await params.llm.reset_conversation()
|
||||
# NOTE: we manually create a response here rather than relying
|
||||
# on the function callback to trigger one since we've reset the
|
||||
# conversation so the remote service doesn't know about the
|
||||
# in-progress tool call.
|
||||
await params.llm._create_response()
|
||||
except Exception as e:
|
||||
await params.result_callback({"success": False, "error": str(e)})
|
||||
@@ -97,14 +102,12 @@ async def load_conversation(params: FunctionCallParams):
|
||||
asyncio.create_task(_reset())
|
||||
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
tools = ToolsSchema(
|
||||
standard_tools=[
|
||||
FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
@@ -115,45 +118,33 @@ tools = [
|
||||
"description": "The temperature unit to use. Infer this from the users location.",
|
||||
},
|
||||
},
|
||||
"required": ["location", "format"],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "save_conversation",
|
||||
"description": "Save the current conversatione. Use this function to persist the current conversation to external storage.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": [],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_saved_conversation_filenames",
|
||||
"description": "Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": [],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "load_conversation",
|
||||
"description": "Load a conversation history. Use this function to load a conversation history into the current session.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
required=["location", "format"],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="save_conversation",
|
||||
description="Save the current conversatione. Use this function to persist the current conversation to external storage.",
|
||||
properties={},
|
||||
required=[],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="get_saved_conversation_filenames",
|
||||
description="Get a list of saved conversation histories. Returns a list of filenames. Each filename includes a date and timestamp. Each file is conversation history that can be loaded into this session.",
|
||||
properties={},
|
||||
required=[],
|
||||
),
|
||||
FunctionSchema(
|
||||
name="load_conversation",
|
||||
description="Load a conversation history. Use this function to load a conversation history into the current session.",
|
||||
properties={
|
||||
"filename": {
|
||||
"type": "string",
|
||||
"description": "The filename of the conversation history to load.",
|
||||
}
|
||||
},
|
||||
"required": ["filename"],
|
||||
},
|
||||
},
|
||||
]
|
||||
required=["filename"],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
@@ -224,8 +215,8 @@ Remember, your responses should be short. Just one or two sentences, usually."""
|
||||
llm.register_function("get_saved_conversation_filenames", get_saved_conversation_filenames)
|
||||
llm.register_function("load_conversation", load_conversation)
|
||||
|
||||
context = OpenAILLMContext([], tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext([{"role": "user", "content": "Say hello!"}], tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.transcript_processor import TranscriptProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -72,7 +74,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# inference_on_context_initialization=False,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -90,7 +92,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# },
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
transcript = TranscriptProcessor()
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -139,10 +141,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
# You can provide the system instructions and tools in the context rather
|
||||
# than as arguments to GeminiLiveLLMService, but note that doing so will
|
||||
# trigger a (fast) reconnection when the GeminiLiveLLMService first
|
||||
# receives the context (i.e. when we send the LLMRunFrame below).
|
||||
context = LLMContext(
|
||||
[
|
||||
# {"role": "system", "content": system_instruction},
|
||||
{"role": "user", "content": "Say hello."},
|
||||
],
|
||||
# tools,
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -17,7 +17,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import (
|
||||
create_transport,
|
||||
@@ -65,7 +67,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# inference_on_context_initialization=False,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -73,7 +75,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
},
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,8 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -109,8 +110,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Set up conversation context and management
|
||||
# The context_aggregator will automatically collect conversation context
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -90,7 +92,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -98,7 +100,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
}
|
||||
],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -16,7 +16,9 @@ from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -129,7 +131,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mime_type = "text/plain"
|
||||
|
||||
# Create context with file reference
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -152,7 +154,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
except Exception as e:
|
||||
logger.error(f"Error uploading file: {e}")
|
||||
# Continue with a basic context if file upload fails
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -162,7 +164,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
# Create context aggregator
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Build the pipeline
|
||||
pipeline = Pipeline(
|
||||
|
||||
@@ -10,7 +10,9 @@ from pipecat.frames.frames import Frame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -124,8 +126,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
# Set up conversation context and management
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -9,21 +9,21 @@ import os
|
||||
from datetime import datetime
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from google.genai.types import HttpOptions
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -139,10 +139,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext([{"role": "user", "content": "Say hello."}])
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -18,7 +18,9 @@ from pipecat.frames.frames import EndTaskFrame, LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -62,7 +64,7 @@ You have three tools available to you:
|
||||
|
||||
After you've responded to the user three times, do two things, in order:
|
||||
1. Politely let them know that that's all the time you have today and say goodbye.
|
||||
2. Call the end_conversation tool to gracefully end the conversation.
|
||||
2. *WITHOUT WAITING FOR THE USER TO RESPOND*, call the end_conversation tool to gracefully end the conversation.
|
||||
"""
|
||||
|
||||
|
||||
@@ -152,10 +154,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
llm.register_function("end_conversation", end_conversation)
|
||||
|
||||
context = OpenAILLMContext(
|
||||
context = LLMContext(
|
||||
[{"role": "user", "content": "Say hello."}],
|
||||
)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -9,7 +9,6 @@ import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from simli import SimliConfig
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
@@ -66,11 +65,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="a167e0f3-df7e-4d52-a9c3-f949145efdab",
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
|
||||
)
|
||||
|
||||
simli_ai = SimliVideoService(
|
||||
SimliConfig(os.getenv("SIMLI_API_KEY"), os.getenv("SIMLI_FACE_ID")),
|
||||
api_key=os.getenv("SIMLI_API_KEY"),
|
||||
face_id="cace3ef7-a4c4-425d-a8cf-a5358eb0c427",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini")
|
||||
|
||||
@@ -20,6 +20,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -78,9 +79,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
rtvi,
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
@@ -96,15 +100,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
# Kick off the conversation
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
@@ -63,10 +64,12 @@ class UrlToImageProcessor(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
def extract_url(self, text: str):
|
||||
pattern = r"!\[[^\]]*\]\((https?://[^)]+\.(png|jpg|jpeg|PNG|JPG|JPEG))\)"
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
data = json.loads(text)
|
||||
if "artObject" in data:
|
||||
return data["artObject"]["webImage"]["url"]
|
||||
if "artworks" in data and len(data["artworks"]):
|
||||
return data["artworks"][0]["webImage"]["url"]
|
||||
|
||||
return None
|
||||
|
||||
async def run_image_process(self, image_url: str):
|
||||
@@ -130,9 +133,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mcp = MCPClient(
|
||||
server_params=StdioServerParameters(
|
||||
command=shutil.which("npx"),
|
||||
args=["-y", "@programcomputer/nasa-mcp-server@latest"],
|
||||
# https://api.nasa.gov
|
||||
env={"NASA_API_KEY": os.getenv("NASA_API_KEY")},
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-rijksmuseum"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -141,15 +144,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
mcp_image = UrlToImageProcessor(aiohttp_session=session)
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to a number of tools provided by NASA MCP. Use any and all tools to help users.
|
||||
When asked for the astronomy picture of the day, PASS in NO date to the API.
|
||||
This ensures we get the latest picture available. If as specific date is asked for, you
|
||||
can pass in that date to the API.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
Don't overexplain what you are doing.
|
||||
@@ -206,14 +214,13 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY"):
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY environment variable for this example. See https://api.nasa.gov"
|
||||
f"Please set RIJKSMUSEUM_API_KEY environment variable for this example. See https://github.com/r-huijts/rijksmuseum-mcp and https://www.rijksmuseum.nl/en/register?redirectUrl=https://www.https://www.rijksmuseum.nl/en/rijksstudio/my/profile"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -79,7 +79,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
|
||||
@@ -132,9 +132,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
You have access to a number of tools provided by NASA MCP. Use any and all tools to help users.
|
||||
When asked for today's date, use 'https://www.datetoday.net/'.
|
||||
When asked for the astronomy picture of the day, use 'https://www.datetoday.net/', to get today's date.
|
||||
You have access to tools to search the Rijksmuseum collection.
|
||||
Offer, for example, to show the earliest Rembrandt work from the museum. Use the `search_artwork` tool.
|
||||
The tool may respond with a JSON object with an `artworks` array. Choose the art from that array.
|
||||
Once the tool has responded, tell the user the title and use the `open_image_in_browser` tool.
|
||||
Your output will be converted to audio so don't include special characters in your answers.
|
||||
Respond to what the user said in a creative and helpful way.
|
||||
Don't overexplain what you are doing.
|
||||
@@ -147,13 +148,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
mcp = MCPClient(
|
||||
server_params=StdioServerParameters(
|
||||
command=shutil.which("npx"),
|
||||
args=["-y", "@programcomputer/nasa-mcp-server@latest"],
|
||||
# https://api.nasa.gov
|
||||
env={"NASA_API_KEY": os.getenv("NASA_API_KEY")},
|
||||
# https://github.com/r-huijts/rijksmuseum-mcp
|
||||
args=["-y", "mcp-server-error setting up mcp"],
|
||||
env={"RIJKSMUSEUM_API_KEY": os.getenv("RIJKSMUSEUM_API_KEY")},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"error setting up nasa mcp")
|
||||
logger.error(f"error setting up rijksmuseum mcp")
|
||||
logger.exception("error trace:")
|
||||
try:
|
||||
# https://docs.mcp.run/integrating/tutorials/mcp-run-sse-openai-agents/
|
||||
@@ -164,8 +165,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp.run")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
run_tools = await mcp_run.register_tools(llm)
|
||||
tools = {}
|
||||
run_tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
run_tools = await mcp_run.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
all_standard_tools = run_tools.standard_tools + tools.standard_tools
|
||||
all_tools = ToolsSchema(standard_tools=all_standard_tools)
|
||||
@@ -219,9 +226,9 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
if not os.getenv("RIJKSMUSEUM_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY and MCP_RUN_SSE_URL environment variables. See https://api.nasa.gov and https://mcp.run"
|
||||
f"Please set RIJKSMUSEUM_API_KEY and MCP_RUN_SSE_URL environment variables. See https://github.com/r-huijts/rijksmuseum-mcp and https://mcp.run"
|
||||
)
|
||||
import sys
|
||||
|
||||
|
||||
@@ -85,7 +85,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.error(f"error setting up mcp")
|
||||
logger.exception("error trace:")
|
||||
|
||||
tools = await mcp.register_tools(llm)
|
||||
tools = {}
|
||||
try:
|
||||
tools = await mcp.register_tools(llm)
|
||||
except Exception as e:
|
||||
logger.error(f"error registering tools")
|
||||
logger.exception("error trace:")
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a WebRTC call.
|
||||
|
||||
165
examples/foundational/39d-mcp-run-http-gemini-live.py
Normal file
165
examples/foundational/39d-mcp-run-http-gemini-live.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from mcp.client.session_group import StreamableHttpParameters
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import NOT_GIVEN, LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.mcp_service import MCPClient
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
def _audio_transport_params(params_cls):
    """Return a zero-argument factory that builds ``params_cls`` with the shared audio settings.

    Nothing is instantiated until the returned callable is invoked, so the
    VAD and turn analyzers are only created for the transport that is used.
    """

    def build():
        return params_cls(
            audio_in_enabled=True,
            audio_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
            turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
        )

    return build


# Every supported transport uses the same audio/VAD/turn-detection settings;
# only the params class differs.
transport_params = {
    "daily": _audio_transport_params(DailyParams),
    "twilio": _audio_transport_params(FastAPIWebsocketParams),
    "webrtc": _audio_transport_params(TransportParams),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a Gemini Live bot whose tools are provided by the GitHub MCP server.

    Creates an MCP client for the GitHub Copilot MCP endpoint, fetches its
    tool schema, passes the schema to ``GeminiLiveLLMService`` and registers
    the tool handlers on it, then runs the pipeline until the task ends or
    the client disconnects.

    MCP setup is best effort: if the client cannot be created or the tool
    schema cannot be fetched, the error is logged and the bot still starts,
    just without MCP tools.

    Args:
        transport: Transport supplying the ``input()``/``output()`` processors
            and the client connect/disconnect events.
        runner_args: Runner settings; ``pipeline_idle_timeout_secs`` and
            ``handle_sigint`` are read from it.
    """
    logger.info("Starting bot")

    # GitHub MCP docs: https://github.com/github/github-mcp-server
    # Enable GitHub Copilot on your GitHub account. Free tier is ok. (https://github.com/settings/copilot)
    # Generate a personal access token. It must be a Fine-grained token, classic tokens are not supported. (https://github.com/settings/personal-access-tokens)
    # Set permissions you want to use (eg. "all repositories", "profile: read/write", etc)
    mcp = None  # stays None when client construction fails, so later steps can be skipped
    try:
        mcp = MCPClient(
            server_params=StreamableHttpParameters(
                url="https://api.githubcopilot.com/mcp/",
                headers={"Authorization": f"Bearer {os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')}"},
            )
        )
    except Exception:
        logger.error("error setting up mcp")
        logger.exception("error trace:")

    tools = {}
    tools_fetched = False
    if mcp is not None:
        try:
            tools = await mcp.get_tools_schema()
            tools_fetched = True
        except Exception:
            logger.error("error registering tools")
            logger.exception("error trace:")

    system = """
    You are a helpful LLM in a WebRTC call.
    Your goal is to answer questions about the user's GitHub repositories and account.
    You have access to a number of tools provided by Github. Use any and all tools to help users.
    Your output will be converted to audio so don't include special characters in your answers.
    Don't overexplain what you are doing.
    Just respond with short sentences when you are carrying out tool calls.
    """

    llm = GeminiLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        system_instruction=system,
        tools=tools,
    )

    # Only wire up tool handlers when a real schema was fetched; the empty
    # fallback is not a schema object and there would be nothing to register.
    if tools_fetched:
        await mcp.register_tools_schema(tools, llm)

    context = LLMContext([{"role": "user", "content": "Please introduce yourself."}])
    context_aggregator = LLMContextAggregatorPair(context)

    # Transport audio goes straight to the LLM service; no separate STT/TTS
    # processors are part of this pipeline.
    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            context_aggregator.user(),  # User spoken responses
            llm,  # LLM
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses and tool context
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected: {client}")
        # Kick off the conversation.
        await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN"):
|
||||
logger.error(
|
||||
f"Please set GITHUB_PERSONAL_ACCESS_TOKEN environment variable for this example."
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -15,7 +15,9 @@ from pipecat.frames.frames import Frame, InputImageRawFrame, LLMRunFrame, Output
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import LLMAssistantAggregatorParams
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
@@ -108,8 +110,8 @@ async def run_bot(pipecat_transport):
|
||||
}
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# RTVI events for Pipecat client UI
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
@@ -10,11 +10,14 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ManuallySwitchServiceFrame
|
||||
from pipecat.pipeline.llm_switcher import LLMSwitcher
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.service_switcher import ServiceSwitcher, ServiceSwitcherStrategyManual
|
||||
@@ -28,6 +31,7 @@ from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -35,6 +39,23 @@ from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# "Classic" function
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
# "Direct" function
|
||||
async def get_restaurant_recommendation(params: FunctionCallParams, location: str):
|
||||
"""
|
||||
Get a restaurant recommendation.
|
||||
|
||||
Args:
|
||||
location (str): The city and state, e.g. "San Francisco, CA".
|
||||
"""
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -63,6 +84,23 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
stt_cartesia = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
stt_deepgram = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt_switcher = ServiceSwitcher(
|
||||
@@ -80,9 +118,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm_openai = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm_google = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm_switcher = ServiceSwitcher(
|
||||
services=[llm_openai, llm_google], strategy_type=ServiceSwitcherStrategyManual
|
||||
llm_switcher = LLMSwitcher(
|
||||
llms=[llm_openai, llm_google], strategy_type=ServiceSwitcherStrategyManual
|
||||
)
|
||||
# Register a "classic" function
|
||||
llm_switcher.register_function("get_current_weather", fetch_weather_from_api)
|
||||
# Register a "direct" function
|
||||
llm_switcher.register_direct_function(get_restaurant_recommendation)
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -90,8 +132,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
tools = ToolsSchema(standard_tools=[weather_function, get_restaurant_recommendation])
|
||||
|
||||
context = LLMContext(messages)
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
|
||||
BIN
examples/foundational/assets/cat.jpg
Normal file
BIN
examples/foundational/assets/cat.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 63 KiB |
@@ -55,7 +55,7 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
cerebras = []
|
||||
deepseek = []
|
||||
daily = [ "daily-python~=0.20.0" ]
|
||||
daily = [ "daily-python~=0.21.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0" ]
|
||||
elevenlabs = [ "pipecat-ai[websockets-base]" ]
|
||||
fal = [ "fal-client~=0.5.9" ]
|
||||
@@ -91,17 +91,17 @@ playht = [ "pipecat-ai[websockets-base]" ]
|
||||
qwen = []
|
||||
rime = [ "pipecat-ai[websockets-base]" ]
|
||||
riva = [ "nvidia-riva-client~=2.21.1" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.117.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.122.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"]
|
||||
sambanova = []
|
||||
sarvam = [ "pipecat-ai[websockets-base]" ]
|
||||
sarvam = [ "sarvamai==0.1.21", "pipecat-ai[websockets-base]" ]
|
||||
sentry = [ "sentry-sdk>=2.28.0,<3" ]
|
||||
local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ]
|
||||
local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1,<2" ]
|
||||
remote-smart-turn = []
|
||||
silero = [ "onnxruntime>=1.20.1,<2" ]
|
||||
simli = [ "simli-ai~=0.1.10"]
|
||||
simli = [ "simli-ai~=0.1.25"]
|
||||
soniox = [ "pipecat-ai[websockets-base]" ]
|
||||
soundfile = [ "soundfile~=0.13.0" ]
|
||||
soundfile = [ "soundfile~=0.13.1" ]
|
||||
speechmatics = [ "speechmatics-rt>=0.5.0" ]
|
||||
strands = [ "strands-agents>=1.9.1,<2" ]
|
||||
tavus=[]
|
||||
@@ -109,7 +109,7 @@ together = []
|
||||
tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ]
|
||||
ultravox = [ "transformers>=4.48.0", "vllm>=0.9.0" ]
|
||||
webrtc = [ "aiortc>=1.13.0,<2", "opencv-python>=4.11.0.86,<5" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.117.0" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.122.0" ]
|
||||
websockets-base = [ "websockets>=13.1,<16.0" ]
|
||||
whisper = [ "faster-whisper~=1.1.1" ]
|
||||
|
||||
|
||||
@@ -10,9 +10,10 @@ import os
|
||||
import re
|
||||
import time
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import aiofiles
|
||||
from deepgram import LiveOptions
|
||||
@@ -53,6 +54,14 @@ EVAL_TIMEOUT_SECS = 120
|
||||
EvalPrompt = str | Tuple[str, ImageFile]
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvalConfig:
|
||||
prompt: EvalPrompt
|
||||
eval: str
|
||||
eval_speaks_first: bool = False
|
||||
runner_args_body: Optional[Any] = None
|
||||
|
||||
|
||||
class EvalRunner:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -93,9 +102,7 @@ class EvalRunner:
|
||||
async def run_eval(
|
||||
self,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
eval_config: EvalConfig,
|
||||
):
|
||||
if not re.match(self._pattern, example_file):
|
||||
return
|
||||
@@ -112,10 +119,8 @@ class EvalRunner:
|
||||
|
||||
try:
|
||||
tasks = [
|
||||
asyncio.create_task(run_example_pipeline(script_path)),
|
||||
asyncio.create_task(
|
||||
run_eval_pipeline(self, example_file, prompt, eval, user_speaks_first)
|
||||
),
|
||||
asyncio.create_task(run_example_pipeline(script_path, eval_config)),
|
||||
asyncio.create_task(run_eval_pipeline(self, example_file, eval_config)),
|
||||
]
|
||||
_, pending = await asyncio.wait(tasks, timeout=EVAL_TIMEOUT_SECS)
|
||||
if pending:
|
||||
@@ -177,7 +182,7 @@ class EvalRunner:
|
||||
return os.path.join(self._recordings_dir, f"{base_name}.wav")
|
||||
|
||||
|
||||
async def run_example_pipeline(script_path: Path):
|
||||
async def run_example_pipeline(script_path: Path, eval_config: EvalConfig):
|
||||
room_url = os.getenv("DAILY_SAMPLE_ROOM_URL")
|
||||
|
||||
module = load_module_from_path(script_path)
|
||||
@@ -196,6 +201,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
|
||||
runner_args = RunnerArguments()
|
||||
runner_args.pipeline_idle_timeout_secs = PIPELINE_IDLE_TIMEOUT_SECS
|
||||
runner_args.body = eval_config.runner_args_body
|
||||
|
||||
await module.run_bot(transport, runner_args)
|
||||
|
||||
@@ -203,9 +209,7 @@ async def run_example_pipeline(script_path: Path):
|
||||
async def run_eval_pipeline(
|
||||
eval_runner: EvalRunner,
|
||||
example_file: str,
|
||||
prompt: EvalPrompt,
|
||||
eval: str,
|
||||
user_speaks_first: bool = False,
|
||||
eval_config: EvalConfig,
|
||||
):
|
||||
logger.info(f"Starting eval bot")
|
||||
|
||||
@@ -240,10 +244,10 @@ async def run_eval_pipeline(
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
llm.register_function("assert_eval", eval_runner.assert_eval)
|
||||
llm.register_function("eval_function", eval_runner.assert_eval)
|
||||
|
||||
eval_function = FunctionSchema(
|
||||
name="assert_eval",
|
||||
name="eval_function",
|
||||
description="Called when the user answers a question.",
|
||||
properties={
|
||||
"result": {
|
||||
@@ -262,20 +266,21 @@ async def run_eval_pipeline(
|
||||
# Load example prompt depending on image.
|
||||
example_prompt = ""
|
||||
example_image: Optional[ImageFile] = None
|
||||
if isinstance(prompt, str):
|
||||
example_prompt = prompt
|
||||
elif isinstance(prompt, tuple):
|
||||
example_prompt, example_image = prompt
|
||||
if isinstance(eval_config.prompt, str):
|
||||
example_prompt = eval_config.prompt
|
||||
elif isinstance(eval_config.prompt, tuple):
|
||||
example_prompt, example_image = eval_config.prompt
|
||||
|
||||
eval_prompt = f"The answer is correct if it matches: {eval}."
|
||||
common_system_prompt = (
|
||||
"The user might say things other than the answer and that's allowed. "
|
||||
f"You should only call the eval function with your assessment when the user actually answers the question. {eval_prompt}"
|
||||
"You should only call the eval function if:\n"
|
||||
"- The user explicitly attempts to answer the question, AND\n"
|
||||
f"- Their answer can be cleanly evaluated using: {eval_config.eval}\n"
|
||||
"Ignore greetings, comments, non-answers, or requests for clarification."
|
||||
)
|
||||
if user_speaks_first:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
if eval_config.eval_speaks_first:
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
else:
|
||||
system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}"
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. First, ask one question: {example_prompt}. {common_system_prompt}"
|
||||
|
||||
messages = [
|
||||
{
|
||||
@@ -330,9 +335,9 @@ async def run_eval_pipeline(
|
||||
|
||||
# Default behavior is for the bot to speak first
|
||||
# If the eval bot speaks first, we append the prompt to the messages
|
||||
if user_speaks_first:
|
||||
if eval_config.eval_speaks_first:
|
||||
messages.append(
|
||||
{"role": "user", "content": f"Start by saying this exactly: '{prompt}'"}
|
||||
{"role": "user", "content": f"Start by saying this exactly: '{eval_config.prompt}'"}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from eval import EvalRunner
|
||||
from eval import EvalConfig, EvalRunner
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
from utils import check_env_variables
|
||||
@@ -24,189 +24,184 @@ ASSETS_DIR = SCRIPT_DIR / "assets"
|
||||
|
||||
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
|
||||
|
||||
# Speaking order constants
|
||||
USER_SPEAKS_FIRST = True
|
||||
BOT_SPEAKS_FIRST = False
|
||||
|
||||
# Math
|
||||
PROMPT_SIMPLE_MATH = "A simple math addition."
|
||||
EVAL_SIMPLE_MATH = "Correct math addition."
|
||||
|
||||
# Weather
|
||||
PROMPT_WEATHER = "What's the weather in San Francisco?"
|
||||
EVAL_WEATHER = (
|
||||
"Something specific about the current weather in San Francisco, including the degrees."
|
||||
EVAL_SIMPLE_MATH = EvalConfig(
|
||||
prompt="A simple math addition.",
|
||||
eval="The user answers the math addition correctly.",
|
||||
)
|
||||
|
||||
# Online search
|
||||
PROMPT_ONLINE_SEARCH = "What's the date right now in London?"
|
||||
EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y')}."
|
||||
EVAL_WEATHER = EvalConfig(
|
||||
prompt="What's the weather in San Francisco?",
|
||||
eval="The user says something specific about the current weather in San Francisco, including the degrees.",
|
||||
)
|
||||
|
||||
# Switch language
|
||||
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
|
||||
EVAL_SWITCH_LANGUAGE = "The user is now talking in Spanish."
|
||||
EVAL_ONLINE_SEARCH = EvalConfig(
|
||||
prompt="What's the date right now in London?",
|
||||
eval=f"The user says today is {datetime.now(timezone.utc).strftime('%B %d, %Y')} in London.",
|
||||
)
|
||||
|
||||
# Vision
|
||||
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
|
||||
EVAL_VISION = "A cat description."
|
||||
EVAL_SWITCH_LANGUAGE = EvalConfig(
|
||||
prompt="Say something in Spanish.",
|
||||
eval="The user talks in Spanish.",
|
||||
)
|
||||
|
||||
EVAL_VISION_CAMERA = EvalConfig(
|
||||
prompt=("Briefly describe what you see.", Image.open(ASSETS_DIR / "cat.jpg")),
|
||||
eval="The user provides a cat description.",
|
||||
)
|
||||
|
||||
|
||||
def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
|
||||
return EvalConfig(
|
||||
prompt="Briefly describe this image.",
|
||||
eval="The user provides a cat description.",
|
||||
eval_speaks_first=eval_speaks_first,
|
||||
runner_args_body={
|
||||
"image_path": ASSETS_DIR / "cat.jpg",
|
||||
"question": "Briefly describe this image.",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
EVAL_VOICEMAIL = EvalConfig(
|
||||
prompt="Please leave a message.",
|
||||
eval="The user leaves a voicemail message.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
EVAL_CONVERSATION = EvalConfig(
|
||||
prompt="Hello, this is Mark.",
|
||||
eval="The user replies with a greeting.",
|
||||
eval_speaks_first=True,
|
||||
)
|
||||
|
||||
# Voicemail
|
||||
PROMPT_VOICEMAIL = "Please leave a message after the beep."
|
||||
EVAL_VOICEMAIL = "Assess the conversation and determine if it is a voicemail."
|
||||
PROMPT_CONVERSATION = "Hello, this is Mark."
|
||||
EVAL_CONVERSATION = "A start of a conversation, not a voicemail."
|
||||
|
||||
TESTS_07 = [
|
||||
# 07 series
|
||||
("07-interruptible.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07d-interruptible-elevenlabs-http.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07m-interruptible-aws-strands.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07s-interruptible-google-audio-in.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ae-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07-interruptible.py", EVAL_SIMPLE_MATH),
|
||||
("07-interruptible-cartesia-http.py", EVAL_SIMPLE_MATH),
|
||||
("07a-interruptible-speechmatics.py", EVAL_SIMPLE_MATH),
|
||||
("07aa-interruptible-soniox.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
|
||||
("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram-http.py", EVAL_SIMPLE_MATH),
|
||||
("07d-interruptible-elevenlabs.py", EVAL_SIMPLE_MATH),
|
||||
("07d-interruptible-elevenlabs-http.py", EVAL_SIMPLE_MATH),
|
||||
("07f-interruptible-azure.py", EVAL_SIMPLE_MATH),
|
||||
("07g-interruptible-openai.py", EVAL_SIMPLE_MATH),
|
||||
("07h-interruptible-openpipe.py", EVAL_SIMPLE_MATH),
|
||||
("07j-interruptible-gladia.py", EVAL_SIMPLE_MATH),
|
||||
("07k-interruptible-lmnt.py", EVAL_SIMPLE_MATH),
|
||||
("07l-interruptible-groq.py", EVAL_SIMPLE_MATH),
|
||||
("07m-interruptible-aws.py", EVAL_SIMPLE_MATH),
|
||||
("07m-interruptible-aws-strands.py", EVAL_WEATHER),
|
||||
("07n-interruptible-gemini.py", EVAL_SIMPLE_MATH),
|
||||
("07n-interruptible-google.py", EVAL_SIMPLE_MATH),
|
||||
("07o-interruptible-assemblyai.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime.py", EVAL_SIMPLE_MATH),
|
||||
("07q-interruptible-rime-http.py", EVAL_SIMPLE_MATH),
|
||||
("07r-interruptible-riva-nim.py", EVAL_SIMPLE_MATH),
|
||||
("07s-interruptible-google-audio-in.py", EVAL_SIMPLE_MATH),
|
||||
("07t-interruptible-fish.py", EVAL_SIMPLE_MATH),
|
||||
("07v-interruptible-neuphonic.py", EVAL_SIMPLE_MATH),
|
||||
("07v-interruptible-neuphonic-http.py", EVAL_SIMPLE_MATH),
|
||||
("07w-interruptible-fal.py", EVAL_SIMPLE_MATH),
|
||||
("07y-interruptible-minimax.py", EVAL_SIMPLE_MATH),
|
||||
("07z-interruptible-sarvam.py", EVAL_SIMPLE_MATH),
|
||||
("07ae-interruptible-hume.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a Krisp license.
|
||||
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07p-interruptible-krisp.py", EVAL_SIMPLE_MATH),
|
||||
# Needs GPU resources.
|
||||
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# ("07u-interruptible-ultravox.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_12 = [
|
||||
("12-describe-video.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
|
||||
("12-describe-image-openai.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12a-describe-image-anthropic.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12b-describe-image-aws.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12c-describe-image-gemini-flash.py", EVAL_VISION_IMAGE(eval_speaks_first=True)),
|
||||
("12d-describe-image-moondream.py", EVAL_VISION_IMAGE()),
|
||||
]
|
||||
|
||||
TESTS_14 = [
|
||||
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14x-function-calling-openpipe.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("14-function-calling.py", EVAL_WEATHER),
|
||||
("14a-function-calling-anthropic.py", EVAL_WEATHER),
|
||||
("14e-function-calling-google.py", EVAL_WEATHER),
|
||||
("14f-function-calling-groq.py", EVAL_WEATHER),
|
||||
("14g-function-calling-grok.py", EVAL_WEATHER),
|
||||
("14h-function-calling-azure.py", EVAL_WEATHER),
|
||||
("14i-function-calling-fireworks.py", EVAL_WEATHER),
|
||||
("14j-function-calling-nim.py", EVAL_WEATHER),
|
||||
("14k-function-calling-cerebras.py", EVAL_WEATHER),
|
||||
("14m-function-calling-openrouter.py", EVAL_WEATHER),
|
||||
("14n-function-calling-perplexity.py", EVAL_WEATHER),
|
||||
("14p-function-calling-gemini-vertex-ai.py", EVAL_WEATHER),
|
||||
("14q-function-calling-qwen.py", EVAL_WEATHER),
|
||||
("14r-function-calling-aws.py", EVAL_WEATHER),
|
||||
("14v-function-calling-openai.py", EVAL_WEATHER),
|
||||
("14w-function-calling-mistral.py", EVAL_WEATHER),
|
||||
("14x-function-calling-openpipe.py", EVAL_WEATHER),
|
||||
# Video
|
||||
("14d-function-calling-anthropic-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-aws-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-gemini-flash-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-moondream-video.py", EVAL_VISION_CAMERA),
|
||||
("14d-function-calling-openai-video.py", EVAL_VISION_CAMERA),
|
||||
# Currently not working.
|
||||
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("14c-function-calling-together.py", EVAL_WEATHER),
|
||||
# ("14l-function-calling-deepseek.py", EVAL_WEATHER),
|
||||
# ("14o-function-calling-gemini-openai-format.py", EVAL_WEATHER),
|
||||
]
|
||||
|
||||
TESTS_15 = [
|
||||
("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE, BOT_SPEAKS_FIRST),
|
||||
("15a-switch-languages.py", EVAL_SWITCH_LANGUAGE),
|
||||
]
|
||||
|
||||
TESTS_19 = [
|
||||
("19-openai-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19-openai-realtime.py", EVAL_WEATHER),
|
||||
("19-openai-realtime-beta.py", EVAL_WEATHER),
|
||||
# OpenAI Realtime not released on Azure yet
|
||||
# ("19a-azure-realtime.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
|
||||
# ("19a-azure-realtime.py", EVAL_WEATHER),
|
||||
("19a-azure-realtime-beta.py", EVAL_WEATHER),
|
||||
("19b-openai-realtime-text.py", EVAL_WEATHER),
|
||||
("19b-openai-realtime-beta-text.py", EVAL_WEATHER),
|
||||
]
|
||||
|
||||
TESTS_21 = [
|
||||
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("21a-tavus-video-service.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_26 = [
|
||||
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26a-gemini-live-transcription.py",
|
||||
PROMPT_SIMPLE_MATH,
|
||||
EVAL_SIMPLE_MATH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
(
|
||||
"26b-gemini-live-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26c-gemini-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26e-gemini-multimodal-google-search.py",
|
||||
PROMPT_ONLINE_SEARCH,
|
||||
EVAL_ONLINE_SEARCH,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
("26-gemini-live.py", EVAL_SIMPLE_MATH),
|
||||
("26a-gemini-live-transcription.py", EVAL_SIMPLE_MATH),
|
||||
("26b-gemini-live-function-calling.py", EVAL_WEATHER),
|
||||
("26c-gemini-live-video.py", EVAL_VISION_CAMERA),
|
||||
("26e-gemini-live-google-search.py", EVAL_ONLINE_SEARCH),
|
||||
("26h-gemini-live-vertex-function-calling.py", EVAL_WEATHER),
|
||||
# Currently not working.
|
||||
# ("26d-gemini-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"26h-gemini-live-vertex-function-calling.py",
|
||||
PROMPT_WEATHER,
|
||||
EVAL_WEATHER,
|
||||
BOT_SPEAKS_FIRST,
|
||||
),
|
||||
# ("26d-gemini-live-text.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_27 = [
|
||||
("27-simli-layer.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("27-simli-layer.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_40 = [
|
||||
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("40-aws-nova-sonic.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_43 = [
|
||||
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("43a-heygen-video-service.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
TESTS_44 = [
|
||||
("44-voicemail-detection.py", PROMPT_VOICEMAIL, EVAL_VOICEMAIL, USER_SPEAKS_FIRST),
|
||||
("44-voicemail-detection.py", PROMPT_CONVERSATION, EVAL_CONVERSATION, USER_SPEAKS_FIRST),
|
||||
("44-voicemail-detection.py", EVAL_VOICEMAIL),
|
||||
("44-voicemail-detection.py", EVAL_CONVERSATION),
|
||||
]
|
||||
|
||||
TESTS = [
|
||||
@@ -244,9 +239,9 @@ async def main(args: argparse.Namespace):
|
||||
|
||||
# Parse test config: (test, prompt, eval, user_speaks_first)
|
||||
for test_config in TESTS:
|
||||
test, prompt, eval, user_speaks_first = test_config
|
||||
test, eval_config = test_config
|
||||
|
||||
await runner.run_eval(test, prompt, eval, user_speaks_first)
|
||||
await runner.run_eval(test, eval_config)
|
||||
|
||||
runner.print_results()
|
||||
|
||||
|
||||
@@ -22,9 +22,12 @@ class AdapterType(Enum):
|
||||
|
||||
Parameters:
|
||||
GEMINI: Google Gemini adapter - currently the only service supporting custom tools.
|
||||
SHIM: Backward compatibility shim for creating ToolsSchemas from lists of tools in
|
||||
any format, used by LLMContext.from_openai_context.
|
||||
"""
|
||||
|
||||
GEMINI = "gemini" # that is the only service where we are able to add custom tools for now
|
||||
SHIM = "shim" # for use as backward compatibility shim for creating ToolsSchemas from list of tools in any format
|
||||
|
||||
|
||||
class ToolsSchema:
|
||||
|
||||
@@ -245,13 +245,25 @@ class AnthropicLLMAdapter(BaseLLMAdapter[AnthropicLLMInvocationParams]):
|
||||
item["text"] = "(empty)"
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": item["image_url"]["url"].split(",")[1],
|
||||
}
|
||||
del item["image_url"]
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "base64",
|
||||
"media_type": "image/jpeg",
|
||||
"data": item["image_url"]["url"].split(",")[1],
|
||||
}
|
||||
del item["image_url"]
|
||||
elif item["image_url"]["url"].startswith("http"):
|
||||
item["type"] = "image"
|
||||
item["source"] = {
|
||||
"type": "url",
|
||||
"url": item["image_url"]["url"],
|
||||
}
|
||||
del item["image_url"]
|
||||
else:
|
||||
url = item["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
|
||||
# In the case where there's a single image in the list (like what
|
||||
# would result from a UserImageRawFrame), ensure that the image
|
||||
# comes before text, as recommended by Anthropic docs
|
||||
|
||||
@@ -16,7 +16,7 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||
|
||||
|
||||
@@ -210,4 +210,18 @@ class AWSNovaSonicLLMAdapter(BaseLLMAdapter[AWSNovaSonicLLMInvocationParams]):
|
||||
List of dictionaries in AWS Nova Sonic function format.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_aws_nova_sonic_function_format(func) for func in functions_schema]
|
||||
standard_tools = [
|
||||
self._to_aws_nova_sonic_function_format(func) for func in functions_schema
|
||||
]
|
||||
|
||||
# For backward compatibility, AWS Nova Sonic can still be used with
|
||||
# tools in dict format, even though it always uses `LLMContext` under
|
||||
# the hood (via `LLMContext.from_openai_context()`).
|
||||
# To support this behavior, we use "shimmed" custom tools here.
|
||||
# (We maintain this backward compatibility because users aren't
|
||||
# *knowingly* opting into the new `LLMContext`.)
|
||||
shimmed_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
shimmed_tools = tools_schema.custom_tools.get(AdapterType.SHIM, [])
|
||||
|
||||
return standard_tools + shimmed_tools
|
||||
|
||||
@@ -256,15 +256,22 @@ class AWSBedrockLLMAdapter(BaseLLMAdapter[AWSBedrockLLMInvocationParams]):
|
||||
new_content.append({"text": text_content})
|
||||
# handle image_url -> image conversion
|
||||
if item["type"] == "image_url":
|
||||
new_item = {
|
||||
"image": {
|
||||
"format": "jpeg",
|
||||
"source": {
|
||||
"bytes": base64.b64decode(item["image_url"]["url"].split(",")[1])
|
||||
},
|
||||
if item["image_url"]["url"].startswith("data:"):
|
||||
new_item = {
|
||||
"image": {
|
||||
"format": "jpeg",
|
||||
"source": {
|
||||
"bytes": base64.b64decode(
|
||||
item["image_url"]["url"].split(",")[1]
|
||||
)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
new_content.append(new_item)
|
||||
new_content.append(new_item)
|
||||
else:
|
||||
url = item["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
|
||||
# In the case where there's a single image in the list (like what
|
||||
# would result from a UserImageRawFrame), ensure that the image
|
||||
# comes before text
|
||||
|
||||
@@ -24,13 +24,7 @@ from pipecat.processors.aggregators.llm_context import (
|
||||
)
|
||||
|
||||
try:
|
||||
from google.genai.types import (
|
||||
Blob,
|
||||
Content,
|
||||
FunctionCall,
|
||||
FunctionResponse,
|
||||
Part,
|
||||
)
|
||||
from google.genai.types import Blob, Content, FileData, FunctionCall, FunctionResponse, Part
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
|
||||
@@ -86,12 +80,48 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
List of tool definitions formatted for Gemini's function-calling API.
|
||||
Includes both converted standard tools and any custom Gemini-specific tools.
|
||||
"""
|
||||
|
||||
def _strip_additional_properties(schema: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Recursively remove "additionalProperties" fields from JSON schema, as they're not supported by Gemini.
|
||||
|
||||
Args:
|
||||
schema: The JSON schema dict to process.
|
||||
|
||||
Returns:
|
||||
JSON schema dict with "additionalProperties" stripped out.
|
||||
"""
|
||||
if not isinstance(schema, dict):
|
||||
return schema
|
||||
|
||||
result = {}
|
||||
|
||||
for key, value in schema.items():
|
||||
if key == "additionalProperties":
|
||||
continue
|
||||
elif isinstance(value, dict):
|
||||
result[key] = _strip_additional_properties(value)
|
||||
elif isinstance(value, list):
|
||||
result[key] = [
|
||||
_strip_additional_properties(item) if isinstance(item, dict) else item
|
||||
for item in value
|
||||
]
|
||||
else:
|
||||
result[key] = value
|
||||
|
||||
return result
|
||||
|
||||
functions_schema = tools_schema.standard_tools
|
||||
formatted_standard_tools = (
|
||||
[{"function_declarations": [func.to_default_dict() for func in functions_schema]}]
|
||||
if functions_schema
|
||||
else []
|
||||
)
|
||||
if functions_schema:
|
||||
formatted_functions = []
|
||||
for func in functions_schema:
|
||||
func_dict = func.to_default_dict()
|
||||
func_dict["parameters"]["properties"] = _strip_additional_properties(
|
||||
func_dict["parameters"]["properties"]
|
||||
)
|
||||
formatted_functions.append(func_dict)
|
||||
formatted_standard_tools = [{"function_declarations": formatted_functions}]
|
||||
else:
|
||||
formatted_standard_tools = []
|
||||
custom_gemini_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
custom_gemini_tools = tools_schema.custom_tools.get(AdapterType.GEMINI, [])
|
||||
@@ -309,6 +339,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
parts.append(
|
||||
Part(
|
||||
function_call=FunctionCall(
|
||||
id=id,
|
||||
name=name,
|
||||
args=json.loads(tc["function"]["arguments"]),
|
||||
)
|
||||
@@ -334,9 +365,12 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
function_name = params.tool_call_id_to_name_mapping[tool_call_id]
|
||||
|
||||
parts.append(
|
||||
Part.from_function_response(
|
||||
name=function_name,
|
||||
response=response_dict,
|
||||
Part(
|
||||
function_response=FunctionResponse(
|
||||
id=tool_call_id,
|
||||
name=function_name,
|
||||
response=response_dict,
|
||||
)
|
||||
)
|
||||
)
|
||||
elif isinstance(content, str):
|
||||
@@ -345,7 +379,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
for c in content:
|
||||
if c["type"] == "text":
|
||||
parts.append(Part(text=c["text"]))
|
||||
elif c["type"] == "image_url":
|
||||
elif c["type"] == "image_url" and c["image_url"]["url"].startswith("data:"):
|
||||
parts.append(
|
||||
Part(
|
||||
inline_data=Blob(
|
||||
@@ -354,10 +388,23 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
|
||||
)
|
||||
)
|
||||
)
|
||||
elif c["type"] == "image_url":
|
||||
url = c["image_url"]["url"]
|
||||
logger.warning(f"Unsupported 'image_url': {url}")
|
||||
elif c["type"] == "input_audio":
|
||||
input_audio = c["input_audio"]
|
||||
audio_bytes = base64.b64decode(input_audio["data"])
|
||||
parts.append(Part(inline_data=Blob(mime_type="audio/wav", data=audio_bytes)))
|
||||
elif c["type"] == "file_data":
|
||||
file_data = c["file_data"]
|
||||
parts.append(
|
||||
Part(
|
||||
file_data=FileData(
|
||||
mime_type=file_data.get("mime_type"),
|
||||
file_uri=file_data.get("file_uri"),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
return self.MessageConversionResult(
|
||||
content=Content(role=role, parts=parts),
|
||||
|
||||
@@ -6,12 +6,18 @@
|
||||
|
||||
"""OpenAI Realtime LLM adapter for Pipecat."""
|
||||
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext, LLMContextMessage
|
||||
from pipecat.services.openai.realtime import events
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMInvocationParams(TypedDict):
|
||||
@@ -20,7 +26,9 @@ class OpenAIRealtimeLLMInvocationParams(TypedDict):
|
||||
This is a placeholder until support for universal LLMContext machinery is added for OpenAI Realtime.
|
||||
"""
|
||||
|
||||
pass
|
||||
system_instruction: Optional[str]
|
||||
messages: List[events.ConversationItem]
|
||||
tools: List[Dict[str, Any]]
|
||||
|
||||
|
||||
class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
@@ -33,7 +41,7 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
@property
|
||||
def id_for_llm_specific_messages(self) -> str:
|
||||
"""Get the identifier used in LLMSpecificMessage instances for OpenAI Realtime."""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
return "openai-realtime"
|
||||
|
||||
def get_llm_invocation_params(self, context: LLMContext) -> OpenAIRealtimeLLMInvocationParams:
|
||||
"""Get OpenAI Realtime-specific LLM invocation parameters from a universal LLM context.
|
||||
@@ -46,7 +54,13 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
Dictionary of parameters for invoking OpenAI Realtime's API.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
messages = self._from_universal_context_messages(self.get_messages(context))
|
||||
return {
|
||||
"system_instruction": messages.system_instruction,
|
||||
"messages": messages.messages,
|
||||
# NOTE: LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
|
||||
"tools": self.from_standard_tools(context.tools) or [],
|
||||
}
|
||||
|
||||
def get_messages_for_logging(self, context) -> List[Dict[str, Any]]:
|
||||
"""Get messages from a universal LLM context in a format ready for logging about OpenAI Realtime.
|
||||
@@ -61,7 +75,124 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
Returns:
|
||||
List of messages in a format ready for logging about OpenAI Realtime.
|
||||
"""
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
|
||||
# NOTE: this is the same as in OpenAIAdapter, as that's what it was
|
||||
# prior to a refactor. Worth noting that for OpenAI Realtime
|
||||
# specifically, not everything handled here is necessarily supported
|
||||
# (or supported yet).
|
||||
msgs = []
|
||||
for message in self.get_messages(context):
|
||||
msg = copy.deepcopy(message)
|
||||
if "content" in msg:
|
||||
if isinstance(msg["content"], list):
|
||||
for item in msg["content"]:
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:image/"):
|
||||
item["image_url"]["url"] = "data:image/..."
|
||||
if item["type"] == "input_audio":
|
||||
item["input_audio"]["data"] = "..."
|
||||
if "mime_type" in msg and msg["mime_type"].startswith("image/"):
|
||||
msg["data"] = "..."
|
||||
msgs.append(msg)
|
||||
return msgs
|
||||
|
||||
@dataclass
|
||||
class ConvertedMessages:
|
||||
"""Container for OpenAI-formatted messages converted from universal context."""
|
||||
|
||||
messages: List[events.ConversationItem]
|
||||
system_instruction: Optional[str] = None
|
||||
|
||||
def _from_universal_context_messages(
    self, universal_context_messages: List[LLMContextMessage]
) -> ConvertedMessages:
    """Convert universal context messages into OpenAI Realtime conversation items.

    A leading "system" message is lifted out as the system instruction. A
    lone remaining "user" message is converted directly; any other history
    is serialized to JSON and packed into one "user" message framed by
    explanatory text.

    Args:
        universal_context_messages: Messages from the universal LLM context.
            The list is deep-copied, so the caller's messages are not modified.

    Returns:
        A ConvertedMessages holding the converted messages (possibly empty)
        and the extracted system instruction, if any.
    """
    # We can't load a long conversation history into the openai realtime api yet. (The API/model
    # forgets that it can do audio, if you do a series of `conversation.item.create` calls.) So
    # our general strategy until this is fixed is just to put everything into a first "user"
    # message as a single input.

    if not universal_context_messages:
        return self.ConvertedMessages(messages=[])

    messages = copy.deepcopy(universal_context_messages)
    system_instruction = None

    # If we have a "system" message as our first message, let's pull that out into session
    # "instructions"
    if messages[0].get("role") == "system":
        system = messages.pop(0)
        content = system.get("content")
        if isinstance(content, str):
            system_instruction = content
        elif isinstance(content, list) and content:
            # Only the first content part is used; an empty list simply
            # leaves the instruction unset instead of raising IndexError.
            system_instruction = content[0].get("text")
        if not messages:
            return self.ConvertedMessages(messages=[], system_instruction=system_instruction)

    # If we have just a single "user" item, we can just send it normally
    if len(messages) == 1 and messages[0].get("role") == "user":
        return self.ConvertedMessages(
            messages=[self._from_universal_context_message(messages[0])],
            system_instruction=system_instruction,
        )

    # Otherwise, let's pack everything into a single "user" message with a bit of
    # explanation for the LLM
    intro_text = (
        "\n"
        "This is a previously saved conversation. Please treat this conversation history as a\n"
        "starting point for the current conversation."
    )

    trailing_text = (
        "\n"
        "This is the end of the previously saved conversation. Please continue the conversation\n"
        "from here. If the last message is a user instruction or question, act on that instruction\n"
        "or answer the question. If the last message is an assistant response, simple say that you\n"
        "are ready to continue the conversation."
    )

    packed_history = "\n\n".join([intro_text, json.dumps(messages, indent=2), trailing_text])

    return self.ConvertedMessages(
        messages=[
            {
                "role": "user",
                "type": "message",
                "content": [
                    {
                        "type": "input_text",
                        "text": packed_history,
                    }
                ],
            }
        ],
        system_instruction=system_instruction,
    )
|
||||
|
||||
def _from_universal_context_message(
    self, message: LLMContextMessage
) -> events.ConversationItem:
    """Convert one universal context message into an OpenAI Realtime conversation item.

    Two shapes are converted: "user" messages (text parts only) and
    "assistant" messages that carry tool calls (only the first tool call is
    used). Anything else is logged as an error.

    Args:
        message: The universal context message to convert.

    Returns:
        The converted conversation item, or None when the message shape is
        not handled (despite the annotation, callers may receive None).
    """
    role = message.get("role")

    if role == "user":
        content = message.get("content")
        if isinstance(content, list):
            pieces = []
            for part in content:
                if part.get("type") == "text":
                    # A text part with no "text" value contributes an empty
                    # string rather than raising TypeError on concatenation.
                    pieces.append(part.get("text") or "")
                else:
                    logger.error(
                        f"Unhandled content type in context message: {part.get('type')} - {message}"
                    )
            # Each piece is prefixed with a single space, so the combined
            # text keeps its leading space exactly as before.
            content = "".join(" " + piece for piece in pieces)
        return events.ConversationItem(
            role="user",
            type="message",
            content=[events.ItemContent(type="input_text", text=content)],
        )

    tool_calls = message.get("tool_calls")
    if role == "assistant" and tool_calls:
        first_call = tool_calls[0]
        return events.ConversationItem(
            type="function_call",
            call_id=first_call["id"],
            name=first_call["function"]["name"],
            arguments=first_call["function"]["arguments"],
        )

    logger.error(f"Unhandled message type in _from_universal_context_message: {message}")
    return None
|
||||
|
||||
@staticmethod
|
||||
def _to_openai_realtime_function_format(function: FunctionSchema) -> Dict[str, Any]:
|
||||
@@ -94,4 +225,18 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
|
||||
List of function definitions in OpenAI Realtime format.
|
||||
"""
|
||||
functions_schema = tools_schema.standard_tools
|
||||
return [self._to_openai_realtime_function_format(func) for func in functions_schema]
|
||||
standard_tools = [
|
||||
self._to_openai_realtime_function_format(func) for func in functions_schema
|
||||
]
|
||||
|
||||
# For backward compatibility, OpenAI Realtime can still be used with
|
||||
# tools in dict format, even though it always uses `LLMContext` under
|
||||
# the hood (via `LLMContext.from_openai_context()`).
|
||||
# To support this behavior, we use "shimmed" custom tools here.
|
||||
# (We maintain this backward compatibility because users aren't
|
||||
# *knowingly* opting into the new `LLMContext`.)
|
||||
shimmed_tools = []
|
||||
if tools_schema.custom_tools:
|
||||
shimmed_tools = tools_schema.custom_tools.get(AdapterType.SHIM, [])
|
||||
|
||||
return standard_tools + shimmed_tools
|
||||
|
||||
@@ -35,12 +35,15 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
enabling offline operation without network dependencies.
|
||||
"""
|
||||
|
||||
def __init__(self, *, smart_turn_model_path: Optional[str] = None, **kwargs):
|
||||
def __init__(
|
||||
self, *, smart_turn_model_path: Optional[str] = None, cpu_count: int = 1, **kwargs
|
||||
):
|
||||
"""Initialize the local ONNX smart-turn-v3 analyzer.
|
||||
|
||||
Args:
|
||||
smart_turn_model_path: Path to the ONNX model file. If this is not
|
||||
set, the bundled smart-turn-v3.0 model will be used.
|
||||
cpu_count: The number of CPUs to use for inference. Defaults to 1.
|
||||
**kwargs: Additional arguments passed to BaseSmartTurn.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
@@ -70,6 +73,7 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
so = ort.SessionOptions()
|
||||
so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
||||
so.inter_op_num_threads = 1
|
||||
so.intra_op_num_threads = cpu_count
|
||||
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
|
||||
self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
|
||||
|
||||
@@ -773,9 +773,15 @@ class CancelFrame(SystemFrame):
|
||||
|
||||
Indicates that a pipeline needs to stop right away without
|
||||
processing remaining queued frames.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1201,26 +1207,43 @@ class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
class UserImageRequestFrame(SystemFrame):
|
||||
"""Frame requesting an image from a specific user.
|
||||
|
||||
A frame to request an image from the given user. The frame might be
|
||||
generated by a function call in which case the corresponding fields will be
|
||||
properly set.
|
||||
A frame to request an image from the given user. The request might come with
|
||||
a text that can be later used to describe the requested image.
|
||||
|
||||
Parameters:
|
||||
user_id: Identifier of the user to request image from.
|
||||
context: Optional context for the image request.
|
||||
function_name: Name of function that generated this request (if any).
|
||||
tool_call_id: Tool call ID if generated by function call.
|
||||
text: An optional text associated to the image request.
|
||||
append_to_context: Whether the requested image should be appended to the LLM context.
|
||||
video_source: Specific video source to capture from.
|
||||
context: [DEPRECATED] Optional context for the image request.
|
||||
function_name: [DEPRECATED] Name of function that generated this request (if any).
|
||||
tool_call_id: [DEPRECATED] Tool call ID if generated by function call.
|
||||
"""
|
||||
|
||||
user_id: str
|
||||
text: Optional[str] = None
|
||||
append_to_context: Optional[bool] = None
|
||||
video_source: Optional[str] = None
|
||||
context: Optional[Any] = None
|
||||
function_name: Optional[str] = None
|
||||
tool_call_id: Optional[str] = None
|
||||
video_source: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.context or self.function_name or self.tool_call_id:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"`UserImageRequestFrame` fields `context`, `function_name` and `tool_call_id` are deprecated.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(user: {self.user_id}, video_source: {self.video_source}, function: {self.function_name}, request: {self.tool_call_id})"
|
||||
return f"{self.name}(user: {self.user_id}, text: {self.text}, append_to_context: {self.append_to_context}, {self.video_source})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1294,15 +1317,33 @@ class UserImageRawFrame(InputImageRawFrame):
|
||||
|
||||
Parameters:
|
||||
user_id: Identifier of the user who provided this image.
|
||||
request: The original image request frame if this is a response.
|
||||
text: An optional text associated to this image.
|
||||
append_to_context: Whether the requested image should be appended to the LLM context.
|
||||
request: [DEPRECATED] The original image request frame if this is a response.
|
||||
"""
|
||||
|
||||
user_id: str = ""
|
||||
text: Optional[str] = None
|
||||
append_to_context: Optional[bool] = None
|
||||
request: Optional[UserImageRequestFrame] = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
|
||||
if self.request:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"`UserImageRawFrame` field `request` is deprecated.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, user: {self.user_id}, source: {self.transport_source}, size: {self.size}, format: {self.format}, request: {self.request})"
|
||||
return f"{self.name}(pts: {pts}, user: {self.user_id}, source: {self.transport_source}, size: {self.size}, format: {self.format}, text: {self.text}, append_to_context: {self.append_to_context})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1367,9 +1408,15 @@ class EndTaskFrame(TaskFrame):
|
||||
This is used to notify the pipeline task that the pipeline should be
|
||||
closed nicely (flushing all the queued frames) by pushing an EndFrame
|
||||
downstream. This frame should be pushed upstream.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1379,9 +1426,15 @@ class CancelTaskFrame(TaskFrame):
|
||||
This is used to notify the pipeline task that the pipeline should be
|
||||
stopped immediately by pushing a CancelFrame downstream. This frame
|
||||
should be pushed upstream.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing a cancel frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1452,9 +1505,15 @@ class EndFrame(ControlFrame):
|
||||
sending frames to its output channel(s) and close all its threads. Note,
|
||||
that this is a control frame, which means it will be received in the order it
|
||||
was sent.
|
||||
|
||||
Parameters:
|
||||
reason: Optional reason for pushing an end frame.
|
||||
"""
|
||||
|
||||
pass
|
||||
reason: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(reason: {self.reason})"
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
218
src/pipecat/observers/loggers/metrics_log_observer.py
Normal file
218
src/pipecat/observers/loggers/metrics_log_observer.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Metrics logging observer for Pipecat.
|
||||
|
||||
This module provides an observer that logs metrics frames to the console,
|
||||
allowing developers to monitor performance metrics, token usage, and other
|
||||
statistics in real-time.
|
||||
"""
|
||||
|
||||
from typing import Optional, Set, Type
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import MetricsFrame
|
||||
from pipecat.metrics.metrics import (
|
||||
LLMTokenUsage,
|
||||
LLMUsageMetricsData,
|
||||
MetricsData,
|
||||
ProcessingMetricsData,
|
||||
SmartTurnMetricsData,
|
||||
TTFBMetricsData,
|
||||
TTSUsageMetricsData,
|
||||
)
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
|
||||
|
||||
class MetricsLogObserver(BaseObserver):
|
||||
"""Observer to log metrics activity to the console.
|
||||
|
||||
Monitors and logs all MetricsFrame instances, including:
|
||||
|
||||
- TTFBMetricsData (Time To First Byte)
|
||||
- ProcessingMetricsData (General processing time)
|
||||
- LLMUsageMetricsData (Token usage statistics)
|
||||
- TTSUsageMetricsData (Text-to-Speech character counts)
|
||||
- SmartTurnMetricsData (Turn prediction metrics)
|
||||
|
||||
This allows developers to track performance metrics, token usage,
|
||||
and other statistics throughout the pipeline.
|
||||
|
||||
Examples:
|
||||
Log all metrics types::
|
||||
|
||||
observers = [MetricsLogObserver()]
|
||||
|
||||
Log only LLM and TTS metrics::
|
||||
|
||||
from pipecat.metrics.metrics import LLMUsageMetricsData, TTSUsageMetricsData
|
||||
observers = [
|
||||
MetricsLogObserver(
|
||||
include_metrics={LLMUsageMetricsData, TTSUsageMetricsData}
|
||||
)
|
||||
]
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
include_metrics: Optional[Set[Type[MetricsData]]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the metrics log observer.
|
||||
|
||||
Args:
|
||||
include_metrics: Set of metrics types to include. If specified, only these
|
||||
metrics types will be logged. If None, all metrics are logged.
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._include_metrics = include_metrics
|
||||
self._frames_seen = set()
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Handle frame push events and log metrics frames.
|
||||
|
||||
Logs MetricsFrame instances with detailed information about the
|
||||
metrics data, formatted appropriately for each metrics type.
|
||||
|
||||
Args:
|
||||
data: Frame push event data containing source, frame, and timestamp.
|
||||
"""
|
||||
frame = data.frame
|
||||
timestamp = data.timestamp
|
||||
|
||||
if not isinstance(frame, MetricsFrame):
|
||||
return
|
||||
|
||||
# Skip frames we've already seen to avoid duplicate logging
|
||||
if frame.id in self._frames_seen:
|
||||
return
|
||||
|
||||
self._frames_seen.add(frame.id)
|
||||
|
||||
time_sec = timestamp / 1_000_000_000
|
||||
|
||||
# Process each metrics data item in the frame
|
||||
for metrics_data in frame.data:
|
||||
# Check if this metrics type should be logged
|
||||
if not self._should_log_metrics(metrics_data):
|
||||
continue
|
||||
|
||||
self._log_metrics_data(metrics_data, time_sec)
|
||||
|
||||
def _should_log_metrics(self, metrics_data: MetricsData) -> bool:
|
||||
"""Determine if a metrics data item should be logged based on filters.
|
||||
|
||||
Args:
|
||||
metrics_data: The metrics data to check.
|
||||
|
||||
Returns:
|
||||
True if the metrics should be logged, False otherwise.
|
||||
"""
|
||||
# If include_metrics is specified, only log those types
|
||||
if self._include_metrics is not None:
|
||||
return type(metrics_data) in self._include_metrics
|
||||
|
||||
# Otherwise, log all metrics
|
||||
return True
|
||||
|
||||
def _log_metrics_data(self, metrics_data: MetricsData, time_sec: float):
|
||||
"""Log a single metrics data item.
|
||||
|
||||
Args:
|
||||
metrics_data: The metrics data to log.
|
||||
time_sec: Timestamp in seconds.
|
||||
"""
|
||||
processor_info = f"[{metrics_data.processor}]"
|
||||
model_info = f" ({metrics_data.model})" if metrics_data.model else ""
|
||||
|
||||
if isinstance(metrics_data, TTFBMetricsData):
|
||||
logger.debug(
|
||||
f"📊 {processor_info} TTFB{model_info}: {metrics_data.value}s at {time_sec:.3f}s"
|
||||
)
|
||||
elif isinstance(metrics_data, ProcessingMetricsData):
|
||||
logger.debug(
|
||||
f"📊 {processor_info} PROCESSING TIME{model_info}: {metrics_data.value}s at {time_sec:.3f}s"
|
||||
)
|
||||
elif isinstance(metrics_data, LLMUsageMetricsData):
|
||||
self._log_llm_usage(metrics_data, processor_info, model_info, time_sec)
|
||||
elif isinstance(metrics_data, TTSUsageMetricsData):
|
||||
logger.debug(
|
||||
f"📊 {processor_info} TTS USAGE{model_info}: {metrics_data.value} characters at {time_sec:.3f}s"
|
||||
)
|
||||
elif isinstance(metrics_data, SmartTurnMetricsData):
|
||||
self._log_smart_turn(metrics_data, processor_info, model_info, time_sec)
|
||||
else:
|
||||
# Generic fallback for unknown metrics types
|
||||
logger.debug(
|
||||
f"📊 {processor_info} METRICS{model_info}: {metrics_data} at {time_sec:.3f}s"
|
||||
)
|
||||
|
||||
def _log_llm_usage(
|
||||
self,
|
||||
metrics_data: LLMUsageMetricsData,
|
||||
processor_info: str,
|
||||
model_info: str,
|
||||
time_sec: float,
|
||||
):
|
||||
"""Log LLM token usage metrics.
|
||||
|
||||
Args:
|
||||
metrics_data: The LLM usage metrics data.
|
||||
processor_info: Formatted processor name string.
|
||||
model_info: Formatted model name string.
|
||||
time_sec: Timestamp in seconds.
|
||||
"""
|
||||
usage: LLMTokenUsage = metrics_data.value
|
||||
|
||||
# Build usage details
|
||||
details = [
|
||||
f"prompt: {usage.prompt_tokens}",
|
||||
f"completion: {usage.completion_tokens}",
|
||||
f"total: {usage.total_tokens}",
|
||||
]
|
||||
|
||||
if usage.cache_read_input_tokens is not None:
|
||||
details.append(f"cache_read: {usage.cache_read_input_tokens}")
|
||||
|
||||
if usage.cache_creation_input_tokens is not None:
|
||||
details.append(f"cache_creation: {usage.cache_creation_input_tokens}")
|
||||
|
||||
if usage.reasoning_tokens is not None:
|
||||
details.append(f"reasoning: {usage.reasoning_tokens}")
|
||||
|
||||
usage_str = ", ".join(details)
|
||||
|
||||
logger.debug(
|
||||
f"📊 {processor_info} LLM TOKEN USAGE{model_info}: {usage_str} at {time_sec:.2f}s"
|
||||
)
|
||||
|
||||
def _log_smart_turn(
    self,
    metrics_data: SmartTurnMetricsData,
    processor_info: str,
    model_info: str,
    time_sec: float,
):
    """Emit a debug log line describing a smart turn prediction.

    The line reports whether the turn was judged complete, the prediction
    probability (as a percentage) and the inference, server and end-to-end
    timings in milliseconds.

    Args:
        metrics_data: The smart turn metrics data.
        processor_info: Formatted processor name string.
        model_info: Formatted model name string.
        time_sec: Timestamp in seconds.
    """
    if metrics_data.is_complete:
        verdict = "COMPLETE"
    else:
        verdict = "INCOMPLETE"

    # Everything that goes between the parentheses of the log line.
    details = ", ".join(
        (
            f"probability: {metrics_data.probability:.2%}",
            f"inference: {metrics_data.inference_time_ms:.1f}ms",
            f"server: {metrics_data.server_total_time_ms:.1f}ms",
            f"e2e: {metrics_data.e2e_processing_time_ms:.1f}ms",
        )
    )

    logger.debug(
        f"📊 {processor_info} SMART TURN{model_info}: {verdict} ({details}) at {time_sec:.2f}s"
    )
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
from typing import Any, List, Optional, Type
|
||||
|
||||
from pipecat.adapters.schemas.direct_function import DirectFunction
|
||||
from pipecat.pipeline.service_switcher import ServiceSwitcher, StrategyType
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.services.llm_service import LLMService
|
||||
@@ -95,3 +96,22 @@ class LLMSwitcher(ServiceSwitcher[StrategyType]):
|
||||
start_callback=start_callback,
|
||||
cancel_on_interruption=cancel_on_interruption,
|
||||
)
|
||||
|
||||
def register_direct_function(
    self,
    handler: DirectFunction,
    *,
    cancel_on_interruption: bool = True,
):
    """Register a direct function handler on every LLM of the switcher, active or not.

    Args:
        handler: The direct function to register. Must follow DirectFunction protocol.
        cancel_on_interruption: Whether to cancel this function call when an
            interruption occurs. Defaults to True.
    """
    # The same registration arguments are forwarded to each managed LLM.
    registration = {
        "handler": handler,
        "cancel_on_interruption": cancel_on_interruption,
    }
    for service in self.llms:
        service.register_direct_function(**registration)
|
||||
|
||||
@@ -12,7 +12,9 @@ including heartbeats, idle detection, and observer integration.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import importlib.util
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
from loguru import logger
|
||||
@@ -39,7 +41,7 @@ from pipecat.frames.frames import (
|
||||
UserSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import ProcessingMetricsData, TTFBMetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
|
||||
from pipecat.pipeline.base_task import BasePipelineTask, PipelineTaskParams
|
||||
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
|
||||
@@ -57,6 +59,43 @@ IDLE_TIMEOUT_SECS = 300
|
||||
CANCEL_TIMEOUT_SECS = 20.0
|
||||
|
||||
|
||||
class IdleFrameObserver(BaseObserver):
    """Idle timeout observer.

    This observer watches the frames pushed in the pipeline. Whenever a
    ``StartFrame`` or one of the configured idle timeout frames is pushed,
    the given asyncio event is set. If the event stays unset it means the
    pipeline is probably idle.

    """

    def __init__(self, *, idle_event: asyncio.Event, idle_timeout_frames: Tuple[Type[Frame], ...]):
        """Initialize the observer.

        Args:
            idle_event: The event to set if the idle timeout frames are being pushed.
            idle_timeout_frames: A tuple with the frames that should set the event when received
        """
        super().__init__()
        self._idle_event = idle_event
        self._idle_timeout_frames = idle_timeout_frames

    async def on_push_frame(self, data: FramePushed):
        """Callback executed when a frame is pushed in the pipeline.

        Args:
            data: The frame push event data.
        """
        # No per-frame bookkeeping is kept on purpose: remembering the id of
        # every pushed frame grows without bound for the lifetime of the
        # task, and setting an already-set event is a no-op anyway.
        frame = data.frame
        if isinstance(frame, StartFrame) or isinstance(frame, self._idle_timeout_frames):
            self._idle_event.set()
|
||||
|
||||
|
||||
class PipelineParams(BaseModel):
|
||||
"""Configuration parameters for pipeline execution.
|
||||
|
||||
@@ -215,7 +254,6 @@ class PipelineTask(BasePipelineTask):
|
||||
self._conversation_id = conversation_id
|
||||
self._enable_tracing = enable_tracing and is_tracing_available()
|
||||
self._enable_turn_tracking = enable_turn_tracking
|
||||
self._idle_timeout_frames = idle_timeout_frames
|
||||
self._idle_timeout_secs = idle_timeout_secs
|
||||
if self._params.observers:
|
||||
import warnings
|
||||
@@ -250,16 +288,24 @@ class PipelineTask(BasePipelineTask):
|
||||
# This queue is the queue used to push frames to the pipeline.
|
||||
self._push_queue = asyncio.Queue()
|
||||
self._process_push_task: Optional[asyncio.Task] = None
|
||||
|
||||
# This is the heartbeat queue. When a heartbeat frame is received in the
|
||||
# down queue we add it to the heartbeat queue for processing.
|
||||
self._heartbeat_queue = asyncio.Queue()
|
||||
self._heartbeat_push_task: Optional[asyncio.Task] = None
|
||||
self._heartbeat_monitor_task: Optional[asyncio.Task] = None
|
||||
# This is the idle queue. When frames are received downstream they are
|
||||
# put in the queue. If no frame is received the pipeline is considered
|
||||
# idle.
|
||||
self._idle_queue = asyncio.Queue()
|
||||
|
||||
# This is the idle event. When selected frames are pushed from any
|
||||
# processor we consider the pipeline is not idle. We use an observer
|
||||
# which will be listening any part of the pipeline.
|
||||
self._idle_event = asyncio.Event()
|
||||
self._idle_monitor_task: Optional[asyncio.Task] = None
|
||||
if self._idle_timeout_secs:
|
||||
idle_frame_observer = IdleFrameObserver(
|
||||
idle_event=self._idle_event,
|
||||
idle_timeout_frames=idle_timeout_frames,
|
||||
)
|
||||
observers.append(idle_frame_observer)
|
||||
|
||||
# This event is used to indicate the StartFrame has been received at the
|
||||
# end of the pipeline.
|
||||
@@ -403,10 +449,14 @@ class PipelineTask(BasePipelineTask):
|
||||
logger.debug(f"Task {self} scheduled to stop when done")
|
||||
await self.queue_frame(EndFrame())
|
||||
|
||||
async def cancel(self):
|
||||
"""Request the running pipeline to cancel."""
|
||||
async def cancel(self, *, reason: Optional[str] = None):
|
||||
"""Request the running pipeline to cancel.
|
||||
|
||||
Args:
|
||||
reason: Optional reason to indicate why the pipeline is being cancelled.
|
||||
"""
|
||||
if not self._finished:
|
||||
await self._cancel()
|
||||
await self._cancel(reason=reason)
|
||||
|
||||
async def run(self, params: PipelineTaskParams):
|
||||
"""Start and manage the pipeline execution until completion or cancellation.
|
||||
@@ -470,12 +520,16 @@ class PipelineTask(BasePipelineTask):
|
||||
for frame in frames:
|
||||
await self.queue_frame(frame)
|
||||
|
||||
async def _cancel(self):
|
||||
"""Internal cancellation logic for the pipeline task."""
|
||||
async def _cancel(self, *, reason: Optional[str] = None):
|
||||
"""Internal cancellation logic for the pipeline task.
|
||||
|
||||
Args:
|
||||
reason: Optional reason to indicate why the pipeline is being cancelled.
|
||||
"""
|
||||
if not self._cancelled:
|
||||
logger.debug(f"Cancelling pipeline task {self}")
|
||||
self._cancelled = True
|
||||
await self.queue_frame(CancelFrame())
|
||||
await self.queue_frame(CancelFrame(reason=reason))
|
||||
|
||||
async def _create_tasks(self):
|
||||
"""Create and start all pipeline processing tasks."""
|
||||
@@ -530,7 +584,7 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _maybe_cancel_idle_task(self):
|
||||
"""Cancel idle monitoring task if it is running."""
|
||||
if self._idle_timeout_secs and self._idle_monitor_task:
|
||||
if self._idle_monitor_task:
|
||||
await self._task_manager.cancel_task(self._idle_monitor_task)
|
||||
self._idle_monitor_task = None
|
||||
|
||||
@@ -590,6 +644,9 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
async def _setup(self, params: PipelineTaskParams):
|
||||
"""Set up the pipeline task and all processors."""
|
||||
# Load additional observers.
|
||||
await self._load_observer_files()
|
||||
|
||||
mgr_params = TaskManagerParams(loop=params.loop)
|
||||
self._task_manager.setup(mgr_params)
|
||||
|
||||
@@ -673,11 +730,11 @@ class PipelineTask(BasePipelineTask):
|
||||
if isinstance(frame, EndTaskFrame):
|
||||
# Tell the task we should end nicely.
|
||||
logger.debug(f"{self}: received end task frame {frame}")
|
||||
await self.queue_frame(EndFrame())
|
||||
await self.queue_frame(EndFrame(reason=frame.reason))
|
||||
elif isinstance(frame, CancelTaskFrame):
|
||||
# Tell the task we should end right away.
|
||||
logger.debug(f"{self}: received cancel task frame {frame}")
|
||||
await self.queue_frame(CancelFrame())
|
||||
await self.queue_frame(CancelFrame(reason=frame.reason))
|
||||
elif isinstance(frame, StopTaskFrame):
|
||||
# Tell the task we should stop nicely.
|
||||
logger.debug(f"{self}: received stop task frame {frame}")
|
||||
@@ -706,10 +763,6 @@ class PipelineTask(BasePipelineTask):
|
||||
processors have handled the EndFrame and therefore we can exit the task
|
||||
cleanly.
|
||||
"""
|
||||
# Queue received frame to the idle queue so we can monitor idle
|
||||
# pipelines.
|
||||
await self._idle_queue.put(frame)
|
||||
|
||||
if isinstance(frame, self._reached_downstream_types):
|
||||
await self._call_event_handler("on_frame_reached_downstream", frame)
|
||||
|
||||
@@ -772,33 +825,10 @@ class PipelineTask(BasePipelineTask):
|
||||
Note: Heartbeats are excluded from idle detection.
|
||||
"""
|
||||
running = True
|
||||
last_frame_time = 0
|
||||
|
||||
while running:
|
||||
try:
|
||||
frame = await asyncio.wait_for(
|
||||
self._idle_queue.get(), timeout=self._idle_timeout_secs
|
||||
)
|
||||
|
||||
if isinstance(frame, StartFrame) or isinstance(frame, self._idle_timeout_frames):
|
||||
# If we find a StartFrame or one of the frames that prevents a
|
||||
# time out we update the time.
|
||||
last_frame_time = time.time()
|
||||
else:
|
||||
# If we find any other frame we check if the pipeline is
|
||||
# idle by checking the last time we received one of the
|
||||
# valid frames.
|
||||
diff_time = time.time() - last_frame_time
|
||||
if diff_time >= self._idle_timeout_secs:
|
||||
running = await self._idle_timeout_detected()
|
||||
# Reset `last_frame_time` so we don't trigger another
|
||||
# immediate idle timeout if we are not cancelling. For
|
||||
# example, we might want to force the bot to say goodbye
|
||||
# and then clean nicely with an `EndFrame`.
|
||||
last_frame_time = time.time()
|
||||
|
||||
self._idle_queue.task_done()
|
||||
|
||||
await asyncio.wait_for(self._idle_event.wait(), timeout=self._idle_timeout_secs)
|
||||
self._idle_event.clear()
|
||||
except asyncio.TimeoutError:
|
||||
running = await self._idle_timeout_detected()
|
||||
|
||||
@@ -810,7 +840,7 @@ class PipelineTask(BasePipelineTask):
|
||||
"""
|
||||
# If we are cancelling, just exit the task.
|
||||
if self._cancelled:
|
||||
return True
|
||||
return False
|
||||
|
||||
logger.warning("Idle timeout detected.")
|
||||
await self._call_event_handler("on_idle_timeout")
|
||||
@@ -820,6 +850,27 @@ class PipelineTask(BasePipelineTask):
|
||||
return False
|
||||
return True
|
||||
|
||||
async def _load_observer_files(self):
|
||||
observer_files = os.environ.get("PIPECAT_OBSERVER_FILES", "").split(":")
|
||||
for f in observer_files:
|
||||
try:
|
||||
path = Path(f).resolve()
|
||||
module_name = path.stem
|
||||
spec = importlib.util.spec_from_file_location(module_name, str(path))
|
||||
if spec:
|
||||
logger.debug(f"{self} loading observers from {path}")
|
||||
|
||||
# Load module.
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
|
||||
# Create observers.
|
||||
observers = await module.create_observers(self)
|
||||
for observer in observers:
|
||||
self.add_observer(observer)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error loading external observers from {f}: {e}")
|
||||
|
||||
def _print_dangling_tasks(self):
|
||||
"""Log any dangling tasks that haven't been properly cleaned up."""
|
||||
tasks = [t.get_name() for t in self._task_manager.current_tasks()]
|
||||
|
||||
@@ -129,7 +129,7 @@ class TaskObserver(BaseObserver):
|
||||
for proxy in self._proxies:
|
||||
await proxy.cleanup()
|
||||
|
||||
async def on_process_frame(self, data: FramePushed):
|
||||
async def on_process_frame(self, data: FrameProcessed):
|
||||
"""Queue frame data for all managed observers.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -15,8 +15,8 @@ service-specific adapter.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import copy
|
||||
import io
|
||||
import wave
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
|
||||
|
||||
@@ -29,7 +29,7 @@ from openai.types.chat import (
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.adapters.schemas.tools_schema import AdapterType, ToolsSchema
|
||||
from pipecat.frames.frames import AudioRawFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -83,9 +83,17 @@ class LLMContext:
|
||||
Returns:
|
||||
New LLMContext instance with converted messages and settings.
|
||||
"""
|
||||
# Convert tools to ToolsSchema if needed.
|
||||
# If the tools are already a ToolsSchema, this is a no-op.
|
||||
# Otherwise, we wrap them in a shim ToolsSchema.
|
||||
converted_tools = openai_context.tools
|
||||
if isinstance(converted_tools, list):
|
||||
converted_tools = ToolsSchema(
|
||||
standard_tools=[], custom_tools={AdapterType.SHIM: converted_tools}
|
||||
)
|
||||
return LLMContext(
|
||||
messages=openai_context.get_messages(),
|
||||
tools=openai_context.tools,
|
||||
tools=converted_tools,
|
||||
tool_choice=openai_context.tool_choice,
|
||||
)
|
||||
|
||||
@@ -106,6 +114,89 @@ class LLMContext:
|
||||
self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
|
||||
self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
|
||||
|
||||
@staticmethod
def create_image_url_message(
    *,
    role: str = "user",
    url: str,
    text: Optional[str] = None,
) -> LLMContextMessage:
    """Build a context message that references an image by URL.

    Args:
        role: The role of this message (defaults to "user").
        url: The URL of the image.
        text: Optional text to include with the image. When given (and
            non-empty) it is placed before the image part.

    Returns:
        A message dict with the given role and a list of content parts.
    """
    image_part = {"type": "image_url", "image_url": {"url": url}}

    if text:
        parts = [{"type": "text", "text": text}, image_part]
    else:
        parts = [image_part]

    return {"role": role, "content": parts}
|
||||
|
||||
@staticmethod
def create_image_message(
    *,
    role: str = "user",
    format: str,
    size: tuple[int, int],
    image: bytes,
    text: Optional[str] = None,
) -> LLMContextMessage:
    """Create a context message containing an image.

    The raw image is encoded as JPEG and embedded in the message as a
    base64 ``data:`` URL.

    Args:
        role: The role of this message (defaults to "user").
        format: Image format (e.g., 'RGB', 'RGBA').
        size: Image dimensions as (width, height) tuple.
        image: Raw image bytes.
        text: Optional text to include with the image.

    Returns:
        A message dict with the given role, holding the optional text part
        followed by the image part.
    """
    pil_image = Image.frombytes(format, size, image)

    # JPEG cannot store an alpha channel: saving e.g. an 'RGBA' image raises
    # OSError. Modes the JPEG writer does not accept are converted to RGB
    # first; modes it already accepts are left untouched.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_writable_modes:
        pil_image = pil_image.convert("RGB")

    buffer = io.BytesIO()
    pil_image.save(buffer, format="JPEG")
    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    url = f"data:image/jpeg;base64,{encoded_image}"

    return LLMContext.create_image_url_message(role=role, url=url, text=text)
|
||||
|
||||
@staticmethod
|
||||
def create_audio_message(
|
||||
*, role: str = "user", audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
) -> LLMContextMessage:
|
||||
"""Create a context message containing audio.
|
||||
|
||||
Args:
|
||||
role: The role of this message (defaults to "user").
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
|
||||
with io.BytesIO() as buffer:
|
||||
with wave.open(buffer, "wb") as wf:
|
||||
wf.setsampwidth(2)
|
||||
wf.setnchannels(num_channels)
|
||||
wf.setframerate(sample_rate)
|
||||
wf.writeframes(data)
|
||||
|
||||
encoded_audio = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
content.append(
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": encoded_audio, "format": "wav"},
|
||||
}
|
||||
)
|
||||
|
||||
return {"role": role, "content": content}
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
"""Get the current messages list.
|
||||
@@ -119,6 +210,33 @@ class LLMContext:
|
||||
"""
|
||||
return self.get_messages()
|
||||
|
||||
def get_messages_for_persistent_storage(self) -> List[LLMContextMessage]:
    """Return the conversation messages (deprecated alias of ``get_messages()``).

    NOTE: this method only exists because some services are "silently"
    moving from OpenAILLMContext to LLMContext under the hood. It keeps
    users who relied on this part of OpenAILLMContext's public API working;
    LLMContext itself has no need for it.

    .. deprecated::
        Use `get_messages()` instead.

    Returns:
        List of conversation messages.
    """
    import warnings

    notice = "get_messages_for_persistent_storage() is deprecated, use get_messages() instead."

    # Force the warning to be shown even if deprecation warnings are filtered out.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn(notice, DeprecationWarning, stacklevel=2)

    messages = self.get_messages()
    return messages
|
||||
|
||||
def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
|
||||
"""Get the current messages list.
|
||||
|
||||
@@ -204,7 +322,7 @@ class LLMContext:
|
||||
self._tool_choice = tool_choice
|
||||
|
||||
def add_image_frame_message(
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: str = None
|
||||
self, *, format: str, size: tuple[int, int], image: bytes, text: Optional[str] = None
|
||||
):
|
||||
"""Add a message containing an image frame.
|
||||
|
||||
@@ -214,17 +332,8 @@ class LLMContext:
|
||||
image: Raw image bytes.
|
||||
text: Optional text to include with the image.
|
||||
"""
|
||||
buffer = io.BytesIO()
|
||||
Image.frombytes(format, size, image).save(buffer, format="JPEG")
|
||||
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
content = []
|
||||
if text:
|
||||
content.append({"type": "text", "text": text})
|
||||
content.append(
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
message = LLMContext.create_image_message(format=format, size=size, image=image, text=text)
|
||||
self.add_message(message)
|
||||
|
||||
def add_audio_frames_message(
|
||||
self, *, audio_frames: list[AudioRawFrame], text: str = "Audio follows"
|
||||
@@ -235,66 +344,8 @@ class LLMContext:
|
||||
audio_frames: List of audio frame objects to include.
|
||||
text: Optional text to include with the audio.
|
||||
"""
|
||||
if not audio_frames:
|
||||
return
|
||||
|
||||
sample_rate = audio_frames[0].sample_rate
|
||||
num_channels = audio_frames[0].num_channels
|
||||
|
||||
content = []
|
||||
content.append({"type": "text", "text": text})
|
||||
data = b"".join(frame.audio for frame in audio_frames)
|
||||
data = bytes(
|
||||
self._create_wav_header(
|
||||
sample_rate,
|
||||
num_channels,
|
||||
16,
|
||||
len(data),
|
||||
)
|
||||
+ data
|
||||
)
|
||||
encoded_audio = base64.b64encode(data).decode("utf-8")
|
||||
content.append(
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": encoded_audio, "format": "wav"},
|
||||
}
|
||||
)
|
||||
self.add_message({"role": "user", "content": content})
|
||||
|
||||
def _create_wav_header(self, sample_rate, num_channels, bits_per_sample, data_size):
|
||||
"""Create a WAV file header for audio data.
|
||||
|
||||
Args:
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
num_channels: Number of audio channels.
|
||||
bits_per_sample: Bits per audio sample.
|
||||
data_size: Size of audio data in bytes.
|
||||
|
||||
Returns:
|
||||
WAV header as a bytearray.
|
||||
"""
|
||||
# RIFF chunk descriptor
|
||||
header = bytearray()
|
||||
header.extend(b"RIFF") # ChunkID
|
||||
header.extend((data_size + 36).to_bytes(4, "little")) # ChunkSize: total size - 8
|
||||
header.extend(b"WAVE") # Format
|
||||
# "fmt " sub-chunk
|
||||
header.extend(b"fmt ") # Subchunk1ID
|
||||
header.extend((16).to_bytes(4, "little")) # Subchunk1Size (16 for PCM)
|
||||
header.extend((1).to_bytes(2, "little")) # AudioFormat (1 for PCM)
|
||||
header.extend(num_channels.to_bytes(2, "little")) # NumChannels
|
||||
header.extend(sample_rate.to_bytes(4, "little")) # SampleRate
|
||||
# Calculate byte rate and block align
|
||||
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
|
||||
block_align = num_channels * (bits_per_sample // 8)
|
||||
header.extend(byte_rate.to_bytes(4, "little")) # ByteRate
|
||||
header.extend(block_align.to_bytes(2, "little")) # BlockAlign
|
||||
header.extend(bits_per_sample.to_bytes(2, "little")) # BitsPerSample
|
||||
# "data" sub-chunk
|
||||
header.extend(b"data") # Subchunk2ID
|
||||
header.extend(data_size.to_bytes(4, "little")) # Subchunk2Size
|
||||
return header
|
||||
message = LLMContext.create_audio_message(audio_frames=audio_frames, text=text)
|
||||
self.add_message(message)
|
||||
|
||||
@staticmethod
|
||||
def _normalize_and_validate_tools(tools: ToolsSchema | NotGiven) -> ToolsSchema | NotGiven:
|
||||
|
||||
@@ -89,7 +89,9 @@ class LLMAssistantAggregatorParams:
|
||||
|
||||
Parameters:
|
||||
expect_stripped_words: Whether to expect and handle stripped words
|
||||
in text frames by adding spaces between tokens.
|
||||
in text frames by adding spaces between tokens. This parameter is
|
||||
ignored when used with the newer LLMAssistantAggregator, which
|
||||
handles word spacing automatically.
|
||||
"""
|
||||
|
||||
expect_stripped_words: bool = True
|
||||
|
||||
@@ -13,6 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import warnings
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Literal, Optional, Set
|
||||
|
||||
@@ -65,6 +66,7 @@ from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -88,7 +90,7 @@ class LLMContextAggregator(FrameProcessor):
|
||||
self._context = context
|
||||
self._role = role
|
||||
|
||||
self._aggregation: str = ""
|
||||
self._aggregation: List[str] = []
|
||||
|
||||
@property
|
||||
def messages(self) -> List[LLMContextMessage]:
|
||||
@@ -168,13 +170,21 @@ class LLMContextAggregator(FrameProcessor):
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state."""
|
||||
self._aggregation = ""
|
||||
self._aggregation = []
|
||||
|
||||
@abstractmethod
async def push_aggregation(self):
    """Push the current aggregation downstream.

    Abstract hook: this base implementation does nothing and returns None.
    """
|
||||
|
||||
def aggregation_string(self) -> str:
    """Return the aggregated text fragments joined into a single string.

    Returns:
        The result of ``concatenate_aggregated_text`` applied to the
        current aggregation.
    """
    fragments = self._aggregation
    return concatenate_aggregated_text(fragments)
|
||||
|
||||
|
||||
class LLMUserAggregator(LLMContextAggregator):
|
||||
"""User LLM aggregator that processes speech-to-text transcriptions.
|
||||
@@ -212,8 +222,6 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
self._turn_params: Optional[SmartTurnParams] = None
|
||||
|
||||
if "aggregation_timeout" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
@@ -290,6 +298,12 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
await self._handle_llm_messages_update(frame)
|
||||
elif isinstance(frame, LLMSetToolsFrame):
|
||||
self.set_tools(frame.tools)
|
||||
# Push the LLMSetToolsFrame as well, since speech-to-speech LLM
|
||||
# services (like OpenAI Realtime) may need to know about tool
|
||||
# changes; unlike text-based LLM services they won't just "pick up
|
||||
# the change" on the next LLM run, as the LLM is continuously
|
||||
# running.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMSetToolChoiceFrame):
|
||||
self.set_tool_choice(frame.tool_choice)
|
||||
elif isinstance(frame, SpeechControlParamsFrame):
|
||||
@@ -301,7 +315,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
|
||||
async def _process_aggregation(self):
|
||||
"""Process the current aggregation and push it downstream."""
|
||||
aggregation = self._aggregation
|
||||
aggregation = self.aggregation_string()
|
||||
await self.reset()
|
||||
self._context.add_message({"role": self.role, "content": aggregation})
|
||||
frame = LLMContextFrame(self._context)
|
||||
@@ -349,7 +363,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
"""
|
||||
|
||||
async def should_interrupt(strategy: BaseInterruptionStrategy):
|
||||
await strategy.append_text(self._aggregation)
|
||||
await strategy.append_text(self.aggregation_string())
|
||||
return await strategy.should_interrupt()
|
||||
|
||||
return any([await should_interrupt(s) for s in self._interruption_strategies])
|
||||
@@ -419,7 +433,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
self._aggregation += f" {text}" if self._aggregation else text
|
||||
self._aggregation.append(text)
|
||||
# We just got a final result, so let's reset interim results.
|
||||
self._seen_interim_results = False
|
||||
# Reset aggregation timer.
|
||||
@@ -544,23 +558,31 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
Args:
|
||||
context: The OpenAI LLM context for conversation storage.
|
||||
params: Configuration parameters for aggregation behavior.
|
||||
**kwargs: Additional arguments. Supports deprecated 'expect_stripped_words'.
|
||||
**kwargs: Additional arguments.
|
||||
"""
|
||||
super().__init__(context=context, role="assistant", **kwargs)
|
||||
self._params = params or LLMAssistantAggregatorParams()
|
||||
|
||||
if "expect_stripped_words" in kwargs:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter 'expect_stripped_words' is deprecated, use 'params' instead.",
|
||||
"Parameter 'expect_stripped_words' is deprecated. "
|
||||
"LLMAssistantAggregator now handles word spacing automatically.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._params.expect_stripped_words = kwargs["expect_stripped_words"]
|
||||
|
||||
if params and not params.expect_stripped_words:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"params.expect_stripped_words is deprecated. "
|
||||
"LLMAssistantAggregator now handles word spacing automatically.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._started = 0
|
||||
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
|
||||
self._context_updated_tasks: Set[asyncio.Task] = set()
|
||||
@@ -610,7 +632,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_function_call_result(frame)
|
||||
elif isinstance(frame, FunctionCallCancelFrame):
|
||||
await self._handle_function_call_cancel(frame)
|
||||
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
||||
elif isinstance(frame, UserImageRawFrame):
|
||||
await self._handle_user_image_frame(frame)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self.push_aggregation()
|
||||
@@ -623,7 +645,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
if not self._aggregation:
|
||||
return
|
||||
|
||||
aggregation = self._aggregation.strip()
|
||||
aggregation = self.aggregation_string()
|
||||
await self.reset()
|
||||
|
||||
if aggregation:
|
||||
@@ -761,27 +783,16 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
message["content"] = result
|
||||
|
||||
async def _handle_user_image_frame(self, frame: UserImageRawFrame):
|
||||
logger.debug(
|
||||
f"{self} UserImageRawFrame: [{frame.request.function_name}:{frame.request.tool_call_id}]"
|
||||
)
|
||||
|
||||
if frame.request.tool_call_id not in self._function_calls_in_progress:
|
||||
logger.warning(
|
||||
f"UserImageRawFrame tool_call_id [{frame.request.tool_call_id}] is not running"
|
||||
)
|
||||
if not frame.append_to_context:
|
||||
return
|
||||
|
||||
del self._function_calls_in_progress[frame.request.tool_call_id]
|
||||
logger.debug(f"{self} Appending UserImageRawFrame to LLM context (size: {frame.size})")
|
||||
|
||||
# Update context with the image frame
|
||||
self._update_function_call_result(
|
||||
frame.request.function_name, frame.request.tool_call_id, "COMPLETED"
|
||||
)
|
||||
self._context.add_image_frame_message(
|
||||
format=frame.format,
|
||||
size=frame.size,
|
||||
image=frame.image,
|
||||
text=frame.request.context,
|
||||
text=frame.text,
|
||||
)
|
||||
|
||||
await self.push_aggregation()
|
||||
@@ -798,10 +809,11 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
if self._params.expect_stripped_words:
|
||||
self._aggregation += f" {frame.text}" if self._aggregation else frame.text
|
||||
else:
|
||||
self._aggregation += frame.text
|
||||
# Make sure we really have text (spaces count, too!)
|
||||
if len(frame.text) == 0:
|
||||
return
|
||||
|
||||
self._aggregation.append(frame.text)
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
|
||||
@@ -27,11 +27,24 @@ class UserResponseAggregator(LLMUserAggregator):
|
||||
def __init__(self, **kwargs):
    """Create the aggregator with a fresh, empty ``LLMContext``.

    .. deprecated:: 0.0.92
        `UserResponseAggregator` is deprecated and will be removed in a future version.

    Args:
        **kwargs: Additional arguments passed to parent LLMUserAggregator.
    """
    import warnings

    fresh_context = LLMContext()
    super().__init__(context=fresh_context, **kwargs)

    notice = "`UserResponseAggregator` is deprecated and will be removed in a future version."

    # Force the warning to be shown even if deprecation warnings are filtered out.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn(notice, DeprecationWarning, stacklevel=2)
|
||||
|
||||
async def push_aggregation(self):
|
||||
"""Push the aggregated user response as a TextFrame.
|
||||
|
||||
|
||||
@@ -27,7 +27,6 @@ from pipecat.frames.frames import (
|
||||
InterimTranscriptionFrame,
|
||||
InterruptionFrame,
|
||||
StartFrame,
|
||||
STTMuteFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
@@ -118,24 +117,16 @@ class STTMuteFilter(FrameProcessor):
|
||||
self._first_speech_handled = False
|
||||
self._bot_is_speaking = False
|
||||
self._function_call_in_progress = False
|
||||
self._is_muted = False # Initialize as unmuted, will set state on StartFrame if needed
|
||||
|
||||
@property
|
||||
def is_muted(self) -> bool:
|
||||
"""Check if STT is currently muted.
|
||||
|
||||
Returns:
|
||||
True if STT is currently muted and audio frames are being suppressed.
|
||||
"""
|
||||
return self._is_muted
|
||||
self._is_muted = False
|
||||
|
||||
async def _handle_mute_state(self, should_mute: bool):
|
||||
"""Handle STT muting and interruption control state changes."""
|
||||
if should_mute != self.is_muted:
|
||||
if should_mute != self._is_muted:
|
||||
logger.debug(f"STTMuteFilter {'muting' if should_mute else 'unmuting'}")
|
||||
self._is_muted = should_mute
|
||||
await self.push_frame(STTMuteFrame(mute=should_mute), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(STTMuteFrame(mute=should_mute), FrameDirection.DOWNSTREAM)
|
||||
# Note: We don't send STTMuteFrame to the STT service itself.
|
||||
# The filter blocks frames locally, but the STT service continues
|
||||
# processing audio to keep streaming connections alive (e.g., Google STT).
|
||||
|
||||
async def _should_mute(self) -> bool:
|
||||
"""Determine if STT should be muted based on current state and strategies."""
|
||||
@@ -215,7 +206,7 @@ class STTMuteFilter(FrameProcessor):
|
||||
),
|
||||
):
|
||||
# Only pass VAD-related frames when not muted
|
||||
if not self.is_muted:
|
||||
if not self._is_muted:
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
logger.trace(f"{frame.__class__.__name__} suppressed - STT currently muted")
|
||||
@@ -224,5 +215,5 @@ class STTMuteFilter(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
# Finally handle mute state change if needed
|
||||
if should_mute is not None and should_mute != self.is_muted:
|
||||
if should_mute is not None and should_mute != self._is_muted:
|
||||
await self._handle_mute_state(should_mute)
|
||||
|
||||
@@ -14,7 +14,7 @@ management, and frame flow control mechanisms.
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence, Tuple
|
||||
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence, Tuple, Type
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -83,12 +83,7 @@ class FrameProcessorQueue(asyncio.PriorityQueue):
|
||||
LOW_PRIORITY = 2
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the FrameProcessorQueue.
|
||||
|
||||
Args:
|
||||
manager (BaseTaskManager): The task manager used by the internal watchdog queues.
|
||||
|
||||
"""
|
||||
"""Initialize the FrameProcessorQueue."""
|
||||
super().__init__()
|
||||
self.__high_counter = 0
|
||||
self.__low_counter = 0
|
||||
@@ -689,6 +684,19 @@ class FrameProcessor(BaseObject):
|
||||
|
||||
self._wait_for_interruption = False
|
||||
|
||||
async def broadcast_frame(self, frame_cls: Type[Frame], **kwargs):
|
||||
"""Broadcasts a frame of the specified class upstream and downstream.
|
||||
|
||||
This method creates two instances of the given frame class using the
|
||||
provided keyword arguments and pushes them upstream and downstream.
|
||||
|
||||
Args:
|
||||
frame_cls: The class of the frame to be broadcasted.
|
||||
**kwargs: Keyword arguments to be passed to the frame's constructor.
|
||||
"""
|
||||
await self.push_frame(frame_cls(**kwargs))
|
||||
await self.push_frame(frame_cls(**kwargs), FrameDirection.UPSTREAM)
|
||||
|
||||
async def __start(self, frame: StartFrame):
|
||||
"""Handle the start frame to initialize processor state.
|
||||
|
||||
|
||||
@@ -1314,7 +1314,11 @@ class RTVIProcessor(FrameProcessor):
|
||||
async def set_bot_ready(self):
|
||||
"""Mark the bot as ready and send the bot-ready message."""
|
||||
self._bot_ready = True
|
||||
await self._update_config(self._config, False)
|
||||
# Only call the (deprecated) _update_config method if the we're using a
|
||||
# config (which is deprecated). Otherwise we'd always print an
|
||||
# unnecessary deprecation warning.
|
||||
if self._config.config:
|
||||
await self._update_config(self._config, False)
|
||||
await self._send_bot_ready()
|
||||
|
||||
async def interrupt_bot(self):
|
||||
|
||||
@@ -26,6 +26,7 @@ from pipecat.frames.frames import (
|
||||
TTSTextFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.string import concatenate_aggregated_text
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
@@ -140,29 +141,7 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
Result: "Hello there how are you"
|
||||
"""
|
||||
if self._current_text_parts and self._aggregation_start_time:
|
||||
# Check specifically for space characters, previously isspace() was used
|
||||
# but that includes all whitespace characters (e.g. \n), not just spaces.
|
||||
has_leading_spaces = any(
|
||||
part and part[0] == " " for part in self._current_text_parts[1:]
|
||||
)
|
||||
has_trailing_spaces = any(
|
||||
part and part[-1] == " " for part in self._current_text_parts[:-1]
|
||||
)
|
||||
|
||||
# If there are embedded spaces in the fragments, use direct concatenation
|
||||
contains_spacing_between_fragments = has_leading_spaces or has_trailing_spaces
|
||||
|
||||
# Apply corresponding joining method
|
||||
if contains_spacing_between_fragments:
|
||||
# Fragments already have spacing - just concatenate
|
||||
content = "".join(self._current_text_parts)
|
||||
else:
|
||||
# Word-by-word fragments - join with spaces
|
||||
content = " ".join(self._current_text_parts)
|
||||
|
||||
# Clean up any excessive whitespace
|
||||
content = content.strip()
|
||||
|
||||
content = concatenate_aggregated_text(self._current_text_parts)
|
||||
if content:
|
||||
logger.trace(f"Emitting aggregated assistant message: {content}")
|
||||
message = TranscriptionMessage(
|
||||
|
||||
@@ -44,6 +44,8 @@ from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.transports.daily.utils import (
|
||||
DailyMeetingTokenParams,
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRESTHelper,
|
||||
DailyRoomParams,
|
||||
DailyRoomProperties,
|
||||
@@ -84,6 +86,7 @@ async def configure(
|
||||
sip_num_endpoints: Optional[int] = 1,
|
||||
sip_codecs: Optional[Dict[str, List[str]]] = None,
|
||||
room_properties: Optional[DailyRoomProperties] = None,
|
||||
token_properties: Optional["DailyMeetingTokenProperties"] = None,
|
||||
) -> DailyRoomConfig:
|
||||
"""Configure Daily room URL and token with optional SIP capabilities.
|
||||
|
||||
@@ -106,6 +109,9 @@ async def configure(
|
||||
individual parameters. When provided, this overrides room_exp_duration and
|
||||
SIP-related parameters. If not provided, properties are built from the
|
||||
individual parameters as before.
|
||||
token_properties: Optional DailyMeetingTokenProperties to customize the meeting
|
||||
token. When provided, these properties are passed to the token creation API.
|
||||
Note that room_name, exp, and is_owner will be set automatically.
|
||||
|
||||
Returns:
|
||||
DailyRoomConfig: Object with room_url, token, and optional sip_endpoint.
|
||||
@@ -179,7 +185,10 @@ async def configure(
|
||||
|
||||
# Create token and return standard format
|
||||
expiry_time: float = token_exp_duration * 60 * 60
|
||||
token = await daily_rest_helper.get_token(room_url, expiry_time)
|
||||
token_params = None
|
||||
if token_properties:
|
||||
token_params = DailyMeetingTokenParams(properties=token_properties)
|
||||
token = await daily_rest_helper.get_token(room_url, expiry_time, params=token_params)
|
||||
return DailyRoomConfig(room_url=room_url, token=token)
|
||||
|
||||
# Create a new room
|
||||
@@ -221,7 +230,12 @@ async def configure(
|
||||
|
||||
# Create meeting token
|
||||
token_expiry_seconds = token_exp_duration * 60 * 60
|
||||
token = await daily_rest_helper.get_token(room_url, token_expiry_seconds)
|
||||
token_params = None
|
||||
if token_properties:
|
||||
token_params = DailyMeetingTokenParams(properties=token_properties)
|
||||
token = await daily_rest_helper.get_token(
|
||||
room_url, token_expiry_seconds, params=token_params
|
||||
)
|
||||
|
||||
if sip_enabled:
|
||||
# Return SIP configuration object
|
||||
|
||||
@@ -74,7 +74,7 @@ import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from http import HTTPMethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
from typing import Any, Dict, List, Optional, TypedDict, Union
|
||||
|
||||
import aiohttp
|
||||
from fastapi.responses import FileResponse, Response
|
||||
@@ -205,7 +205,7 @@ def _setup_webrtc_routes(
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import IceServer, SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
IceCandidate,
|
||||
SmallWebRTCPatchRequest,
|
||||
@@ -216,6 +216,9 @@ def _setup_webrtc_routes(
|
||||
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
||||
return
|
||||
|
||||
class IceServer(TypedDict, total=False):
|
||||
urls: Union[str, List[str]]
|
||||
|
||||
class IceConfig(TypedDict):
|
||||
iceServers: List[IceServer]
|
||||
|
||||
@@ -555,6 +558,7 @@ def _setup_daily_routes(app: FastAPI):
|
||||
{
|
||||
"createDailyRoom": true,
|
||||
"dailyRoomProperties": { "start_video_off": true },
|
||||
"dailyMeetingTokenProperties": { "is_owner": true, "user_name": "Bot" },
|
||||
"body": { "custom_data": "value" }
|
||||
}
|
||||
"""
|
||||
@@ -570,6 +574,8 @@ def _setup_daily_routes(app: FastAPI):
|
||||
|
||||
create_daily_room = request_data.get("createDailyRoom", False)
|
||||
body = request_data.get("body", {})
|
||||
daily_room_properties_dict = request_data.get("dailyRoomProperties", None)
|
||||
daily_token_properties_dict = request_data.get("dailyMeetingTokenProperties", None)
|
||||
|
||||
bot_module = _get_bot_module()
|
||||
|
||||
@@ -584,9 +590,37 @@ def _setup_daily_routes(app: FastAPI):
|
||||
import aiohttp
|
||||
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.transports.daily.utils import (
|
||||
DailyMeetingTokenProperties,
|
||||
DailyRoomProperties,
|
||||
)
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
room_url, token = await configure(session)
|
||||
# Parse dailyRoomProperties if provided
|
||||
room_properties = None
|
||||
if daily_room_properties_dict:
|
||||
try:
|
||||
room_properties = DailyRoomProperties(**daily_room_properties_dict)
|
||||
logger.debug(f"Using custom room properties: {room_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyRoomProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
# Parse dailyMeetingTokenProperties if provided
|
||||
token_properties = None
|
||||
if daily_token_properties_dict:
|
||||
try:
|
||||
token_properties = DailyMeetingTokenProperties(
|
||||
**daily_token_properties_dict
|
||||
)
|
||||
logger.debug(f"Using custom token properties: {token_properties}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse dailyMeetingTokenProperties: {e}")
|
||||
# Continue without custom properties
|
||||
|
||||
room_url, token = await configure(
|
||||
session, room_properties=room_properties, token_properties=token_properties
|
||||
)
|
||||
runner_args = DailyRunnerArguments(room_url=room_url, token=token, body=body)
|
||||
result = {
|
||||
"dailyRoom": room_url,
|
||||
@@ -794,10 +828,6 @@ def main():
|
||||
logger.error("For ESP32, you need to specify `--host IP` so we can do SDP munging.")
|
||||
return
|
||||
|
||||
if args.transport in TELEPHONY_TRANSPORTS and not args.proxy:
|
||||
logger.error(f"For telephony transports, you need to specify `--proxy PROXY`.")
|
||||
return
|
||||
|
||||
# Log level
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="TRACE" if args.verbose else "DEBUG")
|
||||
|
||||
@@ -216,6 +216,7 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
"account_sid": start_data.get("account_sid"),
|
||||
"from": start_data.get("from", ""),
|
||||
"to": start_data.get("to", ""),
|
||||
"custom_parameters": start_data.get("custom_parameters", ""),
|
||||
}
|
||||
|
||||
else:
|
||||
|
||||
@@ -493,6 +493,8 @@ class AnthropicLLMService(LLMService):
|
||||
elif isinstance(frame, LLMContextFrame):
|
||||
context = frame.context
|
||||
elif isinstance(frame, LLMMessagesFrame):
|
||||
# NOTE: LLMMessagesFrame is deprecated, so we don't support the newer universal
|
||||
# LLMContext with it
|
||||
context = AnthropicLLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
|
||||
@@ -28,7 +28,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
@@ -50,7 +50,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding Async language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.EN: "en",
|
||||
Language.FR: "fr",
|
||||
Language.ES: "es",
|
||||
@@ -58,17 +58,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
|
||||
Language.IT: "it",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
# Convert enum value to string and get the base language part (e.g. en-En -> en)
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
# Look up the base code in our supported languages
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
@@ -1167,6 +1167,8 @@ class AWSBedrockLLMService(LLMService):
|
||||
if isinstance(frame, LLMContextFrame):
|
||||
context = frame.context
|
||||
elif isinstance(frame, LLMMessagesFrame):
|
||||
# NOTE: LLMMessagesFrame is deprecated, so we don't support the newer universal
|
||||
# LLMContext with it
|
||||
context = AWSBedrockLLMContext.from_messages(frame.messages)
|
||||
elif isinstance(frame, LLMUpdateSettingsFrame):
|
||||
await self._update_settings(frame.settings)
|
||||
|
||||
@@ -11,36 +11,34 @@ including conversation history management and role-specific message processing.
|
||||
|
||||
.. deprecated:: 0.0.91
|
||||
AWS Nova Sonic no longer uses types from this module under the hood.
|
||||
It now uses `LLMContext` and `LLMContextAggregatorPair`.
|
||||
It now uses ``LLMContext`` and ``LLMContextAggregatorPair``.
|
||||
Using the new patterns should allow you to not need types from this module.
|
||||
|
||||
BEFORE:
|
||||
```
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
BEFORE::
|
||||
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
# Setup
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
```
|
||||
# Context frame type
|
||||
frame: OpenAILLMContextFrame
|
||||
|
||||
AFTER:
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
# Context type
|
||||
context: AWSNovaSonicLLMContext
|
||||
# or
|
||||
context: OpenAILLMContext
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
AFTER::
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
```
|
||||
# Setup
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
# Context frame type
|
||||
frame: LLMContextFrame
|
||||
|
||||
# Context type
|
||||
context: LLMContext
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
@@ -1163,7 +1163,8 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
"""Create context aggregator pair for managing conversation context.
|
||||
|
||||
NOTE: this method exists only for backward compatibility. New code
|
||||
should instead do:
|
||||
should instead do::
|
||||
|
||||
context = LLMContext(...)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.services.aws.utils import build_event_message, decode_event, get_presigned_url
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
@@ -326,7 +326,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
Returns:
|
||||
AWS Transcribe compatible language code, or None if unsupported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Afrikaans
|
||||
Language.AF: "af-ZA",
|
||||
Language.AF_ZA: "af-ZA",
|
||||
@@ -466,7 +466,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
Language.ZU_ZA: "zu-ZA",
|
||||
}
|
||||
|
||||
return language_map.get(language)
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
@traced_stt
|
||||
async def _handle_transcription(
|
||||
|
||||
@@ -26,7 +26,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
@@ -47,7 +47,7 @@ def language_to_aws_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding AWS Polly language code, or None if not supported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Arabic
|
||||
Language.AR: "arb",
|
||||
Language.AR_AE: "ar-AE",
|
||||
@@ -119,7 +119,7 @@ def language_to_aws_language(language: Language) -> Optional[str]:
|
||||
Language.CY_GB: "cy-GB",
|
||||
}
|
||||
|
||||
return language_map.get(language)
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
class AWSPollyTTSService(TTSService):
|
||||
|
||||
@@ -8,7 +8,9 @@
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pipecat.transcriptions.language import Language
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
|
||||
|
||||
def language_to_azure_language(language: Language) -> Optional[str]:
|
||||
@@ -20,7 +22,7 @@ def language_to_azure_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding Azure language code, or None if not supported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Afrikaans
|
||||
Language.AF: "af-ZA",
|
||||
Language.AF_ZA: "af-ZA",
|
||||
@@ -341,4 +343,4 @@ def language_to_azure_language(language: Language) -> Optional[str]:
|
||||
Language.ZU: "zu-ZA",
|
||||
Language.ZU_ZA: "zu-ZA",
|
||||
}
|
||||
return language_map.get(language)
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
@@ -38,7 +38,7 @@ class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
Args:
|
||||
api_key: The API key for the Azure OpenAI service.
|
||||
base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
|
||||
Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2025-04-01-preview&deployment=my-realtime-deployment"
|
||||
**kwargs: Additional arguments passed to parent OpenAIRealtimeLLMService.
|
||||
"""
|
||||
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
||||
@@ -52,7 +52,7 @@ class AzureRealtimeLLMService(OpenAIRealtimeLLMService):
|
||||
# handle disconnections in the send/recv code paths.
|
||||
return
|
||||
|
||||
logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
|
||||
logger.info(f"Connecting to {self.base_url}")
|
||||
self._websocket = await websocket_connect(
|
||||
uri=self.base_url,
|
||||
additional_headers={
|
||||
|
||||
@@ -28,7 +28,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
|
||||
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
@@ -48,6 +48,26 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GenerationConfig(BaseModel):
|
||||
"""Configuration for Cartesia Sonic-3 generation parameters.
|
||||
|
||||
Sonic-3 interprets these parameters as guidance to ensure natural speech.
|
||||
Test against your content for best results.
|
||||
|
||||
Parameters:
|
||||
volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
|
||||
speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
|
||||
emotion: Single emotion string to guide the emotional tone. Examples include neutral,
|
||||
angry, excited, content, sad, scared. Over 60 emotions are supported. For best
|
||||
results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
|
||||
and Marian.
|
||||
"""
|
||||
|
||||
volume: Optional[float] = None
|
||||
speed: Optional[float] = None
|
||||
emotion: Optional[str] = None
|
||||
|
||||
|
||||
def language_to_cartesia_language(language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Cartesia language code.
|
||||
|
||||
@@ -57,35 +77,52 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding Cartesia language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.AR: "ar",
|
||||
Language.BG: "bg",
|
||||
Language.BN: "bn",
|
||||
Language.CS: "cs",
|
||||
Language.DA: "da",
|
||||
Language.DE: "de",
|
||||
Language.EN: "en",
|
||||
Language.EL: "el",
|
||||
Language.ES: "es",
|
||||
Language.FI: "fi",
|
||||
Language.FR: "fr",
|
||||
Language.GU: "gu",
|
||||
Language.HE: "he",
|
||||
Language.HI: "hi",
|
||||
Language.HR: "hr",
|
||||
Language.HU: "hu",
|
||||
Language.ID: "id",
|
||||
Language.IT: "it",
|
||||
Language.JA: "ja",
|
||||
Language.KA: "ka",
|
||||
Language.KN: "kn",
|
||||
Language.KO: "ko",
|
||||
Language.ML: "ml",
|
||||
Language.MR: "mr",
|
||||
Language.MS: "ms",
|
||||
Language.NL: "nl",
|
||||
Language.NO: "no",
|
||||
Language.PA: "pa",
|
||||
Language.PL: "pl",
|
||||
Language.PT: "pt",
|
||||
Language.RO: "ro",
|
||||
Language.RU: "ru",
|
||||
Language.SK: "sk",
|
||||
Language.SV: "sv",
|
||||
Language.TA: "ta",
|
||||
Language.TE: "te",
|
||||
Language.TH: "th",
|
||||
Language.TL: "tl",
|
||||
Language.TR: "tr",
|
||||
Language.UK: "uk",
|
||||
Language.VI: "vi",
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
# Convert enum value to string and get the base language part (e.g. es-ES -> es)
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
# Look up the base code in our supported languages
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
class CartesiaTTSService(AudioContextWordTTSService):
|
||||
@@ -101,16 +138,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language to use for synthesis.
|
||||
speed: Voice speed control.
|
||||
emotion: List of emotion controls.
|
||||
speed: Voice speed control for non-Sonic-3 models (literal values).
|
||||
emotion: List of emotion controls for non-Sonic-3 models.
|
||||
|
||||
.. deprecated:: 0.0.68
|
||||
The `emotion` parameter is deprecated and will be removed in a future version.
|
||||
|
||||
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
||||
speed (numeric), and emotion (string) parameters.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
||||
emotion: Optional[List[str]] = []
|
||||
generation_config: Optional[GenerationConfig] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -119,7 +160,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
voice_id: str,
|
||||
cartesia_version: str = "2025-04-16",
|
||||
url: str = "wss://api.cartesia.ai/tts/websocket",
|
||||
model: str = "sonic-2",
|
||||
model: str = "sonic-3",
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "pcm_s16le",
|
||||
container: str = "raw",
|
||||
@@ -135,7 +176,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
cartesia_version: API version string for Cartesia service.
|
||||
url: WebSocket URL for Cartesia TTS API.
|
||||
model: TTS model to use (e.g., "sonic-2").
|
||||
model: TTS model to use (e.g., "sonic-3").
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
encoding: Audio encoding format.
|
||||
container: Audio container format.
|
||||
@@ -179,6 +220,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
else "en",
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
"generation_config": params.generation_config,
|
||||
}
|
||||
self.set_model_name(model)
|
||||
self.set_voice(voice_id)
|
||||
@@ -297,6 +339,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
if self._settings["speed"]:
|
||||
msg["speed"] = self._settings["speed"]
|
||||
|
||||
if self._settings["generation_config"]:
|
||||
msg["generation_config"] = self._settings["generation_config"].model_dump(
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
return json.dumps(msg)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
@@ -482,23 +529,27 @@ class CartesiaHttpTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language to use for synthesis.
|
||||
speed: Voice speed control.
|
||||
emotion: List of emotion controls.
|
||||
speed: Voice speed control for non-Sonic-3 models (literal values).
|
||||
emotion: List of emotion controls for non-Sonic-3 models.
|
||||
|
||||
.. deprecated:: 0.0.68
|
||||
The `emotion` parameter is deprecated and will be removed in a future version.
|
||||
|
||||
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
||||
speed (numeric), and emotion (string) parameters.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
||||
emotion: Optional[List[str]] = Field(default_factory=list)
|
||||
generation_config: Optional[GenerationConfig] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: str,
|
||||
model: str = "sonic-2",
|
||||
model: str = "sonic-3",
|
||||
base_url: str = "https://api.cartesia.ai",
|
||||
cartesia_version: str = "2024-11-13",
|
||||
sample_rate: Optional[int] = None,
|
||||
@@ -512,7 +563,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
Args:
|
||||
api_key: Cartesia API key for authentication.
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
model: TTS model to use (e.g., "sonic-2").
|
||||
model: TTS model to use (e.g., "sonic-3").
|
||||
base_url: Base URL for Cartesia HTTP API.
|
||||
cartesia_version: API version string for Cartesia service.
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
@@ -539,6 +590,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
else "en",
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
"generation_config": params.generation_config,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
self.set_model_name(model)
|
||||
@@ -632,6 +684,11 @@ class CartesiaHttpTTSService(TTSService):
|
||||
if self._settings["speed"]:
|
||||
payload["speed"] = self._settings["speed"]
|
||||
|
||||
if self._settings["generation_config"]:
|
||||
payload["generation_config"] = self._settings["generation_config"].model_dump(
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
session = await self._client._get_session()
|
||||
|
||||
@@ -156,6 +156,12 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
self._language = Language.EN
|
||||
self._websocket_url = None
|
||||
self._receive_task = None
|
||||
# Flux event handlers
|
||||
self._register_event_handler("on_start_of_turn")
|
||||
self._register_event_handler("on_turn_resumed")
|
||||
self._register_event_handler("on_end_of_turn")
|
||||
self._register_event_handler("on_eager_end_of_turn")
|
||||
self._register_event_handler("on_update")
|
||||
|
||||
async def _connect(self):
|
||||
"""Connect to WebSocket and start background tasks.
|
||||
@@ -520,9 +526,9 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
"""
|
||||
logger.debug("User started speaking")
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self.broadcast_frame(UserStartedSpeakingFrame)
|
||||
await self.start_metrics()
|
||||
await self._call_event_handler("on_start_of_turn", transcript)
|
||||
if transcript:
|
||||
logger.trace(f"Start of turn transcript: {transcript}")
|
||||
|
||||
@@ -537,6 +543,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
event: The event type string for logging purposes.
|
||||
"""
|
||||
logger.trace(f"Received event TurnResumed: {event}")
|
||||
await self._call_event_handler("on_turn_resumed")
|
||||
|
||||
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EndOfTurn events from Deepgram Flux.
|
||||
@@ -571,6 +578,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self._call_event_handler("on_end_of_turn", transcript)
|
||||
|
||||
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EagerEndOfTurn events from Deepgram Flux.
|
||||
@@ -615,6 +623,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
await self._call_event_handler("on_eager_end_of_turn", transcript)
|
||||
|
||||
async def _handle_update(self, transcript: str):
|
||||
"""Handle Update events from Deepgram Flux.
|
||||
@@ -638,3 +647,4 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.stop_ttfb_metrics()
|
||||
await self._call_event_handler("on_update", transcript)
|
||||
|
||||
@@ -236,7 +236,7 @@ class DeepgramSTTService(STTService):
|
||||
logger.error(f"{self}: unable to connect to Deepgram")
|
||||
|
||||
async def _disconnect(self):
|
||||
if self._connection.is_connected:
|
||||
if await self._connection.is_connected():
|
||||
logger.debug("Disconnecting from Deepgram")
|
||||
# Deepgram swallows asyncio.CancelledError internally which prevents
|
||||
# proper cancellation propagation. This issue was found with
|
||||
|
||||
@@ -12,6 +12,7 @@ for generating speech from text using various voice models.
|
||||
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -117,3 +118,114 @@ class DeepgramTTSService(TTSService):
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
yield ErrorFrame(f"Error getting audio: {str(e)}")
|
||||
|
||||
|
||||
class DeepgramHttpTTSService(TTSService):
    """Text-to-speech service that talks to Deepgram over plain HTTP.

    Every synthesis request is a single POST to the ``/v1/speak`` endpoint,
    sent through a caller-supplied aiohttp session. The response body is
    streamed back as raw audio frames. Voice model, audio encoding and sample
    rate are configurable.
    """

    def __init__(
        self,
        *,
        api_key: str,
        voice: str = "aura-2-helena-en",
        aiohttp_session: aiohttp.ClientSession,
        base_url: str = "https://api.deepgram.com",
        sample_rate: Optional[int] = None,
        encoding: str = "linear16",
        **kwargs,
    ):
        """Set up the Deepgram HTTP TTS service.

        Args:
            api_key: Deepgram API key, sent in the ``Authorization`` header.
            voice: Voice model used for synthesis. Defaults to "aura-2-helena-en".
            aiohttp_session: aiohttp session used to issue the HTTP requests.
            base_url: Root URL of the Deepgram API. Defaults to
                "https://api.deepgram.com".
            sample_rate: Audio sample rate in Hz, forwarded to the parent
                class. ``None`` leaves the choice to the parent.
            encoding: Audio encoding requested from Deepgram. Defaults to
                "linear16".
            **kwargs: Extra arguments forwarded to the parent TTSService.
        """
        super().__init__(sample_rate=sample_rate, **kwargs)

        self._api_key = api_key
        self._session = aiohttp_session
        self._base_url = base_url
        self._settings = {"encoding": encoding}
        self.set_voice(voice)

    def can_generate_metrics(self) -> bool:
        """Report whether this service produces metrics.

        Returns:
            Always True for the Deepgram HTTP TTS service.
        """
        return True

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Synthesize ``text`` with Deepgram and stream the resulting audio.

        Args:
            text: The text to turn into speech.

        Yields:
            Frame: A TTSStartedFrame, then one TTSAudioRawFrame per non-empty
            chunk of the response body, then a TTSStoppedFrame. If anything
            raises (including a non-200 HTTP status), an ErrorFrame is
            yielded instead of the remaining frames.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        endpoint = f"{self._base_url}/v1/speak"
        request_headers = {
            "Authorization": f"Token {self._api_key}",
            "Content-Type": "application/json",
        }
        # Synthesis options travel in the query string; only the text is sent
        # in the JSON body.
        query = {
            "model": self._voice_id,
            "encoding": self._settings["encoding"],
            "sample_rate": self.sample_rate,
            "container": "none",
        }
        body = {"text": text}

        try:
            await self.start_ttfb_metrics()

            async with self._session.post(
                endpoint, headers=request_headers, json=body, params=query
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"HTTP {response.status}: {error_text}")

                await self.start_tts_usage_metrics(text)
                yield TTSStartedFrame()

                # TTFB is measured up to the arrival of the first chunk, even
                # if that chunk turns out to be empty.
                awaiting_first_chunk = True
                async for chunk in response.content.iter_chunked(self.chunk_size):
                    if awaiting_first_chunk:
                        await self.stop_ttfb_metrics()
                        awaiting_first_chunk = False

                    if not chunk:
                        continue

                    yield TTSAudioRawFrame(
                        audio=chunk,
                        sample_rate=self.sample_rate,
                        num_channels=1,
                    )

                yield TTSStoppedFrame()

        except Exception as e:
            logger.exception(f"{self} exception: {e}")
            yield ErrorFrame(f"Error getting audio: {str(e)}")
|
||||
|
||||
@@ -20,7 +20,7 @@ from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.services.stt_service import SegmentedSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
@@ -37,7 +37,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding ElevenLabs language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.AF: "afr", # Afrikaans
|
||||
Language.AM: "amh", # Amharic
|
||||
Language.AR: "ara", # Arabic
|
||||
@@ -139,15 +139,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
||||
Language.ZU: "zul", # Zulu
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
class ElevenLabsSTTService(SegmentedSTTService):
|
||||
|
||||
@@ -37,7 +37,7 @@ from pipecat.services.tts_service import (
|
||||
AudioContextWordTTSService,
|
||||
WordTTSService,
|
||||
)
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
# See .env.example for ElevenLabs configuration needed
|
||||
@@ -72,7 +72,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding ElevenLabs language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.AR: "ar",
|
||||
Language.BG: "bg",
|
||||
Language.CS: "cs",
|
||||
@@ -107,17 +107,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
# Convert enum value to string and get the base language part (e.g. es-ES -> es)
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
# Look up the base code in our supported languages
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
def output_format_from_sample_rate(sample_rate: int) -> str:
|
||||
@@ -167,6 +157,18 @@ def build_elevenlabs_voice_settings(
|
||||
return voice_settings or None
|
||||
|
||||
|
||||
class PronunciationDictionaryLocator(BaseModel):
    """Reference to one specific version of a pronunciation dictionary.

    Instances are serialized with ``model_dump()`` and sent to the TTS API
    under the ``pronunciation_dictionary_locators`` key.

    Attributes:
        pronunciation_dictionary_id: Identifier of the pronunciation dictionary.
        version_id: Identifier of the dictionary version to apply.
    """

    pronunciation_dictionary_id: str
    version_id: str
|
||||
|
||||
|
||||
def calculate_word_times(
|
||||
alignment_info: Mapping[str, Any],
|
||||
cumulative_time: float,
|
||||
@@ -249,6 +251,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
enable_ssml_parsing: Whether to parse SSML tags in text.
|
||||
enable_logging: Whether to enable ElevenLabs logging.
|
||||
apply_text_normalization: Text normalization mode ("auto", "on", "off").
|
||||
pronunciation_dictionary_locators: List of pronunciation dictionary locators to use.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = None
|
||||
@@ -261,6 +264,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
enable_ssml_parsing: Optional[bool] = None
|
||||
enable_logging: Optional[bool] = None
|
||||
apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
|
||||
pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -331,6 +335,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
self.set_voice(voice_id)
|
||||
self._output_format = "" # initialized in start()
|
||||
self._voice_settings = self._set_voice_settings()
|
||||
self._pronunciation_dictionary_locators = params.pronunciation_dictionary_locators
|
||||
|
||||
# Indicates if we have sent TTSStartedFrame. It will reset to False when
|
||||
# there's an interruption or TTSStoppedFrame.
|
||||
@@ -714,12 +719,17 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
||||
if not self.audio_context_available(self._context_id):
|
||||
await self.create_audio_context(self._context_id)
|
||||
|
||||
# Initialize context with voice settings
|
||||
# Initialize context with voice settings and pronunciation dictionaries
|
||||
msg = {"text": " ", "context_id": self._context_id}
|
||||
if self._voice_settings:
|
||||
msg["voice_settings"] = self._voice_settings
|
||||
if self._pronunciation_dictionary_locators:
|
||||
msg["pronunciation_dictionary_locators"] = [
|
||||
locator.model_dump()
|
||||
for locator in self._pronunciation_dictionary_locators
|
||||
]
|
||||
await self._websocket.send(json.dumps(msg))
|
||||
logger.trace(f"Created new context {self._context_id} with voice settings")
|
||||
logger.trace(f"Created new context {self._context_id}")
|
||||
|
||||
await self._send_text(text)
|
||||
await self.start_tts_usage_metrics(text)
|
||||
@@ -755,6 +765,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
use_speaker_boost: Whether to use speaker boost enhancement.
|
||||
speed: Voice speed control (0.25 to 4.0).
|
||||
apply_text_normalization: Text normalization mode ("auto", "on", "off").
|
||||
pronunciation_dictionary_locators: List of pronunciation dictionary locators to use.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = None
|
||||
@@ -765,6 +776,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
use_speaker_boost: Optional[bool] = None
|
||||
speed: Optional[float] = None
|
||||
apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
|
||||
pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -823,6 +835,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
self.set_voice(voice_id)
|
||||
self._output_format = "" # initialized in start()
|
||||
self._voice_settings = self._set_voice_settings()
|
||||
self._pronunciation_dictionary_locators = params.pronunciation_dictionary_locators
|
||||
|
||||
# Track cumulative time to properly sequence word timestamps across utterances
|
||||
self._cumulative_time = 0
|
||||
@@ -987,6 +1000,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
if self._voice_settings:
|
||||
payload["voice_settings"] = self._voice_settings
|
||||
|
||||
if self._pronunciation_dictionary_locators:
|
||||
payload["pronunciation_dictionary_locators"] = [
|
||||
locator.model_dump() for locator in self._pronunciation_dictionary_locators
|
||||
]
|
||||
|
||||
if self._settings["apply_text_normalization"] is not None:
|
||||
payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.services.stt_service import SegmentedSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
@@ -41,7 +41,7 @@ def language_to_fal_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding Fal Wizper language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.AF: "af",
|
||||
Language.AM: "am",
|
||||
Language.AR: "ar",
|
||||
@@ -142,15 +142,7 @@ def language_to_fal_language(language: Language) -> Optional[str]:
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
class FalSTTService(SegmentedSTTService):
|
||||
|
||||
@@ -31,7 +31,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.services.gladia.config import GladiaInputParams
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
@@ -54,7 +54,7 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The Gladia language code string or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.AF: "af",
|
||||
Language.AM: "am",
|
||||
Language.AR: "ar",
|
||||
@@ -156,17 +156,7 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
# Convert enum value to string and get the base language part (e.g. es-ES -> es)
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
# Look up the base code in our supported languages
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
# Deprecation warning for nested InputParams
|
||||
|
||||
@@ -13,10 +13,9 @@ voice transcription, streaming responses, and tool usage.
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import uuid
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
@@ -56,10 +55,12 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
@@ -71,7 +72,7 @@ from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.string import match_endofsentence
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt
|
||||
@@ -130,7 +131,7 @@ def language_to_gemini_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The Gemini language code string, or None if the language is not supported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Arabic
|
||||
Language.AR: "ar-XA",
|
||||
# Bengali
|
||||
@@ -211,7 +212,8 @@ def language_to_gemini_language(language: Language) -> Optional[str]:
|
||||
Language.VI: "vi-VN",
|
||||
Language.VI_VN: "vi-VN",
|
||||
}
|
||||
return language_map.get(language)
|
||||
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
class GeminiLiveContext(OpenAILLMContext):
|
||||
@@ -219,6 +221,10 @@ class GeminiLiveContext(OpenAILLMContext):
|
||||
|
||||
Provides Gemini-specific context management including system instruction
|
||||
extraction and message format conversion for the Live API.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer uses `GeminiLiveContext` under the hood.
|
||||
It now uses `LLMContext`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@@ -231,6 +237,22 @@ class GeminiLiveContext(OpenAILLMContext):
|
||||
Returns:
|
||||
The upgraded Gemini context instance.
|
||||
"""
|
||||
# This warning is here rather than `__init__` since `upgrade()` was the
|
||||
# "main" way that GeminiLiveContext instances were created.
|
||||
# Almost no users should be seeing this message anyway, as
|
||||
# GeminiLiveContext instances were typically created under the hood:
|
||||
# the user would pass an OpenAILLMContext instance, which would be
|
||||
# upgraded without them necessarily knowing.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"GeminiLiveContext is deprecated. "
|
||||
"Gemini Live no longer uses GeminiLiveContext under the hood. "
|
||||
"It now uses LLMContext.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, GeminiLiveContext):
|
||||
logger.debug(f"Upgrading to Gemini Live Context: {obj}")
|
||||
obj.__class__ = GeminiLiveContext
|
||||
@@ -328,8 +350,28 @@ class GeminiLiveUserContextAggregator(OpenAIUserContextAggregator):
|
||||
|
||||
Extends OpenAI user aggregator to handle Gemini-specific message passing
|
||||
while maintaining compatibility with the standard aggregation pipeline.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer expects a `GeminiLiveUserContextAggregator`.
|
||||
It now expects a `LLMUserAggregator`.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialize the deprecated Gemini Live user context aggregator.

    Emits a ``DeprecationWarning`` and then forwards every argument unchanged
    to the parent initializer.
    """
    # Few users should ever see this warning: instances of
    # `GeminiLiveUserContextAggregator` were normally built internally by
    # `llm.create_context_aggregator()` rather than by user code.
    deprecation_message = (
        "GeminiLiveUserContextAggregator is deprecated. "
        "Gemini Live no longer expects a GeminiLiveUserContextAggregator. "
        "It now expects a LLMUserAggregator."
    )
    with warnings.catch_warnings():
        # Temporarily override the active filters so the warning is always
        # reported; they are restored when the block exits.
        warnings.simplefilter("always")
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    super().__init__(*args, **kwargs)
|
||||
|
||||
async def process_frame(self, frame, direction):
|
||||
"""Process incoming frames for user context aggregation.
|
||||
|
||||
@@ -349,8 +391,28 @@ class GeminiLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
Handles assistant response aggregation while filtering out LLMTextFrames
|
||||
to prevent duplicate context entries, as Gemini Live pushes both
|
||||
LLMTextFrames and TTSTextFrames.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
Gemini Live no longer uses `GeminiLiveAssistantContextAggregator` under the hood.
|
||||
It now uses `LLMAssistantAggregator`.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialize the deprecated Gemini Live assistant context aggregator.

    Emits a ``DeprecationWarning`` and then forwards every argument unchanged
    to the parent initializer.
    """
    # Few users should ever see this warning: instances of
    # `GeminiLiveAssistantContextAggregator` were normally built internally
    # by `llm.create_context_aggregator()` rather than by user code.
    deprecation_message = (
        "GeminiLiveAssistantContextAggregator is deprecated. "
        "Gemini Live no longer uses GeminiLiveAssistantContextAggregator under the hood. "
        "It now uses LLMAssistantAggregator."
    )
    with warnings.catch_warnings():
        # Temporarily override the active filters so the warning is always
        # reported; they are restored when the block exits.
        warnings.simplefilter("always")
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)

    super().__init__(*args, **kwargs)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames for assistant context aggregation.
|
||||
|
||||
@@ -380,6 +442,10 @@ class GeminiLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
||||
class GeminiLiveContextAggregatorPair:
|
||||
"""Pair of user and assistant context aggregators for Gemini Live.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
`GeminiLiveContextAggregatorPair` is deprecated.
|
||||
Use `LLMContextAggregatorPair` instead.
|
||||
|
||||
Parameters:
|
||||
_user: The user context aggregator instance.
|
||||
_assistant: The assistant context aggregator instance.
|
||||
@@ -388,6 +454,19 @@ class GeminiLiveContextAggregatorPair:
|
||||
_user: GeminiLiveUserContextAggregator
|
||||
_assistant: GeminiLiveAssistantContextAggregator
|
||||
|
||||
def __post_init__(self):
    """Emit a deprecation warning each time a pair is constructed."""
    # Almost no users should be seeing this message, as
    # `GeminiLiveContextAggregatorPair` instances were typically created
    # under the hood, with `llm.create_context_aggregator()`.
    with warnings.catch_warnings():
        # Temporarily override the active filters so the warning is always
        # reported; they are restored when the block exits.
        warnings.simplefilter("always")
        warnings.warn(
            "GeminiLiveContextAggregatorPair is deprecated. "
            "Use LLMContextAggregatorPair instead.",
            DeprecationWarning,
            # Frame 1 is this method and frame 2 is the `__init__` that calls
            # it, so level 3 points the warning at the code that actually
            # constructs the pair.
            # NOTE(review): assumes this class is a dataclass, i.e. that
            # `__post_init__` is invoked from the generated `__init__` -
            # confirm the `@dataclass` decorator on the class.
            stacklevel=3,
        )
|
||||
|
||||
def user(self) -> GeminiLiveUserContextAggregator:
|
||||
"""Get the user context aggregator.
|
||||
|
||||
@@ -592,8 +671,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._voice_id = voice_id
|
||||
self._language_code = params.language
|
||||
|
||||
self._system_instruction = system_instruction
|
||||
self._tools = tools
|
||||
self._system_instruction_from_init = system_instruction
|
||||
self._tools_from_init = tools
|
||||
self._inference_on_context_initialization = inference_on_context_initialization
|
||||
self._needs_turn_complete_message = False
|
||||
|
||||
@@ -609,7 +688,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._run_llm_when_session_ready = False
|
||||
|
||||
self._user_is_speaking = False
|
||||
self._bot_is_speaking = False
|
||||
self._bot_is_responding = False
|
||||
self._user_audio_buffer = bytearray()
|
||||
self._user_transcription_buffer = ""
|
||||
self._last_transcription_sent = ""
|
||||
@@ -665,6 +744,9 @@ class GeminiLiveLLMService(LLMService):
|
||||
# Initialize the API client. Subclasses can override this if needed.
|
||||
self.create_client()
|
||||
|
||||
# Bookkeeping for tool calls
|
||||
self._completed_tool_calls = set()
|
||||
|
||||
def create_client(self):
|
||||
"""Create the Gemini API client instance. Subclasses can override this."""
|
||||
self._client = Client(api_key=self._api_key, http_options=self._http_options)
|
||||
@@ -688,17 +770,6 @@ class GeminiLiveLLMService(LLMService):
|
||||
"""
|
||||
return True
|
||||
|
||||
def needs_mcp_alternate_schema(self) -> bool:
    """Tell callers whether MCP tool schemas must use the alternate form.

    Google/Gemini validates JSON schemas more strictly, so some properties
    have to be dropped or adjusted before a schema is accepted.

    Returns:
        Always True for Google/Gemini services.
    """
    return True
|
||||
|
||||
def set_audio_input_paused(self, paused: bool):
|
||||
"""Set the audio input pause state.
|
||||
|
||||
@@ -787,9 +858,13 @@ class GeminiLiveLLMService(LLMService):
|
||||
#
|
||||
|
||||
async def _handle_interruption(self):
|
||||
await self._set_bot_is_speaking(False)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
if self._bot_is_responding:
|
||||
await self._set_bot_is_responding(False)
|
||||
if self._settings.get("modalities") == GeminiModalities.AUDIO:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
# Do not send LLMFullResponseEndFrame here - an interruption
|
||||
# already tells the assistant context aggregator that the response
|
||||
# is over.
|
||||
|
||||
async def _handle_user_started_speaking(self, frame):
|
||||
self._user_is_speaking = True
|
||||
@@ -807,7 +882,6 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
#
|
||||
# frame processing
|
||||
#
|
||||
# StartFrame, StopFrame, CancelFrame implemented in base class
|
||||
#
|
||||
|
||||
@@ -820,7 +894,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
"""
|
||||
# Defer EndFrame handling until after the bot turn is finished
|
||||
if isinstance(frame, EndFrame):
|
||||
if self._bot_is_speaking:
|
||||
if self._bot_is_responding:
|
||||
logger.debug("Deferring handling EndFrame until bot turn is finished")
|
||||
self._end_frame_pending_bot_turn_finished = frame
|
||||
return
|
||||
@@ -829,22 +903,13 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, OpenAILLMContextFrame):
|
||||
context: GeminiLiveContext = GeminiLiveContext.upgrade(frame.context)
|
||||
# For now, we'll only trigger inference here when either:
|
||||
# 1. We have not seen a context frame before
|
||||
# 2. The last message is a tool call result
|
||||
if not self._context:
|
||||
self._context = context
|
||||
if frame.context.tools:
|
||||
self._tools = frame.context.tools
|
||||
await self._create_initial_response()
|
||||
elif context.messages and context.messages[-1].get("role") == "tool":
|
||||
# Support just one tool call per context frame for now
|
||||
tool_result_message = context.messages[-1]
|
||||
await self._tool_result(tool_result_message)
|
||||
elif isinstance(frame, LLMContextFrame):
|
||||
raise NotImplementedError("Universal LLMContext is not yet supported for Gemini Live.")
|
||||
elif isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
|
||||
context = (
|
||||
frame.context
|
||||
if isinstance(frame, LLMContextFrame)
|
||||
else LLMContext.from_openai_context(frame.context)
|
||||
)
|
||||
await self._handle_context(context)
|
||||
elif isinstance(frame, InputTextRawFrame):
|
||||
await self._send_user_text(frame.text)
|
||||
await self.push_frame(frame, direction)
|
||||
@@ -883,13 +948,106 @@ class GeminiLiveLLMService(LLMService):
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _set_bot_is_speaking(self, speaking: bool):
|
||||
if self._bot_is_speaking == speaking:
|
||||
async def _handle_context(self, context: LLMContext):
    """Store an incoming context and react to it.

    The first context received may trigger a reconnect (to apply a
    context-provided system instruction or tools) and the creation of the
    initial response. Every later context is only scanned for newly
    completed function calls, whose results are sent to the service.

    Args:
        context: The context carried by the incoming context frame.
    """
    is_initial_context = not self._context
    self._context = context

    if not is_initial_context:
        # Updated context. It is assumed to contain either new messages (that
        # the Gemini Live service, which manages its own context, already
        # knows about) or tool call results (that the remote service still
        # has to be told about). More sophisticated diffing could later let
        # users manipulate the context programmatically.
        await self._process_completed_function_calls(send_new_results=True)
        return

    # Initial context. If it carries a system instruction or tools, reconnect
    # so they get applied. Context-provided values take precedence over the
    # ones given at initialization time; no finer comparison is attempted, on
    # the assumption that these settings are not meant to be supplied in both
    # places. Swapping them at arbitrary points could be added in the future.
    adapter: GeminiLLMAdapter = self.get_llm_adapter()
    invocation_params = adapter.get_llm_invocation_params(self._context)
    context_system_instruction = invocation_params["system_instruction"]
    context_tools = invocation_params["tools"]

    if context_system_instruction and self._system_instruction_from_init:
        logger.warning(
            "System instruction provided both at init time and in context; using context-provided value."
        )
    if context_tools and self._tools_from_init:
        logger.warning(
            "Tools provided both at init time and in context; using context-provided value."
        )
    if context_system_instruction or context_tools:
        await self._reconnect()

    # Record the tool calls that are already complete in the context without
    # re-sending their results.
    await self._process_completed_function_calls(send_new_results=False)

    # Create the initial response from the conversation history, if needed.
    # When the context has no messages but a system instruction was given at
    # init time, add it to the context so there is something to send to the
    # service to trigger a response.
    initial_messages = invocation_params["messages"]
    if not initial_messages and self._inference_on_context_initialization:
        if self._system_instruction_from_init:
            logger.debug(
                "No messages found in initial context; seeding with system instruction to trigger bot response."
            )
            self._context.add_message(
                {"role": "system", "content": self._system_instruction_from_init}
            )
        else:
            logger.warning(
                "No messages found in initial context; cannot trigger initial bot response without messages or system instruction."
            )

    await self._create_initial_response()
|
||||
|
||||
async def _process_completed_function_calls(self, send_new_results: bool):
    """Record completed tool calls found in the context, optionally sending them.

    Scans the function responses in the messages the adapter produces for
    the current context. A response counts as newly completed when it has
    an id that is not yet in ``self._completed_tool_calls``, carries a
    non-empty payload, and that payload's ``"value"`` is not
    ``"IN_PROGRESS"``.

    Args:
        send_new_results: When True, each newly-completed call's result is
            passed to ``self._tool_result`` before the call is recorded.
            When False, the calls are only recorded.
    """
    llm_adapter: GeminiLLMAdapter = self.get_llm_adapter()
    invocation_params = llm_adapter.get_llm_invocation_params(self._context)

    for msg in invocation_params.get("messages", []):
        for part in msg.parts or ():
            fn_response = part.function_response
            if not fn_response:
                continue

            call_id = fn_response.id
            call_name = fn_response.name
            payload = fn_response.response

            # Skip calls without an id and calls we have already handled.
            if not call_id or call_id in self._completed_tool_calls:
                continue
            # Skip empty payloads and placeholders for still-running calls.
            if not payload or payload.get("value") == "IN_PROGRESS":
                continue

            # Newly-completed function call: send the result to the service
            # if requested, and remember it either way.
            if send_new_results:
                await self._tool_result(call_id, call_name, payload)
            self._completed_tool_calls.add(call_id)
|
||||
|
||||
async def _set_bot_is_responding(self, responding: bool):
|
||||
if self._bot_is_responding == responding:
|
||||
return
|
||||
|
||||
self._bot_is_speaking = speaking
|
||||
self._bot_is_responding = responding
|
||||
|
||||
if not self._bot_is_speaking and self._end_frame_pending_bot_turn_finished:
|
||||
if not self._bot_is_responding and self._end_frame_pending_bot_turn_finished:
|
||||
await self.queue_frame(self._end_frame_pending_bot_turn_finished)
|
||||
self._end_frame_pending_bot_turn_finished = None
|
||||
|
||||
@@ -991,18 +1149,26 @@ class GeminiLiveLLMService(LLMService):
|
||||
automatic_activity_detection=vad_config
|
||||
)
|
||||
|
||||
# Add system instruction to configuration, if provided
|
||||
system_instruction = self._system_instruction or ""
|
||||
if self._context and hasattr(self._context, "extract_system_instructions"):
|
||||
system_instruction += "\n" + self._context.extract_system_instructions()
|
||||
# Add system instruction and tools to configuration, if provided.
|
||||
# These settings from the context take precedence over the ones
|
||||
# provided at initialization time.
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
system_instruction = None
|
||||
tools = None
|
||||
if self._context:
|
||||
params = adapter.get_llm_invocation_params(self._context)
|
||||
system_instruction = params["system_instruction"]
|
||||
tools = params["tools"]
|
||||
if not system_instruction:
|
||||
system_instruction = self._system_instruction_from_init
|
||||
if not tools:
|
||||
tools = adapter.from_standard_tools(self._tools_from_init)
|
||||
if system_instruction:
|
||||
logger.debug(f"Setting system instruction: {system_instruction}")
|
||||
config.system_instruction = system_instruction
|
||||
|
||||
# Add tools to configuration, if provided
|
||||
if self._tools:
|
||||
logger.debug(f"Setting tools: {self._tools}")
|
||||
config.tools = self.get_llm_adapter().from_standard_tools(self._tools)
|
||||
if tools:
|
||||
logger.debug(f"Setting tools: {tools}")
|
||||
config.tools = tools
|
||||
|
||||
# Start the connection
|
||||
self._connection_task = self.create_task(self._connection_task_handler(config=config))
|
||||
@@ -1116,6 +1282,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
if self._session:
|
||||
await self._session.close()
|
||||
self._session = None
|
||||
self._completed_tool_calls = set()
|
||||
self._disconnecting = False
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error disconnecting: {e}")
|
||||
@@ -1195,7 +1362,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._run_llm_when_session_ready = True
|
||||
return
|
||||
|
||||
messages = self._context.get_messages_for_initializing_history()
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
messages = adapter.get_llm_invocation_params(self._context).get("messages", [])
|
||||
if not messages:
|
||||
return
|
||||
|
||||
@@ -1223,8 +1391,9 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
# Create a throwaway context just for the purpose of getting messages
|
||||
# in the right format
|
||||
context = GeminiLiveContext.upgrade(OpenAILLMContext(messages=messages_list))
|
||||
messages = context.get_messages_for_initializing_history()
|
||||
context = LLMContext(messages=messages_list)
|
||||
adapter: GeminiLLMAdapter = self.get_llm_adapter()
|
||||
messages = adapter.get_llm_invocation_params(context).get("messages", [])
|
||||
|
||||
if not messages:
|
||||
return
|
||||
@@ -1239,17 +1408,16 @@ class GeminiLiveLLMService(LLMService):
|
||||
await self._handle_send_error(e)
|
||||
|
||||
@traced_gemini_live(operation="llm_tool_result")
|
||||
async def _tool_result(self, tool_result_message):
|
||||
async def _tool_result(
|
||||
self, tool_call_id: str, tool_name: str, tool_result_message: Dict[str, Any]
|
||||
):
|
||||
"""Send tool result back to the API."""
|
||||
if self._disconnecting or not self._session:
|
||||
return
|
||||
|
||||
# For now we're shoving the name into the tool_call_id field, so this
|
||||
# will work until we revisit that.
|
||||
id = tool_result_message.get("tool_call_id")
|
||||
name = tool_result_message.get("tool_call_name")
|
||||
result = json.loads(tool_result_message.get("content") or "")
|
||||
response = FunctionResponse(name=name, id=id, response=result)
|
||||
response = FunctionResponse(name=tool_name, id=tool_call_id, response=tool_result_message)
|
||||
|
||||
try:
|
||||
await self._session.send_tool_response(function_responses=response)
|
||||
@@ -1277,7 +1445,10 @@ class GeminiLiveLLMService(LLMService):
|
||||
# part.text is added when `modalities` is set to TEXT; otherwise, it's None
|
||||
text = part.text
|
||||
if text:
|
||||
if not self._bot_text_buffer:
|
||||
if not self._bot_is_responding:
|
||||
# Update bot responding state and send service start frame
|
||||
# (TEXT modality case)
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
self._bot_text_buffer += text
|
||||
@@ -1288,6 +1459,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
if msg.server_content and msg.server_content.grounding_metadata:
|
||||
self._accumulated_grounding_metadata = msg.server_content.grounding_metadata
|
||||
|
||||
# If we have no audio, stop here.
|
||||
# All logic below this point pertains to the AUDIO modality.
|
||||
inline_data = part.inline_data
|
||||
if not inline_data:
|
||||
return
|
||||
@@ -1313,8 +1486,10 @@ class GeminiLiveLLMService(LLMService):
|
||||
if not audio:
|
||||
return
|
||||
|
||||
if not self._bot_is_speaking:
|
||||
await self._set_bot_is_speaking(True)
|
||||
# Update bot responding state and send service start frames
|
||||
# (AUDIO modality case)
|
||||
if not self._bot_is_responding:
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
@@ -1354,7 +1529,6 @@ class GeminiLiveLLMService(LLMService):
|
||||
@traced_gemini_live(operation="llm_response")
|
||||
async def _handle_msg_turn_complete(self, message: LiveServerMessage):
|
||||
"""Handle the turn complete message."""
|
||||
await self._set_bot_is_speaking(False)
|
||||
text = self._bot_text_buffer
|
||||
|
||||
# Trace the complete LLM response (this will be handled by the decorator)
|
||||
@@ -1373,13 +1547,15 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._search_result_buffer = ""
|
||||
self._accumulated_grounding_metadata = None
|
||||
|
||||
# Only push the TTSStoppedFrame if the bot is outputting audio
|
||||
# when text is found, modalities is set to TEXT and no audio
|
||||
# is produced.
|
||||
if not text:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
if self._bot_is_responding:
|
||||
await self._set_bot_is_responding(False)
|
||||
if not text:
|
||||
# AUDIO modality case
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
else:
|
||||
# TEXT modality case
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
@traced_stt
|
||||
async def _handle_user_transcription(
|
||||
@@ -1442,8 +1618,8 @@ class GeminiLiveLLMService(LLMService):
|
||||
return
|
||||
|
||||
# This is the output transcription text when modalities is set to AUDIO.
|
||||
# In this case, we push LLMTextFrame and TTSTextFrame to be handled by the
|
||||
# downstream assistant context aggregator.
|
||||
# In this case, we push TTSTextFrame to be handled by the downstream
|
||||
# assistant context aggregator.
|
||||
text = message.server_content.output_transcription.text
|
||||
|
||||
if not text:
|
||||
@@ -1458,7 +1634,17 @@ class GeminiLiveLLMService(LLMService):
|
||||
# Collect text for tracing
|
||||
self._llm_output_buffer += text
|
||||
|
||||
await self.push_frame(LLMTextFrame(text=text))
|
||||
# NOTE: Shoot. When using Vertex AI, output transcription messages
|
||||
# arrive *before* the model_turn messages with audio, so we need to
|
||||
# handle sending TTSStartedFrame and LLMFullResponseStartFrame here as
|
||||
# well. These messages also contain much *more* text (it looks further
|
||||
# ahead). That means that on an interruption our recorded context will
|
||||
# contain some text that was actually never spoken.
|
||||
if not self._bot_is_responding:
|
||||
await self._set_bot_is_responding(True)
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
await self.push_frame(TTSTextFrame(text=text))
|
||||
|
||||
async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
|
||||
@@ -1543,13 +1729,17 @@ class GeminiLiveLLMService(LLMService):
|
||||
self._session_resumption_handle = update.new_handle
|
||||
|
||||
async def _handle_send_error(self, error: Exception):
|
||||
# Ignore "expected" errors that may have occurred for messages that
|
||||
# were in-flight when a disconnection occurred.
|
||||
if self._disconnecting or not self._session:
|
||||
return
|
||||
|
||||
# In server-to-server contexts, a WebSocket error should be quite rare.
|
||||
# Given how hard it is to recover from a send-side error with proper
|
||||
# state management, and that exponential backoff for retries can have
|
||||
# cost/stability implications for a service cluster, let's just treat a
|
||||
# send-side error as fatal.
|
||||
if not self._disconnecting:
|
||||
await self.push_error(ErrorFrame(error=f"{self} Send error: {error}", fatal=True))
|
||||
await self.push_error(ErrorFrame(error=f"{self} Send error: {error}", fatal=True))
|
||||
|
||||
def create_context_aggregator(
|
||||
self,
|
||||
@@ -1557,26 +1747,27 @@ class GeminiLiveLLMService(LLMService):
|
||||
*,
|
||||
user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
|
||||
assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
|
||||
) -> GeminiLiveContextAggregatorPair:
|
||||
) -> LLMContextAggregatorPair:
|
||||
"""Create an instance of GeminiLiveContextAggregatorPair from an OpenAILLMContext.
|
||||
|
||||
Constructor keyword arguments for both the user and assistant aggregators can be provided.
|
||||
|
||||
NOTE: this method exists only for backward compatibility. New code
|
||||
should instead do::
|
||||
|
||||
context = LLMContext(...)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
Args:
|
||||
context: The LLM context to use.
|
||||
user_params: User aggregator parameters. Defaults to LLMUserAggregatorParams().
|
||||
assistant_params: Assistant aggregator parameters. Defaults to LLMAssistantAggregatorParams().
|
||||
|
||||
Returns:
|
||||
GeminiLiveContextAggregatorPair: A pair of context
|
||||
aggregators, one for the user and one for the assistant,
|
||||
encapsulated in an GeminiLiveContextAggregatorPair.
|
||||
A pair of user and assistant context aggregators.
|
||||
"""
|
||||
context.set_llm_adapter(self.get_llm_adapter())
|
||||
|
||||
GeminiLiveContext.upgrade(context)
|
||||
user = GeminiLiveUserContextAggregator(context, params=user_params)
|
||||
|
||||
context = LLMContext.from_openai_context(context)
|
||||
assistant_params.expect_stripped_words = False
|
||||
assistant = GeminiLiveAssistantContextAggregator(context, params=assistant_params)
|
||||
return GeminiLiveContextAggregatorPair(_user=user, _assistant=assistant)
|
||||
return LLMContextAggregatorPair(
|
||||
context, user_params=user_params, assistant_params=assistant_params
|
||||
)
|
||||
|
||||
@@ -715,7 +715,6 @@ class GoogleLLMService(LLMService):
|
||||
self._system_instruction = system_instruction
|
||||
self._http_options = http_options
|
||||
|
||||
self._create_client(api_key, http_options)
|
||||
self._settings = {
|
||||
"max_tokens": params.max_tokens,
|
||||
"temperature": params.temperature,
|
||||
@@ -726,6 +725,9 @@ class GoogleLLMService(LLMService):
|
||||
self._tools = tools
|
||||
self._tool_config = tool_config
|
||||
|
||||
# Initialize the API client. Subclasses can override this if needed.
|
||||
self.create_client()
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if the service can generate usage metrics.
|
||||
|
||||
@@ -734,8 +736,9 @@ class GoogleLLMService(LLMService):
|
||||
"""
|
||||
return True
|
||||
|
||||
def _create_client(self, api_key: str, http_options: Optional[HttpOptions] = None):
|
||||
self._client = genai.Client(api_key=api_key, http_options=http_options)
|
||||
def create_client(self):
    """Build the Gemini API client and store it on ``self._client``.

    The client is constructed from ``self._api_key`` and
    ``self._http_options``. Subclasses can override this method to create
    the client differently.
    """
    client = genai.Client(
        api_key=self._api_key,
        http_options=self._http_options,
    )
    self._client = client
|
||||
|
||||
async def run_inference(self, context: LLMContext | OpenAILLMContext) -> Optional[str]:
|
||||
"""Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
|
||||
@@ -775,17 +778,6 @@ class GoogleLLMService(LLMService):
|
||||
|
||||
return None
|
||||
|
||||
def needs_mcp_alternate_schema(self) -> bool:
|
||||
"""Check if this LLM service requires alternate MCP schema.
|
||||
|
||||
Google/Gemini has stricter JSON schema validation and requires
|
||||
certain properties to be removed or modified for compatibility.
|
||||
|
||||
Returns:
|
||||
True for Google/Gemini services.
|
||||
"""
|
||||
return True
|
||||
|
||||
def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]):
|
||||
try:
|
||||
# There's no way to introspect on model capabilities, so
|
||||
@@ -907,12 +899,18 @@ class GoogleLLMService(LLMService):
|
||||
async for chunk in response:
|
||||
# Stop TTFB metrics after the first chunk
|
||||
await self.stop_ttfb_metrics()
|
||||
# Gemini may send usage_metadata in multiple chunks with varying behavior:
|
||||
# - Sometimes a single chunk, sometimes multiple chunks
|
||||
# - Token counts may be cumulative (growing) or may change between chunks
|
||||
# - Early chunks may include estimates/overhead that gets refined
|
||||
# We use assignment (not accumulation) because the final chunk always contains
|
||||
# the authoritative, billable token usage for the entire response.
|
||||
if chunk.usage_metadata:
|
||||
prompt_tokens += chunk.usage_metadata.prompt_token_count or 0
|
||||
completion_tokens += chunk.usage_metadata.candidates_token_count or 0
|
||||
total_tokens += chunk.usage_metadata.total_token_count or 0
|
||||
cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0
|
||||
reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0
|
||||
prompt_tokens = chunk.usage_metadata.prompt_token_count or 0
|
||||
completion_tokens = chunk.usage_metadata.candidates_token_count or 0
|
||||
total_tokens = chunk.usage_metadata.total_token_count or 0
|
||||
cache_read_input_tokens = chunk.usage_metadata.cached_content_token_count or 0
|
||||
reasoning_tokens = chunk.usage_metadata.thoughts_token_count or 0
|
||||
|
||||
if not chunk.candidates:
|
||||
continue
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
|
||||
"""Google Vertex AI LLM service implementation.
|
||||
|
||||
This module provides integration with Google's AI models via Vertex AI while
|
||||
maintaining OpenAI API compatibility through Google's OpenAI-compatible endpoint.
|
||||
This module provides integration with Google's AI models via Vertex AI,
|
||||
extending the GoogleLLMService with Vertex AI authentication.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -20,12 +20,14 @@ from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.auth.transport.requests import Request
|
||||
from google.genai import Client
|
||||
from google.genai.types import HttpOptions
|
||||
from google.oauth2 import service_account
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
@@ -36,19 +38,19 @@ except ModuleNotFoundError as e:
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class GoogleVertexLLMService(OpenAILLMService):
|
||||
"""Google Vertex AI LLM service with OpenAI API compatibility.
|
||||
class GoogleVertexLLMService(GoogleLLMService):
|
||||
"""Google Vertex AI LLM service extending GoogleLLMService.
|
||||
|
||||
Provides access to Google's AI models via Vertex AI while maintaining
|
||||
OpenAI API compatibility. Handles authentication using Google service
|
||||
account credentials and constructs appropriate endpoint URLs for
|
||||
different GCP regions and projects.
|
||||
Provides access to Google's AI models via Vertex AI while using the same
|
||||
Google AI client and message format as GoogleLLMService. Handles authentication
|
||||
using Google service account credentials and configures the client for
|
||||
Vertex AI endpoints.
|
||||
|
||||
Reference:
|
||||
https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/call-vertex-using-openai-library
|
||||
https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
|
||||
"""
|
||||
|
||||
class InputParams(OpenAILLMService.InputParams):
|
||||
class InputParams(GoogleLLMService.InputParams):
|
||||
"""Input parameters specific to Vertex AI.
|
||||
|
||||
Parameters:
|
||||
@@ -97,9 +99,14 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
*,
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
model: str = "google/gemini-2.0-flash-001",
|
||||
model: str = "gemini-2.5-flash",
|
||||
location: Optional[str] = None,
|
||||
project_id: Optional[str] = None,
|
||||
params: Optional[GoogleLLMService.InputParams] = None,
|
||||
system_instruction: Optional[str] = None,
|
||||
tools: Optional[list] = None,
|
||||
tool_config: Optional[dict] = None,
|
||||
http_options: Optional[HttpOptions] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initializes the VertexLLMService.
|
||||
@@ -107,14 +114,29 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
Args:
|
||||
credentials: JSON string of service account credentials.
|
||||
credentials_path: Path to the service account JSON file.
|
||||
model: Model identifier (e.g., "google/gemini-2.0-flash-001").
|
||||
model: Model identifier (e.g., "gemini-2.5-flash").
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
**kwargs: Additional arguments passed to OpenAILLMService.
|
||||
params: Input parameters for the model.
|
||||
system_instruction: System instruction/prompt for the model.
|
||||
tools: List of available tools/functions.
|
||||
tool_config: Configuration for tool usage.
|
||||
http_options: HTTP options for the client.
|
||||
**kwargs: Additional arguments passed to GoogleLLMService.
|
||||
"""
|
||||
# Check if user incorrectly passed api_key, which is used by parent
|
||||
# class but not here.
|
||||
if "api_key" in kwargs:
|
||||
logger.error(
|
||||
"GoogleVertexLLMService does not accept 'api_key' parameter. "
|
||||
"Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
|
||||
)
|
||||
raise ValueError(
|
||||
"Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
|
||||
)
|
||||
|
||||
# Handle deprecated InputParams fields
|
||||
if "params" in kwargs and isinstance(kwargs["params"], GoogleVertexLLMService.InputParams):
|
||||
params = kwargs["params"]
|
||||
if params and isinstance(params, GoogleVertexLLMService.InputParams):
|
||||
# Extract location and project_id from params if not provided
|
||||
# directly, for backward compatibility
|
||||
if project_id is None:
|
||||
@@ -122,13 +144,12 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
if location is None:
|
||||
location = params.location
|
||||
# Convert to base InputParams
|
||||
params = OpenAILLMService.InputParams(
|
||||
params = GoogleLLMService.InputParams(
|
||||
**params.model_dump(exclude={"location", "project_id"}, exclude_unset=True)
|
||||
)
|
||||
kwargs["params"] = params
|
||||
|
||||
# Validate project_id and location parameters
|
||||
# NOTE: once we remove Vertex-spcific InputParams class, we can update
|
||||
# NOTE: once we remove Vertex-specific InputParams class, we can update
|
||||
# __init__() signature as follows:
|
||||
# - location: str = "us-east4",
|
||||
# - project_id: str,
|
||||
@@ -143,29 +164,38 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
logger.warning("location is not provided. Defaulting to 'us-east4'.")
|
||||
location = "us-east4" # Default location if not provided
|
||||
|
||||
base_url = self._get_base_url(location, project_id)
|
||||
self._api_key = self._get_api_token(credentials, credentials_path)
|
||||
# These need to be set before calling super().__init__() because
|
||||
# super().__init__() invokes create_client(), which needs these.
|
||||
self._credentials = self._get_credentials(credentials, credentials_path)
|
||||
self._project_id = project_id
|
||||
self._location = location
|
||||
|
||||
# Call parent constructor with dummy api_key
|
||||
# (api_key is required by parent class, but not actually used with Vertex)
|
||||
super().__init__(
|
||||
api_key=self._api_key,
|
||||
base_url=base_url,
|
||||
api_key="dummy",
|
||||
model=model,
|
||||
params=params,
|
||||
system_instruction=system_instruction,
|
||||
tools=tools,
|
||||
tool_config=tool_config,
|
||||
http_options=http_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_base_url(location: str, project_id: str) -> str:
|
||||
"""Construct the base URL for Vertex AI API."""
|
||||
# Determine the correct API host based on location
|
||||
if location == "global":
|
||||
api_host = "aiplatform.googleapis.com"
|
||||
else:
|
||||
api_host = f"{location}-aiplatform.googleapis.com"
|
||||
return f"https://{api_host}/v1/projects/{project_id}/locations/{location}/endpoints/openapi"
|
||||
def create_client(self):
    """Build the Gemini client in Vertex AI mode and store it on ``self._client``.

    Uses the credentials, project id, location and HTTP options already
    stored on the instance.
    """
    vertex_options = {
        "vertexai": True,
        "credentials": self._credentials,
        "project": self._project_id,
        "location": self._location,
        "http_options": self._http_options,
    }
    self._client = Client(**vertex_options)
|
||||
|
||||
@staticmethod
|
||||
def _get_api_token(credentials: Optional[str], credentials_path: Optional[str]) -> str:
|
||||
"""Retrieve an authentication token using Google service account credentials.
|
||||
def _get_credentials(credentials: Optional[str], credentials_path: Optional[str]):
|
||||
"""Retrieve Credentials using Google service account credentials.
|
||||
|
||||
Supports multiple authentication methods:
|
||||
1. Direct JSON credentials string
|
||||
@@ -177,7 +207,7 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
credentials_path: Path to the service account JSON file.
|
||||
|
||||
Returns:
|
||||
OAuth token for API authentication.
|
||||
Google credentials object for API authentication.
|
||||
|
||||
Raises:
|
||||
ValueError: If no valid credentials are provided or found.
|
||||
@@ -209,4 +239,4 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
|
||||
creds.refresh(Request()) # Ensure token is up-to-date, lifetime is 1 hour.
|
||||
|
||||
return creds.token
|
||||
return creds
|
||||
|
||||
@@ -36,11 +36,12 @@ from pipecat.frames.frames import (
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
try:
|
||||
from google.api_core.client_options import ClientOptions
|
||||
from google.api_core.exceptions import Aborted
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.cloud import speech_v2
|
||||
@@ -64,7 +65,7 @@ def language_to_google_stt_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
Optional[str]: Google STT language code or None if not supported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Afrikaans
|
||||
Language.AF: "af-ZA",
|
||||
Language.AF_ZA: "af-ZA",
|
||||
@@ -351,7 +352,7 @@ def language_to_google_stt_language(language: Language) -> Optional[str]:
|
||||
Language.ZU_ZA: "zu-ZA",
|
||||
}
|
||||
|
||||
return language_map.get(language)
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
class GoogleSTTService(STTService):
|
||||
@@ -886,6 +887,18 @@ class GoogleSTTService(STTService):
|
||||
result=result,
|
||||
)
|
||||
)
|
||||
except Aborted as e:
|
||||
# Handle stream abort due to inactivity (409 error).
|
||||
# This occurs when no audio is sent to the stream for 10+ seconds,
|
||||
# which can happen when InputAudioRawFrames are blocked (e.g., by STTMuteFilter).
|
||||
# Google's STT service automatically closes the stream in this case.
|
||||
# We log at DEBUG level (not ERROR) since this is recoverable, then re-raise
|
||||
# to trigger automatic reconnection in _stream_audio.
|
||||
logger.debug(
|
||||
f"{self} Stream aborted due to inactivity (no audio input). "
|
||||
f"Reconnecting automatically..."
|
||||
)
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Google STT responses: {e}")
|
||||
# Re-raise the exception to let it propagate (e.g. in the case of a
|
||||
|
||||
@@ -22,7 +22,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
# Suppress gRPC fork warnings
|
||||
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
|
||||
|
||||
from typing import AsyncGenerator, List, Literal, Optional
|
||||
from typing import Any, AsyncGenerator, List, Literal, Mapping, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -36,7 +36,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
@@ -70,7 +70,7 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding Google TTS language code, or None if not supported.
|
||||
"""
|
||||
language_map = {
|
||||
LANGUAGE_MAP = {
|
||||
# Afrikaans
|
||||
Language.AF: "af-ZA",
|
||||
Language.AF_ZA: "af-ZA",
|
||||
@@ -226,7 +226,7 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
||||
Language.VI_VN: "vi-VN",
|
||||
}
|
||||
|
||||
return language_map.get(language)
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
|
||||
|
||||
class GoogleHttpTTSService(TTSService):
|
||||
@@ -248,7 +248,8 @@ class GoogleHttpTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
|
||||
rate: Speaking rate adjustment (e.g., "slow", "fast", "125%").
|
||||
rate: Speaking rate adjustment (e.g., "slow", "fast", "125%"). Used for SSML prosody tags (non-Chirp voices).
|
||||
speaking_rate: Speaking rate for AudioConfig (Chirp/Journey voices). Range [0.25, 2.0].
|
||||
volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
|
||||
emphasis: Emphasis level for the text.
|
||||
language: Language for synthesis. Defaults to English.
|
||||
@@ -258,6 +259,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
|
||||
pitch: Optional[str] = None
|
||||
rate: Optional[str] = None
|
||||
speaking_rate: Optional[float] = None
|
||||
volume: Optional[str] = None
|
||||
emphasis: Optional[Literal["strong", "moderate", "reduced", "none"]] = None
|
||||
language: Optional[Language] = Language.EN
|
||||
@@ -291,6 +293,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
self._settings = {
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"speaking_rate": params.speaking_rate,
|
||||
"volume": params.volume,
|
||||
"emphasis": params.emphasis,
|
||||
"language": self.language_to_service_language(params.language)
|
||||
@@ -360,6 +363,22 @@ class GoogleHttpTTSService(TTSService):
|
||||
"""
|
||||
return language_to_google_tts_language(language)
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
    """Apply settings updates, validating ``speaking_rate`` for Chirp/Journey voices.

    ``speaking_rate`` is validated here and stored in ``self._settings``
    only when acceptable; every other key is passed to the base
    implementation.

    Args:
        settings: Mapping of setting names to new values. May include
            ``speaking_rate``: a number in [0.25, 2.0], or None to clear
            the override.
    """
    if "speaking_rate" in settings:
        raw_rate = settings["speaking_rate"]
        if raw_rate is None:
            # None is the parameter's default and is what the AudioConfig
            # construction checks for, so allow it as "no override"
            # instead of crashing in float().
            self._settings["speaking_rate"] = None
        else:
            try:
                rate_value = float(raw_rate)
            except (TypeError, ValueError):
                logger.warning(
                    f"Invalid speaking_rate value: {raw_rate!r}. Must be a number between 0.25 and 2.0"
                )
            else:
                if 0.25 <= rate_value <= 2.0:
                    self._settings["speaking_rate"] = rate_value
                else:
                    logger.warning(
                        f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
                    )

        # speaking_rate has been fully handled above; forward only the other
        # keys so a rejected or unconverted value cannot reach the stored
        # settings through the base class.
        # NOTE(review): assumes the base implementation would otherwise store
        # the raw value under the same key - confirm against TTSService.
        settings = {key: value for key, value in settings.items() if key != "speaking_rate"}

    await super()._update_settings(settings)
|
||||
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
ssml = "<speak>"
|
||||
|
||||
@@ -436,10 +455,17 @@ class GoogleHttpTTSService(TTSService):
|
||||
voice = texttospeech_v1.VoiceSelectionParams(
|
||||
language_code=self._settings["language"], name=self._voice_id
|
||||
)
|
||||
audio_config = texttospeech_v1.AudioConfig(
|
||||
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
|
||||
sample_rate_hertz=self.sample_rate,
|
||||
)
|
||||
# Build audio config with conditional speaking_rate
|
||||
audio_config_params = {
|
||||
"audio_encoding": texttospeech_v1.AudioEncoding.LINEAR16,
|
||||
"sample_rate_hertz": self.sample_rate,
|
||||
}
|
||||
|
||||
# For Chirp and Journey voices, include speaking_rate in AudioConfig
|
||||
if (is_chirp_voice or is_journey_voice) and self._settings["speaking_rate"] is not None:
|
||||
audio_config_params["speaking_rate"] = self._settings["speaking_rate"]
|
||||
|
||||
audio_config = texttospeech_v1.AudioConfig(**audio_config_params)
|
||||
|
||||
request = texttospeech_v1.SynthesizeSpeechRequest(
|
||||
input=synthesis_input, voice=voice, audio_config=audio_config
|
||||
@@ -500,7 +526,7 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
Parameters:
|
||||
language: Language for synthesis. Defaults to English.
|
||||
speaking_rate: The speaking rate, in the range [0.25, 4.0].
|
||||
speaking_rate: The speaking rate, in the range [0.25, 2.0].
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
@@ -591,6 +617,22 @@ class GoogleTTSService(TTSService):
|
||||
"""
|
||||
return language_to_google_tts_language(language)
|
||||
|
||||
async def _update_settings(self, settings: Mapping[str, Any]):
|
||||
"""Override to handle speaking_rate updates for streaming API.
|
||||
|
||||
Args:
|
||||
settings: Dictionary of settings to update. Can include 'speaking_rate' (float)
|
||||
"""
|
||||
if "speaking_rate" in settings:
|
||||
rate_value = float(settings["speaking_rate"])
|
||||
if 0.25 <= rate_value <= 2.0:
|
||||
self._settings["speaking_rate"] = rate_value
|
||||
else:
|
||||
logger.warning(
|
||||
f"Invalid speaking_rate value: {rate_value}. Must be between 0.25 and 2.0"
|
||||
)
|
||||
await super()._update_settings(settings)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate streaming speech from text using Google's streaming API.
|
||||
|
||||
@@ -58,7 +58,8 @@ class GroqSTTService(BaseWhisperSTTService):
|
||||
kwargs = {
|
||||
"file": ("audio.wav", audio, "audio/wav"),
|
||||
"model": self.model_name,
|
||||
"response_format": "json",
|
||||
# Use verbose_json to get probability metrics
|
||||
"response_format": "verbose_json" if self._include_prob_metrics else "json",
|
||||
"language": self._language,
|
||||
}
|
||||
|
||||
|
||||
@@ -184,11 +184,15 @@ class HumeTTSService(TTSService):
|
||||
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
|
||||
# We buffer audio bytes before sending to prevent glitches.
|
||||
self._audio_bytes = b""
|
||||
|
||||
# Use version "2" by default if no description is provided
|
||||
# Version "1" is needed when description is used
|
||||
version = "1" if self._params.description is not None else "2"
|
||||
async for chunk in self._client.tts.synthesize_json_streaming(
|
||||
utterances=[utterance],
|
||||
format=pcm_fmt,
|
||||
instant_mode=True,
|
||||
version="2",
|
||||
version=version,
|
||||
):
|
||||
audio_b64 = getattr(chunk, "audio", None)
|
||||
if not audio_b64:
|
||||
|
||||
@@ -419,17 +419,6 @@ class LLMService(AIService):
|
||||
return True
|
||||
return function_name in self._functions.keys()
|
||||
|
||||
def needs_mcp_alternate_schema(self) -> bool:
|
||||
"""Check if this LLM service requires alternate MCP schema.
|
||||
|
||||
Some LLM services have stricter JSON schema validation and require
|
||||
certain properties to be removed or modified for compatibility.
|
||||
|
||||
Returns:
|
||||
True if MCP schemas should be cleaned for this service, False otherwise.
|
||||
"""
|
||||
return False
|
||||
|
||||
async def run_function_calls(self, function_calls: Sequence[FunctionCallFromLLM]):
|
||||
"""Execute a sequence of function calls from the LLM.
|
||||
|
||||
@@ -444,11 +433,7 @@ class LLMService(AIService):
|
||||
|
||||
await self._call_event_handler("on_function_calls_started", function_calls)
|
||||
|
||||
# Push frame both downstream and upstream
|
||||
started_frame_downstream = FunctionCallsStartedFrame(function_calls=function_calls)
|
||||
started_frame_upstream = FunctionCallsStartedFrame(function_calls=function_calls)
|
||||
await self.push_frame(started_frame_downstream, FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(started_frame_upstream, FrameDirection.UPSTREAM)
|
||||
await self.broadcast_frame(FunctionCallsStartedFrame, function_calls=function_calls)
|
||||
|
||||
for function_call in function_calls:
|
||||
if function_call.function_name in self._functions.keys():
|
||||
@@ -492,11 +477,19 @@ class LLMService(AIService):
|
||||
tool_call_id: Optional[str] = None,
|
||||
text_content: Optional[str] = None,
|
||||
video_source: Optional[str] = None,
|
||||
timeout: Optional[float] = 10.0,
|
||||
):
|
||||
"""Request an image from a user.
|
||||
|
||||
Pushes a UserImageRequestFrame upstream to request an image from the
|
||||
specified user.
|
||||
specified user. The user image can then be processed by the LLM.
|
||||
|
||||
Use this function from a function call if you want the LLM to process
|
||||
the image. If you expect the image to be processed by a vision service,
|
||||
you might want to push a UserImageRequestFrame upstream directly.
|
||||
|
||||
.. deprecated:: 0.0.92
|
||||
This method is deprecated, push a `UserImageRequestFrame` instead.
|
||||
|
||||
Args:
|
||||
user_id: The ID of the user to request an image from.
|
||||
@@ -504,14 +497,25 @@ class LLMService(AIService):
|
||||
tool_call_id: Optional tool call ID associated with the request.
|
||||
text_content: Optional text content/context for the image request.
|
||||
video_source: Optional video source identifier.
|
||||
timeout: Optional timeout for the requested image to be added to the LLM context.
|
||||
|
||||
"""
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Method `request_image_frame()` is deprecated, push a `UserImageRequestFrame` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
await self.push_frame(
|
||||
UserImageRequestFrame(
|
||||
user_id=user_id,
|
||||
text=text_content,
|
||||
# Deprecated fields below.
|
||||
function_name=function_name,
|
||||
tool_call_id=tool_call_id,
|
||||
context=text_content,
|
||||
video_source=video_source,
|
||||
),
|
||||
FrameDirection.UPSTREAM,
|
||||
)
|
||||
@@ -551,33 +555,24 @@ class LLMService(AIService):
|
||||
# NOTE(aleix): This needs to be removed after we remove the deprecation.
|
||||
await self._call_start_function(runner_item.context, runner_item.function_name)
|
||||
|
||||
# Push a function call in-progress downstream. This frame will let our
|
||||
# assistant context aggregator know that we are in the middle of a
|
||||
# function call. Some contexts/aggregators may not need this. But some
|
||||
# definitely do (Anthropic, for example). Also push it upstream for use
|
||||
# by other processors, like STTMuteFilter.
|
||||
progress_frame_downstream = FunctionCallInProgressFrame(
|
||||
# Broadcast function call in-progress. This frame will let our assistant
|
||||
# context aggregator know that we are in the middle of a function
|
||||
# call. Some contexts/aggregators may not need this. But some definitely
|
||||
# do (Anthropic, for example).
|
||||
await self.broadcast_frame(
|
||||
FunctionCallInProgressFrame,
|
||||
function_name=runner_item.function_name,
|
||||
tool_call_id=runner_item.tool_call_id,
|
||||
arguments=runner_item.arguments,
|
||||
cancel_on_interruption=item.cancel_on_interruption,
|
||||
)
|
||||
progress_frame_upstream = FunctionCallInProgressFrame(
|
||||
function_name=runner_item.function_name,
|
||||
tool_call_id=runner_item.tool_call_id,
|
||||
arguments=runner_item.arguments,
|
||||
cancel_on_interruption=item.cancel_on_interruption,
|
||||
)
|
||||
|
||||
# Push frame both downstream and upstream
|
||||
await self.push_frame(progress_frame_downstream, FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(progress_frame_upstream, FrameDirection.UPSTREAM)
|
||||
|
||||
# Define a callback function that pushes a FunctionCallResultFrame upstream & downstream.
|
||||
async def function_call_result_callback(
|
||||
result: Any, *, properties: Optional[FunctionCallResultProperties] = None
|
||||
):
|
||||
result_frame_downstream = FunctionCallResultFrame(
|
||||
await self.broadcast_frame(
|
||||
FunctionCallResultFrame,
|
||||
function_name=runner_item.function_name,
|
||||
tool_call_id=runner_item.tool_call_id,
|
||||
arguments=runner_item.arguments,
|
||||
@@ -585,17 +580,6 @@ class LLMService(AIService):
|
||||
run_llm=runner_item.run_llm,
|
||||
properties=properties,
|
||||
)
|
||||
result_frame_upstream = FunctionCallResultFrame(
|
||||
function_name=runner_item.function_name,
|
||||
tool_call_id=runner_item.tool_call_id,
|
||||
arguments=runner_item.arguments,
|
||||
result=result,
|
||||
run_llm=runner_item.run_llm,
|
||||
properties=properties,
|
||||
)
|
||||
|
||||
await self.push_frame(result_frame_downstream, FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(result_frame_upstream, FrameDirection.UPSTREAM)
|
||||
|
||||
if isinstance(item.handler, DirectFunctionWrapper):
|
||||
# Handler is a DirectFunctionWrapper
|
||||
|
||||
@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
# See .env.example for LMNT configuration needed
|
||||
@@ -46,7 +46,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
|
||||
Returns:
|
||||
The corresponding LMNT language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
LANGUAGE_MAP = {
|
||||
Language.DE: "de",
|
||||
Language.EN: "en",
|
||||
Language.ES: "es",
|
||||
@@ -68,17 +68,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
# Convert enum value to string and get the base language part (e.g. es-ES -> es)
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
# Look up the base code in our supported languages
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
class LmntTTSService(InterruptibleTTSService):
|
||||
|
||||
@@ -13,7 +13,8 @@ from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.pipeline.llm_switcher import LLMSwitcher
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.utils.base_object import BaseObject
|
||||
|
||||
try:
|
||||
@@ -56,75 +57,67 @@ class MCPClient(BaseObject):
|
||||
super().__init__(**kwargs)
|
||||
self._server_params = server_params
|
||||
self._session = ClientSession
|
||||
self._needs_alternate_schema = False
|
||||
|
||||
if isinstance(server_params, StdioServerParameters):
|
||||
self._client = stdio_client
|
||||
self._register_tools = self._stdio_register_tools
|
||||
self._list_tools = self._stdio_list_tools
|
||||
self._tool_wrapper = self._stdio_tool_wrapper
|
||||
elif isinstance(server_params, SseServerParameters):
|
||||
self._client = sse_client
|
||||
self._register_tools = self._sse_register_tools
|
||||
self._list_tools = self._sse_list_tools
|
||||
self._tool_wrapper = self._sse_tool_wrapper
|
||||
elif isinstance(server_params, StreamableHttpParameters):
|
||||
self._client = streamablehttp_client
|
||||
self._register_tools = self._streamable_http_register_tools
|
||||
self._list_tools = self._streamable_http_list_tools
|
||||
self._tool_wrapper = self._streamable_http_tool_wrapper
|
||||
else:
|
||||
raise TypeError(
|
||||
f"{self} invalid argument type: `server_params` must be either StdioServerParameters, SseServerParameters, or StreamableHttpParameters."
|
||||
)
|
||||
|
||||
async def register_tools(self, llm) -> ToolsSchema:
|
||||
async def register_tools(self, llm: LLMService | LLMSwitcher) -> ToolsSchema:
|
||||
"""Register all available MCP tools with an LLM service.
|
||||
|
||||
Connects to the MCP server, discovers available tools, converts their
|
||||
schemas to Pipecat format, and registers them with the LLM service.
|
||||
|
||||
This is the equivalent of calling get_tools_schema() followed by
|
||||
register_tools_schema().
|
||||
|
||||
Args:
|
||||
llm: The Pipecat LLM service to register tools with.
|
||||
|
||||
Returns:
|
||||
A ToolsSchema containing all successfully registered tools.
|
||||
"""
|
||||
# Check once if the LLM needs alternate strict schema
|
||||
self._needs_alternate_schema = llm and llm.needs_mcp_alternate_schema()
|
||||
tools_schema = await self._register_tools(llm)
|
||||
tools_schema = await self.get_tools_schema()
|
||||
await self.register_tools_schema(tools_schema, llm)
|
||||
return tools_schema
|
||||
|
||||
def _get_alternate_schema_for_strict_validation(self, schema: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get an alternate JSON schema to be compatible with LLMs that have strict validation.
|
||||
async def get_tools_schema(self) -> ToolsSchema:
|
||||
"""Get the schema of all available MCP tools without registering them.
|
||||
|
||||
Some LLMs have stricter validation and don't allow certain schema properties
|
||||
that are valid in standard JSON Schema.
|
||||
|
||||
Args:
|
||||
schema: The JSON schema to get an alternate schema for
|
||||
Connects to the MCP server, discovers available tools, and converts their
|
||||
schemas to Pipecat format.
|
||||
|
||||
Returns:
|
||||
An alternate schema compatible with strict validation
|
||||
A ToolsSchema containing all available tools. This can be used for
|
||||
subsequent registration using register_tools_schema().
|
||||
"""
|
||||
if not isinstance(schema, dict):
|
||||
return schema
|
||||
tools_schema = await self._list_tools()
|
||||
return tools_schema
|
||||
|
||||
alternate_schema = {}
|
||||
async def register_tools_schema(
|
||||
self, tools_schema: ToolsSchema, llm: LLMService | LLMSwitcher
|
||||
) -> None:
|
||||
"""Register the MCP tools (previously obtained from get_tools_schema()) with the LLM service.
|
||||
|
||||
for key, value in schema.items():
|
||||
# Skip additionalProperties as some LLMs don't like additionalProperties: false
|
||||
if key == "additionalProperties":
|
||||
continue
|
||||
|
||||
# Recursively get alternate schema for nested objects
|
||||
if isinstance(value, dict):
|
||||
alternate_schema[key] = self._get_alternate_schema_for_strict_validation(value)
|
||||
elif isinstance(value, list):
|
||||
alternate_schema[key] = [
|
||||
self._get_alternate_schema_for_strict_validation(item)
|
||||
if isinstance(item, dict)
|
||||
else item
|
||||
for item in value
|
||||
]
|
||||
else:
|
||||
alternate_schema[key] = value
|
||||
|
||||
return alternate_schema
|
||||
Args:
|
||||
tools_schema: The ToolsSchema to register with the LLM service.
|
||||
llm: The Pipecat LLM service to register tools with.
|
||||
"""
|
||||
for function_schema in tools_schema.standard_tools:
|
||||
llm.register_function(function_schema.name, self._tool_wrapper)
|
||||
|
||||
def _convert_mcp_schema_to_pipecat(
|
||||
self, tool_name: str, tool_schema: Dict[str, Any]
|
||||
@@ -143,11 +136,6 @@ class MCPClient(BaseObject):
|
||||
properties = tool_schema["input_schema"].get("properties", {})
|
||||
required = tool_schema["input_schema"].get("required", [])
|
||||
|
||||
# Only get alternate schema for LLMs that need strict schema validation
|
||||
if self._needs_alternate_schema:
|
||||
logger.debug("Getting alternate schema for strict validation")
|
||||
properties = self._get_alternate_schema_for_strict_validation(properties)
|
||||
|
||||
schema = FunctionSchema(
|
||||
name=tool_name,
|
||||
description=tool_schema["description"],
|
||||
@@ -159,112 +147,76 @@ class MCPClient(BaseObject):
|
||||
|
||||
return schema
|
||||
|
||||
async def _sse_register_tools(self, llm) -> ToolsSchema:
|
||||
"""Register all available mcp tools with the LLM service.
|
||||
async def _sse_list_tools(self) -> ToolsSchema:
|
||||
"""List all available mcp tools with the LLM service.
|
||||
|
||||
Args:
|
||||
llm: The Pipecat LLM service to register tools with
|
||||
Returns:
|
||||
A ToolsSchema containing all registered tools
|
||||
"""
|
||||
|
||||
async def mcp_tool_wrapper(params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(
|
||||
f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}"
|
||||
)
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(**self._server_params.model_dump()) as (read, write):
|
||||
async with self._session(read, write) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
logger.debug(f"SSE server parameters: {self._server_params}")
|
||||
logger.debug("Starting registration of mcp tools")
|
||||
logger.debug(f"Starting reading mcp tools")
|
||||
|
||||
async with self._client(**self._server_params.model_dump()) as (read, write):
|
||||
async with self._session(read, write) as session:
|
||||
await session.initialize()
|
||||
tools_schema = await self._list_tools(session, mcp_tool_wrapper, llm)
|
||||
tools_schema = await self._list_tools_helper(session)
|
||||
return tools_schema
|
||||
|
||||
async def _stdio_register_tools(self, llm) -> ToolsSchema:
|
||||
"""Register all available mcp tools with the LLM service.
|
||||
async def _sse_tool_wrapper(self, params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}")
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(**self._server_params.model_dump()) as (read, write):
|
||||
async with self._session(read, write) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
async def _stdio_list_tools(self) -> ToolsSchema:
|
||||
"""List all available mcp tools with the LLM service.
|
||||
|
||||
Args:
|
||||
llm: The Pipecat LLM service to register tools with
|
||||
Returns:
|
||||
A ToolsSchema containing all registered tools
|
||||
A ToolsSchema containing all available tools.
|
||||
"""
|
||||
|
||||
async def mcp_tool_wrapper(params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(
|
||||
f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}"
|
||||
)
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(self._server_params) as streams:
|
||||
async with self._session(streams[0], streams[1]) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
logger.debug("Starting registration of mcp tools")
|
||||
logger.debug(f"Starting reading mcp tools")
|
||||
|
||||
async with self._client(self._server_params) as streams:
|
||||
async with self._session(streams[0], streams[1]) as session:
|
||||
await session.initialize()
|
||||
tools_schema = await self._list_tools(session, mcp_tool_wrapper, llm)
|
||||
tools_schema = await self._list_tools_helper(session)
|
||||
return tools_schema
|
||||
|
||||
async def _streamable_http_register_tools(self, llm) -> ToolsSchema:
|
||||
"""Register all available mcp tools with the LLM service using streamable HTTP.
|
||||
async def _stdio_tool_wrapper(self, params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}")
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(self._server_params) as streams:
|
||||
async with self._session(streams[0], streams[1]) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
async def _streamable_http_list_tools(self) -> ToolsSchema:
|
||||
"""List all available mcp tools with the LLM service using streamable HTTP.
|
||||
|
||||
Args:
|
||||
llm: The Pipecat LLM service to register tools with
|
||||
Returns:
|
||||
A ToolsSchema containing all registered tools
|
||||
A ToolsSchema containing all available tools.
|
||||
"""
|
||||
|
||||
async def mcp_tool_wrapper(params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(
|
||||
f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}"
|
||||
)
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(**self._server_params.model_dump()) as (
|
||||
read_stream,
|
||||
write_stream,
|
||||
_,
|
||||
):
|
||||
async with self._session(read_stream, write_stream) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
logger.debug("Starting registration of mcp tools using streamable HTTP")
|
||||
logger.debug(f"Starting reading mcp tools using streamable HTTP")
|
||||
|
||||
async with self._client(**self._server_params.model_dump()) as (
|
||||
read_stream,
|
||||
@@ -273,9 +225,30 @@ class MCPClient(BaseObject):
|
||||
):
|
||||
async with self._session(read_stream, write_stream) as session:
|
||||
await session.initialize()
|
||||
tools_schema = await self._list_tools(session, mcp_tool_wrapper, llm)
|
||||
tools_schema = await self._list_tools_helper(session)
|
||||
return tools_schema
|
||||
|
||||
async def _streamable_http_tool_wrapper(self, params: FunctionCallParams) -> None:
|
||||
"""Wrapper for mcp tool calls to match Pipecat's function call interface."""
|
||||
logger.debug(f"Executing tool '{params.function_name}' with call ID: {params.tool_call_id}")
|
||||
logger.trace(f"Tool arguments: {json.dumps(params.arguments, indent=2)}")
|
||||
try:
|
||||
async with self._client(**self._server_params.model_dump()) as (
|
||||
read_stream,
|
||||
write_stream,
|
||||
_,
|
||||
):
|
||||
async with self._session(read_stream, write_stream) as session:
|
||||
await session.initialize()
|
||||
await self._call_tool(
|
||||
session, params.function_name, params.arguments, params.result_callback
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Error calling mcp tool {params.function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception("Full exception details:")
|
||||
await params.result_callback(error_msg)
|
||||
|
||||
async def _call_tool(self, session, function_name, arguments, result_callback):
|
||||
logger.debug(f"Calling mcp tool '{function_name}'")
|
||||
try:
|
||||
@@ -302,7 +275,7 @@ class MCPClient(BaseObject):
|
||||
final_response = response if len(response) else "Sorry, could not call the mcp tool"
|
||||
await result_callback(final_response)
|
||||
|
||||
async def _list_tools(self, session, mcp_tool_wrapper, llm):
|
||||
async def _list_tools_helper(self, session):
|
||||
available_tools = await session.list_tools()
|
||||
tool_schemas: List[FunctionSchema] = []
|
||||
|
||||
@@ -323,20 +296,16 @@ class MCPClient(BaseObject):
|
||||
{"description": tool.description, "input_schema": tool.inputSchema},
|
||||
)
|
||||
|
||||
# Register the wrapped function
|
||||
logger.debug(f"Registering function handler for '{tool_name}'")
|
||||
llm.register_function(tool_name, mcp_tool_wrapper)
|
||||
|
||||
# Add to list of schemas
|
||||
tool_schemas.append(function_schema)
|
||||
logger.debug(f"Successfully registered tool '{tool_name}'")
|
||||
logger.debug(f"Successfully read tool '{tool_name}'")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register tool '{tool_name}': {str(e)}")
|
||||
logger.error(f"Failed to read tool '{tool_name}': {str(e)}")
|
||||
logger.exception("Full exception details:")
|
||||
continue
|
||||
|
||||
logger.debug(f"Completed registration of {len(tool_schemas)} tools")
|
||||
logger.debug(f"Completed reading {len(tool_schemas)} tools")
|
||||
tools_schema = ToolsSchema(standard_tools=tool_schemas)
|
||||
|
||||
return tools_schema
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user