Compare commits
196 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9697abe559 | ||
|
|
cb0335c82a | ||
|
|
f560614af9 | ||
|
|
d7a196a3f4 | ||
|
|
644e106c03 | ||
|
|
70f83b4a75 | ||
|
|
35ed37c539 | ||
|
|
58a038ddb2 | ||
|
|
de3c1d6e8b | ||
|
|
0a9878998f | ||
|
|
8459c01af8 | ||
|
|
baaabf7d73 | ||
|
|
4735b74776 | ||
|
|
0109aea04c | ||
|
|
ce1311f6ba | ||
|
|
2520243d9d | ||
|
|
8869e25142 | ||
|
|
822392b0d4 | ||
|
|
124863175a | ||
|
|
17a5e78fb4 | ||
|
|
bc29bdb95e | ||
|
|
005fe33b25 | ||
|
|
24154474c9 | ||
|
|
86effc4d10 | ||
|
|
58e50882d8 | ||
|
|
ef183d0c96 | ||
|
|
f078df7805 | ||
|
|
815cd44c2a | ||
|
|
e5941926be | ||
|
|
6266c026a6 | ||
|
|
e25dccfc6b | ||
|
|
3bbfc42854 | ||
|
|
3b2127f912 | ||
|
|
ea12b10742 | ||
|
|
a2fbed86cf | ||
|
|
f75f361629 | ||
|
|
4c153e5d3c | ||
|
|
4088992d97 | ||
|
|
f1b16a672a | ||
|
|
65b15a8528 | ||
|
|
108e32eb72 | ||
|
|
38a02271c5 | ||
|
|
2ce203aeb8 | ||
|
|
b30df95f13 | ||
|
|
6be8deee2a | ||
|
|
c113cacd59 | ||
|
|
d0495eeef6 | ||
|
|
c3eb69165c | ||
|
|
0302f6d05c | ||
|
|
b9ff333654 | ||
|
|
92610944af | ||
|
|
6a337f1bc6 | ||
|
|
ef7fa07bf7 | ||
|
|
ce1506792e | ||
|
|
70f3d32734 | ||
|
|
356618b448 | ||
|
|
1624d7a474 | ||
|
|
092b1dcb0f | ||
|
|
b90ea9bf6a | ||
|
|
05c97804d5 | ||
|
|
7a8357a569 | ||
|
|
44756de15a | ||
|
|
94304ec74e | ||
|
|
a3fe34f4a2 | ||
|
|
21f6c2afa5 | ||
|
|
4d14251f4a | ||
|
|
1421c4ba22 | ||
|
|
6b1d8d9fa5 | ||
|
|
ac810e57ed | ||
|
|
bba7ca80e3 | ||
|
|
79250f1fe0 | ||
|
|
4f6e76e6fd | ||
|
|
b0962861c8 | ||
|
|
ec7c35fe98 | ||
|
|
10b86b4bbe | ||
|
|
8ec56092c0 | ||
|
|
0c3c5e5c7d | ||
|
|
b64ed3f9e2 | ||
|
|
5872006d6b | ||
|
|
457eb7aa92 | ||
|
|
14cd476b20 | ||
|
|
3b0affe5b4 | ||
|
|
08fe9157cc | ||
|
|
3f3d3c9203 | ||
|
|
6b6896a543 | ||
|
|
7858813871 | ||
|
|
7bba74ebd6 | ||
|
|
f425e946eb | ||
|
|
75bd1b5b9b | ||
|
|
d953c201bd | ||
|
|
263cad41f0 | ||
|
|
df9642eb5a | ||
|
|
dcbe86d0fc | ||
|
|
7fc79511dd | ||
|
|
4d9dc64af8 | ||
|
|
21f5cfe21a | ||
|
|
308044808d | ||
|
|
c244a950eb | ||
|
|
847bd8af4b | ||
|
|
10e58d6e42 | ||
|
|
609a0a14e7 | ||
|
|
84891de04d | ||
|
|
9a49517609 | ||
|
|
d8f5c0be71 | ||
|
|
93393ea91c | ||
|
|
58a17c7b1b | ||
|
|
103ced1eaa | ||
|
|
ac9bea27aa | ||
|
|
648094da26 | ||
|
|
29d604f608 | ||
|
|
b838bd906b | ||
|
|
c091232f2f | ||
|
|
8e247f395b | ||
|
|
b0e3b69b35 | ||
|
|
9213b22852 | ||
|
|
81571beb1b | ||
|
|
a07bee2318 | ||
|
|
a0f79b4700 | ||
|
|
2c3f051a1f | ||
|
|
c1b3a9f4b5 | ||
|
|
9ded7bab1b | ||
|
|
34fb303c44 | ||
|
|
2aec2467cb | ||
|
|
9d8eefd2a2 | ||
|
|
b59c4775da | ||
|
|
03bd667f95 | ||
|
|
e8c3f73968 | ||
|
|
91e5b1ad9a | ||
|
|
f2a19cb1a3 | ||
|
|
74becffe55 | ||
|
|
995f897b80 | ||
|
|
74d11dc0aa | ||
|
|
b435ddfa44 | ||
|
|
6d3dfd8f64 | ||
|
|
ce9c214eec | ||
|
|
8c8b76e9d2 | ||
|
|
7b3141ba19 | ||
|
|
928ade993b | ||
|
|
42a6fc703c | ||
|
|
c5c18335fd | ||
|
|
3159503c7f | ||
|
|
0340e25e9f | ||
|
|
af861b7975 | ||
|
|
6bb4e8295f | ||
|
|
f5f92dea63 | ||
|
|
cb1463f9f1 | ||
|
|
4c19f5584c | ||
|
|
80fecab4de | ||
|
|
ab91047300 | ||
|
|
3127cc6161 | ||
|
|
36319ecbf0 | ||
|
|
c6a1837844 | ||
|
|
31127abd9a | ||
|
|
aa355e3d32 | ||
|
|
9bd51cd88c | ||
|
|
fc1c3b48dc | ||
|
|
4278a37ebc | ||
|
|
7e045257e8 | ||
|
|
b8a1f45d4c | ||
|
|
8ec85f981d | ||
|
|
2f52905d32 | ||
|
|
f86cf98c6d | ||
|
|
84fcba772d | ||
|
|
b3bb6fdaa5 | ||
|
|
12b8af3d89 | ||
|
|
1c4ffb7845 | ||
|
|
8d4feede23 | ||
|
|
b11a3bc43f | ||
|
|
8dce66933f | ||
|
|
7291026695 | ||
|
|
686e250db1 | ||
|
|
e8d6f611cd | ||
|
|
f094ce80fb | ||
|
|
9fbe1bf2a3 | ||
|
|
d8b0e78bc8 | ||
|
|
675b7df408 | ||
|
|
30f39d7395 | ||
|
|
fe2ef9c712 | ||
|
|
173cf39aee | ||
|
|
ac43a70d36 | ||
|
|
8e4fd10e0f | ||
|
|
aeab417cd1 | ||
|
|
d263ad3c34 | ||
|
|
f3c454dc54 | ||
|
|
fc63790657 | ||
|
|
9ffcccdd84 | ||
|
|
503782c8b2 | ||
|
|
b834a893fe | ||
|
|
ba023248d9 | ||
|
|
14cf783647 | ||
|
|
86e726107f | ||
|
|
215b2dc7f3 | ||
|
|
874e2878be | ||
|
|
9131fa5c12 | ||
|
|
68a3070ad4 | ||
|
|
a7bf9f538c |
6
.github/workflows/format.yaml
vendored
6
.github/workflows/format.yaml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
run: uv python install 3.12
|
||||
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
run: uv sync --group dev --extra daily --extra tracing
|
||||
|
||||
- name: Ruff formatter
|
||||
id: ruff-format
|
||||
@@ -41,3 +41,7 @@ jobs:
|
||||
- name: Ruff linter (all rules)
|
||||
id: ruff-check
|
||||
run: uv run ruff check
|
||||
|
||||
- name: Type check (pyright)
|
||||
id: pyright
|
||||
run: uv run pyright
|
||||
|
||||
1
.github/workflows/update-docs.yml
vendored
1
.github/workflows/update-docs.yml
vendored
@@ -114,6 +114,7 @@ jobs:
|
||||
GH_TOKEN=$DOCS_SYNC_TOKEN gh pr create \
|
||||
--repo pipecat-ai/docs \
|
||||
--label auto-docs \
|
||||
--label pipecat \
|
||||
--title "docs: update for pipecat PR #${{ steps.pr.outputs.number }}" \
|
||||
--body "$(cat <<'BODY'
|
||||
Automated documentation update for [pipecat PR #${{ steps.pr.outputs.number }}](https://github.com/pipecat-ai/pipecat/pull/${{ steps.pr.outputs.number }}).
|
||||
|
||||
338
CHANGELOG.md
338
CHANGELOG.md
@@ -7,6 +7,344 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [1.1.0] - 2026-04-27
|
||||
|
||||
### Added
|
||||
|
||||
- Added `MistralSTTService` for real-time speech-to-text using Mistral's
|
||||
Voxtral Realtime API (`voxtral-mini-transcribe-realtime-2602`). Supports
|
||||
streaming transcription with interim results, automatic language detection,
|
||||
and VAD-driven utterance lifecycle.
|
||||
(PR [#4253](https://github.com/pipecat-ai/pipecat/pull/4253))
|
||||
|
||||
- Added `buttons` field to `OutputDTMFFrame` and `OutputDTMFUrgentFrame` for
|
||||
sending multi-key DTMF sequences as a `list[KeypadEntry]`. Use
|
||||
`OutputDTMFFrame.from_string("123#")` (or the equivalent on
|
||||
`OutputDTMFUrgentFrame`) to build one from a dial string, and `to_string()`
|
||||
to convert back.
|
||||
(PR [#4313](https://github.com/pipecat-ai/pipecat/pull/4313))
|
||||
|
||||
- Added `DailyTransport.send_dtmf()` to expose the Daily call client's DTMF
|
||||
sending capability, enabling applications to send tones during a call (e.g.
|
||||
IVR navigation).
|
||||
(PR [#4313](https://github.com/pipecat-ai/pipecat/pull/4313))
|
||||
|
||||
- Added `DailyOutputDTMFFrame` and `DailyOutputDTMFUrgentFrame` frames. In
|
||||
addition to the inherited `buttons`, they accept `session_id`,
|
||||
`digit_duration_ms` and `method`, which are forwarded to Daily's `send_dtmf`
|
||||
as `sessionId`, `digitDurationMs` and `method`.
|
||||
(PR [#4313](https://github.com/pipecat-ai/pipecat/pull/4313))
|
||||
|
||||
- Added incremental `pyright` type checking. A `pyrightconfig.json` at the repo
|
||||
root uses `typeCheckingMode: "basic"` with an explicit `include` list of
|
||||
modules that pass cleanly (`clocks`, `metrics`, `transcriptions`, `frames`,
|
||||
`observers`, `extensions`, `turns`, `pipeline`, `runner`). Remaining modules
|
||||
will be added in subsequent PRs. CI enforces the checked set via `uv run
|
||||
pyright` in the format workflow.
|
||||
(PR [#4324](https://github.com/pipecat-ai/pipecat/pull/4324))
|
||||
|
||||
- Added multilingual support to `DeepgramFluxSTTService` via a new
|
||||
`language_hints: list[Language]` setting. Works with Deepgram's new
|
||||
`flux-general-multi` model to bias transcription across English, Spanish,
|
||||
French, German, Hindi, Russian, Portuguese, Japanese, Italian, and Dutch.
|
||||
Omit the hints to use auto-detection, or pass a subset to bias toward
|
||||
expected languages. Hints can be updated mid-stream via
|
||||
`STTUpdateSettingsFrame` (sent as a Deepgram `Configure` control message, no
|
||||
reconnect) to support detect-then-lock flows.
|
||||
(PR [#4326](https://github.com/pipecat-ai/pipecat/pull/4326))
|
||||
|
||||
- Added fine-grained server-side VAD tuning options to
|
||||
`SarvamSTTService.Settings` for the `saaras:v3` model, including speech
|
||||
thresholds, frame-count controls, pre-speech padding, interruption
|
||||
sensitivity, and initial-frame skipping.
|
||||
(PR [#4334](https://github.com/pipecat-ai/pipecat/pull/4334))
|
||||
|
||||
- Added `XAISTTService` for real-time speech-to-text using xAI's voice STT
|
||||
WebSocket API (`wss://api.x.ai/v1/stt`). Streams raw audio (PCM, µ-law, or
|
||||
A-law) and emits interim and final transcription frames driven by the
|
||||
server's `is_final` / `speech_final` flags. Settings expose
|
||||
`interim_results`, `endpointing`, `language`, `multichannel`, `channels`, and
|
||||
`diarize`. Requires the `xai` optional extra (`pip install
|
||||
"pipecat-ai[xai]"`).
|
||||
(PR [#4340](https://github.com/pipecat-ai/pipecat/pull/4340))
|
||||
|
||||
- Added `XAITTSService` for streaming text-to-speech using xAI's WebSocket TTS
|
||||
endpoint (`wss://api.x.ai/v1/tts`). Streams `text.delta` chunks up and base64
|
||||
`audio.delta` chunks down on the same connection so audio begins flowing
|
||||
before the full utterance finishes synthesizing; complements the batch-HTTP
|
||||
`XAIHttpTTSService`. Defaults to raw PCM output so `TTSAudioRawFrame` needs
|
||||
no decoding. The `xai` optional extra now pulls in
|
||||
`pipecat-ai[websockets-base]`.
|
||||
(PR [#4341](https://github.com/pipecat-ai/pipecat/pull/4341))
|
||||
|
||||
- Added `SonioxTTSService`, a real-time WebSocket TTS service that streams text
|
||||
in and audio out over a persistent connection. Install with `pip install
|
||||
"pipecat-ai[soniox]"`.
|
||||
(PR [#4360](https://github.com/pipecat-ai/pipecat/pull/4360))
|
||||
|
||||
- Added support for Daily's built-in `screenVideo` destination in
|
||||
`DailyTransport`. When `"screenVideo"` is included in
|
||||
`video_out_destinations` transport parameter, a dedicated screen video track
|
||||
is created at join time and frames with `transport_destination="screenVideo"`
|
||||
are routed to it.
|
||||
|
||||
```python
|
||||
params = DailyParams(
|
||||
video_out_enabled=True,
|
||||
video_out_is_live=True,
|
||||
video_out_width=1280,
|
||||
video_out_height=720,
|
||||
video_out_destinations=["screenVideo"]
|
||||
)
|
||||
|
||||
...
|
||||
|
||||
frame = OutputImageRawFrame(...)
|
||||
frame.transport_destination = "screenVideo"
|
||||
```
|
||||
(PR [#4370](https://github.com/pipecat-ai/pipecat/pull/4370))
|
||||
|
||||
- Added `camera_out_send_settings` to `DailyParams`. This dict is passed
|
||||
verbatim to the Daily client's camera publishing settings, allowing
|
||||
applications to fully control encoding, codec, bitrate, and framerate.
|
||||
|
||||
```python
|
||||
params = DailyParams(
|
||||
camera_out_send_settings={
|
||||
"maxQuality": "high",
|
||||
"encodings": {
|
||||
"high": {"maxBitrate": 2_000_000, "maxFramerate": 30}
|
||||
},
|
||||
},
|
||||
)
|
||||
```
|
||||
(PR [#4370](https://github.com/pipecat-ai/pipecat/pull/4370))
|
||||
|
||||
- Added `tool_resources` to `PipelineTask` and `FunctionCallParams`. Pass an
|
||||
application-defined object (DB handles, clients, state, etc.) to
|
||||
`PipelineTask(..., tool_resources=...)` and access it from any tool handler
|
||||
via `params.tool_resources`. Passed by reference; the caller retains their
|
||||
handle and can read mutations after the task finishes. Resolves #4256.
|
||||
(PR [#4371](https://github.com/pipecat-ai/pipecat/pull/4371))
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated NVIDIA STT services to align with Nemotron Speech defaults and
|
||||
configuration: `api_key` is now optional for local deployments, additional
|
||||
recognition settings are available (including alternatives, word offsets, and
|
||||
diarization), and streaming/segmented docs now reflect Nemotron Speech APIs.
|
||||
- NVIDIA streaming STT now sets `TranscriptionFrame.finalized=True` when the
|
||||
provider marks a result as final, and preserves `language` on both
|
||||
`TranscriptionFrame` and `InterimTranscriptionFrame`.
|
||||
(PR [#4269](https://github.com/pipecat-ai/pipecat/pull/4269))
|
||||
|
||||
- Updated `NvidiaLLMService` to emit model reasoning as `LLMThought*Frame`s
|
||||
(from both `reasoning_content` and `<think>...</think>` output), avoid mixing
|
||||
reasoning text into normal assistant content, and allow keyless local NIM
|
||||
endpoints while warning when the cloud endpoint is used without an API key.
|
||||
(PR [#4270](https://github.com/pipecat-ai/pipecat/pull/4270))
|
||||
|
||||
- STT services now reconnect safely when settings change: reconnection is
|
||||
deferred until the current user turn ends (i.e., until
|
||||
`UserStoppedSpeakingFrame` is received) rather than interrupting an active
|
||||
speech session. Audio frames received while the reconnect is in progress are
|
||||
buffered and replayed once the new connection is ready. `CartesiaSTTService`
|
||||
and `DeepgramSTTService` both use this new behavior.
|
||||
(PR [#4311](https://github.com/pipecat-ai/pipecat/pull/4311))
|
||||
|
||||
- Reduced debug log noise for LLM services. The system instruction is now
|
||||
logged once when composed (e.g. when turn completion is enabled) instead of
|
||||
on every LLM call. Per-call logs now show only the conversation messages,
|
||||
consistent across Google, Anthropic, AWS, and OpenAI services.
|
||||
(PR [#4314](https://github.com/pipecat-ai/pipecat/pull/4314))
|
||||
|
||||
- `LiveKitRunnerArguments.token` is now a required `str` (previously `str |
|
||||
None` with a default of `None`). LiveKit requires a token to join a room, so
|
||||
the type now reflects reality. This only affects custom runners that
|
||||
construct `LiveKitRunnerArguments` directly; code consuming the argument from
|
||||
the standard runner is unaffected.
|
||||
(PR [#4324](https://github.com/pipecat-ai/pipecat/pull/4324))
|
||||
|
||||
- `TranscriptionFrame.language` and `InterimTranscriptionFrame.language`
|
||||
emitted by `DeepgramFluxSTTService` now reflect the language Deepgram
|
||||
detected for each turn (read from the `languages` field on Flux's `TurnInfo`
|
||||
event). On `flux-general-multi` this gives per-turn accuracy for downstream
|
||||
consumers (e.g. TTS voice selection). `flux-general-en` continues to emit
|
||||
`Language.EN`.
|
||||
(PR [#4326](https://github.com/pipecat-ai/pipecat/pull/4326))
|
||||
|
||||
- Added `includes_inter_frame_spaces` parameter to
|
||||
`TTSService.add_word_timestamps` and `_add_word_timestamps` (default `None`).
|
||||
When `True`, downstream consumers will not inject additional spaces between
|
||||
tokens; `None` leaves each frame's own default unchanged.
|
||||
- `InworldTTSService` now passes `includes_inter_frame_spaces=True` when
|
||||
reporting word timestamps, since Inworld tokens already include inter-word
|
||||
spacing.
|
||||
(PR [#4330](https://github.com/pipecat-ai/pipecat/pull/4330))
|
||||
|
||||
- `SarvamSTTService` now uses `saaras:v3` as its default model instead of
|
||||
`saarika:v2.5`. Applications that relied on the previous default should set
|
||||
`settings=SarvamSTTService.Settings(model="saarika:v2.5")` explicitly.
|
||||
(PR [#4334](https://github.com/pipecat-ai/pipecat/pull/4334))
|
||||
|
||||
- `SpeechTimeoutUserTurnStopStrategy` now waits only `user_speech_timeout` when
|
||||
a transcript arrives without a VAD stop event, rather than
|
||||
`max(ttfs_p99_latency, user_speech_timeout)`. If you had `ttfs_p99_latency >
|
||||
user_speech_timeout`, turn detection in that path is slightly faster than
|
||||
before.
|
||||
(PR [#4337](https://github.com/pipecat-ai/pipecat/pull/4337))
|
||||
|
||||
- If you use an STT service that emits finalized transcripts (Speechmatics,
|
||||
Soniox, Deepgram Flux, AssemblyAI) with `SpeechTimeoutUserTurnStopStrategy`,
|
||||
user turns now end as soon as `user_speech_timeout` elapses after VAD stop.
|
||||
Previously the strategy also waited for the STT P99 latency
|
||||
(`ttfs_p99_latency`) even when the transcript was already marked final.
|
||||
`user_speech_timeout` is still honored as a floor — STT finalization never
|
||||
shortens it.
|
||||
(PR [#4337](https://github.com/pipecat-ai/pipecat/pull/4337))
|
||||
|
||||
- ⚠️ `PlivoFrameSerializer` and `TelnyxFrameSerializer` now raise `ValueError`
|
||||
at construction when `auto_hang_up=True` (the default) but required
|
||||
credentials are missing, matching `TwilioFrameSerializer`. Previously they
|
||||
constructed successfully and the hangup failed silently at call-end, leaving
|
||||
phantom billable sessions on the provider. If you relied on the old silent
|
||||
behavior, pass `auto_hang_up=False` explicitly or provide the credentials.
|
||||
The specific fields checked are `call_id`/`auth_id`/`auth_token` for Plivo
|
||||
and `call_control_id`/`api_key` for Telnyx.
|
||||
(PR [#4349](https://github.com/pipecat-ai/pipecat/pull/4349))
|
||||
|
||||
- `ToolsSchema(standard_tools=...)` now accepts any `Sequence[FunctionSchema |
|
||||
DirectFunction]` rather than requiring an exact `list` of the union. Callers
|
||||
can pass a narrower `list[FunctionSchema]` (or any other `Sequence`) without
|
||||
the type checker complaining about list invariance.
|
||||
(PR [#4352](https://github.com/pipecat-ai/pipecat/pull/4352))
|
||||
|
||||
- Updated `aic-sdk` dependency to `~=2.2.0`. The `AIC_LICENSE_KEY` environment
|
||||
variable replaces the previous `AICOUSTICS_LICENSE_KEY`.
|
||||
(PR [#4362](https://github.com/pipecat-ai/pipecat/pull/4362))
|
||||
|
||||
- Loosened the `protobuf` dependency to `>=5.29.6,<7`, so projects pinned to
|
||||
protobuf 5.x can install `pipecat-ai` again. The previous `>=6.31.1,<7` pin
|
||||
(introduced in 1.0.8 alongside the `nvidia-riva-client 2.25.1` upgrade)
|
||||
silently blocked any environment whose dependency graph already constrained
|
||||
protobuf to the 5.x line. The bundled `frames_pb2.py` is now compiled with
|
||||
protoc 5.x so it imports cleanly on both 5.x and 6.x runtimes.
|
||||
|
||||
Installing the `nvidia` extra still pulls protobuf 6.x: `nvidia-riva-client
|
||||
2.25.1` ships gencode that requires a 6.x runtime, so `pipecat-ai[nvidia]`
|
||||
now declares `protobuf>=6.31.1,<7` explicitly to cover an upstream packaging
|
||||
gap (https://github.com/nvidia-riva/python-clients/issues/172).
|
||||
(PR [#4372](https://github.com/pipecat-ai/pipecat/pull/4372))
|
||||
|
||||
- Daily rooms created by the development runner (`pipecat.runner.run`) now
|
||||
expire after 4 hours with `eject_at_room_exp=True`, mirroring Pipecat Cloud's
|
||||
max session limit. Previously, runner-created rooms inherited a 2-hour
|
||||
expiration on the default code paths and had no expiration at all when
|
||||
callers posted partial `dailyRoomProperties` (e.g. `{"start_video_off":
|
||||
true}`) to `/start`, causing rooms to accumulate indefinitely. Explicit `exp`
|
||||
and `eject_at_room_exp` values in `dailyRoomProperties` are still respected.
|
||||
(PR [#4374](https://github.com/pipecat-ai/pipecat/pull/4374))
|
||||
|
||||
- Updated `daily-python` dependency to `~=0.28.0`.
|
||||
(PR [#4379](https://github.com/pipecat-ai/pipecat/pull/4379))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- Deprecated `TransportParams.video_out_bitrate` for the Daily transport. Use
|
||||
`DailyParams.camera_out_send_settings` instead to configure camera publishing
|
||||
encodings (bitrate, framerate, codec, etc.).
|
||||
(PR [#4370](https://github.com/pipecat-ai/pipecat/pull/4370))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed handling of tool calls with no registered handler: such calls now fail
|
||||
with a normal final tool result instead of leaving tool-call state hanging.
|
||||
(PR [#4301](https://github.com/pipecat-ai/pipecat/pull/4301))
|
||||
|
||||
- Fixed `pipecat-ai[tavus]` not installing the required `daily-python`
|
||||
dependency. Installing the `tavus` extra now correctly pulls in
|
||||
`pipecat-ai[daily]`.
|
||||
(PR [#4304](https://github.com/pipecat-ai/pipecat/pull/4304))
|
||||
|
||||
- Fixed audio loss and potential errors when STT settings were updated
|
||||
mid-speech. Previously, `CartesiaSTTService` and `DeepgramSTTService` would
|
||||
immediately disconnect and reconnect when settings changed, dropping any
|
||||
in-flight audio. Reconnection is now deferred until the user stops speaking,
|
||||
and audio arriving during the reconnect window is buffered and replayed.
|
||||
(PR [#4311](https://github.com/pipecat-ai/pipecat/pull/4311))
|
||||
|
||||
- Fixed `SmallestTTSService` WebSocket endpoint URL to match Smallest AI v4.0.0
|
||||
API (`wss://waves-api.smallest.ai` → `wss://api.smallest.ai`) and restored
|
||||
keepalive using a silent space message instead of the unsupported flush
|
||||
command.
|
||||
(PR [#4320](https://github.com/pipecat-ai/pipecat/pull/4320))
|
||||
|
||||
- Fixed whitespace handling in TTS token streaming mode. Inter-token whitespace
|
||||
(e.g., spaces between words) is now preserved for correct prosody, while
|
||||
leading whitespace before the first non-whitespace token is still stripped to
|
||||
avoid issues with TTS models that are sensitive to leading spaces.
|
||||
(PR [#4323](https://github.com/pipecat-ai/pipecat/pull/4323))
|
||||
|
||||
- Fixed `SentryMetrics` silently dropping `MetricsFrame`s from
|
||||
`stop_ttfb_metrics` and `stop_processing_metrics`. `SentryMetrics` called the
|
||||
base `FrameProcessorMetrics` implementation but discarded its return value,
|
||||
so `FrameProcessor` never pushed the `MetricsFrame` downstream. This
|
||||
prevented observers (e.g. `UserBotLatencyObserver`, `MetricsLogObserver`)
|
||||
from seeing TTFB and processing metrics for any service using
|
||||
`metrics=SentryMetrics()`. The metrics were still calculated and Sentry
|
||||
transactions still completed — only the downstream frame push was affected.
|
||||
(PR [#4325](https://github.com/pipecat-ai/pipecat/pull/4325))
|
||||
|
||||
- Fixed `ElevenLabsTTSService` and `ElevenLabsHttpTTSService` emitting word
|
||||
timestamps and `TTSTextFrame` content that matched the input text instead of
|
||||
the spoken audio when a pronunciation dictionary
|
||||
(`pronunciation_dictionary_locators`) or text normalization rewrote the
|
||||
input. Both services now consume ElevenLabs' normalized alignment, so
|
||||
downstream consumers (captions, transcripts, context aggregation) reflect
|
||||
what the listener actually hears.
|
||||
(PR [#4344](https://github.com/pipecat-ai/pipecat/pull/4344))
|
||||
|
||||
- Fixed a crash in `DeepgramSTTService` when an `STTUpdateSettingsFrame`
|
||||
arrived before the WebSocket handshake completed (for example, when pushing
|
||||
an update upstream on `StartFrame`). The settings-triggered reconnect
|
||||
cancelled the in-flight connection task before its keepalive task was
|
||||
created, causing an `UnboundLocalError: cannot access local variable
|
||||
'keepalive_task'` in the handler's `finally` block.
|
||||
(PR [#4347](https://github.com/pipecat-ai/pipecat/pull/4347))
|
||||
|
||||
- Fixed direct-function registration crashing for functions without a
|
||||
docstring. `DirectFunctionWrapper` passed `inspect.getdoc()`'s result to
|
||||
`docstring_parser.parse()`, which raises when the docstring is `None`.
|
||||
Functions now register cleanly whether or not they have a docstring; an empty
|
||||
docstring produces empty description and parameter metadata as expected.
|
||||
(PR [#4352](https://github.com/pipecat-ai/pipecat/pull/4352))
|
||||
|
||||
- Fixed `AssemblyAISTTService`, `CartesiaSTTService`, `GradiumSTTService`, and
|
||||
`SonioxSTTService` crashing the pipeline on transient WebSocket send
|
||||
failures. Each `run_stt` sent audio directly without catching errors, so a
|
||||
single network hiccup mid-stream raised an uncaught exception through
|
||||
`process_frame`. Send failures are now caught and logged as a warning, letting the connection-state
|
||||
check on the next call handle recovery, matching the pattern used by
|
||||
Deepgram, xAI, Azure, and other push-based STTs.
|
||||
(PR [#4352](https://github.com/pipecat-ai/pipecat/pull/4352))
|
||||
|
||||
- Fixed Gemini Live losing conversation history in the (rare) case of a
|
||||
WebSocket reconnect before any session resumption handle is received. When
|
||||
the session reconnects (e.g. on system instruction change), conversation
|
||||
history is now re-seeded into the new session before it is marked ready for
|
||||
input.
|
||||
(PR [#4355](https://github.com/pipecat-ai/pipecat/pull/4355))
|
||||
|
||||
- Fixed SmallWebRTC data channel silently stalling on networks with a 1280-byte
|
||||
MTU (IPv6, Tailscale overlays, many consumer VPNs). aiortc's default SCTP
|
||||
chunk size of 1200 bytes produces ~1305-byte UDP datagrams after headers,
|
||||
which the kernel rejects with EMSGSIZE; aiortc has no path-MTU discovery so
|
||||
it retransmits forever at the same oversized size. The chunk size is now
|
||||
clamped to 1100 bytes (~1205-byte datagrams, ~75 bytes of slack). Override
|
||||
with `PIPECAT_SCTP_MAX_CHUNK_SIZE` if your path MTU requires a different
|
||||
value.
|
||||
(PR [#4358](https://github.com/pipecat-ai/pipecat/pull/4358))
|
||||
|
||||
## [1.0.0] - 2026-04-14
|
||||
|
||||
Migration guide: https://docs.pipecat.ai/pipecat/migration/migration-1.0
|
||||
|
||||
36
README.md
36
README.md
@@ -28,6 +28,10 @@
|
||||
|
||||
## 🌐 Pipecat Ecosystem
|
||||
|
||||
### 🧩 Multi-agent systems
|
||||
|
||||
Need multiple AI agents working together? [Pipecat Subagents](https://github.com/pipecat-ai/pipecat-subagents) lets you build distributed multi-agent systems where each agent runs its own pipeline and communicates through a shared message bus. Hand off conversations between specialists, dispatch background tasks, and scale agents across processes or machines.
|
||||
|
||||
### 📱 Client SDKs
|
||||
|
||||
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
|
||||
@@ -67,7 +71,7 @@ and install any of the available plugins.
|
||||
|
||||
### 🧩 Community Integrations
|
||||
|
||||
Build and share your own Pipecat service integrations! Browse existing [community integrations](https://docs.pipecat.ai/server/services/community-integrations) or check out our [guide](COMMUNITY_INTEGRATIONS.md) to create your own.
|
||||
Build and share your own Pipecat service integrations! Browse existing [community integrations](https://docs.pipecat.ai/api-reference/server/services/community-integrations) or check out our [guide](COMMUNITY_INTEGRATIONS.md) to create your own.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
@@ -85,22 +89,22 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Nebius](https://docs.pipecat.ai/server/services/llm/nebius), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [Kokoro](https://docs.pipecat.ai/server/services/tts/kokoro), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [LiveKit (WebRTC)](https://docs.pipecat.ai/server/services/transport/livekit), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), [WhatsApp](https://docs.pipecat.ai/server/services/transport/whatsapp), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/services/serializers/exotel), [Genesys](https://docs.pipecat.ai/server/services/serializers/genesys), [Plivo](https://docs.pipecat.ai/server/services/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/services/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/services/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/services/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/transport/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp Viva](https://docs.pipecat.ai/guides/features/krisp-viva), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter), [RNNoise](https://docs.pipecat.ai/server/utilities/audio/rnnoise-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) |
|
||||
| Category | Services |
|
||||
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/api-reference/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/api-reference/server/services/stt/aws), [Azure](https://docs.pipecat.ai/api-reference/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/api-reference/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/api-reference/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/api-reference/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/api-reference/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/api-reference/server/services/stt/gladia), [Google](https://docs.pipecat.ai/api-reference/server/services/stt/google), [Gradium](https://docs.pipecat.ai/api-reference/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/api-reference/server/services/stt/groq), [Mistral](https://docs.pipecat.ai/api-reference/server/services/stt/mistral), [NVIDIA Riva](https://docs.pipecat.ai/api-reference/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/api-reference/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/api-reference/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/api-reference/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/api-reference/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/api-reference/server/services/stt/whisper), [xAI](https://docs.pipecat.ai/api-reference/server/services/stt/xai) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/api-reference/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/api-reference/server/services/llm/aws), [Azure](https://docs.pipecat.ai/api-reference/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/api-reference/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/api-reference/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/api-reference/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/api-reference/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/api-reference/server/services/llm/grok), [Groq](https://docs.pipecat.ai/api-reference/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/api-reference/server/services/llm/mistral), [Nebius](https://docs.pipecat.ai/api-reference/server/services/llm/nebius), [Novita](https://docs.pipecat.ai/api-reference/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/api-reference/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/api-reference/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/api-reference/server/services/llm/openai), [OpenAI Responses](https://docs.pipecat.ai/api-reference/server/services/llm/openai-responses), [OpenRouter](https://docs.pipecat.ai/api-reference/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/api-reference/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/api-reference/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/api-reference/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/api-reference/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/api-reference/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/api-reference/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/api-reference/server/services/tts/aws), [Azure](https://docs.pipecat.ai/api-reference/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/api-reference/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/api-reference/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/api-reference/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/api-reference/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/api-reference/server/services/tts/fish), [Google](https://docs.pipecat.ai/api-reference/server/services/tts/google), [Gradium](https://docs.pipecat.ai/api-reference/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/api-reference/server/services/tts/groq), [Hume](https://docs.pipecat.ai/api-reference/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/api-reference/server/services/tts/inworld), [Kokoro](https://docs.pipecat.ai/api-reference/server/services/tts/kokoro), [LMNT](https://docs.pipecat.ai/api-reference/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/api-reference/server/services/tts/minimax), [Mistral](https://docs.pipecat.ai/api-reference/server/services/tts/mistral), [Neuphonic](https://docs.pipecat.ai/api-reference/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/api-reference/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/api-reference/server/services/tts/openai), [Piper](https://docs.pipecat.ai/api-reference/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/api-reference/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/api-reference/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/api-reference/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/api-reference/server/services/tts/smallest), [Soniox](https://docs.pipecat.ai/api-reference/server/services/tts/soniox), [Speechmatics](https://docs.pipecat.ai/api-reference/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/api-reference/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/api-reference/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/api-reference/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/api-reference/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/api-reference/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/api-reference/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/api-reference/server/services/s2s/ultravox) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/api-reference/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/api-reference/server/services/transport/fastapi-websocket), [LiveKit (WebRTC)](https://docs.pipecat.ai/api-reference/server/services/transport/livekit), [SmallWebRTCTransport](https://docs.pipecat.ai/api-reference/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/api-reference/server/services/transport/websocket-server), [WhatsApp](https://docs.pipecat.ai/api-reference/server/services/transport/whatsapp), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/api-reference/server/services/serializers/exotel), [Genesys](https://docs.pipecat.ai/api-reference/server/services/serializers/genesys), [Plivo](https://docs.pipecat.ai/api-reference/server/services/serializers/plivo), [Twilio](https://docs.pipecat.ai/api-reference/server/services/serializers/twilio), [Telnyx](https://docs.pipecat.ai/api-reference/server/services/serializers/telnyx), [Vonage](https://docs.pipecat.ai/api-reference/server/services/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/api-reference/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/api-reference/server/services/transport/lemonslice), [Tavus](https://docs.pipecat.ai/api-reference/server/services/video/tavus), [Simli](https://docs.pipecat.ai/api-reference/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/api-reference/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/api-reference/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/api-reference/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/api-reference/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/api-reference/server/utilities/audio/silero-vad-analyzer), [Krisp Viva](https://docs.pipecat.ai/guides/features/krisp-viva), [Koala](https://docs.pipecat.ai/api-reference/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/api-reference/server/utilities/audio/aic-filter), [RNNoise](https://docs.pipecat.ai/api-reference/server/utilities/audio/rnnoise-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/api-reference/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/api-reference/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/api-reference/server/services/community-integrations) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/api-reference/server/services/supported-services)
|
||||
|
||||
## ⚡ Getting started
|
||||
|
||||
|
||||
10
env.example
10
env.example
@@ -1,5 +1,5 @@
|
||||
# AI-COUSTICS
|
||||
AICOUSTICS_LICENSE_KEY=...
|
||||
AIC_LICENSE_KEY=...
|
||||
|
||||
# Anthropic
|
||||
ANTHROPIC_API_KEY=...
|
||||
@@ -214,4 +214,10 @@ WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
|
||||
# xAI / Grok
|
||||
XAI_API_KEY=...
|
||||
XAI_API_KEY=...
|
||||
|
||||
# PIPECAT_SCTP_MAX_CHUNK_SIZE controls the maximum SCTP DATA-chunk payload
|
||||
# size (bytes) used by aiortc's data channel. The default is 1100.
|
||||
# All the details here:
|
||||
# https://docs.pipecat.ai/api-reference/server/services/transport/small-webrtc#pipecat_sctp_max_chunk_size
|
||||
#PIPECAT_SCTP_MAX_CHUNK_SIZE=1100
|
||||
@@ -71,17 +71,17 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -108,17 +108,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"], audio_passthrough=True)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121",
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -102,17 +102,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -89,10 +89,10 @@ async def get_current_weather(params: FunctionCallParams):
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -109,7 +109,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Primary LLM for conversation (could be any provider)
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
@@ -117,7 +117,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Dedicated cheap/fast LLM for summarization only
|
||||
summarization_llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
model="gemini-2.5-flash",
|
||||
),
|
||||
|
||||
@@ -77,17 +77,17 @@ async def get_current_weather(params: FunctionCallParams):
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You have access to tools to get the current weather - use them when relevant.",
|
||||
),
|
||||
|
||||
@@ -72,10 +72,10 @@ async def summarize_conversation(params: FunctionCallParams):
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -91,7 +91,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -77,17 +77,17 @@ async def get_current_weather(params: FunctionCallParams):
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You have access to tools to get the current weather - use them when relevant.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -58,24 +58,24 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
openai_llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
groq_llm = GroqLLMService(
|
||||
api_key=os.getenv("GROQ_API_KEY"),
|
||||
api_key=os.environ["GROQ_API_KEY"],
|
||||
settings=GroqLLMService.Settings(
|
||||
system_instruction="You are a very helpful assistant. Your goal is to demonstrate your capabilities in detail in a creative and helpful way.",
|
||||
),
|
||||
|
||||
@@ -63,10 +63,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -74,7 +74,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Main LLM — drives the conversation. Its RTVI events reach the client.
|
||||
main_llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
@@ -83,7 +83,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Evaluator LLM — silently grades the user's message in the background.
|
||||
# Its RTVI events will be suppressed so the client is unaware of this branch.
|
||||
evaluator_llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
name="EvaluatorLLM",
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a silent quality evaluator. When given a user message, respond with a single JSON object: {'score': <1-5>, 'reason': '<brief reason>'}. Do not respond conversationally.",
|
||||
|
||||
@@ -91,17 +91,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -56,10 +56,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = DeepgramTTSService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
settings=DeepgramTTSService.Settings(
|
||||
voice="aura-asteria-en",
|
||||
),
|
||||
@@ -68,7 +68,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = OpenAILLMService(
|
||||
# To use OpenAI
|
||||
# api_key=os.getenv("OPENAI_API_KEY"),
|
||||
# api_key=os.environ["OPENAI_API_KEY"],
|
||||
# Or, to use a local vLLM (or similar) api server
|
||||
settings=OpenAILLMService.Settings(
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
|
||||
@@ -55,17 +55,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="d4db5fb9-f44b-4bd1-85fa-192e0f0d75f9", # Spanish-speaking Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a live translation assistant. Your sole purpose is to translate English text into Spanish. When you receive English text from the user, immediately translate it into natural, fluent Spanish. Do not add explanations, commentary, or extra information—only provide the Spanish translation of the text you receive.",
|
||||
),
|
||||
|
||||
@@ -126,14 +126,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm_text_aggregator.on_pattern_match("voice", on_voice_tag)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
# Process LLM text through the pattern aggregator before TTS
|
||||
llm_text_processor = LLMTextProcessor(text_aggregator=llm_text_aggregator)
|
||||
|
||||
# Initialize TTS with narrator voice as default
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice=VOICE_IDS["narrator"],
|
||||
),
|
||||
@@ -190,7 +190,7 @@ Remember: Use narrator voice for EVERYTHING except the actual quoted dialogue.""
|
||||
|
||||
# Initialize LLM
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -94,19 +94,19 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
required=["location", "format"],
|
||||
)
|
||||
|
||||
stt_cartesia = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
stt_deepgram = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt_cartesia = CartesiaSTTService(api_key=os.environ["CARTESIA_API_KEY"])
|
||||
stt_deepgram = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
# Uses ServiceSwitcherStrategyManual by default
|
||||
stt_switcher = ServiceSwitcher(services=[stt_cartesia, stt_deepgram])
|
||||
|
||||
tts_cartesia = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
tts_deepgram = DeepgramTTSService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
settings=DeepgramTTSService.Settings(
|
||||
voice="aura-2-helena-en",
|
||||
),
|
||||
@@ -117,11 +117,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
system_prompt = "You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way."
|
||||
|
||||
llm_openai = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(system_instruction=system_prompt),
|
||||
)
|
||||
llm_google = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(system_instruction=system_prompt),
|
||||
)
|
||||
# Uses ServiceSwitcherStrategyManual by default
|
||||
|
||||
@@ -42,14 +42,14 @@ class SwitchLanguage(ParallelPipeline):
|
||||
self._current_language = "English"
|
||||
|
||||
english_tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
spanish_tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="d4db5fb9-f44b-4bd1-85fa-192e0f0d75f9", # Spanish-speaking Lady
|
||||
),
|
||||
@@ -101,7 +101,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
settings=DeepgramSTTService.Settings(
|
||||
language="multi",
|
||||
),
|
||||
@@ -110,7 +110,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tts = SwitchLanguage()
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You can speak the following languages: 'English' and 'Spanish'.",
|
||||
),
|
||||
|
||||
@@ -42,21 +42,21 @@ class SwitchVoices(ParallelPipeline):
|
||||
self._current_voice = "News Lady"
|
||||
|
||||
news_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
|
||||
),
|
||||
)
|
||||
|
||||
british_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
barbershop_man = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
),
|
||||
@@ -114,12 +114,12 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = SwitchVoices()
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative and helpful way. You can do the following voices: 'News Lady', 'British Lady' and 'Barbershop Man'.",
|
||||
),
|
||||
|
||||
@@ -60,13 +60,13 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
# Cartesia offers `<spell></spell>` tags that we can use to ask the user
|
||||
# to confirm the emails.
|
||||
# (see https://docs.cartesia.ai/build-with-sonic/formatting-text-for-sonic/spelling-out-input-text)
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -84,7 +84,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# )
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You need to gather a valid email or emails from the user. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. If the user provides one or more email addresses confirm them with the user. Enclose all emails with <spell> tags, for example <spell>a@a.com</spell>.",
|
||||
),
|
||||
|
||||
@@ -52,22 +52,22 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
classifier_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
classifier_llm = OpenAILLMService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
voicemail = VoicemailDetector(llm=classifier_llm)
|
||||
|
||||
|
||||
@@ -57,21 +57,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
settings=DeepgramSTTService.Settings(
|
||||
keyterm=["pipecat"],
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -107,17 +107,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction=(
|
||||
|
||||
@@ -66,17 +66,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -86,10 +86,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -97,7 +97,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Anthropic for vision analysis
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -65,17 +65,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -86,10 +86,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -60,18 +60,18 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
api_key=os.environ["AZURE_CHATGPT_API_KEY"],
|
||||
endpoint=os.environ["AZURE_CHATGPT_ENDPOINT"],
|
||||
settings=AzureLLMService.Settings(
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = CerebrasLLMService(
|
||||
api_key=os.getenv("CEREBRAS_API_KEY"),
|
||||
api_key=os.environ["CEREBRAS_API_KEY"],
|
||||
settings=CerebrasLLMService.Settings(
|
||||
system_instruction="""You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.
|
||||
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = DeepSeekLLMService(
|
||||
api_key=os.getenv("DEEPSEEK_API_KEY"),
|
||||
api_key=os.environ["DEEPSEEK_API_KEY"],
|
||||
settings=DeepSeekLLMService.Settings(
|
||||
model="deepseek-chat",
|
||||
system_instruction="""You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.
|
||||
|
||||
@@ -76,17 +76,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = FireworksLLMService(
|
||||
api_key=os.getenv("FIREWORKS_API_KEY"),
|
||||
api_key=os.environ["FIREWORKS_API_KEY"],
|
||||
settings=FireworksLLMService.Settings(
|
||||
model="accounts/fireworks/models/gpt-oss-20b",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -107,17 +107,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction=(
|
||||
|
||||
@@ -98,10 +98,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -127,7 +127,7 @@ indicate you should use the get_image tool are:
|
||||
"""
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
|
||||
@@ -60,19 +60,19 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
|
||||
api_key=os.environ["ELEVENLABS_API_KEY"],
|
||||
settings=ElevenLabsTTSService.Settings(
|
||||
voice=os.getenv("ELEVENLABS_VOICE_ID", ""),
|
||||
voice=os.getenv("ELEVENLABS_VOICE_ID", "Xb7hH8MSUJpSbSDYk0k2"),
|
||||
),
|
||||
)
|
||||
|
||||
llm = GoogleVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
credentials=os.environ["GOOGLE_VERTEX_TEST_CREDENTIALS"],
|
||||
project_id=os.environ["GOOGLE_CLOUD_PROJECT_ID"],
|
||||
location=os.environ["GOOGLE_CLOUD_LOCATION"],
|
||||
settings=GoogleVertexLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
@@ -103,14 +103,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "Start a conversation with 'Hey there' to get the current weather.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context = LLMContext(tools=tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
@@ -141,6 +134,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "Please introduce yourself to the user.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -86,10 +86,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -97,7 +97,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Google Gemini model for vision analysis
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -96,10 +96,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -125,7 +125,7 @@ indicate you should use the get_image tool are:
|
||||
"""
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -62,10 +62,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = XAIHttpTTSService(
|
||||
api_key=os.getenv("XAI_API_KEY"),
|
||||
api_key=os.environ["XAI_API_KEY"],
|
||||
aiohttp_session=session,
|
||||
settings=XAIHttpTTSService.Settings(
|
||||
voice="eve",
|
||||
@@ -73,7 +73,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = GrokLLMService(
|
||||
api_key=os.getenv("XAI_API_KEY"),
|
||||
api_key=os.environ["XAI_API_KEY"],
|
||||
settings=GrokLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = GroqSTTService(api_key=os.getenv("GROQ_API_KEY"))
|
||||
stt = GroqSTTService(api_key=os.environ["GROQ_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = GroqLLMService(
|
||||
api_key=os.getenv("GROQ_API_KEY"),
|
||||
api_key=os.environ["GROQ_API_KEY"],
|
||||
settings=GroqLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = MistralLLMService(
|
||||
api_key=os.getenv("MISTRAL_API_KEY"),
|
||||
api_key=os.environ["MISTRAL_API_KEY"],
|
||||
settings=MistralLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -117,17 +117,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = NebiusLLMService(
|
||||
api_key=os.getenv("NEBIUS_API_KEY"),
|
||||
api_key=os.environ["NEBIUS_API_KEY"],
|
||||
settings=NebiusLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = NovitaLLMService(
|
||||
api_key=os.getenv("NOVITA_API_KEY"),
|
||||
api_key=os.environ["NOVITA_API_KEY"],
|
||||
settings=NovitaLLMService.Settings(
|
||||
model="openai/gpt-oss-120b",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = NvidiaLLMService(
|
||||
api_key=os.getenv("NVIDIA_API_KEY"),
|
||||
api_key=os.environ["NVIDIA_API_KEY"],
|
||||
settings=NvidiaLLMService.Settings(
|
||||
model="nvidia/llama-3.3-nemotron-super-49b-v1.5",
|
||||
# Recommended when turning thinking off
|
||||
|
||||
@@ -64,10 +64,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -107,17 +107,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction=(
|
||||
|
||||
@@ -70,7 +70,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAISTTService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAISTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
|
||||
@@ -78,7 +78,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAITTSService.Settings(
|
||||
voice="ballad",
|
||||
),
|
||||
@@ -86,7 +86,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -107,17 +107,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction=(
|
||||
|
||||
@@ -66,17 +66,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
enable_async_tool_cancellation=True,
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesHttpLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesHttpLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -87,17 +87,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesHttpLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesHttpLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -87,17 +87,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -87,17 +87,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are able to describe images from the user camera.",
|
||||
),
|
||||
|
||||
@@ -64,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = OpenAISTTService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAISTTService.Settings(
|
||||
model="gpt-4o-transcribe",
|
||||
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
|
||||
@@ -72,7 +72,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
tts = OpenAITTSService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAITTSService.Settings(
|
||||
voice="ballad",
|
||||
),
|
||||
@@ -80,7 +80,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,11 +60,11 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = AzureTTSService(
|
||||
api_key=os.getenv("AZURE_SPEECH_API_KEY"),
|
||||
region=os.getenv("AZURE_SPEECH_REGION"),
|
||||
api_key=os.environ["AZURE_SPEECH_API_KEY"],
|
||||
region=os.environ["AZURE_SPEECH_REGION"],
|
||||
settings=AzureTTSService.Settings(
|
||||
voice="en-US-JennyNeural",
|
||||
language="en-US",
|
||||
@@ -74,7 +74,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenRouterLLMService(
|
||||
api_key=os.getenv("OPENROUTER_API_KEY"),
|
||||
api_key=os.environ["OPENROUTER_API_KEY"],
|
||||
settings=OpenRouterLLMService.Settings(
|
||||
model="openai/gpt-4o-2024-11-20",
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -58,17 +58,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = PerplexityLLMService(
|
||||
api_key=os.getenv("PERPLEXITY_API_KEY"),
|
||||
api_key=os.environ["PERPLEXITY_API_KEY"],
|
||||
settings=PerplexityLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = QwenLLMService(
|
||||
api_key=os.getenv("QWEN_API_KEY"),
|
||||
api_key=os.environ["QWEN_API_KEY"],
|
||||
model="qwen2.5-72b-instruct",
|
||||
settings=QwenLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -61,18 +61,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = SambaNovaLLMService(
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"),
|
||||
api_key=os.environ["SAMBANOVA_API_KEY"],
|
||||
settings=SambaNovaLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -64,21 +64,21 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
api_key=os.environ["SARVAM_API_KEY"],
|
||||
settings=SarvamSTTService.Settings(
|
||||
model="saaras:v3",
|
||||
),
|
||||
)
|
||||
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
api_key=os.environ["SARVAM_API_KEY"],
|
||||
settings=SarvamTTSService.Settings(
|
||||
model="bulbul:v3",
|
||||
voice="shubh",
|
||||
),
|
||||
)
|
||||
llm = SarvamLLMService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
api_key=os.environ["SARVAM_API_KEY"],
|
||||
settings=SarvamLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,17 +60,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = TogetherLLMService(
|
||||
api_key=os.getenv("TOGETHER_API_KEY"),
|
||||
api_key=os.environ["TOGETHER_API_KEY"],
|
||||
settings=TogetherLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
260
examples/function-calling/function-calling-tool-resources.py
Normal file
260
examples/function-calling/function-calling-tool-resources.py
Normal file
@@ -0,0 +1,260 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Example demonstrating ``PipelineTask(tool_resources=...)``.
|
||||
|
||||
``tool_resources`` is an application-defined bag of anything you want every
|
||||
tool handler in a session to share by reference: database handles, HTTP
|
||||
clients, feature flags, per-user state, observability clients, in-memory
|
||||
caches — whatever fits your app. Pipecat passes it through untouched as
|
||||
``FunctionCallParams.tool_resources``.
|
||||
|
||||
This example uses a small ``ToolCallLogger`` as a stand-in for that "shared
|
||||
thing". A real app might just as easily pass a Postgres pool, a Redis
|
||||
client, a Stripe SDK instance, or any combination thereof. The mechanics
|
||||
shown here — construct once, hand to the task, read it from each handler,
|
||||
inspect it after the session — are the same regardless of what you put in.
|
||||
|
||||
We bundle resources in a typed ``SessionResources`` dataclass and cast back
|
||||
to it at the top of each handler. Pipecat doesn't care what type you pass
|
||||
(a plain dict works too), but a typed container gives you autocomplete and
|
||||
refactor safety instead of dict-by-string-key lookups.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime, timezone
|
||||
from typing import Any, cast
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.responses.llm import OpenAIResponsesLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class ToolCallLogger:
|
||||
"""Stand-in shared resource — swap for whatever your app actually needs."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the logger with an empty list of recorded calls."""
|
||||
self._calls: list[dict[str, Any]] = []
|
||||
|
||||
def log_tool_call(self, function_name: str, arguments: Mapping[str, Any]) -> None:
|
||||
"""Record a tool call invocation.
|
||||
|
||||
Args:
|
||||
function_name: The name of the tool being invoked.
|
||||
arguments: The arguments passed to the tool.
|
||||
"""
|
||||
entry = {
|
||||
"timestamp": datetime.now(UTC).isoformat(),
|
||||
"function_name": function_name,
|
||||
"arguments": dict(arguments),
|
||||
}
|
||||
self._calls.append(entry)
|
||||
logger.info(f"[ToolCallLogger] {function_name} called with {dict(arguments)}")
|
||||
|
||||
def dump(self) -> str:
|
||||
"""Return all recorded tool calls as a JSON string."""
|
||||
return json.dumps(self._calls, indent=2)
|
||||
|
||||
|
||||
@dataclass
class SessionResources:
    """Typed bundle of the objects shared by this session's tool handlers.

    Grow it by adding fields as the app needs them (e.g.
    ``db: AsyncConnection``, ``http: httpx.AsyncClient``). Handlers
    ``cast()`` ``params.tool_resources`` back to this type, which gives
    autocomplete and refactor safety.
    """

    # Shared recorder of tool invocations used by the handlers in this file.
    tool_call_logger: ToolCallLogger
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
    """Log the tool call on the shared logger, then return canned weather.

    Args:
        params: Function call parameters; ``params.tool_resources`` is
            treated as the session's ``SessionResources`` (``cast`` performs
            no runtime check).
    """
    session = cast(SessionResources, params.tool_resources)
    call_log = session.tool_call_logger
    call_log.log_tool_call(params.function_name, params.arguments)

    # Fixed demo payload; no real weather lookup happens here.
    canned_weather = {"conditions": "nice", "temperature": "75"}
    await params.result_callback(canned_weather)
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
    """Log the tool call on the shared logger, then return a canned restaurant.

    Args:
        params: Function call parameters; ``params.tool_resources`` is
            treated as the session's ``SessionResources`` (``cast`` performs
            no runtime check).
    """
    session = cast(SessionResources, params.tool_resources)
    call_log = session.tool_call_logger
    call_log.log_tool_call(params.function_name, params.arguments)

    # Fixed demo payload; no real recommendation service is queried.
    canned_pick = {"name": "The Golden Dragon"}
    await params.result_callback(canned_pick)
|
||||
|
||||
|
||||
# Each value is a zero-argument factory rather than a ready-made params
# object, so transport parameter creation is deferred until the transport
# type is selected at runtime.
transport_params = {
    "daily": lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    "twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    "webrtc": lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_connection_error")
|
||||
async def on_connection_error(service, error):
|
||||
logger.error(f"LLM connection error: {error}")
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
# Avoid appending this filler message to the LLM context — it would
|
||||
# alter the conversation history and prevent
|
||||
# OpenAIResponsesLLMService's previous_response_id optimization from
|
||||
# matching, forcing a full context resend.
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that.", append_to_context=False))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
context = LLMContext(tools=tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
# Keep a local handle so we can read collected state after the session
|
||||
# ends; Pipecat never copies or clears the object.
|
||||
tool_call_logger = ToolCallLogger()
|
||||
resources = SessionResources(tool_call_logger=tool_call_logger)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
tool_resources=resources,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
# The session has ended; read whatever state the handlers built up.
|
||||
logger.info(f"Tool calls logged during session:\n{tool_call_logger.dump()}")
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -36,7 +36,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -28,7 +28,7 @@ async def main():
|
||||
transport = LocalAudioTransport(LocalAudioTransportParams(audio_out_enabled=True))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -38,14 +38,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -42,7 +42,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
imagegen = GoogleImageGenService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
|
||||
@@ -108,10 +108,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create an HTTP session for API calls
|
||||
async with aiohttp.ClientSession() as session:
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm = OpenAILLMService(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
tts = CartesiaHttpTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaHttpTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
|
||||
@@ -96,17 +96,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -51,17 +51,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -40,17 +40,17 @@ async def main():
|
||||
)
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -63,17 +63,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -53,10 +53,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -77,7 +77,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -49,10 +49,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -71,7 +71,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -23,8 +23,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.mcp_service import MCPClient
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -54,15 +52,6 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
system = f"""
|
||||
You are a helpful LLM in a voice call.
|
||||
Your goal is to answer questions about the user's GitHub repositories and account.
|
||||
@@ -85,7 +74,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tools = await mcp.get_tools_schema()
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
system_instruction=system,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
@@ -54,10 +54,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -73,7 +73,7 @@ Just respond with short sentences when you are carrying out tool calls.
|
||||
"""
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction=system_prompt,
|
||||
),
|
||||
|
||||
@@ -100,17 +100,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
|
||||
@@ -60,12 +60,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
api_key=os.environ["DEEPGRAM_API_KEY"],
|
||||
metrics=SentryMetrics(),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -73,7 +73,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
metrics=SentryMetrics(),
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
|
||||
@@ -84,7 +84,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
logger.debug(
|
||||
f"loaded conversation from {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
@@ -170,17 +170,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
settings=AnthropicLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -87,7 +87,9 @@ async def save_conversation(params: FunctionCallParams):
|
||||
# the simplest thing to do is to pop messages until the last one is an assistant
|
||||
# response
|
||||
while messages and not (
|
||||
messages[-1].get("role") == "assistant" and "content" in messages[-1]
|
||||
isinstance(messages[-1], dict)
|
||||
and messages[-1].get("role") == "assistant"
|
||||
and "content" in messages[-1]
|
||||
):
|
||||
messages.pop()
|
||||
if messages: # we never expect this to be empty
|
||||
@@ -105,7 +107,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
messages = json.load(file)
|
||||
# HACK: if using the older Nova Sonic (pre-2) model, you need a special way of
|
||||
# triggering the first assistant response. The call to trigger_assistant_response(),
|
||||
@@ -125,6 +127,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
}
|
||||
)
|
||||
params.context.set_messages(messages)
|
||||
assert isinstance(params.llm, AWSNovaSonicLLMService)
|
||||
await params.llm.reset_conversation()
|
||||
# await params.llm.trigger_assistant_response()
|
||||
except Exception as e:
|
||||
@@ -219,9 +222,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = AWSNovaSonicLLMService(
|
||||
secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
|
||||
access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
|
||||
region=os.getenv("AWS_REGION"), # as of 2025-05-06, us-east-1 is the only supported region
|
||||
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
|
||||
access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
|
||||
region=os.environ["AWS_REGION"], # as of 2025-05-06, us-east-1 is the only supported region
|
||||
settings=AWSNovaSonicLLMService.Settings(
|
||||
voice="tiffany", # matthew, tiffany, amy
|
||||
system_instruction=system_instruction,
|
||||
|
||||
@@ -110,7 +110,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
await params.result_callback(
|
||||
{
|
||||
@@ -243,17 +243,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
await params.llm.reset_conversation()
|
||||
# Manually create a response since we've reset the conversation
|
||||
@@ -192,7 +192,7 @@ Remember, your responses should be short - just one or two sentences usually."""
|
||||
)
|
||||
|
||||
llm = GrokRealtimeLLMService(
|
||||
api_key=os.getenv("XAI_API_KEY"),
|
||||
api_key=os.environ["XAI_API_KEY"],
|
||||
session_properties=session_properties,
|
||||
)
|
||||
|
||||
|
||||
@@ -91,8 +91,9 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
assert isinstance(params.llm, OpenAIRealtimeLLMService)
|
||||
await params.llm.reset_conversation()
|
||||
# NOTE: we manually create a response here rather than relying
|
||||
# on the function callback to trigger one since we've reset the
|
||||
@@ -171,10 +172,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
llm = OpenAIRealtimeLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIRealtimeLLMService.Settings(
|
||||
system_instruction="""Your knowledge cutoff is 2023-10. You are a helpful and friendly AI.
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
logger.debug(
|
||||
f"loaded conversation from {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
@@ -171,17 +171,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesHttpLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesHttpLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -85,7 +85,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
logger.debug(
|
||||
f"loaded conversation from {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
@@ -171,17 +171,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -85,7 +85,7 @@ async def load_conversation(params: FunctionCallParams):
|
||||
filename = params.arguments["filename"]
|
||||
logger.debug(f"loading conversation from {filename}")
|
||||
try:
|
||||
with open(filename, "r") as file:
|
||||
with open(filename) as file:
|
||||
params.context.set_messages(json.load(file))
|
||||
logger.debug(
|
||||
f"loaded conversation from {filename}\n{json.dumps(params.context.get_messages(), indent=4)}"
|
||||
@@ -171,17 +171,17 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
system_instruction=system_instruction,
|
||||
)
|
||||
|
||||
|
||||
@@ -95,10 +95,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
@@ -106,7 +106,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Initialize the Gemini Multimodal Live model
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -52,7 +52,7 @@ import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from google import genai
|
||||
from google import genai # pyright: ignore[reportAttributeAccessIssue]
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
@@ -87,7 +87,7 @@ def get_rag_content():
|
||||
"""Get the RAG content from the file."""
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
rag_content_path = os.path.join(script_dir, "assets", "rag-content.txt")
|
||||
with open(rag_content_path, "r") as f:
|
||||
with open(rag_content_path) as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
@@ -179,10 +179,10 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
api_key=os.environ["CARTESIA_API_KEY"],
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="f9836c6e-a0bd-460e-9d3c-f7299fa60f94", # Southern Lady
|
||||
),
|
||||
@@ -197,7 +197,7 @@ Your response will be turned into speech so use only simple words and punctuatio
|
||||
"""
|
||||
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GoogleLLMService.Settings(
|
||||
model=VOICE_MODEL,
|
||||
system_instruction=system_prompt,
|
||||
|
||||
@@ -133,11 +133,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
|
||||
|
||||
# Initialize text-to-speech service
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
api_key=os.environ["ELEVENLABS_API_KEY"],
|
||||
settings=ElevenLabsTTSService.Settings(
|
||||
voice="pNInz6obpgDQGcFmaJgB",
|
||||
),
|
||||
@@ -196,7 +196,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Initialize LLM service
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
api_key=os.environ["OPENAI_API_KEY"],
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="""You are a personal assistant. You can remember things about the person you are talking to.
|
||||
Some Guidelines:
|
||||
|
||||
@@ -30,6 +30,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
from pipecat.services.aws.nova_sonic.session_continuation import SessionContinuationParams
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -116,8 +117,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create the AWS Nova Sonic LLM service
|
||||
llm = AWSNovaSonicLLMService(
|
||||
secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
|
||||
access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
|
||||
secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
|
||||
access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
|
||||
# as of 2025-12-09, these are the supported regions:
|
||||
# - Nova 2 Sonic (the default model):
|
||||
# - us-east-1
|
||||
@@ -126,12 +127,22 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# - Nova Sonic (the older model):
|
||||
# - us-east-1
|
||||
# - ap-northeast-1
|
||||
region=os.getenv("AWS_REGION"),
|
||||
region=os.environ["AWS_REGION"],
|
||||
session_token=os.getenv("AWS_SESSION_TOKEN"),
|
||||
settings=AWSNovaSonicLLMService.Settings(
|
||||
voice="tiffany",
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
# Session continuation is enabled by default, allowing seamless
|
||||
# conversations longer than the AWS ~8-minute session limit.
|
||||
# The service rotates sessions in the background with no
|
||||
# user-perceptible interruption. You can tune the threshold or
|
||||
# disable it with: session_continuation=SessionContinuationParams(enabled=False)
|
||||
session_continuation=SessionContinuationParams(
|
||||
# When to start preparing the next session (default: 360 = 6 min).
|
||||
# Lower this (e.g. 20) to see a handoff happen quickly during testing.
|
||||
transition_threshold_seconds=360,
|
||||
),
|
||||
# you could choose to pass tools here rather than via context
|
||||
# tools=tools
|
||||
)
|
||||
|
||||
@@ -112,8 +112,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = AzureRealtimeLLMService(
|
||||
api_key=os.getenv("AZURE_REALTIME_API_KEY"),
|
||||
base_url=os.getenv("AZURE_REALTIME_BASE_URL"),
|
||||
api_key=os.environ["AZURE_REALTIME_API_KEY"],
|
||||
base_url=os.environ["AZURE_REALTIME_BASE_URL"],
|
||||
settings=AzureRealtimeLLMService.Settings(
|
||||
system_instruction="""You are a helpful and friendly AI.
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Initialize Gemini service with File API support
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
voice="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
|
||||
@@ -114,7 +114,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -67,7 +67,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Initialize the Gemini Multimodal Live model
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
voice="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
system_instruction=system_instruction,
|
||||
|
||||
@@ -133,7 +133,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
),
|
||||
|
||||
@@ -75,7 +75,8 @@ class GroundingMetadataProcessor(FrameProcessor):
|
||||
if isinstance(frame, LLMSearchResponseFrame):
|
||||
self._grounding_count += 1
|
||||
logger.info(f"\n\n🔍 GROUNDING METADATA RECEIVED #{self._grounding_count}\n")
|
||||
logger.info(f"📝 Search Result Text: {frame.search_result[:200]}...")
|
||||
if frame.search_result:
|
||||
logger.info(f"📝 Search Result Text: {frame.search_result[:200]}...")
|
||||
|
||||
if frame.rendered_content:
|
||||
logger.info(f"🔗 Rendered Content: {frame.rendered_content}")
|
||||
@@ -101,7 +102,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
)
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
voice="Charon", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
@@ -111,16 +112,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create a processor to capture grounding metadata
|
||||
grounding_processor = GroundingMetadataProcessor()
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please introduce yourself and let me know that you can help with current information by searching the web. Ask me what current information I'd like to know about.",
|
||||
},
|
||||
]
|
||||
|
||||
# Set up conversation context and management
|
||||
context = LLMContext(messages)
|
||||
context = LLMContext()
|
||||
# Server-side VAD is enabled by default; no local VAD is added.
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
@@ -144,6 +137,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "Please introduce yourself and let me know that you can help with current information by searching the web. Ask me what current information I'd like to know about.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -54,7 +54,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
api_key=os.environ["GOOGLE_API_KEY"],
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
voice="Aoede", # Puck, Charon, Kore, Fenrir, Aoede
|
||||
vad=GeminiVADParams(disabled=True),
|
||||
|
||||
@@ -110,8 +110,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = GeminiLiveVertexLLMService(
|
||||
credentials=os.getenv("GOOGLE_VERTEX_TEST_CREDENTIALS"),
|
||||
project_id=os.getenv("GOOGLE_CLOUD_PROJECT_ID"),
|
||||
location=os.getenv("GOOGLE_CLOUD_LOCATION"),
|
||||
project_id=os.environ["GOOGLE_CLOUD_PROJECT_ID"],
|
||||
location=os.environ["GOOGLE_CLOUD_LOCATION"],
|
||||
settings=GeminiLiveVertexLLMService.Settings(
|
||||
system_instruction=system_instruction,
|
||||
voice="Puck", # Aoede, Charon, Fenrir, Kore, Puck
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user