Compare commits
314 Commits
rtvi-send-
...
v0.0.108
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a84c69858e | ||
|
|
ca224219dc | ||
|
|
83dc979d19 | ||
|
|
fc76b3f2fb | ||
|
|
4670370dbb | ||
|
|
47e53890e3 | ||
|
|
195180b6f4 | ||
|
|
8b64166bb7 | ||
|
|
1d18995435 | ||
|
|
ea7324b2ba | ||
|
|
52ed7137af | ||
|
|
b33df03724 | ||
|
|
28fbe1db08 | ||
|
|
9240e92d9f | ||
|
|
5caf53f086 | ||
|
|
ac2716811c | ||
|
|
d313d56776 | ||
|
|
159776f106 | ||
|
|
a23803478f | ||
|
|
bae193ab4d | ||
|
|
04adb697be | ||
|
|
4f9c8a6860 | ||
|
|
a1a29b3933 | ||
|
|
0798803c70 | ||
|
|
6422661d08 | ||
|
|
ed94b65d83 | ||
|
|
f9670b9601 | ||
|
|
5b2991f47f | ||
|
|
fc3186dc0d | ||
|
|
1808b447c9 | ||
|
|
70df9d3fe4 | ||
|
|
a8bfc23d3a | ||
|
|
e2870fc2ac | ||
|
|
e851f8c1d5 | ||
|
|
b31bece617 | ||
|
|
9e350bcc2f | ||
|
|
9c2594c484 | ||
|
|
900fc88430 | ||
|
|
4ef5ac6f0c | ||
|
|
cbb3d99493 | ||
|
|
fb1996cedc | ||
|
|
95c55ec6c3 | ||
|
|
a45de9af7f | ||
|
|
5e61a57582 | ||
|
|
d8b0ed18fd | ||
|
|
789275a57b | ||
|
|
38c961a363 | ||
|
|
41a86a51bf | ||
|
|
e1bfa4cf21 | ||
|
|
537d57449e | ||
|
|
33e146decd | ||
|
|
eee47deb34 | ||
|
|
21a729ae5d | ||
|
|
1870f4010e | ||
|
|
28683a7296 | ||
|
|
0e504d876d | ||
|
|
5c51981207 | ||
|
|
a13c4d1248 | ||
|
|
ca1b4ad124 | ||
|
|
533dcdba3f | ||
|
|
7eec03cb77 | ||
|
|
83911dced6 | ||
|
|
4e4a8c45d5 | ||
|
|
9c6d51c570 | ||
|
|
9152d85824 | ||
|
|
6a87d0e87d | ||
|
|
fe0633ecd1 | ||
|
|
ca2bfd6f12 | ||
|
|
345ccc0abe | ||
|
|
800fd6a916 | ||
|
|
d286991257 | ||
|
|
a06bf47ed2 | ||
|
|
5ad4aa9bea | ||
|
|
c4466ba678 | ||
|
|
df602b900d | ||
|
|
c331c75d66 | ||
|
|
f7ec6befe1 | ||
|
|
6a6ee8d563 | ||
|
|
259f5e124c | ||
|
|
cfe91d11ec | ||
|
|
467184e63e | ||
|
|
af566ac936 | ||
|
|
62484a4fc3 | ||
|
|
7fef3b01eb | ||
|
|
6d1918f12a | ||
|
|
e58740e948 | ||
|
|
ddfe44940d | ||
|
|
fdbdbc8be3 | ||
|
|
3cd7d882fb | ||
|
|
2d78533d77 | ||
|
|
c1dd44f947 | ||
|
|
9db15e7942 | ||
|
|
503e5e9106 | ||
|
|
2ff4b3f4a3 | ||
|
|
b4096f9a11 | ||
|
|
c4253a7d98 | ||
|
|
2441c4f801 | ||
|
|
a7a55dd30e | ||
|
|
de6a7223ba | ||
|
|
165932e1cc | ||
|
|
1f0d9ad01a | ||
|
|
052075c244 | ||
|
|
a8d0e1de9f | ||
|
|
4f0b2066c0 | ||
|
|
413dbaf974 | ||
|
|
5645909d34 | ||
|
|
da3f184316 | ||
|
|
e5a2723632 | ||
|
|
4ee4002d5d | ||
|
|
54a17ab1f3 | ||
|
|
1c99a537b2 | ||
|
|
ff5d055b3c | ||
|
|
adc003d6c7 | ||
|
|
bbd14de9c5 | ||
|
|
02b97035f8 | ||
|
|
f470ff193e | ||
|
|
7bc8b89a54 | ||
|
|
a8eff6fbbf | ||
|
|
86e086c6b5 | ||
|
|
4bdfe1cf31 | ||
|
|
bb33045389 | ||
|
|
ac2b1ecd47 | ||
|
|
e7dd84b552 | ||
|
|
39329aaddb | ||
|
|
56a56a4174 | ||
|
|
b80328e038 | ||
|
|
3a80be760b | ||
|
|
b66c892100 | ||
|
|
6c30371295 | ||
|
|
ddf6a41854 | ||
|
|
e0c49927cf | ||
|
|
45926a7135 | ||
|
|
8c678c1c98 | ||
|
|
4c121332cf | ||
|
|
74686f9190 | ||
|
|
19bcc8620c | ||
|
|
0530722c58 | ||
|
|
0d1b834770 | ||
|
|
7a0f7b58d1 | ||
|
|
5806a3f0fa | ||
|
|
27fabfc1b3 | ||
|
|
d779a5b4ea | ||
|
|
2bb36b5b66 | ||
|
|
e0bc9c73c6 | ||
|
|
2135557689 | ||
|
|
a0393b9af6 | ||
|
|
64ba013b68 | ||
|
|
7377d88cf5 | ||
|
|
3bbec0a2c8 | ||
|
|
e29a63e1ae | ||
|
|
45178972d7 | ||
|
|
bb7199d143 | ||
|
|
d4dea30407 | ||
|
|
b49bf1c83f | ||
|
|
1b0f7ecb0e | ||
|
|
8e57dd67a2 | ||
|
|
5d71de8aad | ||
|
|
dc56cb2ccc | ||
|
|
063955b7eb | ||
|
|
e05bd54743 | ||
|
|
35f52f70ab | ||
|
|
d05eb02b98 | ||
|
|
4abd4d031d | ||
|
|
7e42998e9e | ||
|
|
28eb4544d3 | ||
|
|
b45dcb1ae0 | ||
|
|
6eb988b729 | ||
|
|
f68b3222b3 | ||
|
|
3274235ea1 | ||
|
|
05b9c514fb | ||
|
|
03c0d7c345 | ||
|
|
0783edb185 | ||
|
|
51d28b4a9f | ||
|
|
cf083b8411 | ||
|
|
099814d74a | ||
|
|
dd45843c42 | ||
|
|
fe15d8654b | ||
|
|
68a440ae2e | ||
|
|
8109ab6135 | ||
|
|
f311a0b6e4 | ||
|
|
9df8985d60 | ||
|
|
b3a25e0ebe | ||
|
|
02cfb129d3 | ||
|
|
311afef7da | ||
|
|
5ed183d215 | ||
|
|
5c3d3aea2b | ||
|
|
0651569a4e | ||
|
|
bf04ea2043 | ||
|
|
aa0b49d69f | ||
|
|
8c6f4a8d7b | ||
|
|
bbaa5971c4 | ||
|
|
cdd8c3e5bb | ||
|
|
1c8a8f51d4 | ||
|
|
349b8645f3 | ||
|
|
696196e30c | ||
|
|
dacffccd3a | ||
|
|
f21b262969 | ||
|
|
7414b30308 | ||
|
|
3268cb93d5 | ||
|
|
9211379720 | ||
|
|
42cab7eea0 | ||
|
|
483b643b07 | ||
|
|
12dc429761 | ||
|
|
066b206b3d | ||
|
|
ddd1b71b56 | ||
|
|
8612c9f50a | ||
|
|
d314e2831a | ||
|
|
fd0bfe141f | ||
|
|
3042929989 | ||
|
|
0f6cc231cf | ||
|
|
844555c520 | ||
|
|
3428a4c6ad | ||
|
|
f283cc5bc6 | ||
|
|
70552d7697 | ||
|
|
84c2a24c9f | ||
|
|
f8c7414ea7 | ||
|
|
f1f51de962 | ||
|
|
e93b0ace06 | ||
|
|
c32240e14b | ||
|
|
e6602f9244 | ||
|
|
9a30b18f21 | ||
|
|
936a39f4a1 | ||
|
|
3b1cb30926 | ||
|
|
ce36487143 | ||
|
|
ec3bd8c5b1 | ||
|
|
622ebd5d74 | ||
|
|
a9a1941a45 | ||
|
|
53e0136366 | ||
|
|
bc0e7130b8 | ||
|
|
d8af4447ff | ||
|
|
c89e366739 | ||
|
|
e9f3086ea3 | ||
|
|
b5c362d6e6 | ||
|
|
e5aaa4c4eb | ||
|
|
a12ad27348 | ||
|
|
44504efdc7 | ||
|
|
da8070e98e | ||
|
|
b98ad7fb64 | ||
|
|
10ddf45015 | ||
|
|
e41cb2cd0c | ||
|
|
a69abcc67a | ||
|
|
a11c48d5b0 | ||
|
|
7caec9018b | ||
|
|
08052d8880 | ||
|
|
4c456ada04 | ||
|
|
488dc1d07e | ||
|
|
dafbb2eb66 | ||
|
|
ea1534f9f8 | ||
|
|
f6e7599e49 | ||
|
|
6424c36666 | ||
|
|
05e344b9ec | ||
|
|
4ec7be8850 | ||
|
|
0533ea7b7f | ||
|
|
a3431d3b01 | ||
|
|
348df9d4ce | ||
|
|
a9256ebc35 | ||
|
|
a0f311158d | ||
|
|
d3ca034c4f | ||
|
|
39425a675a | ||
|
|
c4d1b89049 | ||
|
|
fd8c6c88bb | ||
|
|
57fd29f0c4 | ||
|
|
06f7da44f1 | ||
|
|
d702ebd6a2 | ||
|
|
26fc238eb7 | ||
|
|
61ff53f2b9 | ||
|
|
5e7639812a | ||
|
|
ba779f920f | ||
|
|
c3d6e965d8 | ||
|
|
0f1ff16af1 | ||
|
|
1ede8460a2 | ||
|
|
463db59bb5 | ||
|
|
0be4084683 | ||
|
|
8f6dfc4777 | ||
|
|
6841c0719b | ||
|
|
2836b1ea7e | ||
|
|
5fd98e1391 | ||
|
|
ef419cd87a | ||
|
|
8750c26cdc | ||
|
|
3e0c536fe7 | ||
|
|
7ee5fa9e20 | ||
|
|
7dfcaf8096 | ||
|
|
05157129e2 | ||
|
|
4a0411cbc4 | ||
|
|
6cd39b8b42 | ||
|
|
38d7882f0f | ||
|
|
4aea7784c9 | ||
|
|
bad10177d4 | ||
|
|
c4be513044 | ||
|
|
4b704e6d3a | ||
|
|
b1a8588209 | ||
|
|
5de794e1da | ||
|
|
891966346c | ||
|
|
2001ab4577 | ||
|
|
0449df828c | ||
|
|
951bb0c1a7 | ||
|
|
21b1812c71 | ||
|
|
c4f21ef76b | ||
|
|
a7167ad121 | ||
|
|
eaccb96454 | ||
|
|
45186cc4ce | ||
|
|
0378fb0d91 | ||
|
|
9a55eb67cf | ||
|
|
8a4f6b486e | ||
|
|
8745f20330 | ||
|
|
3e5be23bd8 | ||
|
|
33f042b500 | ||
|
|
0722784f3a | ||
|
|
cbc1c275b3 | ||
|
|
14ca70f13e | ||
|
|
f7568a91b1 | ||
|
|
dfe5fec8f9 | ||
|
|
dc0386937a | ||
|
|
9cc2644719 |
612
CHANGELOG.md
612
CHANGELOG.md
@@ -7,6 +7,618 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [0.0.108] - 2026-03-27
|
||||
|
||||
### Added
|
||||
|
||||
- Added `SarvamLLMService` with support for `sarvam-30b`, `sarvam-30b-16k`,
|
||||
`sarvam-105b` and `sarvam-105b-32k`.
|
||||
(PR [#3978](https://github.com/pipecat-ai/pipecat/pull/3978))
|
||||
|
||||
- Added `on_turn_context_created(context_id)` hook to `TTSService`. Override
|
||||
this to perform provider-specific setup (e.g. eagerly opening a server-side
|
||||
context) before text starts flowing. Called each time a new turn context ID
|
||||
is created.
|
||||
(PR [#4013](https://github.com/pipecat-ai/pipecat/pull/4013))
|
||||
|
||||
- Added `XAIHttpTTSService` for text-to-speech using xAI's HTTP TTS API.
|
||||
(PR [#4031](https://github.com/pipecat-ai/pipecat/pull/4031))
|
||||
|
||||
- Added support for "developer" role messages in conversation context across
|
||||
all LLM adapters. For non-OpenAI services (Anthropic, Google, AWS Bedrock),
|
||||
"developer" messages are converted to "user" messages (use
|
||||
`system_instruction` to set the system instruction). For OpenAI services,
|
||||
"developer" messages pass through in conversation history. For the Responses
|
||||
API, they are kept as "developer" role (matching the existing "system" →
|
||||
"developer" conversion).
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Added `SmallestTTSService`, a WebSocket-based TTS service integration with
|
||||
Smallest AI's Waves API. Supports the Lightning v2 and v3.1 models with
|
||||
configurable voice, language, speed, consistency, similarity, and enhancement
|
||||
settings.
|
||||
(PR [#4092](https://github.com/pipecat-ai/pipecat/pull/4092))
|
||||
|
||||
- Added warnings in turn stop strategies when `VADParams.stop_secs` differs
|
||||
from the recommended default (0.2s) or when `stop_secs >= STT p99 latency`,
|
||||
which collapses the STT wait timeout to 0s and may cause delayed turn
|
||||
detection. The warnings guide developers to re-run the
|
||||
[stt-benchmark](https://github.com/pipecat-ai/stt-benchmark) with their VAD
|
||||
settings.
|
||||
(PR [#4115](https://github.com/pipecat-ai/pipecat/pull/4115))
|
||||
|
||||
- Added `domain` parameter to `AssemblyAISTTSettings` for specialized
|
||||
recognition modes such as Medical Mode (`domain="medical-v1"`).
|
||||
(PR [#4117](https://github.com/pipecat-ai/pipecat/pull/4117))
|
||||
|
||||
- Added `NovitaLLMService` for using Novita AI's LLM models via their
|
||||
OpenAI-compatible API.
|
||||
(PR [#4119](https://github.com/pipecat-ai/pipecat/pull/4119))
|
||||
|
||||
- Added `cleanup()` method to `VADAnalyzer` and `VADController` so VAD analyzer
|
||||
resources are properly released when no longer needed. Custom `VADAnalyzer`
|
||||
subclasses can override `cleanup()` to free any held resources.
|
||||
(PR [#4120](https://github.com/pipecat-ai/pipecat/pull/4120))
|
||||
|
||||
- Added `on_end_of_turn` event handler to `AssemblyAISTTService`. This fires
|
||||
after the final transcript is pushed, providing a reliable hook for
|
||||
end-of-turn logic that doesn't race with `TranscriptionFrame`. Works in both
|
||||
Pipecat and AssemblyAI turn detection modes.
|
||||
(PR [#4128](https://github.com/pipecat-ai/pipecat/pull/4128))
|
||||
|
||||
- Added `DeepgramFluxSageMakerSTTService` for running Deepgram Flux
|
||||
speech-to-text on AWS SageMaker endpoints. Use with
|
||||
`ExternalUserTurnStrategies` to take advantage of Flux's turn detection.
|
||||
(PR [#4143](https://github.com/pipecat-ai/pipecat/pull/4143))
|
||||
|
||||
- Added `Mem0MemoryService.get_memories()` convenience method for retrieving
|
||||
all stored memories outside the pipeline (e.g. to build a personalized
|
||||
greeting at connection time). This avoids the need to manually handle client
|
||||
type branching, filter construction, and async wrapping.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
### Changed
|
||||
|
||||
- Added context prewarming path for `InworldTTSService` to improve first audio
|
||||
latency.
|
||||
(PR [#4013](https://github.com/pipecat-ai/pipecat/pull/4013))
|
||||
|
||||
- Added `KrispVivaVadAnalyzer` for Voice Activity Detection using the Krisp
|
||||
VIVA SDK (requires `krisp_audio`).
|
||||
(PR [#4022](https://github.com/pipecat-ai/pipecat/pull/4022))
|
||||
|
||||
- Modified `InworldTTSService` to close context at end of turn instead of
|
||||
relying on idle timeout.
|
||||
(PR [#4028](https://github.com/pipecat-ai/pipecat/pull/4028))
|
||||
|
||||
- Added Gemini 3 support to the Gemini Live service.
|
||||
(PR [#4078](https://github.com/pipecat-ai/pipecat/pull/4078))
|
||||
|
||||
- `TTSService`: the default `stop_frame_timeout_s` (idle time before an
|
||||
automatic `TTSStoppedFrame` is pushed when `push_stop_frames=True`) has
|
||||
changed from `2.0` to `3.0` seconds.
|
||||
(PR [#4084](https://github.com/pipecat-ai/pipecat/pull/4084))
|
||||
|
||||
- ⚠️ `GeminiLLMAdapter` now only treats `messages[0]` as the initial system
|
||||
message, matching all other adapters. Previously it searched for the first
|
||||
"system" message anywhere in the conversation history. A "system" message
|
||||
appearing later in the list will now be converted to "user" instead of being
|
||||
extracted as the system instruction.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed `InworldTTSService` to fall back to full text when TTS timestamps are
|
||||
not received.
|
||||
(PR [#4113](https://github.com/pipecat-ai/pipecat/pull/4113))
|
||||
|
||||
- ⚠️ Realtime services (Gemini Live, OpenAI Realtime, Grok Realtime, Nova
|
||||
Sonic) now prefer `system_instruction` from service settings over an initial
|
||||
system message in the LLM context, matching the behavior of non-realtime
|
||||
services. Previously, context-provided system instructions took precedence. A
|
||||
warning is now logged when both are set.
|
||||
(PR [#4130](https://github.com/pipecat-ai/pipecat/pull/4130))
|
||||
|
||||
- Bumped `nvidia-riva-client` minimum version to `>=2.25.1`.
|
||||
(PR [#4136](https://github.com/pipecat-ai/pipecat/pull/4136))
|
||||
|
||||
- Upgraded `protobuf` from 5.x to 6.x (`>=6.31.1,<7`).
|
||||
(PR [#4136](https://github.com/pipecat-ai/pipecat/pull/4136))
|
||||
|
||||
- Unrecognized language strings (e.g. Deepgram's `"multi"`) no longer produce a
|
||||
warning at startup. The log message has been downgraded to debug level since
|
||||
these are valid service-specific values that are passed through correctly.
|
||||
(PR [#4137](https://github.com/pipecat-ai/pipecat/pull/4137))
|
||||
|
||||
- `GrokLLMService` and `GrokRealtimeLLMService` now live in the
|
||||
`pipecat.services.xai` module alongside `XAIHttpTTSService`, since all three
|
||||
use the same xAI API. Update imports from `pipecat.services.grok.*` to
|
||||
`pipecat.services.xai.*` (e.g. `from pipecat.services.xai.llm import
|
||||
GrokLLMService`).
|
||||
(PR [#4142](https://github.com/pipecat-ai/pipecat/pull/4142))
|
||||
|
||||
- ⚠️ Bumped `mem0ai` dependency from `~=0.1.94` to `>=1.0.8,<2`. Users of the
|
||||
`mem0` extra will need to update their mem0ai package.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `pipecat.services.grok.llm`, `pipecat.services.grok.realtime.llm`, and
|
||||
`pipecat.services.grok.realtime.events` are deprecated. The old import paths
|
||||
still work but emit a `DeprecationWarning`; use `pipecat.services.xai.llm`,
|
||||
`pipecat.services.xai.realtime.llm`, and
|
||||
`pipecat.services.xai.realtime.events` instead.
|
||||
(PR [#4142](https://github.com/pipecat-ai/pipecat/pull/4142))
|
||||
|
||||
### Removed
|
||||
|
||||
- ⚠️ `TTSService.add_word_timestamps()` no longer supports the `"Reset"` and
|
||||
`"TTSStoppedFrame"` sentinel strings. If you have a custom TTS service that
|
||||
called `await self.add_word_timestamps([("Reset", 0)])` or `await
|
||||
self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)], ctx_id)`,
|
||||
replace them with `await self.append_to_audio_context(ctx_id,
|
||||
TTSStoppedFrame(context_id=ctx_id))` and let `_handle_audio_context` manage
|
||||
the word-timestamp reset automatically.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Removed `SambaNovaSTTService`. SambaNova no longer offers speech-to-text
|
||||
audio models. Use another STT provider instead.
|
||||
(PR [#4154](https://github.com/pipecat-ai/pipecat/pull/4154))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed Gemini Live (`GoogleGeminiLiveLLMService`) not honoring
|
||||
`settings.system_instruction`. The system instruction was being read from a
|
||||
deprecated constructor parameter instead of the settings object, causing it
|
||||
to be silently ignored.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed `AWSBedrockLLMAdapter` sending an empty message list to the API when
|
||||
the only message in context was a system message. The lone system message is
|
||||
now converted to "user" role instead of being extracted, matching the
|
||||
existing Anthropic adapter behavior.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed Gemini Live pipeline hanging indefinitely when an `EndFrame` was
|
||||
deferred while waiting for the bot to finish responding and `turn_complete`
|
||||
never arrived. As a possible root-cause fix, `turn_complete` messages are now
|
||||
handled even if they lack `usage_metadata`. As a fallback, the deferred
|
||||
`EndFrame` now has a 30-second safety timeout.
|
||||
(PR [#4125](https://github.com/pipecat-ai/pipecat/pull/4125))
|
||||
|
||||
- Fixed ElevenLabs WebSocket disconnections (1008 "Maximum simultaneous
|
||||
contexts exceeded") caused by rapid user interruptions. When interruptions
|
||||
arrived before any TTS text was generated, phantom contexts were created on
|
||||
the ElevenLabs server that were never closed, eventually exceeding the
|
||||
5-context limit.
|
||||
(PR [#4126](https://github.com/pipecat-ai/pipecat/pull/4126))
|
||||
|
||||
- Fixed the final sentence being dropped from the conversation context when
|
||||
using RTVI text input with non-word-timestamp TTS services. The
|
||||
`LLMFullResponseEndFrame` was racing ahead of the last `TTSTextFrame`,
|
||||
causing the `LLMAssistantAggregator` to finalize the context before the final
|
||||
sentence arrived.
|
||||
(PR [#4127](https://github.com/pipecat-ai/pipecat/pull/4127))
|
||||
|
||||
- Fixed audio crackling and popping in recordings when both user and bot are
|
||||
speaking. `AudioBufferProcessor` no longer injects silence into a track's
|
||||
buffer while that track is actively producing audio, preventing mid-utterance
|
||||
interruptions in the recorded output.
|
||||
(PR [#4135](https://github.com/pipecat-ai/pipecat/pull/4135))
|
||||
|
||||
- Fixed websocket TTS word timestamps so interrupted contexts cannot leak stale
|
||||
words or backward PTS values into later turns.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Fixed a race condition in `InterruptibleTTSService` where, if `run_tts` had
|
||||
been invoked but `BotStartedSpeakingFrame` had not yet been received, a user
|
||||
interruption could allow stale audio to leak through.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Fixed Gemini Live local VAD mode (`GeminiVADParams(disabled=True)` with
|
||||
external VAD) not working. The bot now correctly detects user speech and
|
||||
signals turn boundaries to the Gemini API.
|
||||
(PR [#4146](https://github.com/pipecat-ai/pipecat/pull/4146))
|
||||
|
||||
- Fixed Gemini Live message handling to process all `server_content` fields
|
||||
independently. Gemini 3.x can bundle multiple fields (e.g. `model_turn` and
|
||||
`output_transcription`) on the same message, but the previous `elif` chain
|
||||
only processed the first match, silently dropping the rest.
|
||||
(PR [#4147](https://github.com/pipecat-ai/pipecat/pull/4147))
|
||||
|
||||
- Fixed `ServiceSwitcher` with `ServiceSwitcherStrategyFailover` incorrectly
|
||||
triggering failover when `ErrorFrame`s from other pipeline stages (e.g. TTS)
|
||||
propagated upstream through the switcher. Previously, any non-fatal error
|
||||
passing through would be misattributed to the active service and trigger an
|
||||
unwanted service switch. Now only errors originating from the switcher's own
|
||||
managed services trigger failover.
|
||||
(PR [#4149](https://github.com/pipecat-ai/pipecat/pull/4149))
|
||||
|
||||
- Fixed `LiveKitOutputTransport` not clearing the `rtc.AudioSource` internal
|
||||
buffer on interruption, causing the bot to continue speaking for several
|
||||
seconds after being interrupted.
|
||||
(PR [#4151](https://github.com/pipecat-ai/pipecat/pull/4151))
|
||||
|
||||
- Fixed a crash in OpenAI LLM processing when the provider returns
|
||||
`chunk.choices[0].delta.audio = None`, which caused `'NoneType' object has no
|
||||
attribute 'get'` errors during audio transcript handling.
|
||||
(PR [#4152](https://github.com/pipecat-ai/pipecat/pull/4152))
|
||||
|
||||
- Fixed error floods in `DeepgramSTTService` when the WebSocket connection
|
||||
drops. With Deepgram SDK 6.x, `send_media()` raises exceptions on a dead
|
||||
connection instead of silently failing, causing every queued audio frame to
|
||||
log an error. Now `send_media()` failures are caught gracefully — a single
|
||||
warning is logged and audio frames are skipped until the existing
|
||||
reconnection logic restores the connection.
|
||||
(PR [#4153](https://github.com/pipecat-ai/pipecat/pull/4153))
|
||||
|
||||
- `Mem0MemoryService` no longer blocks the event loop during memory storage and
|
||||
retrieval. All Mem0 API calls now run in a background thread, and message
|
||||
storage is fire-and-forget so it doesn't delay downstream processing.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
- Fixed `Mem0MemoryService` failing to store messages when the context
|
||||
contained system or developer role messages. The Mem0 API only accepts user
|
||||
and assistant roles, so other roles are now filtered out before storing.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
- Added the missing `on_dtmf_event` callback to the `LemonSliceTransportClient.setup()`
|
||||
`DailyCallbacks` construction, fixing a `ValidationError` at pipeline setup
|
||||
time.
|
||||
(PR [#4161](https://github.com/pipecat-ai/pipecat/pull/4161))
|
||||
|
||||
- Fixed an issue in `InworldTTSService` where, in cases of fast interruption,
|
||||
we would continue receiving audio from the previous context.
|
||||
(PR [#4167](https://github.com/pipecat-ai/pipecat/pull/4167))
|
||||
|
||||
- Fixed a word timestamp interleaving issue in `InworldTTSService` when
|
||||
processing multiple sentences.
|
||||
(PR [#4167](https://github.com/pipecat-ai/pipecat/pull/4167))
|
||||
|
||||
- Fixed duplicate `TTSStoppedFrame` being pushed in TTS services using
|
||||
`push_stop_frames=True`. When the stop-frame timeout fired, a second
|
||||
`TTSStoppedFrame` could be pushed after the normal one at context completion.
|
||||
(PR [#4172](https://github.com/pipecat-ai/pipecat/pull/4172))
|
||||
|
||||
- ⚠️ Fixed `DeepgramSTTService` compatibility with deepgram-sdk 6.1.0. The SDK
|
||||
now requires explicit message objects for `send_keep_alive()`,
|
||||
`send_close_stream()`, and `send_finalize()`. The minimum deepgram-sdk
|
||||
version is now 6.1.0.
|
||||
(PR [#4174](https://github.com/pipecat-ai/pipecat/pull/4174))
|
||||
|
||||
- Fixed RTVI events not being delivered to clients when using WebSocket
|
||||
transports. `ProtobufFrameSerializer` now sets `ignore_rtvi_messages=False`
|
||||
by default.
|
||||
(PR [#4176](https://github.com/pipecat-ai/pipecat/pull/4176))
|
||||
|
||||
- Fixed a timing issue where turn detection timer tasks (idle controller,
|
||||
speech timeout, turn analyzer, and turn completion) could miss their first
|
||||
tick because the newly created asyncio task was not yet scheduled when the
|
||||
caller continued.
|
||||
(PR [#4183](https://github.com/pipecat-ai/pipecat/pull/4183))
|
||||
|
||||
- Fixed `FastAPIWebsocketTransport` intermittently hanging on shutdown when the
|
||||
remote side (e.g. Twilio) disconnects while audio is being sent. A race
|
||||
condition between the send and receive paths could cause the
|
||||
`on_client_disconnected` callback to be skipped, leaving the pipeline waiting
|
||||
for a disconnect signal that never came.
|
||||
(PR [#4186](https://github.com/pipecat-ai/pipecat/pull/4186))
|
||||
|
||||
### Performance
|
||||
|
||||
- `RimeTTSService` now handles Rime's `done` WebSocket message to complete
|
||||
audio contexts immediately, eliminating the 3-second idle timeout that
|
||||
previously added latency at the end of each utterance.
|
||||
(PR [#4172](https://github.com/pipecat-ai/pipecat/pull/4172))
|
||||
|
||||
## [0.0.107] - 2026-03-23
|
||||
|
||||
### Added
|
||||
|
||||
- Added `frame_order` parameter to `SyncParallelPipeline`. Set
|
||||
`frame_order=FrameOrder.PIPELINE` to push synchronized output frames in
|
||||
pipeline definition order (all frames from the first pipeline, then the
|
||||
second, etc.) instead of the default arrival order.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Added `sync_with_audio` field to `OutputImageRawFrame`. When set to `True`,
|
||||
the output transport queues image frames with audio so they are displayed
|
||||
only after all preceding audio has been sent, enabling synchronized
|
||||
audio/image playback.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Added `OpenAIResponsesLLMService`, a new LLM service that uses the OpenAI
|
||||
Responses API. Supports streaming text, function calling, usage metrics, and
|
||||
out-of-band inference. Works with the universal `LLMContext` and
|
||||
`LLMContextAggregatorPair`. See
|
||||
`examples/foundational/07-interruptible-openai-responses.py` and
|
||||
`14-function-calling-openai-responses.py`.
|
||||
(PR [#4074](https://github.com/pipecat-ai/pipecat/pull/4074))
|
||||
|
||||
- Added `audio_out_auto_silence` parameter to `TransportParams` (defaults to
|
||||
`True`). When set to `False`, the transport waits for audio data instead of
|
||||
inserting silence when the output queue is empty, which is useful for
|
||||
scenarios that require uninterrupted audio playback without artificial gaps.
|
||||
(PR [#4104](https://github.com/pipecat-ai/pipecat/pull/4104))
|
||||
|
||||
### Changed
|
||||
|
||||
- Renamed tracing span attributes to align with OpenTelemetry GenAI semantic
|
||||
conventions: `gen_ai.system` to `gen_ai.provider.name`, `system` to
|
||||
`gen_ai.system_instructions`, `gen_ai.usage.cache_read_input_tokens` to
|
||||
`gen_ai.usage.cache_read.input_tokens`, and
|
||||
`gen_ai.usage.cache_creation_input_tokens` to
|
||||
`gen_ai.usage.cache_creation.input_tokens`.
|
||||
(PR [#3449](https://github.com/pipecat-ai/pipecat/pull/3449))
|
||||
|
||||
- `DeepgramSageMakerTTSService` now correctly routes audio through the base
|
||||
`TTSService` audio context queue. Audio frames are delivered via
|
||||
`append_to_audio_context()` instead of being pushed directly, enabling proper
|
||||
ordering, interruption handling, and start/stop frame lifecycle management.
|
||||
Interruptions now trigger a `Clear` message to Deepgram (flushing its text
|
||||
buffer) at the right time via `on_audio_context_interrupted`.
|
||||
(PR [#4083](https://github.com/pipecat-ai/pipecat/pull/4083))
|
||||
|
||||
- `GradiumTTSService` now sends a per-context `setup` message with
|
||||
`client_req_id` before the first text message for each TTS context, following
|
||||
Gradium's multiplexing protocol. Previously, a single setup message was sent
|
||||
at connection time without a `client_req_id`, which prevented Gradium from
|
||||
associating requests with their sessions when using `close_ws_on_eos=False`.
|
||||
(PR [#4091](https://github.com/pipecat-ai/pipecat/pull/4091))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed stale `system_instruction` in LLM tracing spans by reading from
|
||||
`_settings.system_instruction` instead of the removed `_system_instruction`
|
||||
attribute.
|
||||
(PR [#3449](https://github.com/pipecat-ai/pipecat/pull/3449))
|
||||
|
||||
- Fixed `SyncParallelPipeline` breaking the Whisker debugger.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Fixed `SyncParallelPipeline` race condition where concurrent SystemFrame
|
||||
processing (e.g. from RTVI) could corrupt sink queues and cause deadlocks.
|
||||
SystemFrames now take a fast path that passes them through without draining
|
||||
queued output.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Fixed TTS frame ordering so that non-system frames always arrive in correct
|
||||
order relative to the `TTSStartedFrame`/`TTSAudioRawFrame`/`TTSStoppedFrame`
|
||||
sequence. Previously these frames could race ahead of or behind audio context
|
||||
frames, producing out-of-order output downstream.
|
||||
(PR [#4075](https://github.com/pipecat-ai/pipecat/pull/4075))
|
||||
|
||||
- Fixed `SarvamTTSService` so that audio and error frames now route through
|
||||
`append_to_audio_context()` instead of `push_frame()`, ensuring correct
|
||||
behavior with audio contexts and interruptions.
|
||||
(PR [#4082](https://github.com/pipecat-ai/pipecat/pull/4082))
|
||||
|
||||
- Fixed audio frame ordering and interruption handling in Fish Audio, LMNT,
|
||||
Neuphonic, and Rime NonJson TTS services. These services were bypassing the
|
||||
base `TTSService` audio context serialization queue by pushing audio frames
|
||||
directly, which could cause out-of-order frames and broken interruptions
|
||||
during speech.
|
||||
(PR [#4090](https://github.com/pipecat-ai/pipecat/pull/4090))
|
||||
|
||||
- Fixed Genesys AudioHook serializer to always include the `parameters` field in
|
||||
protocol messages. The AudioHook protocol requires every message to carry a
|
||||
`parameters` object (even if empty), but `_create_message` omitted it when no
|
||||
parameters were provided. This caused clients that validate message structure
|
||||
(including the Genesys reference implementation) to reject `pong` and
|
||||
parameter-less `closed` responses, breaking server sequence tracking and
|
||||
preventing `outputVariables` from reaching the Architect flow.
|
||||
(PR [#4093](https://github.com/pipecat-ai/pipecat/pull/4093))
|
||||
|
||||
## [0.0.106] - 2026-03-18
|
||||
|
||||
### Added
|
||||
|
||||
- Added optional `service` field to `ServiceUpdateSettingsFrame` (and its
|
||||
subclasses `LLMUpdateSettingsFrame`, `TTSUpdateSettingsFrame`,
|
||||
`STTUpdateSettingsFrame`) to target a specific service instance. When
|
||||
`service` is set, only the matching service applies the settings; others
|
||||
forward the frame unchanged. This enables updating a single service when
|
||||
multiple services of the same type exist in the pipeline.
|
||||
(PR [#4004](https://github.com/pipecat-ai/pipecat/pull/4004))
|
||||
|
||||
- Added `sip_provider` and `room_geo` parameters to `configure()` in the Daily
|
||||
runner. These convenience parameters let callers specify a SIP provider name
|
||||
and geographic region directly without manually constructing
|
||||
`DailyRoomProperties` and `DailyRoomSipParams`.
|
||||
(PR [#4005](https://github.com/pipecat-ai/pipecat/pull/4005))
|
||||
|
||||
- Added `PerplexityLLMAdapter` that automatically transforms conversation
|
||||
messages to satisfy Perplexity's stricter API constraints (strict role
|
||||
alternation, no non-initial system messages, last message must be user/tool).
|
||||
Previously, certain conversation histories could cause Perplexity API errors
|
||||
that didn't occur with OpenAI (`PerplexityLLMService` subclasses
|
||||
`OpenAILLMService` since Perplexity uses an OpenAI-compatible API).
|
||||
(PR [#4009](https://github.com/pipecat-ai/pipecat/pull/4009))
|
||||
|
||||
- Added DTMF input event support to the Daily transport. Incoming DTMF tones
|
||||
are now received via Daily's `on_dtmf_event` callback and pushed into the
|
||||
pipeline as `InputDTMFFrame`, enabling bots to react to keypad presses from
|
||||
phone callers.
|
||||
(PR [#4047](https://github.com/pipecat-ai/pipecat/pull/4047))
|
||||
|
||||
- Added `WakePhraseUserTurnStartStrategy` for triggering user turns based on
|
||||
wake phrases, with support for `single_activation` mode. Deprecates
|
||||
`WakeCheckFilter`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- Added `default_user_turn_start_strategies()` and
|
||||
`default_user_turn_stop_strategies()` helper functions for composing custom
|
||||
strategy lists.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
### Changed
|
||||
|
||||
- Changed tool result JSON serialization to use `ensure_ascii=False`,
|
||||
preserving UTF-8 characters instead of escaping them. This reduces context
|
||||
size and token usage for non-English languages.
|
||||
(PR [#3457](https://github.com/pipecat-ai/pipecat/pull/3457))
|
||||
|
||||
- `OpenAIRealtimeSTTService`'s `noise_reduction` parameter is now part of
|
||||
`OpenAIRealtimeSTTSettings`, making it runtime-updatable via
|
||||
`STTUpdateSettingsFrame`. The direct `noise_reduction` init argument is
|
||||
deprecated as of 0.0.106.
|
||||
(PR [#3991](https://github.com/pipecat-ai/pipecat/pull/3991))
|
||||
|
||||
- Updated `sarvamai` dependency from `0.1.26a2` (alpha) to `0.1.26` (stable
|
||||
release).
|
||||
(PR [#3997](https://github.com/pipecat-ai/pipecat/pull/3997))
|
||||
|
||||
- `SimliVideoService` now extends `AIService` instead of `FrameProcessor`,
|
||||
aligning it with the HeyGen and Tavus video services. It supports
|
||||
`SimliVideoService.Settings(...)` for configuration and uses
|
||||
`start()`/`stop()`/`cancel()` lifecycle methods. Existing constructor usage
|
||||
(`api_key`, `face_id`, etc.) remains unchanged.
|
||||
(PR [#4001](https://github.com/pipecat-ai/pipecat/pull/4001))
|
||||
|
||||
- Updated `pipecat-ai-small-webrtc-prebuilt` to `2.4.0`.
|
||||
(PR [#4023](https://github.com/pipecat-ai/pipecat/pull/4023))
|
||||
|
||||
- Nova Sonic assistant text transcripts are now delivered in real-time using
|
||||
speculative text events instead of delayed final text events. Previously,
|
||||
assistant text only arrived after all audio had finished playing, causing
|
||||
laggy transcripts in client UIs. Speculative text arrives before each audio
|
||||
chunk, providing text synchronized with what the bot is saying. This also
|
||||
simplifies the internal text handling by removing the interruption re-push
|
||||
hack and assistant text buffer.
|
||||
(PR [#4042](https://github.com/pipecat-ai/pipecat/pull/4042))
|
||||
|
||||
- Updated `daily-python` dependency to 0.25.0.
|
||||
(PR [#4047](https://github.com/pipecat-ai/pipecat/pull/4047))
|
||||
|
||||
- Added `enable_dialout` parameter to `configure()` in `pipecat.runner.daily`
|
||||
to support dial-out rooms. Also narrowed misleading `Optional` type hints and
|
||||
deduplicated token expiry calculation.
|
||||
(PR [#4048](https://github.com/pipecat-ai/pipecat/pull/4048))
|
||||
|
||||
- Extended `ProcessFrameResult` to stop strategies, allowing a stop strategy to
|
||||
short-circuit evaluation of subsequent strategies by returning `STOP`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- `GradiumSTTService` now takes both an `encoding` and `sample_rate`
|
||||
constructor argument which is assembled in the class to form the
|
||||
`input_format`. PCM accepts `8000`, `16000`, and `24000` Hz sample rates.
|
||||
(PR [#4066](https://github.com/pipecat-ai/pipecat/pull/4066))
|
||||
|
||||
- Improved `GradiumSTTService` transcription accuracy by reworking how text
|
||||
fragments are accumulated and finalized. Previously, trailing words could be
|
||||
dropped when the server's `flushed` response arrived before all text tokens
|
||||
were delivered. The service now uses a short aggregation delay after flush to
|
||||
capture trailing tokens, producing complete utterances.
|
||||
(PR [#4066](https://github.com/pipecat-ai/pipecat/pull/4066))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `SimliVideoService.InputParams` is deprecated. Use the direct constructor
|
||||
parameters `max_session_length`, `max_idle_time`, and `enable_logging`
|
||||
instead.
|
||||
(PR [#4001](https://github.com/pipecat-ai/pipecat/pull/4001))
|
||||
|
||||
- Deprecated `LocalSmartTurnAnalyzerV2` and `LocalCoreMLSmartTurnAnalyzer`. Use
|
||||
`LocalSmartTurnAnalyzerV3` instead. Instantiating these analyzers will now
|
||||
emit a `DeprecationWarning`.
|
||||
(PR [#4012](https://github.com/pipecat-ai/pipecat/pull/4012))
|
||||
|
||||
- Deprecated `WakeCheckFilter` in favor of `WakePhraseUserTurnStartStrategy`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where the default model for `OpenAILLMService` and
|
||||
`AzureLLMService` was mistakenly reverted to `gpt-4o`. The defaults are now
|
||||
restored to `gpt-4.1`.
|
||||
(PR [#4000](https://github.com/pipecat-ai/pipecat/pull/4000))
|
||||
|
||||
- Fixed a race condition where `EndTaskFrame` could cause the pipeline to shut
|
||||
down before in-flight frames (e.g. LLM function call responses) finished
|
||||
processing. `EndTaskFrame` and `StopTaskFrame` now flow through the pipeline
|
||||
as `ControlFrame`s, ensuring all pending work is flushed before shutdown
|
||||
begins. `CancelTaskFrame` and `InterruptionTaskFrame` remain immediate
|
||||
(`SystemFrame`).
|
||||
(PR [#4006](https://github.com/pipecat-ai/pipecat/pull/4006))
|
||||
|
||||
- Fixed `ParallelPipeline` dropping or misordering frames during lifecycle
|
||||
synchronization. Buffered frames are now flushed in the correct order
|
||||
relative to synchronization frames (`StartFrame` goes first,
|
||||
`EndFrame`/`CancelFrame` go after), and frames added to the buffer during
|
||||
flush are also drained.
|
||||
(PR [#4007](https://github.com/pipecat-ai/pipecat/pull/4007))
|
||||
|
||||
- Fixed `TTSService` potentially canceling in-flight audio during shutdown. The
|
||||
stop sequence now waits for all queued audio contexts to finish processing
|
||||
before canceling the stop frame task.
|
||||
(PR [#4007](https://github.com/pipecat-ai/pipecat/pull/4007))
|
||||
|
||||
- Fixed `Language` enum values (e.g. `Language.ES`) not being converted to
|
||||
service-specific codes when passed via
|
||||
`settings=Service.Settings(language=Language.ES)` at init time. This caused
|
||||
API errors (e.g. 400 from Rime) because the raw enum was sent instead of the
|
||||
expected language code (e.g. `"spa"`). Runtime updates via
|
||||
`UpdateSettingsFrame` were unaffected. The fix centralizes conversion in the
|
||||
base `TTSService` and `STTService` classes so all services handle this
|
||||
consistently.
|
||||
(PR [#4024](https://github.com/pipecat-ai/pipecat/pull/4024))
|
||||
|
||||
- Fixed `DeepgramSTTService` ignoring the `base_url` scheme when using `ws://`
|
||||
or `http://`. Previously these were silently overwritten with `wss://` /
|
||||
`https://`, breaking air-gapped or private deployments that don't use TLS.
|
||||
All scheme choices (`wss://`, `https://`, `ws://`, `http://`, or bare
|
||||
hostname) are now respected.
|
||||
(PR [#4026](https://github.com/pipecat-ai/pipecat/pull/4026))
|
||||
|
||||
- Fixed `LLMSwitcher.register_function()` and `register_direct_function()` not
|
||||
accepting or forwarding the `timeout_secs` parameter.
|
||||
(PR [#4037](https://github.com/pipecat-ai/pipecat/pull/4037))
|
||||
|
||||
- Fixed empty user transcriptions in Nova Sonic causing spurious interruptions.
|
||||
Previously, an empty transcription could trigger an interruption of the
|
||||
assistant's response even though the user hadn't actually spoken.
|
||||
(PR [#4042](https://github.com/pipecat-ai/pipecat/pull/4042))
|
||||
|
||||
- Fixed `SonioxSTTService` and `OpenAIRealtimeSTTService` crash when language
|
||||
parameters contain plain strings instead of `Language` enum values.
|
||||
(PR [#4046](https://github.com/pipecat-ai/pipecat/pull/4046))
|
||||
|
||||
- Fixed premature user turn stops caused by late transcriptions arriving
|
||||
between turns. A stale transcript from the previous turn could persist into
|
||||
the next turn and trigger a stop before the current turn's real transcript
|
||||
arrived. Stop strategies are now reset at both turn start and turn stop to
|
||||
prevent state from leaking across turn boundaries.
|
||||
(PR [#4057](https://github.com/pipecat-ai/pipecat/pull/4057))
|
||||
|
||||
- Fixed raw language strings like `"de-DE"` silently failing when passed to
|
||||
TTS/STT services (e.g. ElevenLabs producing no audio). Raw strings now go
|
||||
through the same `Language` enum resolution as enum values, so regional codes
|
||||
like `"de-DE"` are properly converted to service-expected formats like
|
||||
`"de"`. Unrecognized strings log a warning instead of failing silently.
|
||||
(PR [#4058](https://github.com/pipecat-ai/pipecat/pull/4058))
|
||||
|
||||
- Fixed Deepgram STT list-type settings (`keyterm`, `keywords`, `search`,
|
||||
`redact`, `replace`) being stringified instead of passed as lists to the SDK,
|
||||
which caused them to be sent as literal strings (e.g. `"['pipecat']"`) in the
|
||||
WebSocket query params.
|
||||
(PR [#4063](https://github.com/pipecat-ai/pipecat/pull/4063))
|
||||
|
||||
- Fixed `MinWordsUserTurnStartStrategy` including text below the word threshold
|
||||
in the output by resetting aggregation when the minimum word count is not
|
||||
met.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- Fixed audio overlap and potential dropped TTS content when multiple assistant
|
||||
turns occur in quick succession. `TTSService` now flushes remaining text
|
||||
before pausing frame processing on `LLMFullResponseEndFrame`/`EndFrame`,
|
||||
instead of pausing first.
|
||||
(PR [#4071](https://github.com/pipecat-ai/pipecat/pull/4071))
|
||||
|
||||
### Security
|
||||
|
||||
- Bumped PyJWT minimum version from 2.10.1 to 2.12.0 in the `livekit` extra to
|
||||
address CVE-2026-32597 (GHSA-752w-5fwx-jx9f), where PyJWT <= 2.11.0 accepted
|
||||
unknown `crit` header extensions.
|
||||
(PR [#4035](https://github.com/pipecat-ai/pipecat/pull/4035))
|
||||
|
||||
## [0.0.105] - 2026-03-10
|
||||
|
||||
### Added
|
||||
|
||||
@@ -65,12 +65,25 @@ Once your PR is submitted, post in the `#community-integrations` Discord channel
|
||||
|
||||
#### Websocket-based Services
|
||||
|
||||
**Base class:** `WebsocketSTTService`
|
||||
|
||||
**Use for:** Services where you manage the websocket connection directly. Combines `STTService` with `WebsocketService` for automatic reconnection and keepalive support.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [CartesiaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/stt.py)
|
||||
- [ElevenLabsRealtimeSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/stt.py)
|
||||
|
||||
#### SDK-based Streaming Services
|
||||
|
||||
**Base class:** `STTService`
|
||||
|
||||
**Use for:** Streaming services where the provider's Python SDK manages the connection internally.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||
- [GoogleSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/stt.py)
|
||||
|
||||
#### File-based Services
|
||||
|
||||
@@ -108,55 +121,59 @@ Once your PR is submitted, post in the `#community-integrations` Discord channel
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||
- **`_process_context(self, context: LLMContext)`** — The main method that processes an LLM context and generates a response. Each LLM service overrides `process_frame` to extract context from `LLMContextFrame` and calls `_process_context`.
|
||||
|
||||
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||
- Context must adhere to the `LLMContext` universal format
|
||||
- Aggregators should handle adding messages, function calls, and images to the context
|
||||
- **`adapter_class`** — Class attribute pointing to a `BaseLLMAdapter` subclass. Defaults to `OpenAILLMAdapter`. Non-OpenAI services must implement their own adapter (see `src/pipecat/adapters/base_llm_adapter.py`) with methods:
|
||||
- `get_llm_invocation_params(context)` — Extract provider-specific params from universal context
|
||||
- `to_provider_tools_format(tools_schema)` — Convert standard tools to provider format
|
||||
- `get_messages_for_logging(context)` — Format messages for logging
|
||||
- Reference adapters: `src/pipecat/adapters/services/` (anthropic, gemini, bedrock, etc.)
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
- `LLMFullResponseStartFrame` — Signals the start of an LLM response
|
||||
- `LLMTextFrame` — Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` — Signals the end of an LLM response
|
||||
|
||||
- **Thought frames (reasoning models):** If the model supports extended thinking / chain-of-thought, emit thought frames alongside the response:
|
||||
- `LLMThoughtStartFrame` — Signals the start of a thought
|
||||
- `LLMThoughtTextFrame` — Contains thought content, streamed as tokens
|
||||
- `LLMThoughtEndFrame` — Signals the end of a thought
|
||||
|
||||
- **Context aggregation** is handled by the framework via `LLMContext` + `LLMContextAggregatorPair`. The LLM service just processes context it receives — no need to implement aggregators.
|
||||
|
||||
### TTS (Text-to-Speech) Services
|
||||
|
||||
#### AudioContextWordTTSService
|
||||
#### WebsocketTTSService
|
||||
|
||||
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||
**Use for:** Websocket-based streaming services (with or without word timestamps)
|
||||
|
||||
**Example:**
|
||||
**Examples:**
|
||||
|
||||
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||
- [ElevenLabsTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### InterruptibleTTSService
|
||||
|
||||
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||
**Use for:** Websocket-based services without word timestamps that reconnect on interruption (e.g. don't support a context ID or interruption message)
|
||||
|
||||
**Example:**
|
||||
|
||||
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||
|
||||
#### WordTTSService
|
||||
|
||||
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### TTSService
|
||||
|
||||
**Use for:** HTTP-based services without word/timestamp alignment
|
||||
**Use for:** HTTP-based services (word timestamps are supported in the base class)
|
||||
|
||||
**Example:**
|
||||
**Examples:**
|
||||
|
||||
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||
- [OpenAITTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/openai/tts.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
|
||||
- For websocket services, use asyncio WebSocket implementation
|
||||
- Handle idle service timeouts with keepalives
|
||||
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
|
||||
- TTS services push both audio (`TTSAudioRawFrame`) and text (`TTSTextFrame`) frames
|
||||
|
||||
### Telephony Serializers
|
||||
|
||||
@@ -200,9 +217,9 @@ Vision services process images and provide analysis such as descriptions, object
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the latest image in the context and yields frames with analysis results
|
||||
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||
- Must implement `run_vision` method that takes a `UserImageRawFrame` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the image frame and yields frames with analysis results
|
||||
- Must yield the frame sequence: `VisionFullResponseStartFrame`, `VisionTextFrame`, `VisionFullResponseEndFrame`
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
@@ -381,7 +398,7 @@ Note that `self.sample_rate` is a `@property` set in the TTSService base class,
|
||||
|
||||
Use Pipecat's tracing decorators:
|
||||
|
||||
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||
- **STT:** `@traced_stt` - decorate `_handle_transcription(self, transcript, is_final, language)` (the standard method name convention)
|
||||
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||
|
||||
@@ -403,17 +420,15 @@ For REST-based communication, use aiohttp. Pipecat includes this as a required d
|
||||
- Wrap API calls in appropriate try/except blocks
|
||||
- Handle rate limits and network failures gracefully
|
||||
- Provide meaningful error messages
|
||||
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||
- When errors occur, raise exceptions AND push errors to notify the pipeline:
|
||||
|
||||
```python
|
||||
from pipecat.frames.frames import ErrorFrame
|
||||
|
||||
try:
|
||||
# Your API call
|
||||
result = await self._make_api_call()
|
||||
except Exception as e:
|
||||
# Push error frame to pipeline
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Push error upstream to notify the pipeline
|
||||
await self.push_error(f"{self} error: {e}", exception=e)
|
||||
# Raise or handle as appropriate
|
||||
raise
|
||||
```
|
||||
|
||||
31
README.md
31
README.md
@@ -65,6 +65,10 @@ claude plugin marketplace add pipecat-ai/skills
|
||||
|
||||
and install any of the available plugins.
|
||||
|
||||
### 🧩 Community Integrations
|
||||
|
||||
Build and share your own Pipecat service integrations! Browse existing [community integrations](https://docs.pipecat.ai/server/services/community-integrations) or check out our [guide](COMMUNITY_INTEGRATIONS.md) to create your own.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||
@@ -81,19 +85,20 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
- Changed tool result JSON serialization to use `ensure_ascii=False`, preserving UTF-8 characters instead of escaping them. This reduces context size and token usage for non-English languages.
|
||||
@@ -1 +0,0 @@
|
||||
- `OpenAIRealtimeSTTService`'s `noise_reduction` parameter is now part of `OpenAIRealtimeSTTSettings`, making it runtime-updatable via `STTUpdateSettingsFrame`. The direct `noise_reduction` init argument is deprecated as of 0.0.106.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `sarvamai` dependency from `0.1.26a2` (alpha) to `0.1.26` (stable release).
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed an issue where the default model for `OpenAILLMService` and `AzureLLMService` was mistakenly reverted to `gpt-4o`. The defaults are now restored to `gpt-4.1`.
|
||||
@@ -1 +0,0 @@
|
||||
- `SimliVideoService` now extends `AIService` instead of `FrameProcessor`, aligning it with the HeyGen and Tavus video services. It supports `SimliVideoService.Settings(...)` for configuration and uses `start()`/`stop()`/`cancel()` lifecycle methods. Existing constructor usage (`api_key`, `face_id`, etc.) remains unchanged.
|
||||
@@ -1 +0,0 @@
|
||||
- `SimliVideoService.InputParams` is deprecated. Use the direct constructor parameters `max_session_length`, `max_idle_time`, and `enable_logging` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Added optional `service` field to `ServiceUpdateSettingsFrame` (and its subclasses `LLMUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `STTUpdateSettingsFrame`) to target a specific service instance. When `service` is set, only the matching service applies the settings; others forward the frame unchanged. This enables updating a single service when multiple services of the same type exist in the pipeline.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `sip_provider` and `room_geo` parameters to `configure()` in the Daily runner. These convenience parameters let callers specify a SIP provider name and geographic region directly without manually constructing `DailyRoomProperties` and `DailyRoomSipParams`.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed a race condition where `EndTaskFrame` could cause the pipeline to shut down before in-flight frames (e.g. LLM function call responses) finished processing. `EndTaskFrame` and `StopTaskFrame` now flow through the pipeline as `ControlFrame`s, ensuring all pending work is flushed before shutdown begins. `CancelTaskFrame` and `InterruptionTaskFrame` remain immediate (`SystemFrame`).
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `TTSService` potentially canceling in-flight audio during shutdown. The stop sequence now waits for all queued audio contexts to finish processing before canceling the stop frame task.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `ParallelPipeline` dropping or misordering frames during lifecycle synchronization. Buffered frames are now flushed in the correct order relative to synchronization frames (`StartFrame` goes first, `EndFrame`/`CancelFrame` go after), and frames added to the buffer during flush are also drained.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `PerplexityLLMAdapter` that automatically transforms conversation messages to satisfy Perplexity's stricter API constraints (strict role alternation, no non-initial system messages, last message must be user/tool). Previously, certain conversation histories could cause Perplexity API errors that didn't occur with OpenAI (`PerplexityLLMService` subclasses `OpenAILLMService` since Perplexity uses an OpenAI-compatible API).
|
||||
@@ -1 +0,0 @@
|
||||
- Deprecated `LocalSmartTurnAnalyzerV2` and `LocalCoreMLSmartTurnAnalyzer`. Use `LocalSmartTurnAnalyzerV3` instead. Instantiating these analyzers will now emit a `DeprecationWarning`.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `pipecat-ai-small-webrtc-prebuilt` to `2.4.0`.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `Language` enum values (e.g. `Language.ES`) not being converted to service-specific codes when passed via `settings=Service.Settings(language=Language.ES)` at init time. This caused API errors (e.g. 400 from Rime) because the raw enum was sent instead of the expected language code (e.g. `"spa"`). Runtime updates via `UpdateSettingsFrame` were unaffected. The fix centralizes conversion in the base `TTSService` and `STTService` classes so all services handle this consistently.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `DeepgramSTTService` ignoring the `base_url` scheme when using `ws://` or `http://`. Previously these were silently overwritten with `wss://` / `https://`, breaking air-gapped or private deployments that don't use TLS. All scheme choices (`wss://`, `https://`, `ws://`, `http://`, or bare hostname) are now respected.
|
||||
@@ -1 +0,0 @@
|
||||
- Bumped PyJWT minimum version from 2.10.1 to 2.12.0 in the `livekit` extra to address CVE-2026-32597 (GHSA-752w-5fwx-jx9f), where PyJWT <= 2.11.0 accepted unknown `crit` header extensions.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `LLMSwitcher.register_function()` and `register_direct_function()` not accepting or forwarding the `timeout_secs` parameter.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `SonioxSTTService` and `OpenAIRealtimeSTTService` crashing when language parameters contain plain strings instead of `Language` enum values.
|
||||
@@ -1 +0,0 @@
|
||||
- Added DTMF input event support to the Daily transport. Incoming DTMF tones are now received via Daily's `on_dtmf_event` callback and pushed into the pipeline as `InputDTMFFrame`, enabling bots to react to keypad presses from phone callers.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `daily-python` dependency to 0.25.0.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `enable_dialout` parameter to `configure()` in `pipecat.runner.daily` to support dial-out rooms. Also narrowed misleading `Optional` type hints and deduplicated token expiry calculation.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed premature user turn stops caused by late transcriptions arriving between turns. A stale transcript from the previous turn could persist into the next turn and trigger a stop before the current turn's real transcript arrived. Stop strategies are now reset at both turn start and turn stop to prevent state from leaking across turn boundaries.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed raw language strings like `"de-DE"` silently failing when passed to TTS/STT services (e.g. ElevenLabs producing no audio). Raw strings now go through the same `Language` enum resolution as enum values, so regional codes like `"de-DE"` are properly converted to service-expected formats like `"de"`. Unrecognized strings log a warning instead of failing silently.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed Deepgram STT list-type settings (`keyterm`, `keywords`, `search`, `redact`, `replace`) being stringified instead of passed as lists to the SDK, which caused them to be sent as literal strings (e.g. `"['pipecat']"`) in the WebSocket query params.
|
||||
12
env.example
12
env.example
@@ -80,9 +80,6 @@ GOOGLE_TEST_CREDENTIALS=...
|
||||
# Gradium
|
||||
GRADIUM_API_KEY=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
@@ -127,6 +124,9 @@ MISTRAL_API_KEY=...
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# Novita
|
||||
NOVITA_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
|
||||
@@ -176,6 +176,9 @@ SENTRY_DSN=...
|
||||
SIMLI_API_KEY=...
|
||||
SIMLI_FACE_ID=...
|
||||
|
||||
# Smallest
|
||||
SMALLEST_API_KEY=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
@@ -209,3 +212,6 @@ WHATSAPP_TOKEN=...
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=...
|
||||
WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
|
||||
# xAI / Grok
|
||||
XAI_API_KEY=...
|
||||
@@ -60,7 +60,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
context = LLMContext()
|
||||
context.add_message({"role": "user", "content": "Say hello to the world."})
|
||||
context.add_message({"role": "developer", "content": "Say hello to the world."})
|
||||
await task.queue_frames([LLMContextFrame(context), EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
@@ -109,7 +109,9 @@ async def run_example(webrtc_connection: SmallWebRTCConnection):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -92,7 +92,7 @@ async def main():
|
||||
await transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -16,11 +16,12 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
OutputImageRawFrame,
|
||||
TextFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
|
||||
from pipecat.pipeline.sync_parallel_pipeline import FrameOrder, SyncParallelPipeline
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
@@ -30,6 +31,7 @@ from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaHttpTTSService
|
||||
from pipecat.services.fal.image import FalImageGenService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.tts_service import TextAggregationMode
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
@@ -44,6 +46,18 @@ class MonthFrame(DataFrame):
|
||||
return f"{self.name}(month: {self.month})"
|
||||
|
||||
|
||||
class MarkImageForPlaybackSync(FrameProcessor):
|
||||
"""Marks output image frames to be synchronized with audio playback."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, OutputImageRawFrame):
|
||||
frame.sync_with_audio = True
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class MonthPrepender(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -101,6 +115,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
settings=CartesiaHttpTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
# No need to aggregate by sentences (the default), as we already know we're getting full sentences
|
||||
# (Otherwise the service will unnecessarily wait for follow-up input to confirm the sentence is complete,
|
||||
# which, sadly, actually breaks the synchronization mechanism)
|
||||
text_aggregation_mode=TextAggregationMode.TOKEN,
|
||||
)
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
@@ -119,17 +137,26 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# that, each pipeline runs concurrently and `SyncParallelPipeline` will
|
||||
# wait for the input frame to be processed.
|
||||
#
|
||||
# We use `FrameOrder.PIPELINE` so that each synchronized batch of output
|
||||
# frames is pushed in the order the pipelines are listed: image first,
|
||||
# then audio. This ensures the transport receives the image before the
|
||||
# audio frames it should accompany.
|
||||
#
|
||||
# Note that `SyncParallelPipeline` requires the last processor in each
|
||||
# of the pipelines to be synchronous. In this case, we use
|
||||
# `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP
|
||||
# `FalImageGenService` and `CartesiaHttpTTSService` which make HTTP
|
||||
# requests and wait for the response.
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
llm, # LLM
|
||||
sentence_aggregator, # Aggregates LLM output into full sentences
|
||||
SyncParallelPipeline( # Run pipelines in parallel aggregating the result
|
||||
[
|
||||
imagegen, # Generate image
|
||||
MarkImageForPlaybackSync(), # Mark image as needing sync w/audio during playback
|
||||
],
|
||||
[month_prepender, tts], # Create "Month: sentence" and output audio
|
||||
[imagegen], # Generate image
|
||||
frame_order=FrameOrder.PIPELINE,
|
||||
),
|
||||
transport.output(), # Transport output
|
||||
]
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import tkinter as tk
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMContextFrame,
|
||||
OutputAudioRawFrame,
|
||||
TextFrame,
|
||||
TTSAudioRawFrame,
|
||||
URLImageRawFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia.tts import CartesiaHttpTTSService
|
||||
from pipecat.services.fal.image import FalImageGenService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.local.tk import TkLocalTransport, TkTransportParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tk_root = tk.Tk()
|
||||
tk_root.title("Calendar")
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
async def get_month_data(month):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.",
|
||||
}
|
||||
]
|
||||
|
||||
class ImageDescription(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.text = ""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
self.text = frame.text
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
class AudioGrabber(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.audio = bytearray()
|
||||
self.frame = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TTSAudioRawFrame):
|
||||
self.audio.extend(frame.audio)
|
||||
self.frame = OutputAudioRawFrame(
|
||||
bytes(self.audio), frame.sample_rate, frame.num_channels
|
||||
)
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
class ImageGrabber(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, URLImageRawFrame):
|
||||
self.frame = frame
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
tts = CartesiaHttpTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaHttpTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
imagegen = FalImageGenService(
|
||||
settings=FalImageGenService.Settings(
|
||||
image_size="square_hd",
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key=os.getenv("FAL_KEY"),
|
||||
)
|
||||
|
||||
sentence_aggregator = SentenceAggregator()
|
||||
|
||||
description = ImageDescription()
|
||||
|
||||
audio_grabber = AudioGrabber()
|
||||
|
||||
image_grabber = ImageGrabber()
|
||||
|
||||
# With `SyncParallelPipeline` we synchronize audio and images by
|
||||
# pushing them basically in order (e.g. I1 A1 A1 A1 I2 A2 A2 A2 A2
|
||||
# I3 A3). To do that, each pipeline runs concurrently and
|
||||
# `SyncParallelPipeline` will wait for the input frame to be
|
||||
# processed.
|
||||
#
|
||||
# Note that `SyncParallelPipeline` requires the last processor in
|
||||
# each of the pipelines to be synchronous. In this case, we use
|
||||
# `CartesiaHttpTTSService` and `FalImageGenService` which make HTTP
|
||||
# requests and wait for the response.
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
llm, # LLM
|
||||
sentence_aggregator, # Aggregates LLM output into full sentences
|
||||
description, # Store sentence
|
||||
SyncParallelPipeline(
|
||||
[tts, audio_grabber], # Generate and store audio for the given sentence
|
||||
[imagegen, image_grabber], # Generate and store image for the given sentence
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
await task.queue_frame(LLMContextFrame(LLMContext(messages)))
|
||||
await task.stop_when_done()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
return {
|
||||
"month": month,
|
||||
"text": description.text,
|
||||
"image": image_grabber.frame,
|
||||
"audio": audio_grabber.frame,
|
||||
}
|
||||
|
||||
transport = TkLocalTransport(
|
||||
tk_root,
|
||||
TkTransportParams(
|
||||
audio_out_enabled=True,
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline([transport.output()])
|
||||
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
# We only specify a few months as we create tasks all at once and we
|
||||
# might get rate limited otherwise.
|
||||
months: list[str] = [
|
||||
"January",
|
||||
"February",
|
||||
]
|
||||
|
||||
# We create one task per month. This will be executed concurrently.
|
||||
month_tasks = [asyncio.create_task(get_month_data(month)) for month in months]
|
||||
|
||||
# Now we wait for each month task in the order they're completed. The
|
||||
# benefit is we'll have as little delay as possible before the first
|
||||
# month, and likely no delay between months, but the months won't
|
||||
# display in order.
|
||||
async def show_images(month_tasks):
|
||||
for month_data_task in asyncio.as_completed(month_tasks):
|
||||
data = await month_data_task
|
||||
await task.queue_frames([data["image"], data["audio"]])
|
||||
|
||||
await runner.stop_when_done()
|
||||
|
||||
async def run_tk():
|
||||
while not task.has_finished():
|
||||
tk_root.update()
|
||||
tk_root.update_idletasks()
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
await asyncio.gather(runner.run(task), show_images(month_tasks), run_tk())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -129,7 +129,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -103,7 +103,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -4,14 +4,12 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
@@ -24,25 +22,14 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService, GeminiModalities
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.responses.llm import OpenAIResponsesLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
SYSTEM_INSTRUCTION = f"""
|
||||
"You are Gemini Chatbot, a friendly, helpful robot.
|
||||
|
||||
Your goal is to demonstrate your capabilities in a succinct way.
|
||||
|
||||
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
|
||||
|
||||
Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
|
||||
"""
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
@@ -64,57 +51,37 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# KNOWN ISSUE: If using GeminiLiveVertexLLMService, you cannot specify a
|
||||
# modality other than AUDIO (at least not if using the service's default
|
||||
# model, which is a native audio model:
|
||||
# https://cloud.google.com/vertex-ai/generative-ai/docs/live-api/tools#native-audio).
|
||||
llm = GeminiLiveLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
settings=GeminiLiveLLMService.Settings(
|
||||
system_instruction=SYSTEM_INSTRUCTION,
|
||||
modalities=GeminiModalities.TEXT,
|
||||
),
|
||||
tools=[{"google_search": {}}, {"code_execution": {}}],
|
||||
)
|
||||
|
||||
# Optionally, you can set the response modalities via a function
|
||||
# llm.set_model_modalities(
|
||||
# GeminiMultimodalModalities.TEXT
|
||||
# )
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"), voice_id="71a7ad14-091c-4e8e-a314-022ece01c121"
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": 'Start by saying "Hello, I\'m Gemini".',
|
||||
},
|
||||
]
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
# Set up conversation context and management
|
||||
# The context_aggregator will automatically collect conversation context
|
||||
context = LLMContext(messages)
|
||||
context = LLMContext()
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
# Set stop_secs to something roughly similar to the internal setting
|
||||
# of the Multimodal Live api, just to align events. This doesn't
|
||||
# really matter because we can only use the Multimodal Live API's
|
||||
# phrase endpointing, for now.
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5))
|
||||
),
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
assistant_aggregator, # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -131,6 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -98,7 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -148,7 +148,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Say a short hello to the user."})
|
||||
context.add_message({"role": "developer", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -128,7 +128,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Say a short hello to the user."})
|
||||
context.add_message({"role": "developer", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.llm import AWSBedrockLLMService, AWSBedrockLLMSettings
|
||||
from pipecat.services.deepgram.flux.sagemaker.stt import DeepgramFluxSageMakerSTTService
|
||||
from pipecat.services.deepgram.sagemaker.tts import DeepgramSageMakerTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_turn_strategies import ExternalUserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Initialize Deepgram Flux SageMaker STT Service
|
||||
# This requires:
|
||||
# - AWS credentials configured (via environment variables or AWS CLI)
|
||||
# - A deployed SageMaker endpoint with Deepgram Flux model
|
||||
stt = DeepgramFluxSageMakerSTTService(
|
||||
endpoint_name=os.getenv("SAGEMAKER_STT_ENDPOINT_NAME"),
|
||||
region=os.getenv("AWS_REGION"),
|
||||
settings=DeepgramFluxSageMakerSTTService.Settings(
|
||||
min_confidence=0.3,
|
||||
),
|
||||
)
|
||||
|
||||
# Initialize Deepgram SageMaker TTS Service
|
||||
# This requires:
|
||||
# - AWS credentials configured (via environment variables or AWS CLI)
|
||||
# - A deployed SageMaker endpoint with Deepgram TTS model
|
||||
tts = DeepgramSageMakerTTSService(
|
||||
endpoint_name=os.getenv("SAGEMAKER_TTS_ENDPOINT_NAME"),
|
||||
region=os.getenv("AWS_REGION"),
|
||||
settings=DeepgramSageMakerTTSService.Settings(
|
||||
voice="aura-2-andromeda-en",
|
||||
),
|
||||
)
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region=os.getenv("AWS_REGION"),
|
||||
settings=AWSBedrockLLMSettings(
|
||||
model="us.amazon.nova-pro-v1:0",
|
||||
temperature=0.8,
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
context = LLMContext()
|
||||
# Use ExternalUserTurnStrategies since Flux handles turn detection natively
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=ExternalUserTurnStrategies(),
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
assistant_aggregator, # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@stt.event_handler("on_update")
|
||||
async def on_deepgram_flux_update(stt, transcript):
|
||||
logger.debug(f"On deepgram flux update: {transcript}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point for the Pipecat Cloud runner.

    Creates the transport selected by ``runner_args`` from the module-level
    ``transport_params`` mapping and passes it to ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner is only pulled in when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -109,7 +109,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -114,7 +114,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -106,7 +106,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -100,7 +100,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -108,7 +108,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -100,7 +100,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
128
examples/foundational/07e-interruptible-xai.py
Normal file
128
examples/foundational/07e-interruptible-xai.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.xai.llm import GrokLLMService
|
||||
from pipecat.services.xai.tts import XAIHttpTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# Each entry is a zero-argument factory rather than a params instance, so the
# parameters are only built for the transport type selected at runtime.
transport_params = {
    "daily": lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
    "twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True, audio_out_enabled=True),
    "webrtc": lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Build and run a voice bot: Deepgram STT -> Grok LLM -> xAI HTTP TTS.

    Args:
        transport: Provides the pipeline's input and output processors and the
            ``on_client_connected`` / ``on_client_disconnected`` events.
        runner_args: Supplies ``pipeline_idle_timeout_secs`` for the task and
            ``handle_sigint`` for the runner.
    """
    logger.info(f"Starting bot")

    # The xAI TTS service is handed this HTTP session, so the pipeline is
    # built and run while the session is still open.
    async with aiohttp.ClientSession() as session:
        stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

        tts_settings = XAIHttpTTSService.Settings(voice="eve")
        tts = XAIHttpTTSService(
            api_key=os.getenv("XAI_API_KEY"),
            aiohttp_session=session,
            settings=tts_settings,
        )

        llm_settings = GrokLLMService.Settings(
            system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
        )
        llm = GrokLLMService(api_key=os.getenv("XAI_API_KEY"), settings=llm_settings)

        # One shared context; the aggregator pair unpacks into user and
        # assistant sides.
        context = LLMContext()
        aggregator_pair = LLMContextAggregatorPair(
            context,
            user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
        )
        user_aggregator, assistant_aggregator = aggregator_pair

        processors = [
            transport.input(),  # Transport user input
            stt,  # Speech-to-text
            user_aggregator,  # User responses
            llm,  # LLM
            tts,  # Text-to-speech
            transport.output(),  # Transport bot output
            assistant_aggregator,  # Assistant spoken responses
        ]
        pipeline = Pipeline(processors)

        task_params = PipelineParams(enable_metrics=True, enable_usage_metrics=True)
        task = PipelineTask(
            pipeline,
            params=task_params,
            idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
        )

        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            logger.info(f"Client connected")
            # Kick off the conversation: add the intro request to the context,
            # then queue an LLMRunFrame.
            intro_request = {
                "role": "developer",
                "content": "Please introduce yourself to the user.",
            }
            context.add_message(intro_request)
            await task.queue_frames([LLMRunFrame()])

        @transport.event_handler("on_client_disconnected")
        async def on_client_disconnected(transport, client):
            logger.info(f"Client disconnected")
            await task.cancel()

        await PipelineRunner(handle_sigint=runner_args.handle_sigint).run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point for the Pipecat Cloud runner.

    Creates the transport selected by ``runner_args`` from the module-level
    ``transport_params`` mapping and passes it to ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so the runner is only pulled in when run as a script.
    from pipecat.runner.run import main

    main()
|
||||
@@ -102,7 +102,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -102,7 +102,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -106,7 +106,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -108,7 +108,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -103,7 +103,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -114,7 +114,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -109,7 +109,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -99,7 +99,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -95,7 +95,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -151,7 +151,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
LLMMessagesAppendFrame(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"role": "developer",
|
||||
"content": f"Greet the user and introduce yourself. Don't use emojis.",
|
||||
}
|
||||
],
|
||||
|
||||
@@ -66,7 +66,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
settings=AWSBedrockLLMService.Settings(
|
||||
model="us.anthropic.claude-sonnet-4-6",
|
||||
temperature=0.8,
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
# system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -101,7 +101,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -124,7 +124,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a styled introduction
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -128,7 +128,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
# Kick off the conversation
|
||||
context.add_message(
|
||||
{
|
||||
"role": "user",
|
||||
"role": "developer",
|
||||
"content": "You are an AI assistant. You can help with a variety of tasks. Introduce yourself and ask the user what they would like to know.",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -112,7 +112,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -112,7 +112,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -153,7 +153,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -102,7 +102,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -104,7 +104,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=KrispVivaTurn())]
|
||||
),
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_analyzer=SileroVADAnalyzer(), # or KrispVivaVadAnalyzer
|
||||
),
|
||||
)
|
||||
|
||||
@@ -134,7 +134,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -103,7 +103,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -107,7 +107,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -99,7 +99,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -95,7 +95,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -269,7 +269,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -100,7 +100,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -99,7 +99,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -106,7 +106,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ async def main():
|
||||
),
|
||||
)
|
||||
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message({"role": "developer", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
@@ -107,7 +107,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.llm import SarvamLLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamHttpTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
@@ -72,9 +72,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAILLMService.Settings(
|
||||
llm = SarvamLLMService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
settings=SarvamLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -21,7 +21,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.sarvam.llm import SarvamLLMService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.tts import SarvamTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -55,20 +55,20 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
settings=SarvamSTTService.Settings(
|
||||
model="saarika:v2.5",
|
||||
model="saaras:v3",
|
||||
),
|
||||
)
|
||||
|
||||
tts = SarvamTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
settings=SarvamTTSService.Settings(
|
||||
model="bulbul:v2",
|
||||
voice="manisha",
|
||||
model="bulbul:v3",
|
||||
voice="shubh",
|
||||
),
|
||||
)
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAILLMService.Settings(
|
||||
llm = SarvamLLMService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
settings=SarvamLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
@@ -96,6 +96,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
allow_interruptions=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -105,7 +105,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -61,7 +61,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
streaming=True,
|
||||
settings=InworldHttpTTSService.Settings(
|
||||
voice="Ashley",
|
||||
model="inworld-tts-1",
|
||||
),
|
||||
# Set to False for non-streaming mode or True for streaming mode.
|
||||
)
|
||||
@@ -112,7 +111,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -56,7 +56,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
api_key=os.getenv("INWORLD_API_KEY", ""),
|
||||
settings=InworldTTSService.Settings(
|
||||
voice="Ashley",
|
||||
model="inworld-tts-1",
|
||||
temperature=1.1,
|
||||
),
|
||||
)
|
||||
@@ -99,7 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info("Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -105,7 +105,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "user", "content": "Please introduce yourself to the user."}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -100,7 +100,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -128,7 +128,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
await audiobuffer.start_recording()
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@audiobuffer.event_handler("on_audio_data")
|
||||
|
||||
@@ -113,7 +113,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"💡 Word timestamps are enabled! Watch the console for TTSTextFrame logs showing each word with its PTS."
|
||||
)
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -106,7 +106,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -99,7 +99,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info("Client connected")
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -98,7 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -98,7 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -98,7 +98,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
122
examples/foundational/07zl-interruptible-smallest.py
Normal file
122
examples/foundational/07zl-interruptible-smallest.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.smallest.tts import SmallestTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
)
|
||||
|
||||
tts = SmallestTTSService(
|
||||
api_key=os.getenv("SMALLEST_API_KEY"),
|
||||
settings=SmallestTTSService.Settings(
|
||||
voice="sophia",
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAILLMService.Settings(
|
||||
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
),
|
||||
)
|
||||
|
||||
context = LLMContext()
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -141,7 +141,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected: {client}")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
@@ -19,7 +19,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -28,6 +27,11 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_start import WakePhraseUserTurnStartStrategy
|
||||
from pipecat.turns.user_turn_strategies import (
|
||||
UserTurnStrategies,
|
||||
default_user_turn_start_strategies,
|
||||
)
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -52,7 +56,12 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
settings=DeepgramSTTService.Settings(
|
||||
keyterm=["pipecat"],
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
@@ -68,19 +77,28 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
hey_robot_filter = WakeCheckFilter(["hey robot", "hey, robot"])
|
||||
|
||||
context = LLMContext()
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
start=[
|
||||
WakePhraseUserTurnStartStrategy(
|
||||
phrases=["pipecat"],
|
||||
# Timeout before wake phrase must be spoken again
|
||||
timeout=5.0,
|
||||
),
|
||||
*default_user_turn_start_strategies(),
|
||||
]
|
||||
),
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
hey_robot_filter, # Filter out speech not directed at the robot
|
||||
stt,
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
@@ -103,10 +121,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please introduce yourself. Tell the user they should say 'Hey Robot' before talking to you.",
|
||||
}
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
139
examples/foundational/12-describe-image-openai-responses.py
Normal file
139
examples/foundational/12-describe-image-openai-responses.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.responses.llm import OpenAIResponsesLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way. You are also able to describe images.",
|
||||
),
|
||||
)
|
||||
|
||||
context = LLMContext()
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
assistant_aggregator, # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
if not runner_args.body:
|
||||
script_dir = os.path.dirname(__file__)
|
||||
runner_args.body = {
|
||||
"image_path": os.path.join(script_dir, "assets", "cat.jpg"),
|
||||
"question": "Describe this image",
|
||||
}
|
||||
|
||||
image_path = runner_args.body["image_path"]
|
||||
question = runner_args.body["question"]
|
||||
|
||||
# Kick off the conversation.
|
||||
image = Image.open(image_path)
|
||||
message = await LLMContext.create_image_message(
|
||||
image=image.tobytes(),
|
||||
format="RGB",
|
||||
size=image.size,
|
||||
text=question,
|
||||
)
|
||||
context.add_message(message)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -1,121 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame, UserStoppedSpeakingFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.audio.vad_processor import VADProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.sambanova.stt import SambaNovaSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
STOP_SECS = 2.0
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
"""Measures transcription latency.
|
||||
|
||||
Uses the (intentionally) long STOP_SECS parameter to give the transcription time to finish,
|
||||
then outputs the timing between when the VAD first classified audio input as not-speech and
|
||||
the delivery of the last transcription frame.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._last_transcription_time = time.time()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
logger.debug(
|
||||
f"Transcription latency: {(STOP_SECS - (time.time() - self._last_transcription_time)):.2f}"
|
||||
)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
self._last_transcription_time = time.time()
|
||||
|
||||
# Push all frames through
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = SambaNovaSTTService(
|
||||
settings=SambaNovaSTTService.Settings(
|
||||
model="Whisper-Large-v3",
|
||||
),
|
||||
api_key=os.getenv("SAMBANOVA_API_KEY"),
|
||||
)
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
vad_processor = VADProcessor(
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=STOP_SECS))
|
||||
)
|
||||
|
||||
pipeline = Pipeline([transport.input(), vad_processor, stt, tl])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
175
examples/foundational/14-function-calling-openai-responses.py
Normal file
175
examples/foundational/14-function-calling-openai-responses.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.responses.llm import OpenAIResponsesLLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
async def fetch_restaurant_recommendation(params: FunctionCallParams):
|
||||
await params.result_callback({"name": "The Golden Dragon"})
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
llm = OpenAIResponsesLLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
settings=OpenAIResponsesLLMService.Settings(
|
||||
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
|
||||
),
|
||||
)
|
||||
|
||||
# You can also register a function_name of None to get all functions
|
||||
# sent to the same callback with an additional function_name parameter.
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@llm.event_handler("on_function_calls_started")
|
||||
async def on_function_calls_started(service, function_calls):
|
||||
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
)
|
||||
restaurant_function = FunctionSchema(
|
||||
name="get_restaurant_recommendation",
|
||||
description="Get a restaurant recommendation",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
},
|
||||
required=["location"],
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
|
||||
|
||||
context = LLMContext(tools=tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -138,7 +138,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user