Compare commits
8 Commits
filipi/asy
...
rtvi-send-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7742d1a83b | ||
|
|
d9cebe602f | ||
|
|
96e06d2401 | ||
|
|
267c86e596 | ||
|
|
9fb06c3e4b | ||
|
|
71197fbc2c | ||
|
|
9cd4e5faca | ||
|
|
4f290be834 |
@@ -144,7 +144,7 @@ class InputParams(BaseModel):
|
||||
|
||||
#### Examples
|
||||
|
||||
Validated against `examples/07-interruptible.py`:
|
||||
Validated against `examples/foundational/07-interruptible.py`:
|
||||
|
||||
- Proper `create_transport()` usage
|
||||
- Correct pipeline structure
|
||||
|
||||
2
.github/workflows/python-compatibility.yaml
vendored
2
.github/workflows/python-compatibility.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
|
||||
- name: Test uv sync with all extras
|
||||
run: |
|
||||
uv sync --group dev --all-extras
|
||||
uv sync --group dev --all-extras --no-extra krisp
|
||||
|
||||
- name: Verify installation
|
||||
run: |
|
||||
|
||||
51
.github/workflows/sync-quickstart.yaml
vendored
Normal file
51
.github/workflows/sync-quickstart.yaml
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
name: Sync Quickstart to pipecat-quickstart repo
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'examples/quickstart/**'
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
jobs:
|
||||
sync-quickstart:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout main repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Checkout quickstart repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: pipecat-ai/pipecat-quickstart
|
||||
token: ${{ secrets.QUICKSTART_SYNC_TOKEN }}
|
||||
path: quickstart-repo
|
||||
|
||||
- name: Sync files (excluding uv.lock and README.md)
|
||||
run: |
|
||||
# Copy all files except uv.lock and README.md
|
||||
find examples/quickstart -type f \
|
||||
-not -name "README.md" \
|
||||
-not -name "uv.lock" \
|
||||
-exec cp {} quickstart-repo/ \;
|
||||
|
||||
- name: Commit and push changes
|
||||
run: |
|
||||
cd quickstart-repo
|
||||
git config user.name "GitHub Action"
|
||||
git config user.email "action@github.com"
|
||||
git add .
|
||||
|
||||
# Only commit if there are changes
|
||||
if ! git diff --staged --quiet; then
|
||||
git commit -m "Sync from pipecat main repo
|
||||
|
||||
Updated files from examples/quickstart/
|
||||
Commit: ${{ github.sha }}
|
||||
"
|
||||
git push
|
||||
else
|
||||
echo "No changes to sync"
|
||||
fi
|
||||
@@ -1,13 +1,8 @@
|
||||
repos:
|
||||
- repo: local
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.12.1
|
||||
hooks:
|
||||
- id: ruff
|
||||
name: ruff
|
||||
entry: uv run ruff check --fix
|
||||
language: system
|
||||
types: [python]
|
||||
language_version: python3
|
||||
args: [--fix]
|
||||
- id: ruff-format
|
||||
name: ruff-format
|
||||
entry: uv run ruff format
|
||||
language: system
|
||||
types: [python]
|
||||
|
||||
@@ -11,7 +11,7 @@ build:
|
||||
jobs:
|
||||
post_install:
|
||||
- pip install uv
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
sphinx:
|
||||
configuration: docs/api/conf.py
|
||||
|
||||
612
CHANGELOG.md
612
CHANGELOG.md
@@ -7,618 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [0.0.108] - 2026-03-27
|
||||
|
||||
### Added
|
||||
|
||||
- Added `SarvamLLMService` with support for `sarvam-30b`, `sarvam-30b-16k`,
|
||||
`sarvam-105b` and `sarvam-105b-32k`.
|
||||
(PR [#3978](https://github.com/pipecat-ai/pipecat/pull/3978))
|
||||
|
||||
- Added `on_turn_context_created(context_id)` hook to `TTSService`. Override
|
||||
this to perform provider-specific setup (e.g. eagerly opening a server-side
|
||||
context) before text starts flowing. Called each time a new turn context ID
|
||||
is created.
|
||||
(PR [#4013](https://github.com/pipecat-ai/pipecat/pull/4013))
|
||||
|
||||
- Added `XAIHttpTTSService` for text-to-speech using xAI's HTTP TTS API.
|
||||
(PR [#4031](https://github.com/pipecat-ai/pipecat/pull/4031))
|
||||
|
||||
- Added support for "developer" role messages in conversation context across
|
||||
all LLM adapters. For non-OpenAI services (Anthropic, Google, AWS Bedrock),
|
||||
"developer" messages are converted to "user" messages (use
|
||||
`system_instruction` to set the system instruction). For OpenAI services,
|
||||
"developer" messages pass through in conversation history. For the Responses
|
||||
API, they are kept as "developer" role (matching the existing "system" →
|
||||
"developer" conversion).
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Added `SmallestTTSService`, a WebSocket-based TTS service integration with
|
||||
Smallest AI's Waves API. Supports the Lightning v2 and v3.1 models with
|
||||
configurable voice, language, speed, consistency, similarity, and enhancement
|
||||
settings.
|
||||
(PR [#4092](https://github.com/pipecat-ai/pipecat/pull/4092))
|
||||
|
||||
- Added warnings in turn stop strategies when `VADParams.stop_secs` differs
|
||||
from the recommended default (0.2s) or when `stop_secs >= STT p99 latency`,
|
||||
which collapses the STT wait timeout to 0s and may cause delayed turn
|
||||
detection. The warnings guide developers to re-run the
|
||||
[stt-benchmark](https://github.com/pipecat-ai/stt-benchmark) with their VAD
|
||||
settings.
|
||||
(PR [#4115](https://github.com/pipecat-ai/pipecat/pull/4115))
|
||||
|
||||
- Added `domain` parameter to `AssemblyAISTTSettings` for specialized
|
||||
recognition modes such as Medical Mode (`domain="medical-v1"`).
|
||||
(PR [#4117](https://github.com/pipecat-ai/pipecat/pull/4117))
|
||||
|
||||
- Added `NovitaLLMService` for using Novita AI's LLM models via their
|
||||
OpenAI-compatible API.
|
||||
(PR [#4119](https://github.com/pipecat-ai/pipecat/pull/4119))
|
||||
|
||||
- Added `cleanup()` method to `VADAnalyzer` and `VADController` so VAD analyzer
|
||||
resources are properly released when no longer needed. Custom `VADAnalyzer`
|
||||
subclasses can override `cleanup()` to free any held resources.
|
||||
(PR [#4120](https://github.com/pipecat-ai/pipecat/pull/4120))
|
||||
|
||||
- Added `on_end_of_turn` event handler to `AssemblyAISTTService`. This fires
|
||||
after the final transcript is pushed, providing a reliable hook for
|
||||
end-of-turn logic that doesn't race with `TranscriptionFrame`. Works in both
|
||||
Pipecat and AssemblyAI turn detection modes.
|
||||
(PR [#4128](https://github.com/pipecat-ai/pipecat/pull/4128))
|
||||
|
||||
- Added `DeepgramFluxSageMakerSTTService` for running Deepgram Flux
|
||||
speech-to-text on AWS SageMaker endpoints. Use with
|
||||
`ExternalUserTurnStrategies` to take advantage of Flux's turn detection.
|
||||
(PR [#4143](https://github.com/pipecat-ai/pipecat/pull/4143))
|
||||
|
||||
- Added `Mem0MemoryService.get_memories()` convenience method for retrieving
|
||||
all stored memories outside the pipeline (e.g. to build a personalized
|
||||
greeting at connection time). This avoids the need to manually handle client
|
||||
type branching, filter construction, and async wrapping.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
### Changed
|
||||
|
||||
- Added context prewarming path for `InworldTTSService` to improve first audio
|
||||
latency.
|
||||
(PR [#4013](https://github.com/pipecat-ai/pipecat/pull/4013))
|
||||
|
||||
- Added `KrispVivaVadAnalyzer` for Voice Activity Detection using the Krisp
|
||||
VIVA SDK (requires `krisp_audio`).
|
||||
(PR [#4022](https://github.com/pipecat-ai/pipecat/pull/4022))
|
||||
|
||||
- Modified `InworldTTSService` to close context at end of turn instead of
|
||||
relying on idle timeout.
|
||||
(PR [#4028](https://github.com/pipecat-ai/pipecat/pull/4028))
|
||||
|
||||
- Added Gemini 3 support to the Gemini Live service.
|
||||
(PR [#4078](https://github.com/pipecat-ai/pipecat/pull/4078))
|
||||
|
||||
- `TTSService`: the default `stop_frame_timeout_s` (idle time before an
|
||||
automatic `TTSStoppedFrame` is pushed when `push_stop_frames=True`) has
|
||||
changed from `2.0` to `3.0` seconds.
|
||||
(PR [#4084](https://github.com/pipecat-ai/pipecat/pull/4084))
|
||||
|
||||
- ⚠️ `GeminiLLMAdapter` now only treats `messages[0]` as the initial system
|
||||
message, matching all other adapters. Previously it searched for the first
|
||||
"system" message anywhere in the conversation history. A "system" message
|
||||
appearing later in the list will now be converted to "user" instead of being
|
||||
extracted as the system instruction.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed `InworldTTSService` to fall back to full text when TTS timestamps are
|
||||
not received.
|
||||
(PR [#4113](https://github.com/pipecat-ai/pipecat/pull/4113))
|
||||
|
||||
- ⚠️ Realtime services (Gemini Live, OpenAI Realtime, Grok Realtime, Nova
|
||||
Sonic) now prefer `system_instruction` from service settings over an initial
|
||||
system message in the LLM context, matching the behavior of non-realtime
|
||||
services. Previously, context-provided system instructions took precedence. A
|
||||
warning is now logged when both are set.
|
||||
(PR [#4130](https://github.com/pipecat-ai/pipecat/pull/4130))
|
||||
|
||||
- Bumped `nvidia-riva-client` minimum version to `>=2.25.1`.
|
||||
(PR [#4136](https://github.com/pipecat-ai/pipecat/pull/4136))
|
||||
|
||||
- Upgraded `protobuf` from 5.x to 6.x (`>=6.31.1,<7`).
|
||||
(PR [#4136](https://github.com/pipecat-ai/pipecat/pull/4136))
|
||||
|
||||
- Unrecognized language strings (e.g. Deepgram's `"multi"`) no longer produce a
|
||||
warning at startup. The log message has been downgraded to debug level since
|
||||
these are valid service-specific values that are passed through correctly.
|
||||
(PR [#4137](https://github.com/pipecat-ai/pipecat/pull/4137))
|
||||
|
||||
- `GrokLLMService` and `GrokRealtimeLLMService` now live in the
|
||||
`pipecat.services.xai` module alongside `XAIHttpTTSService`, since all three
|
||||
use the same xAI API. Update imports from `pipecat.services.grok.*` to
|
||||
`pipecat.services.xai.*` (e.g. `from pipecat.services.xai.llm import
|
||||
GrokLLMService`).
|
||||
(PR [#4142](https://github.com/pipecat-ai/pipecat/pull/4142))
|
||||
|
||||
- ⚠️ Bumped `mem0ai` dependency from `~=0.1.94` to `>=1.0.8,<2`. Users of the
|
||||
`mem0` extra will need to update their mem0ai package.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `pipecat.services.grok.llm`, `pipecat.services.grok.realtime.llm`, and
|
||||
`pipecat.services.grok.realtime.events` are deprecated. The old import paths
|
||||
still work but emit a `DeprecationWarning`; use `pipecat.services.xai.llm`,
|
||||
`pipecat.services.xai.realtime.llm`, and
|
||||
`pipecat.services.xai.realtime.events` instead.
|
||||
(PR [#4142](https://github.com/pipecat-ai/pipecat/pull/4142))
|
||||
|
||||
### Removed
|
||||
|
||||
- ⚠️ `TTSService.add_word_timestamps()` no longer supports the `"Reset"` and
|
||||
`"TTSStoppedFrame"` sentinel strings. If you have a custom TTS service that
|
||||
called `await self.add_word_timestamps([("Reset", 0)])` or `await
|
||||
self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)], ctx_id)`,
|
||||
replace them with `await self.append_to_audio_context(ctx_id,
|
||||
TTSStoppedFrame(context_id=ctx_id))` and let `_handle_audio_context` manage
|
||||
the word-timestamp reset automatically.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Removed `SambaNovaSTTService`. SambaNova no longer offers speech-to-text
|
||||
audio models. Use another STT provider instead.
|
||||
(PR [#4154](https://github.com/pipecat-ai/pipecat/pull/4154))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed Gemini Live (`GoogleGeminiLiveLLMService`) not honoring
|
||||
`settings.system_instruction`. The system instruction was being read from a
|
||||
deprecated constructor parameter instead of the settings object, causing it
|
||||
to be silently ignored.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed `AWSBedrockLLMAdapter` sending an empty message list to the API when
|
||||
the only message in context was a system message. The lone system message is
|
||||
now converted to "user" role instead of being extracted, matching the
|
||||
existing Anthropic adapter behavior.
|
||||
(PR [#4089](https://github.com/pipecat-ai/pipecat/pull/4089))
|
||||
|
||||
- Fixed Gemini Live pipeline hanging indefinitely when an `EndFrame` was
|
||||
deferred while waiting for the bot to finish responding and `turn_complete`
|
||||
never arrived. As a possible root-cause fix, `turn_complete` messages are now
|
||||
handled even if they lack `usage_metadata`. As a fallback, the deferred
|
||||
`EndFrame` now has a 30-second safety timeout.
|
||||
(PR [#4125](https://github.com/pipecat-ai/pipecat/pull/4125))
|
||||
|
||||
- Fixed ElevenLabs WebSocket disconnections (1008 "Maximum simultaneous
|
||||
contexts exceeded") caused by rapid user interruptions. When interruptions
|
||||
arrived before any TTS text was generated, phantom contexts were created on
|
||||
the ElevenLabs server that were never closed, eventually exceeding the
|
||||
5-context limit.
|
||||
(PR [#4126](https://github.com/pipecat-ai/pipecat/pull/4126))
|
||||
|
||||
- Fixed the final sentence being dropped from the conversation context when
|
||||
using RTVI text input with non-word-timestamp TTS services. The
|
||||
`LLMFullResponseEndFrame` was racing ahead of the last `TTSTextFrame`,
|
||||
causing the `LLMAssistantAggregator` to finalize the context before the final
|
||||
sentence arrived.
|
||||
(PR [#4127](https://github.com/pipecat-ai/pipecat/pull/4127))
|
||||
|
||||
- Fixed audio crackling and popping in recordings when both user and bot are
|
||||
speaking. `AudioBufferProcessor` no longer injects silence into a track's
|
||||
buffer while that track is actively producing audio, preventing mid-utterance
|
||||
interruptions in the recorded output.
|
||||
(PR [#4135](https://github.com/pipecat-ai/pipecat/pull/4135))
|
||||
|
||||
- Fixed websocket TTS word timestamps so interrupted contexts cannot leak stale
|
||||
words or backward PTS values into later turns.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Fixed a race condition in `InterruptibleTTSService` where, if `run_tts` had
|
||||
been invoked but `BotStartedSpeakingFrame` had not yet been received, a user
|
||||
interruption could allow stale audio to leak through.
|
||||
(PR [#4145](https://github.com/pipecat-ai/pipecat/pull/4145))
|
||||
|
||||
- Fixed Gemini Live local VAD mode (`GeminiVADParams(disabled=True)` with
|
||||
external VAD) not working. The bot now correctly detects user speech and
|
||||
signals turn boundaries to the Gemini API.
|
||||
(PR [#4146](https://github.com/pipecat-ai/pipecat/pull/4146))
|
||||
|
||||
- Fixed Gemini Live message handling to process all `server_content` fields
|
||||
independently. Gemini 3.x can bundle multiple fields (e.g. `model_turn` and
|
||||
`output_transcription`) on the same message, but the previous `elif` chain
|
||||
only processed the first match, silently dropping the rest.
|
||||
(PR [#4147](https://github.com/pipecat-ai/pipecat/pull/4147))
|
||||
|
||||
- Fixed `ServiceSwitcher` with `ServiceSwitcherStrategyFailover` incorrectly
|
||||
triggering failover when `ErrorFrame`s from other pipeline stages (e.g. TTS)
|
||||
propagated upstream through the switcher. Previously, any non-fatal error
|
||||
passing through would be misattributed to the active service and trigger an
|
||||
unwanted service switch. Now only errors originating from the switcher's own
|
||||
managed services trigger failover.
|
||||
(PR [#4149](https://github.com/pipecat-ai/pipecat/pull/4149))
|
||||
|
||||
- Fixed `LiveKitOutputTransport` not clearing the `rtc.AudioSource` internal
|
||||
buffer on interruption, causing the bot to continue speaking for several
|
||||
seconds after being interrupted.
|
||||
(PR [#4151](https://github.com/pipecat-ai/pipecat/pull/4151))
|
||||
|
||||
- Fixed a crash in OpenAI LLM processing when the provider returns
|
||||
`chunk.choices[0].delta.audio = None`, which caused `'NoneType' object has no
|
||||
attribute 'get'` errors during audio transcript handling.
|
||||
(PR [#4152](https://github.com/pipecat-ai/pipecat/pull/4152))
|
||||
|
||||
- Fixed error floods in `DeepgramSTTService` when the WebSocket connection
|
||||
drops. With Deepgram SDK 6.x, `send_media()` raises exceptions on a dead
|
||||
connection instead of silently failing, causing every queued audio frame to
|
||||
log an error. Now `send_media()` failures are caught gracefully — a single
|
||||
warning is logged and audio frames are skipped until the existing
|
||||
reconnection logic restores the connection.
|
||||
(PR [#4153](https://github.com/pipecat-ai/pipecat/pull/4153))
|
||||
|
||||
- `Mem0MemoryService` no longer blocks the event loop during memory storage and
|
||||
retrieval. All Mem0 API calls now run in a background thread, and message
|
||||
storage is fire-and-forget so it doesn't delay downstream processing.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
- Fixed `Mem0MemoryService` failing to store messages when the context
|
||||
contained system or developer role messages. The Mem0 API only accepts user
|
||||
and assistant roles, so other roles are now filtered out before storing.
|
||||
(PR [#4156](https://github.com/pipecat-ai/pipecat/pull/4156))
|
||||
|
||||
- Added missing `on_dtmf_event` callback to `LemonSliceTransportClient.setup()`
|
||||
`DailyCallbacks` construction, fixing a `ValidationError` at pipeline setup
|
||||
time.
|
||||
(PR [#4161](https://github.com/pipecat-ai/pipecat/pull/4161))
|
||||
|
||||
- Fixed an issue in `InworldTTSService` where, in cases of fast interruption,
|
||||
we would continue receiving audio from the previous context.
|
||||
(PR [#4167](https://github.com/pipecat-ai/pipecat/pull/4167))
|
||||
|
||||
- Fixed a word timestamp interleaving issue in `InworldTTSService` when
|
||||
processing multiple sentences.
|
||||
(PR [#4167](https://github.com/pipecat-ai/pipecat/pull/4167))
|
||||
|
||||
- Fixed duplicate `TTSStoppedFrame` being pushed in TTS services using
|
||||
`push_stop_frames=True`. When the stop-frame timeout fired, a second
|
||||
`TTSStoppedFrame` could be pushed after the normal one at context completion.
|
||||
(PR [#4172](https://github.com/pipecat-ai/pipecat/pull/4172))
|
||||
|
||||
- ⚠️ Fixed `DeepgramSTTService` compatibility with deepgram-sdk 6.1.0. The SDK
|
||||
now requires explicit message objects for `send_keep_alive()`,
|
||||
`send_close_stream()`, and `send_finalize()`. The minimum deepgram-sdk
|
||||
version is now 6.1.0.
|
||||
(PR [#4174](https://github.com/pipecat-ai/pipecat/pull/4174))
|
||||
|
||||
- Fixed RTVI events not being delivered to clients when using WebSocket
|
||||
transports. `ProtobufFrameSerializer` now sets `ignore_rtvi_messages=False`
|
||||
by default.
|
||||
(PR [#4176](https://github.com/pipecat-ai/pipecat/pull/4176))
|
||||
|
||||
- Fixed a timing issue where turn detection timer tasks (idle controller,
|
||||
speech timeout, turn analyzer, and turn completion) could miss their first
|
||||
tick because the newly created asyncio task was not yet scheduled when the
|
||||
caller continued.
|
||||
(PR [#4183](https://github.com/pipecat-ai/pipecat/pull/4183))
|
||||
|
||||
- Fixed `FastAPIWebsocketTransport` intermittently hanging on shutdown when the
|
||||
remote side (e.g. Twilio) disconnects while audio is being sent. A race
|
||||
condition between the send and receive paths could cause the
|
||||
`on_client_disconnected` callback to be skipped, leaving the pipeline waiting
|
||||
for a disconnect signal that never came.
|
||||
(PR [#4186](https://github.com/pipecat-ai/pipecat/pull/4186))
|
||||
|
||||
### Performance
|
||||
|
||||
- `RimeTTSService` now handles Rime's `done` WebSocket message to complete
|
||||
audio contexts immediately, eliminating the 3-second idle timeout that
|
||||
previously added latency at the end of each utterance.
|
||||
(PR [#4172](https://github.com/pipecat-ai/pipecat/pull/4172))
|
||||
|
||||
## [0.0.107] - 2026-03-23
|
||||
|
||||
### Added
|
||||
|
||||
- Added `frame_order` parameter to `SyncParallelPipeline`. Set
|
||||
`frame_order=FrameOrder.PIPELINE` to push synchronized output frames in
|
||||
pipeline definition order (all frames from the first pipeline, then the
|
||||
second, etc.) instead of the default arrival order.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Added `sync_with_audio` field to `OutputImageRawFrame`. When set to `True`,
|
||||
the output transport queues image frames with audio so they are displayed
|
||||
only after all preceding audio has been sent, enabling synchronized
|
||||
audio/image playback.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Added `OpenAIResponsesLLMService`, a new LLM service that uses the OpenAI
|
||||
Responses API. Supports streaming text, function calling, usage metrics, and
|
||||
out-of-band inference. Works with the universal `LLMContext` and
|
||||
`LLMContextAggregatorPair`. See
|
||||
`examples/foundational/07-interruptible-openai-responses.py` and
|
||||
`14-function-calling-openai-responses.py`.
|
||||
(PR [#4074](https://github.com/pipecat-ai/pipecat/pull/4074))
|
||||
|
||||
- Added `audio_out_auto_silence` parameter to `TransportParams` (defaults to
|
||||
`True`). When set to `False`, the transport waits for audio data instead of
|
||||
inserting silence when the output queue is empty, which is useful for
|
||||
scenarios that require uninterrupted audio playback without artificial gaps.
|
||||
(PR [#4104](https://github.com/pipecat-ai/pipecat/pull/4104))
|
||||
|
||||
### Changed
|
||||
|
||||
- Renamed tracing span attributes to align with OpenTelemetry GenAI semantic
|
||||
conventions: `gen_ai.system` to `gen_ai.provider.name`, `system` to
|
||||
`gen_ai.system_instructions`, `gen_ai.usage.cache_read_input_tokens` to
|
||||
`gen_ai.usage.cache_read.input_tokens`, and
|
||||
`gen_ai.usage.cache_creation_input_tokens` to
|
||||
`gen_ai.usage.cache_creation.input_tokens`.
|
||||
(PR [#3449](https://github.com/pipecat-ai/pipecat/pull/3449))
|
||||
|
||||
- `DeepgramSageMakerTTSService` now correctly routes audio through the base
|
||||
`TTSService` audio context queue. Audio frames are delivered via
|
||||
`append_to_audio_context()` instead of being pushed directly, enabling proper
|
||||
ordering, interruption handling, and start/stop frame lifecycle management.
|
||||
Interruptions now trigger a `Clear` message to Deepgram (flushing its text
|
||||
buffer) at the right time via `on_audio_context_interrupted`.
|
||||
(PR [#4083](https://github.com/pipecat-ai/pipecat/pull/4083))
|
||||
|
||||
- `GradiumTTSService` now sends a per-context `setup` message with
|
||||
`client_req_id` before the first text message for each TTS context, following
|
||||
Gradium's multiplexing protocol. Previously, a single setup message was sent
|
||||
at connection time without a `client_req_id`, which prevented Gradium from
|
||||
associating requests with their sessions when using `close_ws_on_eos=False`.
|
||||
(PR [#4091](https://github.com/pipecat-ai/pipecat/pull/4091))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed stale `system_instruction` in LLM tracing spans by reading from
|
||||
`_settings.system_instruction` instead of the removed `_system_instruction`
|
||||
attribute.
|
||||
(PR [#3449](https://github.com/pipecat-ai/pipecat/pull/3449))
|
||||
|
||||
- Fixed `SyncParallelPipeline` breaking the Whisker debugger.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Fixed `SyncParallelPipeline` race condition where concurrent SystemFrame
|
||||
processing (e.g. from RTVI) could corrupt sink queues and cause deadlocks.
|
||||
SystemFrames now take a fast path that passes them through without draining
|
||||
queued output.
|
||||
(PR [#4029](https://github.com/pipecat-ai/pipecat/pull/4029))
|
||||
|
||||
- Fixed TTS frame ordering so that non-system frames always arrive in correct
|
||||
order relative to the `TTSStartedFrame`/`TTSAudioRawFrame`/`TTSStoppedFrame`
|
||||
sequence. Previously these frames could race ahead of or behind audio context
|
||||
frames, producing out-of-order output downstream.
|
||||
(PR [#4075](https://github.com/pipecat-ai/pipecat/pull/4075))
|
||||
|
||||
- Fixed `SarvamTTSService` audio and error frames to route through
|
||||
`append_to_audio_context()` instead of `push_frame()`, ensuring correct
|
||||
behavior with audio contexts and interruptions.
|
||||
(PR [#4082](https://github.com/pipecat-ai/pipecat/pull/4082))
|
||||
|
||||
- Fixed audio frame ordering and interruption handling in Fish Audio, LMNT,
|
||||
Neuphonic, and Rime NonJson TTS services. These services were bypassing the
|
||||
base `TTSService` audio context serialization queue by pushing audio frames
|
||||
directly, which could cause out-of-order frames and broken interruptions
|
||||
during speech.
|
||||
(PR [#4090](https://github.com/pipecat-ai/pipecat/pull/4090))
|
||||
|
||||
- Fixed Genesys AudioHook serializer to always include the `parameters` field in
|
||||
protocol messages. The AudioHook protocol requires every message to carry a
|
||||
`parameters` object (even if empty), but `_create_message` omitted it when no
|
||||
parameters were provided. This caused clients that validate message structure
|
||||
(including the Genesys reference implementation) to reject `pong` and
|
||||
parameter-less `closed` responses, breaking server sequence tracking and
|
||||
preventing `outputVariables` from reaching the Architect flow.
|
||||
(PR [#4093](https://github.com/pipecat-ai/pipecat/pull/4093))
|
||||
|
||||
## [0.0.106] - 2026-03-18
|
||||
|
||||
### Added
|
||||
|
||||
- Added optional `service` field to `ServiceUpdateSettingsFrame` (and its
|
||||
subclasses `LLMUpdateSettingsFrame`, `TTSUpdateSettingsFrame`,
|
||||
`STTUpdateSettingsFrame`) to target a specific service instance. When
|
||||
`service` is set, only the matching service applies the settings; others
|
||||
forward the frame unchanged. This enables updating a single service when
|
||||
multiple services of the same type exist in the pipeline.
|
||||
(PR [#4004](https://github.com/pipecat-ai/pipecat/pull/4004))
|
||||
|
||||
- Added `sip_provider` and `room_geo` parameters to `configure()` in the Daily
|
||||
runner. These convenience parameters let callers specify a SIP provider name
|
||||
and geographic region directly without manually constructing
|
||||
`DailyRoomProperties` and `DailyRoomSipParams`.
|
||||
(PR [#4005](https://github.com/pipecat-ai/pipecat/pull/4005))
|
||||
|
||||
- Added `PerplexityLLMAdapter` that automatically transforms conversation
|
||||
messages to satisfy Perplexity's stricter API constraints (strict role
|
||||
alternation, no non-initial system messages, last message must be user/tool).
|
||||
Previously, certain conversation histories could cause Perplexity API errors
|
||||
that didn't occur with OpenAI (`PerplexityLLMService` subclasses
|
||||
`OpenAILLMService` since Perplexity uses an OpenAI-compatible API).
|
||||
(PR [#4009](https://github.com/pipecat-ai/pipecat/pull/4009))
|
||||
|
||||
- Added DTMF input event support to the Daily transport. Incoming DTMF tones
|
||||
are now received via Daily's `on_dtmf_event` callback and pushed into the
|
||||
pipeline as `InputDTMFFrame`, enabling bots to react to keypad presses from
|
||||
phone callers.
|
||||
(PR [#4047](https://github.com/pipecat-ai/pipecat/pull/4047))
|
||||
|
||||
- Added `WakePhraseUserTurnStartStrategy` for triggering user turns based on
|
||||
wake phrases, with support for `single_activation` mode. Deprecates
|
||||
`WakeCheckFilter`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- Added `default_user_turn_start_strategies()` and
|
||||
`default_user_turn_stop_strategies()` helper functions for composing custom
|
||||
strategy lists.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
### Changed
|
||||
|
||||
- Changed tool result JSON serialization to use `ensure_ascii=False`,
|
||||
preserving UTF-8 characters instead of escaping them. This reduces context
|
||||
size and token usage for non-English languages.
|
||||
(PR [#3457](https://github.com/pipecat-ai/pipecat/pull/3457))
|
||||
|
||||
- `OpenAIRealtimeSTTService`'s `noise_reduction` parameter is now part of
|
||||
`OpenAIRealtimeSTTSettings`, making it runtime-updatable via
|
||||
`STTUpdateSettingsFrame`. The direct `noise_reduction` init argument is
|
||||
deprecated as of 0.0.106.
|
||||
(PR [#3991](https://github.com/pipecat-ai/pipecat/pull/3991))
|
||||
|
||||
- Updated `sarvamai` dependency from `0.1.26a2` (alpha) to `0.1.26` (stable
|
||||
release).
|
||||
(PR [#3997](https://github.com/pipecat-ai/pipecat/pull/3997))
|
||||
|
||||
- `SimliVideoService` now extends `AIService` instead of `FrameProcessor`,
|
||||
aligning it with the HeyGen and Tavus video services. It supports
|
||||
`SimliVideoService.Settings(...)` for configuration and uses
|
||||
`start()`/`stop()`/`cancel()` lifecycle methods. Existing constructor usage
|
||||
(`api_key`, `face_id`, etc.) remains unchanged.
|
||||
(PR [#4001](https://github.com/pipecat-ai/pipecat/pull/4001))
|
||||
|
||||
- Updated `pipecat-ai-small-webrtc-prebuilt` to `2.4.0`.
|
||||
(PR [#4023](https://github.com/pipecat-ai/pipecat/pull/4023))
|
||||
|
||||
- Nova Sonic assistant text transcripts are now delivered in real-time using
|
||||
speculative text events instead of delayed final text events. Previously,
|
||||
assistant text only arrived after all audio had finished playing, causing
|
||||
laggy transcripts in client UIs. Speculative text arrives before each audio
|
||||
chunk, providing text synchronized with what the bot is saying. This also
|
||||
simplifies the internal text handling by removing the interruption re-push
|
||||
hack and assistant text buffer.
|
||||
(PR [#4042](https://github.com/pipecat-ai/pipecat/pull/4042))
|
||||
|
||||
- Updated `daily-python` dependency to 0.25.0.
|
||||
(PR [#4047](https://github.com/pipecat-ai/pipecat/pull/4047))
|
||||
|
||||
- Added `enable_dialout` parameter to `configure()` in `pipecat.runner.daily`
|
||||
to support dial-out rooms. Also narrowed misleading `Optional` type hints and
|
||||
deduplicated token expiry calculation.
|
||||
(PR [#4048](https://github.com/pipecat-ai/pipecat/pull/4048))
|
||||
|
||||
- Extended `ProcessFrameResult` to stop strategies, allowing a stop strategy to
|
||||
short-circuit evaluation of subsequent strategies by returning `STOP`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- `GradiumSTTService` now takes both an `encoding` and `sample_rate`
|
||||
constructor argument which is assembled in the class to form the
|
||||
`input_format`. PCM accepts `8000`, `16000`, and `24000` Hz sample rates.
|
||||
(PR [#4066](https://github.com/pipecat-ai/pipecat/pull/4066))
|
||||
|
||||
- Improved `GradiumSTTService` transcription accuracy by reworking how text
|
||||
fragments are accumulated and finalized. Previously, trailing words could be
|
||||
dropped when the server's `flushed` response arrived before all text tokens
|
||||
were delivered. The service now uses a short aggregation delay after flush to
|
||||
capture trailing tokens, producing complete utterances.
|
||||
(PR [#4066](https://github.com/pipecat-ai/pipecat/pull/4066))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `SimliVideoService.InputParams` is deprecated. Use the direct constructor
|
||||
parameters `max_session_length`, `max_idle_time`, and `enable_logging`
|
||||
instead.
|
||||
(PR [#4001](https://github.com/pipecat-ai/pipecat/pull/4001))
|
||||
|
||||
- Deprecated `LocalSmartTurnAnalyzerV2` and `LocalCoreMLSmartTurnAnalyzer`. Use
|
||||
`LocalSmartTurnAnalyzerV3` instead. Instantiating these analyzers will now
|
||||
emit a `DeprecationWarning`.
|
||||
(PR [#4012](https://github.com/pipecat-ai/pipecat/pull/4012))
|
||||
|
||||
- Deprecated `WakeCheckFilter` in favor of `WakePhraseUserTurnStartStrategy`.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where the default model for `OpenAILLMService` and
|
||||
`AzureLLMService` was mistakenly reverted to `gpt-4o`. The defaults are now
|
||||
restored to `gpt-4.1`.
|
||||
(PR [#4000](https://github.com/pipecat-ai/pipecat/pull/4000))
|
||||
|
||||
- Fixed a race condition where `EndTaskFrame` could cause the pipeline to shut
|
||||
down before in-flight frames (e.g. LLM function call responses) finished
|
||||
processing. `EndTaskFrame` and `StopTaskFrame` now flow through the pipeline
|
||||
as `ControlFrame`s, ensuring all pending work is flushed before shutdown
|
||||
begins. `CancelTaskFrame` and `InterruptionTaskFrame` remain immediate
|
||||
(`SystemFrame`).
|
||||
(PR [#4006](https://github.com/pipecat-ai/pipecat/pull/4006))
|
||||
|
||||
- Fixed `ParallelPipeline` dropping or misordering frames during lifecycle
|
||||
synchronization. Buffered frames are now flushed in the correct order
|
||||
relative to synchronization frames (`StartFrame` goes first,
|
||||
`EndFrame`/`CancelFrame` go after), and frames added to the buffer during
|
||||
flush are also drained.
|
||||
(PR [#4007](https://github.com/pipecat-ai/pipecat/pull/4007))
|
||||
|
||||
- Fixed `TTSService` potentially canceling in-flight audio during shutdown. The
|
||||
stop sequence now waits for all queued audio contexts to finish processing
|
||||
before canceling the stop frame task.
|
||||
(PR [#4007](https://github.com/pipecat-ai/pipecat/pull/4007))
|
||||
|
||||
- Fixed `Language` enum values (e.g. `Language.ES`) not being converted to
|
||||
service-specific codes when passed via
|
||||
`settings=Service.Settings(language=Language.ES)` at init time. This caused
|
||||
API errors (e.g. 400 from Rime) because the raw enum was sent instead of the
|
||||
expected language code (e.g. `"spa"`). Runtime updates via
|
||||
`UpdateSettingsFrame` were unaffected. The fix centralizes conversion in the
|
||||
base `TTSService` and `STTService` classes so all services handle this
|
||||
consistently.
|
||||
(PR [#4024](https://github.com/pipecat-ai/pipecat/pull/4024))
|
||||
|
||||
- Fixed `DeepgramSTTService` ignoring the `base_url` scheme when using `ws://`
|
||||
or `http://`. Previously these were silently overwritten with `wss://` /
|
||||
`https://`, breaking air-gapped or private deployments that don't use TLS.
|
||||
All scheme choices (`wss://`, `https://`, `ws://`, `http://`, or bare
|
||||
hostname) are now respected.
|
||||
(PR [#4026](https://github.com/pipecat-ai/pipecat/pull/4026))
|
||||
|
||||
- Fixed `LLMSwitcher.register_function()` and `register_direct_function()` not
|
||||
accepting or forwarding the `timeout_secs` parameter.
|
||||
(PR [#4037](https://github.com/pipecat-ai/pipecat/pull/4037))
|
||||
|
||||
- Fixed empty user transcriptions in Nova Sonic causing spurious interruptions.
|
||||
Previously, an empty transcription could trigger an interruption of the
|
||||
assistant's response even though the user hadn't actually spoken.
|
||||
(PR [#4042](https://github.com/pipecat-ai/pipecat/pull/4042))
|
||||
|
||||
- Fixed `SonioxSTTService` and `OpenAIRealtimeSTTService` crash when language
|
||||
parameters contain plain strings instead of `Language` enum values.
|
||||
(PR [#4046](https://github.com/pipecat-ai/pipecat/pull/4046))
|
||||
|
||||
- Fixed premature user turn stops caused by late transcriptions arriving
|
||||
between turns. A stale transcript from the previous turn could persist into
|
||||
the next turn and trigger a stop before the current turn's real transcript
|
||||
arrived. Stop strategies are now reset at both turn start and turn stop to
|
||||
prevent state from leaking across turn boundaries.
|
||||
(PR [#4057](https://github.com/pipecat-ai/pipecat/pull/4057))
|
||||
|
||||
- Fixed raw language strings like `"de-DE"` silently failing when passed to
|
||||
TTS/STT services (e.g. ElevenLabs producing no audio). Raw strings now go
|
||||
through the same `Language` enum resolution as enum values, so regional codes
|
||||
like `"de-DE"` are properly converted to service-expected formats like
|
||||
`"de"`. Unrecognized strings log a warning instead of failing silently.
|
||||
(PR [#4058](https://github.com/pipecat-ai/pipecat/pull/4058))
|
||||
|
||||
- Fixed Deepgram STT list-type settings (`keyterm`, `keywords`, `search`,
|
||||
`redact`, `replace`) being stringified instead of passed as lists to the SDK,
|
||||
which caused them to be sent as literal strings (e.g. `"['pipecat']"`) in the
|
||||
WebSocket query params.
|
||||
(PR [#4063](https://github.com/pipecat-ai/pipecat/pull/4063))
|
||||
|
||||
- Fixed `MinWordsUserTurnStartStrategy` including text below the word threshold
|
||||
in the output by resetting aggregation when the minimum word count is not
|
||||
met.
|
||||
(PR [#4064](https://github.com/pipecat-ai/pipecat/pull/4064))
|
||||
|
||||
- Fixed audio overlap and potential dropped TTS content when multiple assistant
|
||||
turns occur in quick succession. `TTSService` now flushes remaining text
|
||||
before pausing frame processing on `LLMFullResponseEndFrame`/`EndFrame`,
|
||||
instead of pausing first.
|
||||
(PR [#4071](https://github.com/pipecat-ai/pipecat/pull/4071))
|
||||
|
||||
### Security
|
||||
|
||||
- Bumped PyJWT minimum version from 2.10.1 to 2.12.0 in the `livekit` extra to
|
||||
address CVE-2026-32597 (GHSA-752w-5fwx-jx9f), where PyJWT <= 2.11.0 accepted
|
||||
unknown `crit` header extensions.
|
||||
(PR [#4035](https://github.com/pipecat-ai/pipecat/pull/4035))
|
||||
|
||||
## [0.0.105] - 2026-03-10
|
||||
|
||||
### Added
|
||||
|
||||
@@ -10,7 +10,7 @@ Pipecat is an open-source Python framework for building real-time voice and mult
|
||||
|
||||
```bash
|
||||
# Setup development environment
|
||||
uv sync --group dev --all-extras --no-extra gstreamer
|
||||
uv sync --group dev --all-extras --no-extra gstreamer --no-extra krisp
|
||||
|
||||
# Install pre-commit hooks
|
||||
uv run pre-commit install
|
||||
|
||||
@@ -23,7 +23,7 @@ Create your integration following the patterns and examples shown in the "Integr
|
||||
Your repository must contain these components:
|
||||
|
||||
- **Source code** - Complete implementation following Pipecat patterns
|
||||
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples))
|
||||
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
|
||||
- **README.md** - Must include:
|
||||
- Introduction and explanation of your integration
|
||||
- Installation instructions
|
||||
@@ -65,25 +65,12 @@ Once your PR is submitted, post in the `#community-integrations` Discord channel
|
||||
|
||||
#### Websocket-based Services
|
||||
|
||||
**Base class:** `WebsocketSTTService`
|
||||
|
||||
**Use for:** Services where you manage the websocket connection directly. Combines `STTService` with `WebsocketService` for automatic reconnection and keepalive support.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [CartesiaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/stt.py)
|
||||
- [ElevenLabsRealtimeSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/stt.py)
|
||||
|
||||
#### SDK-based Streaming Services
|
||||
|
||||
**Base class:** `STTService`
|
||||
|
||||
**Use for:** Streaming services where the provider's Python SDK manages the connection internally.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||
- [GoogleSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/stt.py)
|
||||
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||
|
||||
#### File-based Services
|
||||
|
||||
@@ -121,59 +108,55 @@ Once your PR is submitted, post in the `#community-integrations` Discord channel
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- **`_process_context(self, context: LLMContext)`** — The main method that processes an LLM context and generates a response. Each LLM service overrides `process_frame` to extract context from `LLMContextFrame` and calls `_process_context`.
|
||||
|
||||
- **`adapter_class`** — Class attribute pointing to a `BaseLLMAdapter` subclass. Defaults to `OpenAILLMAdapter`. Non-OpenAI services must implement their own adapter (see `src/pipecat/adapters/base_llm_adapter.py`) with methods:
|
||||
- `get_llm_invocation_params(context)` — Extract provider-specific params from universal context
|
||||
- `to_provider_tools_format(tools_schema)` — Convert standard tools to provider format
|
||||
- `get_messages_for_logging(context)` — Format messages for logging
|
||||
- Reference adapters: `src/pipecat/adapters/services/` (anthropic, gemini, bedrock, etc.)
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
- `LLMFullResponseStartFrame` — Signals the start of an LLM response
|
||||
- `LLMTextFrame` — Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` — Signals the end of an LLM response
|
||||
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||
|
||||
- **Thought frames (reasoning models):** If the model supports extended thinking / chain-of-thought, emit thought frames alongside the response:
|
||||
- `LLMThoughtStartFrame` — Signals the start of a thought
|
||||
- `LLMThoughtTextFrame` — Contains thought content, streamed as tokens
|
||||
- `LLMThoughtEndFrame` — Signals the end of a thought
|
||||
|
||||
- **Context aggregation** is handled by the framework via `LLMContext` + `LLMContextAggregatorPair`. The LLM service just processes context it receives — no need to implement aggregators.
|
||||
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||
- Context must adhere to the `LLMContext` universal format
|
||||
- Aggregators should handle adding messages, function calls, and images to the context
|
||||
|
||||
### TTS (Text-to-Speech) Services
|
||||
|
||||
#### WebsocketTTSService
|
||||
#### AudioContextWordTTSService
|
||||
|
||||
**Use for:** Websocket-based streaming services (with or without word timestamps)
|
||||
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||
|
||||
**Examples:**
|
||||
**Example:**
|
||||
|
||||
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||
- [ElevenLabsTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### InterruptibleTTSService
|
||||
|
||||
**Use for:** Websocket-based services without word timestamps that reconnect on interruption (e.g. don't support a context ID or interruption message)
|
||||
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||
|
||||
**Example:**
|
||||
|
||||
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||
|
||||
#### WordTTSService
|
||||
|
||||
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### TTSService
|
||||
|
||||
**Use for:** HTTP-based services (word timestamps are supported in the base class)
|
||||
**Use for:** HTTP-based services without word/timestamp alignment
|
||||
|
||||
**Examples:**
|
||||
**Example:**
|
||||
|
||||
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||
- [OpenAITTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/openai/tts.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- For websocket services, use asyncio WebSocket implementation
|
||||
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
|
||||
- Handle idle service timeouts with keepalives
|
||||
- TTS services push both audio (`TTSAudioRawFrame`) and text (`TTSTextFrame`) frames
|
||||
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
|
||||
|
||||
### Telephony Serializers
|
||||
|
||||
@@ -217,25 +200,14 @@ Vision services process images and provide analysis such as descriptions, object
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_vision` method that takes a `UserImageRawFrame` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the image frame and yields frames with analysis results
|
||||
- Must yield the frame sequence: `VisionFullResponseStartFrame`, `VisionTextFrame`, `VisionFullResponseEndFrame`
|
||||
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the latest image in the context and yields frames with analysis results
|
||||
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Naming Conventions
|
||||
|
||||
#### Package and Repository Naming
|
||||
|
||||
Use the `pipecat-{vendor}` naming convention for your PyPI package and repository:
|
||||
|
||||
- `pipecat-{vendor}` — for single-service integrations (e.g., `pipecat-deepdub`)
|
||||
- `pipecat-{vendor}-{type}` — when a vendor offers multiple service types (e.g., `pipecat-upliftai-stt`, `pipecat-upliftai-tts`)
|
||||
|
||||
This convention makes community packages easily discoverable via PyPI search and clearly identifies them as part of the Pipecat ecosystem.
|
||||
|
||||
#### Class Naming
|
||||
|
||||
- **STT:** `VendorSTTService`
|
||||
- **LLM:** `VendorLLMService`
|
||||
- **TTS:**
|
||||
@@ -409,7 +381,7 @@ Note that `self.sample_rate` is a `@property` set in the TTSService base class,
|
||||
|
||||
Use Pipecat's tracing decorators:
|
||||
|
||||
- **STT:** `@traced_stt` - decorate `_handle_transcription(self, transcript, is_final, language)` (the standard method name convention)
|
||||
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||
|
||||
@@ -417,9 +389,8 @@ Use Pipecat's tracing decorators:
|
||||
|
||||
### Packaging and Distribution
|
||||
|
||||
- Name your package `pipecat-{vendor}` (see [Naming Conventions](#naming-conventions))
|
||||
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
|
||||
- Publish to PyPI for easier installation
|
||||
- Consider releasing to PyPI for easier installation
|
||||
- Follow semantic versioning principles
|
||||
- Maintain a changelog
|
||||
|
||||
@@ -432,15 +403,17 @@ For REST-based communication, use aiohttp. Pipecat includes this as a required d
|
||||
- Wrap API calls in appropriate try/except blocks
|
||||
- Handle rate limits and network failures gracefully
|
||||
- Provide meaningful error messages
|
||||
- When errors occur, raise exceptions AND push errors to notify the pipeline:
|
||||
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||
|
||||
```python
|
||||
from pipecat.frames.frames import ErrorFrame
|
||||
|
||||
try:
|
||||
# Your API call
|
||||
result = await self._make_api_call()
|
||||
except Exception as e:
|
||||
# Push error upstream to notify the pipeline
|
||||
await self.push_error(f"{self} error: {e}", exception=e)
|
||||
# Push error frame to pipeline
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Raise or handle as appropriate
|
||||
raise
|
||||
```
|
||||
|
||||
38
README.md
38
README.md
@@ -8,7 +8,7 @@
|
||||
|
||||
**Pipecat** is an open-source Python framework for building real-time voice and multimodal conversational agents. Orchestrate audio and video, AI services, different transports, and conversation pipelines effortlessly—so you can focus on what makes your agent unique.
|
||||
|
||||
> Want to dive right in? Run `pipecat init quickstart` or follow the [quickstart guide](https://docs.pipecat.ai/getting-started/quickstart).
|
||||
> Want to dive right in? Try the [quickstart](https://docs.pipecat.ai/getting-started/quickstart).
|
||||
|
||||
## 🚀 What You Can Build
|
||||
|
||||
@@ -65,10 +65,6 @@ claude plugin marketplace add pipecat-ai/skills
|
||||
|
||||
and install any of the available plugins.
|
||||
|
||||
### 🧩 Community Integrations
|
||||
|
||||
Build and share your own Pipecat service integrations! Browse existing [community integrations](https://docs.pipecat.ai/server/services/community-integrations) or check out our [guide](COMMUNITY_INTEGRATIONS.md) to create your own.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||
@@ -80,25 +76,24 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
|
||||
<br/>
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/vision/vision-moondream.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/assets/moondream.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/12-describe-video.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/assets/moondream.png" width="400" /></a>
|
||||
</p>
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Nebius](https://docs.pipecat.ai/server/services/llm/nebius), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [Kokoro](https://docs.pipecat.ai/server/services/tts/kokoro), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [LiveKit (WebRTC)](https://docs.pipecat.ai/server/services/transport/livekit), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), [WhatsApp](https://docs.pipecat.ai/server/services/transport/whatsapp), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/services/serializers/exotel), [Genesys](https://docs.pipecat.ai/server/services/serializers/genesys), [Plivo](https://docs.pipecat.ai/server/services/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/services/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/services/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/services/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/transport/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp Viva](https://docs.pipecat.ai/guides/features/krisp-viva), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter), [RNNoise](https://docs.pipecat.ai/server/utilities/audio/rnnoise-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) |
|
||||
| Category | Services |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
@@ -142,7 +137,7 @@ You can get started with Pipecat running on your local machine, then move your a
|
||||
|
||||
## 🧪 Code examples
|
||||
|
||||
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples) — small snippets that build on each other, introducing one or two concepts at a time
|
||||
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
|
||||
- [Example apps](https://github.com/pipecat-ai/pipecat-examples) — complete applications that you can use as starting points for development
|
||||
|
||||
## 🛠️ Contributing to the framework
|
||||
@@ -166,6 +161,7 @@ You can get started with Pipecat running on your local machine, then move your a
|
||||
```bash
|
||||
uv sync --group dev --all-extras \
|
||||
--no-extra gstreamer \
|
||||
--no-extra krisp \
|
||||
--no-extra local
|
||||
```
|
||||
|
||||
|
||||
1
changelog/3457.changed.md
Normal file
1
changelog/3457.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Changed tool result JSON serialization to use `ensure_ascii=False`, preserving UTF-8 characters instead of escaping them. This reduces context size and token usage for non-English languages.
|
||||
1
changelog/3991.changed.md
Normal file
1
changelog/3991.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- `OpenAIRealtimeSTTService`'s `noise_reduction` parameter is now part of `OpenAIRealtimeSTTSettings`, making it runtime-updatable via `STTUpdateSettingsFrame`. The direct `noise_reduction` init argument is deprecated as of 0.0.106.
|
||||
1
changelog/3997.changed.md
Normal file
1
changelog/3997.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Updated `sarvamai` dependency from `0.1.26a2` (alpha) to `0.1.26` (stable release).
|
||||
1
changelog/4000.fixed.md
Normal file
1
changelog/4000.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed an issue where the default model for `OpenAILLMService` and `AzureLLMService` was mistakenly reverted to `gpt-4o`. The defaults are now restored to `gpt-4.1`.
|
||||
1
changelog/4001.changed.md
Normal file
1
changelog/4001.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- `SimliVideoService` now extends `AIService` instead of `FrameProcessor`, aligning it with the HeyGen and Tavus video services. It supports `SimliVideoService.Settings(...)` for configuration and uses `start()`/`stop()`/`cancel()` lifecycle methods. Existing constructor usage (`api_key`, `face_id`, etc.) remains unchanged.
|
||||
1
changelog/4001.deprecated.md
Normal file
1
changelog/4001.deprecated.md
Normal file
@@ -0,0 +1 @@
|
||||
- `SimliVideoService.InputParams` is deprecated. Use the direct constructor parameters `max_session_length`, `max_idle_time`, and `enable_logging` instead.
|
||||
1
changelog/4004.added.md
Normal file
1
changelog/4004.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added optional `service` field to `ServiceUpdateSettingsFrame` (and its subclasses `LLMUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `STTUpdateSettingsFrame`) to target a specific service instance. When `service` is set, only the matching service applies the settings; others forward the frame unchanged. This enables updating a single service when multiple services of the same type exist in the pipeline.
|
||||
1
changelog/4005.added.md
Normal file
1
changelog/4005.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `sip_provider` and `room_geo` parameters to `configure()` in the Daily runner. These convenience parameters let callers specify a SIP provider name and geographic region directly without manually constructing `DailyRoomProperties` and `DailyRoomSipParams`.
|
||||
1
changelog/4006.fixed.md
Normal file
1
changelog/4006.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed a race condition where `EndTaskFrame` could cause the pipeline to shut down before in-flight frames (e.g. LLM function call responses) finished processing. `EndTaskFrame` and `StopTaskFrame` now flow through the pipeline as `ControlFrame`s, ensuring all pending work is flushed before shutdown begins. `CancelTaskFrame` and `InterruptionTaskFrame` remain immediate (`SystemFrame`).
|
||||
1
changelog/4007.fixed.2.md
Normal file
1
changelog/4007.fixed.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `TTSService` potentially canceling in-flight audio during shutdown. The stop sequence now waits for all queued audio contexts to finish processing before canceling the stop frame task.
|
||||
1
changelog/4007.fixed.md
Normal file
1
changelog/4007.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `ParallelPipeline` dropping or misordering frames during lifecycle synchronization. Buffered frames are now flushed in the correct order relative to synchronization frames (`StartFrame` goes first, `EndFrame`/`CancelFrame` go after), and frames added to the buffer during flush are also drained.
|
||||
1
changelog/4009.added.md
Normal file
1
changelog/4009.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `PerplexityLLMAdapter` that automatically transforms conversation messages to satisfy Perplexity's stricter API constraints (strict role alternation, no non-initial system messages, last message must be user/tool). Previously, certain conversation histories could cause Perplexity API errors that didn't occur with OpenAI (`PerplexityLLMService` subclasses `OpenAILLMService` since Perplexity uses an OpenAI-compatible API).
|
||||
1
changelog/4012.deprecated.md
Normal file
1
changelog/4012.deprecated.md
Normal file
@@ -0,0 +1 @@
|
||||
- Deprecated `LocalSmartTurnAnalyzerV2` and `LocalCoreMLSmartTurnAnalyzer`. Use `LocalSmartTurnAnalyzerV3` instead. Instantiating these analyzers will now emit a `DeprecationWarning`.
|
||||
1
changelog/4023.changed.md
Normal file
1
changelog/4023.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Updated `pipecat-ai-small-webrtc-prebuilt` to `2.4.0`.
|
||||
1
changelog/4024.fixed.md
Normal file
1
changelog/4024.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `Language` enum values (e.g. `Language.ES`) not being converted to service-specific codes when passed via `settings=Service.Settings(language=Language.ES)` at init time. This caused API errors (e.g. 400 from Rime) because the raw enum was sent instead of the expected language code (e.g. `"spa"`). Runtime updates via `UpdateSettingsFrame` were unaffected. The fix centralizes conversion in the base `TTSService` and `STTService` classes so all services handle this consistently.
|
||||
1
changelog/4026.fixed.md
Normal file
1
changelog/4026.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `DeepgramSTTService` ignoring the `base_url` scheme when using `ws://` or `http://`. Previously these were silently overwritten with `wss://` / `https://`, breaking air-gapped or private deployments that don't use TLS. All scheme choices (`wss://`, `https://`, `ws://`, `http://`, or bare hostname) are now respected.
|
||||
1
changelog/4035.security.md
Normal file
1
changelog/4035.security.md
Normal file
@@ -0,0 +1 @@
|
||||
- Bumped PyJWT minimum version from 2.10.1 to 2.12.0 in the `livekit` extra to address CVE-2026-32597 (GHSA-752w-5fwx-jx9f), where PyJWT <= 2.11.0 accepted unknown `crit` header extensions.
|
||||
1
changelog/4037.fixed.md
Normal file
1
changelog/4037.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `LLMSwitcher.register_function()` and `register_direct_function()` not accepting or forwarding the `timeout_secs` parameter.
|
||||
1
changelog/4046.fixed.md
Normal file
1
changelog/4046.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `SonioxSTTService` and `OpenAIRealtimeSTTService` crash when language parameters contain plain strings instead of `Language` enum values.
|
||||
1
changelog/4047.added.md
Normal file
1
changelog/4047.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added DTMF input event support to the Daily transport. Incoming DTMF tones are now received via Daily's `on_dtmf_event` callback and pushed into the pipeline as `InputDTMFFrame`, enabling bots to react to keypad presses from phone callers.
|
||||
1
changelog/4047.changed.md
Normal file
1
changelog/4047.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Updated `daily-python` dependency to 0.25.0.
|
||||
1
changelog/4048.changed.md
Normal file
1
changelog/4048.changed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `enable_dialout` parameter to `configure()` in `pipecat.runner.daily` to support dial-out rooms. Also narrowed misleading `Optional` type hints and deduplicated token expiry calculation.
|
||||
1
changelog/4057.fixed.md
Normal file
1
changelog/4057.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed premature user turn stops caused by late transcriptions arriving between turns. A stale transcript from the previous turn could persist into the next turn and trigger a stop before the current turn's real transcript arrived. Stop strategies are now reset at both turn start and turn stop to prevent state from leaking across turn boundaries.
|
||||
1
changelog/4058.fixed.md
Normal file
1
changelog/4058.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed raw language strings like `"de-DE"` silently failing when passed to TTS/STT services (e.g. ElevenLabs producing no audio). Raw strings now go through the same `Language` enum resolution as enum values, so regional codes like `"de-DE"` are properly converted to service-expected formats like `"de"`. Unrecognized strings log a warning instead of failing silently.
|
||||
1
changelog/4063.fixed.md
Normal file
1
changelog/4063.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed Deepgram STT list-type settings (`keyterm`, `keywords`, `search`, `redact`, `replace`) being stringified instead of passed as lists to the SDK, which caused them to be sent as literal strings (e.g. `"['pipecat']"`) in the WebSocket query params.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Added WebSocket-based `OpenAIResponsesLLMService` as the new default for the OpenAI Responses API. It maintains a persistent connection to `wss://api.openai.com/v1/responses` and automatically uses `previous_response_id` to send only incremental context, falling back to full context on reconnection or cache miss. The previous HTTP-based implementation is now available as `OpenAIResponsesHttpLLMService`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `OpenPipeLLMService` and the `openpipe` extra. OpenPipe was acquired by CoreWeave and the package is no longer maintained. If you were using `openpipe` as an LLM provider, switch to the underlying provider directly (e.g. `openai`). The OpenPipe interface can still be used with `OpenAILLMService` by specifying a `base_url`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Updated `langchain` extra to require langchain 1.x (from 0.3.x), langchain-community 0.4.x (from 0.3.x), and langchain-openai 1.x (from 0.3.x). If you pin these packages in your project, update your pins accordingly.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `InworldHttpTTSService` streaming responses crashing with `UnicodeDecodeError` when multi-byte UTF-8 characters were split across chunk boundaries. This caused TTS audio to cut off mid-sentence intermittently.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed a crash (`JSONDecodeError`) when a user interruption occurs while the LLM is streaming function call arguments. Previously, the incomplete JSON arguments were passed directly to `json.loads()`, causing an unhandled exception. Affected services: OpenAI, Google (OpenAI-compatible), and SambaNova.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `observers` field from `PipelineParams`. Pass observers directly to `PipelineTask` constructor instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `on_pipeline_ended`, `on_pipeline_cancelled`, and `on_pipeline_stopped` events from `PipelineTask`. Use `on_pipeline_finished` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `AudioBufferProcessor.user_continuous_stream` parameter. Use `user_audio_passthrough` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `camera_in_enabled`, `camera_in_is_live`, `camera_in_width`, `camera_in_height`, `camera_out_enabled`, `camera_out_is_live`, `camera_out_width`, `camera_out_height`, and `camera_out_color` transport params. Use the `video_in_*` and `video_out_*` equivalents instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `RTVIObserver.errors_enabled` parameter.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `vad_enabled` and `vad_audio_passthrough` transport params.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `TTSService.say()`. Push a `TTSSpeakFrame` into the pipeline instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `DailyRunner.configure_with_args()`. Use `PipelineRunner` with `RunnerArguments` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated RTVI models, frames, and processor methods including `RTVIConfig`, `RTVIServiceConfig`, `RTVIServiceOptionConfig`, various `RTVI*Data` models, `RTVIActionFrame`, and `RTVIProcessor.handle_function_call`/`handle_function_call_start`. Use the updated RTVI processor API instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `FrameProcessor.wait_for_task()`. Use `create_task()` and manage tasks with the built-in `TaskManager` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `KrispFilter`. The `krisp` extra has been removed from `pyproject.toml`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `LLMService.request_image_frame()`. Push a `UserImageRequestFrame` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `create_default_resampler()` from `pipecat.audio.utils`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `FalSmartTurnAnalyzer` and `LocalSmartTurnAnalyzer`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated transport frames: `TransportMessageFrame`, `TransportMessageUrgentFrame`, `InputTransportMessageUrgentFrame`, `DailyTransportMessageFrame`, and `DailyTransportMessageUrgentFrame`. Use `OutputTransportMessageFrame`, `OutputTransportMessageUrgentFrame`, `InputTransportMessageFrame`, `DailyOutputTransportMessageFrame`, and `DailyOutputTransportMessageUrgentFrame` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `KeypadEntryFrame` alias.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated interruption frames: `StartInterruptionFrame` and `BotInterruptionFrame`. Use `InterruptionFrame` and `InterruptionTaskFrame` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `LLMService.start_callback` parameter. Register an `on_llm_response_start` event handler instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed single-argument function call support from `LLMService`. Functions must use named parameters instead of a single `arguments` parameter.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `NoisereduceFilter`. Use system-level noise reduction or a service-based alternative instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.riva` package. Use `pipecat.services.nvidia.stt` and `pipecat.services.nvidia.tts` instead (`RivaSTTService` → `NvidiaSTTService`, `RivaTTSService` → `NvidiaTTSService`).
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.nim` package. Use `pipecat.services.nvidia.llm` instead (`NimLLMService` → `NvidiaLLMService`).
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.gemini_multimodal_live` package. Use `pipecat.services.google.gemini_live` instead. Note that class names no longer include "Multimodal" (e.g. `GeminiMultimodalLiveLLMService` → `GeminiLiveLLMService`).
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.aws_nova_sonic` package. Use `pipecat.services.aws.nova_sonic` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.openai_realtime` package. Use `pipecat.services.openai.realtime` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `OpenAIRealtimeBetaLLMService` and `AzureRealtimeBetaLLMService`. Use `OpenAIRealtimeLLMService` and `AzureRealtimeLLMService` from `pipecat.services.openai.realtime` and `pipecat.services.azure.realtime` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.deepgram.stt_sagemaker` and `pipecat.services.deepgram.tts_sagemaker` modules. Use `pipecat.services.deepgram.sagemaker.stt` and `pipecat.services.deepgram.sagemaker.tts` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `GoogleLLMOpenAIBetaService` from `pipecat.services.google.openai`. Use `GoogleLLMService` from `pipecat.services.google.llm` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.google.llm_vertex` module. Use `pipecat.services.google.vertex.llm` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.google.gemini_live.llm_vertex` module. Use `pipecat.services.google.gemini_live.vertex.llm` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated `pipecat.services.ai_services` module. Import from `pipecat.services.ai_service`, `pipecat.services.llm_service`, `pipecat.services.stt_service`, `pipecat.services.tts_service`, etc. instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Changed `GrokLLMService` default model from `grok-3-beta` to `grok-3`, now that the model is generally available.
|
||||
@@ -1 +0,0 @@
|
||||
- `GoogleImageGenService` now defaults to `imagen-4.0-generate-001` (previously `imagen-3.0-generate-002`).
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ `BaseOpenAILLMService.get_chat_completions()` now accepts an `LLMContext` instead of `OpenAILLMInvocationParams`. If you override this method, update your signature accordingly.
|
||||
@@ -1,22 +0,0 @@
|
||||
- ⚠️ Removed deprecated service-specific context and aggregator machinery, which was superseded by the universal `LLMContext` system.
|
||||
|
||||
Service-specific classes removed: `AnthropicLLMContext`, `AnthropicContextAggregatorPair`, `AWSBedrockLLMContext`, `AWSBedrockContextAggregatorPair`, `OpenAIContextAggregatorPair`, and their user/assistant aggregators. Also removed `create_context_aggregator()` from `LLMService`, `OpenAILLMService`, `AnthropicLLMService`, and `AWSBedrockLLMService`.
|
||||
|
||||
Base aggregator classes removed (from `pipecat.processors.aggregators.llm_response`): `BaseLLMResponseAggregator`, `LLMContextResponseAggregator`, `LLMUserContextAggregator`, `LLMAssistantContextAggregator`, `LLMUserResponseAggregator`, `LLMAssistantResponseAggregator`.
|
||||
|
||||
From the developer's point of view, migrating will usually be a matter of going from this:
|
||||
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
|
||||
To this:
|
||||
|
||||
```python
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated frame types `LLMMessagesFrame` and `OpenAILLMContextAssistantTimestampFrame` from `pipecat.frames.frames`. Instead of `LLMMessagesFrame`, use `LLMContextFrame` with the new messages, or `LLMMessagesUpdateFrame` with `run_llm=True`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `GatedOpenAILLMContextAggregator` (from `pipecat.processors.aggregators.gated_open_ai_llm_context`). Use `GatedLLMContextAggregator` (from `pipecat.processors.aggregators.gated_llm_context`) instead.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed `VisionImageFrameAggregator` (from `pipecat.processors.aggregators.vision_image_frame`). Vision/image handling is now built into `LLMContext` (from `pipecat.processors.aggregators.llm_context`). See the `12*` examples for the recommended replacement pattern.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ Removed deprecated compatibility modules: `pipecat.services.openai_realtime_beta` (use `pipecat.services.openai.realtime`), `pipecat.services.openai_realtime.context`, `pipecat.services.openai_realtime.frames`, `pipecat.services.openai.realtime.context`, `pipecat.services.openai.realtime.frames`, `pipecat.services.gemini_multimodal_live` (use `pipecat.services.google.gemini_live`), `pipecat.services.aws_nova_sonic.context` (use `pipecat.services.aws.nova_sonic`), `pipecat.services.google.openai` and `pipecat.services.google.llm_openai` (use `pipecat.services.google.llm`).
|
||||
@@ -1,18 +0,0 @@
|
||||
- ⚠️ Removed `OpenAILLMContext`, `OpenAILLMContextFrame`, and `OpenAILLMContext.from_messages()`. Use `LLMContext` (from `pipecat.processors.aggregators.llm_context`) and `LLMContextFrame` (from `pipecat.frames.frames`) instead. All services now exclusively use the universal `LLMContext`.
|
||||
|
||||
From the developer's point of view, migrating will usually be a matter of going from this:
|
||||
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
|
||||
To this:
|
||||
|
||||
```python
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
- Added `group_parallel_tools` parameter to `LLMService` (default `True`). When `True`, all function calls from the same LLM response batch share a group ID and the LLM is triggered exactly once after the last call completes. Set to `False` to trigger inference independently for each function call result as it arrives.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `is_async=True` support to `register_function()` and `register_direct_function()`. When enabled, the LLM continues the conversation immediately without waiting for the function result. The result is injected back into the context as a `developer` message once available, triggering a new LLM inference at that point.
|
||||
@@ -1 +0,0 @@
|
||||
- When multiple function calls are returned in a single LLM response, the LLM is now triggered exactly once after the last call in the batch completes, rather than waiting for all function calls.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed `BaseOutputTransport` discarding pending `UninterruptibleFrame` items (e.g. function-call context updates) when an interruption arrived. The audio task is now kept alive and only interruptible frames are drained when uninterruptible frames are present in the queue.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed spurious LLM inference being triggered when a function call result arrived while the user was actively speaking. The context frame is now suppressed until the user stops speaking.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed an issue where `UninterruptibleFrame` items queued in `FrameProcessor` could be incorrectly dropped on interruption. Previously only the frame currently being processed was checked; now the entire process queue is scanned so pending uninterruptible frames are always delivered.
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Build docs using uv
|
||||
echo "Installing dependencies with uv..."
|
||||
uv sync --group docs --all-extras --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
# Check if sphinx-build is available
|
||||
if ! uv run sphinx-build --version &> /dev/null; then
|
||||
|
||||
@@ -48,6 +48,8 @@ autodoc_default_options = {
|
||||
# Mock imports for optional dependencies
|
||||
autodoc_mock_imports = [
|
||||
# Krisp - has build issues on some platforms
|
||||
"pipecat_ai_krisp",
|
||||
"krisp",
|
||||
"krisp_audio",
|
||||
# System-specific GUI libraries
|
||||
"_tkinter",
|
||||
@@ -96,6 +98,7 @@ autodoc_mock_imports = [
|
||||
"cartesia",
|
||||
"camb",
|
||||
"sarvamai",
|
||||
"openpipe",
|
||||
"openai.types.beta.realtime",
|
||||
"langchain_core",
|
||||
"langchain_core.messages",
|
||||
|
||||
18
env.example
18
env.example
@@ -80,6 +80,9 @@ GOOGLE_TEST_CREDENTIALS=...
|
||||
# Gradium
|
||||
GRADIUM_API_KEY=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
@@ -121,21 +124,18 @@ MINIMAX_GROUP_ID=...
|
||||
# Mistral
|
||||
MISTRAL_API_KEY=...
|
||||
|
||||
# Nebius
|
||||
NEBIUS_API_KEY=...
|
||||
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# Novita
|
||||
NOVITA_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
|
||||
# OpenPipe
|
||||
OPENPIPE_API_KEY=...
|
||||
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY=...
|
||||
|
||||
@@ -176,9 +176,6 @@ SENTRY_DSN=...
|
||||
SIMLI_API_KEY=...
|
||||
SIMLI_FACE_ID=...
|
||||
|
||||
# Smallest
|
||||
SMALLEST_API_KEY=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
@@ -212,6 +209,3 @@ WHATSAPP_TOKEN=...
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=...
|
||||
WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
|
||||
# xAI / Grok
|
||||
XAI_API_KEY=...
|
||||
@@ -1,150 +1,31 @@
|
||||
# Pipecat Examples
|
||||
|
||||
This directory contains examples showing how to build voice and multimodal agents with Pipecat.
|
||||
This directory contains examples to help you learn how to build with Pipecat.
|
||||
|
||||
## Setup
|
||||
## Getting Started
|
||||
|
||||
1. Follow the [README](https://github.com/pipecat-ai/pipecat/blob/main/README.md#%EF%B8%8F-contributing-to-the-framework) steps to get your local environment configured.
|
||||
New to Pipecat? Start here:
|
||||
|
||||
> **Run from root directory**: Make sure you are running the steps from the root directory.
|
||||
- **[Quickstart](quickstart/)** - Get your first voice AI bot running in 5 minutes _(coming soon)_
|
||||
- **[Client/Server Web](client-server-web/)** - Learn to build web applications with Pipecat's client SDKs _(coming soon)_
|
||||
- **[Phone Bot with Twilio](phone-bot-twilio/)** - Connect your bot to a phone number _(coming soon)_
|
||||
|
||||
> **Using local audio?**: The `LocalAudioTransport` requires a system dependency for `portaudio`. Install the dependency to use the transport.
|
||||
## Foundational Examples
|
||||
|
||||
2. Copy the [`env.example`](../env.example) file and add API keys for services you plan to use:
|
||||
Single-file examples that introduce core Pipecat concepts one at a time. These examples:
|
||||
|
||||
```bash
|
||||
cp env.example .env
|
||||
# Edit .env with your API keys
|
||||
```
|
||||
- Build on each other progressively
|
||||
- Focus on specific features or integrations
|
||||
- Are used for testing with every Pipecat release
|
||||
|
||||
3. Run any example:
|
||||
See the **[Foundational Examples README](foundational/)** for the complete list.
|
||||
|
||||
```bash
|
||||
uv run python getting-started/01-say-one-thing.py
|
||||
```
|
||||
## More Advanced Examples
|
||||
|
||||
4. Open the web interface at http://localhost:7860/client/ and click "Connect"
|
||||
Ready to explore complex use cases? Visit **[pipecat-examples](https://github.com/pipecat-ai/pipecat-examples)** for:
|
||||
|
||||
## Running examples with other transports
|
||||
|
||||
Most examples support running with other transports, like Twilio or Daily.
|
||||
|
||||
### Daily
|
||||
|
||||
You need to create a Daily account at https://dashboard.daily.co/u/signup. Once signed up, you can create your own room from the dashboard and set the environment variables `DAILY_ROOM_URL` and `DAILY_API_KEY`. Alternatively, you can let the example create a room for you (this still requires the `DAILY_API_KEY` environment variable). Then, start any example with `-t daily`:
|
||||
|
||||
```bash
|
||||
uv run getting-started/06-voice-agent.py -t daily
|
||||
```
|
||||
|
||||
### Twilio
|
||||
|
||||
It is also possible to run the example through a Twilio phone number. You will need to set up a few things:
|
||||
|
||||
1. Install and run [ngrok](https://ngrok.com/download).
|
||||
|
||||
```bash
|
||||
ngrok http 7860
|
||||
```
|
||||
|
||||
2. Configure your Twilio phone number. One way is to set up a TwiML app and set the request URL to the ngrok URL from step (1). Then, set your phone number to use the new TwiML app.
|
||||
|
||||
Then, run the example with:
|
||||
|
||||
```bash
|
||||
uv run getting-started/06-voice-agent.py -t twilio -x NGROK_HOST_NAME
|
||||
```
|
||||
|
||||
## Directory Structure
|
||||
|
||||
### [`getting-started/`](./getting-started/)
|
||||
|
||||
Progressive introduction to Pipecat, from minimal TTS to a full voice agent with function calling.
|
||||
|
||||
### [`voice/`](./voice/)
|
||||
|
||||
Full STT + LLM + TTS voice agent pipelines showcasing different speech service providers (Deepgram, ElevenLabs, Cartesia, etc.)
|
||||
|
||||
### [`function-calling/`](./function-calling/)
|
||||
|
||||
Function calling with different LLM providers (OpenAI, Anthropic, Google, etc.)
|
||||
|
||||
### [`transcription/`](./transcription/)
|
||||
|
||||
Speech-to-text examples with various STT providers.
|
||||
|
||||
### [`vision/`](./vision/)
|
||||
|
||||
Image description and vision capabilities with different multimodal LLMs.
|
||||
|
||||
### [`realtime/`](./realtime/)
|
||||
|
||||
Realtime and multimodal live APIs (OpenAI Realtime, Gemini Live, AWS Nova Sonic, Ultravox, Grok).
|
||||
|
||||
### [`persistent-context/`](./persistent-context/)
|
||||
|
||||
Maintaining conversation context across sessions with different providers.
|
||||
|
||||
### [`context-summarization/`](./context-summarization/)
|
||||
|
||||
Summarizing conversation context to manage token limits.
|
||||
|
||||
### [`update-settings/`](./update-settings/)
|
||||
|
||||
Changing service settings at runtime, organized by service type:
|
||||
|
||||
- **[`stt/`](./update-settings/stt/)** — Speech-to-text settings
|
||||
- **[`tts/`](./update-settings/tts/)** — Text-to-speech settings
|
||||
- **[`llm/`](./update-settings/llm/)** — LLM settings
|
||||
|
||||
### [`turn-management/`](./turn-management/)
|
||||
|
||||
Turn detection, interruption handling, and user input management.
|
||||
|
||||
### [`thinking-and-mcp/`](./thinking-and-mcp/)
|
||||
|
||||
LLM thinking/reasoning modes and MCP (Model Context Protocol) tool server integration.
|
||||
|
||||
### [`transports/`](./transports/)
|
||||
|
||||
Transport layer examples (WebRTC, Daily, LiveKit).
|
||||
|
||||
### [`video-avatar/`](./video-avatar/)
|
||||
|
||||
Video avatar integrations (Tavus, HeyGen, Simli, LemonSlice).
|
||||
|
||||
### [`video-processing/`](./video-processing/)
|
||||
|
||||
Video processing, mirroring, GStreamer, and custom video tracks.
|
||||
|
||||
### [`audio/`](./audio/)
|
||||
|
||||
Audio recording, background sounds, and sound effects.
|
||||
|
||||
### [`observability/`](./observability/)
|
||||
|
||||
Pipeline monitoring: observers, heartbeats, and Sentry metrics.
|
||||
|
||||
### [`rag/`](./rag/)
|
||||
|
||||
Retrieval-augmented generation, grounding, and long-term memory (Mem0, Gemini).
|
||||
|
||||
### [`features/`](./features/)
|
||||
|
||||
Miscellaneous features: wake phrases, live translation, service switching, voice switching, and more.
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Customizing Network Settings
|
||||
|
||||
```bash
|
||||
uv run python <example-name> --host 0.0.0.0 --port 8080
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
- **No audio/video**: Check browser permissions for microphone and camera
|
||||
- **Connection errors**: Verify API keys in `.env` file
|
||||
- **Port conflicts**: Use `--port` to change the port
|
||||
|
||||
For more examples, visit the [pipecat-examples repository](https://github.com/pipecat-ai/pipecat-examples).
|
||||
- Production-ready applications
|
||||
- Multi-platform client implementations
|
||||
- Telephony integrations
|
||||
- Multimodal and creative applications
|
||||
- Deployment and monitoring examples
|
||||
|
||||
71
examples/foundational/01-say-one-thing-piper.py
Normal file
71
examples/foundational/01-say-one-thing-piper.py
Normal file
@@ -0,0 +1,71 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.piper.tts import PiperHttpTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(audio_out_enabled=True),
|
||||
"twilio": lambda: FastAPIWebsocketParams(audio_out_enabled=True),
|
||||
"webrtc": lambda: TransportParams(audio_out_enabled=True),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tts = PiperHttpTTSService(
|
||||
base_url=os.getenv("PIPER_BASE_URL"),
|
||||
aiohttp_session=session,
|
||||
sample_rate=24000,
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([tts, transport.output()]),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Register an event handler so we can play the audio when the client joins
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
72
examples/foundational/01-say-one-thing-rime.py
Normal file
72
examples/foundational/01-say-one-thing-rime.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.rime.tts import RimeHttpTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(audio_out_enabled=True),
|
||||
"twilio": lambda: FastAPIWebsocketParams(audio_out_enabled=True),
|
||||
"webrtc": lambda: TransportParams(audio_out_enabled=True),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tts = RimeHttpTTSService(
|
||||
api_key=os.getenv("RIME_API_KEY", ""),
|
||||
aiohttp_session=session,
|
||||
settings=RimeHttpTTSService.Settings(
|
||||
voice="rex",
|
||||
),
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([tts, transport.output()]),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Register an event handler so we can play the audio when the client joins
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
64
examples/foundational/01b-livekit-audio.py
Normal file
64
examples/foundational/01b-livekit-audio.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.livekit import configure
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.transports.livekit.transport import LiveKitParams, LiveKitTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main():
|
||||
(url, token, room_name) = await configure()
|
||||
|
||||
transport = LiveKitTransport(
|
||||
url=url,
|
||||
token=token,
|
||||
room_name=room_name,
|
||||
params=LiveKitParams(audio_out_enabled=True),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
settings=CartesiaTTSService.Settings(
|
||||
voice="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
),
|
||||
)
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
task = PipelineTask(Pipeline([tts, transport.output()]))
|
||||
|
||||
# Register an event handler so we can play the audio when the
|
||||
# participant joins.
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant_id):
|
||||
await asyncio.sleep(1)
|
||||
await task.queue_frame(
|
||||
TTSSpeakFrame(
|
||||
"Hello there! How are you doing today? Would you like to talk about the weather?"
|
||||
)
|
||||
)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
64
examples/foundational/01c-nvidia-riva-tts.py
Normal file
64
examples/foundational/01c-nvidia-riva-tts.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.nvidia.tts import NvidiaTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(audio_out_enabled=True),
|
||||
"twilio": lambda: FastAPIWebsocketParams(audio_out_enabled=True),
|
||||
"webrtc": lambda: TransportParams(audio_out_enabled=True),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = NvidiaTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([tts, transport.output()]),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Register an event handler so we can play the audio when the client joins
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -60,7 +60,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
context = LLMContext()
|
||||
context.add_message({"role": "developer", "content": "Say hello to the world."})
|
||||
context.add_message({"role": "user", "content": "Say hello to the world."})
|
||||
await task.queue_frames([LLMContextFrame(context), EndFrame()])
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
84
examples/foundational/03-still-frame.py
Normal file
84
examples/foundational/03-still-frame.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.fal.image import FalImageGenService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We use lambdas to defer transport parameter creation until the transport
|
||||
# type is selected at runtime.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
video_out_enabled=True,
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
imagegen = FalImageGenService(
|
||||
settings=FalImageGenService.Settings(
|
||||
image_size="square_hd",
|
||||
),
|
||||
aiohttp_session=session,
|
||||
key=os.getenv("FAL_KEY"),
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([imagegen, transport.output()]),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Register an event handler so we can queue the image prompt when the client joins
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
await task.queue_frame(TextFrame("a cat in the style of picasso"))
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -109,9 +109,7 @@ async def run_example(webrtc_connection: SmallWebRTCConnection):
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
context.add_message(
|
||||
{"role": "developer", "content": "Please introduce yourself to the user."}
|
||||
)
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user