Compare commits

..

1 Commits

Author SHA1 Message Date
Mark Backman
708ef71c96 Update python-compatibility workflow to include new user project check 2025-08-09 20:19:16 -04:00
144 changed files with 3008 additions and 8104 deletions

View File

@@ -25,7 +25,7 @@ jobs:
version: "latest"
- name: Set up Python
run: uv python install 3.12
run: uv python install 3.10
- name: Install system packages
run: |

View File

@@ -9,14 +9,14 @@ on:
paths: ['pyproject.toml']
jobs:
test-compatibility:
test-dev-environment:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ['3.10.18', '3.11.13', '3.12.11', '3.13.5']
name: Python ${{ matrix.python-version }}
name: Dev Environment - Python ${{ matrix.python-version }}
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -55,7 +55,69 @@ jobs:
--no-extra moondream \
--no-extra mlx-whisper
- name: Verify installation
- name: Verify dev installation
run: |
uv run python --version
uv run python -c "import pipecat; print('✅ Pipecat imports successfully')"
uv run python -c "import pipecat; print('✅ Dev environment - Pipecat imports successfully')"
# Simulates an end user: builds the wheel from this checkout, creates a fresh
# uv project, and installs the wheel with its extras into that project.
test-user-experience:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version: ['3.10.18', '3.11.13', '3.12.11', '3.13.5']
  name: User Experience - Python ${{ matrix.python-version }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y \
          portaudio19-dev \
          libcairo2-dev \
          libgirepository1.0-dev \
          pkg-config

    - name: Install uv
      uses: astral-sh/setup-uv@v4
      with:
        version: 'latest'

    - name: Set up Python ${{ matrix.python-version }}
      run: |
        uv python install ${{ matrix.python-version }}

    - name: Build local package
      run: |
        uv build

    - name: Create test project
      run: |
        mkdir test-project
        cd test-project
        uv init --python ${{ matrix.python-version }}

    - name: Test comprehensive extras with uv add (Python 3.10-3.12)
      if: "!startsWith(matrix.python-version, '3.13.')"
      run: |
        cd test-project
        # Resolve the wheel path on its own. Globbing applies to the whole
        # shell word, so a glob followed by a quoted "[extras]" suffix never
        # matches a real file and the literal "*" would be handed to uv.
        wheel="$(ls ../dist/pipecat_ai-*.whl)"
        # Use uv add with built wheel to leverage dependency management
        uv add "${wheel}[anthropic,assemblyai,asyncai,aws,aws-nova-sonic,azure,cartesia,cerebras,deepseek,daily,deepgram,elevenlabs,fal,fireworks,fish,gladia,google,grok,groq,gstreamer,heygen,inworld,koala,langchain,livekit,lmnt,local,mcp,mem0,mlx-whisper,moondream,nim,neuphonic,noisereduce,openai,openpipe,openrouter,perplexity,playht,qwen,rime,riva,runner,sambanova,sentry,local-smart-turn,remote-smart-turn,silero,simli,soniox,soundfile,speechmatics,tavus,together,tracing,ultravox,webrtc,websocket,whisper]"

    - name: Test Python 3.13 compatible extras with uv add
      if: startsWith(matrix.python-version, '3.13.')
      run: |
        cd test-project
        # Resolve the wheel path on its own; see the 3.10-3.12 step for why
        # the glob cannot share a shell word with the quoted extras list.
        wheel="$(ls ../dist/pipecat_ai-*.whl)"
        # Use uv add with built wheel and Python 3.13 compatible extras
        uv add "${wheel}[anthropic,assemblyai,asyncai,aws,aws-nova-sonic,azure,cartesia,cerebras,deepseek,daily,deepgram,elevenlabs,fal,fireworks,fish,gladia,google,grok,groq,gstreamer,heygen,inworld,koala,langchain,livekit,lmnt,local,mcp,mem0,nim,neuphonic,noisereduce,openai,openpipe,openrouter,perplexity,playht,qwen,rime,riva,runner,sambanova,sentry,remote-smart-turn,silero,simli,soniox,soundfile,speechmatics,tavus,together,tracing,webrtc,websocket,whisper]"

    - name: Verify user installation
      run: |
        cd test-project
        uv run python --version
        uv run python -c "import pipecat; print('✅ User experience - Pipecat imports successfully')"
        # Test that basic functionality works
        uv run python -c "from pipecat.pipeline.pipeline import Pipeline; print('✅ Pipeline import works')"

View File

@@ -23,12 +23,17 @@ jobs:
token: ${{ secrets.QUICKSTART_SYNC_TOKEN }}
path: quickstart-repo
- name: Sync files (excluding uv.lock and README.md)
- name: Sync files (excluding READMEs)
run: |
# Copy all files except uv.lock and README.md
# Copy code files only, skip READMEs
cp examples/quickstart/bot.py quickstart-repo/
cp examples/quickstart/requirements.txt quickstart-repo/
cp examples/quickstart/env.example quickstart-repo/
# Copy any other files that aren't README.md
find examples/quickstart -type f \
-not -name "README.md" \
-not -name "uv.lock" \
-not -name "*.md" \
-exec cp {} quickstart-repo/ \;
- name: Commit and push changes

View File

@@ -29,7 +29,7 @@ jobs:
version: "latest"
- name: Set up Python
run: uv python install 3.12
run: uv python install 3.10
- name: Install system packages
run: |

42
.github/workflows/update-lockfile.yaml vendored Normal file
View File

@@ -0,0 +1,42 @@
# Regenerates uv.lock when pyproject.toml changes on main (or on manual
# dispatch) and commits the refreshed lockfile back to the branch.
name: Update lockfile

on:
  push:
    paths:
      - 'pyproject.toml'
    branches:
      - main
  workflow_dispatch: # Allows manual triggering from GitHub UI

# The job pushes a commit, so GITHUB_TOKEN needs write access to contents.
permissions:
  contents: write

jobs:
  update-lockfile:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Token whose credentials are used by the `git push` step below
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Install uv
        # Same major version as the other workflows in this repository
        uses: astral-sh/setup-uv@v4

      - name: Update lockfile
        run: uv lock

      - name: Check for changes
        id: verify-changed-files
        run: |
          # Inspect uv.lock only: it is the only file staged by the commit
          # step, so unrelated worktree changes must not trigger a commit
          # that would then fail with "nothing to commit".
          if [ -n "$(git status --porcelain -- uv.lock)" ]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "changed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Commit lockfile
        if: steps.verify-changed-files.outputs.changed == 'true'
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add uv.lock
          git commit -m "chore: update uv.lock after dependency changes"
          git push

View File

@@ -5,208 +5,18 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Fixed
- Fixed a `CartesiaTTSService` issue that was causing the application to hang
after Cartesia's 5 minutes timed out.
## [0.0.81] - 2025-08-25
### Added
- Added `pipecat.extensions.voicemail`, a module for detecting voicemail vs.
live conversation, primarily intended for use in outbound calling scenarios.
The voicemail module is optimized for text LLMs only.
- Added new frames to the `idle_timeout_frames` arg: `TranscriptionFrame`,
`InterimTranscriptionFrame`, `UserStartedSpeakingFrame`, and
`UserStoppedSpeakingFrame`. These additions serve as indicators of user
activity in the pipeline idle detection logic.
- Allow passing custom pipeline sink and source processors to a
`Pipeline`. Pipeline source and sink processors are used to know and control
what's coming in and out of a `Pipeline` processor.
- Added `FrameProcessor.pause_processing_system_frames()` and
`FrameProcessor.resume_processing_system_frames()`. These allow pausing and
resuming the processing of system frames.
- Added new `on_process_frame()` observer method which makes it possible to know
when a frame is being processed.
- Added new `FrameProcessor.entry_processor()` method. This allows you to access
the first non-compound processor in a pipeline.
- Added `FrameProcessor` properties `processors`, `next` and `previous`.
- `ElevenLabsTTSService` now supports additional runtime changes to the `model`,
`language`, and `voice_settings` parameters.
- Added `apply_text_normalization` support to `ElevenLabsTTSService` and
`ElevenLabsHttpTTSService`.
- Added `MistralLLMService`, using Mistral's chat completion API.
- Added the ability to retry executing a chat completion after a timeout period
for `OpenAILLMService` and its subclasses, `AnthropicLLMService`, and
`AWSBedrockLLMService`. The LLM services accept new args:
`retry_timeout_secs` and `retry_on_timeout`. This feature is disabled by
default.
## Unreleased
### Changed
- Updated `daily-python` to 0.19.7.
### Deprecated
- `FrameProcessor.wait_for_task()` is deprecated. Use `await task` or `await
asyncio.wait_for(task, timeout)` instead.
### Removed
- Watchdog timers have been removed. They were introduced in 0.0.72 to help
diagnose pipeline freezes. Unfortunately, they proved ineffective since they
required developers to use Pipecat-specific queues, iterators, and events to
correctly reset the timer, which limited their usefulness and added friction.
- Removed unused `FrameProcessor.set_parent()` and
`FrameProcessor.get_parent()`.
### Fixed
- Fixed an issue that would cause `PipelineRunner` and `PipelineTask` to not
handle external asyncio task cancellation properly.
- Added `SpeechmaticsSTTService` exception handling on connection and sending.
- Replaced `asyncio.wait_for()` with `wait_for2.wait_for()` for Python <
3.12 because of issues regarding task cancellation (i.e. cancellation is
never propagated).
See https://bugs.python.org/issue42130
- Fixed an `AudioBufferProcessor` issue that would cause audio overlap when
setting a max buffer size.
- Fixed an issue where `AsyncAITTSService` had very high latency in responding
by adding `force=true` when sending the flush command.
### Performance
- Improve `PipelineTask` performance by using direct mode processors and by
removing unnecessary tasks.
- Improve `ParallelPipeline` performance by using direct mode, by not
creating a task for each frame and every sub-pipeline and also by removing
other unnecessary tasks.
- `Pipeline` performance improvements by using direct mode.
### Other
- Added `14w-function-calling-mistal.py` using `MistralLLMService`.
- Added `13j-azure-transcription.py` using `AzureSTTService`.
## [0.0.80] - 2025-08-13
### Added
- Added `GeminiTTSService` which uses Google Gemini to generate TTS output. The
Gemini model can be prompted to insert styled speech to control the TTS
output.
- Added Exotel support to Pipecat's development runner. You can now connect
using the runner with `uv run bot.py -t exotel` and an ngrok connection to
HTTP port 7860.
- Added `enable_direct_mode` argument to `FrameProcessor`. The direct mode is
for processors which require very little I/O or compute resources, that is
processors that can perform their task almost immediately. These types of
processors don't need any of the internal tasks and queues usually created by
frame processors which means overall application performance might be slightly
increased. Use with care.
- Added TTFB metrics for `HeyGenVideoService` and `TavusVideoService`.
- Added `endpoint_id` parameter to `AzureSTTService`. ([Custom EndpointId](https://docs.azure.cn/en-us/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python#use-a-custom-endpoint))
### Changed
- `WatchdogPriorityQueue` now requires the items to be inserted to always be
tuples and the size of the tuple needs to be specified in the constructor when
creating the queue with the `tuple_size` argument.
- Updated Moondream to revision `2025-01-09`.
- Updated `PlayHTHttpTTSService` to no longer use the `pyht` client to remove
compatibility issues with other packages. Now you can use the PlayHT HTTP
service with other services, like GoogleLLMService.
- Updated `pyproject.toml` to once again pin `numba` to `>=0.61.2` in order to
resolve package versioning issues.
- Updated the `STTMuteFilter` to include `VADUserStartedSpeakingFrame` and
`VADUserStoppedSpeakingFrame` in the list of frames to filter when the
filtering is on.
### Performance
- Improving the latency of the `HeyGenVideoService`.
- Improved some frame processors performance by using the new frame processor
direct mode. In direct mode a frame processor will process frames right away
avoiding the need for internal queues and tasks. This is useful for some
simple processors. For example, in processors that wrap other processors
(e.g. `Pipeline`, `ParallelPipeline`), we add one processor before and one
after the wrapped processors (internally, you will see them as sources and
sinks). These sources and sinks don't do any special processing and they
basically forward frames. So, for these simple processors we now enable the
new direct mode which avoids creating any internal tasks (and queues) and
therefore improves performance.
### Fixed
- Fixed an issue with the `BaseWhisperSTTService` where the language was
specified as an enum and not a string.
- Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.
- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying a `text`
`modalities` didn't result in text being outputted from the model.
- Added SSML reserved character escaping to `AzureBaseTTSService` to properly
handle special characters in text sent to Azure TTS. This fixes an issue
where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would
cause TTS failures.
- Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
comparing watchdog cancel sentinel items with other items in the queue.
- Fixed an issue that would cause system frames to not be processed with higher
priority than other frames. This could cause slower interruption times.
- Fixed an issue where retrying a websocket connection error would result in an
error.
### Other
- Add foundation example `19b-openai-realtime-beta-text.py`, showing how to use
`OpenAIRealtimeBetaLLMService` to output text to a TTS service.
- Add vision support to release evals so we can run the foundational examples 12
series.
- Added foundational example `15a-switch-languages.py` to release evals. It is
able to detect if we switched the language properly.
- Updated foundational examples to show how to enclose complex logic
(e.g. `ParallelPipeline`) into a single processor so the main pipeline becomes
simpler.
- Added `07n-interruptible-gemini.py`, demonstrating how to use
`GeminiTTSService`.
- Updated `15-switch-voices.py` and `15a-switch-languages.py` examples to show
how to enclose complex logic (e.g. `ParallelPipeline`) into a single processor
so the main pipeline becomes simpler.
## [0.0.79] - 2025-08-07

View File

@@ -31,23 +31,6 @@ git push origin your-branch-name
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
## Dependency Management
This project uses [uv](https://docs.astral.sh/uv/) for dependency management. The `uv.lock` file is committed to ensure reproducible builds.
### Adding or Updating Dependencies
1. Edit `pyproject.toml` to add/update dependencies
2. Run `uv lock` to update the lockfile with new dependency resolution
3. Run `uv sync` to install the updated dependencies locally
4. Always commit both files together:
```bash
git add pyproject.toml uv.lock
git commit -m "feat: add new dependency for feature X"
```
**Important:** Never manually edit `uv.lock`. It's auto-generated by `uv lock`.
## Code Style and Documentation
### Python Code Style

View File

@@ -54,7 +54,7 @@ You can connect to Pipecat from any platform using our official SDKs:
| Category | Services |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
@@ -114,8 +114,7 @@ You can get started with Pipecat running on your local machine, then move your a
### Prerequisites
**Minimum Python Version:** 3.10
**Recommended Python Version:** 3.12
**Python Version:** 3.10+
### Setup Steps

10
docs/README.md Normal file
View File

@@ -0,0 +1,10 @@
# Pipecat Docs
## [Architecture Overview](architecture.md)
Learn about the thinking behind the framework's design.
## [A Frame's Progress](frame-progress.md)
See how a Frame is processed through a Transport, a Pipeline, and a series of Frame Processors.

17
docs/architecture.md Normal file
View File

@@ -0,0 +1,17 @@
# Pipecat architecture guide
## Frames
Frames can represent discrete chunks of data, for instance a chunk of text, a chunk of audio, or an image. They can also be used as control flow, for instance a frame that indicates that there is no more data available, or that a user started or stopped talking. They can also represent more complex data structures, such as a message array used for an LLM completion.
## FrameProcessors
Frame processors operate on frames. Every frame processor implements a `process_frame` method that consumes one frame and produces zero or more frames. Frame processors can do simple transforms, such as concatenating text fragments into sentences, or they can treat frames as input for an AI Service, and emit chat completions based on message arrays or transform text into audio or images.
## Pipelines
Pipelines are lists of frame processors linked together. Frame processors can push frames upstream or downstream to their peers. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport as an output.
## Transports
Transports provide input and output frame processors to receive or send frames respectively. For example, the `DailyTransport` does this with a WebRTC session joined to a Daily.co room.

46
docs/frame-progress.md Normal file
View File

@@ -0,0 +1,46 @@
# A Frame's Progress
1. A user says “Hello, LLM” and the cloud transcription service delivers a transcription to the Transport.
![A transcript frame arrives](images/frame-progress-01.png)
2. The Transport places a Transcription frame in the Pipeline's source queue.
![Frame in source queue](images/frame-progress-02.png)
3. The Pipeline passes the Transcription frame to the first Frame Processor in its list, the LLM User Message Aggregator.
![To UMA](images/frame-progress-03.png)
4. The LLM User Message Aggregator updates the LLM Context with a `{“user”: “Hello LLM”}` message.
![Update context](images/frame-progress-04.png)
5. The LLM User Message Aggregator yields an LLM Message Frame, containing the updated LLM Context. The Pipeline passes this frame to the LLM Frame Processor.
![Update context](images/frame-progress-05.png)
6. The LLM Frame Processor creates a streaming chat completion based on the LLM context and yields the first chunk of a response, a Text Frame with the value “Hi, “. The Pipeline passes this frame to the TTS Frame Processor. The TTS Frame Processor aggregates this response but doesn't yield anything yet, because it's waiting for a full sentence.
![LLM yields Text](images/frame-progress-06.png)
7. The LLM Frame Processor yields another Text Frame with the value “there.”. The Pipeline passes this frame to the TTS Frame Processor.
![LLM yields more Text](images/frame-progress-07.png)
8. The TTS Frame Processor now has a full sentence, so it starts streaming audio based on “Hi, there.” It yields the first chunk of streaming audio as an Audio frame, which the Pipeline passes to the LLM Assistant Message Aggregator.
![TTS yields Audio](images/frame-progress-08.png)
9. The LLM Assistant Message Aggregator doesn't do anything with Audio frames, so it immediately yields the frame, unchanged. This is the convention for all Frame Processors: frames that the processor doesn't process should be immediately yielded.
![pass-through](images/frame-progress-09.png)
10. The Pipeline places the first Audio frame in its sink queue, which is being watched by the Transport. Since the frame is now in a queue, the Pipeline can continue processing other frames. Note that the source and sink queues form a sort of “boundary of concurrent processing” between a Pipeline and the outside world. In a Pipeline, Frames are processed sequentially; once a Frame is on a queue it can be processed in parallel with the frames being processed by the Pipeline. TODO: link to a more in-depth section about this.
![sink queue](images/frame-progress-10.png)
11. The TTS Frame Processor yields another Audio frame as the Transport transmits the first Audio frame.
![parallel audio](images/frame-progress-11.png)
12. As before, the LLM Assistant Message Aggregator immediately yields the Audio frame and the Pipeline places the Audio frame in the sink queue.
![sink queue 2](images/frame-progress-12.png)
13. The TTS Frame Processor has no more frames to yield. The LLM Frame Processor emits an LLM Response End Frame, which the Pipeline passes to the TTS Frame Processor.
![response end](images/frame-progress-13.png)
14. The TTS Frame Processor immediately yields the LLM Response End Frame, so the Pipeline passes it along to the LLM Assistant Message Aggregator. The LLM Assistant Message Aggregator updates the LLM Context with the full response from the LLM. TODO: I realized I forgot that the TTS Frame Processor also yields the Text frames that the LLM emitted so that the LLM Assistant Message Aggregator could accumulate them, arrggh.
![response end](images/frame-progress-14.png)
15. The system is quiet, and waiting for the next message from the Transport.
![response end](images/frame-progress-15.png)

110
docs/frame.md Normal file
View File

@@ -0,0 +1,110 @@
# Understanding Different Frame Types in the Pipecat System
In the Pipecat system, frames are used to represent different types of data and control signals that flow through the pipeline. Understanding these frame types is crucial for working with the system effectively. This tutorial will cover the main categories of frames and their specific uses.
## 1. Base Frame Classes
### Frame
The `Frame` class is the base class for all frames. It includes:
- `id`: A unique identifier
- `name`: A descriptive name
- `pts`: Presentation timestamp (optional)
### DataFrame
`DataFrame` is a subclass of `Frame` and serves as a base for most data-carrying frames.
## 2. Audio Frames
### AudioRawFrame
Represents a chunk of audio with properties:
- `audio`: Raw audio data
- `sample_rate`: Audio sample rate
- `num_channels`: Number of audio channels
Subclasses include:
- `InputAudioRawFrame`: For audio from input sources
- `OutputAudioRawFrame`: For audio to be played by output devices
- `TTSAudioRawFrame`: For audio generated by Text-to-Speech services
## 3. Image Frames
### ImageRawFrame
Represents an image with properties:
- `image`: Raw image data
- `size`: Image dimensions
- `format`: Image format (e.g., JPEG, PNG)
Subclasses include:
- `InputImageRawFrame`: For images from input sources
- `OutputImageRawFrame`: For images to be displayed
- `UserImageRawFrame`: For images associated with a specific user
- `VisionImageRawFrame`: For images with associated text for description
- `URLImageRawFrame`: For images with an associated URL
### SpriteFrame
Represents an animated sprite, containing a list of `ImageRawFrame` objects.
## 4. Text and Transcription Frames
### TextFrame
Represents a chunk of text, used for various purposes in the pipeline.
### TranscriptionFrame
A specialized `TextFrame` for speech transcriptions, including:
- `user_id`: ID of the speaking user
- `timestamp`: When the transcription was generated
- `language`: Detected language of the speech
### InterimTranscriptionFrame
Similar to `TranscriptionFrame`, but for interim (not final) transcriptions.
## 5. LLM (Language Model) Frames
### LLMMessagesFrame
Contains a list of messages for an LLM service to process.
### LLMMessagesAppendFrame and LLMMessagesUpdateFrame
Used to modify the current context of LLM messages.
### LLMSetToolsFrame
Specifies tools (functions) available for the LLM to use.
### LLMEnablePromptCachingFrame
Controls prompt caching in certain LLMs.
## 6. System and Control Frames
### SystemFrame
Base class for system-level frames.
Important system frames include:
- `StartFrame`: Initiates a pipeline
- `CancelFrame`: Stops a pipeline immediately
- `ErrorFrame`: Notifies of errors (with `FatalErrorFrame` for unrecoverable errors)
- `EndTaskFrame` and `CancelTaskFrame`: Control pipeline tasks
- `StartInterruptionFrame` and `StopInterruptionFrame`: Indicate user speech for interruptions
### ControlFrame
Base class for control-flow frames.
Notable control frames:
- `EndFrame`: Signals the end of a pipeline
- `LLMFullResponseStartFrame` and `LLMFullResponseEndFrame`: Bracket LLM responses
- `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame`: Indicate user speech activity
- `BotStartedSpeakingFrame` and `BotStoppedSpeakingFrame`: Indicate bot speech activity
- `TTSStartedFrame` and `TTSStoppedFrame`: Bracket Text-to-Speech responses
## 7. Special Purpose Frames
### MetricsFrame
Contains performance metrics data.
### FunctionCallInProgressFrame and FunctionCallResultFrame
Used for handling LLM function (tool) calls.
### ServiceUpdateSettingsFrame
Base class for updating service settings, with specific subclasses for LLM, TTS, and STT services.
## Conclusion
Understanding these frame types is essential for working with the Pipecat system. Each frame type serves a specific purpose in the pipeline, whether it's carrying data (like audio or images), controlling the flow of the pipeline, or managing system-level operations. By using the appropriate frame types, you can effectively process and transmit various kinds of information through your pipeline.

Binary file not shown.

After

Width:  |  Height:  |  Size: 98 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 98 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 95 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 96 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 98 KiB

View File

@@ -1,163 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""
A conversational AI bot using Gemini for both LLM and TTS.
This example demonstrates how to use Gemini's TTS capabilities with the new
GeminiTTSService, which uses Gemini's TTS-specific models instead of Google Cloud TTS.
Features showcased:
- Gemini LLM for conversation
- Gemini TTS with natural voice control
- Support for different voice personalities
- Style and tone control through natural language prompts
Run with:
python examples/foundational/gemini-tts.py
Make sure to set your environment variables:
export GOOGLE_API_KEY=your_api_key_here
export GOOGLE_TEST_CREDENTIALS=your_google_credentials
"""
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.llm import GoogleLLMService
from pipecat.services.google.stt import GoogleSTTService
from pipecat.services.google.tts import GeminiTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
# Each value is a zero-argument factory instead of a ready-made params object,
# so nothing (e.g. SileroVADAnalyzer) is constructed until the factory for the
# selected transport is actually called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble and run the Google STT -> Gemini LLM -> Gemini TTS pipeline.

    Args:
        transport: Transport whose input()/output() processors bracket the
            pipeline and on which the connect/disconnect handlers are registered.
        runner_args: Runner arguments; provides the pipeline idle timeout and
            the SIGINT-handling flag passed to the runner.
    """
    logger.info(f"Starting bot with Gemini TTS")

    # Speech-to-text.
    speech_to_text = GoogleSTTService(
        params=GoogleSTTService.InputParams(languages=Language.EN_US),
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
    )

    # Text-to-speech, using a TTS-specific Gemini model.
    text_to_speech = GeminiTTSService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        model="gemini-2.5-flash-preview-tts",
        voice_id="Charon",
        params=GeminiTTSService.InputParams(language=Language.EN_US),
    )

    # Conversation model.
    gemini_llm = GoogleLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        model="gemini-2.5-flash",
    )

    # The system message tells the model how to phrase spoken replies.
    messages = [
        {
            "role": "system",
            "content": """You are a helpful AI assistant in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way.
IMPORTANT: Since you're using Gemini TTS which supports natural voice control, you can include speaking instructions in your responses. For example:
- "Say cheerfully: Welcome to our conversation!"
- "Read this in a calm, professional tone: Here are the details you requested."
- "Speak in an excited whisper: I have some great news to share!"
- "Say slowly and clearly: Let me explain this step by step."
Feel free to use natural language instructions to control your voice style, tone, pace, and emotion. The TTS system will interpret these instructions and adjust the speech accordingly.
Your output will be converted to audio, so avoid special characters in your answers. Respond to what the user said in a creative and helpful way.""",
        },
    ]

    context = OpenAILLMContext(messages)
    context_aggregator = gemini_llm.create_context_aggregator(context)

    # Processors are listed in the order frames flow from user input to bot output.
    processors = [
        transport.input(),
        speech_to_text,
        context_aggregator.user(),
        gemini_llm,
        text_to_speech,
        transport.output(),
        context_aggregator.assistant(),
    ]
    pipeline = Pipeline(processors)

    task_params = PipelineParams(
        enable_metrics=True,
        enable_usage_metrics=True,
    )
    task = PipelineTask(
        pipeline,
        params=task_params,
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Add a styled greeting instruction, then queue the context frame to
        # start the conversation.
        greeting = {
            "role": "system",
            "content": "Say cheerfully and warmly: Hello! I'm your AI assistant powered by Gemini's new TTS technology. I can speak with different voices, tones, and styles. How can I help you today?",
        }
        messages.append(greeting)
        await task.queue_frames([context_aggregator.user().get_context_frame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info(f"Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both create_transport and run_bot.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
if __name__ == "__main__":
    # The runner is only imported when this file is executed as a script.
    from pipecat.runner.run import main as run_main

    run_main()

View File

@@ -1,88 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, TranscriptionFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.azure.stt import AzureSTTService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
class TranscriptionLogger(FrameProcessor):
    """Frame processor that prints the text of every TranscriptionFrame it receives."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Print transcription text, then forward the frame unchanged.

        Args:
            frame: The frame being processed.
            direction: The direction the frame is travelling in.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            print(f"Transcription: {frame.text}")

        # Forward every frame. Without this call the processor swallows all
        # frames (control frames included), so nothing placed after it in a
        # pipeline would ever see them.
        await self.push_frame(frame, direction)
# Zero-argument factories keyed by transport name. Only audio input is enabled
# here; each params object (and its SileroVADAnalyzer) is built only when the
# corresponding factory is called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a transcription-only pipeline: transport input -> Azure STT -> TranscriptionLogger.

    Args:
        transport: Transport providing the input processor and the
            disconnect event the task is cancelled on.
        runner_args: Runner arguments; provides the pipeline idle timeout and
            the SIGINT-handling flag passed to the runner.
    """
    logger.info(f"Starting bot")

    azure_stt = AzureSTTService(
        api_key=os.getenv("AZURE_SPEECH_API_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION"),
    )
    transcription_logger = TranscriptionLogger()

    task = PipelineTask(
        Pipeline([transport.input(), azure_stt, transcription_logger]),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info(f"Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both create_transport and run_bot.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
if __name__ == "__main__":
    # The runner is only imported when this file is executed as a script.
    from pipecat.runner.run import main as run_main

    run_main()

View File

@@ -1,165 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.mistral.llm import MistralLLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
    """Tool handler that reports a fixed weather result through the result callback.

    Args:
        params: Function-call parameters; only ``result_callback`` is used.
    """
    canned_weather = {"conditions": "nice", "temperature": "75"}
    await params.result_callback(canned_weather)
async def fetch_restaurant_recommendation(params: FunctionCallParams):
    """Tool handler that reports a fixed restaurant name through the result callback.

    Args:
        params: Function-call parameters; only ``result_callback`` is used.
    """
    recommendation = {"name": "The Golden Dragon"}
    await params.result_callback(recommendation)
# Each value is a zero-argument factory instead of a ready-made params object,
# so nothing (e.g. SileroVADAnalyzer) is constructed until the factory for the
# selected transport is actually called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble and run a Deepgram STT -> Mistral LLM -> Cartesia TTS pipeline with two tools.

    Args:
        transport: Transport whose input()/output() processors bracket the
            pipeline and on which the connect/disconnect handlers are registered.
        runner_args: Runner arguments; provides the pipeline idle timeout and
            the SIGINT-handling flag passed to the runner.
    """
    logger.info(f"Starting bot")

    speech_to_text = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    text_to_speech = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    mistral_llm = MistralLLMService(api_key=os.getenv("MISTRAL_API_KEY"))

    # Handlers are registered per function name. Registering with a name of
    # None instead routes every call to one callback, which then also
    # receives the function name.
    mistral_llm.register_function("get_current_weather", fetch_weather_from_api)
    mistral_llm.register_function(
        "get_restaurant_recommendation", fetch_restaurant_recommendation
    )

    # Schemas describing the two tools to the model.
    weather_function = FunctionSchema(
        name="get_current_weather",
        description="Get the current weather",
        properties={
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
            "format": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
                "description": "The temperature unit to use. Infer this from the user's location.",
            },
        },
        required=["location", "format"],
    )
    restaurant_function = FunctionSchema(
        name="get_restaurant_recommendation",
        description="Get a restaurant recommendation",
        properties={
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
        },
        required=["location"],
    )
    tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])

    system_message = {
        "role": "system",
        "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
    }
    messages = [system_message]

    context = OpenAILLMContext(messages, tools)
    context_aggregator = mistral_llm.create_context_aggregator(context)

    # Processors are listed in the order frames flow from user input to bot output.
    processors = [
        transport.input(),
        speech_to_text,
        context_aggregator.user(),
        mistral_llm,
        text_to_speech,
        transport.output(),
        context_aggregator.assistant(),
    ]
    pipeline = Pipeline(processors)

    task_params = PipelineParams(
        enable_metrics=True,
        enable_usage_metrics=True,
    )
    task = PipelineTask(
        pipeline,
        params=task_params,
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Queue the current context frame to start the conversation.
        await task.queue_frames([context_aggregator.user().get_context_frame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info(f"Client disconnected")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both create_transport and run_bot.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
if __name__ == "__main__":
    # The runner is only imported when this file is executed as a script.
    from pipecat.runner.run import main as run_main

    run_main()

View File

@@ -137,7 +137,7 @@ You have access to the following tools:
- get_current_weather: Get the current weather for a given location.
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
Remember, your responses should be short. Just one or two sentences, usually.""",
)
llm = OpenAIRealtimeBetaLLMService(
@@ -158,6 +158,16 @@ Remember, your responses should be short. Just one or two sentences, usually. Re
# openai WebSocket API can understand.
context = OpenAILLMContext(
[{"role": "user", "content": "Say hello!"}],
# [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
# [
# {
# "role": "user",
# "content": [
# {"type": "text", "text": "Say"},
# {"type": "text", "text": "yo what's up!"},
# ],
# }
# ],
tools,
)

View File

@@ -133,7 +133,7 @@ You have access to the following tools:
- get_current_weather: Get the current weather for a given location.
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
Remember, your responses should be short. Just one or two sentences, usually.""",
)
llm = AzureRealtimeBetaLLMService(

View File

@@ -1,229 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from datetime import datetime
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TranscriptionMessage
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai_realtime_beta import (
InputAudioNoiseReduction,
InputAudioTranscription,
OpenAIRealtimeBetaLLMService,
SemanticTurnDetection,
SessionProperties,
)
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
    """Tool handler that reports a canned weather result in the requested unit.

    The temperature is 75 when the "format" argument is "fahrenheit" and 24
    for any other value. The result also echoes the format and carries a
    timestamp of the current local time.

    Args:
        params: Function-call parameters; ``arguments["format"]`` selects the
            unit and ``result_callback`` receives the result dict.
    """
    unit = params.arguments["format"]
    if unit == "fahrenheit":
        temperature = 75
    else:
        temperature = 24

    report = {
        "conditions": "nice",
        "temperature": temperature,
        "format": unit,
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
    }
    await params.result_callback(report)
async def fetch_restaurant_recommendation(params: FunctionCallParams):
    """Tool handler that reports a fixed restaurant name through the result callback.

    Args:
        params: Function-call parameters; only ``result_callback`` is used.
    """
    recommendation = {"name": "The Golden Dragon"}
    await params.result_callback(recommendation)
# Schema for the weather tool: both the location and the temperature unit
# are required arguments.
weather_function = FunctionSchema(
    name="get_current_weather",
    description="Get the current weather",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "format": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The temperature unit to use. Infer this from the users location.",
        },
    },
    required=["location", "format"],
)

# Schema for the restaurant tool: only the location is required.
restaurant_function = FunctionSchema(
    name="get_restaurant_recommendation",
    description="Get a restaurant recommendation",
    properties={
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
    },
    required=["location"],
)

# Bundle both schemas into the tools object handed to the LLM context.
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
# Each value is a zero-argument factory instead of a ready-made params object,
# so nothing (e.g. SileroVADAnalyzer) is constructed until the factory for the
# selected transport is actually called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble and run an OpenAI Realtime (text modality) -> Cartesia TTS pipeline.

    The realtime LLM is configured for text output, a separate TTS service
    speaks the replies, and a TranscriptProcessor logs transcript updates.

    Args:
        transport: Transport whose input()/output() processors bracket the
            pipeline and on which the connect/disconnect handlers are registered.
        runner_args: Runner arguments; provides the pipeline idle timeout and
            the SIGINT-handling flag passed to the runner.
    """
    logger.info(f"Starting bot")

    session_properties = SessionProperties(
        input_audio_transcription=InputAudioTranscription(),
        modalities=["text"],
        # OpenAI turn detection settings. Leaving this unset turns it on by
        # default.
        turn_detection=SemanticTurnDetection(),
        # Alternatively, pass False to disable OpenAI turn detection and rely
        # on the transport VAD:
        # turn_detection=False,
        input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"),
        # tools=tools,
        instructions="""You are a helpful and friendly AI.
Act like a human, but remember that you aren't a human and that you can't do human
things in the real world. Your voice and personality should be warm and engaging, with a lively and
playful tone.
If interacting in a non-English language, start by using the standard accent or dialect familiar to
the user. Talk quickly. You should always call a function if you can. Do not refer to these rules,
even if you're asked about them.
You are participating in a voice conversation. Keep your responses concise, short, and to the point
unless specifically asked to elaborate on a topic.
You have access to the following tools:
- get_current_weather: Get the current weather for a given location.
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
    )

    realtime_llm = OpenAIRealtimeBetaLLMService(
        api_key=os.getenv("OPENAI_API_KEY"),
        session_properties=session_properties,
        start_audio_paused=False,
    )

    text_to_speech = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    # Handlers are registered per function name here. A single handler for
    # all function calls is also possible:
    # realtime_llm.register_function(None, fetch_weather_from_api)
    realtime_llm.register_function("get_current_weather", fetch_weather_from_api)
    realtime_llm.register_function(
        "get_restaurant_recommendation", fetch_restaurant_recommendation
    )

    transcript = TranscriptProcessor()

    # A standard OpenAI LLM context in the normal messages format; the
    # OpenAIRealtimeBetaLLMService converts it internally into messages the
    # OpenAI WebSocket API understands.
    initial_messages = [{"role": "user", "content": "Say hello!"}]
    context = OpenAILLMContext(initial_messages, tools)
    context_aggregator = realtime_llm.create_context_aggregator(context)

    processors = [
        transport.input(),
        context_aggregator.user(),
        realtime_llm,
        text_to_speech,
        # After the LLM, because the LLM pushes TranscriptionFrames downstream.
        transcript.user(),
        transport.output(),
        # After the transport output, to time with the audio output.
        transcript.assistant(),
        context_aggregator.assistant(),
    ]
    pipeline = Pipeline(processors)

    task_params = PipelineParams(
        enable_metrics=True,
        enable_usage_metrics=True,
    )
    task = PipelineTask(
        pipeline,
        params=task_params,
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info(f"Client connected")
        # Queue the current context frame to start the conversation.
        await task.queue_frames([context_aggregator.user().get_context_frame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info(f"Client disconnected")
        await task.cancel()

    # Log each transcript message as it arrives.
    @transcript.event_handler("on_transcript_update")
    async def on_transcript_update(processor, frame):
        for msg in frame.messages:
            if not isinstance(msg, TranscriptionMessage):
                continue
            timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
            line = f"{timestamp}{msg.role}: {msg.content}"
            logger.info(f"Transcript: {line}")

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Entry point compatible with Pipecat Cloud: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both create_transport and run_bot.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
if __name__ == "__main__":
    # The runner is only imported when this file is executed as a script.
    from pipecat.runner.run import main as run_main

    run_main()

View File

@@ -25,8 +25,7 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import LLMService
from pipecat.services.openai.llm import OpenAIContextAggregatorPair, OpenAILLMService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
@@ -35,76 +34,6 @@ from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
class TurnDetectionLLM(Pipeline):
def __init__(self, llm: LLMService, context_aggregator: OpenAIContextAggregatorPair):
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but it was easier as an example because we
# leverage the context aggregators.
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
statement_messages = [
{
"role": "system",
"content": "Determine if the user's statement is a complete sentence or question, ending in a natural pause or punctuation. Return 'YES' if it is complete and 'NO' if it seems to leave a thought unfinished.",
},
]
statement_context = OpenAILLMContext(statement_messages)
statement_context_aggregator = statement_llm.create_context_aggregator(statement_context)
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
async def wake_check_filter(frame):
logger.debug(f"Completeness check frame: {frame}")
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This is a filter that will wake up the notifier if the given predicate
# (wake_check_filter) returns true.
completness_check = WakeNotifierFilter(
notifier, types=(TextFrame,), filter=wake_check_filter
)
# This processor keeps the last context and will let it through once the
# notifier is woken up. We start with the gate open because we send an
# initial context frame to start the conversation.
gated_context_aggregator = GatedOpenAILLMContextAggregator(
notifier=notifier, start_open=True
)
# Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=3.0)
# The ParallelPipeline inputs are the user transcripts. We have two
# contexts. The first one will be used to determine if the user finished
# a statement and if so the notifier will be woken up. The second
# context is simply the regular context but it's gated waiting for the
# notifier to be woken up.
super().__init__(
[
ParallelPipeline(
[
statement_context_aggregator.user(),
statement_llm,
completness_check,
NullFilter(),
],
[context_aggregator.user(), gated_context_aggregator, llm],
),
user_idle,
]
)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
@@ -137,8 +66,24 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but it was easier as an example because we
# leverage the context aggregators.
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
statement_messages = [
{
"role": "system",
"content": "Determine if the user's statement is a complete sentence or question, ending in a natural pause or punctuation. Return 'YES' if it is complete and 'NO' if it seems to leave a thought unfinished.",
},
]
statement_context = OpenAILLMContext(statement_messages)
statement_context_aggregator = statement_llm.create_context_aggregator(statement_context)
# This is the regular LLM.
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
@@ -148,16 +93,53 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
]
context = OpenAILLMContext(messages)
context_aggregator = llm_main.create_context_aggregator(context)
context_aggregator = llm.create_context_aggregator(context)
# LLM + turn detection (with an extra LLM as a judge)
llm = TurnDetectionLLM(llm_main, context_aggregator)
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
async def wake_check_filter(frame):
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This is a filter that will wake up the notifier if the given predicate
# (wake_check_filter) returns true.
completness_check = WakeNotifierFilter(notifier, types=(TextFrame,), filter=wake_check_filter)
# This processor keeps the last context and will let it through once the
# notifier is woken up. We start with the gate open because we send an
# initial context frame to start the conversation.
gated_context_aggregator = GatedOpenAILLMContextAggregator(notifier=notifier, start_open=True)
# Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=3.0)
# The ParallelPipeline inputs are the user transcripts. We have two
# contexts. The first one will be used to determine if the user finished
# a statement and if so the notifier will be woken up. The second
# context is simply the regular context but it's gated waiting for the
# notifier to be woken up.
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt, # STT
llm, # LLM with turn detection
stt,
ParallelPipeline(
[
statement_context_aggregator.user(),
statement_llm,
completness_check,
NullFilter(),
],
[context_aggregator.user(), gated_context_aggregator, llm],
),
user_idle,
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses

View File

@@ -6,6 +6,7 @@
import asyncio
import os
import time
from dotenv import load_dotenv
from loguru import logger
@@ -43,14 +44,13 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams, LLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
from pipecat.utils.time import time_now_iso8601
load_dotenv(override=True)
@@ -192,75 +192,6 @@ async def fetch_weather_from_api(params: FunctionCallParams):
await params.result_callback({"conditions": "nice", "temperature": "75"})
class TurnDetectionLLM(Pipeline):
def __init__(self, llm: LLMService):
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but we have the machinery to use an LLM, so we
# might as well!
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
async def wake_check_filter(frame):
logger.debug(f"Completeness check frame: {frame}")
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This turns the LLM context into an inference request to classify the user's speech
# as complete or incomplete.
statement_judge_context_filter = StatementJudgeContextFilter()
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
completeness_check = CompletenessCheck(notifier=notifier)
# Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
# We start with the gate open because we send an initial context frame
# to start the conversation.
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
async def pass_only_llm_trigger_frames(frame):
return (
isinstance(frame, OpenAILLMContextFrame)
or isinstance(frame, StartInterruptionFrame)
or isinstance(frame, StopInterruptionFrame)
or isinstance(frame, FunctionCallInProgressFrame)
or isinstance(frame, FunctionCallResultFrame)
)
super().__init__(
[
ParallelPipeline(
[
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
# simplified context frame to the statement classifier LLM. The only frame this
# sub-pipeline will output is a UserStoppedSpeakingFrame.
statement_judge_context_filter,
statement_llm,
completeness_check,
],
[
# Block everything except frames that trigger LLM inference.
FunctionFilter(filter=pass_only_llm_trigger_frames),
llm,
bot_output_gate, # Buffer all llm/tts output until notified.
],
),
user_idle,
]
)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
@@ -293,13 +224,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but we have the machinery to use an LLM, so we might as well!
statement_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
# This is the regular LLM.
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
# You can also register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm_main.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_current_weather", fetch_weather_from_api)
@llm_main.event_handler("on_function_calls_started")
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
@@ -336,18 +272,69 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm_main.create_context_aggregator(context)
context_aggregator = llm.create_context_aggregator(context)
# LLM + turn detection (with an extra LLM as a judge)
llm = TurnDetectionLLM(llm_main)
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
async def wake_check_filter(frame):
logger.debug(f"Completeness check frame: {frame}")
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This turns the LLM context into an inference request to classify the user's speech
# as complete or incomplete.
statement_judge_context_filter = StatementJudgeContextFilter()
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
completeness_check = CompletenessCheck(notifier=notifier)
# # Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
# We start with the gate open because we send an initial context frame
# to start the conversation.
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
async def pass_only_llm_trigger_frames(frame):
return (
isinstance(frame, OpenAILLMContextFrame)
or isinstance(frame, StartInterruptionFrame)
or isinstance(frame, StopInterruptionFrame)
or isinstance(frame, FunctionCallInProgressFrame)
or isinstance(frame, FunctionCallResultFrame)
)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
ParallelPipeline(
[
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
# simplified context frame to the statement classifier LLM. The only frame this
# sub-pipeline will output is a UserStoppedSpeakingFrame.
statement_judge_context_filter,
statement_llm,
completeness_check,
],
[
# Block everything except frames that trigger LLM inference.
FunctionFilter(filter=pass_only_llm_trigger_frames),
llm,
bot_output_gate, # Buffer all llm/tts output until notified.
],
),
tts,
user_idle,
transport.output(),
context_aggregator.assistant(),
]
@@ -378,9 +365,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
await task.queue_frames(
[
UserStartedSpeakingFrame(),
TranscriptionFrame(
user_id="", timestamp=time_now_iso8601(), text=message["message"]
),
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
UserStoppedSpeakingFrame(),
]
)

View File

@@ -6,6 +6,7 @@
import asyncio
import os
import time
from dotenv import load_dotenv
from loguru import logger
@@ -44,14 +45,13 @@ from pipecat.runner.utils import create_transport
from pipecat.services.anthropic.llm import AnthropicLLMService
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams, LLMService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
from pipecat.utils.time import time_now_iso8601
load_dotenv(override=True)
@@ -391,75 +391,6 @@ class OutputGate(FrameProcessor):
break
class TurnDetectionLLM(Pipeline):
    """Pipeline that wraps an LLM with an LLM-based end-of-turn judge.

    Three branches run in parallel: a pass-through branch, a "statement
    judge" branch that classifies whether the user finished speaking, and
    the wrapped LLM whose output is held behind a gate until a notifier
    fires. A user-idle processor follows the parallel section and fires the
    same notifier from its callback.
    """

    def __init__(self, llm: LLMService):
        """Assemble the turn-detection pipeline around ``llm``.

        Args:
            llm: The conversational LLM service whose output is gated.
        """
        # Judge LLM: decides whether the user has finished a statement. An
        # NLP library could do this too, but the LLM machinery is already here.
        judge_llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))

        # Shared notifier used to synchronize the judge with the main LLM.
        turn_notifier = EventNotifier()

        # Turns the LLM context into an inference request that classifies the
        # user's speech as complete or incomplete.
        judge_context_filter = StatementJudgeContextFilter()

        # Sends a UserStoppedSpeakingFrame and triggers the notifier event.
        completeness = CompletenessCheck(notifier=turn_notifier)

        async def user_idle_notifier(frame):
            # Notify if the user hasn't said anything.
            await turn_notifier.notify()

        # The judge sometimes fails to detect a completed sentence; the idle
        # timeout wakes the notifier when that happens.
        idle_watchdog = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)

        # The gate starts open because an initial context frame is sent to
        # start the conversation.
        output_gate = OutputGate(notifier=turn_notifier, start_open=True)

        async def block_user_stopped_speaking(frame):
            if isinstance(frame, UserStoppedSpeakingFrame):
                return False
            return True

        llm_trigger_frame_types = (
            OpenAILLMContextFrame,
            StartInterruptionFrame,
            StopInterruptionFrame,
            FunctionCallInProgressFrame,
            FunctionCallResultFrame,
        )

        async def pass_only_llm_trigger_frames(frame):
            return isinstance(frame, llm_trigger_frame_types)

        # Pass everything except UserStoppedSpeaking to the elements after the
        # ParallelPipeline.
        passthrough_branch = [FunctionFilter(filter=block_user_stopped_speaking)]

        # Ignore everything except an OpenAILLMContextFrame. Pass a specially
        # constructed simplified context frame to the statement classifier
        # LLM. The only frame this branch outputs is a UserStoppedSpeakingFrame.
        judge_branch = [judge_context_filter, judge_llm, completeness]

        # Block everything except frames that trigger LLM inference, then
        # buffer all llm/tts output until notified.
        response_branch = [
            FunctionFilter(filter=pass_only_llm_trigger_frames),
            llm,
            output_gate,
        ]

        super().__init__(
            [
                ParallelPipeline(passthrough_branch, judge_branch, response_branch),
                idle_watchdog,
            ]
        )
async def fetch_weather_from_api(params: FunctionCallParams):
    """Answer a weather function call with a fixed, canned result.

    Args:
        params: Function call parameters; only ``result_callback`` is used,
            and it is awaited with the result dictionary.
    """
    canned_weather = {"conditions": "nice", "temperature": "75"}
    await params.result_callback(canned_weather)
@@ -496,13 +427,18 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# This is the LLM that will be used to detect if the user has finished a
# statement. This doesn't really need to be an LLM, we could use NLP
# libraries for that, but we have the machinery to use an LLM, so we might as well!
statement_llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
# This is the regular LLM.
llm_main = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
# Register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm_main.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_current_weather", fetch_weather_from_api)
@llm_main.event_handler("on_function_calls_started")
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
@@ -539,18 +475,76 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
]
context = OpenAILLMContext(messages, tools)
context_aggregator = llm_main.create_context_aggregator(context)
context_aggregator = llm.create_context_aggregator(context)
# LLM + turn detection (with an extra LLM as a judge)
llm = TurnDetectionLLM(llm_main)
# We have instructed the LLM to return 'YES' if it thinks the user
# completed a sentence. So, if it's 'YES' we will return true in this
# predicate which will wake up the notifier.
async def wake_check_filter(frame):
return frame.text == "YES"
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This turns the LLM context into an inference request to classify the user's speech
# as complete or incomplete.
statement_judge_context_filter = StatementJudgeContextFilter()
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
completeness_check = CompletenessCheck(notifier=notifier)
# # Notify if the user hasn't said anything.
async def user_idle_notifier(frame):
await notifier.notify()
# Sometimes the LLM will fail detecting if a user has completed a
# sentence, this will wake up the notifier if that happens.
user_idle = UserIdleProcessor(callback=user_idle_notifier, timeout=5.0)
# We start with the gate open because we send an initial context frame
# to start the conversation.
bot_output_gate = OutputGate(notifier=notifier, start_open=True)
async def block_user_stopped_speaking(frame):
return not isinstance(frame, UserStoppedSpeakingFrame)
async def pass_only_llm_trigger_frames(frame):
return (
isinstance(frame, OpenAILLMContextFrame)
or isinstance(frame, StartInterruptionFrame)
or isinstance(frame, StopInterruptionFrame)
or isinstance(frame, FunctionCallInProgressFrame)
or isinstance(frame, FunctionCallResultFrame)
)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
ParallelPipeline(
[
# Pass everything except UserStoppedSpeaking to the elements after
# this ParallelPipeline
FunctionFilter(filter=block_user_stopped_speaking),
],
[
# Ignore everything except an OpenAILLMContextFrame. Pass a specially constructed
# simplified context frame to the statement classifier LLM. The only frame this
# sub-pipeline will output is a UserStoppedSpeakingFrame.
statement_judge_context_filter,
statement_llm,
completeness_check,
],
[
# Block everything except frames that trigger LLM inference.
FunctionFilter(filter=pass_only_llm_trigger_frames),
llm,
bot_output_gate, # Buffer all llm/tts output until notified.
],
),
tts,
user_idle,
transport.output(),
context_aggregator.assistant(),
]
@@ -586,9 +580,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
await task.queue_frames(
[
UserStartedSpeakingFrame(),
TranscriptionFrame(
user_id="", timestamp=time_now_iso8601(), text=message["message"]
),
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
UserStoppedSpeakingFrame(),
]
)

View File

@@ -47,13 +47,11 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.google.llm import GoogleLLMContext, GoogleLLMService
from pipecat.services.llm_service import LLMService
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.sync.event_notifier import EventNotifier
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
from pipecat.utils.time import time_now_iso8601
load_dotenv(override=True)
@@ -609,90 +607,23 @@ class OutputGate(FrameProcessor):
self._gate_task = None
async def _gate_task_handler(self):
await self._notifier.wait()
while True:
try:
await self._notifier.wait()
transcription = await self._transcription_buffer.wait_for_transcription() or "-"
self._context.add_message(Content(role="user", parts=[Part(text=transcription)]))
transcription = await self._transcription_buffer.wait_for_transcription() or "-"
self._context.add_message(Content(role="user", parts=[Part(text=transcription)]))
self.open_gate()
for frame, direction in self._frames_buffer:
await self.push_frame(frame, direction)
self._frames_buffer = []
class TurnDetectionLLM(Pipeline):
    """Pipeline that gates a conversational LLM behind audio turn detection.

    Incoming frames pass through an audio accumulator and then fan out to
    three parallel branches: a pass-through branch, an analysis branch (a
    classifier LLM and a transcriber LLM, themselves in parallel), and the
    conversational LLM whose output is buffered by an output gate.
    """

    def __init__(self, llm: LLMService, context: OpenAILLMContext):
        """Assemble the turn-detection pipeline around ``llm``.

        Args:
            llm: The conversational LLM service whose output is gated.
            context: Conversation context handed to the audio context
                assembler and to the output gate.
        """
        google_api_key = os.getenv("GOOGLE_API_KEY")

        # This is the LLM that will transcribe user speech.
        transcriber = GoogleLLMService(
            name="Transcriber",
            model=TRANSCRIBER_MODEL,
            api_key=google_api_key,
            temperature=0.0,
            system_instruction=transcriber_system_instruction,
        )

        # This is the LLM that will classify user speech as complete or
        # incomplete.
        classifier = GoogleLLMService(
            name="Classifier",
            model=CLASSIFIER_MODEL,
            api_key=google_api_key,
            temperature=0.0,
            system_instruction=classifier_system_instruction,
        )

        # Notifier used to synchronize the LLMs.
        turn_notifier = EventNotifier()

        accumulator = AudioAccumulator()

        # Sends a UserStoppedSpeakingFrame and triggers the notifier event.
        completeness = CompletenessCheck(
            notifier=turn_notifier, audio_accumulator=accumulator
        )

        async def block_user_stopped_speaking(frame):
            if isinstance(frame, UserStoppedSpeakingFrame):
                return False
            return True

        context_assembler = ConversationAudioContextAssembler(context=context)
        transcription_buffer = LLMAggregatorBuffer()
        output_gate = OutputGate(
            notifier=turn_notifier,
            context=context,
            llm_transcription_buffer=transcription_buffer,
        )

        # Pass everything except UserStoppedSpeaking to the elements after the
        # ParallelPipeline.
        passthrough_branch = [FunctionFilter(filter=block_user_stopped_speaking)]

        # Classification and transcription run side by side.
        analysis_branch = [
            ParallelPipeline(
                [classifier, completeness],
                [transcriber, transcription_buffer],
            )
        ]

        # Buffer output until notified, then flush frames and update context.
        response_branch = [context_assembler, llm, output_gate]

        super().__init__(
            [
                accumulator,
                ParallelPipeline(passthrough_branch, analysis_branch, response_branch),
            ]
        )
self.open_gate()
for frame, direction in self._frames_buffer:
await self.push_frame(frame, direction)
self._frames_buffer = []
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"OutputGate error: {e}")
raise e
break
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
@@ -725,6 +656,24 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# This is the LLM that will transcribe user speech.
tx_llm = GoogleLLMService(
name="Transcriber",
model=TRANSCRIBER_MODEL,
api_key=os.getenv("GOOGLE_API_KEY"),
temperature=0.0,
system_instruction=transcriber_system_instruction,
)
# This is the LLM that will classify user speech as complete or incomplete.
classifier_llm = GoogleLLMService(
name="Classifier",
model=CLASSIFIER_MODEL,
api_key=os.getenv("GOOGLE_API_KEY"),
temperature=0.0,
system_instruction=classifier_system_instruction,
)
# This is the regular LLM that responds conversationally.
conversation_llm = GoogleLLMService(
name="Conversation",
@@ -736,12 +685,57 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
context = OpenAILLMContext()
context_aggregator = conversation_llm.create_context_aggregator(context)
llm = TurnDetectionLLM(conversation_llm, context)
# This is a notifier that we use to synchronize the two LLMs.
notifier = EventNotifier()
# This turns the LLM context into an inference request to classify the user's speech
# as complete or incomplete.
# statement_judge_context_filter = StatementJudgeAudioContextAccumulator(notifier=notifier)
audio_accumulater = AudioAccumulator()
# This sends a UserStoppedSpeakingFrame and triggers the notifier event
completeness_check = CompletenessCheck(notifier=notifier, audio_accumulator=audio_accumulater)
async def block_user_stopped_speaking(frame):
return not isinstance(frame, UserStoppedSpeakingFrame)
conversation_audio_context_assembler = ConversationAudioContextAssembler(context=context)
llm_aggregator_buffer = LLMAggregatorBuffer()
bot_output_gate = OutputGate(
notifier=notifier, context=context, llm_transcription_buffer=llm_aggregator_buffer
)
pipeline = Pipeline(
[
transport.input(),
llm,
audio_accumulater,
ParallelPipeline(
[
# Pass everything except UserStoppedSpeaking to the elements after
# this ParallelPipeline
FunctionFilter(filter=block_user_stopped_speaking),
],
[
ParallelPipeline(
[
classifier_llm,
completeness_check,
],
[
tx_llm,
llm_aggregator_buffer,
],
)
],
[
conversation_audio_context_assembler,
conversation_llm,
bot_output_gate, # buffer output until notified, then flush frames and update context
# TempPrinter(),
],
),
tts,
transport.output(),
context_aggregator.assistant(),
@@ -772,9 +766,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
await task.queue_frames(
[
UserStartedSpeakingFrame(),
TranscriptionFrame(
user_id="", timestamp=time_now_iso8601(), text=message["message"]
),
TranscriptionFrame(user_id="", timestamp=time.time(), text=message["message"]),
UserStoppedSpeakingFrame(),
]
)

View File

@@ -1,139 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.extensions.voicemail.voicemail_detector import VoicemailDetector
from pipecat.frames.frames import EndTaskFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
def _audio_vad_kwargs():
    """Build the keyword arguments shared by every transport's params.

    A new SileroVADAnalyzer is created on each call, so nothing is
    instantiated until a transport factory is actually invoked.
    """
    return dict(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    )


# Maps a transport name to a zero-argument factory for its params object.
transport_params = {
    "daily": lambda: DailyParams(**_audio_vad_kwargs()),
    "twilio": lambda: FastAPIWebsocketParams(**_audio_vad_kwargs()),
    "webrtc": lambda: TransportParams(**_audio_vad_kwargs()),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble and run the voicemail-detection example pipeline.

    Args:
        transport: Transport that supplies the pipeline's input and output
            processors and the client connect/disconnect events.
        runner_args: Runner settings; provides the pipeline idle timeout and
            the SIGINT-handling flag.
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    # The voicemail detector gets its own LLM instance, separate from the
    # conversational one.
    classifier_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
    voicemail = VoicemailDetector(llm=classifier_llm)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = OpenAILLMContext(messages)
    context_aggregator = llm.create_context_aggregator(context)

    pipeline = Pipeline(
        [
            transport.input(),
            stt,
            voicemail.detector(),  # Voicemail detection — between STT and User context aggregator
            context_aggregator.user(),
            llm,
            tts,
            voicemail.gate(),  # TTS gating — Immediately after the TTS service
            transport.output(),
            context_aggregator.assistant(),
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Stop the pipeline once the client is gone.
        await task.cancel()

    @voicemail.event_handler("on_voicemail_detected")
    async def handle_voicemail(processor):
        logger.info("Voicemail detected! Leaving a message...")

        # Push frames using standard Pipecat pattern
        await processor.push_frame(
            TTSSpeakFrame(
                "Hello, this is Jamie calling about your appointment. Please call me back at 555-0123 when you get this."
            )
        )

        # NOTE: A common pattern is to end pipeline after the voicemail is left.
        # Uncomment the following line to end the pipeline after leaving the voicemail.
        # await processor.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Pipecat Cloud-compatible entry point: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both the transport factory
            and ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
# Script entry point: when this file is executed directly, hand control to
# the Pipecat runner's main(). pipecat.runner.run is imported only on this
# path, not when the module is imported.
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -1,192 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from typing import override
from dotenv import load_dotenv
from loguru import logger
from openai.types.chat import ChatCompletionMessageParam
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, LLMTextFrame
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import (
OpenAILLMContext,
OpenAILLMContextFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
def _audio_vad_kwargs():
    """Build the keyword arguments shared by every transport's params.

    A new SileroVADAnalyzer is created on each call, so nothing is
    instantiated until a transport factory is actually invoked.
    """
    return dict(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    )


# Maps a transport name to a zero-argument factory for its params object.
transport_params = {
    "daily": lambda: DailyParams(**_audio_vad_kwargs()),
    "twilio": lambda: FastAPIWebsocketParams(**_audio_vad_kwargs()),
    "webrtc": lambda: TransportParams(**_audio_vad_kwargs()),
}
class LLMRaceProcessor(FrameProcessor):
    """Arbitrates a race between LLMs: only the first responder's text gets through.

    One instance sits after each competing LLM. The winner is recorded in
    class-level attributes, so every instance sees the same race outcome.
    """

    # Shared race state, stored on the class so all instances observe it.
    # NOTE(review): nothing in this class resets these once a winner is
    # recorded, so the race appears to be decided once for the lifetime of
    # the process rather than per turn -- confirm this is intended.
    winning_llm_name = None
    response_started = False

    def __init__(self) -> None:
        super().__init__()
        # Label of the LLM feeding this instance; assigned via set_llm_name().
        self._current_llm_name = None

    def set_llm_name(self, name: str):
        """Record which LLM this processor instance is attached to."""
        self._current_llm_name = name

    # NOTE(review): typing.override was added in Python 3.12 -- confirm the
    # minimum Python version this example must run on.
    @override
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward frames, dropping LLM text that comes from a losing LLM.

        Args:
            frame: The frame to process.
            direction: Direction the frame is travelling; forwarded unchanged.
        """
        # The base class must see every frame first (StartFrame and other
        # system frames are handled there).
        await super().process_frame(frame, direction)

        # Anything that is not LLM text (including system frames) passes
        # straight through.
        if not isinstance(frame, LLMTextFrame):
            await self.push_frame(frame, direction)
            return

        race = LLMRaceProcessor
        if not race.response_started:
            # First text frame seen by any instance decides the winner.
            race.winning_llm_name = self._current_llm_name
            race.response_started = True
            logger.info(
                f"🏆 [LLM_RACE] {self._current_llm_name} wins the race! Text: '{frame.text}'"
            )
        elif race.winning_llm_name == self._current_llm_name:
            # The winner keeps streaming.
            logger.info(f"✅ [LLM_RACE] {self._current_llm_name} continuing: '{frame.text}'")
        else:
            # Text from the losing LLM is logged and discarded.
            logger.info(
                f"❌ [LLM_RACE] Dropping '{frame.text}' from losing LLM: {self._current_llm_name}"
            )
            return

        await self.push_frame(frame, direction)
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Assemble and run the example pipeline that races two LLMs in parallel.

    Args:
        transport: Transport that supplies the pipeline's input and output
            processors and the client connect/disconnect events.
        runner_args: Runner settings; provides the pipeline idle timeout and
            the SIGINT-handling flag.
    """
    logger.info("Starting bot with parallel LLM racing")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY", ""))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY", ""),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    # Create two LLM instances for racing
    llm1 = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
    llm2 = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    messages: list[ChatCompletionMessageParam] = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    # Create shared context for both LLMs
    context = OpenAILLMContext(messages)
    context_aggregator = llm1.create_context_aggregator(context)

    # Make sure both LLMs share the same context - they should both process context frames
    # In a ParallelPipeline, the context frames will be duplicated to both branches

    # Create separate race processors for each LLM to track which one responds first
    race_processor1 = LLMRaceProcessor()
    race_processor1.set_llm_name("LLM1")

    race_processor2 = LLMRaceProcessor()
    race_processor2.set_llm_name("LLM2")

    # Create parallel LLM branches using ParallelPipeline
    parallel_llms = ParallelPipeline(
        [llm1, race_processor1],  # Branch 1: LLM1 -> race processor 1
        [llm2, race_processor2],  # Branch 2: LLM2 -> race processor 2
    )

    # Simple pipeline with parallel LLM processing
    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,  # Speech to text
            context_aggregator.user(),  # User responses (creates context frames for LLMs)
            parallel_llms,  # Parallel LLM processing
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        # observers=[debug_observer, llm_observer],
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Use a simpler approach - add message to context and push a context frame
        # (this appends to the same `messages` list on every client connection).
        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
        # Create a new context with the updated messages
        updated_context = OpenAILLMContext(messages)
        await task.queue_frames([OpenAILLMContextFrame(context=updated_context)])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Stop the pipeline once the client is gone.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
async def bot(runner_args: RunnerArguments):
    """Pipecat Cloud-compatible entry point: create the transport, then run the bot.

    Args:
        runner_args: Runner arguments forwarded to both the transport factory
            and ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params), runner_args)
# Script entry point: when this file is executed directly, hand control to
# the Pipecat runner's main(). pipecat.runner.run is imported only on this
# path, not when the module is imported.
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -4,32 +4,40 @@ This directory contains examples showing how to build voice and multimodal agent
## Setup
1. Follow the [README](../../README.md#%EF%B8%8F-contributing-to-the-framework) steps to get your local environment configured.
1. Make sure you have uv installed:
> **Run from root directory**: Make sure you are running the steps from the root directory.
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
> **Using local audio?**: The `LocalAudioTransport` requires a system dependency for `portaudio`. Install the dependency to use the transport.
> **Need help?** Refer to the [uv install documentation](https://docs.astral.sh/uv/getting-started/installation/).
2. Copy the [`env.example`](../../env.example) file and add API keys for services you plan to use:
2. Create a venv and install example dependencies:
```bash
uv sync --all-extras --no-extra krisp
```
3. Create a `.env` file with your API keys:
```bash
cp env.example .env
# Edit .env with your API keys
```
3. Navigate to the examples directory if you aren't already there:
4. Navigate to the examples directory:
```bash
cd examples/foundational
```
4. Run any example:
5. Run any example:
```bash
uv run python 01-say-one-thing.py
```
5. Open the web interface at http://localhost:7860/client/ and click "Connect"
6. Open the web interface at http://localhost:7860/client/ and click "Connect"
## Running examples with other transports

View File

@@ -1,16 +0,0 @@
FROM dailyco/pipecat-base:latest
# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1
# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy
# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --locked --no-install-project --no-dev
# Copy the application code
COPY ./bot.py bot.py

View File

@@ -1,159 +1,87 @@
# Pipecat Quickstart
Build and deploy your first voice AI bot in under 10 minutes. Develop locally, then scale to production on Pipecat Cloud.
Run your first Pipecat bot in under 5 minutes. This example creates a voice AI bot that you can talk to in your browser.
**Two steps**: [🏠 Local Development](#run-your-bot-locally) → [☁️ Production Deployment](#deploy-to-production)
## Prerequisites
> 🎯 Quick start: Local bot in 5 minutes, production deployment in 5 more
### Python 3.10+
## Step 1: Local Development (5 min)
### Prerequisites
#### Environment
- Python 3.10 or later
- [uv](https://docs.astral.sh/uv/getting-started/installation/) package manager installed
#### AI Service API keys
You'll need API keys from three services:
- [Deepgram](https://console.deepgram.com/signup) for Speech-to-Text
- [OpenAI](https://auth.openai.com/create-account) for LLM inference
- [Cartesia](https://play.cartesia.ai/sign-up) for Text-to-Speech
> 💡 **Tip**: Sign up for all three now. You'll need them for both local and cloud deployment.
### Setup
Navigate to the quickstart directory and set up your environment.
1. Install dependencies:
```bash
uv sync
```
2. Configure your API keys:
Create a `.env` file:
```bash
cp env.example .env
```
Then, add your API keys:
```ini
DEEPGRAM_API_KEY=your_deepgram_api_key
OPENAI_API_KEY=your_openai_api_key
CARTESIA_API_KEY=your_cartesia_api_key
```
### Run your bot locally
Pipecat requires Python 3.10 or newer. Check your version:
```bash
uv run bot.py
python --version
```
If you need to upgrade Python, we recommend using a version manager like `uv` or `pyenv`.
### AI Service API keys
Pipecat orchestrates different AI services in a pipeline, ensuring low latency communication. In this quickstart example, we'll use:
- [Deepgram](https://console.deepgram.com/signup) for Speech-to-Text transcriptions
- [OpenAI](https://auth.openai.com/create-account) for LLM inference
- [Cartesia](https://play.cartesia.ai/sign-up) for Text-to-Speech audio generation
Have your API keys ready. We'll add them to your `.env` shortly.
## Setup
1. Set up a virtual environment
From the `examples/quickstart` directory, run:
```bash
python -m venv .venv
source .venv/bin/activate # On Windows: .venv\Scripts\activate
```
> Using `uv`? Create your venv using: `uv venv && source .venv/bin/activate`.
2. Install dependencies
```bash
pip install -r requirements.txt
```
> Using `uv`? Install requirements using: `uv pip install -r requirements.txt`.
3. Configure environment variables
Create a `.env` file:
```bash
cp env.example .env
```
Then, add your API keys:
```
DEEPGRAM_API_KEY=your_deepgram_api_key
OPENAI_API_KEY=your_openai_api_key
CARTESIA_API_KEY=your_cartesia_api_key
```
4. Run the example
Run your bot using:
```bash
python bot.py
```
> Using `uv`? Run your bot using: `uv run bot.py`.
**Open http://localhost:7860 in your browser** and click `Connect` to start talking to your bot.
> 💡 First run note: The initial startup may take ~20 seconds as Pipecat downloads required models and imports.
> 💡 First run note: The initial startup may take ~10 seconds as Pipecat downloads required models, like the Silero VAD model.
🎉 **Success!** Your bot is running locally. Now let's deploy it to production so others can use it.
## Troubleshooting
---
- **Browser permissions**: Make sure to allow microphone access when prompted by your browser.
- **Connection issues**: If the WebRTC connection fails, first try a different browser. If that fails, make sure you don't have a VPN or firewall rules blocking traffic. WebRTC uses UDP to communicate.
- **Audio issues**: Check that your microphone and speakers are working and not muted.
## Step 2: Deploy to Production (5 min)
## Next Steps
Transform your local bot into a production-ready service. Pipecat Cloud handles scaling, monitoring, and global deployment.
### Prerequisites
1. [Sign up for Pipecat Cloud](https://pipecat.daily.co/sign-up).
2. Install the Pipecat Cloud CLI:
```bash
uv add pipecatcloud
```
> 💡 Tip: You can run the `pipecatcloud` CLI using the `pcc` alias.
3. Set up Docker for building your bot image:
- **Install [Docker](https://www.docker.com/)** on your system
- **Create a [Docker Hub](https://hub.docker.com/) account**
- **Log in to Docker Hub:**
```bash
docker login
```
### Configure your deployment
The `pcc-deploy.toml` file tells Pipecat Cloud how to run your bot. **Update the image field** with your Docker Hub username by editing `pcc-deploy.toml`.
```ini
agent_name = "quickstart"
image = "YOUR_DOCKERHUB_USERNAME/quickstart:0.1" # 👈 Update this line
secret_set = "quickstart-secrets"
[scaling]
min_agents = 1
```
**Understanding the TOML file settings:**
- `agent_name`: Your bot's name in Pipecat Cloud
- `image`: The Docker image to deploy (format: `username/image:version`)
- `secret_set`: Where your API keys are stored securely
- `min_agents`: Number of bot instances to keep ready (1 = instant start)
> 💡 Tip: [Set up `image_credentials`](https://docs.pipecat.ai/deployment/pipecat-cloud/fundamentals/secrets#image-pull-secrets) in your TOML file for authenticated image pulls
### Configure secrets
Upload your API keys to Pipecat Cloud's secure storage:
```bash
uv run pcc secrets set quickstart-secrets --file .env
```
This creates a secret set called `quickstart-secrets` (matching your TOML file) and uploads all your API keys from `.env`.
### Build and deploy
Build your Docker image and push to Docker Hub:
```bash
# Update build.sh with your Docker Hub username, then:
./build.sh
```
Deploy to Pipecat Cloud:
```bash
uv run pcc deploy
```
### Connect to your agent
1. Open your [Pipecat Cloud dashboard](https://pipecat.daily.co/)
2. Select your `quickstart` agent → **Sandbox**
3. Allow microphone access and click **Connect**
---
## What's Next?
**🔧 Customize your bot**: Modify `bot.py` to change personality, add functions, or integrate with your data
**📚 Learn more**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for advanced features
**💬 Get help**: Join [Pipecat's Discord](https://discord.gg/pipecat) to connect with the community
### Troubleshooting
- **Browser permissions**: Allow microphone access when prompted
- **Connection issues**: Try a different browser or check VPN/firewall settings
- **Audio issues**: Verify microphone and speakers are working and not muted
- **Read the docs**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for guides and reference information.
- **Join Discord**: Join [Pipecat's Discord server](https://discord.gg/pipecat) to get help and learn about what others are building.

View File

@@ -7,16 +7,18 @@
"""Pipecat Quickstart Example.
The example runs a simple voice AI bot that you can connect to using your
browser and speak with it. You can also deploy this bot to Pipecat Cloud.
browser and speak with it.
Required AI services:
- Deepgram (Speech-to-Text)
- OpenAI (LLM)
- Cartesia (Text-to-Speech)
The example connects between client and server using a P2P WebRTC connection.
Run the bot using::
uv run bot.py
python bot.py
"""
import os
@@ -25,7 +27,7 @@ from dotenv import load_dotenv
from loguru import logger
print("🚀 Starting Pipecat bot...")
print("⏳ Loading models and imports (20 seconds first run only)\n")
print("⏳ Loading AI models (30-40 seconds first run, <2 seconds after)\n")
logger.info("Loading Silero VAD model...")
from pipecat.audio.vad.silero import SileroVADAnalyzer
@@ -38,12 +40,15 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.services.daily import DailyParams
logger.info("✅ Pipeline components loaded")
logger.info("Loading WebRTC transport...")
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
logger.info("✅ All components loaded successfully!")
@@ -116,20 +121,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
async def bot(runner_args: RunnerArguments):
"""Main bot entry point for the bot starter."""
transport_params = {
"daily": lambda: DailyParams(
transport = SmallWebRTCTransport(
params=TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
transport = await create_transport(runner_args, transport_params)
webrtc_connection=runner_args.webrtc_connection,
)
await run_bot(transport, runner_args)

View File

@@ -1,19 +0,0 @@
#!/bin/bash
set -e
VERSION="0.1"
DOCKER_USERNAME="your_username"
AGENT_NAME="quickstart"
# Build the Docker image with the correct context
echo "Building Docker image..."
docker build --platform=linux/arm64 -t "$DOCKER_USERNAME/$AGENT_NAME:$VERSION" -t "$DOCKER_USERNAME/$AGENT_NAME:latest" .
# Push the Docker images
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:$VERSION..."
docker push "$DOCKER_USERNAME/$AGENT_NAME:$VERSION"
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:latest..."
docker push "$DOCKER_USERNAME/$AGENT_NAME:latest"
echo "Successfully built and pushed $DOCKER_USERNAME/$AGENT_NAME:$VERSION and $DOCKER_USERNAME/$AGENT_NAME:latest"

View File

@@ -1,6 +1,3 @@
DEEPGRAM_API_KEY=your_deepgram_api_key
OPENAI_API_KEY=your_openai_api_key
CARTESIA_API_KEY=your_cartesia_api_key
# Optional: Connect via Daily WebRTC locally
DAILY_API_KEY=your_daily_api_key
CARTESIA_API_KEY=your_cartesia_api_key

View File

@@ -1,6 +0,0 @@
agent_name = "quickstart"
image = "your_username/quickstart:0.1"
secret_set = "quickstart-secrets"
[scaling]
min_agents = 1

View File

@@ -1,19 +0,0 @@
[project]
name = "pipecat-quickstart"
version = "0.1.0"
description = "Quickstart example for building voice AI bots with Pipecat"
requires-python = ">=3.10"
dependencies = [
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,runner]>=0.0.79",
"pipecatcloud>=0.2.3"
]
[dependency-groups]
dev = [
"ruff~=0.12.1",
]
[tool.ruff]
line-length = 100
[tool.ruff.lint]
select = ["I"]

View File

@@ -0,0 +1 @@
pipecat-ai[webrtc,silero,deepgram,openai,cartesia,runner]>=0.0.77

File diff suppressed because it is too large Load Diff

View File

@@ -34,9 +34,8 @@ dependencies = [
"resampy~=0.4.3",
"soxr~=0.5.0",
"openai>=1.74.0,<=1.99.1",
# Pinning numba to resolve package dependencies
# Pinning numba (resampy dep) to resolve a package dependency
"numba==0.61.2",
"wait_for2>=0.4.1; python_version<'3.12'",
]
[project.urls]
@@ -53,7 +52,7 @@ azure = [ "azure-cognitiveservices-speech~=1.42.0"]
cartesia = [ "cartesia~=2.0.3", "websockets>=13.1,<15.0" ]
cerebras = []
deepseek = []
daily = [ "daily-python~=0.19.7" ]
daily = [ "daily-python~=0.19.6" ]
deepgram = [ "deepgram-sdk~=4.7.0" ]
elevenlabs = [ "websockets>=13.1,<15.0" ]
fal = [ "fal-client~=0.5.9" ]
@@ -74,9 +73,8 @@ lmnt = [ "websockets>=13.1,<15.0" ]
local = [ "pyaudio~=0.2.14" ]
mcp = [ "mcp[cli]~=1.9.4" ]
mem0 = [ "mem0ai~=0.1.94" ]
mistral = []
mlx-whisper = [ "mlx-whisper~=0.4.2" ]
moondream = [ "accelerate~=1.10.0", "einops~=0.8.0", "pyvips[binary]~=3.0.0", "timm~=1.0.13", "transformers>=4.48.0" ]
moondream = [ "einops~=0.8.0", "timm~=1.0.13", "transformers>=4.48.0" ]
nim = []
neuphonic = [ "websockets>=13.1,<15.0" ]
noisereduce = [ "noisereduce~=3.0.3" ]
@@ -84,7 +82,7 @@ openai = [ "websockets>=13.1,<15.0" ]
openpipe = [ "openpipe~=4.50.0" ]
openrouter = []
perplexity = []
playht = [ "websockets>=13.1,<15.0" ]
playht = [ "pyht>=0.1.6", "websockets>=13.1,<15.0" ]
qwen = []
rime = [ "websockets>=13.1,<15.0" ]
riva = [ "nvidia-riva-client~=2.21.1" ]
@@ -115,7 +113,7 @@ dev = [
"pre-commit~=4.2.0",
"pyright~=1.1.402",
"pytest~=8.4.1",
"pytest-asyncio~=1.1.0",
"pytest-asyncio~=1.0.0",
"pytest-aiohttp==1.1.0",
"ruff~=0.12.1",
"setuptools~=78.1.1",
@@ -126,7 +124,7 @@ dev = [
docs = [
"sphinx>=8.1.3",
"sphinx-rtd-theme",
"sphinx-markdown-builder",
"sphinx-markdown-builder",
"sphinx-autodoc-typehints",
"toml",
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 63 KiB

View File

@@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import io
import os
@@ -12,12 +13,11 @@ import time
import wave
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple
from typing import List, Optional
import aiofiles
from deepgram import LiveOptions
from loguru import logger
from PIL.ImageFile import ImageFile
from utils import (
EvalResult,
load_module_from_path,
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
from pipecat.frames.frames import EndTaskFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,8 +49,6 @@ SCRIPT_DIR = Path(__file__).resolve().parent
PIPELINE_IDLE_TIMEOUT_SECS = 60
EVAL_TIMEOUT_SECS = 90
EvalPrompt = str | Tuple[str, ImageFile]
class EvalRunner:
def __init__(
@@ -89,13 +87,7 @@ class EvalRunner:
async def assert_eval_false(self):
await self._queue.put(False)
async def run_eval(
self,
example_file: str,
prompt: EvalPrompt,
eval: str,
user_speaks_first: bool = False,
):
async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
if not re.match(self._pattern, example_file):
return
@@ -112,9 +104,7 @@ class EvalRunner:
try:
tasks = [
asyncio.create_task(run_example_pipeline(script_path)),
asyncio.create_task(
run_eval_pipeline(self, example_file, prompt, eval, user_speaks_first)
),
asyncio.create_task(run_eval_pipeline(self, example_file, prompt, eval)),
]
_, pending = await asyncio.wait(tasks, timeout=EVAL_TIMEOUT_SECS)
if pending:
@@ -188,7 +178,6 @@ async def run_example_pipeline(script_path: Path):
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_in_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
@@ -200,11 +189,7 @@ async def run_example_pipeline(script_path: Path):
async def run_eval_pipeline(
eval_runner: EvalRunner,
example_file: str,
prompt: EvalPrompt,
eval: str,
user_speaks_first: bool = False,
eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
):
logger.info(f"Starting eval bot")
@@ -217,7 +202,6 @@ async def run_eval_pipeline(
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
),
)
@@ -226,15 +210,12 @@ async def run_eval_pipeline(
# 5" (in audio) this can be converted to "32 is 5".
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
live_options=LiveOptions(
language="multi",
smart_format=False,
),
live_options=LiveOptions(smart_format=False),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="97f4b8fb-f2fe-444b-bb9a-c109783a857a", # Nathan
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
@@ -258,25 +239,15 @@ async def run_eval_pipeline(
)
tools = ToolsSchema(standard_tools=[eval_function])
# Load example prompt depending on image.
example_prompt = ""
example_image: Optional[ImageFile] = None
if isinstance(prompt, str):
example_prompt = prompt
elif isinstance(prompt, tuple):
example_prompt, example_image = prompt
eval_prompt = f"The answer is correct if it's appropriate for the context and matches: {eval}."
common_system_prompt = f"Call the eval function with your assessment only if the user answers the question. {eval_prompt}"
if user_speaks_first:
system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
else:
system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}"
# See if we need to include an eval prompt.
eval_prompt = ""
if eval:
eval_prompt = f"The answer is correct if the user says [{eval}]."
messages = [
{
"role": "system",
"content": system_prompt,
"content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
},
]
@@ -314,24 +285,8 @@ async def run_eval_pipeline(
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
if example_image:
await task.queue_frame(
OutputImageRawFrame(
image=example_image.tobytes(),
size=example_image.size,
format="RGB",
)
)
await audio_buffer.start_recording()
# Default behavior is for the bot to speak first
# If the eval bot speaks first, we append the prompt to the messages
if user_speaks_first:
messages.append(
{"role": "user", "content": f"Start by saying this exactly: '{prompt}'"}
)
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@@ -341,8 +296,6 @@ async def run_eval_pipeline(
async def on_pipeline_idle_timeout(task):
await eval_runner.assert_eval_false()
# TODO(aleix): We should handle SIGINT and SIGTERM so we can cancel both the
# eval and the example.
runner = PipelineRunner(handle_sigint=False)
runner = PipelineRunner()
await runner.run(task)

View File

@@ -13,24 +13,17 @@ from pathlib import Path
from dotenv import load_dotenv
from eval import EvalRunner
from loguru import logger
from PIL import Image
from utils import check_env_variables
load_dotenv(override=True)
SCRIPT_DIR = Path(__file__).resolve().parent
ASSETS_DIR = SCRIPT_DIR / "assets"
FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
# Speaking order constants
USER_SPEAKS_FIRST = True
BOT_SPEAKS_FIRST = False
# Math
PROMPT_SIMPLE_MATH = "A simple math addition."
EVAL_SIMPLE_MATH = "Correct math addition."
# Weather
PROMPT_WEATHER = "What's the weather in San Francisco?"
@@ -42,173 +35,112 @@ EVAL_WEATHER = (
PROMPT_ONLINE_SEARCH = "What's the date right now in London?"
EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y')}."
# Switch language
PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
EVAL_SWITCH_LANGUAGE = "The user is now talking in Spanish."
# Vision
PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
EVAL_VISION = "A cat description."
# Voicemail
PROMPT_VOICEMAIL = "Please leave a message after the beep."
EVAL_VOICEMAIL = "Assess the conversation and determine if it is a voicemail."
PROMPT_CONVERSATION = "Hello, this is Mark."
EVAL_CONVERSATION = "A start of a conversation, not a voicemail."
TESTS_07 = [
# 07 series
("07-interruptible.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"07d-interruptible-elevenlabs-http.py",
PROMPT_SIMPLE_MATH,
EVAL_SIMPLE_MATH,
BOT_SPEAKS_FIRST,
),
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07n-interruptible-gemini.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"07s-interruptible-google-audio-in.py",
PROMPT_SIMPLE_MATH,
EVAL_SIMPLE_MATH,
BOT_SPEAKS_FIRST,
),
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
("07-interruptible-cartesia-http.py", PROMPT_SIMPLE_MATH, None),
("07a-interruptible-speechmatics.py", PROMPT_SIMPLE_MATH, None),
("07aa-interruptible-soniox.py", PROMPT_SIMPLE_MATH, None),
("07ab-interruptible-inworld-http.py", PROMPT_SIMPLE_MATH, None),
("07ac-interruptible-asyncai.py", PROMPT_SIMPLE_MATH, None),
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, None),
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, None),
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, None),
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, None),
("07d-interruptible-elevenlabs-http.py", PROMPT_SIMPLE_MATH, None),
("07e-interruptible-playht.py", PROMPT_SIMPLE_MATH, None),
("07e-interruptible-playht-http.py", PROMPT_SIMPLE_MATH, None),
("07f-interruptible-azure.py", PROMPT_SIMPLE_MATH, None),
("07g-interruptible-openai.py", PROMPT_SIMPLE_MATH, None),
("07h-interruptible-openpipe.py", PROMPT_SIMPLE_MATH, None),
("07j-interruptible-gladia.py", PROMPT_SIMPLE_MATH, None),
("07k-interruptible-lmnt.py", PROMPT_SIMPLE_MATH, None),
("07l-interruptible-groq.py", PROMPT_SIMPLE_MATH, None),
("07m-interruptible-aws.py", PROMPT_SIMPLE_MATH, None),
("07n-interruptible-google.py", PROMPT_SIMPLE_MATH, None),
("07o-interruptible-assemblyai.py", PROMPT_SIMPLE_MATH, None),
("07q-interruptible-rime.py", PROMPT_SIMPLE_MATH, None),
("07q-interruptible-rime-http.py", PROMPT_SIMPLE_MATH, None),
("07r-interruptible-riva-nim.py", PROMPT_SIMPLE_MATH, None),
("07s-interruptible-google-audio-in.py", PROMPT_SIMPLE_MATH, None),
("07t-interruptible-fish.py", PROMPT_SIMPLE_MATH, None),
("07v-interruptible-neuphonic.py", PROMPT_SIMPLE_MATH, None),
("07v-interruptible-neuphonic-http.py", PROMPT_SIMPLE_MATH, None),
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, None),
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, None),
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, None),
# Needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, None),
# Needs a Krisp license.
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# ("07p-interruptible-krisp.py", PROMPT_SIMPLE_MATH, None),
# Needs GPU resources.
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
]
TESTS_12 = [
("12-describe-video.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION, BOT_SPEAKS_FIRST),
# ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
]
TESTS_14 = [
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14w-function-calling-mistral.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
("14b-function-calling-anthropic-video.py", PROMPT_WEATHER, EVAL_WEATHER),
("14d-function-calling-video.py", PROMPT_WEATHER, EVAL_WEATHER),
("14e-function-calling-google.py", PROMPT_WEATHER, EVAL_WEATHER),
("14f-function-calling-groq.py", PROMPT_WEATHER, EVAL_WEATHER),
("14g-function-calling-grok.py", PROMPT_WEATHER, EVAL_WEATHER),
("14h-function-calling-azure.py", PROMPT_WEATHER, EVAL_WEATHER),
("14i-function-calling-fireworks.py", PROMPT_WEATHER, EVAL_WEATHER),
("14j-function-calling-nim.py", PROMPT_WEATHER, EVAL_WEATHER),
("14m-function-calling-openrouter.py", PROMPT_WEATHER, EVAL_WEATHER),
("14n-function-calling-perplexity.py", PROMPT_WEATHER, EVAL_WEATHER),
("14p-function-calling-gemini-vertex-ai.py", PROMPT_WEATHER, EVAL_WEATHER),
("14q-function-calling-qwen.py", PROMPT_WEATHER, EVAL_WEATHER),
("14r-function-calling-aws.py", PROMPT_WEATHER, EVAL_WEATHER),
("14v-function-calling-openai.py", PROMPT_WEATHER, EVAL_WEATHER),
# Currently not working.
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
]
TESTS_15 = [
("15a-switch-languages.py", PROMPT_SWITCH_LANGUAGE, EVAL_SWITCH_LANGUAGE, BOT_SPEAKS_FIRST),
# ("14c-function-calling-together.py", PROMPT_WEATHER, EVAL_WEATHER),
# ("14k-function-calling-cerebras.py", PROMPT_WEATHER, EVAL_WEATHER),
# ("14l-function-calling-deepseek.py", PROMPT_WEATHER, EVAL_WEATHER),
# ("14o-function-calling-gemini-openai-format.py", PROMPT_WEATHER, EVAL_WEATHER),
]
TESTS_19 = [
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER, BOT_SPEAKS_FIRST),
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
]
TESTS_21 = [
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("21a-tavus-video-service.py", PROMPT_SIMPLE_MATH, None),
]
TESTS_26 = [
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"26a-gemini-multimodal-live-transcription.py",
PROMPT_SIMPLE_MATH,
EVAL_SIMPLE_MATH,
BOT_SPEAKS_FIRST,
),
(
"26b-gemini-multimodal-live-function-calling.py",
PROMPT_WEATHER,
EVAL_WEATHER,
BOT_SPEAKS_FIRST,
),
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
(
"26e-gemini-multimodal-google-search.py",
PROMPT_ONLINE_SEARCH,
EVAL_ONLINE_SEARCH,
BOT_SPEAKS_FIRST,
),
("26-gemini-multimodal-live.py", PROMPT_SIMPLE_MATH, None),
("26a-gemini-multimodal-live-transcription.py", PROMPT_SIMPLE_MATH, None),
("26b-gemini-multimodal-live-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
("26c-gemini-multimodal-live-video.py", PROMPT_SIMPLE_MATH, None),
("26e-gemini-multimodal-google-search.py", PROMPT_ONLINE_SEARCH, EVAL_ONLINE_SEARCH),
# Currently not working.
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# ("26d-gemini-multimodal-live-text.py", PROMPT_SIMPLE_MATH, None),
]
TESTS_27 = [
("27-simli-layer.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("27-simli-layer.py", PROMPT_SIMPLE_MATH, None),
]
TESTS_40 = [
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("40-aws-nova-sonic.py", PROMPT_SIMPLE_MATH, None),
]
TESTS_43 = [
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
]
TESTS_44 = [
("44-voicemail-detection.py", PROMPT_VOICEMAIL, EVAL_VOICEMAIL, USER_SPEAKS_FIRST),
("44-voicemail-detection.py", PROMPT_CONVERSATION, EVAL_CONVERSATION, USER_SPEAKS_FIRST),
("43a-heygen-video-service.py", PROMPT_SIMPLE_MATH, None),
]
TESTS = [
*TESTS_07,
*TESTS_12,
*TESTS_14,
*TESTS_15,
*TESTS_19,
*TESTS_21,
*TESTS_26,
*TESTS_27,
*TESTS_40,
*TESTS_43,
*TESTS_44,
]
@@ -230,11 +162,8 @@ async def main(args: argparse.Namespace):
log_level=log_level,
)
# Parse test config: (test, prompt, eval, user_speaks_first)
for test_config in TESTS:
test, prompt, eval, user_speaks_first = test_config
await runner.run_eval(test, prompt, eval, user_speaks_first)
for test, prompt, eval in TESTS:
await runner.run_eval(test, prompt, eval)
runner.print_results()

View File

@@ -12,20 +12,3 @@ from loguru import logger
__version__ = version("pipecat-ai")
logger.info(f"ᓚᘏᗢ Pipecat {__version__} (Python {sys.version}) ᓚᘏᗢ")
# We replace `asyncio.wait_for()` with `wait_for2.wait_for()` for Python < 3.12.
#
# In Python 3.12, `asyncio.wait_for()` is implemented in terms of
# `asyncio.timeout()` which fixed a bunch of issues. However, this was never
# backported (because of the lack of `asyncio.timeout()`) and there are still many
# remaining issues, especially in Python 3.10, in `asyncio.wait_for()`.
#
# See https://github.com/python/cpython/pull/98518
import asyncio
if sys.version_info < (3, 12):
import wait_for2
# Replace asyncio.wait_for.
asyncio.wait_for = wait_for2.wait_for

View File

@@ -28,7 +28,7 @@ SPEAKING_THRESHOLD = 20
def create_default_resampler(**kwargs) -> BaseAudioResampler:
"""Create a default audio resampler instance.
.. deprecated:: 0.0.74
. deprecated:: 0.0.74
This function is deprecated and will be removed in a future version.
Use `create_stream_resampler` for real-time processing scenarios or
`create_file_resampler` for batch processing of complete audio files.

View File

@@ -1,707 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Voicemail detection module for Pipecat.
This module provides voicemail detection capabilities using parallel pipeline
processing to classify incoming calls as either voicemail messages or live
conversations. It's specifically designed for outbound calling scenarios where
a bot needs to determine if a human answered or if the call went to voicemail.
Note:
The voicemail module is optimized for text LLMs only.
"""
import asyncio
from typing import List, Optional
from loguru import logger
from pipecat.frames.frames import (
BotInterruptionFrame,
EndFrame,
Frame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMTextFrame,
StopFrame,
SystemFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
TTSTextFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
from pipecat.services.llm_service import LLMService
from pipecat.sync.base_notifier import BaseNotifier
from pipecat.sync.event_notifier import EventNotifier
class NotifierGate(FrameProcessor):
    """Frame gate that starts open and closes once its notifier fires.

    While the gate is open every frame is forwarded unchanged. After the
    notifier has signalled, only ``SystemFrame``, ``EndFrame`` and
    ``StopFrame`` instances are forwarded and every other frame is dropped.
    Nothing in this class reopens the gate. Subclasses can override
    ``process_frame`` to choose a different set of frames that pass while the
    gate is closed.
    """

    def __init__(self, notifier: BaseNotifier, task_name: str = "gate"):
        """Create the gate in its open state.

        Args:
            notifier: Notifier whose signal closes the gate.
            task_name: Name for the notification waiting task (for debugging).
        """
        super().__init__()
        self._gate_task: Optional[asyncio.Task] = None
        self._gate_opened = True
        self._task_name = task_name
        self._notifier = notifier

    async def setup(self, setup: FrameProcessorSetup):
        """Set up the processor and start waiting for the notifier.

        Args:
            setup: Configuration object containing setup parameters.
        """
        await super().setup(setup)
        waiter = self._wait_for_notification()
        self._gate_task = self.create_task(waiter)

    async def cleanup(self):
        """Clean up the processor and cancel the waiting task, if any."""
        await super().cleanup()
        pending = self._gate_task
        if not pending:
            return
        await self.cancel_task(pending)
        self._gate_task = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Forward the frame if the gate is open or the frame type is exempt.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)

        # These frame types keep flowing even after the gate has closed.
        exempt = isinstance(frame, (SystemFrame, EndFrame, StopFrame))
        if self._gate_opened or exempt:
            await self.push_frame(frame, direction)

    async def _wait_for_notification(self):
        """Block until the notifier signals, then mark the gate as closed."""
        await self._notifier.wait()
        self._gate_opened = False
class ClassifierGate(NotifierGate):
    """Gate in front of the classifier branch of the voicemail detector.

    The gate starts open so the classifier can see the initial frames, and it
    closes when the gate notifier signals that a classification decision
    (CONVERSATION or VOICEMAIL) has been made. This keeps the classifier from
    running, and from issuing further LLM calls, after a decision exists.

    Once closed:

    - ``UserStartedSpeakingFrame`` / ``UserStoppedSpeakingFrame`` are forwarded
      only while no conversation has been detected (the voicemail case, where
      they drive the voicemail timing). They are dropped after a conversation
      is detected.
    - Other ``SystemFrame``, ``EndFrame`` and ``StopFrame`` instances are
      always forwarded.
    - Everything else is dropped.
    """

    def __init__(self, gate_notifier: BaseNotifier, conversation_notifier: BaseNotifier):
        """Create the classifier gate.

        Args:
            gate_notifier: Notifier that signals when a classification decision
                has been made and the gate should close.
            conversation_notifier: Notifier that signals when conversation is
                detected.
        """
        super().__init__(gate_notifier, task_name="classifier_gate")
        self._conversation_task: Optional[asyncio.Task] = None
        self._conversation_detected = False
        self._conversation_notifier = conversation_notifier

    async def setup(self, setup: FrameProcessorSetup):
        """Set up the processor and start waiting for conversation detection.

        Args:
            setup: Configuration object containing setup parameters.
        """
        await super().setup(setup)
        waiter = self._wait_for_conversation()
        self._conversation_task = self.create_task(waiter)

    async def cleanup(self):
        """Clean up the processor and cancel the conversation waiting task."""
        await super().cleanup()
        pending = self._conversation_task
        if not pending:
            return
        await self.cancel_task(pending)
        self._conversation_task = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Decide whether the frame may pass and forward it if so.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        # Call the FrameProcessor implementation directly so that the
        # NotifierGate filtering is not applied on top of this one.
        await FrameProcessor.process_frame(self, frame, direction)

        if self._gate_opened:
            # Open gate: everything passes.
            allowed = True
        elif isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame)):
            # Speaking frames are checked before the generic system-frame case.
            # They only pass if conversation was NOT detected (i.e. voicemail
            # case). This prevents the UserContextAggregator from issuing a
            # warning about no aggregation to push.
            allowed = not self._conversation_detected
        else:
            # Remaining system/end/stop frames always pass; the rest is dropped.
            allowed = isinstance(frame, (SystemFrame, EndFrame, StopFrame))

        if allowed:
            await self.push_frame(frame, direction)

    async def _wait_for_conversation(self):
        """Block until conversation is signalled, then record the detection."""
        await self._conversation_notifier.wait()
        self._conversation_detected = True
class ConversationGate(NotifierGate):
    """Gate that stops the conversation branch once voicemail is detected.

    All behaviour comes from NotifierGate: the gate starts open so normal
    conversation frames flow, and it closes when the voicemail notifier
    signals. After that only the frame types NotifierGate lets through while
    closed (``SystemFrame``, ``EndFrame``, ``StopFrame``) continue, which keeps
    the main conversation LLM from receiving further input.
    """

    def __init__(self, voicemail_notifier: BaseNotifier):
        """Create the conversation gate.

        Args:
            voicemail_notifier: Notifier that signals when voicemail has been
                detected and the conversation should be blocked.
        """
        super().__init__(notifier=voicemail_notifier, task_name="conversation_gate")
class ClassificationProcessor(FrameProcessor):
    """Processor that handles LLM classification responses and triggers events.

    This processor aggregates LLM text tokens into complete responses and analyzes
    them to determine if the call reached a voicemail system or a live person.
    It uses the LLM response frame delimiters (LLMFullResponseStartFrame and
    LLMFullResponseEndFrame) to ensure complete token aggregation regardless
    of how the LLM tokenizes the response words.

    The processor expects responses containing either "CONVERSATION" (indicating
    a human answered) or "VOICEMAIL" (indicating an automated system). Once a
    decision is made, it triggers the appropriate notifications and event handlers.

    For voicemail detection, a countdown of `voicemail_response_delay` seconds
    starts as soon as voicemail is detected. The countdown is paused while the
    user is speaking and restarted when the user stops speaking; when it
    expires the "on_voicemail_detected" event handler is called once.
    """

    def __init__(
        self,
        *,
        gate_notifier: BaseNotifier,
        conversation_notifier: BaseNotifier,
        voicemail_notifier: BaseNotifier,
        voicemail_response_delay: float,
    ):
        """Initialize the classification processor.

        Args:
            gate_notifier: Notifier to signal the ClassifierGate about classification
                decisions so it can close and stop processing.
            conversation_notifier: Notifier to signal the TTSGate to release
                all gated TTS frames for normal conversation flow.
            voicemail_notifier: Notifier to signal the TTSGate to clear
                gated TTS frames since voicemail was detected.
            voicemail_response_delay: Delay in seconds after user stops speaking
                before triggering the voicemail event handler. This ensures the voicemail
                greeting or user message is complete before responding.
        """
        super().__init__()
        self._gate_notifier = gate_notifier
        self._conversation_notifier = conversation_notifier
        self._voicemail_notifier = voicemail_notifier
        self._voicemail_response_delay = voicemail_response_delay
        # Register the voicemail detected event
        self._register_event_handler("on_voicemail_detected")
        # Aggregation state for collecting complete LLM responses
        self._processing_response = False
        self._response_buffer = ""
        self._decision_made = False
        # Voicemail timing state. The event acts as a "hold" flag for the
        # countdown in _delayed_voicemail_handler(): while it is set the
        # countdown cannot expire, and clearing it lets the countdown run. It
        # starts set so nothing fires before voicemail has been detected.
        self._voicemail_detected = False
        self._voicemail_task: Optional[asyncio.Task] = None
        self._voicemail_event = asyncio.Event()
        self._voicemail_event.set()

    async def setup(self, setup: FrameProcessorSetup):
        """Set up the processor and start the voicemail countdown task.

        Args:
            setup: Configuration object containing setup parameters.
        """
        await super().setup(setup)
        self._voicemail_task = self.create_task(self._delayed_voicemail_handler())

    async def cleanup(self):
        """Clean up the processor resources and cancel the countdown task."""
        await super().cleanup()
        if self._voicemail_task:
            await self.cancel_task(self._voicemail_task)
            self._voicemail_task = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames and handle LLM classification responses.

        This method implements a state machine for aggregating LLM responses:

        1. LLMFullResponseStartFrame: Begin collecting tokens
        2. LLMTextFrame: Accumulate text tokens into buffer
        3. LLMFullResponseEndFrame: Process complete response and make decision
        4. UserStartedSpeakingFrame/UserStoppedSpeakingFrame: Manage voicemail timing

        Frames handled by one of these branches are consumed here and are not
        forwarded; any other frame is pushed on unchanged.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)
        if isinstance(frame, LLMFullResponseStartFrame):
            # Begin aggregating a new LLM response
            self._processing_response = True
            self._response_buffer = ""
        elif isinstance(frame, LLMFullResponseEndFrame):
            # Complete response received - make classification decision
            # (skipped once a decision has already been made)
            if self._processing_response and not self._decision_made:
                await self._process_classification(self._response_buffer.strip())
            self._processing_response = False
            self._response_buffer = ""
        elif isinstance(frame, LLMTextFrame) and self._processing_response:
            # Accumulate text tokens from the streaming LLM response. An
            # LLMTextFrame that arrives outside a response does not match this
            # branch and falls through to the remaining checks.
            self._response_buffer += frame.text
        elif isinstance(frame, UserStartedSpeakingFrame):
            # User started speaking - set the event, which holds the voicemail
            # countdown while the user is talking
            if self._voicemail_detected:
                self._voicemail_event.set()
        elif isinstance(frame, UserStoppedSpeakingFrame):
            # User stopped speaking - clear the event, which lets the voicemail
            # countdown run again
            if self._voicemail_detected:
                self._voicemail_event.clear()
        else:
            # Pass every frame not handled above through.
            # Blocking LLM frames prevents interference with the downstream LLM
            await self.push_frame(frame, direction)

    async def _process_classification(self, full_response: str):
        """Process the complete LLM classification response and trigger actions.

        Analyzes the aggregated response text to determine if it contains
        "CONVERSATION" or "VOICEMAIL" and triggers the appropriate notifications
        and callbacks based on the classification result. The match is a
        case-insensitive substring check, and "CONVERSATION" is tested first,
        so it wins if both words are present. If neither is present no decision
        is recorded and a later response can still be classified.

        Args:
            full_response: The complete aggregated response text from the LLM.
        """
        if self._decision_made:
            return
        response = full_response.upper()
        logger.debug(f"{self}: Classifying response: '{full_response}'")
        if "CONVERSATION" in response:
            # Human answered - continue normal conversation flow
            self._decision_made = True
            logger.info(f"{self}: CONVERSATION detected")
            await self._gate_notifier.notify()  # Close the classifier gate
            await self._conversation_notifier.notify()  # Release buffered TTS frames
        elif "VOICEMAIL" in response:
            # Voicemail detected - trigger voicemail handling
            self._decision_made = True
            self._voicemail_detected = True
            logger.info(f"{self}: VOICEMAIL detected")
            await self._gate_notifier.notify()  # Close the classifier gate
            await self._voicemail_notifier.notify()  # Clear buffered TTS frames
            # Interrupt the current pipeline to stop any ongoing processing
            await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
            # Clear the voicemail event so the countdown in
            # _delayed_voicemail_handler() can start running
            self._voicemail_event.clear()
        else:
            # This can happen if the LLM is interrupted before completing the response
            logger.debug(f"{self}: No classification found: '{full_response}'")

    async def _delayed_voicemail_handler(self):
        """Execute the voicemail event handler after the configured delay.

        Runs as a background task for the lifetime of the processor. Each loop
        iteration waits on the voicemail event with a timeout of
        `voicemail_response_delay` seconds:

        - If the event is set, the wait returns immediately, the loop sleeps
          for 0.1 seconds and checks again. No handler is called in this state.
        - If the event stays cleared for the whole timeout, the
          "on_voicemail_detected" event handler is called and the loop ends.

        Because every iteration starts a fresh timeout, setting and then
        clearing the event restarts the countdown from zero.
        """
        while True:
            try:
                await asyncio.wait_for(
                    self._voicemail_event.wait(), timeout=self._voicemail_response_delay
                )
                # The event is set: pause briefly before polling it again.
                await asyncio.sleep(0.1)
            except asyncio.TimeoutError:
                # The event stayed cleared for the full delay: fire once and stop.
                await self._call_event_handler("on_voicemail_detected")
                break
class TTSGate(FrameProcessor):
    """Holds back TTS output until the voicemail classification is decided.

    While classification is in progress, TTS frames are collected in a buffer
    instead of being forwarded, so no audio reaches the caller before it is
    known whether a person or a voicemail system answered.

    The outcome of the classification decides what happens to the buffer:

    - CONVERSATION: gating stops and the held frames are forwarded in the
      order they arrived.
    - VOICEMAIL: gating stops and the held frames are discarded.

    Only ``TTSStartedFrame``, ``TTSStoppedFrame``, ``TTSTextFrame`` and
    ``TTSAudioRawFrame`` are gated. Every other frame is forwarded immediately.
    """

    def __init__(self, conversation_notifier: BaseNotifier, voicemail_notifier: BaseNotifier):
        """Create the gate with gating active and an empty buffer.

        Args:
            conversation_notifier: Notifier that signals when a conversation is
                detected and gated frames should be released for playback.
            voicemail_notifier: Notifier that signals when voicemail is detected
                and gated frames should be cleared (not played).
        """
        super().__init__()
        self._conversation_notifier = conversation_notifier
        self._voicemail_notifier = voicemail_notifier
        self._conversation_task: Optional[asyncio.Task] = None
        self._voicemail_task: Optional[asyncio.Task] = None
        self._gating_active = True
        self._frame_buffer: List[tuple[Frame, FrameDirection]] = []

    async def setup(self, setup: FrameProcessorSetup):
        """Set up the processor and start one waiting task per notifier.

        Args:
            setup: Configuration object containing setup parameters.
        """
        await super().setup(setup)
        self._conversation_task = self.create_task(self._wait_for_conversation())
        self._voicemail_task = self.create_task(self._wait_for_voicemail())

    async def cleanup(self):
        """Clean up the processor and cancel both waiting tasks."""
        await super().cleanup()
        for attr in ("_conversation_task", "_voicemail_task"):
            pending = getattr(self, attr)
            if pending:
                await self.cancel_task(pending)
                setattr(self, attr, None)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Buffer TTS frames while gating is active; forward everything else.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)

        gated_types = (TTSStartedFrame, TTSStoppedFrame, TTSTextFrame, TTSAudioRawFrame)
        if not self._gating_active or not isinstance(frame, gated_types):
            # Not a TTS frame, or the decision has already been made.
            await self.push_frame(frame, direction)
            return

        # Hold the frame, together with its direction, until a decision arrives.
        self._frame_buffer.append((frame, direction))

    async def _wait_for_conversation(self):
        """Wait for the conversation signal, then release the held frames.

        Gating is switched off first and the buffered frames are then pushed
        in their original order, after which the buffer is emptied.
        """
        await self._conversation_notifier.wait()
        self._gating_active = False
        for held in self._frame_buffer:
            await self.push_frame(*held)
        self._frame_buffer.clear()

    async def _wait_for_voicemail(self):
        """Wait for the voicemail signal, then drop the held frames.

        The held frames were produced for a live conversation, so they are
        discarded without being played. Voicemail-specific output is left to
        the developer's event handlers.
        """
        await self._voicemail_notifier.wait()
        self._gating_active = False
        self._frame_buffer.clear()
class VoicemailDetector(ParallelPipeline):
    """Parallel pipeline for detecting voicemail vs. live conversation in outbound calls.

    This detector uses a parallel pipeline architecture to perform real-time
    classification of outbound phone calls without interrupting the conversation
    flow. It determines whether a human answered the phone or if the call went
    to a voicemail system.

    Architecture:

    - Conversation branch: A single ConversationGate that lets frames flow
      normally and closes once voicemail is detected
    - Classification branch: Contains the ClassifierGate, the context
      aggregators, the LLM classifier and the decision logic

    The system uses a gate mechanism to control when classification runs and
    a gating system to prevent TTS output until classification is complete.
    Once a decision is made, the appropriate action is taken:

    - CONVERSATION: Continue normal bot dialogue
    - VOICEMAIL: Trigger developer event handler for custom voicemail handling

    Example::

        classification_llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
        detector = VoicemailDetector(llm=classification_llm)

        @detector.event_handler("on_voicemail_detected")
        async def handle_voicemail(processor):
            await processor.push_frame(TTSSpeakFrame("Please leave a message."))

        pipeline = Pipeline([
            transport.input(),
            stt,
            detector.detector(),  # Classification
            context_aggregator.user(),
            llm,
            tts,
            detector.gate(),  # TTS gating
            transport.output(),
            context_aggregator.assistant(),
        ])

        # For custom prompts, append the required response instruction:
        custom_prompt = "Your custom classification logic here. " + VoicemailDetector.CLASSIFIER_RESPONSE_INSTRUCTION

    Events:
        on_voicemail_detected: Triggered when voicemail is detected after the configured
            delay. The event handler receives one argument: the ClassificationProcessor
            instance which can be used to push frames.

    Constants:
        CLASSIFIER_RESPONSE_INSTRUCTION: The exact text that must be included in custom
            system prompts to ensure proper classification functionality.
        DEFAULT_SYSTEM_PROMPT: The classification prompt used when no custom
            prompt is given. It ends with CLASSIFIER_RESPONSE_INSTRUCTION.
    """

    CLASSIFIER_RESPONSE_INSTRUCTION = 'Respond with ONLY "CONVERSATION" if a person answered, or "VOICEMAIL" if it\'s voicemail/recording.'
    DEFAULT_SYSTEM_PROMPT = (
        """You are a voicemail detection classifier for an OUTBOUND calling system. A bot has called a phone number and you need to determine if a human answered or if the call went to voicemail based on the provided text.
HUMAN ANSWERED - LIVE CONVERSATION (respond "CONVERSATION"):
- Personal greetings: "Hello?", "Hi", "Yeah?", "John speaking"
- Interactive responses: "Who is this?", "What do you want?", "Can I help you?"
- Conversational tone expecting back-and-forth dialogue
- Questions directed at the caller: "Hello? Anyone there?"
- Informal responses: "Yep", "What's up?", "Speaking"
- Natural, spontaneous speech patterns
- Immediate acknowledgment of the call
VOICEMAIL SYSTEM (respond "VOICEMAIL"):
- Automated voicemail greetings: "Hi, you've reached [name], please leave a message"
- Phone carrier messages: "The number you have dialed is not in service", "Please leave a message", "All circuits are busy"
- Professional voicemail: "This is [name], I'm not available right now"
- Instructions about leaving messages: "leave a message", "leave your name and number"
- References to callback or messaging: "call me back", "I'll get back to you"
- Carrier system messages: "mailbox is full", "has not been set up"
- Business hours messages: "our office is currently closed"
"""
        + CLASSIFIER_RESPONSE_INSTRUCTION
    )

    def __init__(
        self,
        *,
        llm: LLMService,
        voicemail_response_delay: float = 2.0,
        custom_system_prompt: Optional[str] = None,
    ):
        """Initialize the voicemail detector with classification and buffering components.

        Args:
            llm: LLM service used for voicemail vs conversation classification.
                Should be fast and reliable for real-time classification.
            voicemail_response_delay: Delay in seconds after user stops speaking
                before triggering the voicemail event handler. This allows voicemail
                responses to be played back after a short delay to ensure the response
                occurs during the voicemail recording. Default is 2.0 seconds.
            custom_system_prompt: Optional custom system prompt for classification. If None,
                uses the default prompt optimized for outbound calling scenarios.
                Custom prompts should instruct the LLM to respond with exactly
                "CONVERSATION" or "VOICEMAIL" for proper detection functionality.
                A prompt missing either word only produces a warning; it is
                still used as given.
        """
        self._classifier_llm = llm
        self._prompt = (
            custom_system_prompt if custom_system_prompt is not None else self.DEFAULT_SYSTEM_PROMPT
        )
        self._voicemail_response_delay = voicemail_response_delay
        # Validate custom prompts to ensure they work with the detection logic
        if custom_system_prompt is not None:
            self._validate_prompt(custom_system_prompt)
        # Set up the LLM context with the classification prompt
        self._messages = [
            {
                "role": "system",
                "content": self._prompt,
            },
        ]
        # Create the LLM context and aggregators for conversation management
        self._context = OpenAILLMContext(self._messages)
        self._context_aggregator = llm.create_context_aggregator(self._context)
        # Create notification system for coordinating between components
        self._gate_notifier = EventNotifier()  # Signals classification completion
        self._conversation_notifier = EventNotifier()  # Signals conversation detected
        self._voicemail_notifier = EventNotifier()  # Signals voicemail detected
        # Create the processor components. The same three notifiers are shared
        # between the gates and the classification processor so one decision
        # reaches all of them.
        self._classifier_gate = ClassifierGate(self._gate_notifier, self._conversation_notifier)
        self._conversation_gate = ConversationGate(self._voicemail_notifier)
        self._classification_processor = ClassificationProcessor(
            gate_notifier=self._gate_notifier,
            conversation_notifier=self._conversation_notifier,
            voicemail_notifier=self._voicemail_notifier,
            voicemail_response_delay=voicemail_response_delay,
        )
        self._voicemail_gate = TTSGate(self._conversation_notifier, self._voicemail_notifier)
        # Initialize the parallel pipeline with conversation and classifier branches
        super().__init__(
            # Conversation branch: gate that blocks after voicemail detection
            [self._conversation_gate],
            # Classification branch: gate -> context -> LLM -> processor -> context
            [
                self._classifier_gate,
                self._context_aggregator.user(),
                self._classifier_llm,
                self._classification_processor,
                self._context_aggregator.assistant(),
            ],
        )
        # Register the voicemail detected event after super().__init__()
        self._register_event_handler("on_voicemail_detected")

    def _validate_prompt(self, prompt: str) -> None:
        """Validate custom prompt contains required response format instructions.

        Custom prompts must instruct the LLM to respond with exactly "CONVERSATION"
        or "VOICEMAIL" for the detection logic to work properly. This method
        checks for the presence of these keywords and warns if they're missing.
        The check is a case-sensitive substring test on the upper-case words,
        and the only effect of a failed check is a logged warning.

        Args:
            prompt: The custom system prompt to validate.
        """
        has_conversation = "CONVERSATION" in prompt
        has_voicemail = "VOICEMAIL" in prompt
        if not has_conversation or not has_voicemail:
            logger.warning(
                "Custom system prompt should instruct the LLM to respond with exactly "
                '"CONVERSATION" or "VOICEMAIL" for proper detection functionality. '
                f"Consider appending VoicemailDetector.CLASSIFIER_RESPONSE_INSTRUCTION to your prompt: "
                f'"{self.CLASSIFIER_RESPONSE_INSTRUCTION}"'
            )

    def detector(self) -> "VoicemailDetector":
        """Get the detector pipeline for placement after STT in the main pipeline.

        This should be placed after the STT service and before the context
        aggregator in your main pipeline to enable voicemail classification.

        Returns:
            The VoicemailDetector instance itself (which is a ParallelPipeline).
        """
        return self

    def gate(self) -> TTSGate:
        """Get the gate processor for placement after TTS in the main pipeline.

        This should be placed after the TTS service and before the transport
        output to enable TTS frame gating during classification.

        Returns:
            The TTSGate processor instance.
        """
        return self._voicemail_gate

    def add_event_handler(self, event_name: str, handler):
        """Add an event handler for voicemail detection events.

        Handlers for "on_voicemail_detected" are attached to the internal
        ClassificationProcessor, which is the object that calls that event.
        Handlers for any other event name are registered on this pipeline.

        Args:
            event_name: The name of the event to handle.
            handler: The function to call when the event occurs.
        """
        if event_name == "on_voicemail_detected":
            self._classification_processor.add_event_handler(event_name, handler)
        else:
            super().add_event_handler(event_name, handler)

View File

@@ -228,7 +228,7 @@ class OutputImageRawFrame(DataFrame, ImageRawFrame):
def __str__(self):
pts = format_pts(self.pts)
return f"{self.name}(pts: {pts}, destination: {self.transport_destination}, size: {self.size}, format: {self.format})"
return f"{self.name}(pts: {pts}, size: {self.size}, format: {self.format})"
@dataclass

View File

@@ -11,6 +11,7 @@ processors without modifying the pipeline structure. Observers can be used
for logging, debugging, analytics, and monitoring pipeline behavior.
"""
from abc import abstractmethod
from dataclasses import dataclass
from typing_extensions import TYPE_CHECKING
@@ -22,28 +23,6 @@ if TYPE_CHECKING:
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@dataclass
class FrameProcessed:
    """Event data for frame processing in the pipeline.

    Represents an event where a frame is being processed by a processor. This
    data structure is typically used by observers to track the flow of frames
    through the pipeline for logging, debugging, or analytics purposes.

    Parameters:
        processor: The processor processing the frame.
        frame: The frame being processed.
        direction: The direction of the frame (e.g., downstream or upstream).
        timestamp: The time of the event, based on the pipeline clock.
            NOTE(review): the previous text said "when the frame was pushed";
            presumably this is the time processing happens - confirm against
            the code that creates this event.
    """

    # The two quoted annotations are forward references to names that are only
    # imported under TYPE_CHECKING in this module.
    processor: "FrameProcessor"
    frame: Frame
    direction: "FrameDirection"
    timestamp: int
@dataclass
class FramePushed:
"""Event data for frame transfers between processors in the pipeline.
@@ -77,18 +56,7 @@ class BaseObserver(BaseObject):
performance analysis, and analytics collection.
"""
async def on_process_frame(self, data: FrameProcessed):
    """Handle the event when a frame is being processed by a processor.

    The implementation here does nothing. Subclasses can override this
    method to define specific behavior (e.g., logging, monitoring,
    debugging) when a frame is being processed by a processor.

    Args:
        data: The event data containing details about the frame processing.
    """
    pass
@abstractmethod
async def on_push_frame(self, data: FramePushed):
"""Handle the event when a frame is pushed from one processor to another.

View File

@@ -6,12 +6,31 @@
"""Base pipeline implementation for frame processing."""
from abc import abstractmethod
from typing import List
from pipecat.processors.frame_processor import FrameProcessor
class BasePipeline(FrameProcessor):
"""Base class for all pipeline implementations."""
"""Base class for all pipeline implementations.
def __init__(self, **kwargs):
Provides the foundation for pipeline processors that need to support
metrics collection from their contained processors.
"""
def __init__(self):
"""Initialize the base pipeline."""
super().__init__(**kwargs)
super().__init__()
@abstractmethod
def processors_with_metrics(self) -> List[FrameProcessor]:
"""Return processors that can generate metrics.
Implementing classes should collect and return all processors within
their pipeline that support metrics generation.
Returns:
List of frame processors that support metrics collection.
"""
pass

View File

@@ -11,15 +11,106 @@ sub-pipelines concurrently, with coordination for system frames and proper
handling of pipeline lifecycle events.
"""
import asyncio
from itertools import chain
from typing import Dict, List
from typing import Awaitable, Callable, Dict, List
from loguru import logger
from pipecat.frames.frames import EndFrame, Frame, StartFrame
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
StartFrame,
StartInterruptionFrame,
SystemFrame,
)
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
class ParallelPipelineSource(FrameProcessor):
    """Entry processor placed at the start of one parallel pipeline branch.

    Downstream frames are forwarded into the branch. Upstream frames leaving
    the branch are either handed to the parent's push function (system
    frames) or placed on the upstream queue (all other frames).
    """

    def __init__(
        self,
        upstream_queue: asyncio.Queue,
        push_frame_func: Callable[[Frame, FrameDirection], Awaitable[None]],
    ):
        """Create the source for one branch.

        Args:
            upstream_queue: Queue for collecting upstream frames from this branch.
            push_frame_func: Function to push frames to the parent parallel pipeline.
        """
        super().__init__()
        self._push_frame_func = push_frame_func
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Route the frame according to its direction and type.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow.
        """
        await super().process_frame(frame, direction)

        if direction == FrameDirection.DOWNSTREAM:
            # Into the branch: forward unchanged.
            await self.push_frame(frame, direction)
        elif direction == FrameDirection.UPSTREAM:
            # Out of the branch: system frames go straight to the parent,
            # everything else is queued.
            if isinstance(frame, SystemFrame):
                await self._push_frame_func(frame, direction)
            else:
                await self._up_queue.put(frame)
class ParallelPipelineSink(FrameProcessor):
    """Exit processor placed at the end of one parallel pipeline branch.

    Upstream frames are forwarded back into the branch. Downstream frames
    leaving the branch are either handed to the parent's push function
    (system frames) or placed on the downstream queue (all other frames).
    """

    def __init__(
        self,
        downstream_queue: asyncio.Queue,
        push_frame_func: Callable[[Frame, FrameDirection], Awaitable[None]],
    ):
        """Create the sink for one branch.

        Args:
            downstream_queue: Queue for collecting downstream frames from this branch.
            push_frame_func: Function to push frames to the parent parallel pipeline.
        """
        super().__init__()
        self._push_frame_func = push_frame_func
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Route the frame according to its direction and type.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow.
        """
        await super().process_frame(frame, direction)

        if direction == FrameDirection.UPSTREAM:
            # Back into the branch: forward unchanged.
            await self.push_frame(frame, direction)
        elif direction == FrameDirection.DOWNSTREAM:
            # Out of the branch: system frames go straight to the parent,
            # everything else is queued.
            if isinstance(frame, SystemFrame):
                await self._push_frame_func(frame, direction)
            else:
                await self._down_queue.put(frame)
class ParallelPipeline(BasePipeline):
@@ -41,69 +132,28 @@ class ParallelPipeline(BasePipeline):
Exception: If no processor lists are provided.
TypeError: If any argument is not a list of processors.
"""
# We don't set it to direct mode because we use frame pausing and that
# requires queues.
super().__init__()
if len(args) == 0:
raise Exception(f"ParallelPipeline needs at least one argument")
self._args = args
self._sources = []
self._sinks = []
self._pipelines = []
self._seen_ids = set()
self._frame_counter: Dict[int, int] = {}
self._endframe_counter: Dict[int, int] = {}
self._start_frame_counter: Dict[int, int] = {}
self._started = False
logger.debug(f"Creating {self} pipelines")
for processors in args:
if not isinstance(processors, list):
raise TypeError(f"ParallelPipeline argument {processors} is not a list")
num_pipelines = len(self._pipelines)
# We add a source before the pipeline and a sink after so we control
# the frames that are pushed upstream and downstream.
source = PipelineSource(
self._parallel_push_frame, name=f"{self}::Source{num_pipelines}"
)
sink = PipelineSink(self._pipeline_sink_push_frame, name=f"{self}::Sink{num_pipelines}")
# Create pipeline
pipeline = Pipeline(processors, source=source, sink=sink)
self._pipelines.append(pipeline)
logger.debug(f"Finished creating {self} pipelines")
self._up_task = None
self._down_task = None
#
# Frame processor
# BasePipeline
#
@property
def processors(self):
    """Sub-processors contained in this parallel pipeline.

    Returns:
        The list of branch pipelines stored in ``self._pipelines``.
    """
    return self._pipelines
@property
def entry_processors(self) -> List["FrameProcessor"]:
    """Entry processors of this parallel pipeline.

    Each branch pipeline is itself a processor and acts as an entry point,
    so the same list that backs ``processors`` is returned.

    Returns:
        The list of branch pipelines stored in ``self._pipelines``.
    """
    return self._pipelines
def processors_with_metrics(self) -> List[FrameProcessor]:
"""Collect processors that can generate metrics from all parallel branches.
@@ -112,6 +162,10 @@ class ParallelPipeline(BasePipeline):
"""
return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))
#
# Frame processor
#
async def setup(self, setup: FrameProcessorSetup):
"""Set up the parallel pipeline and all its branches.
@@ -122,14 +176,39 @@ class ParallelPipeline(BasePipeline):
TypeError: If any processor list argument is not actually a list.
"""
await super().setup(setup)
for p in self._pipelines:
await p.setup(setup)
self._up_queue = WatchdogQueue(setup.task_manager)
self._down_queue = WatchdogQueue(setup.task_manager)
logger.debug(f"Creating {self} pipelines")
for processors in self._args:
if not isinstance(processors, list):
raise TypeError(f"ParallelPipeline argument {processors} is not a list")
# We will add a source before the pipeline and a sink after.
source = ParallelPipelineSource(self._up_queue, self._parallel_push_frame)
sink = ParallelPipelineSink(self._down_queue, self._pipeline_sink_push_frame)
self._sources.append(source)
self._sinks.append(sink)
# Create pipeline
pipeline = Pipeline(processors)
source.link(pipeline)
pipeline.link(sink)
self._pipelines.append(pipeline)
logger.debug(f"Finished creating {self} pipelines")
await asyncio.gather(*[s.setup(setup) for s in self._sources])
await asyncio.gather(*[p.setup(setup) for p in self._pipelines])
await asyncio.gather(*[s.setup(setup) for s in self._sinks])
async def cleanup(self):
"""Clean up the parallel pipeline and all its branches."""
await super().cleanup()
for p in self._pipelines:
await p.cleanup()
await asyncio.gather(*[s.cleanup() for s in self._sources])
await asyncio.gather(*[p.cleanup() for p in self._pipelines])
await asyncio.gather(*[s.cleanup() for s in self._sinks])
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process frames through all parallel branches with lifecycle coordination.
@@ -140,15 +219,79 @@ class ParallelPipeline(BasePipeline):
"""
await super().process_frame(frame, direction)
# Parallel pipeline synchronized frames.
if isinstance(frame, (StartFrame, EndFrame)):
self._frame_counter[frame.id] = len(self._pipelines)
await self.pause_processing_system_frames()
await self.pause_processing_frames()
if isinstance(frame, StartFrame):
self._start_frame_counter[frame.id] = len(self._pipelines)
elif isinstance(frame, EndFrame):
self._endframe_counter[frame.id] = len(self._pipelines)
elif isinstance(frame, CancelFrame):
await self._cancel()
# Process frames in each of the sub-pipelines.
for p in self._pipelines:
await p.queue_frame(frame, direction)
if direction == FrameDirection.UPSTREAM:
# If we get an upstream frame we process it in each sink.
await asyncio.gather(*[s.queue_frame(frame, direction) for s in self._sinks])
elif direction == FrameDirection.DOWNSTREAM:
# If we get a downstream frame we process it in each source.
await asyncio.gather(*[s.queue_frame(frame, direction) for s in self._sources])
# Handle interruptions after everything has been cancelled.
if isinstance(frame, StartInterruptionFrame):
await self._handle_interruption()
# Wait for tasks to finish.
elif isinstance(frame, EndFrame):
await self._stop()
async def _start(self, frame: StartFrame):
    """Launch the queue-processing tasks of this parallel pipeline.

    Args:
        frame: The StartFrame that triggered the start. It is not used by
            this method.
    """
    await self._create_tasks()
async def _stop(self):
    """Shut down the upstream and downstream queue-processing tasks."""
    if up_task := self._up_task:
        # No EndFrame ever reaches the up task, so it has to be cancelled.
        await self.cancel_task(up_task)
        self._up_task = None

    if down_task := self._down_task:
        # The down task finishes by itself once it sees the last EndFrame
        # sent by the internal pipelines, so we just wait for it.
        await down_task
        self._down_task = None
async def _cancel(self):
    """Cancel both queue-processing tasks, cancelling each queue first."""
    if up_task := self._up_task:
        self._up_queue.cancel()
        await self.cancel_task(up_task)
        self._up_task = None

    if down_task := self._down_task:
        self._down_queue.cancel()
        await self.cancel_task(down_task)
        self._down_task = None
async def _create_tasks(self):
    """Start the upstream and downstream tasks unless they already exist."""
    # `or` short-circuits, so a new coroutine/task is only created when the
    # corresponding attribute is currently empty.
    self._up_task = self._up_task or self.create_task(self._process_up_queue())
    self._down_task = self._down_task or self.create_task(self._process_down_queue())
async def _drain_queue(self, queue: asyncio.Queue):
    """Discard every item currently waiting in ``queue``.

    Args:
        queue: The queue to empty.
    """
    try:
        while True:
            if queue.empty():
                break
            queue.get_nowait()
    except asyncio.QueueEmpty:
        logger.debug(f"Draining {self} queue already empty")
async def _drain_queues(self):
    """Empty the upstream queue and then the downstream queue."""
    for pending in (self._up_queue, self._down_queue):
        await self._drain_queue(pending)
async def _handle_interruption(self):
    """React to an interruption by resetting the queue-processing state.

    The steps run strictly in order: cancel the running tasks, throw away
    whatever is still queued, then start fresh tasks.
    """
    await self._cancel()
    await self._drain_queues()
    await self._create_tasks()
async def _parallel_push_frame(self, frame: Frame, direction: FrameDirection):
"""Push frames while avoiding duplicates using frame ID tracking."""
@@ -157,18 +300,52 @@ class ParallelPipeline(BasePipeline):
await self.push_frame(frame, direction)
async def _pipeline_sink_push_frame(self, frame: Frame, direction: FrameDirection):
# Parallel pipeline synchronized frames.
if isinstance(frame, (StartFrame, EndFrame)):
# Decrement counter.
frame_counter = self._frame_counter.get(frame.id, 0)
if frame_counter > 0:
self._frame_counter[frame.id] -= 1
frame_counter = self._frame_counter[frame.id]
if isinstance(frame, StartFrame):
# Decrement counter and check if all pipelines have processed the StartFrame
start_frame_counter = self._start_frame_counter.get(frame.id, 0)
if start_frame_counter > 0:
self._start_frame_counter[frame.id] -= 1
start_frame_counter = self._start_frame_counter[frame.id]
# Only push the frame when all pipelines have processed it.
if frame_counter == 0:
# Only push the StartFrame when all pipelines have processed it
if start_frame_counter == 0:
self._started = True
await self._start(frame)
await self._parallel_push_frame(frame, direction)
await self.resume_processing_system_frames()
await self.resume_processing_frames()
else:
await self._parallel_push_frame(frame, direction)
if self._started:
await self._parallel_push_frame(frame, direction)
else:
await self._down_queue.put(frame)
async def _process_up_queue(self):
    """Forward queued upstream frames to the parent, one at a time.

    Runs forever; the loop has no exit condition of its own.
    """
    while True:
        upstream_frame = await self._up_queue.get()
        await self._parallel_push_frame(upstream_frame, FrameDirection.UPSTREAM)
        self._up_queue.task_done()
async def _process_down_queue(self):
    """Forward queued downstream frames, coordinating EndFrames.

    A frame whose id has a positive entry in ``self._endframe_counter`` is
    only pushed downstream once that entry has been decremented to zero.
    Frames without a counter entry are pushed immediately. The loop ends
    after an EndFrame has been pushed.
    """
    while True:
        frame = await self._down_queue.get()

        remaining = self._endframe_counter.get(frame.id, 0)

        # A positive counter means this frame is still awaited from other
        # branches: record that one more copy has arrived.
        if remaining > 0:
            remaining -= 1
            self._endframe_counter[frame.id] = remaining

        # No counter, or the counter just reached zero: release the frame.
        released = remaining == 0
        if released:
            await self._parallel_push_frame(frame, FrameDirection.DOWNSTREAM)

        self._down_queue.task_done()

        if released and isinstance(frame, EndFrame):
            break

View File

@@ -11,7 +11,7 @@ in sequence and manages frame flow between them, along with helper classes
for pipeline source and sink operations.
"""
from typing import Callable, Coroutine, List, Optional
from typing import Callable, Coroutine, List
from pipecat.frames.frames import Frame
from pipecat.pipeline.base_pipeline import BasePipeline
@@ -26,14 +26,13 @@ class PipelineSource(FrameProcessor):
provided upstream handler function.
"""
def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine], **kwargs):
def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
"""Initialize the pipeline source.
Args:
upstream_push_frame: Coroutine function to handle upstream frames.
**kwargs: Additional arguments passed to parent class.
"""
super().__init__(enable_direct_mode=True, **kwargs)
super().__init__()
self._upstream_push_frame = upstream_push_frame
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -60,16 +59,13 @@ class PipelineSink(FrameProcessor):
provided downstream handler function.
"""
def __init__(
self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine], **kwargs
):
def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
"""Initialize the pipeline sink.
Args:
downstream_push_frame: Coroutine function to handle downstream frames.
**kwargs: Additional arguments passed to parent class.
"""
super().__init__(enable_direct_mode=True, **kwargs)
super().__init__()
self._downstream_push_frame = downstream_push_frame
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -96,60 +92,26 @@ class Pipeline(BasePipeline):
provides metrics collection from contained processors.
"""
def __init__(
self,
processors: List[FrameProcessor],
*,
source: Optional[FrameProcessor] = None,
sink: Optional[FrameProcessor] = None,
):
def __init__(self, processors: List[FrameProcessor]):
"""Initialize the pipeline with a list of processors.
Args:
processors: List of frame processors to connect in sequence.
source: An optional pipeline source processor.
sink: An optional pipeline sink processor.
"""
super().__init__(enable_direct_mode=True)
super().__init__()
# Add a source and a sink queue so we can forward frames upstream and
# downstream outside of the pipeline.
self._source = source or PipelineSource(self.push_frame, name=f"{self}::Source")
self._sink = sink or PipelineSink(self.push_frame, name=f"{self}::Sink")
self._source = PipelineSource(self.push_frame)
self._sink = PipelineSink(self.push_frame)
self._processors: List[FrameProcessor] = [self._source] + processors + [self._sink]
self._link_processors()
#
# Frame processor
# BasePipeline
#
@property
def processors(self):
    """Sub-processors contained in this pipeline.

    Returns:
        The list stored in ``self._processors``.
    """
    return self._processors
@property
def entry_processors(self) -> List["FrameProcessor"]:
    """Entry processors of this pipeline.

    Returns:
        A new single-element list holding the pipeline's source processor.
    """
    return [self._source]
def processors_with_metrics(self):
"""Return processors that can generate metrics.
@@ -160,12 +122,17 @@ class Pipeline(BasePipeline):
List of frame processors that can generate metrics.
"""
services = []
for p in self.processors:
if p.can_generate_metrics():
for p in self._processors:
if isinstance(p, BasePipeline):
services.extend(p.processors_with_metrics())
elif p.can_generate_metrics():
services.append(p)
services.extend(p.processors_with_metrics())
return services
#
# Frame processor
#
async def setup(self, setup: FrameProcessorSetup):
"""Set up the pipeline and all contained processors.
@@ -208,5 +175,7 @@ class Pipeline(BasePipeline):
"""Link all processors in sequence and set their parent."""
prev = self._processors[0]
for curr in self._processors[1:]:
prev.set_parent(self)
prev.link(curr)
prev = curr
prev.set_parent(self)

View File

@@ -71,10 +71,7 @@ class PipelineRunner(BaseObject):
logger.debug(f"Runner {self} started running {task}")
self._tasks[task.name] = task
params = PipelineTaskParams(loop=self._loop)
try:
await task.run(params)
except asyncio.CancelledError:
await self._cancel()
await task.run(params)
del self._tasks[task.name]
# Cleanup base object.
@@ -98,10 +95,6 @@ class PipelineRunner(BaseObject):
async def cancel(self):
"""Cancel all running tasks immediately."""
logger.debug(f"Cancelling runner {self}")
await self._cancel()
async def _cancel(self):
"""Cancel all running tasks immediately."""
await asyncio.gather(*[t.cancel() for t in self._tasks.values()])
def _setup_sigint(self):

View File

@@ -22,6 +22,7 @@ from pipecat.frames.frames import ControlFrame, EndFrame, Frame, SystemFrame
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
@dataclass
@@ -48,7 +49,7 @@ class SyncParallelPipelineSource(FrameProcessor):
Args:
upstream_queue: Queue for collecting upstream frames from the pipeline.
"""
super().__init__(enable_direct_mode=True)
super().__init__()
self._up_queue = upstream_queue
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -80,7 +81,7 @@ class SyncParallelPipelineSink(FrameProcessor):
Args:
downstream_queue: Queue for collecting downstream frames from the pipeline.
"""
super().__init__(enable_direct_mode=True)
super().__init__()
self._down_queue = downstream_queue
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -127,65 +128,15 @@ class SyncParallelPipeline(BasePipeline):
if len(args) == 0:
raise Exception(f"SyncParallelPipeline needs at least one argument")
self._args = args
self._sinks = []
self._sources = []
self._pipelines = []
self._up_queue = asyncio.Queue()
self._down_queue = asyncio.Queue()
logger.debug(f"Creating {self} pipelines")
for processors in args:
if not isinstance(processors, list):
raise TypeError(f"SyncParallelPipeline argument {processors} is not a list")
# We add a source at the beginning of the pipeline and a sink at the end.
up_queue = asyncio.Queue()
down_queue = asyncio.Queue()
source = SyncParallelPipelineSource(up_queue)
sink = SyncParallelPipelineSink(down_queue)
# Keep track of sources and sinks. We also keep the output queue of
# the source and the sinks so we can use it later.
self._sources.append({"processor": source, "queue": down_queue})
self._sinks.append({"processor": sink, "queue": up_queue})
# Create pipeline
pipeline = Pipeline(processors, source=source, sink=sink)
self._pipelines.append(pipeline)
logger.debug(f"Finished creating {self} pipelines")
#
# Frame processor
# BasePipeline
#
@property
def processors(self):
    """Sub-processors contained in this synchronous parallel pipeline.

    Returns:
        The list of pipelines stored in ``self._pipelines``.
    """
    return self._pipelines
@property
def entry_processors(self) -> List["FrameProcessor"]:
    """Return the list of entry processors for this processor.

    Entry processors are the first processors in a compound processor
    (e.g. pipelines, parallel pipelines). For this class they are the
    source processors placed at the head of each parallel pipeline.

    Returns:
        A new list with the source processor of every parallel pipeline.
    """
    # `self._sources` holds bookkeeping dicts of the form
    # {"processor": <source>, "queue": <queue>}; callers are promised frame
    # processors, so unwrap the "processor" entry instead of returning the
    # dicts themselves.
    return [source["processor"] for source in self._sources]
def processors_with_metrics(self) -> List[FrameProcessor]:
"""Collect processors that can generate metrics from all parallel pipelines.
@@ -194,6 +145,10 @@ class SyncParallelPipeline(BasePipeline):
"""
return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))
#
# Frame processor
#
async def setup(self, setup: FrameProcessorSetup):
"""Set up the parallel pipeline and all contained processors.
@@ -201,12 +156,44 @@ class SyncParallelPipeline(BasePipeline):
setup: Configuration for frame processor setup.
"""
await super().setup(setup)
self._up_queue = WatchdogQueue(setup.task_manager)
self._down_queue = WatchdogQueue(setup.task_manager)
logger.debug(f"Creating {self} pipelines")
for processors in self._args:
if not isinstance(processors, list):
raise TypeError(f"SyncParallelPipeline argument {processors} is not a list")
# We add a source at the beginning of the pipeline and a sink at the end.
up_queue = asyncio.Queue()
down_queue = asyncio.Queue()
source = SyncParallelPipelineSource(up_queue)
sink = SyncParallelPipelineSink(down_queue)
# Create pipeline
pipeline = Pipeline(processors)
source.link(pipeline)
pipeline.link(sink)
self._pipelines.append(pipeline)
# Keep track of sources and sinks. We also keep the output queue of
# the source and the sinks so we can use it later.
self._sources.append({"processor": source, "queue": down_queue})
self._sinks.append({"processor": sink, "queue": up_queue})
logger.debug(f"Finished creating {self} pipelines")
await asyncio.gather(*[s["processor"].setup(setup) for s in self._sources])
await asyncio.gather(*[p.setup(setup) for p in self._pipelines])
await asyncio.gather(*[s["processor"].setup(setup) for s in self._sinks])
async def cleanup(self):
"""Clean up the parallel pipeline and all contained processors."""
await super().cleanup()
await asyncio.gather(*[s["processor"].cleanup() for s in self._sources])
await asyncio.gather(*[p.cleanup() for p in self._pipelines])
await asyncio.gather(*[s["processor"].cleanup() for s in self._sinks])
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process frames through all parallel pipelines with synchronization.

View File

@@ -32,24 +32,26 @@ from pipecat.frames.frames import (
Frame,
HeartbeatFrame,
InputAudioRawFrame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
MetricsFrame,
StartFrame,
StopFrame,
StopTaskFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.metrics.metrics import ProcessingMetricsData, TTFBMetricsData
from pipecat.observers.base_observer import BaseObserver
from pipecat.observers.turn_tracking_observer import TurnTrackingObserver
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.base_task import BasePipelineTask, PipelineTaskParams
from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
from pipecat.pipeline.task_observer import TaskObserver
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
from pipecat.utils.asyncio.task_manager import BaseTaskManager, TaskManager, TaskManagerParams
from pipecat.utils.asyncio.task_manager import (
WATCHDOG_TIMEOUT,
BaseTaskManager,
TaskManager,
TaskManagerParams,
)
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
from pipecat.utils.tracing.setup import is_tracing_available
from pipecat.utils.tracing.turn_trace_observer import TurnTraceObserver
@@ -99,6 +101,70 @@ class PipelineParams(BaseModel):
start_metadata: Dict[str, Any] = Field(default_factory=dict)
class PipelineTaskSource(FrameProcessor):
    """Processor linked in front of the pipeline owned by a pipeline task.

    Frames travelling downstream are pushed on into the pipeline. Frames
    travelling upstream (i.e. coming out of the pipeline) are collected in the
    queue supplied at construction time.
    """

    def __init__(self, up_queue: asyncio.Queue, **kwargs):
        """Create the task source.

        Args:
            up_queue: Queue that receives frames arriving in the upstream
                direction.
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        super().__init__(**kwargs)
        self._up_queue = up_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Dispatch a frame according to the direction it is travelling.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow.
        """
        await super().process_frame(frame, direction)

        if direction == FrameDirection.UPSTREAM:
            await self._up_queue.put(frame)
        elif direction == FrameDirection.DOWNSTREAM:
            await self.push_frame(frame, direction)
class PipelineTaskSink(FrameProcessor):
    """Processor linked behind the pipeline owned by a pipeline task.

    Every frame it processes is placed on the queue supplied at construction
    time, so the task can react to frames that made it through the pipeline
    (for example an EndFrame).
    """

    def __init__(self, down_queue: asyncio.Queue, **kwargs):
        """Create the task sink.

        Args:
            down_queue: Queue that receives every frame processed by the sink.
            **kwargs: Extra keyword arguments forwarded to the parent class.
        """
        super().__init__(**kwargs)
        self._down_queue = down_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Enqueue the frame after the base class has processed it.

        The frame is queued regardless of ``direction``.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow.
        """
        await super().process_frame(frame, direction)
        await self._down_queue.put(frame)
class PipelineTask(BasePipelineTask):
"""Manages the execution of a pipeline, handling frame processing and task lifecycle.
@@ -130,7 +196,7 @@ class PipelineTask(BasePipelineTask):
def __init__(
self,
pipeline: FrameProcessor,
pipeline: BasePipeline,
*,
params: Optional[PipelineParams] = None,
additional_span_attributes: Optional[dict] = None,
@@ -140,17 +206,16 @@ class PipelineTask(BasePipelineTask):
conversation_id: Optional[str] = None,
enable_tracing: bool = False,
enable_turn_tracking: bool = True,
enable_watchdog_logging: bool = False,
enable_watchdog_timers: bool = False,
idle_timeout_frames: Tuple[Type[Frame], ...] = (
BotSpeakingFrame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
),
idle_timeout_secs: Optional[float] = 300,
observers: Optional[List[BaseObserver]] = None,
task_manager: Optional[BaseTaskManager] = None,
watchdog_timeout_secs: float = WATCHDOG_TIMEOUT,
):
"""Initialize the PipelineTask.
@@ -166,6 +231,8 @@ class PipelineTask(BasePipelineTask):
conversation_id: Optional custom ID for the conversation.
enable_tracing: Whether to enable tracing.
enable_turn_tracking: Whether to enable turn tracking.
enable_watchdog_logging: Whether to print task processing times.
enable_watchdog_timers: Whether to enable task watchdog timers.
idle_timeout_frames: A tuple with the frames that should trigger an idle
timeout if not received within `idle_timeout_seconds`.
idle_timeout_secs: Timeout (in seconds) to consider pipeline idle or
@@ -173,8 +240,11 @@ class PipelineTask(BasePipelineTask):
automatically.
observers: List of observers for monitoring pipeline execution.
task_manager: Optional task manager for handling asyncio tasks.
watchdog_timeout_secs: Watchdog timer timeout (in seconds). A warning
will be logged if the watchdog timer is not reset before this timeout.
"""
super().__init__()
self._pipeline = pipeline
self._params = params or PipelineParams()
self._additional_span_attributes = additional_span_attributes or {}
self._cancel_on_idle_timeout = cancel_on_idle_timeout
@@ -183,8 +253,11 @@ class PipelineTask(BasePipelineTask):
self._conversation_id = conversation_id
self._enable_tracing = enable_tracing and is_tracing_available()
self._enable_turn_tracking = enable_turn_tracking
self._enable_watchdog_logging = enable_watchdog_logging
self._enable_watchdog_timers = enable_watchdog_timers
self._idle_timeout_frames = idle_timeout_frames
self._idle_timeout_secs = idle_timeout_secs
self._watchdog_timeout_secs = watchdog_timeout_secs
if self._params.observers:
import warnings
@@ -215,30 +288,40 @@ class PipelineTask(BasePipelineTask):
# PipelineTask and its frame processors.
self._task_manager = task_manager or TaskManager()
# This queue receives frames coming from the pipeline upstream.
self._up_queue = WatchdogQueue(self._task_manager)
self._process_up_task: Optional[asyncio.Task] = None
# This queue receives frames coming from the pipeline downstream.
self._down_queue = WatchdogQueue(self._task_manager)
self._process_down_task: Optional[asyncio.Task] = None
# This queue is the queue used to push frames to the pipeline.
self._push_queue = asyncio.Queue()
self._push_queue = WatchdogQueue(self._task_manager)
self._process_push_task: Optional[asyncio.Task] = None
# This is the heartbeat queue. When a heartbeat frame is received in the
# down queue we add it to the heartbeat queue for processing.
self._heartbeat_queue = asyncio.Queue()
self._heartbeat_queue = WatchdogQueue(self._task_manager)
self._heartbeat_push_task: Optional[asyncio.Task] = None
self._heartbeat_monitor_task: Optional[asyncio.Task] = None
# This is the idle queue. When frames are received downstream they are
# put in the queue. If no frame is received the pipeline is considered
# idle.
self._idle_queue = asyncio.Queue()
self._idle_queue = WatchdogQueue(self._task_manager)
self._idle_monitor_task: Optional[asyncio.Task] = None
# This event is used to indicate a finalize frame (e.g. EndFrame,
# StopFrame) has been received in the down queue.
self._pipeline_end_event = asyncio.Event()
# This is the final pipeline. It is composed of a source processor,
# followed by the user pipeline, and ending with a sink processor. The
# source allows us to receive and react to upstream frames, and the sink
# allows us to receive and react to downstream frames.
source = PipelineSource(self._source_push_frame, name=f"{self}::Source")
sink = PipelineSink(self._sink_push_frame, name=f"{self}::Sink")
self._pipeline = Pipeline([pipeline], source=source, sink=sink)
# This is a source processor that we connect to the provided
# pipeline. This source processor allows us to receive and react to
# upstream frames.
self._source = PipelineTaskSource(self._up_queue)
self._source.link(pipeline)
# This is a sink processor that we connect to the provided
# pipeline. This sink processor allows us to receive and react to
# downstream frames.
self._sink = PipelineTaskSink(self._down_queue)
pipeline.link(self._sink)
# The task observer acts as a proxy to the provided observers. This way,
# we only need to pass a single observer (using the StartFrame) which
@@ -363,43 +446,24 @@ class PipelineTask(BasePipelineTask):
# Create all main tasks and wait for the main push task. This is the
# task that pushes frames to the very beginning of our pipeline (our
# controlled source processor).
# controlled PipelineTaskSource processor).
push_task = await self._create_tasks()
await push_task
await self._task_manager.wait_for_task(push_task)
# We have already cleaned up the pipeline inside the task.
cleanup_pipeline = False
# Pipeline has finished nicely.
self._finished = True
except asyncio.CancelledError:
# Raise exception back to the pipeline runner so it can cancel this
# task properly.
raise
# We are awaiting on the push task and it might be cancelled
# (e.g. Ctrl-C). This means we will get a CancelledError here as
# well, because you get a CancelledError in every place you are
# awaiting a task.
pass
finally:
# We can reach this point for different reasons:
#
# 1. The task has finished properly (e.g. `EndFrame`).
# 2. By calling `PipelineTask.cancel()`.
# 3. By asyncio task cancellation.
#
# Case (1) will execute the code below without issues because
# `self._finished` is true.
#
# Case (2) will execute the code below without issues because
# `self._cancelled` is true.
#
# Case (3) will raise the exception above (because we are cancelling
# the asyncio task). This will be then captured by the
# `PipelineRunner` which will call `PipelineTask.cancel()` and
# therefore becoming case (2).
if self._finished or self._cancelled:
logger.debug(f"Pipeline task {self} has finished, cleaning up resources")
await self._cancel_tasks()
await self._cleanup(cleanup_pipeline)
if self._check_dangling_tasks:
self._print_dangling_tasks()
self._finished = True
await self._cancel_tasks()
await self._cleanup(cleanup_pipeline)
if self._check_dangling_tasks:
self._print_dangling_tasks()
self._finished = True
async def queue_frame(self, frame: Frame):
"""Queue a single frame to be pushed down the pipeline.
@@ -430,7 +494,7 @@ class PipelineTask(BasePipelineTask):
# Make sure everything is cleaned up downstream. This is sent
# out-of-band from the main streaming task which is what we want since
# we want to cancel right away.
await self._pipeline.queue_frame(CancelFrame())
await self._source.push_frame(CancelFrame())
# Wait for CancelFrame to make it through the pipeline.
await self._wait_for_pipeline_end()
# Only cancel the push task, we don't want to be able to process any
@@ -442,6 +506,12 @@ class PipelineTask(BasePipelineTask):
async def _create_tasks(self):
"""Create and start all pipeline processing tasks."""
self._process_up_task = self._task_manager.create_task(
self._process_up_queue(), f"{self}::_process_up_queue"
)
self._process_down_task = self._task_manager.create_task(
self._process_down_queue(), f"{self}::_process_down_queue"
)
self._process_push_task = self._task_manager.create_task(
self._process_push_queue(), f"{self}::_process_push_queue"
)
@@ -475,6 +545,14 @@ class PipelineTask(BasePipelineTask):
await self._task_manager.cancel_task(self._process_push_task)
self._process_push_task = None
if self._process_up_task:
await self._task_manager.cancel_task(self._process_up_task)
self._process_up_task = None
if self._process_down_task:
await self._task_manager.cancel_task(self._process_down_task)
self._process_down_task = None
await self._maybe_cancel_heartbeat_tasks()
await self._maybe_cancel_idle_task()
@@ -494,6 +572,7 @@ class PipelineTask(BasePipelineTask):
async def _maybe_cancel_idle_task(self):
"""Cancel idle monitoring task if it is running."""
if self._idle_timeout_secs and self._idle_monitor_task:
self._idle_queue.cancel()
await self._task_manager.cancel_task(self._idle_monitor_task)
self._idle_monitor_task = None
@@ -513,15 +592,23 @@ class PipelineTask(BasePipelineTask):
async def _setup(self, params: PipelineTaskParams):
"""Set up the pipeline task and all processors."""
mgr_params = TaskManagerParams(loop=params.loop)
mgr_params = TaskManagerParams(
loop=params.loop,
enable_watchdog_logging=self._enable_watchdog_logging,
enable_watchdog_timers=self._enable_watchdog_timers,
watchdog_timeout=self._watchdog_timeout_secs,
)
self._task_manager.setup(mgr_params)
setup = FrameProcessorSetup(
clock=self._clock,
task_manager=self._task_manager,
observer=self._observer,
watchdog_timers_enabled=self._enable_watchdog_timers,
)
await self._source.setup(setup)
await self._pipeline.setup(setup)
await self._sink.setup(setup)
async def _cleanup(self, cleanup_pipeline: bool):
"""Clean up the pipeline task and processors."""
@@ -533,8 +620,10 @@ class PipelineTask(BasePipelineTask):
self._turn_trace_observer.end_conversation_tracing()
# Cleanup pipeline processors.
await self._source.cleanup()
if cleanup_pipeline:
await self._pipeline.cleanup()
await self._sink.cleanup()
async def _process_push_queue(self):
"""Process frames from the push queue and send them through the pipeline.
@@ -558,16 +647,16 @@ class PipelineTask(BasePipelineTask):
interruption_strategies=self._params.interruption_strategies,
)
start_frame.metadata = self._params.start_metadata
await self._pipeline.queue_frame(start_frame)
await self._source.queue_frame(start_frame, FrameDirection.DOWNSTREAM)
if self._params.enable_metrics and self._params.send_initial_empty_metrics:
await self._pipeline.queue_frame(self._initial_metrics_frame())
await self._source.queue_frame(self._initial_metrics_frame(), FrameDirection.DOWNSTREAM)
running = True
cleanup_pipeline = True
while running:
frame = await self._push_queue.get()
await self._pipeline.queue_frame(frame)
await self._source.queue_frame(frame, FrameDirection.DOWNSTREAM)
if isinstance(frame, (CancelFrame, EndFrame, StopFrame)):
await self._wait_for_pipeline_end()
running = not isinstance(frame, (CancelFrame, EndFrame, StopFrame))
@@ -575,7 +664,7 @@ class PipelineTask(BasePipelineTask):
self._push_queue.task_done()
await self._cleanup(cleanup_pipeline)
async def _source_push_frame(self, frame: Frame, direction: FrameDirection):
async def _process_up_queue(self):
"""Process frames coming upstream from the pipeline.
This is the task that processes frames coming upstream from the
@@ -583,29 +672,33 @@ class PipelineTask(BasePipelineTask):
pipeline to be stopped (e.g. EndTaskFrame) in which case we would send
an EndFrame down the pipeline.
"""
if isinstance(frame, self._reached_upstream_types):
await self._call_event_handler("on_frame_reached_upstream", frame)
while True:
frame = await self._up_queue.get()
if isinstance(frame, EndTaskFrame):
# Tell the task we should end nicely.
await self.queue_frame(EndFrame())
elif isinstance(frame, CancelTaskFrame):
# Tell the task we should end right away.
await self.queue_frame(CancelFrame())
elif isinstance(frame, StopTaskFrame):
# Tell the task we should stop nicely.
await self.queue_frame(StopFrame())
elif isinstance(frame, ErrorFrame):
if frame.fatal:
logger.error(f"A fatal error occurred: {frame}")
# Cancel all tasks downstream.
if isinstance(frame, self._reached_upstream_types):
await self._call_event_handler("on_frame_reached_upstream", frame)
if isinstance(frame, EndTaskFrame):
# Tell the task we should end nicely.
await self.queue_frame(EndFrame())
elif isinstance(frame, CancelTaskFrame):
# Tell the task we should end right away.
await self.queue_frame(CancelFrame())
# Tell the task we should stop.
await self.queue_frame(StopTaskFrame())
else:
logger.warning(f"Something went wrong: {frame}")
elif isinstance(frame, StopTaskFrame):
# Tell the task we should stop nicely.
await self.queue_frame(StopFrame())
elif isinstance(frame, ErrorFrame):
if frame.fatal:
logger.error(f"A fatal error occurred: {frame}")
# Cancel all tasks downstream.
await self.queue_frame(CancelFrame())
# Tell the task we should stop.
await self.queue_frame(StopTaskFrame())
else:
logger.warning(f"Something went wrong: {frame}")
self._up_queue.task_done()
async def _sink_push_frame(self, frame: Frame, direction: FrameDirection):
async def _process_down_queue(self):
"""Process frames coming downstream from the pipeline.
This task processes frames coming downstream from the pipeline. For
@@ -613,30 +706,34 @@ class PipelineTask(BasePipelineTask):
processors have handled the EndFrame and therefore we can exit the task
cleanly.
"""
# Queue received frame to the idle queue so we can monitor idle
# pipelines.
await self._idle_queue.put(frame)
while True:
frame = await self._down_queue.get()
if isinstance(frame, self._reached_downstream_types):
await self._call_event_handler("on_frame_reached_downstream", frame)
# Queue received frame to the idle queue so we can monitor idle
# pipelines.
await self._idle_queue.put(frame)
if isinstance(frame, StartFrame):
await self._call_event_handler("on_pipeline_started", frame)
if isinstance(frame, self._reached_downstream_types):
await self._call_event_handler("on_frame_reached_downstream", frame)
# Start heartbeat tasks now that StartFrame has been processed
# by all processors in the pipeline
self._maybe_start_heartbeat_tasks()
elif isinstance(frame, EndFrame):
await self._call_event_handler("on_pipeline_ended", frame)
self._pipeline_end_event.set()
elif isinstance(frame, StopFrame):
await self._call_event_handler("on_pipeline_stopped", frame)
self._pipeline_end_event.set()
elif isinstance(frame, CancelFrame):
await self._call_event_handler("on_pipeline_cancelled", frame)
self._pipeline_end_event.set()
elif isinstance(frame, HeartbeatFrame):
await self._heartbeat_queue.put(frame)
if isinstance(frame, StartFrame):
await self._call_event_handler("on_pipeline_started", frame)
# Start heartbeat tasks now that StartFrame has been processed
# by all processors in the pipeline
self._maybe_start_heartbeat_tasks()
elif isinstance(frame, EndFrame):
await self._call_event_handler("on_pipeline_ended", frame)
self._pipeline_end_event.set()
elif isinstance(frame, StopFrame):
await self._call_event_handler("on_pipeline_stopped", frame)
self._pipeline_end_event.set()
elif isinstance(frame, CancelFrame):
await self._call_event_handler("on_pipeline_cancelled", frame)
self._pipeline_end_event.set()
elif isinstance(frame, HeartbeatFrame):
await self._heartbeat_queue.put(frame)
self._down_queue.task_done()
async def _heartbeat_push_handler(self):
"""Push heartbeat frames at regular intervals."""
@@ -644,7 +741,7 @@ class PipelineTask(BasePipelineTask):
# Don't use `queue_frame()` because if an EndFrame is queued the
# task will just stop, waiting for the pipeline to finish and not
# allowing more frames to be pushed.
await self._pipeline.queue_frame(HeartbeatFrame(timestamp=self._clock.get_time()))
await self._source.queue_frame(HeartbeatFrame(timestamp=self._clock.get_time()))
await asyncio.sleep(self._params.heartbeats_period_secs)
async def _heartbeat_monitor_handler(self):
@@ -719,10 +816,6 @@ class PipelineTask(BasePipelineTask):
Returns:
Whether the pipeline task should continue running.
"""
# If we are cancelling, just exit the task.
if self._cancelled:
return True
logger.warning("Idle timeout detected. Last 10 frames received:")
for i, frame in enumerate(last_frames, 1):
logger.warning(f"Frame {i}: {frame}")

View File

@@ -13,12 +13,13 @@ the main pipeline execution.
import asyncio
import inspect
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional
from attr import dataclass
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
from pipecat.observers.base_observer import BaseObserver, FramePushed
from pipecat.utils.asyncio.task_manager import BaseTaskManager
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
@dataclass
@@ -85,7 +86,7 @@ class TaskObserver(BaseObserver):
# If we already started, create a new proxy for the observer.
# Otherwise, it will be created in start().
if self._proxies:
if self._started():
proxy = self._create_proxy(observer)
self._proxies[observer] = proxy
@@ -96,7 +97,7 @@ class TaskObserver(BaseObserver):
observer: The observer to remove.
"""
# If the observer has a proxy, remove it.
if self._proxies and observer in self._proxies:
if observer in self._proxies:
proxy = self._proxies[observer]
# Remove the proxy so it doesn't get called anymore.
del self._proxies[observer]
@@ -119,25 +120,22 @@ class TaskObserver(BaseObserver):
for proxy in self._proxies.values():
await self._task_manager.cancel_task(proxy.task)
async def on_process_frame(self, data: FramePushed):
"""Queue frame data for all managed observers.
Args:
data: The frame push event data to distribute to observers.
"""
await self._send_to_proxy(data)
async def on_push_frame(self, data: FramePushed):
"""Queue frame data for all managed observers.
Args:
data: The frame push event data to distribute to observers.
"""
await self._send_to_proxy(data)
for proxy in self._proxies.values():
await proxy.queue.put(data)
def _started(self) -> bool:
"""Check if the task observer has been started."""
return self._proxies is not None
def _create_proxy(self, observer: BaseObserver) -> Proxy:
"""Create a proxy for a single observer."""
queue = asyncio.Queue()
queue = WatchdogQueue(self._task_manager)
task = self._task_manager.create_task(
self._proxy_task_handler(queue, observer),
f"TaskObserver::{observer}::_proxy_task_handler",
@@ -153,10 +151,6 @@ class TaskObserver(BaseObserver):
proxies[observer] = proxy
return proxies
async def _send_to_proxy(self, data: Any):
for proxy in self._proxies.values():
await proxy.queue.put(data)
async def _proxy_task_handler(self, queue: asyncio.Queue, observer: BaseObserver):
"""Handle frame processing for a single observer."""
on_push_frame_deprecated = False
@@ -175,15 +169,11 @@ class TaskObserver(BaseObserver):
while True:
data = await queue.get()
if isinstance(data, FramePushed):
if on_push_frame_deprecated:
await observer.on_push_frame(
data.src, data.dst, data.frame, data.direction, data.timestamp
)
else:
await observer.on_push_frame(data)
elif isinstance(data, FrameProcessed):
await observer.on_process_frame(data)
if on_push_frame_deprecated:
await observer.on_push_frame(
data.src, data.dst, data.frame, data.direction, data.timestamp
)
else:
await observer.on_push_frame(data)
queue.task_done()

View File

@@ -24,7 +24,7 @@ from pipecat.frames.frames import (
StartFrame,
TranscriptionFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.time import time_now_iso8601
@@ -64,11 +64,7 @@ class DTMFAggregator(FrameProcessor):
self._digit_event = asyncio.Event()
self._aggregation_task: Optional[asyncio.Task] = None
async def cleanup(self) -> None:
"""Clean up resources."""
await super().cleanup()
await self._stop_aggregation_task()
self._interruption_task: Optional[asyncio.Task] = None
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
"""Process incoming frames and handle DTMF aggregation.
@@ -86,6 +82,7 @@ class DTMFAggregator(FrameProcessor):
if self._aggregation:
await self._flush_aggregation()
await self._stop_aggregation_task()
await self._stop_interruption_task()
await self.push_frame(frame, direction)
elif isinstance(frame, InputDTMFFrame):
# Push the DTMF frame downstream first
@@ -105,7 +102,7 @@ class DTMFAggregator(FrameProcessor):
# For first digit, schedule interruption in separate task
if is_first_digit:
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
self._interruption_task = self.create_task(self._send_interruption_task())
# Check for immediate flush conditions
if frame.button == self._termination_digit:
@@ -114,6 +111,16 @@ class DTMFAggregator(FrameProcessor):
# Signal digit received for timeout handling
self._digit_event.set()
async def _send_interruption_task(self):
"""Send interruption frame safely in a separate task."""
await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
async def _stop_interruption_task(self) -> None:
"""Stops the interruption task."""
if self._interruption_task:
await self.cancel_task(self._interruption_task)
self._interruption_task = None
def _create_aggregation_task(self) -> None:
"""Creates the aggregation task if it hasn't been created yet."""
if not self._aggregation_task:
@@ -132,6 +139,7 @@ class DTMFAggregator(FrameProcessor):
await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
self._digit_event.clear()
except asyncio.TimeoutError:
self.reset_watchdog()
if self._aggregation:
await self._flush_aggregation()
@@ -149,3 +157,8 @@ class DTMFAggregator(FrameProcessor):
await self.push_frame(transcription_frame)
self._aggregation = ""
async def cleanup(self) -> None:
"""Clean up resources."""
await super().cleanup()
await self._stop_aggregation_task()

View File

@@ -670,7 +670,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
if self._vad_params
else self._params.turn_emulated_vad_timeout
)
await asyncio.wait_for(self._aggregation_event.wait(), timeout=timeout)
await asyncio.wait_for(self._aggregation_event.wait(), timeout)
await self._maybe_emulate_user_speaking()
except asyncio.TimeoutError:
if not self._user_speaking:
@@ -684,6 +684,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
)
self._emulating_vad = False
finally:
self.reset_watchdog()
self._aggregation_event.clear()
async def _maybe_emulate_user_speaking(self):
@@ -985,6 +986,10 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
def _context_updated_task_finished(self, task: asyncio.Task):
self._context_updated_tasks.discard(task)
# The task is finished so this should exit immediately. We still need
# to do this so the task gets removed; otherwise the task manager would
# report a dangling task.
asyncio.run_coroutine_threadsafe(self.wait_for_task(task), self.get_event_loop())
class LLMUserResponseAggregator(LLMUserContextAggregator):

View File

@@ -178,7 +178,6 @@ class AudioBufferProcessor(FrameProcessor):
Calls audio handlers with any remaining buffered audio before stopping.
"""
await self._call_on_audio_data_handler()
self._reset_recording()
self._recording = False
async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -231,7 +230,6 @@ class AudioBufferProcessor(FrameProcessor):
if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
await self._call_on_audio_data_handler()
self._reset_recording()
# Process turn recording with preprocessed data.
if self._enable_turn_audio:
@@ -290,6 +288,8 @@ class AudioBufferProcessor(FrameProcessor):
self._num_channels,
)
self._reset_audio_buffers()
def _buffer_has_audio(self, buffer: bytearray) -> bool:
"""Check if a buffer contains audio data."""
return buffer is not None and len(buffer) > 0

View File

@@ -12,6 +12,7 @@ from typing import Awaitable, Callable, Optional
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.producer_processor import ProducerProcessor, identity_transformer
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
class ConsumerProcessor(FrameProcessor):
@@ -65,7 +66,7 @@ class ConsumerProcessor(FrameProcessor):
async def _start(self, _: StartFrame):
"""Start the consumer task and register with the producer."""
if not self._consumer_task:
self._queue = self._producer.add_consumer()
self._queue: WatchdogQueue = self._producer.add_consumer()
self._consumer_task = self.create_task(self._consumer_task_handler())
async def _stop(self, _: EndFrame):
@@ -76,6 +77,7 @@ class ConsumerProcessor(FrameProcessor):
async def _cancel(self, _: CancelFrame):
"""Cancel the consumer task."""
if self._consumer_task:
self._queue.cancel()
await self.cancel_task(self._consumer_task)
async def _consumer_task_handler(self):

View File

@@ -32,8 +32,6 @@ from pipecat.frames.frames import (
TranscriptionFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
VADUserStartedSpeakingFrame,
VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -207,8 +205,6 @@ class STTMuteFilter(FrameProcessor):
(
StartInterruptionFrame,
StopInterruptionFrame,
VADUserStartedSpeakingFrame,
VADUserStoppedSpeakingFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
InputAudioRawFrame,

View File

@@ -14,7 +14,7 @@ management, and frame flow control mechanisms.
import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence, Tuple
from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Sequence
from loguru import logger
@@ -34,9 +34,11 @@ from pipecat.frames.frames import (
SystemFrame,
)
from pipecat.metrics.metrics import LLMTokenUsage, MetricsData
from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
from pipecat.observers.base_observer import BaseObserver, FramePushed
from pipecat.processors.metrics.frame_processor_metrics import FrameProcessorMetrics
from pipecat.utils.asyncio.task_manager import BaseTaskManager
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
from pipecat.utils.base_object import BaseObject
@@ -52,9 +54,6 @@ class FrameDirection(Enum):
UPSTREAM = 2
FrameCallback = Callable[["FrameProcessor", Frame, FrameDirection], Awaitable[None]]
@dataclass
class FrameProcessorSetup:
"""Configuration parameters for frame processor initialization.
@@ -63,54 +62,59 @@ class FrameProcessorSetup:
clock: The clock instance for timing operations.
task_manager: The task manager for handling async operations.
observer: Optional observer for monitoring frame processing events.
watchdog_timers_enabled: Whether to enable watchdog timers by default.
"""
clock: BaseClock
task_manager: BaseTaskManager
observer: Optional[BaseObserver] = None
watchdog_timers_enabled: bool = False
class FrameProcessorQueue(asyncio.PriorityQueue):
class FrameProcessorQueue(WatchdogQueue):
"""A priority queue for systems frames and other frames.
This is a specialized queue for frame processors that separates and
prioritizes system frames over other frames. It ensures that `SystemFrame`
objects are processed before any other frames by using a priority queue.
prioritizes system frames over other frames.
This queue uses two internal `WatchdogQueue` instances:
- One for system-level frames (`SystemFrame`)
- One for regular frames
It ensures that `SystemFrame` objects are processed before any other
frames. Additionally, it uses an `asyncio.Event` to signal when new items
have been added to either queue, allowing consumers to wait efficiently when
the queue is empty.
"""
HIGH_PRIORITY = 1
LOW_PRIORITY = 2
def __init__(self):
def __init__(self, manager: BaseTaskManager):
"""Initialize the FrameProcessorQueue.
Args:
manager (BaseTaskManager): The task manager used by the internal watchdog queues.
"""
super().__init__()
self.__high_counter = 0
self.__low_counter = 0
super().__init__(manager)
self.__event = WatchdogEvent(manager)
self.__main_queue = WatchdogQueue(manager)
self.__system_queue = WatchdogQueue(manager)
async def put(self, item: Tuple[Frame, FrameDirection, FrameCallback]):
"""Put an item into the priority queue.
async def put(self, item: Any):
"""Put an item into the appropriate queue.
System frames (`SystemFrame`) have higher priority than any other
frames. If a non-frame item (e.g. a watchdog cancellation sentinel) is
provided it will have the highest priority.
System frames (`SystemFrame`) are placed into the system queue and all others
into the regular queue. Signals the event to wake up any waiting consumers.
Args:
item (Any): The item to enqueue.
"""
frame, _, _ = item
if isinstance(frame, SystemFrame):
self.__high_counter += 1
await super().put((self.HIGH_PRIORITY, self.__high_counter, item))
if isinstance(item, SystemFrame):
await self.__system_queue.put(item)
else:
self.__low_counter += 1
await super().put((self.LOW_PRIORITY, self.__low_counter, item))
await self.__main_queue.put(item)
self.__event.set()
async def get(self) -> Any:
"""Retrieve the next item from the queue.
@@ -122,9 +126,38 @@ class FrameProcessorQueue(asyncio.PriorityQueue):
Any: The next item from the system or main queue.
"""
_, _, item = await super().get()
# Wait for an item in any of the queues if they are empty.
if self.__main_queue.empty() and self.__system_queue.empty():
await self.__event.wait()
# Prioritize system frames.
if self.__system_queue.qsize() > 0:
item = await self.__system_queue.get()
self.__system_queue.task_done()
else:
item = await self.__main_queue.get()
self.__main_queue.task_done()
# Clear the event only if all queues are empty.
if self.__main_queue.empty() and self.__system_queue.empty():
self.__event.clear()
return item
def cancel(self):
"""Cancel both internal queues.
This method is used to stop processing and release any pending tasks
in both the system and main queues. Typically used during shutdown
or cleanup to prevent further processing of frames.
"""
self.__main_queue.cancel()
self.__system_queue.cancel()
FrameCallback = Callable[["FrameProcessor", Frame, FrameDirection], Awaitable[None]]
class FrameProcessor(BaseObject):
"""Base class for all frame processors in the pipeline.
@@ -142,24 +175,35 @@ class FrameProcessor(BaseObject):
self,
*,
name: Optional[str] = None,
enable_direct_mode: bool = False,
enable_watchdog_logging: Optional[bool] = None,
enable_watchdog_timers: Optional[bool] = None,
metrics: Optional[FrameProcessorMetrics] = None,
watchdog_timeout_secs: Optional[float] = None,
**kwargs,
):
"""Initialize the frame processor.
Args:
name: Optional name for this processor instance.
enable_direct_mode: Whether to process frames immediately or use internal queues.
enable_watchdog_logging: Whether to enable watchdog logging for tasks.
enable_watchdog_timers: Whether to enable watchdog timers for tasks.
metrics: Optional metrics collector for this processor.
watchdog_timeout_secs: Timeout in seconds for watchdog operations.
**kwargs: Additional arguments passed to parent class.
"""
super().__init__(name=name, **kwargs)
super().__init__(name=name)
self._parent: Optional["FrameProcessor"] = None
self._prev: Optional["FrameProcessor"] = None
self._next: Optional["FrameProcessor"] = None
# Enable direct mode to skip queues and process frames right away.
self._enable_direct_mode = enable_direct_mode
# Enable watchdog timers for all tasks created by this frame processor.
self._enable_watchdog_timers = enable_watchdog_timers
# Enable watchdog logging for all tasks created by this frame processor.
self._enable_watchdog_logging = enable_watchdog_logging
# Allow this frame processor to control its tasks' timeout.
self._watchdog_timeout_secs = watchdog_timeout_secs
# Clock
self._clock: Optional[BaseClock] = None
@@ -202,8 +246,6 @@ class FrameProcessor(BaseObject):
# The input task that handles all types of frames. It processes system
# frames right away and queues non-system frames for later processing.
self.__should_block_system_frames = False
self.__input_event: Optional[asyncio.Event] = None
self.__input_frame_task: Optional[asyncio.Task] = None
# The process task processes non-system frames. Non-system frames will
@@ -212,8 +254,9 @@ class FrameProcessor(BaseObject):
# called. To resume processing frames we need to call
# `resume_processing_frames()` which will wake up the event.
self.__should_block_frames = False
self.__process_event: Optional[asyncio.Event] = None
self.__process_event = None
self.__process_frame_task: Optional[asyncio.Task] = None
self.__process_queue = None
@property
def id(self) -> int:
@@ -233,50 +276,6 @@ class FrameProcessor(BaseObject):
"""
return self._name
@property
def processors(self) -> List["FrameProcessor"]:
"""Return the list of sub-processors contained within this processor.
Only compound processors (e.g. pipelines and parallel pipelines) have
sub-processors. Non-compound processors will return an empty list.
Returns:
The list of sub-processors if this is a compound processor.
"""
return []
@property
def entry_processors(self) -> List["FrameProcessor"]:
"""Return the list of entry processors for this processor.
Entry processors are the first processors in a compound processor
(e.g. pipelines, parallel pipelines). Note that pipelines can also be an
entry processor as pipelines are processors themselves. Non-compound
processors will simply return an empty list.
Returns:
The list of entry processors.
"""
return []
@property
def next(self) -> Optional["FrameProcessor"]:
"""Get the next processor.
Returns:
The next processor, or None if there's no next processor.
"""
return self._next
@property
def previous(self) -> Optional["FrameProcessor"]:
"""Get the previous processor.
Returns:
The previous processor, or None if there's no previous processor.
"""
return self._prev
@property
def interruptions_allowed(self):
"""Check if interruptions are allowed for this processor.
@@ -336,17 +335,6 @@ class FrameProcessor(BaseObject):
raise Exception(f"{self} TaskManager is still not initialized.")
return self._task_manager
def processors_with_metrics(self):
"""Return processors that can generate metrics.
Recursively collects all processors that support metrics generation,
including those from nested processors.
Returns:
List of frame processors that can generate metrics.
"""
return []
def can_generate_metrics(self) -> bool:
"""Check if this processor can generate metrics.
@@ -414,12 +402,23 @@ class FrameProcessor(BaseObject):
await self.stop_ttfb_metrics()
await self.stop_processing_metrics()
def create_task(self, coroutine: Coroutine, name: Optional[str] = None) -> asyncio.Task:
def create_task(
self,
coroutine: Coroutine,
name: Optional[str] = None,
*,
enable_watchdog_logging: Optional[bool] = None,
enable_watchdog_timers: Optional[bool] = None,
watchdog_timeout_secs: Optional[float] = None,
) -> asyncio.Task:
"""Create a new task managed by this processor.
Args:
coroutine: The coroutine to run in the task.
name: Optional name for the task.
enable_watchdog_logging: Whether to enable watchdog logging.
enable_watchdog_timers: Whether to enable watchdog timers.
watchdog_timeout_secs: Timeout in seconds for watchdog operations.
Returns:
The created asyncio task.
@@ -428,7 +427,21 @@ class FrameProcessor(BaseObject):
name = f"{self}::{name}"
else:
name = f"{self}::{coroutine.cr_code.co_name}"
return self.task_manager.create_task(coroutine, name)
return self.task_manager.create_task(
coroutine,
name,
enable_watchdog_logging=(
enable_watchdog_logging
if enable_watchdog_logging
else self._enable_watchdog_logging
),
enable_watchdog_timers=(
enable_watchdog_timers if enable_watchdog_timers else self._enable_watchdog_timers
),
watchdog_timeout=(
watchdog_timeout_secs if watchdog_timeout_secs else self._watchdog_timeout_secs
),
)
async def cancel_task(self, task: asyncio.Task, timeout: Optional[float] = None):
"""Cancel a task managed by this processor.
@@ -442,27 +455,15 @@ class FrameProcessor(BaseObject):
async def wait_for_task(self, task: asyncio.Task, timeout: Optional[float] = None):
"""Wait for a task to complete.
.. deprecated:: 0.0.81
This function is deprecated, use `await task` or
`await asyncio.wait_for(task, timeout)` instead.
Args:
task: The task to wait for.
timeout: Optional timeout for waiting.
"""
import warnings
await self.task_manager.wait_for_task(task, timeout)
warnings.warn(
"`FrameProcessor.wait_for_task()` is deprecated. "
"Use `await task` or `await asyncio.wait_for(task, timeout)` instead.",
DeprecationWarning,
stacklevel=2,
)
if timeout:
await asyncio.wait_for(task, timeout)
else:
await task
def reset_watchdog(self):
"""Reset the watchdog timer for the current task."""
self.task_manager.task_reset_watchdog()
async def setup(self, setup: FrameProcessorSetup):
"""Set up the processor with required components.
@@ -473,6 +474,11 @@ class FrameProcessor(BaseObject):
self._clock = setup.clock
self._task_manager = setup.task_manager
self._observer = setup.observer
self._watchdog_timers_enabled = (
self._enable_watchdog_timers
if self._enable_watchdog_timers
else setup.watchdog_timers_enabled
)
# Create processing tasks.
self.__create_input_task()
@@ -498,6 +504,30 @@ class FrameProcessor(BaseObject):
processor._prev = self
logger.debug(f"Linking {self} -> {self._next}")
def get_event_loop(self) -> asyncio.AbstractEventLoop:
"""Get the event loop used by this processor.
Returns:
The asyncio event loop.
"""
return self.task_manager.get_event_loop()
def set_parent(self, parent: "FrameProcessor"):
"""Set the parent processor for this processor.
Args:
parent: The parent processor.
"""
self._parent = parent
def get_parent(self) -> Optional["FrameProcessor"]:
"""Get the parent processor.
Returns:
The parent processor, or None if no parent is set.
"""
return self._parent
def get_clock(self) -> BaseClock:
"""Get the clock used by this processor.
@@ -511,14 +541,6 @@ class FrameProcessor(BaseObject):
raise Exception(f"{self} Clock is still not initialized.")
return self._clock
def get_event_loop(self) -> asyncio.AbstractEventLoop:
"""Get the event loop used by this processor.
Returns:
The asyncio event loop.
"""
return self.task_manager.get_event_loop()
async def queue_frame(
self,
frame: Frame,
@@ -536,33 +558,19 @@ class FrameProcessor(BaseObject):
if self._cancelling:
return
if self._enable_direct_mode:
await self.__process_frame(frame, direction, callback)
else:
await self.__input_queue.put((frame, direction, callback))
await self.__input_queue.put((frame, direction, callback))
async def pause_processing_frames(self):
"""Pause processing of queued frames."""
logger.trace(f"{self}: pausing frame processing")
self.__should_block_frames = True
async def pause_processing_system_frames(self):
"""Pause processing of queued system frames."""
logger.trace(f"{self}: pausing system frame processing")
self.__should_block_system_frames = True
async def resume_processing_frames(self):
"""Resume processing of queued frames."""
logger.trace(f"{self}: resuming frame processing")
if self.__process_event:
self.__process_event.set()
async def resume_processing_system_frames(self):
"""Resume processing of queued system frames."""
logger.trace(f"{self}: resuming system frame processing")
if self.__input_event:
self.__input_event.set()
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process a frame.
@@ -570,16 +578,6 @@ class FrameProcessor(BaseObject):
frame: The frame to process.
direction: The direction of frame flow.
"""
if self._observer:
timestamp = self._clock.get_time() if self._clock else 0
data = FrameProcessed(
processor=self,
frame=frame,
direction=direction,
timestamp=timestamp,
)
await self._observer.on_process_frame(data)
if isinstance(frame, StartFrame):
await self.__start(frame)
elif isinstance(frame, StartInterruptionFrame):
@@ -732,39 +730,36 @@ class FrameProcessor(BaseObject):
def __create_input_task(self):
"""Create the frame input processing task."""
if self._enable_direct_mode:
return
if not self.__input_frame_task:
self.__input_event = asyncio.Event()
self.__input_queue = FrameProcessorQueue()
self.__input_queue = FrameProcessorQueue(self.task_manager)
self.__input_frame_task = self.create_task(self.__input_frame_task_handler())
async def __cancel_input_task(self):
"""Cancel the frame input processing task."""
if self.__input_frame_task:
self.__input_queue.cancel()
await self.cancel_task(self.__input_frame_task)
self.__input_frame_task = None
def __create_process_task(self):
"""Create the non-system frame processing task."""
if self._enable_direct_mode:
return
if not self.__process_frame_task:
self.__should_block_frames = False
self.__process_event = asyncio.Event()
self.__process_queue = asyncio.Queue()
if not self.__process_event:
self.__process_event = WatchdogEvent(self.task_manager)
self.__process_event.clear()
self.__process_queue = WatchdogQueue(self.task_manager)
self.__process_frame_task = self.create_task(self.__process_frame_task_handler())
async def __cancel_process_task(self):
"""Cancel the non-system frame processing task."""
if self.__process_frame_task:
self.__process_queue.cancel()
await self.cancel_task(self.__process_frame_task)
self.__process_frame_task = None
async def __process_frame(
self, frame: Frame, direction: FrameDirection, callback: Optional[FrameCallback]
self, frame: Frame, direction: FrameDirection, callback: FrameCallback
):
try:
# Process the frame.
@@ -784,13 +779,6 @@ class FrameProcessor(BaseObject):
"""
while True:
if self.__should_block_system_frames and self.__input_event:
logger.trace(f"{self}: system frame processing paused")
await self.__input_event.wait()
self.__input_event.clear()
self.__should_block_system_frames = False
logger.trace(f"{self}: system frame processing resumed")
(frame, direction, callback) = await self.__input_queue.get()
if isinstance(frame, SystemFrame):
@@ -802,8 +790,6 @@ class FrameProcessor(BaseObject):
f"{self}: __process_queue is None when processing frame {frame.name}"
)
self.__input_queue.task_done()
async def __process_frame_task_handler(self):
"""Handle non-system frames from the process queue."""
while True:
@@ -817,5 +803,3 @@ class FrameProcessor(BaseObject):
(frame, direction, callback) = await self.__process_queue.get()
await self.__process_frame(frame, direction, callback)
self.__process_queue.task_done()

View File

@@ -72,9 +72,11 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.llm_service import (
FunctionCallParams, # TODO(aleix): we shouldn't import `services` from `processors`
)
from pipecat.services.openai.llm import OpenAIContextAggregatorPair
from pipecat.transports.base_input import BaseInputTransport
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.transports.base_transport import BaseTransport
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
from pipecat.utils.string import match_endofsentence
RTVI_PROTOCOL_VERSION = "1.0.0"
@@ -1313,10 +1315,10 @@ class RTVIProcessor(FrameProcessor):
async def _start(self, frame: StartFrame):
"""Start the RTVI processor tasks."""
if not self._action_task:
self._action_queue = asyncio.Queue()
self._action_queue = WatchdogQueue(self.task_manager)
self._action_task = self.create_task(self._action_task_handler())
if not self._message_task:
self._message_queue = asyncio.Queue()
self._message_queue = WatchdogQueue(self.task_manager)
self._message_task = self.create_task(self._message_task_handler())
await self._call_event_handler("on_bot_started")
@@ -1331,10 +1333,12 @@ class RTVIProcessor(FrameProcessor):
async def _cancel_tasks(self):
"""Cancel all running tasks."""
if self._action_task:
self._action_queue.cancel()
await self.cancel_task(self._action_task)
self._action_task = None
if self._message_task:
self._message_queue.cancel()
await self.cancel_task(self._message_task)
self._message_task = None

View File

@@ -11,6 +11,7 @@ from typing import Awaitable, Callable, List, Optional
from pipecat.frames.frames import Frame, StartFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
class IdleFrameProcessor(FrameProcessor):
@@ -77,7 +78,7 @@ class IdleFrameProcessor(FrameProcessor):
def _create_idle_task(self):
"""Create and start the idle monitoring task."""
if not self._idle_task:
self._idle_event = asyncio.Event()
self._idle_event = WatchdogEvent(self.task_manager)
self._idle_task = self.create_task(self._idle_task_handler())
async def _idle_task_handler(self):

View File

@@ -9,6 +9,7 @@
from loguru import logger
from pipecat.utils.asyncio.task_manager import BaseTaskManager
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
try:
import sentry_sdk
@@ -50,7 +51,7 @@ class SentryMetrics(FrameProcessorMetrics):
"""
await super().setup(task_manager)
if self._sentry_available:
self._sentry_queue = asyncio.Queue()
self._sentry_queue = WatchdogQueue(task_manager)
self._sentry_task = self.task_manager.create_task(
self._sentry_task_handler(), name=f"{self}::_sentry_task_handler"
)
@@ -63,7 +64,7 @@ class SentryMetrics(FrameProcessorMetrics):
await super().cleanup()
if self._sentry_task:
await self._sentry_queue.put(None)
await self._sentry_task
await self.task_manager.wait_for_task(self._sentry_task)
self._sentry_task = None
logger.trace(f"{self} Flushing Sentry metrics")
sentry_sdk.flush(timeout=5.0)

View File

@@ -11,6 +11,7 @@ from typing import Awaitable, Callable, List
from pipecat.frames.frames import Frame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
async def identity_transformer(frame: Frame):
@@ -63,7 +64,7 @@ class ProducerProcessor(FrameProcessor):
Returns:
asyncio.Queue: The queue for the newly added consumer.
"""
queue = asyncio.Queue()
queue = WatchdogQueue(self.task_manager)
self._consumers.append(queue)
return queue

View File

@@ -22,6 +22,7 @@ from pipecat.frames.frames import (
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.asyncio.watchdog_event import WatchdogEvent
class UserIdleProcessor(FrameProcessor):
@@ -77,7 +78,7 @@ class UserIdleProcessor(FrameProcessor):
self._interrupted = False
self._conversation_started = False
self._idle_task = None
self._idle_event = asyncio.Event()
self._idle_event = None
def _wrap_callback(
self,
@@ -137,6 +138,9 @@ class UserIdleProcessor(FrameProcessor):
"""
await super().process_frame(frame, direction)
if isinstance(frame, StartFrame):
self._idle_event = WatchdogEvent(self.task_manager)
# Check for end frames before processing
if isinstance(frame, (EndFrame, CancelFrame)):
# Stop the idle task, if it exists

View File

@@ -53,7 +53,7 @@ Supported transports:
- Daily - Creates rooms and tokens, runs bot as participant
- WebRTC - Provides local WebRTC interface with prebuilt UI
- Telephony - Handles webhook and WebSocket connections for Twilio, Telnyx, Plivo, Exotel
- Telephony - Handles webhook and WebSocket connections for Twilio, Telnyx, Plivo
To run locally:
@@ -62,7 +62,6 @@ To run locally:
- Daily (server): `python bot.py -t daily`
- Daily (direct, testing only): `python bot.py -d`
- Telephony: `python bot.py -t twilio -x your_username.ngrok.io`
- Exotel: `python bot.py -t exotel` (no proxy needed, but ngrok connection to HTTP 7860 is required)
"""
import argparse
@@ -146,6 +145,7 @@ async def _run_telephony_bot(websocket: WebSocket):
# Just pass the WebSocket - let the bot handle parsing
runner_args = WebSocketRunnerArguments(websocket=websocket)
runner_args.handle_sigint = False
await bot_module.bot(runner_args)
@@ -169,7 +169,7 @@ def _create_server_app(
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
elif transport_type == "daily":
_setup_daily_routes(app)
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
elif transport_type in ["twilio", "telnyx", "plivo"]:
_setup_telephony_routes(app, transport_type, proxy)
else:
logger.warning(f"Unknown transport type: {transport_type}")
@@ -223,6 +223,7 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
bot_module = _get_bot_module()
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=pipecat_connection)
runner_args.handle_sigint = False
background_tasks.add_task(bot_module.bot, runner_args)
answer = pipecat_connection.get_answer()
@@ -265,6 +266,7 @@ def _setup_daily_routes(app: FastAPI):
# Start the bot in the background with empty body for GET requests
bot_module = _get_bot_module()
runner_args = DailyRunnerArguments(room_url=room_url, token=token)
runner_args.handle_sigint = False
asyncio.create_task(bot_module.bot(runner_args))
return RedirectResponse(room_url)
@@ -309,6 +311,7 @@ def _setup_daily_routes(app: FastAPI):
# Start the bot in the background with extracted body data
bot_module = _get_bot_module()
runner_args = DailyRunnerArguments(room_url=room_url, token=token, body=bot_body)
runner_args.handle_sigint = False
asyncio.create_task(bot_module.bot(runner_args))
# Match PCC /start endpoint response format:
return {"dailyRoom": room_url, "dailyToken": token}
@@ -334,7 +337,7 @@ def _setup_daily_routes(app: FastAPI):
def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
"""Set up telephony-specific routes."""
# XML response templates (Exotel doesn't use XML webhooks)
# XML response templates
XML_TEMPLATES = {
"twilio": f"""<?xml version="1.0" encoding="UTF-8"?>
<Response>
@@ -359,18 +362,9 @@ def _setup_telephony_routes(app: FastAPI, transport_type: str, proxy: str):
@app.post("/")
async def start_call():
"""Handle telephony webhook and return XML response."""
if transport_type == "exotel":
# Exotel doesn't use POST webhooks - redirect to proper documentation
logger.debug("POST Exotel endpoint - not used")
return {
"error": "Exotel doesn't use POST webhooks",
"websocket_url": f"wss://{proxy}/ws",
"note": "Configure the WebSocket URL above in your Exotel App Bazaar Voicebot Applet",
}
else:
logger.debug(f"POST {transport_type.upper()} XML")
xml_content = XML_TEMPLATES.get(transport_type, "<Response></Response>")
return HTMLResponse(content=xml_content, media_type="application/xml")
logger.debug(f"POST {transport_type.upper()} XML")
xml_content = XML_TEMPLATES.get(transport_type, "<Response></Response>")
return HTMLResponse(content=xml_content, media_type="application/xml")
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
@@ -402,7 +396,6 @@ async def _run_daily_direct():
# Direct connections have no request body, so use empty dict
runner_args = DailyRunnerArguments(room_url=room_url, token=token)
runner_args.handle_sigint = True
# Get the bot module and run it directly
bot_module = _get_bot_module()
@@ -447,7 +440,7 @@ def main():
Args:
--host: Server host address (default: localhost)
--port: Server port (default: 7860)
-t/--transport: Transport type (daily, webrtc, twilio, telnyx, plivo, exotel)
-t/--transport: Transport type (daily, webrtc, twilio, telnyx, plivo)
-x/--proxy: Public proxy hostname for telephony webhooks
--esp32: Enable SDP munging for ESP32 compatibility (requires --host with IP address)
-d/--direct: Connect directly to Daily room (automatically sets transport to daily)
@@ -462,7 +455,7 @@ def main():
"-t",
"--transport",
type=str,
choices=["daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"],
choices=["daily", "webrtc", "twilio", "telnyx", "plivo"],
default="webrtc",
help="Transport type",
)

View File

@@ -25,7 +25,7 @@ class RunnerArguments:
pipeline_idle_timeout_secs: int = field(init=False)
def __post_init__(self):
self.handle_sigint = False
self.handle_sigint = True
self.handle_sigterm = False
self.pipeline_idle_timeout_secs = 300

View File

@@ -77,17 +77,6 @@ def _detect_transport_type_from_message(message_data: dict) -> str:
logger.trace("Auto-detected: PLIVO")
return "plivo"
# Exotel detection
if (
message_data.get("event") == "start"
and "start" in message_data
and "stream_sid" in message_data.get("start", {})
and "call_sid" in message_data.get("start", {})
and "account_sid" in message_data.get("start", {})
):
logger.trace("Auto-detected: EXOTEL")
return "exotel"
logger.trace("Auto-detection failed - unknown format")
return "unknown"
@@ -102,7 +91,6 @@ async def parse_telephony_websocket(websocket: WebSocket):
- Twilio: {"stream_id": str, "call_id": str}
- Telnyx: {"stream_id": str, "call_control_id": str, "outbound_encoding": str}
- Plivo: {"stream_id": str, "call_id": str}
- Exotel: {"stream_id": str, "call_id": str, "account_sid": str}
Example usage::
@@ -172,14 +160,6 @@ async def parse_telephony_websocket(websocket: WebSocket):
"call_id": start_data.get("callId"),
}
elif transport_type == "exotel":
start_data = call_data_raw.get("start", {})
call_data = {
"stream_id": start_data.get("stream_sid"),
"call_id": start_data.get("call_sid"),
"account_sid": start_data.get("account_sid"),
}
else:
call_data = {}
@@ -260,7 +240,6 @@ async def maybe_capture_participant_screen(
await transport.capture_participant_video(
client["id"], framerate=framerate, video_source="screenVideo"
)
except ImportError:
pass
@@ -400,17 +379,10 @@ async def _create_telephony_transport(
auth_id=os.getenv("PLIVO_AUTH_ID", ""),
auth_token=os.getenv("PLIVO_AUTH_TOKEN", ""),
)
elif transport_type == "exotel":
from pipecat.serializers.exotel import ExotelFrameSerializer
params.serializer = ExotelFrameSerializer(
stream_sid=call_data["stream_id"],
call_sid=call_data["call_id"],
)
else:
raise ValueError(
f"Unsupported telephony provider: {transport_type}. "
f"Supported providers: twilio, telnyx, plivo, exotel"
f"Supported providers: twilio, telnyx, plivo"
)
return FastAPIWebsocketTransport(websocket=websocket, params=params)
@@ -427,7 +399,7 @@ async def create_transport(
Args:
runner_args: Arguments from the runner.
transport_params: Dict mapping transport names to parameter factory functions.
Keys should be: "daily", "webrtc", "twilio", "telnyx", "plivo", "exotel"
Keys should be: "daily", "webrtc", "twilio", "telnyx", "plivo"
Values should be functions that return transport parameters when called.
Returns:
@@ -468,12 +440,6 @@ async def create_transport(
vad_analyzer=SileroVADAnalyzer(),
# add_wav_header and serializer will be set automatically
),
"exotel": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
# add_wav_header and serializer will be set automatically
),
}
transport = await create_transport(runner_args, transport_params)

View File

@@ -53,10 +53,11 @@ from pipecat.processors.aggregators.openai_llm_context import (
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.tracing.service_decorators import traced_llm
try:
from anthropic import NOT_GIVEN, APITimeoutError, AsyncAnthropic, NotGiven
from anthropic import NOT_GIVEN, AsyncAnthropic, NotGiven
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`.")
@@ -132,8 +133,6 @@ class AnthropicLLMService(LLMService):
model: str = "claude-sonnet-4-20250514",
params: Optional[InputParams] = None,
client=None,
retry_timeout_secs: Optional[float] = 5.0,
retry_on_timeout: Optional[bool] = False,
**kwargs,
):
"""Initialize the Anthropic LLM service.
@@ -143,8 +142,6 @@ class AnthropicLLMService(LLMService):
model: Model name to use. Defaults to "claude-sonnet-4-20250514".
params: Optional model parameters for inference.
client: Optional custom Anthropic client instance.
retry_timeout_secs: Request timeout in seconds for retry logic.
retry_on_timeout: Whether to retry the request once if it times out.
**kwargs: Additional arguments passed to parent LLMService.
"""
super().__init__(**kwargs)
@@ -153,8 +150,6 @@ class AnthropicLLMService(LLMService):
api_key=api_key
) # if the client is provided, use it and remove it, otherwise create a new one
self.set_model_name(model)
self._retry_timeout_secs = retry_timeout_secs
self._retry_on_timeout = retry_on_timeout
self._settings = {
"max_tokens": params.max_tokens,
"enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
@@ -172,31 +167,6 @@ class AnthropicLLMService(LLMService):
"""
return True
async def _create_message_stream(self, api_call, params):
"""Create message stream with optional timeout and retry.
Args:
api_call: The Anthropic API method to call.
params: Parameters for the API call.
Returns:
Async stream of message events.
"""
if self._retry_on_timeout:
try:
response = await asyncio.wait_for(
api_call(**params), timeout=self._retry_timeout_secs
)
return response
except (APITimeoutError, asyncio.TimeoutError):
# Retry, this time without a timeout so we get a response
logger.debug(f"{self}: Retrying message creation due to timeout")
response = await api_call(**params)
return response
else:
response = await api_call(**params)
return response
@property
def enable_prompt_caching_beta(self) -> bool:
"""Check if prompt caching beta feature is enabled.
@@ -280,7 +250,7 @@ class AnthropicLLMService(LLMService):
params.update(self._settings["extra"])
response = await self._create_message_stream(api_call, params)
response = await api_call(**params)
await self.stop_ttfb_metrics()
@@ -289,7 +259,7 @@ class AnthropicLLMService(LLMService):
json_accumulator = ""
function_calls = []
async for event in response:
async for event in WatchdogAsyncIterator(response, manager=self.task_manager):
# Aggregate streaming content, create frames, trigger events
if event.type == "content_block_delta":

View File

@@ -219,7 +219,10 @@ class AssemblyAISTTService(STTService):
await self._websocket.send(json.dumps({"type": "Terminate"}))
try:
await asyncio.wait_for(self._termination_event.wait(), timeout=5.0)
await asyncio.wait_for(
self._termination_event.wait(),
timeout=5.0,
)
except asyncio.TimeoutError:
logger.warning("Timed out waiting for termination message from server")
@@ -244,9 +247,11 @@ class AssemblyAISTTService(STTService):
try:
while self._connected:
try:
message = await self._websocket.recv()
message = await asyncio.wait_for(self._websocket.recv(), timeout=1.0)
data = json.loads(message)
await self._handle_message(data)
except asyncio.TimeoutError:
self.reset_watchdog()
except websockets.exceptions.ConnectionClosedOK:
break
except Exception as e:

View File

@@ -29,6 +29,7 @@ from pipecat.frames.frames import (
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.tracing.service_decorators import traced_tts
try:
@@ -275,7 +276,9 @@ class AsyncAITTSService(InterruptibleTTSService):
self._started = False
async def _receive_messages(self):
async for message in self._get_websocket():
async for message in WatchdogAsyncIterator(
self._get_websocket(), manager=self.task_manager
):
msg = json.loads(message)
if not msg:
continue
@@ -298,8 +301,9 @@ class AsyncAITTSService(InterruptibleTTSService):
async def _keepalive_task_handler(self):
"""Send periodic keepalive messages to maintain WebSocket connection."""
KEEPALIVE_SLEEP = 3
KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
while True:
self.reset_watchdog()
await asyncio.sleep(KEEPALIVE_SLEEP)
try:
if self._websocket and self._websocket.state is State.OPEN:
@@ -331,7 +335,7 @@ class AsyncAITTSService(InterruptibleTTSService):
yield TTSStartedFrame()
self._started = True
msg = self._build_msg(text=text, force=True)
msg = self._build_msg(text=text)
try:
await self._get_websocket().send(msg)

View File

@@ -58,7 +58,6 @@ try:
import aioboto3
import httpx
from botocore.config import Config
from botocore.exceptions import ReadTimeoutError
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -725,8 +724,6 @@ class AWSBedrockLLMService(LLMService):
aws_region: str = "us-east-1",
params: Optional[InputParams] = None,
client_config: Optional[Config] = None,
retry_timeout_secs: Optional[float] = 5.0,
retry_on_timeout: Optional[bool] = False,
**kwargs,
):
"""Initialize the AWS Bedrock LLM service.
@@ -739,8 +736,6 @@ class AWSBedrockLLMService(LLMService):
aws_region: AWS region for the Bedrock service.
params: Model parameters and configuration.
client_config: Custom boto3 client configuration.
retry_timeout_secs: Request timeout in seconds for retry logic.
retry_on_timeout: Whether to retry the request once if it times out.
**kwargs: Additional arguments passed to parent LLMService.
"""
super().__init__(**kwargs)
@@ -767,8 +762,6 @@ class AWSBedrockLLMService(LLMService):
}
self.set_model_name(model)
self._retry_timeout_secs = retry_timeout_secs
self._retry_on_timeout = retry_on_timeout
self._settings = {
"max_tokens": params.max_tokens,
"temperature": params.temperature,
@@ -789,31 +782,6 @@ class AWSBedrockLLMService(LLMService):
"""
return True
async def _create_converse_stream(self, client, request_params):
"""Create converse stream with optional timeout and retry.
Args:
client: The AWS Bedrock client instance.
request_params: Parameters for the converse_stream call.
Returns:
Async stream of response events.
"""
if self._retry_on_timeout:
try:
response = await asyncio.wait_for(
await client.converse_stream(**request_params), timeout=self._retry_timeout_secs
)
return response
except (ReadTimeoutError, asyncio.TimeoutError) as e:
# Retry, this time without a timeout so we get a response
logger.debug(f"{self}: Retrying converse_stream due to timeout")
response = await client.converse_stream(**request_params)
return response
else:
response = await client.converse_stream(**request_params)
return response
def create_context_aggregator(
self,
context: OpenAILLMContext,
@@ -943,7 +911,7 @@ class AWSBedrockLLMService(LLMService):
service_name="bedrock-runtime", **self._aws_params
) as client:
# Call AWS Bedrock with streaming
response = await self._create_converse_stream(client, request_params)
response = await client.converse_stream(**request_params)
await self.stop_ttfb_metrics()
@@ -954,6 +922,8 @@ class AWSBedrockLLMService(LLMService):
function_calls = []
async for event in response["stream"]:
self.reset_watchdog()
# Handle text content
if "contentBlockDelta" in event:
delta = event["contentBlockDelta"]["delta"]

View File

@@ -480,7 +480,7 @@ class AWSTranscribeSTTService(STTService):
break
try:
response = await self._ws_client.recv()
response = await asyncio.wait_for(self._ws_client.recv(), timeout=1.0)
headers, payload = decode_event(response)
@@ -531,6 +531,8 @@ class AWSTranscribeSTTService(STTService):
else:
logger.debug(f"{self} Other message type received: {headers}")
logger.debug(f"{self} Payload: {payload}")
except asyncio.TimeoutError:
self.reset_watchdog()
except websockets.exceptions.ConnectionClosed as e:
logger.error(
f"{self} WebSocket connection closed in receive loop with code {e.code}: {e.reason}"

View File

@@ -62,6 +62,7 @@ from pipecat.services.aws_nova_sonic.context import (
)
from pipecat.services.aws_nova_sonic.frames import AWSNovaSonicFunctionCallResultFrame
from pipecat.services.llm_service import LLMService
from pipecat.utils.asyncio.watchdog_coroutine import watchdog_coroutine
from pipecat.utils.time import time_now_iso8601
try:
@@ -794,7 +795,7 @@ class AWSNovaSonicLLMService(LLMService):
try:
while self._stream and not self._disconnecting:
output = await self._stream.await_output()
result = await output[1].receive()
result = await watchdog_coroutine(output[1].receive(), manager=self.task_manager)
if result.value and result.value.bytes_:
response_data = result.value.bytes_.decode("utf-8")

View File

@@ -60,7 +60,6 @@ class AzureSTTService(STTService):
region: str,
language: Language = Language.EN_US,
sample_rate: Optional[int] = None,
endpoint_id: Optional[str] = None,
**kwargs,
):
"""Initialize the Azure STT service.
@@ -70,7 +69,6 @@ class AzureSTTService(STTService):
region: Azure region for the Speech service (e.g., 'eastus').
language: Language for speech recognition. Defaults to English (US).
sample_rate: Audio sample rate in Hz. If None, uses service default.
endpoint_id: Custom model endpoint id.
**kwargs: Additional arguments passed to parent STTService.
"""
super().__init__(sample_rate=sample_rate, **kwargs)
@@ -81,9 +79,6 @@ class AzureSTTService(STTService):
speech_recognition_language=language_to_azure_language(language),
)
if endpoint_id:
self._speech_config.endpoint_id = endpoint_id
self._audio_stream = None
self._speech_recognizer = None
self._settings = {

View File

@@ -68,16 +68,6 @@ class AzureBaseTTSService(TTSService):
construction, voice configuration, and parameter management.
"""
# Define SSML escape mappings based on SSML reserved characters
# See - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
SSML_ESCAPE_CHARS = {
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
'"': "&quot;",
"'": "&apos;",
}
class InputParams(BaseModel):
"""Input parameters for Azure TTS voice configuration.
@@ -164,10 +154,6 @@ class AzureBaseTTSService(TTSService):
def _construct_ssml(self, text: str) -> str:
language = self._settings["language"]
# Escape special characters
escaped_text = self._escape_text(text)
ssml = (
f"<speak version='1.0' xml:lang='{language}' "
"xmlns='http://www.w3.org/2001/10/synthesis' "
@@ -197,7 +183,7 @@ class AzureBaseTTSService(TTSService):
if self._settings["emphasis"]:
ssml += f"<emphasis level='{self._settings['emphasis']}'>"
ssml += escaped_text
ssml += text
if self._settings["emphasis"]:
ssml += "</emphasis>"
@@ -211,27 +197,6 @@ class AzureBaseTTSService(TTSService):
return ssml
def _escape_text(self, text: str) -> str:
"""Escapes XML/SSML reserved characters according to Microsoft documentation.
This method escapes the following characters:
- & becomes &amp;
- < becomes &lt;
- > becomes &gt;
- " becomes &quot;
- ' becomes &apos;
Args:
text: The text to escape.
Returns:
The escaped text.
"""
escaped_text = text
for char, escape_code in AzureBaseTTSService.SSML_ESCAPE_CHARS.items():
escaped_text = escaped_text.replace(char, escape_code)
return escaped_text
class AzureTTSService(AzureBaseTTSService):
"""Azure Cognitive Services streaming TTS service.

View File

@@ -29,6 +29,7 @@ from pipecat.frames.frames import (
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
from pipecat.utils.tracing.service_decorators import traced_tts
@@ -386,8 +387,10 @@ class CartesiaTTSService(AudioContextWordTTSService):
await self._websocket.send(msg)
self._context_id = None
async def _process_messages(self):
async for message in self._get_websocket():
async def _receive_messages(self):
async for message in WatchdogAsyncIterator(
self._get_websocket(), manager=self.task_manager
):
msg = json.loads(message)
if not msg or not self.audio_context_available(msg["context_id"]):
continue
@@ -419,14 +422,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
else:
logger.error(f"{self} error, unknown message type: {msg}")
async def _receive_messages(self):
while True:
await self._process_messages()
# Cartesia times out after 5 minutes of innactivity (no keepalive
# mechanism is available). So, we try to reconnect.
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
await self._connect_websocket()
@traced_tts
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text using Cartesia's streaming API.

View File

@@ -9,7 +9,8 @@
from typing import List
from loguru import logger
from openai.types.chat import ChatCompletionMessageParam
from openai import AsyncStream
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai.llm import OpenAILLMService
@@ -54,13 +55,20 @@ class CerebrasLLMService(OpenAILLMService):
logger.debug(f"Creating Cerebras client with api {base_url}")
return super().create_client(api_key, base_url, **kwargs)
def build_chat_completion_params(
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> dict:
"""Build parameters for Cerebras chat completion request.
) -> AsyncStream[ChatCompletionChunk]:
"""Create a streaming chat completion using Cerebras's API.
Cerebras supports a subset of OpenAI parameters, focusing on core
completion settings without advanced features like frequency/presence penalties.
Args:
context: The context object containing tools configuration
and other settings for the chat completion.
messages: The list of messages comprising
the conversation history and current request.
Returns:
A streaming response of chat completion
chunks that can be processed asynchronously.
"""
params = {
"model": self.model_name,
@@ -75,4 +83,6 @@ class CerebrasLLMService(OpenAILLMService):
}
params.update(self._settings["extra"])
return params
chunks = await self._client.chat.completions.create(**params)
return chunks

View File

@@ -9,7 +9,8 @@
from typing import List
from loguru import logger
from openai.types.chat import ChatCompletionMessageParam
from openai import AsyncStream
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai.llm import OpenAILLMService
@@ -54,12 +55,20 @@ class DeepSeekLLMService(OpenAILLMService):
logger.debug(f"Creating DeepSeek client with api {base_url}")
return super().create_client(api_key, base_url, **kwargs)
def _build_chat_completion_params(
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> dict:
"""Build parameters for DeepSeek chat completion request.
) -> AsyncStream[ChatCompletionChunk]:
"""Create a streaming chat completion using DeepSeek's API.
DeepSeek doesn't support some OpenAI parameters like seed and max_completion_tokens.
Args:
context: The context object containing tools configuration
and other settings for the chat completion.
messages: The list of messages comprising the conversation
history and current request.
Returns:
A streaming response of chat completion chunks that can be
processed asynchronously.
"""
params = {
"model": self.model_name,
@@ -76,4 +85,6 @@ class DeepSeekLLMService(OpenAILLMService):
}
params.update(self._settings["extra"])
return params
chunks = await self._client.chat.completions.create(**params)
return chunks

View File

@@ -38,6 +38,7 @@ from pipecat.services.tts_service import (
WordTTSService,
)
from pipecat.transcriptions.language import Language
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.tracing.service_decorators import traced_tts
# See .env.example for ElevenLabs configuration needed
@@ -244,7 +245,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
auto_mode: Whether to enable automatic mode optimization.
enable_ssml_parsing: Whether to parse SSML tags in text.
enable_logging: Whether to enable ElevenLabs logging.
apply_text_normalization: Text normalization mode ("auto", "on", "off").
"""
language: Optional[Language] = None
@@ -256,7 +256,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
auto_mode: Optional[bool] = True
enable_ssml_parsing: Optional[bool] = None
enable_logging: Optional[bool] = None
apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
def __init__(
self,
@@ -321,7 +320,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
"auto_mode": str(params.auto_mode).lower(),
"enable_ssml_parsing": params.enable_ssml_parsing,
"enable_logging": params.enable_logging,
"apply_text_normalization": params.apply_text_normalization,
}
self.set_model_name(model)
self.set_voice(voice_id)
@@ -372,49 +370,13 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
await self._connect()
async def _update_settings(self, settings: Mapping[str, Any]):
"""Update service settings and reconnect if voice, model, or language changed."""
# Track previous values for settings that require reconnection
"""Update service settings and reconnect if voice changed."""
prev_voice = self._voice_id
prev_model = self.model_name
prev_language = self._settings.get("language")
# Create snapshot of current voice settings to detect changes after update
prev_voice_settings = self._voice_settings.copy() if self._voice_settings else None
await super()._update_settings(settings)
# Update voice settings for the next context creation
self._voice_settings = self._set_voice_settings()
# Check if URL-level settings changed (these require reconnection)
url_changed = (
prev_voice != self._voice_id
or prev_model != self.model_name
or prev_language != self._settings.get("language")
)
# Check if only voice settings changed (speed, stability, etc.)
voice_settings_changed = prev_voice_settings != self._voice_settings
if url_changed:
# These settings are in the WebSocket URL, so we need to reconnect
logger.debug(
f"URL-level setting changed (voice/model/language), reconnecting WebSocket"
)
if not prev_voice == self._voice_id:
logger.info(f"Switching TTS voice to: [{self._voice_id}]")
await self._disconnect()
await self._connect()
elif voice_settings_changed and self._context_id:
# Voice settings can be updated by closing current context
# so new one gets created with updated voice settings
logger.debug(f"Voice settings changed, closing current context to apply changes")
try:
if self._websocket:
await self._websocket.send(
json.dumps({"context_id": self._context_id, "close_context": True})
)
except Exception as e:
logger.warning(f"Error closing context for voice settings update: {e}")
self._context_id = None
self._started = False
async def start(self, frame: StartFrame):
"""Start the ElevenLabs TTS service.
@@ -503,9 +465,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
if self._settings["enable_logging"]:
url += f"&enable_logging={self._settings['enable_logging']}"
if self._settings["apply_text_normalization"] is not None:
url += f"&apply_text_normalization={self._settings['apply_text_normalization']}"
# Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
language = self._settings["language"]
if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:
@@ -536,7 +495,6 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
if self._context_id:
await self._websocket.send(json.dumps({"close_socket": True}))
await self._websocket.close()
logger.debug("Disconnected from ElevenLabs")
except Exception as e:
logger.error(f"{self} error closing websocket: {e}")
finally:
@@ -573,7 +531,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
async def _receive_messages(self):
"""Handle incoming WebSocket messages from ElevenLabs."""
async for message in self._get_websocket():
async for message in WatchdogAsyncIterator(
self._get_websocket(), manager=self.task_manager
):
msg = json.loads(message)
received_ctx_id = msg.get("contextId")
@@ -632,8 +592,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
async def _keepalive_task_handler(self):
"""Send periodic keepalive messages to maintain WebSocket connection."""
KEEPALIVE_SLEEP = 10
KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
while True:
self.reset_watchdog()
await asyncio.sleep(KEEPALIVE_SLEEP)
try:
if self._websocket and self._websocket.state is State.OPEN:
@@ -734,7 +695,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
style: Style control for voice expression (0.0 to 1.0).
use_speaker_boost: Whether to use speaker boost enhancement.
speed: Voice speed control (0.25 to 4.0).
apply_text_normalization: Text normalization mode ("auto", "on", "off").
"""
language: Optional[Language] = None
@@ -744,7 +704,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
style: Optional[float] = None
use_speaker_boost: Optional[bool] = None
speed: Optional[float] = None
apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
def __init__(
self,
@@ -795,7 +754,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
"style": params.style,
"use_speaker_boost": params.use_speaker_boost,
"speed": params.speed,
"apply_text_normalization": params.apply_text_normalization,
}
self.set_model_name(model)
self.set_voice(voice_id)
@@ -979,8 +937,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
}
if self._settings["optimize_streaming_latency"] is not None:
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
if self._settings["apply_text_normalization"] is not None:
params["apply_text_normalization"] = self._settings["apply_text_normalization"]
try:
await self.start_ttfb_metrics()

View File

@@ -54,13 +54,20 @@ class FireworksLLMService(OpenAILLMService):
logger.debug(f"Creating Fireworks client with api {base_url}")
return super().create_client(api_key, base_url, **kwargs)
def build_chat_completion_params(
async def get_chat_completions(
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
) -> dict:
"""Build parameters for Fireworks chat completion request.
):
"""Get chat completions from Fireworks API.
Fireworks doesn't support some OpenAI parameters like seed, max_completion_tokens,
and stream_options.
Removes OpenAI-specific parameters not supported by Fireworks and
configures the request with Fireworks-compatible settings.
Args:
context: The OpenAI LLM context containing tools and settings.
messages: List of chat completion message parameters.
Returns:
Async generator yielding chat completion chunks from Fireworks API.
"""
params = {
"model": self.model_name,
@@ -76,4 +83,6 @@ class FireworksLLMService(OpenAILLMService):
}
params.update(self._settings["extra"])
return params
chunks = await self._client.chat.completions.create(**params)
return chunks

View File

@@ -67,6 +67,7 @@ from pipecat.services.openai.llm import (
OpenAIUserContextAggregator,
)
from pipecat.transcriptions.language import Language
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.string import match_endofsentence
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt
@@ -928,7 +929,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
async def _receive_task_handler(self):
"""Handle incoming messages from the WebSocket connection."""
async for message in self._websocket:
async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
evt = events.parse_server_event(message)
# logger.debug(f"Received event: {message[:500]}")
# logger.debug(f"Received event: {evt}")

View File

@@ -15,6 +15,7 @@ import base64
import json
import warnings
from typing import Any, AsyncGenerator, Dict, Literal, Optional
from urllib.parse import urlencode
import aiohttp
from loguru import logger
@@ -31,6 +32,7 @@ from pipecat.frames.frames import (
from pipecat.services.gladia.config import GladiaInputParams
from pipecat.services.stt_service import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
@@ -202,7 +204,7 @@ class GladiaSTTService(STTService):
self,
*,
api_key: str,
region: Literal["us-west", "eu-west"] | None = None,
region: Optional[Literal["us-west", "eu-west"]] = "eu-west",
url: str = "https://api.gladia.io/v2/live",
confidence: float = 0.5,
sample_rate: Optional[int] = None,
@@ -339,6 +341,13 @@ class GladiaSTTService(STTService):
return settings
def _get_endpoint_url(self) -> str:
query_params = dict()
query_params["region"] = self._region or "eu-west"
query = urlencode(query_params)
return f"{self._url}?{query}"
async def start(self, frame: StartFrame):
"""Start the Gladia STT websocket connection.
@@ -431,7 +440,7 @@ class GladiaSTTService(STTService):
try:
self._websocket = websocket
self._connection_active = True
logger.debug(f"{self} Connected to Gladia WebSocket")
logger.info("Connected to Gladia WebSocket")
# Send buffered audio if any
await self._send_buffered_audio()
@@ -486,16 +495,14 @@ class GladiaSTTService(STTService):
async def _setup_gladia(self, settings: Dict[str, Any]):
async with aiohttp.ClientSession() as session:
params = {}
if self._region:
params["region"] = self._region
async with session.post(
self._url,
headers={"X-Gladia-Key": self._api_key},
self._get_endpoint_url(),
headers={"X-Gladia-Key": self._api_key, "Content-Type": "application/json"},
json=settings,
params=params,
) as response:
if response.ok:
response_text = await response.json()
logger.error(f"Gladia response: {response_text}")
return await response.json()
else:
error_text = await response.text()
@@ -524,7 +531,7 @@ class GladiaSTTService(STTService):
"""Send any buffered audio after reconnection."""
async with self._buffer_lock:
if self._audio_buffer:
logger.debug(f"{self} Sending {len(self._audio_buffer)} bytes of buffered audio")
logger.info(f"Sending {len(self._audio_buffer)} bytes of buffered audio")
await self._send_audio(bytes(self._audio_buffer))
async def _send_stop_recording(self):
@@ -534,8 +541,9 @@ class GladiaSTTService(STTService):
async def _keepalive_task_handler(self):
"""Send periodic empty audio chunks to keep the connection alive."""
try:
KEEPALIVE_SLEEP = 20
KEEPALIVE_SLEEP = 20 if self.task_manager.task_watchdog_enabled else 3
while self._connection_active:
self.reset_watchdog()
# Send keepalive (Gladia times out after 30 seconds)
await asyncio.sleep(KEEPALIVE_SLEEP)
if self._websocket and self._websocket.state is State.OPEN:
@@ -552,7 +560,7 @@ class GladiaSTTService(STTService):
async def _receive_task_handler(self):
try:
async for message in self._websocket:
async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
content = json.loads(message)
# Handle audio chunk acknowledgments
@@ -610,6 +618,8 @@ class GladiaSTTService(STTService):
translation, "", time_now_iso8601(), translated_language
)
)
self.reset_watchdog()
except websockets.exceptions.ConnectionClosed:
# Expected when closing the connection
pass
@@ -626,8 +636,8 @@ class GladiaSTTService(STTService):
self._should_reconnect = False
return False
delay = self._reconnection_delay * (2 ** (self._reconnection_attempts - 1))
logger.debug(
f"{self} Reconnecting in {delay} seconds (attempt {self._reconnection_attempts}/{self._max_reconnection_attempts})"
logger.info(
f"Reconnecting in {delay} seconds (attempt {self._reconnection_attempts}/{self._max_reconnection_attempts})"
)
await asyncio.sleep(delay)
return True

View File

@@ -53,6 +53,7 @@ from pipecat.services.openai.llm import (
OpenAIAssistantContextAggregator,
OpenAIUserContextAggregator,
)
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.tracing.service_decorators import traced_llm
# Suppress gRPC fork warnings
@@ -806,7 +807,7 @@ class GoogleLLMService(LLMService):
)
function_calls = []
async for chunk in response:
async for chunk in WatchdogAsyncIterator(response, manager=self.task_manager):
# Stop TTFB metrics after the first chunk
await self.stop_ttfb_metrics()
if chunk.usage_metadata:

View File

@@ -17,6 +17,7 @@ from openai import AsyncStream
from openai.types.chat import ChatCompletionChunk
from pipecat.services.llm_service import FunctionCallFromLLM
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
# Suppress gRPC fork warnings
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
@@ -76,7 +77,7 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
context
)
async for chunk in chunk_stream:
async for chunk in WatchdogAsyncIterator(chunk_stream, manager=self.task_manager):
if chunk.usage:
tokens = LLMTokenUsage(
prompt_tokens=chunk.usage.prompt_tokens,

View File

@@ -16,6 +16,7 @@ import json
import os
import time
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
from pipecat.utils.tracing.service_decorators import traced_stt
# Suppress gRPC fork warnings
@@ -780,6 +781,7 @@ class GoogleSTTService(STTService):
if self._request_queue.empty():
# wait for 10ms in case we don't have audio
await asyncio.sleep(0.01)
self.reset_watchdog()
continue
# Start bi-directional streaming
@@ -834,7 +836,9 @@ class GoogleSTTService(STTService):
async def _process_responses(self, streaming_recognize):
"""Process streaming recognition responses."""
try:
async for response in streaming_recognize:
async for response in WatchdogAsyncIterator(
streaming_recognize, manager=self.task_manager
):
# Check streaming limit
if (int(time.time() * 1000) - self._stream_start_time) > self.STREAMING_LIMIT:
logger.debug("Stream timeout reached in response processing")

View File

@@ -9,9 +9,6 @@
This module provides integration with Google Cloud Text-to-Speech API,
offering both HTTP-based synthesis with SSML support and streaming synthesis
for real-time applications.
It also includes GeminiTTSService which uses Gemini's TTS-specific models
for natural voice control and multi-speaker conversations.
"""
import json
@@ -22,7 +19,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
# Suppress gRPC fork warnings
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
from typing import AsyncGenerator, List, Literal, Optional
from typing import AsyncGenerator, Literal, Optional
from loguru import logger
from pydantic import BaseModel
@@ -30,7 +27,6 @@ from pydantic import BaseModel
from pipecat.frames.frames import (
ErrorFrame,
Frame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
@@ -51,15 +47,6 @@ except ModuleNotFoundError as e:
)
raise Exception(f"Missing module: {e}")
try:
from google import genai
from google.genai import types
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
raise Exception(f"Missing module: {e}")
def language_to_google_tts_language(language: Language) -> Optional[str]:
"""Convert a Language enum to Google TTS language code.
@@ -655,252 +642,3 @@ class GoogleTTSService(TTSService):
logger.exception(f"{self} error generating TTS: {e}")
error_message = f"TTS generation error: {str(e)}"
yield ErrorFrame(error=error_message)
class GeminiTTSService(TTSService):
    """Gemini Text-to-Speech service using Gemini TTS models.

    Provides text-to-speech synthesis using Gemini's TTS-specific models
    (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
    support for natural voice control, multiple speakers, and voice styles.

    Note:
        Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
        Audio-out is currently a preview feature.

    Example::

        tts = GeminiTTSService(
            api_key="your-google-ai-api-key",
            model="gemini-2.5-flash-preview-tts",
            voice_id="Kore",
            params=GeminiTTSService.InputParams(
                language=Language.EN_US,
            )
        )
    """

    GOOGLE_SAMPLE_RATE = 24000  # Google TTS always outputs at 24kHz

    # List of available Gemini TTS voices
    AVAILABLE_VOICES = [
        "Zephyr",
        "Puck",
        "Charon",
        "Kore",
        "Fenrir",
        "Leda",
        "Orus",
        "Aoede",
        "Callirhoe",
        "Autonoe",
        "Enceladus",
        "Iapetus",
        "Umbriel",
        "Algieba",
        "Despina",
        "Erinome",
        "Algenib",
        "Rasalgethi",
        "Laomedeia",
        "Achernar",
        "Alnilam",
        "Schedar",
        "Gacrux",
        "Pulcherrima",
        "Achird",
        "Zubenelgenubi",
        "Vindemiatrix",
        "Sadachbia",
        "Sadaltager",
        "Sulafar",
    ]

    class InputParams(BaseModel):
        """Input parameters for Gemini TTS configuration.

        Parameters:
            language: Language for synthesis. Defaults to English.
            multi_speaker: Whether to enable multi-speaker support.
            speaker_configs: List of speaker configurations for multi-speaker mode.
                Each entry must provide a "speaker" key and may provide a
                "voice_id" key (the service voice is used when it is absent).
        """

        language: Optional[Language] = Language.EN
        multi_speaker: bool = False
        speaker_configs: Optional[List[dict]] = None

    def __init__(
        self,
        *,
        api_key: str,
        model: str = "gemini-2.5-flash-preview-tts",
        voice_id: str = "Kore",
        sample_rate: Optional[int] = None,
        params: Optional[InputParams] = None,
        **kwargs,
    ):
        """Initializes the Gemini TTS service.

        Args:
            api_key: Google AI API key for authentication.
            model: Gemini TTS model to use. Must be a TTS model like
                "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
            voice_id: Voice name from the available Gemini voices.
            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
            params: TTS configuration parameters.
            **kwargs: Additional arguments passed to parent TTSService.
        """
        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )

        super().__init__(sample_rate=sample_rate, **kwargs)

        params = params or GeminiTTSService.InputParams()

        # set_voice() warns about names missing from AVAILABLE_VOICES and
        # stores the voice id, so the validation lives in one place.
        self.set_voice(voice_id)

        self._api_key = api_key
        self._model = model
        self._settings = {
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en-US",
            "multi_speaker": params.multi_speaker,
            "speaker_configs": params.speaker_configs,
        }

        self._client = genai.Client(api_key=api_key)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Gemini TTS service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Gemini TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Gemini TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    def set_voice(self, voice_id: str):
        """Set the voice for TTS generation.

        Unknown voice names only produce a warning; the value is still used.

        Args:
            voice_id: Name of the voice to use from AVAILABLE_VOICES.
        """
        if voice_id not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
        self._voice_id = voice_id

    async def start(self, frame: StartFrame):
        """Start the Gemini TTS service.

        Args:
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
        if self.sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS requires {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

    def _build_speech_config(self):
        """Build the speech config for the current voice / multi-speaker settings.

        Returns:
            A ``types.SpeechConfig`` using one voice per configured speaker when
            multi-speaker mode is enabled and speaker configs are present,
            otherwise a single-voice config for the current voice.
        """
        speaker_configs = self._settings["speaker_configs"]
        if self._settings["multi_speaker"] and speaker_configs:
            # Multi-speaker mode
            speaker_voice_configs = [
                types.SpeakerVoiceConfig(
                    speaker=speaker_config["speaker"],
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=speaker_config.get("voice_id", self._voice_id)
                        )
                    ),
                )
                for speaker_config in speaker_configs
            ]
            return types.SpeechConfig(
                multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=speaker_voice_configs
                )
            )

        # Single speaker mode
        return types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
            )
        )

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Gemini TTS models.

        Args:
            text: The text to synthesize into speech. Can include natural language
                instructions for style, tone, etc.

        Yields:
            Frame: A TTSStartedFrame, audio frames containing the synthesized
            speech, and a TTSStoppedFrame; an ErrorFrame if generation fails.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()

            # Create the generation config
            generation_config = types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=self._build_speech_config(),
            )

            # Generate the content
            response = await self._client.aio.models.generate_content(
                model=self._model,
                contents=text,
                config=generation_config,
            )

            await self.start_tts_usage_metrics(text)

            yield TTSStartedFrame()

            # Extract audio data from the first candidate of the response
            candidate = response.candidates[0] if response.candidates else None
            content = candidate.content if candidate else None
            parts = (content.parts if content else None) or []

            audio_found = False
            for part in parts:
                inline_data = part.inline_data
                # mime_type may be missing; treat that as "not audio" instead
                # of failing the whole request with an AttributeError.
                if not inline_data or not (inline_data.mime_type or "").startswith("audio/"):
                    continue

                audio_data = inline_data.data
                if not audio_data:
                    continue

                audio_found = True
                await self.stop_ttfb_metrics()

                # Gemini TTS returns PCM audio data, chunk it appropriately
                chunk_size = self.chunk_size
                for offset in range(0, len(audio_data), chunk_size):
                    # NOTE(review): frames are tagged with self.sample_rate even
                    # though the class states output is always 24kHz - confirm
                    # whether resampling is expected upstream.
                    yield TTSAudioRawFrame(
                        audio_data[offset : offset + chunk_size], self.sample_rate, 1
                    )

            if not audio_found:
                logger.warning(f"{self} Gemini TTS response contained no audio data")

            yield TTSStoppedFrame()

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            error_message = f"Gemini TTS generation error: {str(e)}"
            yield ErrorFrame(error=error_message)

Some files were not shown because too many files have changed in this diff Show More