Compare commits
1 Commits
filipi/add
...
hush/aggre
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a163201ea |
14
.github/workflows/build.yaml
vendored
14
.github/workflows/build.yaml
vendored
@@ -21,20 +21,20 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
|
||||
run: uv python install 3.10
|
||||
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
|
||||
|
||||
- name: Build project
|
||||
run: uv build
|
||||
|
||||
|
||||
- name: Install project in editable mode
|
||||
run: uv pip install --editable .
|
||||
run: uv pip install --editable .
|
||||
2
.github/workflows/coverage.yaml
vendored
2
.github/workflows/coverage.yaml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra websocket
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain
|
||||
|
||||
- name: Run tests with coverage
|
||||
run: |
|
||||
|
||||
14
.github/workflows/format.yaml
vendored
14
.github/workflows/format.yaml
vendored
@@ -22,22 +22,22 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
|
||||
run: uv python install 3.10
|
||||
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
|
||||
|
||||
- name: Ruff formatter
|
||||
id: ruff-format
|
||||
run: uv run ruff format --diff
|
||||
|
||||
|
||||
- name: Ruff linter (all rules)
|
||||
id: ruff-check
|
||||
run: uv run ruff check
|
||||
run: uv run ruff check
|
||||
174
.github/workflows/generate-changelog.yml
vendored
174
.github/workflows/generate-changelog.yml
vendored
@@ -1,174 +0,0 @@
|
||||
name: Generate Changelog for Release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: "Release version (e.g., 0.0.97)"
|
||||
required: true
|
||||
type: string
|
||||
date:
|
||||
description: "Release date (YYYY-MM-DD format, defaults to today)"
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
generate-changelog:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v4
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev
|
||||
|
||||
- name: Set release date
|
||||
id: set_date
|
||||
run: |
|
||||
if [ -z "${{ inputs.date }}" ]; then
|
||||
RELEASE_DATE=$(date +%Y-%m-%d)
|
||||
echo "Using today's date: $RELEASE_DATE"
|
||||
else
|
||||
RELEASE_DATE="${{ inputs.date }}"
|
||||
echo "Using provided date: $RELEASE_DATE"
|
||||
fi
|
||||
echo "release_date=$RELEASE_DATE" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Validate inputs
|
||||
run: |
|
||||
# Validate version format (basic check)
|
||||
if ! [[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+.*$ ]]; then
|
||||
echo "Error: Version must be in format X.Y.Z (e.g., 0.0.97)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate date format if provided
|
||||
if [ -n "${{ inputs.date }}" ]; then
|
||||
if ! date -d "${{ inputs.date }}" >/dev/null 2>&1; then
|
||||
# Try macOS date format
|
||||
if ! date -j -f "%Y-%m-%d" "${{ inputs.date }}" >/dev/null 2>&1; then
|
||||
echo "Error: Date must be in YYYY-MM-DD format (e.g., 2025-12-04)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Check for changelog fragments
|
||||
id: check_fragments
|
||||
run: |
|
||||
FRAGMENT_COUNT=$(find changelog -name "*.md" ! -name "_template.md.j2" | wc -l | tr -d ' ')
|
||||
echo "fragment_count=$FRAGMENT_COUNT" >> $GITHUB_OUTPUT
|
||||
|
||||
if [ "$FRAGMENT_COUNT" -eq "0" ]; then
|
||||
echo "❌ Error: No changelog fragments found in changelog/"
|
||||
echo ""
|
||||
echo "Cannot create a release without changelog entries."
|
||||
echo "Add changelog fragments to the changelog/ directory (e.g., 1234.added.md) and try again."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate fragment types
|
||||
VALID_TYPES="added changed deprecated removed fixed security other"
|
||||
INVALID_FRAGMENTS=""
|
||||
|
||||
for file in changelog/*.md; do
|
||||
# Skip template
|
||||
if [[ "$file" == "changelog/_template.md.j2" ]]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# Extract type from filename (e.g., 1234.added.md -> added)
|
||||
filename=$(basename "$file")
|
||||
# Handle both 1234.added.md and 1234.added.2.md patterns
|
||||
type=$(echo "$filename" | sed -E 's/^[0-9]+\.([a-z]+)(\.[0-9]+)?\.md$/\1/')
|
||||
|
||||
# Check if type is valid
|
||||
if ! echo "$VALID_TYPES" | grep -wq "$type"; then
|
||||
INVALID_FRAGMENTS="$INVALID_FRAGMENTS\n - $filename (type: '$type')"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$INVALID_FRAGMENTS" ]; then
|
||||
echo "❌ Error: Invalid changelog fragment types found:"
|
||||
echo -e "$INVALID_FRAGMENTS"
|
||||
echo ""
|
||||
echo "Valid types are: $VALID_TYPES"
|
||||
echo "Example: 1234.added.md, 5678.fixed.md"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Found $FRAGMENT_COUNT changelog fragment(s)"
|
||||
echo "has_fragments=true" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Preview changelog
|
||||
run: |
|
||||
echo "## Preview of changelog for version ${{ inputs.version }}"
|
||||
echo ""
|
||||
uv run towncrier build --draft --version "${{ inputs.version }}" --date "${{ steps.set_date.outputs.release_date }}"
|
||||
|
||||
- name: Build changelog
|
||||
run: |
|
||||
uv run towncrier build --version "${{ inputs.version }}" --date "${{ steps.set_date.outputs.release_date }}" --yes
|
||||
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
commit-message: "Update changelog for version ${{ inputs.version }}"
|
||||
title: "Release ${{ inputs.version }} - Changelog Update"
|
||||
body: |
|
||||
## Changelog Update for Release ${{ inputs.version }}
|
||||
|
||||
This PR updates the CHANGELOG.md with all changes for version **${{ inputs.version }}**.
|
||||
|
||||
### Summary
|
||||
- **Version:** ${{ inputs.version }}
|
||||
- **Date:** ${{ steps.set_date.outputs.release_date }}
|
||||
- **Fragments processed:** ${{ steps.check_fragments.outputs.fragment_count }}
|
||||
|
||||
### What this PR does
|
||||
- ✅ Adds new release section to CHANGELOG.md
|
||||
- ✅ Removes processed changelog fragments
|
||||
- ✅ Ready to merge for release
|
||||
|
||||
### Next Steps
|
||||
1. Review the changelog entries below
|
||||
2. Make any necessary edits to CHANGELOG.md if needed
|
||||
3. Merge this PR
|
||||
4. Continue with your release process
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<summary>📋 Preview of changes</summary>
|
||||
|
||||
The changelog has been updated with entries from the following fragments:
|
||||
|
||||
```bash
|
||||
${{ steps.check_fragments.outputs.fragment_count }} fragments processed
|
||||
```
|
||||
|
||||
</details>
|
||||
branch: changelog-${{ inputs.version }}
|
||||
delete-branch: true
|
||||
labels: |
|
||||
changelog
|
||||
release
|
||||
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -5,25 +5,25 @@ on:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: 'what git tag to build (e.g. v0.0.74)'
|
||||
description: "what git tag to build (e.g. v0.0.74)"
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: 'Build and upload wheels'
|
||||
name: "Build and upload wheels"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: 'latest'
|
||||
version: "latest"
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
run: uv python install 3.10
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -35,9 +35,9 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: 'Publish to PyPI'
|
||||
name: "Publish to PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
@@ -56,12 +56,12 @@ jobs:
|
||||
print-hash: true
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: 'Publish to Test PyPI'
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
needs: [ build ]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to Test PyPI
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
12
.github/workflows/publish_test.yaml
vendored
12
.github/workflows/publish_test.yaml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: 'Build and upload wheels'
|
||||
name: "Build and upload wheels"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
@@ -15,9 +15,9 @@ jobs:
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: 'latest'
|
||||
version: "latest"
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
run: uv python install 3.10
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -29,12 +29,12 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: 'Publish to Test PyPI'
|
||||
name: "Publish to Test PyPI"
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to Test PyPI
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
1
.github/workflows/python-compatibility.yaml
vendored
1
.github/workflows/python-compatibility.yaml
vendored
@@ -50,6 +50,7 @@ jobs:
|
||||
run: |
|
||||
uv sync --group dev --all-extras \
|
||||
--no-extra krisp \
|
||||
--no-extra ultravox \
|
||||
--no-extra local-smart-turn \
|
||||
--no-extra moondream \
|
||||
--no-extra mlx-whisper
|
||||
|
||||
2
.github/workflows/tests.yaml
vendored
2
.github/workflows/tests.yaml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra websocket
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain
|
||||
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
||||
@@ -11,7 +11,7 @@ build:
|
||||
jobs:
|
||||
post_install:
|
||||
- pip install uv
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
- UV_PROJECT_ENVIRONMENT=$READTHEDOCS_VIRTUALENV_PATH uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra ultravox --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
sphinx:
|
||||
configuration: docs/api/conf.py
|
||||
|
||||
1469
CHANGELOG.md
1469
CHANGELOG.md
File diff suppressed because it is too large
Load Diff
@@ -1,336 +0,0 @@
|
||||
# Community Integrations Guide
|
||||
|
||||
Pipecat welcomes community-maintained integrations! As our ecosystem grows, we've established a process for any developer to create and maintain their own service integrations while ensuring discoverability for the Pipecat community.
|
||||
|
||||
## Overview
|
||||
|
||||
**What we support:** Community-maintained integrations that live in separate repositories and are maintained by their authors.
|
||||
|
||||
**What we don't do:** The Pipecat team does not code review, test, or maintain community integrations. We provide guidance and list approved integrations for discoverability.
|
||||
|
||||
**Why this approach:** This allows the community to move quickly while keeping the Pipecat core team focused on maintaining the framework itself.
|
||||
|
||||
## Submitting your Integration
|
||||
|
||||
To be listed as an official community integration, follow these steps:
|
||||
|
||||
### Step 1: Build Your Integration
|
||||
|
||||
Create your integration following the patterns and examples shown in the "Integration Patterns and Examples" section below.
|
||||
|
||||
### Step 2: Set Up Your Repository
|
||||
|
||||
Your repository must contain these components:
|
||||
|
||||
- **Source code** - Complete implementation following Pipecat patterns
|
||||
- **Foundational example** - Single file example showing basic usage (see [Pipecat examples](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational))
|
||||
- **README.md** - Must include:
|
||||
|
||||
- Introduction and explanation of your integration
|
||||
- Installation instructions
|
||||
- Usage instructions with Pipecat Pipeline
|
||||
- How to run your example
|
||||
- Pipecat version compatibility (e.g., "Tested with Pipecat v0.0.86")
|
||||
- Company attribution: If you work for the company providing the service, please mention this in your README. This helps build confidence that the integration will be actively maintained.
|
||||
|
||||
- **LICENSE** - Permissive license (BSD-2 like Pipecat, or equivalent open source terms)
|
||||
- **Code documentation** - Source code with docstrings (we recommend following [Pipecat's docstring conventions](https://github.com/pipecat-ai/pipecat/blob/main/CONTRIBUTING.md#docstring-conventions))
|
||||
- **Changelog** - Maintain a changelog for version updates
|
||||
|
||||
### Step 3: Join Discord
|
||||
|
||||
Join our Discord: https://discord.gg/pipecat
|
||||
|
||||
### Step 4: Submit for Listing
|
||||
|
||||
Submit a pull request to add your integration to our [Community Integrations documentation page](https://docs.pipecat.ai/server/services/community-integrations).
|
||||
|
||||
**To submit:**
|
||||
|
||||
1. Fork the [Pipecat docs repository](https://github.com/pipecat-ai/docs)
|
||||
2. Edit the file `server/services/community-integrations.mdx`
|
||||
3. Add your integration to the appropriate service category table with:
|
||||
- Service name
|
||||
- Link to your repository
|
||||
- Maintainer GitHub username(s)
|
||||
4. Include a link to your demo video (approx 30-60 seconds) in your PR description showing:
|
||||
- Core functionality of your integration
|
||||
- Handling of an interruption (if applicable to service type)
|
||||
5. Submit your pull request
|
||||
|
||||
Once your PR is submitted, post in the `#community-integrations` Discord channel to let us know.
|
||||
|
||||
## Integration Patterns and Examples
|
||||
|
||||
### STT (Speech-to-Text) Services
|
||||
|
||||
#### Websocket-based Services
|
||||
|
||||
**Base class:** `STTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [DeepgramSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/deepgram/stt.py)
|
||||
- [SpeechmaticsSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/speechmatics/stt.py)
|
||||
|
||||
#### File-based Services
|
||||
|
||||
**Base class:** `SegmentedSTTService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [NvidiaSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/nvidia/stt.py)
|
||||
- [FalSTTService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/stt.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- STT services should push `InterimTranscriptionFrames` and `TranscriptionFrames`
|
||||
- If confidence values are available, filter for values >50% confidence
|
||||
|
||||
### LLM (Large Language Model) Services
|
||||
|
||||
#### OpenAI-Compatible Services
|
||||
|
||||
**Base class:** `OpenAILLMService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AzureLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/azure/llm.py)
|
||||
- [GrokLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/grok/llm.py) - Shows overriding the base class where needed
|
||||
|
||||
#### Non-OpenAI Compatible Services
|
||||
|
||||
**Requires:** Full implementation
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [AnthropicLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/anthropic/llm.py)
|
||||
- [GoogleLLMService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/llm.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- **Frame sequence:** Output must follow this frame sequence pattern:
|
||||
|
||||
- `LLMFullResponseStartFrame` - Signals the start of an LLM response
|
||||
- `LLMTextFrame` - Contains LLM content, typically streamed as tokens
|
||||
- `LLMFullResponseEndFrame` - Signals the end of an LLM response
|
||||
|
||||
- **Context aggregation:** Implement context aggregation to collect user and assistant content:
|
||||
- Aggregators come in pairs with a `user()` instance and `assistant()` instance
|
||||
- Context must adhere to the `LLMContext` universal format
|
||||
- Aggregators should handle adding messages, function calls, and images to the context
|
||||
|
||||
### TTS (Text-to-Speech) Services
|
||||
|
||||
#### AudioContextWordTTSService
|
||||
|
||||
**Use for:** Websocket-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [CartesiaTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/cartesia/tts.py)
|
||||
|
||||
#### InterruptibleTTSService
|
||||
|
||||
**Use for:** Websocket-based services without word/timestamp alignment, requiring disconnection on interruption
|
||||
|
||||
**Example:**
|
||||
|
||||
- [SarvamTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/sarvam/tts.py)
|
||||
|
||||
#### WordTTSService
|
||||
|
||||
**Use for:** HTTP-based services supporting word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [ElevenLabsHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/elevenlabs/tts.py)
|
||||
|
||||
#### TTSService
|
||||
|
||||
**Use for:** HTTP-based services without word/timestamp alignment
|
||||
|
||||
**Example:**
|
||||
|
||||
- [GoogleHttpTTSService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/tts.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- For websocket services, use asyncio WebSocket implementation (required for v13+ support)
|
||||
- Handle idle service timeouts with keepalives
|
||||
- TTSServices push both audio (`TTSRawAudioFrame`) and text (`TTSTextFrame`) frames
|
||||
|
||||
### Telephony Serializers
|
||||
|
||||
Pipecat supports telephony provider integration using websocket connections to exchange MediaStreams. These services use a FrameSerializer to serialize and deserialize inputs from the FastAPIWebsocketTransport.
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [Twilio](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/twilio.py)
|
||||
- [Telnyx](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/serializers/telnyx.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Include hang-up functionality using the provider's native API, ideally using `aiohttp`
|
||||
- Support DTMF (dual-tone multi-frequency) events if the provider supports them:
|
||||
- Deserialize DTMF events from the provider's protocol to `InputDTMFFrame`
|
||||
- Use `KeypadEntry` enum for valid keypad entries (0-9, \*, #, A-D)
|
||||
- Handle invalid DTMF digits gracefully by returning `None`
|
||||
|
||||
### Image Generation Services
|
||||
|
||||
**Base class:** `ImageGenService`
|
||||
|
||||
**Examples:**
|
||||
|
||||
- [FalImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/fal/image.py)
|
||||
- [GoogleImageGenService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/google/image.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_image_gen` method returning an `AsyncGenerator`
|
||||
|
||||
### Vision Services
|
||||
|
||||
Vision services process images and provide analysis such as descriptions, object detection, or visual question answering.
|
||||
|
||||
**Base class:** `VisionService`
|
||||
|
||||
**Example:**
|
||||
|
||||
- [MoondreamVisionService](https://github.com/pipecat-ai/pipecat/blob/main/src/pipecat/services/moondream/vision.py)
|
||||
|
||||
#### Key requirements:
|
||||
|
||||
- Must implement `run_vision` method that takes an `LLMContext` and returns an `AsyncGenerator[Frame, None]`
|
||||
- The method processes the latest image in the context and yields frames with analysis results
|
||||
- Typically yields `TextFrame` objects containing descriptions or answers
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Naming Conventions
|
||||
|
||||
- **STT:** `VendorSTTService`
|
||||
- **LLM:** `VendorLLMService`
|
||||
- **TTS:**
|
||||
- Websocket: `VendorTTSService`
|
||||
- HTTP: `VendorHttpTTSService`
|
||||
- **Image:** `VendorImageGenService`
|
||||
- **Vision:** `VendorVisionService`
|
||||
- **Telephony:** `VendorFrameSerializer`
|
||||
|
||||
### Metrics Support
|
||||
|
||||
Enable metrics in your service:
|
||||
|
||||
```python
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as this service supports metrics.
|
||||
"""
|
||||
return True
|
||||
```
|
||||
|
||||
### Dynamic Settings Updates
|
||||
|
||||
STT, LLM, and TTS services support `ServiceUpdateSettingsFrame` for dynamic configuration changes. The base STTService has an `_update_settings()` method that handles settings, and the private `_settings` `Dict` is used to store settings and provide access to the subclass.
|
||||
|
||||
```python
|
||||
async def set_language(self, language: Language):
|
||||
"""Set the recognition language and reconnect.
|
||||
|
||||
Args:
|
||||
language: The language to use for speech recognition.
|
||||
"""
|
||||
logger.info(f"Switching STT language to: [{language}]")
|
||||
self._settings["language"] = language
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that, in this example, Deepgram requires the websocket connection be disconnected and reconnected to reinitialize the service with the new value. Consider if your service requires reconnection.
|
||||
|
||||
### Sample Rate Handling
|
||||
|
||||
Sample rates are set via PipelineParams and passed to each frame processor at initialization. The pattern is to _not_ set the sample rate value in the constructor of a given service. Instead, use the `start()` method to initialize sample rates from the frame:
|
||||
|
||||
```python
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the service."""
|
||||
await super().start(frame)
|
||||
self._settings["output_format"]["sample_rate"] = self.sample_rate
|
||||
await self._connect()
|
||||
```
|
||||
|
||||
Note that `self.sample_rate` is a `@property` set in the TTSService base class, which provides access to the private sample rate value obtained from the StartFrame.
|
||||
|
||||
### Tracing Decorators
|
||||
|
||||
Use Pipecat's tracing decorators:
|
||||
|
||||
- **STT:** `@traced_stt` - decorate a function that handles `transcript`, `is_final`, `language` as args
|
||||
- **LLM:** `@traced_llm` - decorate the `_process_context()` method
|
||||
- **TTS:** `@traced_tts` - decorate the `run_tts()` method
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Packaging and Distribution
|
||||
|
||||
- Use [uv](https://docs.astral.sh/uv/) for packaging (encouraged)
|
||||
- Consider releasing to PyPI for easier installation
|
||||
- Follow semantic versioning principles
|
||||
- Maintain a changelog
|
||||
|
||||
### HTTP Communication
|
||||
|
||||
For REST-based communication, use aiohttp. Pipecat includes this as a required dependency, so using it prevents adding an additional dependency to your integration.
|
||||
|
||||
### Error Handling
|
||||
|
||||
- Wrap API calls in appropriate try/catch blocks
|
||||
- Handle rate limits and network failures gracefully
|
||||
- Provide meaningful error messages
|
||||
- When errors occur, raise exceptions AND push `ErrorFrame`s to notify the pipeline:
|
||||
|
||||
```python
|
||||
from pipecat.frames.frames import ErrorFrame
|
||||
|
||||
try:
|
||||
# Your API call
|
||||
result = await self._make_api_call()
|
||||
except Exception as e:
|
||||
# Push error frame to pipeline
|
||||
await self.push_error(ErrorFrame(error=f"{self} error: {e}"))
|
||||
# Raise or handle as appropriate
|
||||
raise
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
- Your foundational example serves as a valuable integration-level test
|
||||
- Unit tests are nice to have. As the Pipecat teams provides better guidance, we will encourage unit testing more
|
||||
|
||||
## Disclaimer
|
||||
|
||||
Community integrations are community-maintained and not officially supported by the Pipecat team. Users should evaluate these integrations independently. The Pipecat team reserves the right to remove listings that become unmaintained or problematic.
|
||||
|
||||
## Staying Up to Date
|
||||
|
||||
Pipecat evolves rapidly to support the latest AI technologies and patterns. While we strive to minimize breaking changes, they do occur as the framework matures.
|
||||
|
||||
**We strongly recommend:**
|
||||
|
||||
- Join our Discord at https://discord.gg/pipecat and monitor the `#announcements` channel for release notifications
|
||||
- Follow our changelog: https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md
|
||||
- Test your integration against new Pipecat releases promptly
|
||||
- Update your README with the last tested Pipecat version
|
||||
|
||||
This helps ensure your integration remains compatible and your users have clear expectations about version support.
|
||||
|
||||
## Questions?
|
||||
|
||||
Join our Discord community at https://discord.gg/pipecat and post in the `#community-integrations` channel for guidance and support.
|
||||
|
||||
For additional questions, you can also reach out to us at pipecat-ai@daily.co.
|
||||
110
CONTRIBUTING.md
110
CONTRIBUTING.md
@@ -1,9 +1,5 @@
|
||||
## Contributing to Pipecat
|
||||
|
||||
**Want to add a new service integration?**
|
||||
We encourage community-maintained integrations! Please see our [Community Integration Guide](COMMUNITY_INTEGRATIONS.md) for the process and requirements.
|
||||
|
||||
**Want to contribute to Pipecat core?**
|
||||
We welcome contributions of all kinds! Your help is appreciated. Follow these steps to get involved:
|
||||
|
||||
1. **Fork this repository**: Start by forking the Pipecat Documentation repository to your GitHub account.
|
||||
@@ -17,122 +13,24 @@ We welcome contributions of all kinds! Your help is appreciated. Follow these st
|
||||
git checkout -b your-branch-name
|
||||
```
|
||||
4. **Make your changes**: Edit or add files as necessary.
|
||||
5. **Add a changelog entry**: Create a changelog fragment file (see [Changelog Entries](#changelog-entries) below).
|
||||
6. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
|
||||
7. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
|
||||
5. **Test your changes**: Ensure that your changes look correct and follow the style set in the codebase.
|
||||
6. **Commit your changes**: Once you're satisfied with your changes, commit them with a meaningful message.
|
||||
|
||||
```bash
|
||||
git commit -m "Description of your changes"
|
||||
```
|
||||
|
||||
8. **Push your changes**: Push your branch to your forked repository.
|
||||
7. **Push your changes**: Push your branch to your forked repository.
|
||||
|
||||
```bash
|
||||
git push origin your-branch-name
|
||||
```
|
||||
|
||||
9. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
|
||||
8. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
|
||||
> Important: Describe the changes you've made clearly!
|
||||
|
||||
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
|
||||
|
||||
## Changelog Entries
|
||||
|
||||
Every pull request that makes a user-facing change should include a changelog entry. We use a changelog fragment system to avoid merge conflicts.
|
||||
|
||||
### Creating a Changelog Fragment
|
||||
|
||||
1. Create a new file in the `changelog/` directory with this naming pattern:
|
||||
|
||||
```
|
||||
<PR_number>.<type>.md
|
||||
```
|
||||
|
||||
2. Choose the appropriate type:
|
||||
|
||||
- `added.md` - New features
|
||||
- `changed.md` - Changes in existing functionality
|
||||
- `deprecated.md` - Soon-to-be removed features
|
||||
- `removed.md` - Removed features
|
||||
- `fixed.md` - Bug fixes
|
||||
- `security.md` - Security fixes
|
||||
- `other.md` - Other changes (documentation, dependencies, etc.)
|
||||
|
||||
3. Write your changelog entry as a Markdown bullet point. Include the `-` at the start:
|
||||
|
||||
**Example files:**
|
||||
|
||||
`changelog/1234.added.md`:
|
||||
|
||||
```markdown
|
||||
- Added support for Anthropic Claude 3.5 Sonnet with improved streaming performance.
|
||||
```
|
||||
|
||||
`changelog/5678.fixed.md`:
|
||||
|
||||
```markdown
|
||||
- Fixed an issue where audio frames were dropped during high-load scenarios.
|
||||
```
|
||||
|
||||
**For entries with nested bullets:**
|
||||
|
||||
`changelog/1234.changed.md`:
|
||||
|
||||
```markdown
|
||||
- Updated service configuration:
|
||||
|
||||
- Changed default timeout to 30 seconds
|
||||
- Added retry logic for failed connections
|
||||
```
|
||||
|
||||
### Multiple Changes in One PR
|
||||
|
||||
**Different types of changes:** Create separate fragment files for each type:
|
||||
|
||||
```
|
||||
changelog/1234.added.md
|
||||
changelog/1234.fixed.md
|
||||
```
|
||||
|
||||
**Multiple changes of the same type:** Create numbered fragment files:
|
||||
|
||||
```
|
||||
changelog/1234.changed.md
|
||||
changelog/1234.changed.2.md
|
||||
```
|
||||
|
||||
**Related changes:** Use nested bullets in a single fragment:
|
||||
|
||||
```markdown
|
||||
- Updated service configuration:
|
||||
|
||||
- Changed default timeout to 30 seconds
|
||||
- Added retry logic for failed connections
|
||||
```
|
||||
|
||||
**Rule of thumb:** One logical change per fragment file. If changes are unrelated, use separate files.
|
||||
|
||||
### Preview Your Changes
|
||||
|
||||
To see what your changelog entry will look like:
|
||||
|
||||
```bash
|
||||
towncrier build --draft --version Unreleased
|
||||
```
|
||||
|
||||
This won't modify any files, just show you a preview.
|
||||
|
||||
### When to Skip Changelog Entries
|
||||
|
||||
You can skip adding a changelog entry for:
|
||||
|
||||
- Documentation-only changes
|
||||
- Internal refactoring with no user-facing impact
|
||||
- Test-only changes
|
||||
- CI/build configuration changes
|
||||
|
||||
If you're unsure whether your change needs a changelog entry, ask in your PR!
|
||||
|
||||
## Dependency Management
|
||||
|
||||
This project uses [uv](https://docs.astral.sh/uv/) for dependency management. The `uv.lock` file is committed to ensure reproducible builds.
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 2-Clause License
|
||||
|
||||
Copyright (c) 2024–2026, Daily
|
||||
Copyright (c) 2024–2025, Daily
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
143
README.md
143
README.md
@@ -19,6 +19,10 @@
|
||||
- **Business Agents** – customer intake, support bots, guided flows
|
||||
- **Complex Dialog Systems** – design logic with structured conversations
|
||||
|
||||
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
🔍 Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
## 🧠 Why Pipecat?
|
||||
|
||||
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
||||
@@ -26,38 +30,40 @@
|
||||
- **Composable Pipelines**: Build complex behavior from modular components
|
||||
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
||||
|
||||
## 🌐 Pipecat Ecosystem
|
||||
## 📱 Client SDKs
|
||||
|
||||
### 📱 Client SDKs
|
||||
You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
Building client applications? You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a> | <a href="https://docs.pipecat.ai/client/react/introduction">React</a> | <a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a> |
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a> | <a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a> | <a href="https://docs.pipecat.ai/client/c++/introduction">C++</a> | <a href="https://github.com/pipecat-ai/pipecat-esp32">ESP32</a>
|
||||
|
||||
### 🧭 Structured conversations
|
||||
|
||||
Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
### 🪄 Beautiful UIs
|
||||
|
||||
Want to build beautiful and engaging experiences? Checkout the [Voice UI Kit](https://github.com/pipecat-ai/voice-ui-kit), a collection of components, hooks and templates for building voice AI applications quickly.
|
||||
|
||||
### 🛠️ Create and deploy projects
|
||||
|
||||
Create a new project in under a minute with the [Pipecat CLI](https://github.com/pipecat-ai/pipecat-cli). Then use the CLI to monitor and deploy your agent to production.
|
||||
|
||||
### 🔍 Debugging
|
||||
|
||||
Looking for help debugging your pipeline and processors? Check out [Whisker](https://github.com/pipecat-ai/whisker), a real-time Pipecat debugger.
|
||||
|
||||
### 🖥️ Terminal
|
||||
|
||||
Love terminal applications? Check out [Tail](https://github.com/pipecat-ai/tail), a terminal dashboard for Pipecat.
|
||||
|
||||
### 📺️ Pipecat TV Channel
|
||||
|
||||
Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.youtube.com/playlist?list=PLzU2zoMTQIHjqC3v4q2XVSR3hGSzwKFwH) channel.
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/javascript/javascript-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/js/introduction">JavaScript</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React"/>
|
||||
<a href="https://docs.pipecat.ai/client/react/introduction">React</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/react/react-original.svg" width="40" height="40" alt="React Native"/>
|
||||
<a href="https://docs.pipecat.ai/client/react-native/introduction">React Native</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/swift/swift-original.svg" width="40" height="40" alt="Swift"/>
|
||||
<a href="https://docs.pipecat.ai/client/ios/introduction">Swift</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/kotlin/kotlin-original.svg" width="40" height="40" alt="Kotlin"/>
|
||||
<a href="https://docs.pipecat.ai/client/android/introduction">Kotlin</a>
|
||||
</td>
|
||||
<td>
|
||||
<img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/cplusplus/cplusplus-original.svg" width="40" height="40" alt="JavaScript"/>
|
||||
<a href="https://docs.pipecat.ai/client/c++/introduction">C++</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## 🎬 See it in action
|
||||
|
||||
@@ -66,24 +72,24 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/storytelling-chatbot/image.png" width="400" /></a>
|
||||
<br/>
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/translation-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/12-describe-video.py"><img src="https://github.com/pipecat-ai/pipecat/blob/main/examples/foundational/assets/moondream.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat-examples/tree/main/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat-examples/main/moondream-chatbot/image.png" width="400" /></a>
|
||||
</p>
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
@@ -153,6 +159,7 @@ You can get started with Pipecat running on your local machine, then move your a
|
||||
--no-extra gstreamer \
|
||||
--no-extra krisp \
|
||||
--no-extra local \
|
||||
--no-extra ultravox # (ultravox not fully supported on macOS)
|
||||
```
|
||||
|
||||
3. Install the git pre-commit hooks:
|
||||
@@ -177,6 +184,54 @@ Run a specific test suite:
|
||||
uv run pytest tests/test_name.py
|
||||
```
|
||||
|
||||
### Setting up your editor
|
||||
|
||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
|
||||
|
||||
#### Emacs
|
||||
|
||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
|
||||
|
||||
```elisp
|
||||
(use-package lazy-ruff
|
||||
:ensure t
|
||||
:hook ((python-mode . lazy-ruff-mode))
|
||||
:config
|
||||
(setq lazy-ruff-format-command "ruff format")
|
||||
(setq lazy-ruff-check-command "ruff check --select I"))
|
||||
```
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, so you should be able to use [pyvenv-auto](https://github.com/ryotaro612/pyvenv-auto) to automatically load that environment inside Emacs.
|
||||
|
||||
```elisp
|
||||
(use-package pyvenv-auto
|
||||
:ensure t
|
||||
:defer t
|
||||
:hook ((python-mode . pyvenv-auto-run)))
|
||||
```
|
||||
|
||||
#### Visual Studio Code
|
||||
|
||||
Install the
|
||||
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
|
||||
|
||||
```json
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "charliermarsh.ruff",
|
||||
"editor.formatOnSave": true
|
||||
}
|
||||
```
|
||||
|
||||
#### PyCharm
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
|
||||
|
||||
1. **Name**: `Ruff formatter`
|
||||
2. **File type**: `Python`
|
||||
3. **Working directory**: `$ContentRoot$`
|
||||
4. **Arguments**: `format $FilePath$`
|
||||
5. **Program**: `$PyInterpreterDirectory$/ruff`
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please email `disclosures@daily.co`.
|
||||
@@ -1,42 +0,0 @@
|
||||
- Introducing user turn strategies. User turn strategies indicate when the user turn starts or stops. In conversational agents, these are often referred to as start/stop speaking or turn-taking plans or policies.
|
||||
|
||||
User turn start strategies indicate when the user starts speaking (e.g. using VAD events or when a user says one or more words).
|
||||
|
||||
User turn stop strategies indicate when the user stops speaking (e.g. using an end-of-turn detection model or by observing incoming transcriptions).
|
||||
|
||||
A list of strategies can be specified for both strategies; strategies are evaluated in order until one evaluates to true.
|
||||
|
||||
Available user turn start strategies:
|
||||
- VADUserTurnStartStrategy
|
||||
- TranscriptionUserTurnStartStrategy
|
||||
- MinWordsUserTurnStartStrategy
|
||||
- ExternalUserTurnStartStrategy
|
||||
|
||||
Available user turn stop strategies:
|
||||
- TranscriptionUserTurnStopStrategy
|
||||
- TurnAnalyzerUserTurnStopStrategy
|
||||
- ExternalUserTurnStopStrategy
|
||||
|
||||
The default strategies are:
|
||||
|
||||
- start: [VADUserTurnStartStrategy, TranscriptionUserTurnStartStrategy]
|
||||
- stop: [TranscriptionUserTurnStopStrategy]
|
||||
|
||||
Turn strategies are configured when setting up `LLMContextAggregatorPair`. For example:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
)
|
||||
],
|
||||
)
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
In order to use the user turn strategies you must update to the new universal `LLMContext` and `LLMContextAggregatorPair`.
|
||||
@@ -1 +0,0 @@
|
||||
- ⚠️ `TransportParams.turn_analyzer` is deprecated and might result in unexpected behavior, use `LLMUserAggregator`'s new `user_turn_strategies` parameter instead.
|
||||
@@ -1 +0,0 @@
|
||||
- `FrameProcessor.interruption_strategies` is deprecated, use `LLMUserAggregator`'s new `user_turn_strategies` parameter instead.
|
||||
@@ -1 +0,0 @@
|
||||
- `EmulateUserStartedSpeakingFrame` and `EmulateUserStoppedSpeakingFrame` frames are deprecated.
|
||||
@@ -1 +0,0 @@
|
||||
- Deprecated the `emulated` field in the `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` frames.
|
||||
@@ -1 +0,0 @@
|
||||
- The `LLMUserAggregatorParams` and `LLMAssistantAggregatorParams` classes in `pipecat.processors.aggregators.llm_response` are now deprecated. Use the new universal `LLMContext` and `LLMContextAggregatorPair` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- `pipecat.audio.interruptions.MinWordsInterruptionStrategy` is deprecated. Use `pipecat.turns.user_start.MinWordsUserTurnStartStrategy` with `LLMUserAggregator`'s new `user_turn_strategies` parameter instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `RNNoiseFilter` for real-time noise suppression using RNNoise neural network via pyrnnoise library.
|
||||
@@ -1,7 +0,0 @@
|
||||
- Updated `ElevenLabsRealtimeSTTService` to accept the `include_language_detection` parameter to detect language.
|
||||
```python
|
||||
stt = ElevenLabsRealtimeSTTService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
include_language_detection=True
|
||||
)
|
||||
```
|
||||
@@ -1,15 +0,0 @@
|
||||
- Updated `SpeechmaticsSTTService` to use new Python Voice SDK with improved VAD,
|
||||
Smart Turn capabilities, and brings dramatic improvements to latency without
|
||||
any impact on accuracy. Use the `turn_detection_mode` parameter to control the
|
||||
endpointing of speech, with `TurnDetectionMode.EXTERNAL` (default),
|
||||
`TurnDetectionMode.ADAPTIVE`, or `TurnDetectionMode.SMART_TURN`.
|
||||
```python
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
)
|
||||
```
|
||||
@@ -1,4 +0,0 @@
|
||||
- For `SpeechmaticsSTTService`, the `end_of_utterance_mode` parameter is deprecated.
|
||||
Use the new `turn_detection_mode` parameter instead, with `TurnDetectionMode.EXTERNAL`,
|
||||
`TurnDetectionMode.ADAPTIVE`, or `TurnDetectionMode.SMART_TURN`. The `enable_vad`
|
||||
parameter is also deprecated and is inferred from the `turn_detection_mode`.
|
||||
@@ -1,2 +0,0 @@
|
||||
- Improved error handling in `ElevenLabsRealtimeSTTService`
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` causing an infinite loop that blocks the process if the websocket disconnects due to an error
|
||||
@@ -1 +0,0 @@
|
||||
- `TranscriptionFrame` and `InterimTranscriptionFrame` produced by `DailyTransport` now include the transport source (i.e., the originating audio track).
|
||||
@@ -1 +0,0 @@
|
||||
- `daily-python` updated to 0.23.0.
|
||||
@@ -1,15 +0,0 @@
|
||||
- `OpenAILLMContext` and its associated things (context aggregators, etc.) are now deprecated in favor of the universal `LLMContext` and its associated things.
|
||||
|
||||
From the developer's point of view, switching to using `LLMContext` machinery will usually be a matter of going from this:
|
||||
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
|
||||
To this:
|
||||
|
||||
```
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
@@ -1,8 +0,0 @@
|
||||
- Added `GrokRealtimeLLMService` for xAI's Grok Voice Agent API with real-time voice conversations:
|
||||
|
||||
- Support for real-time audio streaming with WebSocket connection
|
||||
- Built-in server-side VAD (Voice Activity Detection)
|
||||
- Multiple voice options: Ara, Rex, Sal, Eve, Leo
|
||||
- Built-in tools support: web_search, x_search, file_search
|
||||
- Custom function calling with standard Pipecat tools schema
|
||||
- Configurable audio formats (PCM at 8kHz-48kHz)
|
||||
@@ -1 +0,0 @@
|
||||
- Added an approximation of TTFB for Ultravox.
|
||||
@@ -1,5 +0,0 @@
|
||||
- Updates to Inworld TTS services:
|
||||
|
||||
- Improved `InworldTTSService`'s websocket implementation to better flush and
|
||||
close context to better handle long inputs.
|
||||
- Improved docstrings for `InworldTTSService` and `InworldHttpTTSService`.
|
||||
@@ -1 +0,0 @@
|
||||
- Added a new `AudioContextTTSService` to the TTS service base classes. The `AudioContextWordTTSService` now inherits from `AudioContextTTSService` and `WebsocketWordTTSService`.
|
||||
@@ -1,4 +0,0 @@
|
||||
- `LLMUserAggregator` now exposes the following events:
|
||||
- `on_user_turn_started`: triggered when a user turn starts
|
||||
- `on_user_turn_stopped`: triggered when a user turn ends
|
||||
- `on_user_turn_stop_timeout`: triggered when a user turn does not stop and times out
|
||||
@@ -1,29 +0,0 @@
|
||||
- Introducing user mute strategies. User mute strategies indicate when user input should be muted based on the current system state.
|
||||
|
||||
In conversational agents, user mute strategies are used to prevent user input from interrupting bot speech, tool execution, or other critical system operations.
|
||||
|
||||
A list of strategies can be specified; all strategies are evaluated for every frame so that each strategy can maintain its internal state. A user frame is muted if any of the configured strategies indicates it should be muted.
|
||||
|
||||
Available user mute strategies:
|
||||
|
||||
* `FirstSpeechUserMuteStrategy`
|
||||
* `MuteUntilFirstBotCompleteUserMuteStrategy`
|
||||
* `AlwaysUserMuteStrategy`
|
||||
* `FunctionCallUserMuteStrategy`
|
||||
|
||||
User mute strategies replace the legacy `STTMuteFilter` and provide a more flexible and composable approach to muting user input.
|
||||
|
||||
User mute strategies are configured when setting up the `LLMContextAggregatorPair`. For example:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_mute_strategies=[
|
||||
FirstSpeechUserMuteStrategy(),
|
||||
]
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
In order to use user mute strategies you should update to the new universal `LLMContext` and `LLMContextAggregatorPair`.
|
||||
@@ -1 +0,0 @@
|
||||
- `STTMuteFilter` is deprecated and will be removed in a future version. Use `LLMUserAggregator`'s new `user_mute_strategies` instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed a bug in `STTMuteFilter` where the user was not always muted during function calls, especially when there were multiple simultaneous calls.
|
||||
@@ -1 +0,0 @@
|
||||
- `FrameProcessor.interruptions_allowed` is now deprecated, use `LLMUserAggregator`'s new parameter `user_mute_strategies` instead.
|
||||
@@ -1,12 +0,0 @@
|
||||
- `PipelineParams.allow_interruptions` is now deprecated, use `LLMUserAggregator`'s new parameter `user_turn_strategies` instead. For example, to disable interruptions but still get user turns you can do:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
),
|
||||
),
|
||||
)
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
- Added `use_ssl` parameter to `NvidiaSTTService`, `NvidiaSegmentedSTTService` and `NvidiaTTSService`.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `DeepgramSTTService` to push user started/stopped speaking and interruption frames when `vad_enabled` is set to true. This centralizes the frames into the service, removing the need to have your application code handle Deepgram's events and push these frames.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `enable_interruptions` constructor argument to all user turn strategies. This tells the `LLMUserAggregator` to push or not push an `InterruptionFrame`.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `52-live-transcription.py` foundational example demonstrating live transcription and translation from English to Spanish. In this example, the bot is not interruptible: as the user continues speaking, English transcriptions are queued, and the bot continuously translates and speaks each queued sentence in Spanish without being interrupted by new user speech.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed a `RNNoiseFilter` issue that would cause a "[Errno 12] Cannot allocate memory" error when processing silence audio frames.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `split_sentences` parameter to `SpeechmaticsSTTService` to control sentence splitting behavior for finals on sentence boundaries.
|
||||
@@ -1,4 +0,0 @@
|
||||
- Updated `SpeechmaticsSTTService` for version `0.0.99+`:
|
||||
- Fixed `SpeechmaticsSTTService` to listen for `VADUserStoppedSpeakingFrame` in order to finalize transcription.
|
||||
- Default to `TurnDetectionMode.FIXED` for Pipecat-controlled end of turn detection.
|
||||
- Only emit VAD + interruption frames if VAD is enabled within the plugin (modes other than `TurnDetectionMode.FIXED` or `TurnDetectionMode.EXTERNAL`).
|
||||
@@ -1 +0,0 @@
|
||||
- Added encoding validation to `DeepgramTTSService` to prevent unsupported encodings from reaching the API. The service now raises `ValueError` at initialization with a clear error message.
|
||||
@@ -1,2 +0,0 @@
|
||||
- Added word-level timestamp support to `AzureTTSService` for accurate text-to-audio synchronization.
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `read_audio_frame` & `read_video_frame` methods in `SmallWebRTCClient` to check if the track is enabled before logging a warning.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed an issue with function calling where a handler failing to invoke its result callback could leave the context stuck in IN_PROGRESS, causing LLM inference for subsequent function call results to block while waiting on the unresolved call.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed an issue with DeepgramTTSService where the model would output "Dot" instead of a period in some circumstances.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `pronunciation_dict_id` parameter to `CartesiaTTSService.InputParams` and `CartesiaHttpTTSService.InputParams` to support Cartesia's pronunciation dictionary feature for custom pronunciations.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed an issue in GeminiLiveLLMService where TranscriptionFrames were occasionally not pushed.
|
||||
@@ -1 +0,0 @@
|
||||
- Added support for using the HeyGen LiveAvatar API with the `HeyGenTransport` (see https://www.liveavatar.com/).
|
||||
@@ -1,8 +0,0 @@
|
||||
- Added image support to `OpenAIRealtimeLLMService` via `InputImageRawFrame`:
|
||||
- New `start_video_paused` parameter to control initial video input state
|
||||
- New `video_frame_detail` parameter to set image processing quality ("auto",
|
||||
"low", or "high"). This corresponds to OpenAI Realtime's `image_detail`
|
||||
parameter.
|
||||
- `set_video_input_paused()` method to pause/resume video input at runtime
|
||||
- `set_video_frame_detail()` method to adjust video frame quality dynamically
|
||||
- Automatic rate limiting (1 frame per second) to prevent API overload
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `CartesiaTTSService` to support setting `language=None`, resulting in Cartesia auto-detecting the language of the conversation.
|
||||
@@ -1,3 +0,0 @@
|
||||
- The bundled Smart Turn weights are now updated to v3.2, which has better
|
||||
handling of short utterances, and is more robust against background
|
||||
noise.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated `SpeechmaticsSTTService` dependency to `speechmatics-voice[smart]>=0.2.6`
|
||||
@@ -1 +0,0 @@
|
||||
- Added `UserTurnProcessor`, a frame processor built on `UserTurnController` that pushes `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` frames and interruptions based on the controller's user turn strategies.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `UserTurnController` to manage user turns. It emits `on_user_turn_started`, `on_user_turn_stopped`, and `on_user_turn_stop_timeout` events, and can be integrated into processors to detect and handle user turns. `LLMUserAggregator` and `UserTurnProcessor` are implemented using this controller.
|
||||
@@ -1 +0,0 @@
|
||||
- Added a new foundational example `53-concurrent-llm-evaluation.py` that shows how to use `UserTurnProcessor`.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `should_interrupt` property to `DeepgramFluxSTTService`, `DeepgramSTTService`, and `SpeechmaticsSTTService` to configure whether the bot should be interrupted when the external service detects user speech.
|
||||
@@ -1,5 +0,0 @@
|
||||
- Smart Turn now takes into account `vad_start_seconds` when buffering audio,
|
||||
meaning that the start of the turn audio is not cut off. This improves
|
||||
accuracy for short utterances.
|
||||
|
||||
- The default value of `pre_speech_ms` is now set to 500ms for Smart Turn.
|
||||
@@ -1,4 +0,0 @@
|
||||
- `LLMAssistantAggregator` now exposes the following events:
|
||||
- `on_assistant_turn_started`: triggered when the assistant turn starts
|
||||
- `on_assistant_turn_stopped`: triggered when the assistant turn ends
|
||||
- `on_assistant_thought`: triggered when there's an assistant thought available
|
||||
@@ -1 +0,0 @@
|
||||
- `TranscriptProcessor` and related data classes and frames (`TranscriptionMessage`, `ThoughtTranscriptionMessage`, `TranscriptionUpdateFrame`) are deprecated. Use `LLMUserAggregator`'s and `LLMAssistantAggregator`'s new events (`on_user_turn_stopped` and `on_assistant_turn_stopped`) instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Added a new foundational example `28-user-assistant-turns.py` that shows how to use the new `LLMUserAggregator` and `LLMAssistantAggregator` events to gather a conversation transcript.
|
||||
@@ -1 +0,0 @@
|
||||
- Deprecated support for the `vad_events` `LiveOptions` in `DeepgramSTTService`. Instead, use a local Silero VAD for VAD events. Additionally, deprecated `should_interrupt` which will be removed along with `vad_events` support in a future release.
|
||||
@@ -1 +0,0 @@
|
||||
- Added `KrispVivaTurn` analyzer for end of turn detection using the Krisp VIVA SDK (requires `krisp_audio`).
|
||||
@@ -1 +0,0 @@
|
||||
- Improved Krisp SDK management to allow `KrispVivaTurn` and `KrispVivaFilter` to share a single SDK instance within the same process.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed potential memory leaks and initialization issues in `KrispVivaFilter` by improving SDK lifecycle management.
|
||||
@@ -1,6 +0,0 @@
|
||||
- Added support for setting up a pipeline task from external files. You can now register custom pipeline task setup files by setting the `PIPECAT_SETUP_FILES` environment variable. This variable should contain a colon-separated list of Python files (e.g. `export PIPECAT_SETUP_FILES="setup1.py:setup.py:..."`). Each file must define a function with the following signature:
|
||||
|
||||
```python
|
||||
async def setup_pipeline_task(task: PipelineTask):
|
||||
...
|
||||
```
|
||||
@@ -1 +0,0 @@
|
||||
- Loading external observers from files is deprecated, use the new pipeline task setup files and `PIPECAT_SETUP_FILES` environment variable instead.
|
||||
@@ -1 +0,0 @@
|
||||
- Updated default model for `GroqTTSService` to `canopylabs/orpheus-v1-english` and voice ID to `autumn`.
|
||||
@@ -1 +0,0 @@
|
||||
- Fixed timing issue in `BaseOutputTransport` where the bot speaking flag was set after awaiting, allowing the event loop to re-enter the method before the guard was set.
|
||||
@@ -1,16 +0,0 @@
|
||||
{% for section, _ in sections.items() %}
|
||||
{% if sections[section] %}
|
||||
{% for category, val in definitions.items() if category in sections[section]%}
|
||||
### {{ definitions[category]['name'] }}
|
||||
|
||||
{% for text, values in sections[section][category].items() %}
|
||||
{{ text }}
|
||||
(PR {{ values|join(', ') }})
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
No significant changes.
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Build docs using uv
|
||||
echo "Installing dependencies with uv..."
|
||||
uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
uv sync --group docs --all-extras --no-extra krisp --no-extra gstreamer --no-extra ultravox --no-extra local_smart_turn --no-extra moondream --no-extra riva --no-extra mlx-whisper
|
||||
|
||||
# Check if sphinx-build is available
|
||||
if ! uv run sphinx-build --version &> /dev/null; then
|
||||
@@ -24,4 +24,4 @@ if [ $? -eq 0 ]; then
|
||||
else
|
||||
echo "Documentation build failed!" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
@@ -50,7 +50,6 @@ autodoc_mock_imports = [
|
||||
# Krisp - has build issues on some platforms
|
||||
"pipecat_ai_krisp",
|
||||
"krisp",
|
||||
"krisp_audio",
|
||||
# System-specific GUI libraries
|
||||
"_tkinter",
|
||||
"tkinter",
|
||||
@@ -61,6 +60,9 @@ autodoc_mock_imports = [
|
||||
# OpenCV - sometimes has import issues during docs build
|
||||
"cv2",
|
||||
# Heavy ML packages excluded from ReadTheDocs
|
||||
# ultravox dependencies
|
||||
"vllm",
|
||||
"vllm.engine.arg_utils",
|
||||
# local-smart-turn dependencies
|
||||
"coremltools",
|
||||
"coremltools.models",
|
||||
@@ -116,6 +118,7 @@ def import_core_modules():
|
||||
"pipecat.observers",
|
||||
"pipecat.runner",
|
||||
"pipecat.serializers",
|
||||
"pipecat.sync",
|
||||
"pipecat.transcriptions",
|
||||
"pipecat.utils",
|
||||
]
|
||||
|
||||
@@ -30,6 +30,7 @@ Quick Links
|
||||
Runner <api/pipecat.runner>
|
||||
Serializers <api/pipecat.serializers>
|
||||
Services <api/pipecat.services>
|
||||
Sync <api/pipecat.sync>
|
||||
Transcriptions <api/pipecat.transcriptions>
|
||||
Transports <api/pipecat.transports>
|
||||
Utils <api/pipecat.utils>
|
||||
Utils <api/pipecat.utils>
|
||||
187
env.example
187
env.example
@@ -4,9 +4,6 @@ AICOUSTICS_LICENSE_KEY=...
|
||||
# Anthropic
|
||||
ANTHROPIC_API_KEY=...
|
||||
|
||||
# Assembly AI
|
||||
ASSEMBLYAI_API_KEY=...
|
||||
|
||||
# Async
|
||||
ASYNCAI_API_KEY=...
|
||||
ASYNCAI_VOICE_ID=...
|
||||
@@ -24,19 +21,12 @@ AZURE_CHATGPT_API_KEY=...
|
||||
AZURE_CHATGPT_ENDPOINT=https://...
|
||||
AZURE_CHATGPT_MODEL=...
|
||||
|
||||
AZURE_REALTIME_API_KEY=...
|
||||
AZURE_REALTIME_BASE_URL=...
|
||||
|
||||
AZURE_DALLE_API_KEY=...
|
||||
AZURE_DALLE_ENDPOINT=https://...
|
||||
AZURE_DALLE_MODEL=...
|
||||
|
||||
# Cartesia
|
||||
CARTESIA_API_KEY=...
|
||||
CARTESIA_VOICE_ID=...
|
||||
|
||||
# Cerebras
|
||||
CEREBRAS_API_KEY=...
|
||||
|
||||
# Daily
|
||||
DAILY_API_KEY=...
|
||||
@@ -44,82 +34,40 @@ DAILY_SAMPLE_ROOM_URL=https://...
|
||||
|
||||
# Deepgram
|
||||
DEEPGRAM_API_KEY=...
|
||||
SAGEMAKER_ENDPOINT_NAME=...
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
|
||||
# ElevenLabs
|
||||
ELEVENLABS_API_KEY=...
|
||||
ELEVENLABS_VOICE_ID=...
|
||||
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# Fal
|
||||
FAL_KEY=...
|
||||
|
||||
# Fireworks
|
||||
FIREWORKS_API_KEY=...
|
||||
|
||||
# Fish Audio
|
||||
FISH_API_KEY=...
|
||||
|
||||
# Gladia
|
||||
GLADIA_API_KEY=...
|
||||
GLADIA_REGION=...
|
||||
|
||||
# Google
|
||||
GOOGLE_API_KEY=...
|
||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
GOOGLE_CLOUD_PROJECT_ID=...
|
||||
GOOGLE_CLOUD_LOCATION=...
|
||||
GOOGLE_TEST_CREDENTIALS=...
|
||||
|
||||
# Gradium
|
||||
GRAPDIUM_API_KEY=...
|
||||
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
HEYGEN_LIVE_AVATAR_API_KEY=...
|
||||
|
||||
# Hume
|
||||
HUME_API_KEY=...
|
||||
HUME_VOICE_ID=...
|
||||
|
||||
# Inworld
|
||||
INWORLD_API_KEY=...
|
||||
|
||||
# Krisp
|
||||
KRISP_MODEL_PATH=...
|
||||
|
||||
# Krisp Viva
|
||||
KRISP_VIVA_FILTER_MODEL_PATH=...
|
||||
KRISP_VIVA_TURN_MODEL_PATH=...
|
||||
|
||||
# LiveKit
|
||||
LIVEKIT_API_KEY=...
|
||||
LIVEKIT_API_SECRET=...
|
||||
GOOGLE_VERTEX_TEST_CREDENTIALS=...
|
||||
|
||||
# LMNT
|
||||
LMNT_API_KEY=...
|
||||
LMNT_VOICE_ID=...
|
||||
|
||||
# MiniMax
|
||||
MINIMAX_API_KEY=...
|
||||
MINIMAX_GROUP_ID=...
|
||||
# Perplexity
|
||||
PERPLEXITY_API_KEY=...
|
||||
|
||||
# Mistral
|
||||
MISTRAL_API_KEY=...
|
||||
|
||||
# Neuphonic
|
||||
NEUPHONIC_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
# PlayHT
|
||||
PLAYHT_USER_ID=...
|
||||
PLAYHT_API_KEY=...
|
||||
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=...
|
||||
@@ -127,76 +75,83 @@ OPENAI_API_KEY=...
|
||||
# OpenPipe
|
||||
OPENPIPE_API_KEY=...
|
||||
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY=...
|
||||
|
||||
# Perplexity
|
||||
PERPLEXITY_API_KEY=...
|
||||
|
||||
# Picovoice Koala
|
||||
KOALA_ACCESS_KEY=...
|
||||
|
||||
# Piper
|
||||
PIPER_BASE_URL=...
|
||||
|
||||
# PlayHT
|
||||
PLAYHT_USER_ID=...
|
||||
PLAYHT_API_KEY=...
|
||||
|
||||
# Plivo
|
||||
PLIVO_AUTH_ID=...
|
||||
PLIVO_AUTH_TOKEN=...
|
||||
|
||||
# Qwen
|
||||
QWEN_API_KEY=...
|
||||
|
||||
# Rime
|
||||
RIME_API_KEY=...
|
||||
RIME_VOICE_ID=...
|
||||
|
||||
# SambaNova
|
||||
SAMBANOVA_API_KEY=...
|
||||
|
||||
# Sarvam AI
|
||||
SARVAM_API_KEY=...
|
||||
|
||||
# Sentry
|
||||
SENTRY_DSN=...
|
||||
# Tavus
|
||||
TAVUS_API_KEY=...
|
||||
TAVUS_REPLICA_ID=...
|
||||
TAVUS_PERSONA_ID=...
|
||||
|
||||
# Simli
|
||||
SIMLI_API_KEY=...
|
||||
SIMLI_FACE_ID=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
# Krisp
|
||||
KRISP_MODEL_PATH=...
|
||||
|
||||
# Soniox
|
||||
SONIOX_API_KEY=...
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=...
|
||||
|
||||
# Speechmatics
|
||||
SPEECHMATICS_API_KEY=...
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
# Tavus
|
||||
TAVUS_API_KEY=...
|
||||
TAVUS_REPLICA_ID=...
|
||||
# Grok
|
||||
GROK_API_KEY=...
|
||||
|
||||
# Telnyx
|
||||
TELNYX_API_KEY=...
|
||||
TELNYX_ACCOUNT_SID=...
|
||||
# Inworld
|
||||
INWORLD_API_KEY=...
|
||||
|
||||
# Together.ai
|
||||
TOGETHER_API_KEY=...
|
||||
|
||||
# Cerebras
|
||||
CEREBRAS_API_KEY=...
|
||||
|
||||
# Fish Audio
|
||||
FISH_API_KEY=...
|
||||
|
||||
# Assembly AI
|
||||
ASSEMBLYAI_API_KEY=...
|
||||
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY=...
|
||||
|
||||
# Piper
|
||||
PIPER_BASE_URL=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=...
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
|
||||
# Twilio
|
||||
TWILIO_ACCOUNT_SID=...
|
||||
TWILIO_AUTH_TOKEN=...
|
||||
|
||||
# Ultravox Realtime
|
||||
ULTRAVOX_API_KEY=...
|
||||
# MiniMax
|
||||
MINIMAX_API_KEY=...
|
||||
MINIMAX_GROUP_ID=...
|
||||
|
||||
# WhatsApp
|
||||
WHATSAPP_TOKEN=...
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN=...
|
||||
WHATSAPP_PHONE_NUMBER_ID=...
|
||||
WHATSAPP_APP_SECRET=...
|
||||
# Sarvam AI
|
||||
SARVAM_API_KEY=...
|
||||
|
||||
# Soniox
|
||||
SONIOX_API_KEY=
|
||||
|
||||
# Speechmatics
|
||||
SPEECHMATICS_API_KEY=...
|
||||
|
||||
# SambaNova
|
||||
SAMBANOVA_API_KEY=...
|
||||
|
||||
# Sentry
|
||||
SENTRY_DSN=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
|
||||
# Mistral
|
||||
MISTRAL_API_KEY=...
|
||||
|
||||
# NVIDIA
|
||||
NVIDIA_API_KEY=...
|
||||
|
||||
# Qwen
|
||||
QWEN_API_KEY=...
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -15,7 +15,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.nvidia.tts import NvidiaTTSService
|
||||
from pipecat.services.riva.tts import FastPitchTTSService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -36,7 +36,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
tts = NvidiaTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
tts = FastPitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
|
||||
|
||||
task = PipelineTask(
|
||||
Pipeline([tts, transport.output()]),
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -17,6 +17,7 @@ from fastapi.responses import RedirectResponse
|
||||
from loguru import logger
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -25,18 +26,13 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.smallwebrtc.connection import IceServer, SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.transport import SmallWebRTCTransport
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -65,6 +61,7 @@ async def run_example(webrtc_connection: SmallWebRTCConnection):
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -80,19 +77,12 @@ async def run_example(webrtc_connection: SmallWebRTCConnection):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -12,6 +12,7 @@ import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -20,16 +21,11 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
from pipecat.transports.daily.transport import DailyLogLevel, DailyParams, DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -50,8 +46,10 @@ async def main():
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
)
|
||||
transport.set_log_level(DailyLogLevel.Info)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
@@ -63,21 +61,12 @@ async def main():
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())
|
||||
]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -12,6 +12,7 @@ import sys
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -26,17 +27,12 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.livekit import configure
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.livekit.transport import LiveKitParams, LiveKitTransport
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -55,6 +51,7 @@ async def main():
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -72,20 +69,13 @@ async def main():
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -23,6 +23,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -26,6 +26,7 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.sentence import SentenceAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.services.cartesia.tts import CartesiaHttpTTSService
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -23,10 +24,7 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -36,8 +34,6 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -70,16 +66,19 @@ transport_params = {
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
@@ -101,19 +100,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -10,6 +10,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -24,10 +25,7 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
@@ -36,8 +34,6 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -88,6 +84,7 @@ transport_params = {
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
@@ -96,6 +93,7 @@ transport_params = {
|
||||
video_out_width=1024,
|
||||
video_out_height=1024,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
@@ -115,19 +113,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
image_sync_aggregator = ImageSyncAggregator(
|
||||
os.path.join(os.path.dirname(__file__), "assets", "speaking.png"),
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -17,20 +18,15 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.stt import CartesiaSTTService
|
||||
from pipecat.services.cartesia.tts import CartesiaHttpTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -43,16 +39,19 @@ transport_params = {
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
@@ -60,7 +59,7 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaHttpTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
@@ -72,19 +71,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -17,10 +18,7 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -29,8 +27,6 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -42,16 +38,19 @@ transport_params = {
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
}
|
||||
|
||||
@@ -71,19 +70,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -15,21 +14,20 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
|
||||
from pipecat.services.openai.base_llm import BaseOpenAILLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.speechmatics.stt import SpeechmaticsSTTService
|
||||
from pipecat.services.speechmatics.tts import SpeechmaticsTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_turn_strategies import ExternalUserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -53,125 +51,121 @@ transport_params = {
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
"""Speechmatics STT and TTS Service Example
|
||||
"""Speechmatics STT Service Example
|
||||
|
||||
This example demonstrates using Speechmatics Speech-to-Text and Text-to-Speech services
|
||||
with speaker diarization and intelligent speaker management. Key features:
|
||||
This example demonstrates using Speechmatics Speech-to-Text service with speaker diarization and intelligent speaker management. Key features:
|
||||
|
||||
1. Speaker Diarization (STT)
|
||||
1. Speaker Diarization
|
||||
- Automatically identifies and distinguishes between different speakers
|
||||
- First speaker is identified as 'S1', others get subsequent IDs
|
||||
- Uses `enable_diarization` parameter to manage speaker detection
|
||||
|
||||
2. Smart Speaker Control (STT)
|
||||
2. Smart Speaker Control
|
||||
- `focus_speakers` parameter lets you target specific speakers (e.g. ["S1"])
|
||||
- Other speakers will be wrapped in PASSIVE tags
|
||||
- Only processes speech from focused speakers
|
||||
- Words from all speakers are wrapped with XML tags for clear speaker identification
|
||||
- Other speakers' speech only sent when focused speaker is active
|
||||
|
||||
3. Voice Activity Detection (STT)
|
||||
3. Voice Activity Detection
|
||||
- Built-in VAD using `enable_vad` parameter
|
||||
- Remove `vad_analyzer` from `transport` config to use module's VAD
|
||||
- Emits speaker started/stopped events
|
||||
|
||||
4. Text-to-Speech (TTS)
|
||||
- Low latency streaming audio synthesis
|
||||
- Multiple voice options available including `sarah`, `theo`, `megan` and `jack`
|
||||
|
||||
5. Configuration Options
|
||||
4. Configuration Options
|
||||
- `operating_point` parameter defaults to `ENHANCED` for optimal accuracy
|
||||
- Configurable `end_of_utterance_silence_trigger` (default 0.5s)
|
||||
- Customizable speaker formatting
|
||||
- Additional diarization settings available
|
||||
|
||||
For detailed information:
|
||||
- STT: https://docs.speechmatics.com/rt-api-ref
|
||||
- TTS: https://docs.speechmatics.com/text-to-speech/quickstart
|
||||
For detailed information about operating points and configuration:
|
||||
https://docs.speechmatics.com/rt-api-ref
|
||||
"""
|
||||
|
||||
logger.info(f"Starting bot")
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
# focus_speakers=["S1"],
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
enable_vad=True,
|
||||
enable_diarization=True,
|
||||
focus_speakers=["S1"],
|
||||
end_of_utterance_silence_trigger=0.5,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
speaker_passive_format="<PASSIVE><{speaker_id}>{text}</{speaker_id}></PASSIVE>",
|
||||
),
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
model="eleven_turbo_v2_5",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Alfred. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be converted to audio so don't include special characters in your answers. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
),
|
||||
)
|
||||
},
|
||||
]
|
||||
|
||||
tts = SpeechmaticsTTSService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
voice_id="sarah",
|
||||
aiohttp_session=session,
|
||||
)
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(aggregation_timeout=0.005),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
params=BaseOpenAILLMService.InputParams(temperature=0.75),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful British assistant called Sarah. "
|
||||
"Your goal is to demonstrate your capabilities in a succinct way. "
|
||||
"Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. "
|
||||
"Always include punctuation in your responses. "
|
||||
"Give very short replies - do not give longer replies unless strictly necessary. "
|
||||
"Respond to what the user said in a concise, funny, creative and helpful way. "
|
||||
"Use `<Sn/>` tags to identify different speakers - do not use tags in your replies. "
|
||||
"Do not respond to speakers within `<PASSIVE/>` tags unless explicitly asked to. "
|
||||
),
|
||||
},
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(user_turn_strategies=ExternalUserTurnStrategies()),
|
||||
)
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Say a short hello to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user