Compare commits

...

52 Commits

Author SHA1 Message Date
Mark Backman
b23ca5a4a8 Merge pull request #1641 from pipecat-ai/mb/11labs-input-params
ElevenLabs: InputParams can be set individually
2025-04-23 14:31:37 -04:00
Mark Backman
63a6697a90 ElevenLabs: InputParams can be set individually 2025-04-23 14:28:38 -04:00
Aleix Conchillo Flaqué
f1e45d0f02 Merge pull request #1646 from pipecat-ai/aleix/pipecat-0.0.65
update CHANGELOG for 0.0.65
2025-04-23 11:27:25 -07:00
Mark Backman
4ad227ca2d Merge pull request #1643 from pipecat-ai/mb/gladia-keepalive
Add a keepalive task to GladiaSTTService
2025-04-23 14:27:15 -04:00
Aleix Conchillo Flaqué
66cc18194b update CHANGELOG for 0.0.65 2025-04-23 11:25:32 -07:00
Mark Backman
7d65132c93 Add a keepalive task to GladiaSTTService 2025-04-23 14:21:44 -04:00
Mark Backman
db7d7a4204 Merge pull request #1645 from pipecat-ai/mb/telnyx-auto-hang-up
Add auto_hang_up to Telnyx serializer
2025-04-23 14:20:28 -04:00
Mark Backman
7bbac11084 Add docstrings to TelnyxFrameSerializer 2025-04-23 14:13:20 -04:00
Mark Backman
76c8322b57 Make call_sid optional in TwilioFrameSerializer 2025-04-23 14:08:50 -04:00
Mark Backman
7b1cd3523d Twilio: send only one hangup command 2025-04-23 13:41:36 -04:00
Mark Backman
6bd821ac9a Add auto_hang_up to Telnyx serializer 2025-04-23 13:29:54 -04:00
Aleix Conchillo Flaqué
b91780ced2 Merge pull request #1638 from pipecat-ai/aleix/pipecat-0.0.64
update CHANGELOG for 0.0.64
2025-04-22 17:35:25 -07:00
Aleix Conchillo Flaqué
8ded666958 update CHANGELOG for 0.0.64 2025-04-22 17:32:06 -07:00
Filipi da Silva Fuchter
2490c804a5 Merge pull request #1631 from pipecat-ai/smart_turn_timeout
Returning the turn as complete if the request don’t return a result within SmartTurnParams stop_secs
2025-04-22 19:51:10 -03:00
Filipi Fuchter
dd8856a673 Merge branch 'main' into smart_turn_timeout
# Conflicts:
#	dot-env.template
2025-04-22 19:49:32 -03:00
Aleix Conchillo Flaqué
e7da08dab1 move smart turn files to audio.turn.smart_turn package 2025-04-22 15:29:31 -07:00
Aleix Conchillo Flaqué
ae60d42016 s/SmartTurnAnalyzer/HttpSmartTurnAnalyzer/ and add FalSmartTurnAnalyzer 2025-04-22 15:13:12 -07:00
Aleix Conchillo Flaqué
50e8d82ece SmartTurn: some linting cleanup 2025-04-22 14:39:02 -07:00
Mark Backman
cc9901a82f Replace httpx with aiohttp 2025-04-22 17:14:19 -04:00
Aleix Conchillo Flaqué
1fd43e8a3f Merge pull request #1636 from pipecat-ai/aleix/examples-logging
examples: always use loguru for logging
2025-04-22 13:06:40 -07:00
Aleix Conchillo Flaqué
fdc508a1a5 examples: always use loguru for logging 2025-04-22 11:51:49 -07:00
Mark Backman
37269db247 Merge pull request #1634 from pipecat-ai/mb/twilio-end-call
Automatically hangup Twilio calls
2025-04-22 14:05:10 -04:00
Mark Backman
51269aabbd Added cancel method to WebsocketServerOutputTransport 2025-04-22 13:58:39 -04:00
Mark Backman
74ecc19e09 Code review feedback 2025-04-22 13:54:12 -04:00
Mark Backman
c6d48c16df Add twilio to pyproject.toml, update demo to use twilio option 2025-04-22 13:01:56 -04:00
Mark Backman
873d84aa09 Twilio serializer to return None 2025-04-22 12:50:11 -04:00
Mark Backman
7360866c97 Add docstrings 2025-04-22 12:49:17 -04:00
Mark Backman
81f4768661 Automatically hangup Twilio calls 2025-04-22 12:45:34 -04:00
Vanessa Pyne
972d65f61b Merge pull request #1628 from pipecat-ai/vp-typo-fixes
typo fixes in phone-chatbot example
2025-04-22 10:05:56 -05:00
Mark Backman
1da9d398e3 Merge pull request #1619 from pipecat-ai/mb/grok-3-beta
GrokLLMService uses grok-3-beta as default model
2025-04-22 10:33:32 -04:00
Filipi Fuchter
7358bc6428 Returning the turn as complete if the request don’t return a result within SmartTurnParams stop_secs 2025-04-22 10:35:14 -03:00
vipyne
a6af499f84 typo fixes in phone-chatbot example 2025-04-21 23:49:13 -05:00
Aleix Conchillo Flaqué
f9d1a53e28 Merge pull request #1609 from pipecat-ai/aleix/pyproject-py-typed
pyproject: fix license fields
2025-04-21 16:14:22 -07:00
Mark Backman
3f3010af79 Add a SmartTurnMetricsData class, emitted by Metrics Frame in response to smart turn responses 2025-04-21 18:56:14 -04:00
Aleix Conchillo Flaqué
a02d47ddbd Merge pull request #1625 from 0xPatryk/patch-1
Fixed AttributeError: object has no attribute '_sample_rate"
2025-04-21 15:40:54 -07:00
Patryk
a649aff3e7 Fixed AttributeError: 'OpenAITTSService' object has no attribute '_sample_rate' 2025-04-21 11:03:45 +02:00
Mark Backman
a9b551d73e GrokLLMService uses grok-3-beta as default model 2025-04-19 08:05:59 -04:00
Mark Backman
747a821943 Merge pull request #1614 from pipecat-ai/mb/changelog-for-1525
Add CHANGELOG entry for PR 1525
2025-04-19 07:10:13 -04:00
Aleix Conchillo Flaqué
010db3ccd5 README: minor update 2025-04-18 20:57:05 -07:00
Aleix Conchillo Flaqué
db773b8b93 Merge pull request #1616 from pipecat-ai/aleix/new-readme
make README more fun
2025-04-18 18:15:35 -07:00
Mark Backman
16b7bf71b4 Additional README changes 2025-04-18 21:00:57 -04:00
Aleix Conchillo Flaqué
82d19508a4 make README more fun 2025-04-18 14:37:28 -07:00
Mark Backman
dc3646f0e7 Merge pull request #1615 from pipecat-ai/mb/issue-template
Add issue templates and move the pull request template to .github
2025-04-18 14:58:09 -04:00
Mark Backman
62e659cd3a Update to .yml templates so that types are used 2025-04-18 13:21:01 -04:00
Mark Backman
b2945f44fd Add issue templates and move the pull request template to .github 2025-04-18 12:17:46 -04:00
Mark Backman
618fbef81c Add CHANGELOG entry for PR 1525 2025-04-18 11:32:34 -04:00
Mark Backman
70c42dfa6e Merge pull request #1525 from shaiyon/google-default-creds
Enable usage of Application Default Credentials in Google services
2025-04-18 11:31:08 -04:00
Mark Backman
9ab374dd1f Merge pull request #1612 from pipecat-ai/mb/07g-stt-model
examples: Fix 07g by changing STT model
2025-04-18 08:04:20 -04:00
Mark Backman
cc6d284417 examples: Fix 07g by changing STT model 2025-04-18 07:13:34 -04:00
Aleix Conchillo Flaqué
d77c37ff14 pyproject: add py.typed (PEP 561) 2025-04-17 17:29:04 -07:00
Aleix Conchillo Flaqué
b4916f9dae pyproject: fix license fields 2025-04-17 17:28:14 -07:00
Shaiyon Hariri
af23200511 Use default google creds as fallback when not provided in llm_vertex,stt, and tts 2025-04-03 16:42:58 -04:00
53 changed files with 1425 additions and 515 deletions

87
.github/ISSUE_TEMPLATE/1-bug_report.yml vendored Normal file
View File

@@ -0,0 +1,87 @@
name: Bug report
description: Report a bug or unexpected behavior
type: Bug
body:
- type: markdown
attributes:
value: |
## Bug Report
Thank you for taking the time to fill out this bug report.
- type: markdown
attributes:
value: |
### Environment
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: python-version
attributes:
label: Python version
description: Which Python version are you using?
placeholder: e.g., 3.12.8
validations:
required: true
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using?
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: true
- type: textarea
id: description
attributes:
label: Issue description
description: Provide a clear description of the issue.
validations:
required: true
- type: textarea
id: repro
attributes:
label: Reproduction steps
description: List the steps to reproduce the issue.
placeholder: |
1. Do this...
2. Then do that...
3. Observe the error...
validations:
required: true
- type: textarea
id: expected
attributes:
label: Expected behavior
description: What did you expect to happen?
validations:
required: true
- type: textarea
id: actual
attributes:
label: Actual behavior
description: What actually happened?
validations:
required: true
- type: textarea
id: logs
attributes:
label: Logs
description: If applicable, include any relevant logs or error messages
render: shell
validations:
required: false

67
.github/ISSUE_TEMPLATE/2-question.yml vendored Normal file
View File

@@ -0,0 +1,67 @@
name: Question
description: Ask a question or get help
type: Question
body:
- type: markdown
attributes:
value: |
## Question
Use this form to ask a question about pipecat.
- type: markdown
attributes:
value: |
### Environment (if applicable)
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using? (if applicable)
placeholder: e.g., 0.0.63
validations:
required: false
- type: input
id: python-version
attributes:
label: Python version
description: Which Python version are you using? (if applicable)
placeholder: e.g., 3.12.8
validations:
required: false
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using? (if applicable)
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: false
- type: textarea
id: question
attributes:
label: Question
description: Provide your question in detail here.
validations:
required: true
- type: textarea
id: tried
attributes:
label: What I've tried
description: Describe what you've already tried or research you've done.
placeholder: I've looked at the documentation and tried...
validations:
required: false
- type: textarea
id: context
attributes:
label: Context
description: Any additional context or information that might help others understand your question better.
validations:
required: false

View File

@@ -0,0 +1,52 @@
name: Feature request
description: Suggest an enhancement or new feature
type: Enhancement
body:
- type: markdown
attributes:
value: |
## Feature Request
Thank you for suggesting an enhancement to pipecat.
- type: textarea
id: problem
attributes:
label: Problem Statement
description: A clear description of the problem this feature would solve.
placeholder: I'm always frustrated when...
validations:
required: true
- type: textarea
id: solution
attributes:
label: Proposed Solution
description: A clear and concise description of what you want to happen.
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternative Solutions
description: Any alternative solutions or features you've considered.
validations:
required: false
- type: textarea
id: context
attributes:
label: Additional Context
description: Add any other context, mockups, or screenshots about the feature request here.
placeholder: You can drag and drop images here to include them.
validations:
required: false
- type: checkboxes
id: contribution
attributes:
label: Would you be willing to help implement this feature?
options:
- label: Yes, I'd like to contribute
- label: No, I'm just suggesting

View File

@@ -0,0 +1,82 @@
name: Service Issue
description: An issue with a third-party service
type: Service Issue
body:
- type: markdown
attributes:
value: |
## Service Issue
Use this form to report an issue with a third-party service integration.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: service-name
attributes:
label: Service Name
description: Which third-party service is having issues?
placeholder: e.g., OpenAI, ElevenLabs, Anthropic
validations:
required: true
- type: input
id: service-version
attributes:
label: Service or model version
description: Which version of the service API or model are you using?
placeholder: e.g., v1, gpt-4.1
validations:
required: false
- type: textarea
id: description
attributes:
label: Issue Description
description: Provide a clear description of the service issue.
validations:
required: true
- type: textarea
id: reproduction
attributes:
label: Reproduction Steps
description: Provide steps to reproduce the issue.
placeholder: |
1. Configure service X
2. Call method Y
3. See error Z
validations:
required: true
- type: textarea
id: expected
attributes:
label: Expected Behavior
description: What did you expect to happen?
validations:
required: true
- type: textarea
id: actual
attributes:
label: Actual Behavior
description: What actually happened?
validations:
required: true
- type: textarea
id: logs
attributes:
label: Error Logs
description: If available, include any error messages or logs.
render: shell
validations:
required: false

View File

@@ -0,0 +1,56 @@
name: New Service
description: Request to support a new third-party service
type: New Service
body:
- type: markdown
attributes:
value: |
## New Service Request
Use this form to request support for a new third-party service in pipecat.
- type: input
id: service-name
attributes:
label: Service Name
description: What is the name of the third-party service?
placeholder: e.g., NewAPI, SomeService
validations:
required: true
- type: input
id: service-website
attributes:
label: Service Website
description: Link to the service's website or documentation
placeholder: e.g., https://newapi.com
validations:
required: true
- type: textarea
id: service-description
attributes:
label: Service Description
description: Briefly describe what this service does and how it works.
validations:
required: true
- type: textarea
id: api-info
attributes:
label: API Information
description: If available, provide details about the service's API.
placeholder: |
- API documentation link
- Authentication method
- Key endpoints you'd like supported
validations:
required: false
- type: checkboxes
id: contribution
attributes:
label: Would you be willing to help implement this service?
options:
- label: Yes, I'd like to contribute
- label: No, I'm just suggesting

74
.github/ISSUE_TEMPLATE/6-dependency.yml vendored Normal file
View File

@@ -0,0 +1,74 @@
name: Dependency Issue
description: An issue with a Pipecat dependency (not a third-party service)
type: Dependency Issue
body:
- type: markdown
attributes:
value: |
## Dependency Issue
Use this form to report an issue with a Pipecat dependency.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: dependency-name
attributes:
label: Dependency Name
description: Which Pipecat dependency is causing the issue?
placeholder: e.g., openai, anthropic, fastapi
validations:
required: true
- type: input
id: dependency-version
attributes:
label: Dependency Version
description: Which version of the dependency are you using?
placeholder: e.g., 1.2.3
validations:
required: true
- type: textarea
id: description
attributes:
label: Issue Description
description: Provide a clear description of the dependency issue.
validations:
required: true
- type: textarea
id: impact
attributes:
label: Impact
description: How is this dependency issue affecting your usage of pipecat?
validations:
required: true
- type: textarea
id: reproduction
attributes:
label: Reproduction Steps
description: If applicable, provide steps to reproduce the issue.
placeholder: |
1. Install dependency X
2. Run command Y
3. See error Z
validations:
required: false
- type: textarea
id: logs
attributes:
label: Error Logs
description: If applicable, include any relevant error messages or logs.
render: shell
validations:
required: false

View File

@@ -0,0 +1,70 @@
name: Troubleshooting
description: Help with a specific use case
type: Troubleshooting
body:
- type: markdown
attributes:
value: |
## Troubleshooting Request
Use this form to get help with a specific use case or implementation.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: python-version
attributes:
label: Python version
description: Which version of Python are you using?
placeholder: e.g., 3.12.8
validations:
required: true
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using?
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: true
- type: textarea
id: use-case
attributes:
label: Use Case Description
description: Describe what you're trying to accomplish with pipecat.
validations:
required: true
- type: textarea
id: current-approach
attributes:
label: Current Approach
description: What have you tried so far? Include code snippets if relevant.
render: python
validations:
required: true
- type: textarea
id: errors
attributes:
label: Errors or Unexpected Behavior
description: Describe any errors or unexpected behavior you're encountering.
validations:
required: true
- type: textarea
id: additional-context
attributes:
label: Additional Context
description: Any other information that might help us understand your situation.
validations:
required: false

1
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@@ -0,0 +1 @@
blank_issues_enabled: false

View File

@@ -5,14 +5,52 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [0.0.65] - 2025-04-23 "Sant Jordi's release"
https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
### Added
- Added automatic hangup logic to the Telnyx serializer. This feature hangs up
the Telnyx call when an `EndFrame` or `CancelFrame` is received. It is
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
- Added a keepalive task to `GladiaSTTService` to prevent the websocket from
disconnecting after 30 seconds of no audio input.
### Changed
- The `InputParams` for `ElevenLabsTTSService` and `ElevenLabsHttpTTSService`
no longer require that `stability` and `similarity_boost` be set. You can
individually set each param.
- In `TwilioFrameSerializer`, `call_sid` is Optional so as to avoid a breaking
changed. `call_sid` is required to automatically hang up.
### Fixed
- Fixed an issue where `TwilioFrameSerializer` would send two hang up commands:
one for the `EndFrame` and one for the `CancelFrame`.
## [0.0.64] - 2025-04-22
### Added
- Added automatic hangup logic to the Twilio serializer. This feature hangs up
the Twilio call when an `EndFrame` or `CancelFrame` is received. It is
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
- Added `SmartTurnMetricsData`, which contains end-of-turn prediction metrics,
to the `MetricsFrame`. Using `MetricsFrame`, you can now retrieve prediction
confidence scores and processing time metrics from the smart turn analyzers.
- Added support for Application Default Credentials in Google services,
`GoogleSTTService`, `GoogleTTSService`, and `GoogleVertexLLMService`.
- Added support for Smart Turn Detection via the `turn_analyzer` transport
parameter. You can now choose between `SmartTurnAnalyzer()` for remote
inference or `LocalCoreMLSmartTurnAnalyzer()` for on-device inference using
Core ML.
parameter. You can now choose between `HttpSmartTurnAnalyzer()` or
`FalSmartTurnAnalyzer()` for remote inference or
`LocalCoreMLSmartTurnAnalyzer()` for on-device inference using Core ML.
- `DeepgramTTSService` accepts `base_url` argument again, allowing you to
connect to an on-prem service.
@@ -37,6 +75,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- `GrokLLMService` now uses `grok-3-beta` as its default model.
- Daily's REST helpers now include an `eject_at_token_exp` param, which ejects
the user when their token expires. This new parameter defaults to False.
Also, the default value for `enable_prejoin_ui` changed to False and
@@ -71,6 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
unexpected behavior during inference.
### Other
- Updated the `twilio-chatbot` example to use the auto-hangup feature.
## [0.0.63] - 2025-04-11
### Added

233
README.md
View File

@@ -1,43 +1,72 @@
<h1><div align="center">
 <img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
<img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
</div></h1>
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) ![Tests](https://github.com/pipecat-ai/pipecat/actions/workflows/tests.yaml/badge.svg) [![codecov](https://codecov.io/gh/pipecat-ai/pipecat/graph/badge.svg?token=LNVUIVO4Y9)](https://codecov.io/gh/pipecat-ai/pipecat) [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat)
Pipecat is an open source Python framework for building voice and multimodal conversational agents. It handles the complex orchestration of AI services, network transport, audio processing, and multimodal interactions, letting you focus on creating engaging experiences.
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
## What you can build
**Pipecat** is an open-source Python framework for building real-time voice and multimodal conversational agents. Orchestrate audio and video, AI services, different transports, and conversation pipelines effortlessly—so you can focus on what makes your agent unique.
- **Voice Assistants**: [Natural, real-time conversations with AI](https://demo.dailybots.ai/)
- **Interactive Agents**: Personal coaches and meeting assistants
- **Multimodal Apps**: Combine voice, video, images, and text
- **Creative Tools**: [Story-telling experiences](https://storytelling-chatbot.fly.dev/) and social companions
- **Business Solutions**: [Customer intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0) and support bots
- **Complex conversational flows**: [Refer to Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) to learn more
## 🚀 What You Can Build
## See it in action
- **Voice Assistants** natural, streaming conversations with AI
- **AI Companions** coaches, meeting assistants, characters
- **Multimodal Interfaces** voice, video, images, and more
- **Interactive Storytelling** creative tools with generative media
- **Business Agents** customer intake, support bots, guided flows
- **Complex Dialog Systems** design logic with structured conversations
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
## 🧠 Why Pipecat?
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
- **Pluggable**: Supports many AI services and tools
- **Composable Pipelines**: Build complex behavior from modular components
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
## 🎬 See it in action
<p float="left">
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="280" /></a>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="400" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="400" /></a>
<br/>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="280" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="400" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="400" /></a>
</p>
## Key features
## 📱 Client SDKs
- **Voice-first Design**: Built-in speech recognition, TTS, and conversation handling
- **Flexible Integration**: Works with popular AI services (OpenAI, ElevenLabs, etc.)
- **Pipeline Architecture**: Build complex apps from simple, reusable components
- **Real-time Processing**: Frame-based pipeline architecture for fluid interactions
- **Production Ready**: Enterprise-grade WebRTC and Websocket support
You can connect to Pipecat from any platform using our official SDKs:
💡 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
| Platform | SDK Repo | Description |
| -------- | ------------------------------------------------------------------------------ | -------------------------------- |
| Web | [pipecat-client-web](https://github.com/pipecat-ai/pipecat-client-web) | JavaScript and React client SDKs |
| iOS | [pipecat-client-ios](https://github.com/pipecat-ai/pipecat-client-ios) | Swift SDK for iOS |
| Android | [pipecat-client-android](https://github.com/pipecat-ai/pipecat-client-android) | Kotlin SDK for Android |
| C++ | [pipecat-client-cxx](https://github.com/pipecat-ai/pipecat-client-cxx) | C++ client SDK |
## Getting started
## 🧩 Available services
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when youre ready. You can also add a 📞 telephone number, 🖼️ image output, 📺 video input, use different LLMs, and more.
| Category | Services |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
## ⚡ Getting started
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when youre ready.
```shell
# Install the module
@@ -53,141 +82,51 @@ To keep things lightweight, only the core framework is included by default. If y
pip install "pipecat-ai[option,...]"
```
### Available services
| Category | Services | Install Command Example |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | `pip install "pipecat-ai[openai]"` |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` |
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | `pip install "pipecat-ai[google]"` |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | `pip install "pipecat-ai[daily]"` |
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | `pip install "pipecat-ai[tavus,simli]"` |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | `pip install "pipecat-ai[mem0]"` |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | `pip install "pipecat-ai[moondream]"` |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | `pip install "pipecat-ai[silero]"` |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | `pip install "pipecat-ai[canonical]"` |
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
## Code examples
## 🧪 Code examples
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [Example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
## A simple voice agent running locally
## 🛠️ Hacking on the framework itself
Here is a very basic Pipecat bot that greets a user when they join a real-time session. We'll use [Daily](https://daily.co) for real-time media transport, and [Cartesia](https://cartesia.ai/) for text-to-speech.
1. Set up a virtual environment before following these instructions. From the root of the repo:
```python
import asyncio
```shell
python3 -m venv venv
source venv/bin/activate
```
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
2. Install the development dependencies:
async def main():
# Use Daily as a real-time media transport (WebRTC)
transport = DailyTransport(
room_url=...,
token="", # leave empty. Note: token is _not_ your api key
bot_name="Bot Name",
params=DailyParams(audio_out_enabled=True))
```shell
pip install -r dev-requirements.txt
```
# Use Cartesia for Text-to-Speech
tts = CartesiaTTSService(
api_key=...,
voice_id=...
)
3. Install the git pre-commit hooks (these help ensure your code follows project rules):
# Simple pipeline that will process text to speech and output the result
pipeline = Pipeline([tts, transport.output()])
```shell
pre-commit install
```
# Create Pipecat processor that can run one or more pipelines tasks
runner = PipelineRunner()
4. Install the `pipecat-ai` package locally in editable mode:
# Assign the task callable to run the pipeline
task = PipelineTask(pipeline)
```shell
pip install -e .
```
# Register an event handler to play audio when a
# participant joins the transport WebRTC session
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
# Queue a TextFrame that will get spoken by the TTS service (Cartesia)
await task.queue_frame(TextFrame(f"Hello there, {participant_name}!"))
> The `-e` or `--editable` option allows you to modify the code without reinstalling.
# Register an event handler to exit the application when the user leaves.
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.cancel()
5. Include optional dependencies as needed. For example:
# Run the pipeline task
await runner.run(task)
```shell
pip install -e ".[daily,deepgram,cartesia,openai,silero]"
```
if __name__ == "__main__":
asyncio.run(main())
```
6. (Optional) If you want to use this package from another directory:
Run it with:
```shell
python app.py
```
Daily provides a prebuilt WebRTC user interface. While the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
## WebRTC for production use
WebSockets are fine for server-to-server communication or for initial development. But for production use, youll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/#webrtc))
One way to get up and running quickly with WebRTC is to sign up for a Daily developer account. Daily gives you SDKs and global infrastructure for audio (and video) routing. Every account gets 10,000 audio/video/transcription minutes free each month.
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard.
## Hacking on the framework itself
_Note: You may need to set up a virtual environment before following these instructions. From the root of the repo:_
```shell
python3 -m venv venv
source venv/bin/activate
```
Install the development dependencies:
```shell
pip install -r dev-requirements.txt
```
Install the git pre-commit hooks (these help ensure your code follows project rules):
```shell
pre-commit install
```
Install the `pipecat-ai` package locally in editable mode:
```shell
pip install -e .
```
The `-e` or `--editable` option allows you to modify the code without reinstalling.
To include optional dependencies, add them to the install command. For example:
```shell
pip install -e ".[daily,deepgram,cartesia,openai,silero]" # Updated for the services you're using
```
If you want to use this package from another directory:
```shell
pip install "path_to_this_repo[option,...]"
```
```shell
pip install "path_to_this_repo[option,...]"
```
### Running tests
@@ -197,11 +136,11 @@ From the root directory, run:
pytest
```
## Setting up your editor
### Setting up your editor
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
### Emacs
#### Emacs
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
@@ -223,7 +162,7 @@ You can use [use-package](https://github.com/jwiegley/use-package) to install [e
:hook ((python-mode . pyvenv-auto-run)))
```
### Visual Studio Code
#### Visual Studio Code
Install the
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
@@ -235,7 +174,7 @@ Install the
}
```
### PyCharm
#### PyCharm
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
@@ -245,7 +184,7 @@ Install the
4. **Arguments**: `format $FilePath$`
5. **Program**: `$PyInterpreterDirectory$/ruff`
## Contributing
## 🤝 Contributing
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
@@ -258,7 +197,7 @@ Before submitting a pull request, please check existing issues and PRs to avoid
We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
## Getting help
## 🛟 Getting help
➡️ [Join our Discord](https://discord.gg/pipecat)

View File

@@ -1,22 +0,0 @@
# Description
Is this reporting a bug or feature request?
If reporting a bug, please fill out the following:
### Environment
- pipecat-ai version:
- python version:
- OS:
### Issue description
Provide a clear description of the issue.
### Repro steps
List the steps to reproduce the issue.
### Expected behavior
### Actual behavior
### Logs

View File

@@ -96,4 +96,8 @@ PIPER_BASE_URL=...
# Smart turn
LOCAL_SMART_TURN_MODEL_PATH=
REMOTE_SMART_TURN_URL=
FAL_SMART_TURN_API_KEY=...
# Twilio
TWILIO_ACCOUNT_SID=
TWILIO_AUTH_TOKEN=

View File

@@ -40,7 +40,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
stt = OpenAISTTService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-transcribe-latest",
model="gpt-4o-transcribe",
prompt="Expect words related to dogs, such as breed names.",
)

View File

@@ -0,0 +1,113 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.fal_smart_turn import FalSmartTurnAnalyzer
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
load_dotenv(override=True)
async def run_bot(webrtc_connection: SmallWebRTCConnection):
logger.info(f"Starting bot")
async with aiohttp.ClientSession() as session:
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
vad_audio_passthrough=True,
turn_analyzer=FalSmartTurnAnalyzer(
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=session
),
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
await task.cancel()
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()

View File

@@ -1,111 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn import SmartTurnAnalyzer
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
load_dotenv(override=True)
async def run_bot(webrtc_connection: SmallWebRTCConnection):
logger.info(f"Starting bot")
remote_smart_turn_url = os.getenv("REMOTE_SMART_TURN_URL")
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
vad_audio_passthrough=True,
turn_analyzer=SmartTurnAnalyzer(url=remote_smart_turn_url),
),
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
await task.cancel()
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()

View File

@@ -9,8 +9,8 @@ import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.local_smart_turn import LocalCoreMLSmartTurnAnalyzer
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_coreml_smart_turn import LocalCoreMLSmartTurnAnalyzer
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline

View File

@@ -7,7 +7,6 @@
import argparse
import asyncio
import importlib.util
import logging
import os
import sys
from contextlib import asynccontextmanager
@@ -18,6 +17,7 @@ import uvicorn
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import RedirectResponse
from loguru import logger
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -25,14 +25,6 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
# Load environment variables
load_dotenv(override=True)
# Configure logger
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[logging.StreamHandler()],
)
logger = logging.getLogger("pipecat-server")
app = FastAPI()
# Store connections by pc_id
@@ -162,10 +154,11 @@ def main():
parser.add_argument("--verbose", "-v", action="count", default=0)
args = parser.parse_args()
logger.remove(0)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
logger.add(sys.stderr, level="TRACE")
else:
logging.basicConfig(level=logging.INFO)
logger.add(sys.stderr, level="DEBUG")
# Infer the bot file from the caller if not provided explicitly
bot_file = args.bot_file

View File

@@ -26,9 +26,6 @@ from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class MirrorProcessor(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):

View File

@@ -1,6 +1,12 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import logging
import sys
from contextlib import asynccontextmanager
from typing import Dict
@@ -9,6 +15,7 @@ from bot import run_bot
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import RedirectResponse
from loguru import logger
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -16,8 +23,6 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
# Load environment variables
load_dotenv(override=True)
logger = logging.getLogger("pc")
app = FastAPI()
# Store connections by pc_id
@@ -81,9 +86,10 @@ if __name__ == "__main__":
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
logger.remove(0)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
logger.add(sys.stderr, level="TRACE")
else:
logging.basicConfig(level=logging.INFO)
logger.add(sys.stderr, level="DEBUG")
uvicorn.run(app, host=args.host, port=args.port)

View File

@@ -25,9 +25,6 @@ from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
class EdgeDetectionProcessor(FrameProcessor):
def __init__(self, camera_out_width, camera_out_height: int):

View File

@@ -1,6 +1,12 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import logging
import sys
from contextlib import asynccontextmanager
from typing import Dict
@@ -9,6 +15,7 @@ from bot import run_bot
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import RedirectResponse
from loguru import logger
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
@@ -16,8 +23,6 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
# Load environment variables
load_dotenv(override=True)
logger = logging.getLogger("pc")
app = FastAPI()
# Store connections by pc_id
@@ -81,9 +86,10 @@ if __name__ == "__main__":
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
logger.remove(0)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
logger.add(sys.stderr, level="TRACE")
else:
logging.basicConfig(level=logging.INFO)
logger.add(sys.stderr, level="DEBUG")
uvicorn.run(app, host=args.host, port=args.port)

View File

@@ -20,10 +20,6 @@ from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
SYSTEM_INSTRUCTION = f"""
"You are Gemini Chatbot, a friendly, helpful robot.

View File

@@ -1,6 +1,12 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import logging
import sys
from contextlib import asynccontextmanager
from typing import Dict
@@ -9,14 +15,13 @@ from bot import run_bot
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import FileResponse
from loguru import logger
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
# Load environment variables
load_dotenv(override=True)
logger = logging.getLogger("pc")
app = FastAPI()
# Store connections by pc_id
@@ -73,9 +78,10 @@ if __name__ == "__main__":
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
logger.remove(0)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
logger.add(sys.stderr, level="TRACE")
else:
logging.basicConfig(level=logging.INFO)
logger.add(sys.stderr, level="DEBUG")
uvicorn.run(app, host=args.host, port=args.port)

View File

@@ -1,6 +1,7 @@
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (optional: for joining the bot to the same room repeatedly for local dev)
DAILY_API_KEY=
DAILY_API_URL=api.daily.co/v1
DAILY_API_URL=https://api.daily.co/v1
DEEPGRAM_API_KEY=
OPENAI_API_KEY=
GOOGLE_API_KEY
CARTESIA_API_KEY=

View File

@@ -1,5 +1,5 @@
pipecat-ai[daily,cartesia,openai,google,silero]
fastapi==3.11.12
pipecat-ai[daily,cartesia,deepgram,openai,google,silero]
fastapi==0.115.6
uvicorn
python-dotenv
twilio

View File

@@ -63,20 +63,35 @@ This project is a FastAPI-based chatbot that integrates with Telnyx to handle We
ngrok http 8765
```
2. **Update the Telnyx TeXML applications Webhook**:
2. **Purchase a number**
- Go to your TeXML configuration page
- Provide the ngrok URL to the Webhook URL field and ensure the POST method is selected
- Click Save at the bottom of the page
If you haven't already, purchase a number from Telnyx.
3. **Configure streams.xml**:
- Log in to the Telnyx developer portal: https://portal.telnyx.com/
- Buy a number: https://portal.telnyx.com/#/numbers/buy-numbers
3. **Update the Telnyx TeXML applications Webhook**:
- Go to your TeXML configuration page: https://portal.telnyx.com/#/call-control/texml
- Create a new TeXML app, if one doesn't exist already:
- Add an application name
- Under Webhooks, select POST as the "Voice Method"
- Select "Custom URL" under Webhook URL Method
- Enter your ngrok URL in the "Webhook URL" field (e.g. https://your-name.ngrok.io)
- Click "Create" to save
Note: You'll see subsequent pages to set up SIP and Outbound, both are not required, so just skip.
- Navigate to "Manage Numbers" (https://portal.telnyx.com/#/numbers/my-numbers) and under SIP connection, select the pencil icon to edit and select the TeXML application that you just created.
Now your number is ready to call.
4. **Configure streams.xml**:
- Copy the template file to create your local version:
```sh
cp templates/streams.xml.template templates/streams.xml
```
- In `templates/streams.xml`, replace `<your server url>` with your ngrok URL (without `https://`)
- The final URL should look like: `wss://abc123.ngrok.io/ws`
- The encoding (`bidirectionalCodec`) should be `PCMU` or `PCMA` depending on your needs. Based on selected encoding, set the outbound_encoding in `server.py` when the bot is initialized.
- The final URL should look like: `wss://abc123.ngrok.io/ws`. This needs to be the same URL that you added to your TeXML app above.
- The encoding (`bidirectionalCodec`) should be `PCMU` or `PCMA` depending on your needs. Based on selected encoding, set the outbound_encoding in `server.py` when the bot is initialized. (No changes are required by default.)
- The inbound encoding can be controlled from the application configuration for inbound calls and dial/transfer commands for outbound calls.
## Running the Application

View File

@@ -33,9 +33,18 @@ logger.add(sys.stderr, level="DEBUG")
async def run_bot(
websocket_client,
stream_id: str,
call_control_id: str,
outbound_encoding: str,
inbound_encoding: str,
):
serializer = TelnyxFrameSerializer(
stream_id=stream_id,
outbound_encoding=outbound_encoding,
inbound_encoding=inbound_encoding,
call_control_id=call_control_id,
api_key=os.getenv("TELNYX_API_KEY"),
)
transport = FastAPIWebsocketTransport(
websocket=websocket_client,
params=FastAPIWebsocketParams(
@@ -44,7 +53,7 @@ async def run_bot(
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
serializer=TelnyxFrameSerializer(stream_id, outbound_encoding, inbound_encoding),
serializer=serializer,
),
)

View File

@@ -37,9 +37,10 @@ async def websocket_endpoint(websocket: WebSocket):
call_data = json.loads(await start_data.__anext__())
print(call_data, flush=True)
stream_id = call_data["stream_id"]
call_control_id = call_data["start"]["call_control_id"]
outbound_encoding = call_data["start"]["media_format"]["encoding"]
print("WebSocket connection accepted")
await run_bot(websocket, stream_id, outbound_encoding, "PCMU")
await run_bot(websocket, stream_id, call_control_id, outbound_encoding, "PCMU")
if __name__ == "__main__":

View File

@@ -54,7 +54,14 @@ async def save_audio(server_name: str, audio: bytes, sample_rate: int, num_chann
logger.info("No audio data to save")
async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
async def run_bot(websocket_client: WebSocket, stream_sid: str, call_sid: str, testing: bool):
serializer = TwilioFrameSerializer(
stream_sid=stream_sid,
call_sid=call_sid,
account_sid=os.getenv("TWILIO_ACCOUNT_SID", ""),
auth_token=os.getenv("TWILIO_AUTH_TOKEN", ""),
)
transport = FastAPIWebsocketTransport(
websocket=websocket_client,
params=FastAPIWebsocketParams(
@@ -64,7 +71,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
serializer=TwilioFrameSerializer(stream_sid),
serializer=serializer,
),
)

View File

@@ -38,8 +38,9 @@ async def websocket_endpoint(websocket: WebSocket):
call_data = json.loads(await start_data.__anext__())
print(call_data, flush=True)
stream_sid = call_data["start"]["streamSid"]
call_sid = call_data["start"]["callSid"]
print("WebSocket connection accepted")
await run_bot(websocket, stream_sid, app.state.testing)
await run_bot(websocket, stream_sid, call_sid, app.state.testing)
if __name__ == "__main__":

View File

@@ -6,14 +6,14 @@ build-backend = "setuptools.build_meta"
name = "pipecat-ai"
dynamic = ["version"]
description = "An open source framework for voice (and multimodal) assistants"
license = { text = "BSD 2-Clause License" }
license = "BSD-2-Clause"
license-files = ["LICENSE"]
readme = "README.md"
requires-python = ">=3.10"
keywords = ["webrtc", "audio", "video", "ai"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Topic :: Communications :: Conferencing",
"Topic :: Multimedia :: Sound/Audio",
"Topic :: Multimedia :: Video",
@@ -92,9 +92,11 @@ websocket = [ "websockets~=13.1", "fastapi~=0.115.6" ]
whisper = [ "faster-whisper~=1.1.1" ]
[tool.setuptools.packages.find]
# All the following settings are optional:
where = ["src"]
[tool.setuptools.package-data]
"pipecat" = ["py.typed"]
[tool.pytest.ini_options]
addopts = "--verbose"
testpaths = ["tests"]

View File

@@ -6,7 +6,9 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional
from typing import Optional, Tuple
from pipecat.metrics.metrics import MetricsData
class EndOfTurnState(Enum):
@@ -15,8 +17,10 @@ class EndOfTurnState(Enum):
class BaseTurnAnalyzer(ABC):
"""
Abstract base class for analyzing user end of turn.
"""Abstract base class for analyzing user end of turn.
This class inherits from BaseObject to leverage its event handling system
while still defining an abstract interface through abstract methods.
"""
def __init__(self, *, sample_rate: Optional[int] = None):
@@ -25,8 +29,7 @@ class BaseTurnAnalyzer(ABC):
@property
def sample_rate(self) -> int:
"""
Returns the current sample rate.
"""Returns the current sample rate.
Returns:
int: The effective sample rate for audio processing.
@@ -34,8 +37,7 @@ class BaseTurnAnalyzer(ABC):
return self._sample_rate
def set_sample_rate(self, sample_rate: int):
"""
Sets the sample rate for audio processing.
"""Sets the sample rate for audio processing.
If the initial sample rate was provided, it will use that; otherwise, it sets to
the provided sample rate.
@@ -48,8 +50,7 @@ class BaseTurnAnalyzer(ABC):
@property
@abstractmethod
def speech_triggered(self) -> bool:
"""
Determines if speech has been detected.
"""Determines if speech has been detected.
Returns:
bool: True if speech is triggered, otherwise False.
@@ -58,8 +59,7 @@ class BaseTurnAnalyzer(ABC):
@abstractmethod
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
"""
Appends audio data for analysis.
"""Appends audio data for analysis.
Args:
buffer (bytes): The audio data to append.
@@ -71,9 +71,8 @@ class BaseTurnAnalyzer(ABC):
pass
@abstractmethod
def analyze_end_of_turn(self) -> EndOfTurnState:
"""
Analyzes if an end of turn has occurred based on the audio input.
async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
"""Analyzes if an end of turn has occurred based on the audio input.
Returns:
EndOfTurnState: The result of the end of turn analysis.

View File

@@ -1,75 +0,0 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import io
import os
from typing import Dict
import numpy as np
import requests
from loguru import logger
from pipecat.audio.turn.base_smart_turn import BaseSmartTurn
class SmartTurnAnalyzer(BaseSmartTurn):
def __init__(self, url: str, **kwargs):
super().__init__(**kwargs)
self.remote_smart_turn_url = url
if not self.remote_smart_turn_url:
logger.error("remote_smart_turn_url is not set.")
raise Exception("remote_smart_turn_url must be provided.")
# Use a session to reuse connections (keep-alive)
self.session = requests.Session()
self.session.headers.update({"Connection": "keep-alive"})
def _serialize_array(self, audio_array: np.ndarray) -> bytes:
logger.trace("Serializing NumPy array to bytes...")
buffer = io.BytesIO()
np.save(buffer, audio_array)
serialized_bytes = buffer.getvalue()
logger.trace(f"Serialized size: {len(serialized_bytes)} bytes")
return serialized_bytes
def _send_raw_request(self, data_bytes: bytes):
headers = {"Content-Type": "application/octet-stream"}
logger.trace(
f"Sending {len(data_bytes)} bytes as raw body to {self.remote_smart_turn_url}..."
)
try:
response = self.session.post(
self.remote_smart_turn_url,
data=data_bytes,
headers=headers,
timeout=60,
)
logger.trace("\n--- Response ---")
logger.trace(f"Status Code: {response.status_code}")
if response.ok:
try:
logger.trace("Response JSON:")
logger.trace(response.json())
return response.json()
except requests.exceptions.JSONDecodeError:
logger.trace("Response Content (non-JSON):")
logger.trace(response.text)
else:
logger.trace("Response Content (Error):")
logger.trace(response.text)
response.raise_for_status()
except requests.exceptions.RequestException as e:
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
raise Exception("Failed to send raw request to Daily Smart Turn.")
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, any]:
serialized_array = self._serialize_array(audio_array)
return self._send_raw_request(serialized_array)

View File

@@ -6,13 +6,14 @@
import time
from abc import abstractmethod
from typing import Dict, Optional
from typing import Any, Dict, Optional, Tuple
import numpy as np
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
# Default timing parameters
STOP_SECS = 3
@@ -29,6 +30,10 @@ class SmartTurnParams(BaseModel):
# use_only_last_vad_segment: bool = USE_ONLY_LAST_VAD_SEGMENT
class SmartTurnTimeoutException(Exception):
pass
class BaseSmartTurn(BaseTurnAnalyzer):
def __init__(
self, *, sample_rate: Optional[int] = None, params: SmartTurnParams = SmartTurnParams()
@@ -41,7 +46,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
self._audio_buffer = []
self._speech_triggered = False
self._silence_ms = 0
self._speech_start_time = None
self._speech_start_time = 0
@property
def speech_triggered(self) -> bool:
@@ -59,9 +64,8 @@ class BaseSmartTurn(BaseTurnAnalyzer):
# Reset silence tracking on speech
self._silence_ms = 0
self._speech_triggered = True
if self._speech_start_time is None:
if self._speech_start_time == 0:
self._speech_start_time = time.time()
logger.debug(f"Speech started at {self._speech_start_time}")
else:
if self._speech_triggered:
chunk_duration_ms = len(audio_int16) / (self._sample_rate / 1000)
@@ -87,28 +91,27 @@ class BaseSmartTurn(BaseTurnAnalyzer):
return state
def analyze_end_of_turn(self) -> EndOfTurnState:
logger.debug("Analyzing End of Turn...")
state = self._process_speech_segment(self._audio_buffer)
async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
state, result = await self._process_speech_segment(self._audio_buffer)
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
self._clear(state)
logger.debug(f"End of Turn result: {state}")
return state
return state, result
def _clear(self, turn_state: EndOfTurnState):
# Reset internal state for next turn
logger.debug("Clearing audio buffer...")
# If the state is still incomplete, keep the _speech_triggered as True
self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE
self._audio_buffer = []
self._speech_start_time = None
self._speech_start_time = 0
self._silence_ms = 0
def _process_speech_segment(self, audio_buffer) -> EndOfTurnState:
async def _process_speech_segment(
self, audio_buffer
) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
state = EndOfTurnState.INCOMPLETE
if not audio_buffer:
return state
return state, None
# Extract recent audio segment for prediction
start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
@@ -124,41 +127,63 @@ class BaseSmartTurn(BaseTurnAnalyzer):
segment_audio_chunks = [chunk for _, chunk in audio_buffer[start_index : end_index + 1]]
segment_audio = np.concatenate(segment_audio_chunks)
logger.debug(f"Segment audio chunks after start index: {len(segment_audio)}")
# Limit maximum duration
max_samples = int(self._params.max_duration_secs * self.sample_rate)
if len(segment_audio) > max_samples:
# slices the array to keep the last max_samples samples, discarding the earlier part.
segment_audio = segment_audio[-max_samples:]
logger.debug(f"Segment audio chunks after limiting duration: {len(segment_audio)}")
result_data = None
if len(segment_audio) > 0:
start_time = time.perf_counter()
result = self._predict_endpoint(segment_audio)
state = (
EndOfTurnState.COMPLETE if result["prediction"] == 1 else EndOfTurnState.INCOMPLETE
)
end_time = time.perf_counter()
try:
result = await self._predict_endpoint(segment_audio)
state = (
EndOfTurnState.COMPLETE
if result["prediction"] == 1
else EndOfTurnState.INCOMPLETE
)
end_time = time.perf_counter()
# Calculate processing time
e2e_processing_time_ms = (end_time - start_time) * 1000
# Prepare the result data
result_data = SmartTurnMetricsData(
processor="BaseSmartTurn",
is_complete=result["prediction"] == 1,
probability=result["probability"],
inference_time_ms=result.get("inference_time", 0) * 1000,
server_total_time_ms=result.get("total_time", 0) * 1000,
e2e_processing_time_ms=e2e_processing_time_ms,
)
logger.trace(
f"Prediction: {'Complete' if result_data.is_complete else 'Incomplete'}"
)
logger.trace(f"Probability of complete: {result_data.probability:.4f}")
logger.trace(f"Inference time: {result_data.inference_time_ms:.2f}ms")
logger.trace(f"Server total time: {result_data.server_total_time_ms:.2f}ms")
logger.trace(f"E2E processing time: {result_data.e2e_processing_time_ms:.2f}ms")
except SmartTurnTimeoutException:
logger.debug(
f"End of Turn complete due to stop_secs. Silence in ms: {self._silence_ms}"
)
state = EndOfTurnState.COMPLETE
logger.debug("--------")
logger.debug(f"Prediction: {'Complete' if result['prediction'] == 1 else 'Incomplete'}")
logger.debug(f"Probability of complete: {result['probability']:.4f}")
logger.debug(f"Prediction took {(end_time - start_time) * 1000:.2f}ms seconds")
else:
logger.debug(f"params: {self._params}, stop_ms: {self._stop_ms}")
logger.debug("Captured empty audio segment, skipping prediction.")
logger.trace(f"params: {self._params}, stop_ms: {self._stop_ms}")
logger.trace("Captured empty audio segment, skipping prediction.")
return state
return state, result_data
@abstractmethod
def _predict_endpoint(self, buffer: np.ndarray) -> Dict[str, any]:
"""
Abstract method to predict if a turn has ended based on audio.
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
"""Abstract method to predict if a turn has ended based on audio.
Args:
buffer: Float32 numpy array of audio samples at 16kHz.
audio_array: Float32 numpy array of audio samples at 16kHz.
Returns:
Dictionary with:

View File

@@ -0,0 +1,26 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import Optional
import aiohttp
from pipecat.audio.turn.smart_turn.http_smart_turn import HttpSmartTurnAnalyzer
class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
def __init__(
self,
*,
aiohttp_session: aiohttp.ClientSession,
url: str = "https://fal.run/fal-ai/smart-turn/raw",
api_key: Optional[str] = None,
**kwargs,
):
headers = {}
if api_key:
headers = {"Authorization": f"Key {api_key}"}
super().__init__(url=url, aiohttp_session=aiohttp_session, headers=headers, **kwargs)

View File

@@ -0,0 +1,80 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import io
from typing import Any, Dict
import aiohttp
import numpy as np
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn, SmartTurnTimeoutException
class HttpSmartTurnAnalyzer(BaseSmartTurn):
def __init__(
self,
*,
url: str,
aiohttp_session: aiohttp.ClientSession,
headers: Dict[str, str] = {},
**kwargs,
):
super().__init__(**kwargs)
self._url = url
self._headers = headers
self._aiohttp_session = aiohttp_session
def _serialize_array(self, audio_array: np.ndarray) -> bytes:
logger.trace("Serializing NumPy array to bytes...")
buffer = io.BytesIO()
np.save(buffer, audio_array)
serialized_bytes = buffer.getvalue()
logger.trace(f"Serialized size: {len(serialized_bytes)} bytes")
return serialized_bytes
async def _send_raw_request(self, data_bytes: bytes) -> Dict[str, Any]:
headers = {"Content-Type": "application/octet-stream"}
headers.update(self._headers)
logger.trace(f"Sending {len(data_bytes)} bytes as raw body to {self._url}...")
try:
timeout = aiohttp.ClientTimeout(total=self._params.stop_secs)
async with self._aiohttp_session.post(
self._url, data=data_bytes, headers=headers, timeout=timeout
) as response:
logger.trace("\n--- Response ---")
logger.trace(f"Status Code: {response.status}")
if response.status == 200:
try:
json_data = await response.json()
logger.trace("Response JSON:")
logger.trace(json_data)
return json_data
except aiohttp.ContentTypeError:
# Non-JSON response
text = await response.text()
logger.trace("Response Content (non-JSON):")
logger.trace(text)
raise Exception(f"Non-JSON response: {text}")
else:
error_text = await response.text()
logger.trace("Response Content (Error):")
logger.trace(error_text)
response.raise_for_status()
except asyncio.TimeoutError:
logger.error(f"Request timed out after {self._params.stop_secs} seconds")
raise SmartTurnTimeoutException(f"Request exceeded {self._params.stop_secs} seconds.")
except aiohttp.ClientError as e:
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
raise Exception("Failed to send raw request to Daily Smart Turn.")
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
serialized_array = self._serialize_array(audio_array)
return await self._send_raw_request(serialized_array)

View File

@@ -5,17 +5,16 @@
#
import os
from typing import Dict
from typing import Any, Dict
import numpy as np
import torch
from loguru import logger
from pipecat.audio.turn.base_smart_turn import BaseSmartTurn
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
try:
import coremltools as ct
import torch
from transformers import AutoFeatureExtractor
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
@@ -26,7 +25,7 @@ except ModuleNotFoundError as e:
class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
def __init__(self, smart_turn_model_path: str, **kwargs):
def __init__(self, *, smart_turn_model_path: str, **kwargs):
super().__init__(**kwargs)
if not smart_turn_model_path:
@@ -41,7 +40,7 @@ class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
self._turn_model = ct.models.MLModel(core_ml_model_path)
logger.debug("Loaded Local Smart Turn")
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, any]:
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
inputs = self._turn_processor(
audio_array,
sampling_rate=16000,

View File

@@ -30,3 +30,13 @@ class LLMUsageMetricsData(MetricsData):
class TTSUsageMetricsData(MetricsData):
value: int
class SmartTurnMetricsData(MetricsData):
"""Metrics data for smart turn predictions."""
is_complete: bool
probability: float
inference_time_ms: float
server_total_time_ms: float
e2e_processing_time_ms: float

View File

@@ -1 +0,0 @@

View File

@@ -8,6 +8,8 @@ import base64
import json
from typing import Optional
import aiohttp
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import (
@@ -19,6 +21,8 @@ from pipecat.audio.utils import (
)
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
EndFrame,
Frame,
InputAudioRawFrame,
InputDTMFFrame,
@@ -30,38 +34,120 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer
class TelnyxFrameSerializer(FrameSerializer):
"""Serializer for Telnyx WebSocket protocol.
This serializer handles converting between Pipecat frames and Telnyx's WebSocket
media streams protocol. It supports audio conversion, DTMF events, and automatic
call termination.
When auto_hang_up is enabled (default), the serializer will automatically terminate
the Telnyx call when an EndFrame or CancelFrame is processed, but requires Telnyx
credentials to be provided.
Attributes:
_stream_id: The Telnyx Stream ID.
_call_control_id: The associated Telnyx Call Control ID.
_api_key: Telnyx API key for API access.
_params: Configuration parameters.
_telnyx_sample_rate: Sample rate used by Telnyx (typically 8kHz).
_sample_rate: Input sample rate for the pipeline.
_resampler: Audio resampler for format conversion.
_hangup_attempted: Flag to track if hang-up has been attempted.
"""
class InputParams(BaseModel):
telnyx_sample_rate: int = 8000 # Default Telnyx rate (8kHz)
sample_rate: Optional[int] = None # Pipeline input rate
"""Configuration parameters for TelnyxFrameSerializer.
Attributes:
telnyx_sample_rate: Sample rate used by Telnyx, defaults to 8000 Hz.
sample_rate: Optional override for pipeline input sample rate.
inbound_encoding: Audio encoding for data sent to Telnyx (e.g., "PCMU").
outbound_encoding: Audio encoding for data received from Telnyx (e.g., "PCMU").
auto_hang_up: Whether to automatically terminate call on EndFrame.
"""
telnyx_sample_rate: int = 8000
sample_rate: Optional[int] = None
inbound_encoding: str = "PCMU"
outbound_encoding: str = "PCMU"
auto_hang_up: bool = True
def __init__(
self,
stream_id: str,
outbound_encoding: str,
inbound_encoding: str,
call_control_id: Optional[str] = None,
api_key: Optional[str] = None,
params: InputParams = InputParams(),
):
"""Initialize the TelnyxFrameSerializer.
Args:
stream_id: The Stream ID for Telnyx.
outbound_encoding: The encoding type for outbound audio (e.g., "PCMU").
inbound_encoding: The encoding type for inbound audio (e.g., "PCMU").
call_control_id: The Call Control ID for the Telnyx call (optional, but required for auto hang-up).
api_key: Your Telnyx API key (required for auto hang-up).
params: Configuration parameters.
"""
self._stream_id = stream_id
params.outbound_encoding = outbound_encoding
params.inbound_encoding = inbound_encoding
self._call_control_id = call_control_id
self._api_key = api_key
self._params = params
self._telnyx_sample_rate = self._params.telnyx_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._hangup_attempted = False
@property
def type(self) -> FrameSerializerType:
"""Gets the serializer type.
Returns:
The serializer type, either TEXT or BINARY.
"""
return FrameSerializerType.TEXT
async def setup(self, frame: StartFrame):
"""Sets up the serializer with pipeline configuration.
Args:
frame: The StartFrame containing pipeline configuration.
"""
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
async def serialize(self, frame: Frame) -> str | bytes | None:
if isinstance(frame, AudioRawFrame):
"""Serializes a Pipecat frame to Telnyx WebSocket format.
Handles conversion of various frame types to Telnyx WebSocket messages.
For EndFrames and CancelFrames, initiates call termination if auto_hang_up is enabled.
Args:
frame: The Pipecat frame to serialize.
Returns:
Serialized data as string or bytes, or None if the frame isn't handled.
Raises:
ValueError: If an unsupported encoding is specified.
"""
if (
self._params.auto_hang_up
and not self._hangup_attempted
and isinstance(frame, (EndFrame, CancelFrame))
):
self._hangup_attempted = True
await self._hang_up_call()
return None
elif isinstance(frame, StartInterruptionFrame):
answer = {"event": "clear"}
return json.dumps(answer)
elif isinstance(frame, AudioRawFrame):
data = frame.audio
# Output: Convert PCM at frame's rate to 8kHz encoded for Telnyx
@@ -84,11 +170,58 @@ class TelnyxFrameSerializer(FrameSerializer):
return json.dumps(answer)
if isinstance(frame, StartInterruptionFrame):
answer = {"event": "clear"}
return json.dumps(answer)
# Return None for unhandled frames
return None
async def _hang_up_call(self):
"""Hang up the Telnyx call using Telnyx's REST API."""
try:
call_control_id = self._call_control_id
api_key = self._api_key
if not call_control_id or not api_key:
logger.warning(
"Cannot hang up Telnyx call: call_control_id and api_key must be provided"
)
return
# Telnyx API endpoint for hanging up a call
endpoint = f"https://api.telnyx.com/v2/calls/{call_control_id}/actions/hangup"
# Set headers with API key
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
# Make the POST request to hang up the call
async with aiohttp.ClientSession() as session:
async with session.post(endpoint, headers=headers) as response:
if response.status == 200:
logger.info(f"Successfully terminated Telnyx call {call_control_id}")
else:
# Get the error details for better debugging
error_text = await response.text()
logger.error(
f"Failed to terminate Telnyx call {call_control_id}: "
f"Status {response.status}, Response: {error_text}"
)
except Exception as e:
logger.exception(f"Failed to hang up Telnyx call: {e}")
async def deserialize(self, data: str | bytes) -> Frame | None:
"""Deserializes Telnyx WebSocket data to Pipecat frames.
Handles conversion of Telnyx media events to appropriate Pipecat frames,
including audio data and DTMF keypresses.
Args:
data: The raw WebSocket data from Telnyx.
Returns:
A Pipecat frame corresponding to the Telnyx event, or None if unhandled.
Raises:
ValueError: If an unsupported encoding is specified.
"""
message = json.loads(data)
if message["event"] == "media":

View File

@@ -8,11 +8,14 @@ import base64
import json
from typing import Optional
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
EndFrame,
Frame,
InputAudioRawFrame,
InputDTMFFrame,
@@ -26,28 +29,107 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer
class TwilioFrameSerializer(FrameSerializer):
class InputParams(BaseModel):
twilio_sample_rate: int = 8000 # Default Twilio rate (8kHz)
sample_rate: Optional[int] = None # Pipeline input rate
"""Serializer for Twilio Media Streams WebSocket protocol.
def __init__(self, stream_sid: str, params: InputParams = InputParams()):
This serializer handles converting between Pipecat frames and Twilio's WebSocket
media streams protocol. It supports audio conversion, DTMF events, and automatic
call termination.
When auto_hang_up is enabled (default), the serializer will automatically terminate
the Twilio call when an EndFrame or CancelFrame is processed, but requires Twilio
credentials to be provided.
Attributes:
_stream_sid: The Twilio Media Stream SID.
_call_sid: The associated Twilio Call SID.
_account_sid: Twilio account SID for API access.
_auth_token: Twilio authentication token for API access.
_params: Configuration parameters.
_twilio_sample_rate: Sample rate used by Twilio (typically 8kHz).
_sample_rate: Input sample rate for the pipeline.
_resampler: Audio resampler for format conversion.
"""
class InputParams(BaseModel):
"""Configuration parameters for TwilioFrameSerializer.
Attributes:
twilio_sample_rate: Sample rate used by Twilio, defaults to 8000 Hz.
sample_rate: Optional override for pipeline input sample rate.
auto_hang_up: Whether to automatically terminate call on EndFrame.
"""
twilio_sample_rate: int = 8000
sample_rate: Optional[int] = None
auto_hang_up: bool = True
def __init__(
self,
stream_sid: str,
call_sid: Optional[str] = None,
account_sid: Optional[str] = None,
auth_token: Optional[str] = None,
params: InputParams = InputParams(),
):
"""Initialize the TwilioFrameSerializer.
Args:
stream_sid: The Twilio Media Stream SID.
call_sid: The associated Twilio Call SID (optional, but required for auto hang-up).
account_sid: Twilio account SID (required for auto hang-up).
auth_token: Twilio auth token (required for auto hang-up).
params: Configuration parameters.
"""
self._stream_sid = stream_sid
self._call_sid = call_sid
self._account_sid = account_sid
self._auth_token = auth_token
self._params = params
self._twilio_sample_rate = self._params.twilio_sample_rate
self._sample_rate = 0 # Pipeline input rate
self._resampler = create_default_resampler()
self._hangup_attempted = False
@property
def type(self) -> FrameSerializerType:
"""Gets the serializer type.
Returns:
The serializer type, either TEXT or BINARY.
"""
return FrameSerializerType.TEXT
async def setup(self, frame: StartFrame):
"""Sets up the serializer with pipeline configuration.
Args:
frame: The StartFrame containing pipeline configuration.
"""
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
async def serialize(self, frame: Frame) -> str | bytes | None:
if isinstance(frame, StartInterruptionFrame):
"""Serializes a Pipecat frame to Twilio WebSocket format.
Handles conversion of various frame types to Twilio WebSocket messages.
For EndFrames, initiates call termination if auto_hang_up is enabled.
Args:
frame: The Pipecat frame to serialize.
Returns:
Serialized data as string or bytes, or None if the frame isn't handled.
"""
if (
self._params.auto_hang_up
and not self._hangup_attempted
and isinstance(frame, (EndFrame, CancelFrame))
):
self._hangup_attempted = True
await self._hang_up_call()
return None
elif isinstance(frame, StartInterruptionFrame):
answer = {"event": "clear", "streamSid": self._stream_sid}
return json.dumps(answer)
elif isinstance(frame, AudioRawFrame):
@@ -68,7 +150,70 @@ class TwilioFrameSerializer(FrameSerializer):
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
return json.dumps(frame.message)
# Return None for unhandled frames
return None
async def _hang_up_call(self):
"""Hang up the Twilio call using Twilio's REST API."""
try:
import aiohttp
account_sid = self._account_sid
auth_token = self._auth_token
call_sid = self._call_sid
if not call_sid or not account_sid or not auth_token:
missing = []
if not call_sid:
missing.append("call_sid")
if not account_sid:
missing.append("account_sid")
if not auth_token:
missing.append("auth_token")
logger.warning(
f"Cannot hang up Twilio call: missing required parameters: {', '.join(missing)}"
)
return
# Twilio API endpoint for updating calls
endpoint = (
f"https://api.twilio.com/2010-04-01/Accounts/{account_sid}/Calls/{call_sid}.json"
)
# Create basic auth from account_sid and auth_token
auth = aiohttp.BasicAuth(account_sid, auth_token)
# Parameters to set the call status to "completed" (hang up)
params = {"Status": "completed"}
# Make the POST request to update the call
async with aiohttp.ClientSession() as session:
async with session.post(endpoint, auth=auth, data=params) as response:
if response.status == 200:
logger.info(f"Successfully terminated Twilio call {call_sid}")
else:
# Get the error details for better debugging
error_text = await response.text()
logger.error(
f"Failed to terminate Twilio call {call_sid}: "
f"Status {response.status}, Response: {error_text}"
)
except Exception as e:
logger.exception(f"Failed to hang up Twilio call: {e}")
async def deserialize(self, data: str | bytes) -> Frame | None:
"""Deserializes Twilio WebSocket data to Pipecat frames.
Handles conversion of Twilio media events to appropriate Pipecat frames.
Args:
data: The raw WebSocket data from Twilio.
Returns:
A Pipecat frame corresponding to the Twilio event, or None if unhandled.
"""
message = json.loads(data)
if message["event"] == "media":

View File

@@ -126,31 +126,14 @@ def build_elevenlabs_voice_settings(
settings: Dictionary containing voice settings parameters
Returns:
Dictionary of voice settings or None if required parameters are missing
Dictionary of voice settings or None if no valid settings are provided
"""
voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
voice_settings = {}
if settings["stability"] is not None and settings["similarity_boost"] is not None:
voice_settings["stability"] = settings["stability"]
voice_settings["similarity_boost"] = settings["similarity_boost"]
if settings["style"] is not None:
voice_settings["style"] = settings["style"]
if settings["use_speaker_boost"] is not None:
voice_settings["use_speaker_boost"] = settings["use_speaker_boost"]
if settings["speed"] is not None:
voice_settings["speed"] = settings["speed"]
else:
if settings["style"] is not None:
logger.warning(
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
if settings["use_speaker_boost"] is not None:
logger.warning(
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
if settings["speed"] is not None:
logger.warning(
"'speed' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
)
for key in voice_setting_keys:
if key in settings and settings[key] is not None:
voice_settings[key] = settings[key]
return voice_settings or None

View File

@@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import base64
import json
import warnings
@@ -224,6 +225,7 @@ class GladiaSTTService(STTService):
self._params = params
self._websocket = None
self._receive_task = None
self._keepalive_task = None
def language_to_service_language(self, language: Language) -> Optional[str]:
"""Convert pipecat Language enum to Gladia's language code."""
@@ -287,14 +289,22 @@ class GladiaSTTService(STTService):
self._websocket = await websockets.connect(response["url"])
if self._websocket and not self._receive_task:
self._receive_task = self.create_task(self._receive_task_handler())
if self._websocket and not self._keepalive_task:
self._keepalive_task = self.create_task(self._keepalive_task_handler())
async def stop(self, frame: EndFrame):
"""Stop the Gladia STT websocket connection."""
await super().stop(frame)
await self._send_stop_recording()
if self._keepalive_task:
await self.cancel_task(self._keepalive_task)
self._keepalive_task = None
if self._websocket:
await self._websocket.close()
self._websocket = None
if self._receive_task:
await self.wait_for_task(self._receive_task)
self._receive_task = None
@@ -302,7 +312,15 @@ class GladiaSTTService(STTService):
async def cancel(self, frame: CancelFrame):
"""Cancel the Gladia STT websocket connection."""
await super().cancel(frame)
await self._websocket.close()
if self._keepalive_task:
await self.cancel_task(self._keepalive_task)
self._keepalive_task = None
if self._websocket:
await self._websocket.close()
self._websocket = None
if self._receive_task:
await self.cancel_task(self._receive_task)
self._receive_task = None
@@ -341,6 +359,24 @@ class GladiaSTTService(STTService):
if self._websocket and not self._websocket.closed:
await self._websocket.send(json.dumps({"type": "stop_recording"}))
async def _keepalive_task_handler(self):
"""Send periodic empty audio chunks to keep the connection alive."""
try:
while True:
# Send keepalive every 20 seconds (Gladia times out after 30 seconds)
await asyncio.sleep(20)
if self._websocket and not self._websocket.closed:
# Send an empty audio chunk as keepalive
empty_audio = b""
await self._send_audio(empty_audio)
else:
logger.debug("Websocket closed, stopping keepalive")
break
except websockets.exceptions.ConnectionClosed:
logger.debug("Connection closed during keepalive")
except Exception as e:
logger.error(f"Error in Gladia keepalive task: {e}")
async def _receive_task_handler(self):
try:
async for message in self._websocket:

View File

@@ -17,6 +17,8 @@ from loguru import logger
from pipecat.services.openai.llm import OpenAILLMService
try:
from google.auth import default
from google.auth.exceptions import GoogleAuthError
from google.auth.transport.requests import Request
from google.oauth2 import service_account
@@ -100,6 +102,13 @@ class GoogleVertexLLMService(OpenAILLMService):
creds = service_account.Credentials.from_service_account_file(
credentials_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
else:
try:
creds, project_id = default(
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
except GoogleAuthError:
pass
if not creds:
raise ValueError("No valid credentials provided.")

View File

@@ -32,6 +32,8 @@ from pipecat.utils.time import time_now_iso8601
try:
from google.api_core.client_options import ClientOptions
from google.auth import default
from google.auth.exceptions import GoogleAuthError
from google.cloud import speech_v2
from google.cloud.speech_v2.types import cloud_speech
from google.oauth2 import service_account
@@ -451,6 +453,7 @@ class GoogleSTTService(STTService):
client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
# Extract project ID and create client
creds: Optional[service_account.Credentials] = None
if credentials:
json_account_info = json.loads(credentials)
self._project_id = json_account_info.get("project_id")
@@ -461,7 +464,16 @@ class GoogleSTTService(STTService):
self._project_id = json_account_info.get("project_id")
creds = service_account.Credentials.from_service_account_file(credentials_path)
else:
raise ValueError("Either credentials or credentials_path must be provided")
try:
creds, project_id = default(
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
self._project_id = project_id
except GoogleAuthError:
pass
if not creds:
raise ValueError("No valid credentials provided.")
if not self._project_id:
raise ValueError("Project ID not found in credentials")

View File

@@ -27,6 +27,8 @@ from pipecat.services.tts_service import TTSService
from pipecat.transcriptions.language import Language
try:
from google.auth import default
from google.auth.exceptions import GoogleAuthError
from google.cloud import texttospeech_v1
from google.oauth2 import service_account
@@ -251,6 +253,16 @@ class GoogleTTSService(TTSService):
elif credentials_path:
# Use service account JSON file if provided
creds = service_account.Credentials.from_service_account_file(credentials_path)
else:
try:
creds, project_id = default(
scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
except GoogleAuthError:
pass
if not creds:
raise ValueError("No valid credentials provided.")
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)

View File

@@ -42,7 +42,7 @@ class GrokLLMService(OpenAILLMService):
Args:
api_key (str): The API key for accessing Grok's API
base_url (str, optional): The base URL for Grok API. Defaults to "https://api.x.ai/v1"
model (str, optional): The model identifier to use. Defaults to "grok-2"
model (str, optional): The model identifier to use. Defaults to "grok-3-beta"
**kwargs: Additional keyword arguments passed to OpenAILLMService
"""
@@ -51,7 +51,7 @@ class GrokLLMService(OpenAILLMService):
*,
api_key: str,
base_url: str = "https://api.x.ai/v1",
model: str = "grok-2",
model: str = "grok-3-beta",
**kwargs,
):
super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)

View File

@@ -70,7 +70,7 @@ class OpenAITTSService(TTSService):
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {self.sample_rate}Hz may cause issues."
f"Current rate of {sample_rate}Hz may cause issues."
)
super().__init__(sample_rate=sample_rate, **kwargs)

View File

@@ -6,11 +6,14 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from typing import Mapping, Optional
from loguru import logger
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
from pipecat.audio.turn.base_turn_analyzer import (
BaseTurnAnalyzer,
EndOfTurnState,
)
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
from pipecat.frames.frames import (
BotInterruptionFrame,
@@ -21,6 +24,7 @@ from pipecat.frames.frames import (
FilterUpdateSettingsFrame,
Frame,
InputAudioRawFrame,
MetricsFrame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
@@ -29,6 +33,7 @@ from pipecat.frames.frames import (
UserStoppedSpeakingFrame,
VADParamsUpdateFrame,
)
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
@@ -78,6 +83,7 @@ class BaseInputTransport(FrameProcessor):
# Configure End of turn analyzer.
if self._params.turn_analyzer:
self._params.turn_analyzer.set_sample_rate(self._sample_rate)
# Start audio filter.
if self._params.audio_in_filter:
await self._params.audio_in_filter.start(self._sample_rate)
@@ -216,9 +222,8 @@ class BaseInputTransport(FrameProcessor):
async def _handle_end_of_turn(self):
if self.turn_analyzer:
state = await self.get_event_loop().run_in_executor(
self._executor, self.turn_analyzer.analyze_end_of_turn
)
state, prediction = await self.turn_analyzer.analyze_end_of_turn()
await self._handle_prediction_result(prediction)
await self._handle_end_of_turn_complete(state)
async def _handle_end_of_turn_complete(self, state: EndOfTurnState):
@@ -263,3 +268,11 @@ class BaseInputTransport(FrameProcessor):
await self.push_frame(frame)
self._audio_in_queue.task_done()
async def _handle_prediction_result(self, result: MetricsData):
"""Handle a prediction result event from the turn analyzer.
Args:
result: The prediction result MetricsData.
"""
await self.push_frame(MetricsFrame(data=[result]))

View File

@@ -207,10 +207,12 @@ class FastAPIWebsocketOutputTransport(BaseOutputTransport):
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._write_frame(frame)
await self._client.disconnect()
async def cancel(self, frame: CancelFrame):
await super().cancel(frame)
await self._write_frame(frame)
await self._client.disconnect()
async def cleanup(self):

View File

@@ -157,7 +157,8 @@ class WebsocketServerInputTransport(BaseInputTransport):
self, websocket: websockets.WebSocketServerProtocol, session_timeout: int
):
"""Wait for session_timeout seconds, if the websocket is still open,
trigger timeout event."""
trigger timeout event.
"""
try:
await asyncio.sleep(session_timeout)
if not websocket.closed:
@@ -195,6 +196,14 @@ class WebsocketServerOutputTransport(BaseOutputTransport):
await self._params.serializer.setup(frame)
self._send_interval = (self._audio_chunk_size / self.sample_rate) / 2
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._write_frame(frame)
async def cancel(self, frame: CancelFrame):
await super().cancel(frame)
await self._write_frame(frame)
async def cleanup(self):
await super().cleanup()
await self._transport.cleanup()