Compare commits
2 Commits
v0.0.65
...
hush/custo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dddfd791e1 | ||
|
|
e721c2086c |
87
.github/ISSUE_TEMPLATE/1-bug_report.yml
vendored
87
.github/ISSUE_TEMPLATE/1-bug_report.yml
vendored
@@ -1,87 +0,0 @@
|
||||
name: Bug report
|
||||
description: Report a bug or unexpected behavior
|
||||
type: Bug
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Bug Report
|
||||
|
||||
Thank you for taking the time to fill out this bug report.
|
||||
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
### Environment
|
||||
|
||||
- type: input
|
||||
id: pipecat-version
|
||||
attributes:
|
||||
label: pipecat version
|
||||
description: Which version are you using?
|
||||
placeholder: e.g., 0.0.63
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: python-version
|
||||
attributes:
|
||||
label: Python version
|
||||
description: Which Python version are you using?
|
||||
placeholder: e.g., 3.12.8
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: Operating System
|
||||
description: Which OS are you using?
|
||||
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Issue description
|
||||
description: Provide a clear description of the issue.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: repro
|
||||
attributes:
|
||||
label: Reproduction steps
|
||||
description: List the steps to reproduce the issue.
|
||||
placeholder: |
|
||||
1. Do this...
|
||||
2. Then do that...
|
||||
3. Observe the error...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: expected
|
||||
attributes:
|
||||
label: Expected behavior
|
||||
description: What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: actual
|
||||
attributes:
|
||||
label: Actual behavior
|
||||
description: What actually happened?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Logs
|
||||
description: If applicable, include any relevant logs or error messages
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
67
.github/ISSUE_TEMPLATE/2-question.yml
vendored
67
.github/ISSUE_TEMPLATE/2-question.yml
vendored
@@ -1,67 +0,0 @@
|
||||
name: Question
|
||||
description: Ask a question or get help
|
||||
type: Question
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Question
|
||||
|
||||
Use this form to ask a question about pipecat.
|
||||
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
### Environment (if applicable)
|
||||
|
||||
- type: input
|
||||
id: pipecat-version
|
||||
attributes:
|
||||
label: pipecat version
|
||||
description: Which version are you using? (if applicable)
|
||||
placeholder: e.g., 0.0.63
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: input
|
||||
id: python-version
|
||||
attributes:
|
||||
label: Python version
|
||||
description: Which Python version are you using? (if applicable)
|
||||
placeholder: e.g., 3.12.8
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: Operating System
|
||||
description: Which OS are you using? (if applicable)
|
||||
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: question
|
||||
attributes:
|
||||
label: Question
|
||||
description: Provide your question in detail here.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: tried
|
||||
attributes:
|
||||
label: What I've tried
|
||||
description: Describe what you've already tried or research you've done.
|
||||
placeholder: I've looked at the documentation and tried...
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: context
|
||||
attributes:
|
||||
label: Context
|
||||
description: Any additional context or information that might help others understand your question better.
|
||||
validations:
|
||||
required: false
|
||||
52
.github/ISSUE_TEMPLATE/3-feature_request.yml
vendored
52
.github/ISSUE_TEMPLATE/3-feature_request.yml
vendored
@@ -1,52 +0,0 @@
|
||||
name: Feature request
|
||||
description: Suggest an enhancement or new feature
|
||||
type: Enhancement
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Feature Request
|
||||
|
||||
Thank you for suggesting an enhancement to pipecat.
|
||||
|
||||
- type: textarea
|
||||
id: problem
|
||||
attributes:
|
||||
label: Problem Statement
|
||||
description: A clear description of the problem this feature would solve.
|
||||
placeholder: I'm always frustrated when...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: solution
|
||||
attributes:
|
||||
label: Proposed Solution
|
||||
description: A clear and concise description of what you want to happen.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: alternatives
|
||||
attributes:
|
||||
label: Alternative Solutions
|
||||
description: Any alternative solutions or features you've considered.
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: context
|
||||
attributes:
|
||||
label: Additional Context
|
||||
description: Add any other context, mockups, or screenshots about the feature request here.
|
||||
placeholder: You can drag and drop images here to include them.
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: checkboxes
|
||||
id: contribution
|
||||
attributes:
|
||||
label: Would you be willing to help implement this feature?
|
||||
options:
|
||||
- label: Yes, I'd like to contribute
|
||||
- label: No, I'm just suggesting
|
||||
82
.github/ISSUE_TEMPLATE/4-service-issue.yml
vendored
82
.github/ISSUE_TEMPLATE/4-service-issue.yml
vendored
@@ -1,82 +0,0 @@
|
||||
name: Service Issue
|
||||
description: An issue with a third-party service
|
||||
type: Service Issue
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Service Issue
|
||||
|
||||
Use this form to report an issue with a third-party service integration.
|
||||
|
||||
- type: input
|
||||
id: pipecat-version
|
||||
attributes:
|
||||
label: pipecat version
|
||||
description: Which version are you using?
|
||||
placeholder: e.g., 0.0.63
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: service-name
|
||||
attributes:
|
||||
label: Service Name
|
||||
description: Which third-party service is having issues?
|
||||
placeholder: e.g., OpenAI, ElevenLabs, Anthropic
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: service-version
|
||||
attributes:
|
||||
label: Service or model version
|
||||
description: Which version of the service API or model are you using?
|
||||
placeholder: e.g., v1, gpt-4.1
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Issue Description
|
||||
description: Provide a clear description of the service issue.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Reproduction Steps
|
||||
description: Provide steps to reproduce the issue.
|
||||
placeholder: |
|
||||
1. Configure service X
|
||||
2. Call method Y
|
||||
3. See error Z
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: expected
|
||||
attributes:
|
||||
label: Expected Behavior
|
||||
description: What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: actual
|
||||
attributes:
|
||||
label: Actual Behavior
|
||||
description: What actually happened?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Error Logs
|
||||
description: If available, include any error messages or logs.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
56
.github/ISSUE_TEMPLATE/5-new-service.yml
vendored
56
.github/ISSUE_TEMPLATE/5-new-service.yml
vendored
@@ -1,56 +0,0 @@
|
||||
name: New Service
|
||||
description: Request to support a new third-party service
|
||||
type: New Service
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## New Service Request
|
||||
|
||||
Use this form to request support for a new third-party service in pipecat.
|
||||
|
||||
- type: input
|
||||
id: service-name
|
||||
attributes:
|
||||
label: Service Name
|
||||
description: What is the name of the third-party service?
|
||||
placeholder: e.g., NewAPI, SomeService
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: service-website
|
||||
attributes:
|
||||
label: Service Website
|
||||
description: Link to the service's website or documentation
|
||||
placeholder: e.g., https://newapi.com
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: service-description
|
||||
attributes:
|
||||
label: Service Description
|
||||
description: Briefly describe what this service does and how it works.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: api-info
|
||||
attributes:
|
||||
label: API Information
|
||||
description: If available, provide details about the service's API.
|
||||
placeholder: |
|
||||
- API documentation link
|
||||
- Authentication method
|
||||
- Key endpoints you'd like supported
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: checkboxes
|
||||
id: contribution
|
||||
attributes:
|
||||
label: Would you be willing to help implement this service?
|
||||
options:
|
||||
- label: Yes, I'd like to contribute
|
||||
- label: No, I'm just suggesting
|
||||
74
.github/ISSUE_TEMPLATE/6-dependency.yml
vendored
74
.github/ISSUE_TEMPLATE/6-dependency.yml
vendored
@@ -1,74 +0,0 @@
|
||||
name: Dependency Issue
|
||||
description: An issue with a Pipecat dependency (not a third-party service)
|
||||
type: Dependency Issue
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Dependency Issue
|
||||
|
||||
Use this form to report an issue with a Pipecat dependency.
|
||||
|
||||
- type: input
|
||||
id: pipecat-version
|
||||
attributes:
|
||||
label: pipecat version
|
||||
description: Which version are you using?
|
||||
placeholder: e.g., 0.0.63
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: dependency-name
|
||||
attributes:
|
||||
label: Dependency Name
|
||||
description: Which Pipecat dependency is causing the issue?
|
||||
placeholder: e.g., openai, anthropic, fastapi
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: dependency-version
|
||||
attributes:
|
||||
label: Dependency Version
|
||||
description: Which version of the dependency are you using?
|
||||
placeholder: e.g., 1.2.3
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Issue Description
|
||||
description: Provide a clear description of the dependency issue.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: impact
|
||||
attributes:
|
||||
label: Impact
|
||||
description: How is this dependency issue affecting your usage of pipecat?
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Reproduction Steps
|
||||
description: If applicable, provide steps to reproduce the issue.
|
||||
placeholder: |
|
||||
1. Install dependency X
|
||||
2. Run command Y
|
||||
3. See error Z
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Error Logs
|
||||
description: If applicable, include any relevant error messages or logs.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
70
.github/ISSUE_TEMPLATE/7-troubleshooting.yml
vendored
70
.github/ISSUE_TEMPLATE/7-troubleshooting.yml
vendored
@@ -1,70 +0,0 @@
|
||||
name: Troubleshooting
|
||||
description: Help with a specific use case
|
||||
type: Troubleshooting
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Troubleshooting Request
|
||||
|
||||
Use this form to get help with a specific use case or implementation.
|
||||
|
||||
- type: input
|
||||
id: pipecat-version
|
||||
attributes:
|
||||
label: pipecat version
|
||||
description: Which version are you using?
|
||||
placeholder: e.g., 0.0.63
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: python-version
|
||||
attributes:
|
||||
label: Python version
|
||||
description: Which version of Python are you using?
|
||||
placeholder: e.g., 3.12.8
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: Operating System
|
||||
description: Which OS are you using?
|
||||
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: use-case
|
||||
attributes:
|
||||
label: Use Case Description
|
||||
description: Describe what you're trying to accomplish with pipecat.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: current-approach
|
||||
attributes:
|
||||
label: Current Approach
|
||||
description: What have you tried so far? Include code snippets if relevant.
|
||||
render: python
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: errors
|
||||
attributes:
|
||||
label: Errors or Unexpected Behavior
|
||||
description: Describe any errors or unexpected behavior you're encountering.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: additional-context
|
||||
attributes:
|
||||
label: Additional Context
|
||||
description: Any other information that might help us understand your situation.
|
||||
validations:
|
||||
required: false
|
||||
1
.github/ISSUE_TEMPLATE/config.yml
vendored
1
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -1 +0,0 @@
|
||||
blank_issues_enabled: false
|
||||
52
CHANGELOG.md
52
CHANGELOG.md
@@ -5,52 +5,14 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.0.65] - 2025-04-23 "Sant Jordi's release"
|
||||
|
||||
https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Added automatic hangup logic to the Telnyx serializer. This feature hangs up
|
||||
the Telnyx call when an `EndFrame` or `CancelFrame` is received. It is
|
||||
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
|
||||
|
||||
- Added a keepalive task to `GladiaSTTService` to prevent the websocket from
|
||||
disconnecting after 30 seconds of no audio input.
|
||||
|
||||
### Changed
|
||||
|
||||
- The `InputParams` for `ElevenLabsTTSService` and `ElevenLabsHttpTTSService`
|
||||
no longer require that `stability` and `similarity_boost` be set. You can
|
||||
individually set each param.
|
||||
|
||||
- In `TwilioFrameSerializer`, `call_sid` is Optional so as to avoid a breaking
|
||||
changed. `call_sid` is required to automatically hang up.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `TwilioFrameSerializer` would send two hang up commands:
|
||||
one for the `EndFrame` and one for the `CancelFrame`.
|
||||
|
||||
## [0.0.64] - 2025-04-22
|
||||
|
||||
### Added
|
||||
|
||||
- Added automatic hangup logic to the Twilio serializer. This feature hangs up
|
||||
the Twilio call when an `EndFrame` or `CancelFrame` is received. It is
|
||||
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
|
||||
|
||||
- Added `SmartTurnMetricsData`, which contains end-of-turn prediction metrics,
|
||||
to the `MetricsFrame`. Using `MetricsFrame`, you can now retrieve prediction
|
||||
confidence scores and processing time metrics from the smart turn analyzers.
|
||||
|
||||
- Added support for Application Default Credentials in Google services,
|
||||
`GoogleSTTService`, `GoogleTTSService`, and `GoogleVertexLLMService`.
|
||||
|
||||
- Added support for Smart Turn Detection via the `turn_analyzer` transport
|
||||
parameter. You can now choose between `HttpSmartTurnAnalyzer()` or
|
||||
`FalSmartTurnAnalyzer()` for remote inference or
|
||||
`LocalCoreMLSmartTurnAnalyzer()` for on-device inference using Core ML.
|
||||
parameter. You can now choose between `SmartTurnAnalyzer()` for remote
|
||||
inference or `LocalCoreMLSmartTurnAnalyzer()` for on-device inference using
|
||||
Core ML.
|
||||
|
||||
- `DeepgramTTSService` accepts `base_url` argument again, allowing you to
|
||||
connect to an on-prem service.
|
||||
@@ -75,8 +37,6 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
|
||||
|
||||
### Changed
|
||||
|
||||
- `GrokLLMService` now uses `grok-3-beta` as its default model.
|
||||
|
||||
- Daily's REST helpers now include an `eject_at_token_exp` param, which ejects
|
||||
the user when their token expires. This new parameter defaults to False.
|
||||
Also, the default value for `enable_prejoin_ui` changed to False and
|
||||
@@ -111,10 +71,6 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
|
||||
- Fixed an issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
|
||||
unexpected behavior during inference.
|
||||
|
||||
### Other
|
||||
|
||||
- Updated the `twilio-chatbot` example to use the auto-hangup feature.
|
||||
|
||||
## [0.0.63] - 2025-04-11
|
||||
|
||||
### Added
|
||||
|
||||
233
README.md
233
README.md
@@ -1,72 +1,43 @@
|
||||
<h1><div align="center">
|
||||
<img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
|
||||
<img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
|
||||
</div></h1>
|
||||
|
||||
[](https://pypi.org/project/pipecat-ai)  [](https://codecov.io/gh/pipecat-ai/pipecat) [](https://docs.pipecat.ai) [](https://discord.gg/pipecat)
|
||||
|
||||
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
|
||||
Pipecat is an open source Python framework for building voice and multimodal conversational agents. It handles the complex orchestration of AI services, network transport, audio processing, and multimodal interactions, letting you focus on creating engaging experiences.
|
||||
|
||||
**Pipecat** is an open-source Python framework for building real-time voice and multimodal conversational agents. Orchestrate audio and video, AI services, different transports, and conversation pipelines effortlessly—so you can focus on what makes your agent unique.
|
||||
## What you can build
|
||||
|
||||
## 🚀 What You Can Build
|
||||
- **Voice Assistants**: [Natural, real-time conversations with AI](https://demo.dailybots.ai/)
|
||||
- **Interactive Agents**: Personal coaches and meeting assistants
|
||||
- **Multimodal Apps**: Combine voice, video, images, and text
|
||||
- **Creative Tools**: [Story-telling experiences](https://storytelling-chatbot.fly.dev/) and social companions
|
||||
- **Business Solutions**: [Customer intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0) and support bots
|
||||
- **Complex conversational flows**: [Refer to Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) to learn more
|
||||
|
||||
- **Voice Assistants** – natural, streaming conversations with AI
|
||||
- **AI Companions** – coaches, meeting assistants, characters
|
||||
- **Multimodal Interfaces** – voice, video, images, and more
|
||||
- **Interactive Storytelling** – creative tools with generative media
|
||||
- **Business Agents** – customer intake, support bots, guided flows
|
||||
- **Complex Dialog Systems** – design logic with structured conversations
|
||||
|
||||
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
## 🧠 Why Pipecat?
|
||||
|
||||
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
|
||||
- **Pluggable**: Supports many AI services and tools
|
||||
- **Composable Pipelines**: Build complex behavior from modular components
|
||||
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
|
||||
|
||||
## 🎬 See it in action
|
||||
## See it in action
|
||||
|
||||
<p float="left">
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="280" /></a>
|
||||
<br/>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="400" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="280" /></a>
|
||||
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
|
||||
</p>
|
||||
|
||||
## 📱 Client SDKs
|
||||
## Key features
|
||||
|
||||
You can connect to Pipecat from any platform using our official SDKs:
|
||||
- **Voice-first Design**: Built-in speech recognition, TTS, and conversation handling
|
||||
- **Flexible Integration**: Works with popular AI services (OpenAI, ElevenLabs, etc.)
|
||||
- **Pipeline Architecture**: Build complex apps from simple, reusable components
|
||||
- **Real-time Processing**: Frame-based pipeline architecture for fluid interactions
|
||||
- **Production Ready**: Enterprise-grade WebRTC and Websocket support
|
||||
|
||||
| Platform | SDK Repo | Description |
|
||||
| -------- | ------------------------------------------------------------------------------ | -------------------------------- |
|
||||
| Web | [pipecat-client-web](https://github.com/pipecat-ai/pipecat-client-web) | JavaScript and React client SDKs |
|
||||
| iOS | [pipecat-client-ios](https://github.com/pipecat-ai/pipecat-client-ios) | Swift SDK for iOS |
|
||||
| Android | [pipecat-client-android](https://github.com/pipecat-ai/pipecat-client-android) | Kotlin SDK for Android |
|
||||
| C++ | [pipecat-client-cxx](https://github.com/pipecat-ai/pipecat-client-cxx) | C++ client SDK |
|
||||
💡 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
|
||||
|
||||
## 🧩 Available services
|
||||
## Getting started
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) |
|
||||
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
## ⚡ Getting started
|
||||
|
||||
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you’re ready.
|
||||
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you’re ready. You can also add a 📞 telephone number, 🖼️ image output, 📺 video input, use different LLMs, and more.
|
||||
|
||||
```shell
|
||||
# Install the module
|
||||
@@ -82,51 +53,141 @@ To keep things lightweight, only the core framework is included by default. If y
|
||||
pip install "pipecat-ai[option,...]"
|
||||
```
|
||||
|
||||
## 🧪 Code examples
|
||||
### Available services
|
||||
|
||||
| Category | Services | Install Command Example |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | `pip install "pipecat-ai[openai]"` |
|
||||
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` |
|
||||
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | `pip install "pipecat-ai[google]"` |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | `pip install "pipecat-ai[daily]"` |
|
||||
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | `pip install "pipecat-ai[tavus,simli]"` |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | `pip install "pipecat-ai[mem0]"` |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | `pip install "pipecat-ai[moondream]"` |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | `pip install "pipecat-ai[silero]"` |
|
||||
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | `pip install "pipecat-ai[canonical]"` |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
## Code examples
|
||||
|
||||
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
|
||||
- [Example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
|
||||
|
||||
## 🛠️ Hacking on the framework itself
|
||||
## A simple voice agent running locally
|
||||
|
||||
1. Set up a virtual environment before following these instructions. From the root of the repo:
|
||||
Here is a very basic Pipecat bot that greets a user when they join a real-time session. We'll use [Daily](https://daily.co) for real-time media transport, and [Cartesia](https://cartesia.ai/) for text-to-speech.
|
||||
|
||||
```shell
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
```
|
||||
```python
|
||||
import asyncio
|
||||
|
||||
2. Install the development dependencies:
|
||||
from pipecat.frames.frames import TextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
```shell
|
||||
pip install -r dev-requirements.txt
|
||||
```
|
||||
async def main():
|
||||
# Use Daily as a real-time media transport (WebRTC)
|
||||
transport = DailyTransport(
|
||||
room_url=...,
|
||||
token="", # leave empty. Note: token is _not_ your api key
|
||||
bot_name="Bot Name",
|
||||
params=DailyParams(audio_out_enabled=True))
|
||||
|
||||
3. Install the git pre-commit hooks (these help ensure your code follows project rules):
|
||||
# Use Cartesia for Text-to-Speech
|
||||
tts = CartesiaTTSService(
|
||||
api_key=...,
|
||||
voice_id=...
|
||||
)
|
||||
|
||||
```shell
|
||||
pre-commit install
|
||||
```
|
||||
# Simple pipeline that will process text to speech and output the result
|
||||
pipeline = Pipeline([tts, transport.output()])
|
||||
|
||||
4. Install the `pipecat-ai` package locally in editable mode:
|
||||
# Create Pipecat processor that can run one or more pipelines tasks
|
||||
runner = PipelineRunner()
|
||||
|
||||
```shell
|
||||
pip install -e .
|
||||
```
|
||||
# Assign the task callable to run the pipeline
|
||||
task = PipelineTask(pipeline)
|
||||
|
||||
> The `-e` or `--editable` option allows you to modify the code without reinstalling.
|
||||
# Register an event handler to play audio when a
|
||||
# participant joins the transport WebRTC session
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
participant_name = participant.get("info", {}).get("userName", "")
|
||||
# Queue a TextFrame that will get spoken by the TTS service (Cartesia)
|
||||
await task.queue_frame(TextFrame(f"Hello there, {participant_name}!"))
|
||||
|
||||
5. Include optional dependencies as needed. For example:
|
||||
# Register an event handler to exit the application when the user leaves.
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
await task.cancel()
|
||||
|
||||
```shell
|
||||
pip install -e ".[daily,deepgram,cartesia,openai,silero]"
|
||||
```
|
||||
# Run the pipeline task
|
||||
await runner.run(task)
|
||||
|
||||
6. (Optional) If you want to use this package from another directory:
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```shell
|
||||
pip install "path_to_this_repo[option,...]"
|
||||
```
|
||||
Run it with:
|
||||
|
||||
```shell
|
||||
python app.py
|
||||
```
|
||||
|
||||
Daily provides a prebuilt WebRTC user interface. While the app is running, you can visit at `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
|
||||
|
||||
## WebRTC for production use
|
||||
|
||||
WebSockets are fine for server-to-server communication or for initial development. But for production use, you’ll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/#webrtc))
|
||||
|
||||
One way to get up and running quickly with WebRTC is to sign up for a Daily developer account. Daily gives you SDKs and global infrastructure for audio (and video) routing. Every account gets 10,000 audio/video/transcription minutes free each month.
|
||||
|
||||
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard.
|
||||
|
||||
## Hacking on the framework itself
|
||||
|
||||
_Note: You may need to set up a virtual environment before following these instructions. From the root of the repo:_
|
||||
|
||||
```shell
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
```
|
||||
|
||||
Install the development dependencies:
|
||||
|
||||
```shell
|
||||
pip install -r dev-requirements.txt
|
||||
```
|
||||
|
||||
Install the git pre-commit hooks (these help ensure your code follows project rules):
|
||||
|
||||
```shell
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
Install the `pipecat-ai` package locally in editable mode:
|
||||
|
||||
```shell
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
The `-e` or `--editable` option allows you to modify the code without reinstalling.
|
||||
|
||||
To include optional dependencies, add them to the install command. For example:
|
||||
|
||||
```shell
|
||||
pip install -e ".[daily,deepgram,cartesia,openai,silero]" # Updated for the services you're using
|
||||
```
|
||||
|
||||
If you want to use this package from another directory:
|
||||
|
||||
```shell
|
||||
pip install "path_to_this_repo[option,...]"
|
||||
```
|
||||
|
||||
### Running tests
|
||||
|
||||
@@ -136,11 +197,11 @@ From the root directory, run:
|
||||
pytest
|
||||
```
|
||||
|
||||
### Setting up your editor
|
||||
## Setting up your editor
|
||||
|
||||
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
|
||||
|
||||
#### Emacs
|
||||
### Emacs
|
||||
|
||||
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
|
||||
|
||||
@@ -162,7 +223,7 @@ You can use [use-package](https://github.com/jwiegley/use-package) to install [e
|
||||
:hook ((python-mode . pyvenv-auto-run)))
|
||||
```
|
||||
|
||||
#### Visual Studio Code
|
||||
### Visual Studio Code
|
||||
|
||||
Install the
|
||||
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
|
||||
@@ -174,7 +235,7 @@ Install the
|
||||
}
|
||||
```
|
||||
|
||||
#### PyCharm
|
||||
### PyCharm
|
||||
|
||||
`ruff` was installed in the `venv` environment described before, now to enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
|
||||
|
||||
@@ -184,7 +245,7 @@ Install the
|
||||
4. **Arguments**: `format $FilePath$`
|
||||
5. **Program**: `$PyInterpreterDirectory$/ruff`
|
||||
|
||||
## 🤝 Contributing
|
||||
## Contributing
|
||||
|
||||
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
|
||||
|
||||
@@ -197,7 +258,7 @@ Before submitting a pull request, please check existing issues and PRs to avoid
|
||||
|
||||
We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
|
||||
|
||||
## 🛟 Getting help
|
||||
## Getting help
|
||||
|
||||
➡️ [Join our Discord](https://discord.gg/pipecat)
|
||||
|
||||
|
||||
22
docs/ISSUE_TEMPLATE.md
Normal file
22
docs/ISSUE_TEMPLATE.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Description
|
||||
Is this reporting a bug or feature request?
|
||||
|
||||
|
||||
If reporting a bug, please fill out the following:
|
||||
|
||||
### Environment
|
||||
- pipecat-ai version:
|
||||
- python version:
|
||||
- OS:
|
||||
|
||||
### Issue description
|
||||
Provide a clear description of the issue.
|
||||
|
||||
### Repro steps
|
||||
List the steps to reproduce the issue.
|
||||
|
||||
### Expected behavior
|
||||
|
||||
### Actual behavior
|
||||
|
||||
### Logs
|
||||
@@ -96,8 +96,4 @@ PIPER_BASE_URL=...
|
||||
|
||||
# Smart turn
|
||||
LOCAL_SMART_TURN_MODEL_PATH=
|
||||
FAL_SMART_TURN_API_KEY=...
|
||||
|
||||
# Twilio
|
||||
TWILIO_ACCOUNT_SID=
|
||||
TWILIO_AUTH_TOKEN=
|
||||
REMOTE_SMART_TURN_URL=
|
||||
@@ -9,10 +9,11 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
|
||||
from pipecat.frames.frames import EndFrame, TranscriptionFrame, TTSSpeakFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
|
||||
@@ -10,7 +10,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, MetricsFrame
|
||||
from pipecat.frames.frames import Frame, MetricsFrame, TranscriptionFrame, TTSSpeakFrame
|
||||
from pipecat.metrics.metrics import (
|
||||
LLMUsageMetricsData,
|
||||
ProcessingMetricsData,
|
||||
@@ -32,7 +32,30 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# Custom processor that prints a message if it receives a TranscriptionFrame that says "banana"
|
||||
class BananaProcessor(FrameProcessor):
|
||||
"""A custom processor that listens for transcription frames containing the word 'banana'."""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
# Ensure the super method is called first
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
logger.debug(f"Received transcription frame: {frame.text}")
|
||||
if "banana" in frame.text.lower():
|
||||
logger.info("---- Received 'banana' in transcription frame")
|
||||
|
||||
# Push the frame after processing
|
||||
await self.push_frame(frame)
|
||||
|
||||
|
||||
class MetricsLogger(FrameProcessor):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
@@ -87,10 +110,13 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
banana = BananaProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
banana,
|
||||
context_aggregator.user(),
|
||||
llm,
|
||||
tts,
|
||||
|
||||
@@ -40,7 +40,7 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection):
|
||||
|
||||
stt = OpenAISTTService(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model="gpt-4o-transcribe",
|
||||
model="gpt-4o-transcribe-latest",
|
||||
prompt="Expect words related to dogs, such as breed names.",
|
||||
)
|
||||
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.fal_smart_turn import FalSmartTurnAnalyzer
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def run_bot(webrtc_connection: SmallWebRTCConnection):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = SmallWebRTCTransport(
|
||||
webrtc_connection=webrtc_connection,
|
||||
params=TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
vad_audio_passthrough=True,
|
||||
turn_analyzer=FalSmartTurnAnalyzer(
|
||||
api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=session
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
report_only_initial_ttfb=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
@transport.event_handler("on_client_closed")
|
||||
async def on_client_closed(transport, client):
|
||||
logger.info(f"Client closed connection")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from run import main
|
||||
|
||||
main()
|
||||
111
examples/foundational/38-smart-turn.py
Normal file
111
examples/foundational/38-smart-turn.py
Normal file
@@ -0,0 +1,111 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn import SmartTurnAnalyzer
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def run_bot(webrtc_connection: SmallWebRTCConnection):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
remote_smart_turn_url = os.getenv("REMOTE_SMART_TURN_URL")
|
||||
|
||||
transport = SmallWebRTCTransport(
|
||||
webrtc_connection=webrtc_connection,
|
||||
params=TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
vad_audio_passthrough=True,
|
||||
turn_analyzer=SmartTurnAnalyzer(url=remote_smart_turn_url),
|
||||
),
|
||||
)
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
report_only_initial_ttfb=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([context_aggregator.user().get_context_frame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
|
||||
@transport.event_handler("on_client_closed")
|
||||
async def on_client_closed(transport, client):
|
||||
logger.info(f"Client closed connection")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from run import main
|
||||
|
||||
main()
|
||||
@@ -9,8 +9,8 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_coreml_smart_turn import LocalCoreMLSmartTurnAnalyzer
|
||||
from pipecat.audio.turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.local_smart_turn import LocalCoreMLSmartTurnAnalyzer
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
@@ -7,6 +7,7 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import importlib.util
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
@@ -17,7 +18,6 @@ import uvicorn
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI
|
||||
from fastapi.responses import RedirectResponse
|
||||
from loguru import logger
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
@@ -25,6 +25,14 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
# Load environment variables
|
||||
load_dotenv(override=True)
|
||||
|
||||
# Configure logger
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(message)s",
|
||||
handlers=[logging.StreamHandler()],
|
||||
)
|
||||
logger = logging.getLogger("pipecat-server")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Store connections by pc_id
|
||||
@@ -154,11 +162,10 @@ def main():
|
||||
parser.add_argument("--verbose", "-v", action="count", default=0)
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.remove(0)
|
||||
if args.verbose:
|
||||
logger.add(sys.stderr, level="TRACE")
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Infer the bot file from the caller if not provided explicitly
|
||||
bot_file = args.bot_file
|
||||
|
||||
@@ -26,6 +26,9 @@ from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class MirrorProcessor(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict
|
||||
|
||||
@@ -15,7 +9,6 @@ from bot import run_bot
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI
|
||||
from fastapi.responses import RedirectResponse
|
||||
from loguru import logger
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
@@ -23,6 +16,8 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
# Load environment variables
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger = logging.getLogger("pc")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Store connections by pc_id
|
||||
@@ -86,10 +81,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--verbose", "-v", action="count")
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.remove(0)
|
||||
if args.verbose:
|
||||
logger.add(sys.stderr, level="TRACE")
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@@ -25,6 +25,9 @@ from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
class EdgeDetectionProcessor(FrameProcessor):
|
||||
def __init__(self, camera_out_width, camera_out_height: int):
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict
|
||||
|
||||
@@ -15,7 +9,6 @@ from bot import run_bot
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI
|
||||
from fastapi.responses import RedirectResponse
|
||||
from loguru import logger
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
@@ -23,6 +16,8 @@ from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
# Load environment variables
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger = logging.getLogger("pc")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Store connections by pc_id
|
||||
@@ -86,10 +81,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--verbose", "-v", action="count")
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.remove(0)
|
||||
if args.verbose:
|
||||
logger.add(sys.stderr, level="TRACE")
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@@ -20,6 +20,10 @@ from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
SYSTEM_INSTRUCTION = f"""
|
||||
"You are Gemini Chatbot, a friendly, helpful robot.
|
||||
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict
|
||||
|
||||
@@ -15,13 +9,14 @@ from bot import run_bot
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI
|
||||
from fastapi.responses import FileResponse
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger = logging.getLogger("pc")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Store connections by pc_id
|
||||
@@ -78,10 +73,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--verbose", "-v", action="count")
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.remove(0)
|
||||
if args.verbose:
|
||||
logger.add(sys.stderr, level="TRACE")
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
DAILY_SAMPLE_ROOM_URL=https://yourdomain.daily.co/yourroom # (optional: for joining the bot to the same room repeatedly for local dev)
|
||||
DAILY_API_KEY=
|
||||
DAILY_API_URL=https://api.daily.co/v1
|
||||
DEEPGRAM_API_KEY=
|
||||
DAILY_API_URL=api.daily.co/v1
|
||||
OPENAI_API_KEY=
|
||||
GOOGLE_API_KEY
|
||||
CARTESIA_API_KEY=
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
pipecat-ai[daily,cartesia,deepgram,openai,google,silero]
|
||||
fastapi==0.115.6
|
||||
pipecat-ai[daily,cartesia,openai,google,silero]
|
||||
fastapi==3.11.12
|
||||
uvicorn
|
||||
python-dotenv
|
||||
twilio
|
||||
|
||||
@@ -63,35 +63,20 @@ This project is a FastAPI-based chatbot that integrates with Telnyx to handle We
|
||||
ngrok http 8765
|
||||
```
|
||||
|
||||
2. **Purchase a number**
|
||||
2. **Update the Telnyx TeXML applications Webhook**:
|
||||
|
||||
If you haven't already, purchase a number from Telnyx.
|
||||
- Go to your TeXML configuration page
|
||||
- Provide the ngrok URL to the Webhook URL field and ensure the POST method is selected
|
||||
- Click Save at the bottom of the page
|
||||
|
||||
- Log in to the Telnyx developer portal: https://portal.telnyx.com/
|
||||
- Buy a number: https://portal.telnyx.com/#/numbers/buy-numbers
|
||||
|
||||
3. **Update the Telnyx TeXML applications Webhook**:
|
||||
|
||||
- Go to your TeXML configuration page: https://portal.telnyx.com/#/call-control/texml
|
||||
- Create a new TeXML app, if one doesn't exist already:
|
||||
- Add an application name
|
||||
- Under Webhooks, select POST as the "Voice Method"
|
||||
- Select "Custom URL" under Webhook URL Method
|
||||
- Enter your ngrok URL in the "Webhook URL" field (e.g. https://your-name.ngrok.io)
|
||||
- Click "Create" to save
|
||||
Note: You'll see subsequent pages to set up SIP and Outbound, both are not required, so just skip.
|
||||
- Navigate to "Manage Numbers" (https://portal.telnyx.com/#/numbers/my-numbers) and under SIP connection, select the pencil icon to edit and select the TeXML application that you just created.
|
||||
|
||||
Now your number is ready to call.
|
||||
|
||||
4. **Configure streams.xml**:
|
||||
3. **Configure streams.xml**:
|
||||
- Copy the template file to create your local version:
|
||||
```sh
|
||||
cp templates/streams.xml.template templates/streams.xml
|
||||
```
|
||||
- In `templates/streams.xml`, replace `<your server url>` with your ngrok URL (without `https://`)
|
||||
- The final URL should look like: `wss://abc123.ngrok.io/ws`. This needs to be the same URL that you added to your TeXML app above.
|
||||
- The encoding (`bidirectionalCodec`) should be `PCMU` or `PCMA` depending on your needs. Based on selected encoding, set the outbound_encoding in `server.py` when the bot is initialized. (No changes are required by default.)
|
||||
- The final URL should look like: `wss://abc123.ngrok.io/ws`
|
||||
- The encoding (`bidirectionalCodec`) should be `PCMU` or `PCMA` depending on your needs. Based on selected encoding, set the outbound_encoding in `server.py` when the bot is initialized.
|
||||
- The inbound encoding can be controlled from the application configuration for inbound calls and dial/transfer commands for outbound calls.
|
||||
|
||||
## Running the Application
|
||||
|
||||
@@ -33,18 +33,9 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
async def run_bot(
|
||||
websocket_client,
|
||||
stream_id: str,
|
||||
call_control_id: str,
|
||||
outbound_encoding: str,
|
||||
inbound_encoding: str,
|
||||
):
|
||||
serializer = TelnyxFrameSerializer(
|
||||
stream_id=stream_id,
|
||||
outbound_encoding=outbound_encoding,
|
||||
inbound_encoding=inbound_encoding,
|
||||
call_control_id=call_control_id,
|
||||
api_key=os.getenv("TELNYX_API_KEY"),
|
||||
)
|
||||
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket_client,
|
||||
params=FastAPIWebsocketParams(
|
||||
@@ -53,7 +44,7 @@ async def run_bot(
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True,
|
||||
serializer=serializer,
|
||||
serializer=TelnyxFrameSerializer(stream_id, outbound_encoding, inbound_encoding),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -37,10 +37,9 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
call_data = json.loads(await start_data.__anext__())
|
||||
print(call_data, flush=True)
|
||||
stream_id = call_data["stream_id"]
|
||||
call_control_id = call_data["start"]["call_control_id"]
|
||||
outbound_encoding = call_data["start"]["media_format"]["encoding"]
|
||||
print("WebSocket connection accepted")
|
||||
await run_bot(websocket, stream_id, call_control_id, outbound_encoding, "PCMU")
|
||||
await run_bot(websocket, stream_id, outbound_encoding, "PCMU")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -54,14 +54,7 @@ async def save_audio(server_name: str, audio: bytes, sample_rate: int, num_chann
|
||||
logger.info("No audio data to save")
|
||||
|
||||
|
||||
async def run_bot(websocket_client: WebSocket, stream_sid: str, call_sid: str, testing: bool):
|
||||
serializer = TwilioFrameSerializer(
|
||||
stream_sid=stream_sid,
|
||||
call_sid=call_sid,
|
||||
account_sid=os.getenv("TWILIO_ACCOUNT_SID", ""),
|
||||
auth_token=os.getenv("TWILIO_AUTH_TOKEN", ""),
|
||||
)
|
||||
|
||||
async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket_client,
|
||||
params=FastAPIWebsocketParams(
|
||||
@@ -71,7 +64,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, call_sid: str, t
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True,
|
||||
serializer=serializer,
|
||||
serializer=TwilioFrameSerializer(stream_sid),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -38,9 +38,8 @@ async def websocket_endpoint(websocket: WebSocket):
|
||||
call_data = json.loads(await start_data.__anext__())
|
||||
print(call_data, flush=True)
|
||||
stream_sid = call_data["start"]["streamSid"]
|
||||
call_sid = call_data["start"]["callSid"]
|
||||
print("WebSocket connection accepted")
|
||||
await run_bot(websocket, stream_sid, call_sid, app.state.testing)
|
||||
await run_bot(websocket, stream_sid, app.state.testing)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -6,14 +6,14 @@ build-backend = "setuptools.build_meta"
|
||||
name = "pipecat-ai"
|
||||
dynamic = ["version"]
|
||||
description = "An open source framework for voice (and multimodal) assistants"
|
||||
license = "BSD-2-Clause"
|
||||
license-files = ["LICENSE"]
|
||||
license = { text = "BSD 2-Clause License" }
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
keywords = ["webrtc", "audio", "video", "ai"]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: BSD License",
|
||||
"Topic :: Communications :: Conferencing",
|
||||
"Topic :: Multimedia :: Sound/Audio",
|
||||
"Topic :: Multimedia :: Video",
|
||||
@@ -92,11 +92,9 @@ websocket = [ "websockets~=13.1", "fastapi~=0.115.6" ]
|
||||
whisper = [ "faster-whisper~=1.1.1" ]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
# All the following settings are optional:
|
||||
where = ["src"]
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
"pipecat" = ["py.typed"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = "--verbose"
|
||||
testpaths = ["tests"]
|
||||
|
||||
@@ -6,14 +6,13 @@
|
||||
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
from typing import Dict, Optional
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
|
||||
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||
|
||||
# Default timing parameters
|
||||
STOP_SECS = 3
|
||||
@@ -30,10 +29,6 @@ class SmartTurnParams(BaseModel):
|
||||
# use_only_last_vad_segment: bool = USE_ONLY_LAST_VAD_SEGMENT
|
||||
|
||||
|
||||
class SmartTurnTimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
def __init__(
|
||||
self, *, sample_rate: Optional[int] = None, params: SmartTurnParams = SmartTurnParams()
|
||||
@@ -46,7 +41,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
self._audio_buffer = []
|
||||
self._speech_triggered = False
|
||||
self._silence_ms = 0
|
||||
self._speech_start_time = 0
|
||||
self._speech_start_time = None
|
||||
|
||||
@property
|
||||
def speech_triggered(self) -> bool:
|
||||
@@ -64,8 +59,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
# Reset silence tracking on speech
|
||||
self._silence_ms = 0
|
||||
self._speech_triggered = True
|
||||
if self._speech_start_time == 0:
|
||||
if self._speech_start_time is None:
|
||||
self._speech_start_time = time.time()
|
||||
logger.debug(f"Speech started at {self._speech_start_time}")
|
||||
else:
|
||||
if self._speech_triggered:
|
||||
chunk_duration_ms = len(audio_int16) / (self._sample_rate / 1000)
|
||||
@@ -91,27 +87,28 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
|
||||
return state
|
||||
|
||||
async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
state, result = await self._process_speech_segment(self._audio_buffer)
|
||||
def analyze_end_of_turn(self) -> EndOfTurnState:
|
||||
logger.debug("Analyzing End of Turn...")
|
||||
state = self._process_speech_segment(self._audio_buffer)
|
||||
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
||||
self._clear(state)
|
||||
logger.debug(f"End of Turn result: {state}")
|
||||
return state, result
|
||||
return state
|
||||
|
||||
def _clear(self, turn_state: EndOfTurnState):
|
||||
# Reset internal state for next turn
|
||||
logger.debug("Clearing audio buffer...")
|
||||
# If the state is still incomplete, keep the _speech_triggered as True
|
||||
self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE
|
||||
self._audio_buffer = []
|
||||
self._speech_start_time = 0
|
||||
self._speech_start_time = None
|
||||
self._silence_ms = 0
|
||||
|
||||
async def _process_speech_segment(
|
||||
self, audio_buffer
|
||||
) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
def _process_speech_segment(self, audio_buffer) -> EndOfTurnState:
|
||||
state = EndOfTurnState.INCOMPLETE
|
||||
|
||||
if not audio_buffer:
|
||||
return state, None
|
||||
return state
|
||||
|
||||
# Extract recent audio segment for prediction
|
||||
start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
|
||||
@@ -127,63 +124,41 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
segment_audio_chunks = [chunk for _, chunk in audio_buffer[start_index : end_index + 1]]
|
||||
segment_audio = np.concatenate(segment_audio_chunks)
|
||||
|
||||
logger.debug(f"Segment audio chunks after start index: {len(segment_audio)}")
|
||||
|
||||
# Limit maximum duration
|
||||
max_samples = int(self._params.max_duration_secs * self.sample_rate)
|
||||
if len(segment_audio) > max_samples:
|
||||
# slices the array to keep the last max_samples samples, discarding the earlier part.
|
||||
segment_audio = segment_audio[-max_samples:]
|
||||
|
||||
result_data = None
|
||||
logger.debug(f"Segment audio chunks after limiting duration: {len(segment_audio)}")
|
||||
|
||||
if len(segment_audio) > 0:
|
||||
start_time = time.perf_counter()
|
||||
try:
|
||||
result = await self._predict_endpoint(segment_audio)
|
||||
state = (
|
||||
EndOfTurnState.COMPLETE
|
||||
if result["prediction"] == 1
|
||||
else EndOfTurnState.INCOMPLETE
|
||||
)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
# Calculate processing time
|
||||
e2e_processing_time_ms = (end_time - start_time) * 1000
|
||||
|
||||
# Prepare the result data
|
||||
result_data = SmartTurnMetricsData(
|
||||
processor="BaseSmartTurn",
|
||||
is_complete=result["prediction"] == 1,
|
||||
probability=result["probability"],
|
||||
inference_time_ms=result.get("inference_time", 0) * 1000,
|
||||
server_total_time_ms=result.get("total_time", 0) * 1000,
|
||||
e2e_processing_time_ms=e2e_processing_time_ms,
|
||||
)
|
||||
|
||||
logger.trace(
|
||||
f"Prediction: {'Complete' if result_data.is_complete else 'Incomplete'}"
|
||||
)
|
||||
logger.trace(f"Probability of complete: {result_data.probability:.4f}")
|
||||
logger.trace(f"Inference time: {result_data.inference_time_ms:.2f}ms")
|
||||
logger.trace(f"Server total time: {result_data.server_total_time_ms:.2f}ms")
|
||||
logger.trace(f"E2E processing time: {result_data.e2e_processing_time_ms:.2f}ms")
|
||||
except SmartTurnTimeoutException:
|
||||
logger.debug(
|
||||
f"End of Turn complete due to stop_secs. Silence in ms: {self._silence_ms}"
|
||||
)
|
||||
state = EndOfTurnState.COMPLETE
|
||||
result = self._predict_endpoint(segment_audio)
|
||||
state = (
|
||||
EndOfTurnState.COMPLETE if result["prediction"] == 1 else EndOfTurnState.INCOMPLETE
|
||||
)
|
||||
end_time = time.perf_counter()
|
||||
|
||||
logger.debug("--------")
|
||||
logger.debug(f"Prediction: {'Complete' if result['prediction'] == 1 else 'Incomplete'}")
|
||||
logger.debug(f"Probability of complete: {result['probability']:.4f}")
|
||||
logger.debug(f"Prediction took {(end_time - start_time) * 1000:.2f}ms seconds")
|
||||
else:
|
||||
logger.trace(f"params: {self._params}, stop_ms: {self._stop_ms}")
|
||||
logger.trace("Captured empty audio segment, skipping prediction.")
|
||||
logger.debug(f"params: {self._params}, stop_ms: {self._stop_ms}")
|
||||
logger.debug("Captured empty audio segment, skipping prediction.")
|
||||
|
||||
return state, result_data
|
||||
return state
|
||||
|
||||
@abstractmethod
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Abstract method to predict if a turn has ended based on audio.
|
||||
def _predict_endpoint(self, buffer: np.ndarray) -> Dict[str, any]:
|
||||
"""
|
||||
Abstract method to predict if a turn has ended based on audio.
|
||||
|
||||
Args:
|
||||
audio_array: Float32 numpy array of audio samples at 16kHz.
|
||||
buffer: Float32 numpy array of audio samples at 16kHz.
|
||||
|
||||
Returns:
|
||||
Dictionary with:
|
||||
@@ -6,9 +6,7 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from pipecat.metrics.metrics import MetricsData
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class EndOfTurnState(Enum):
|
||||
@@ -17,10 +15,8 @@ class EndOfTurnState(Enum):
|
||||
|
||||
|
||||
class BaseTurnAnalyzer(ABC):
|
||||
"""Abstract base class for analyzing user end of turn.
|
||||
|
||||
This class inherits from BaseObject to leverage its event handling system
|
||||
while still defining an abstract interface through abstract methods.
|
||||
"""
|
||||
Abstract base class for analyzing user end of turn.
|
||||
"""
|
||||
|
||||
def __init__(self, *, sample_rate: Optional[int] = None):
|
||||
@@ -29,7 +25,8 @@ class BaseTurnAnalyzer(ABC):
|
||||
|
||||
@property
|
||||
def sample_rate(self) -> int:
|
||||
"""Returns the current sample rate.
|
||||
"""
|
||||
Returns the current sample rate.
|
||||
|
||||
Returns:
|
||||
int: The effective sample rate for audio processing.
|
||||
@@ -37,7 +34,8 @@ class BaseTurnAnalyzer(ABC):
|
||||
return self._sample_rate
|
||||
|
||||
def set_sample_rate(self, sample_rate: int):
|
||||
"""Sets the sample rate for audio processing.
|
||||
"""
|
||||
Sets the sample rate for audio processing.
|
||||
|
||||
If the initial sample rate was provided, it will use that; otherwise, it sets to
|
||||
the provided sample rate.
|
||||
@@ -50,7 +48,8 @@ class BaseTurnAnalyzer(ABC):
|
||||
@property
|
||||
@abstractmethod
|
||||
def speech_triggered(self) -> bool:
|
||||
"""Determines if speech has been detected.
|
||||
"""
|
||||
Determines if speech has been detected.
|
||||
|
||||
Returns:
|
||||
bool: True if speech is triggered, otherwise False.
|
||||
@@ -59,7 +58,8 @@ class BaseTurnAnalyzer(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
|
||||
"""Appends audio data for analysis.
|
||||
"""
|
||||
Appends audio data for analysis.
|
||||
|
||||
Args:
|
||||
buffer (bytes): The audio data to append.
|
||||
@@ -71,8 +71,9 @@ class BaseTurnAnalyzer(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
"""Analyzes if an end of turn has occurred based on the audio input.
|
||||
def analyze_end_of_turn(self) -> EndOfTurnState:
|
||||
"""
|
||||
Analyzes if an end of turn has occurred based on the audio input.
|
||||
|
||||
Returns:
|
||||
EndOfTurnState: The result of the end of turn analysis.
|
||||
|
||||
@@ -5,16 +5,17 @@
|
||||
#
|
||||
|
||||
|
||||
from typing import Any, Dict
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
|
||||
from pipecat.audio.turn.base_smart_turn import BaseSmartTurn
|
||||
|
||||
try:
|
||||
import coremltools as ct
|
||||
import torch
|
||||
from transformers import AutoFeatureExtractor
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
@@ -25,7 +26,7 @@ except ModuleNotFoundError as e:
|
||||
|
||||
|
||||
class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
|
||||
def __init__(self, *, smart_turn_model_path: str, **kwargs):
|
||||
def __init__(self, smart_turn_model_path: str, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if not smart_turn_model_path:
|
||||
@@ -40,7 +41,7 @@ class LocalCoreMLSmartTurnAnalyzer(BaseSmartTurn):
|
||||
self._turn_model = ct.models.MLModel(core_ml_model_path)
|
||||
logger.debug("Loaded Local Smart Turn")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, any]:
|
||||
inputs = self._turn_processor(
|
||||
audio_array,
|
||||
sampling_rate=16000,
|
||||
75
src/pipecat/audio/turn/smart_turn.py
Normal file
75
src/pipecat/audio/turn/smart_turn.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import io
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.base_smart_turn import BaseSmartTurn
|
||||
|
||||
|
||||
class SmartTurnAnalyzer(BaseSmartTurn):
|
||||
def __init__(self, url: str, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.remote_smart_turn_url = url
|
||||
|
||||
if not self.remote_smart_turn_url:
|
||||
logger.error("remote_smart_turn_url is not set.")
|
||||
raise Exception("remote_smart_turn_url must be provided.")
|
||||
|
||||
# Use a session to reuse connections (keep-alive)
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({"Connection": "keep-alive"})
|
||||
|
||||
def _serialize_array(self, audio_array: np.ndarray) -> bytes:
|
||||
logger.trace("Serializing NumPy array to bytes...")
|
||||
buffer = io.BytesIO()
|
||||
np.save(buffer, audio_array)
|
||||
serialized_bytes = buffer.getvalue()
|
||||
logger.trace(f"Serialized size: {len(serialized_bytes)} bytes")
|
||||
return serialized_bytes
|
||||
|
||||
def _send_raw_request(self, data_bytes: bytes):
|
||||
headers = {"Content-Type": "application/octet-stream"}
|
||||
logger.trace(
|
||||
f"Sending {len(data_bytes)} bytes as raw body to {self.remote_smart_turn_url}..."
|
||||
)
|
||||
try:
|
||||
response = self.session.post(
|
||||
self.remote_smart_turn_url,
|
||||
data=data_bytes,
|
||||
headers=headers,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
logger.trace("\n--- Response ---")
|
||||
logger.trace(f"Status Code: {response.status_code}")
|
||||
|
||||
if response.ok:
|
||||
try:
|
||||
logger.trace("Response JSON:")
|
||||
logger.trace(response.json())
|
||||
return response.json()
|
||||
except requests.exceptions.JSONDecodeError:
|
||||
logger.trace("Response Content (non-JSON):")
|
||||
logger.trace(response.text)
|
||||
else:
|
||||
logger.trace("Response Content (Error):")
|
||||
logger.trace(response.text)
|
||||
response.raise_for_status()
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
|
||||
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
||||
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, any]:
|
||||
serialized_array = self._serialize_array(audio_array)
|
||||
return self._send_raw_request(serialized_array)
|
||||
@@ -1,26 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
from pipecat.audio.turn.smart_turn.http_smart_turn import HttpSmartTurnAnalyzer
|
||||
|
||||
|
||||
class FalSmartTurnAnalyzer(HttpSmartTurnAnalyzer):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
url: str = "https://fal.run/fal-ai/smart-turn/raw",
|
||||
api_key: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
headers = {}
|
||||
if api_key:
|
||||
headers = {"Authorization": f"Key {api_key}"}
|
||||
super().__init__(url=url, aiohttp_session=aiohttp_session, headers=headers, **kwargs)
|
||||
@@ -1,80 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from typing import Any, Dict
|
||||
|
||||
import aiohttp
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn, SmartTurnTimeoutException
|
||||
|
||||
|
||||
class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
url: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
headers: Dict[str, str] = {},
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self._url = url
|
||||
self._headers = headers
|
||||
self._aiohttp_session = aiohttp_session
|
||||
|
||||
def _serialize_array(self, audio_array: np.ndarray) -> bytes:
|
||||
logger.trace("Serializing NumPy array to bytes...")
|
||||
buffer = io.BytesIO()
|
||||
np.save(buffer, audio_array)
|
||||
serialized_bytes = buffer.getvalue()
|
||||
logger.trace(f"Serialized size: {len(serialized_bytes)} bytes")
|
||||
return serialized_bytes
|
||||
|
||||
async def _send_raw_request(self, data_bytes: bytes) -> Dict[str, Any]:
|
||||
headers = {"Content-Type": "application/octet-stream"}
|
||||
headers.update(self._headers)
|
||||
logger.trace(f"Sending {len(data_bytes)} bytes as raw body to {self._url}...")
|
||||
try:
|
||||
timeout = aiohttp.ClientTimeout(total=self._params.stop_secs)
|
||||
|
||||
async with self._aiohttp_session.post(
|
||||
self._url, data=data_bytes, headers=headers, timeout=timeout
|
||||
) as response:
|
||||
logger.trace("\n--- Response ---")
|
||||
logger.trace(f"Status Code: {response.status}")
|
||||
|
||||
if response.status == 200:
|
||||
try:
|
||||
json_data = await response.json()
|
||||
logger.trace("Response JSON:")
|
||||
logger.trace(json_data)
|
||||
return json_data
|
||||
except aiohttp.ContentTypeError:
|
||||
# Non-JSON response
|
||||
text = await response.text()
|
||||
logger.trace("Response Content (non-JSON):")
|
||||
logger.trace(text)
|
||||
raise Exception(f"Non-JSON response: {text}")
|
||||
else:
|
||||
error_text = await response.text()
|
||||
logger.trace("Response Content (Error):")
|
||||
logger.trace(error_text)
|
||||
response.raise_for_status()
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"Request timed out after {self._params.stop_secs} seconds")
|
||||
raise SmartTurnTimeoutException(f"Request exceeded {self._params.stop_secs} seconds.")
|
||||
except aiohttp.ClientError as e:
|
||||
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
|
||||
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
serialized_array = self._serialize_array(audio_array)
|
||||
return await self._send_raw_request(serialized_array)
|
||||
@@ -30,13 +30,3 @@ class LLMUsageMetricsData(MetricsData):
|
||||
|
||||
class TTSUsageMetricsData(MetricsData):
|
||||
value: int
|
||||
|
||||
|
||||
class SmartTurnMetricsData(MetricsData):
|
||||
"""Metrics data for smart turn predictions."""
|
||||
|
||||
is_complete: bool
|
||||
probability: float
|
||||
inference_time_ms: float
|
||||
server_total_time_ms: float
|
||||
e2e_processing_time_ms: float
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -8,8 +8,6 @@ import base64
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import (
|
||||
@@ -21,8 +19,6 @@ from pipecat.audio.utils import (
|
||||
)
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
@@ -34,120 +30,38 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer
|
||||
|
||||
|
||||
class TelnyxFrameSerializer(FrameSerializer):
|
||||
"""Serializer for Telnyx WebSocket protocol.
|
||||
|
||||
This serializer handles converting between Pipecat frames and Telnyx's WebSocket
|
||||
media streams protocol. It supports audio conversion, DTMF events, and automatic
|
||||
call termination.
|
||||
|
||||
When auto_hang_up is enabled (default), the serializer will automatically terminate
|
||||
the Telnyx call when an EndFrame or CancelFrame is processed, but requires Telnyx
|
||||
credentials to be provided.
|
||||
|
||||
Attributes:
|
||||
_stream_id: The Telnyx Stream ID.
|
||||
_call_control_id: The associated Telnyx Call Control ID.
|
||||
_api_key: Telnyx API key for API access.
|
||||
_params: Configuration parameters.
|
||||
_telnyx_sample_rate: Sample rate used by Telnyx (typically 8kHz).
|
||||
_sample_rate: Input sample rate for the pipeline.
|
||||
_resampler: Audio resampler for format conversion.
|
||||
_hangup_attempted: Flag to track if hang-up has been attempted.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for TelnyxFrameSerializer.
|
||||
|
||||
Attributes:
|
||||
telnyx_sample_rate: Sample rate used by Telnyx, defaults to 8000 Hz.
|
||||
sample_rate: Optional override for pipeline input sample rate.
|
||||
inbound_encoding: Audio encoding for data sent to Telnyx (e.g., "PCMU").
|
||||
outbound_encoding: Audio encoding for data received from Telnyx (e.g., "PCMU").
|
||||
auto_hang_up: Whether to automatically terminate call on EndFrame.
|
||||
"""
|
||||
|
||||
telnyx_sample_rate: int = 8000
|
||||
sample_rate: Optional[int] = None
|
||||
telnyx_sample_rate: int = 8000 # Default Telnyx rate (8kHz)
|
||||
sample_rate: Optional[int] = None # Pipeline input rate
|
||||
inbound_encoding: str = "PCMU"
|
||||
outbound_encoding: str = "PCMU"
|
||||
auto_hang_up: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
stream_id: str,
|
||||
outbound_encoding: str,
|
||||
inbound_encoding: str,
|
||||
call_control_id: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
params: InputParams = InputParams(),
|
||||
):
|
||||
"""Initialize the TelnyxFrameSerializer.
|
||||
|
||||
Args:
|
||||
stream_id: The Stream ID for Telnyx.
|
||||
outbound_encoding: The encoding type for outbound audio (e.g., "PCMU").
|
||||
inbound_encoding: The encoding type for inbound audio (e.g., "PCMU").
|
||||
call_control_id: The Call Control ID for the Telnyx call (optional, but required for auto hang-up).
|
||||
api_key: Your Telnyx API key (required for auto hang-up).
|
||||
params: Configuration parameters.
|
||||
"""
|
||||
self._stream_id = stream_id
|
||||
params.outbound_encoding = outbound_encoding
|
||||
params.inbound_encoding = inbound_encoding
|
||||
self._call_control_id = call_control_id
|
||||
self._api_key = api_key
|
||||
self._params = params
|
||||
|
||||
self._telnyx_sample_rate = self._params.telnyx_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._hangup_attempted = False
|
||||
|
||||
@property
|
||||
def type(self) -> FrameSerializerType:
|
||||
"""Gets the serializer type.
|
||||
|
||||
Returns:
|
||||
The serializer type, either TEXT or BINARY.
|
||||
"""
|
||||
return FrameSerializerType.TEXT
|
||||
|
||||
async def setup(self, frame: StartFrame):
|
||||
"""Sets up the serializer with pipeline configuration.
|
||||
|
||||
Args:
|
||||
frame: The StartFrame containing pipeline configuration.
|
||||
"""
|
||||
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
|
||||
|
||||
async def serialize(self, frame: Frame) -> str | bytes | None:
|
||||
"""Serializes a Pipecat frame to Telnyx WebSocket format.
|
||||
|
||||
Handles conversion of various frame types to Telnyx WebSocket messages.
|
||||
For EndFrames and CancelFrames, initiates call termination if auto_hang_up is enabled.
|
||||
|
||||
Args:
|
||||
frame: The Pipecat frame to serialize.
|
||||
|
||||
Returns:
|
||||
Serialized data as string or bytes, or None if the frame isn't handled.
|
||||
|
||||
Raises:
|
||||
ValueError: If an unsupported encoding is specified.
|
||||
"""
|
||||
if (
|
||||
self._params.auto_hang_up
|
||||
and not self._hangup_attempted
|
||||
and isinstance(frame, (EndFrame, CancelFrame))
|
||||
):
|
||||
self._hangup_attempted = True
|
||||
await self._hang_up_call()
|
||||
return None
|
||||
elif isinstance(frame, StartInterruptionFrame):
|
||||
answer = {"event": "clear"}
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
if isinstance(frame, AudioRawFrame):
|
||||
data = frame.audio
|
||||
|
||||
# Output: Convert PCM at frame's rate to 8kHz encoded for Telnyx
|
||||
@@ -170,58 +84,11 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
|
||||
return json.dumps(answer)
|
||||
|
||||
# Return None for unhandled frames
|
||||
return None
|
||||
|
||||
async def _hang_up_call(self):
|
||||
"""Hang up the Telnyx call using Telnyx's REST API."""
|
||||
try:
|
||||
call_control_id = self._call_control_id
|
||||
api_key = self._api_key
|
||||
|
||||
if not call_control_id or not api_key:
|
||||
logger.warning(
|
||||
"Cannot hang up Telnyx call: call_control_id and api_key must be provided"
|
||||
)
|
||||
return
|
||||
|
||||
# Telnyx API endpoint for hanging up a call
|
||||
endpoint = f"https://api.telnyx.com/v2/calls/{call_control_id}/actions/hangup"
|
||||
|
||||
# Set headers with API key
|
||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||
|
||||
# Make the POST request to hang up the call
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(endpoint, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
logger.info(f"Successfully terminated Telnyx call {call_control_id}")
|
||||
else:
|
||||
# Get the error details for better debugging
|
||||
error_text = await response.text()
|
||||
logger.error(
|
||||
f"Failed to terminate Telnyx call {call_control_id}: "
|
||||
f"Status {response.status}, Response: {error_text}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to hang up Telnyx call: {e}")
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
answer = {"event": "clear"}
|
||||
return json.dumps(answer)
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
"""Deserializes Telnyx WebSocket data to Pipecat frames.
|
||||
|
||||
Handles conversion of Telnyx media events to appropriate Pipecat frames,
|
||||
including audio data and DTMF keypresses.
|
||||
|
||||
Args:
|
||||
data: The raw WebSocket data from Telnyx.
|
||||
|
||||
Returns:
|
||||
A Pipecat frame corresponding to the Telnyx event, or None if unhandled.
|
||||
|
||||
Raises:
|
||||
ValueError: If an unsupported encoding is specified.
|
||||
"""
|
||||
message = json.loads(data)
|
||||
|
||||
if message["event"] == "media":
|
||||
|
||||
@@ -8,14 +8,11 @@ import base64
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.utils import create_default_resampler, pcm_to_ulaw, ulaw_to_pcm
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
@@ -29,107 +26,28 @@ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializer
|
||||
|
||||
|
||||
class TwilioFrameSerializer(FrameSerializer):
|
||||
"""Serializer for Twilio Media Streams WebSocket protocol.
|
||||
|
||||
This serializer handles converting between Pipecat frames and Twilio's WebSocket
|
||||
media streams protocol. It supports audio conversion, DTMF events, and automatic
|
||||
call termination.
|
||||
|
||||
When auto_hang_up is enabled (default), the serializer will automatically terminate
|
||||
the Twilio call when an EndFrame or CancelFrame is processed, but requires Twilio
|
||||
credentials to be provided.
|
||||
|
||||
Attributes:
|
||||
_stream_sid: The Twilio Media Stream SID.
|
||||
_call_sid: The associated Twilio Call SID.
|
||||
_account_sid: Twilio account SID for API access.
|
||||
_auth_token: Twilio authentication token for API access.
|
||||
_params: Configuration parameters.
|
||||
_twilio_sample_rate: Sample rate used by Twilio (typically 8kHz).
|
||||
_sample_rate: Input sample rate for the pipeline.
|
||||
_resampler: Audio resampler for format conversion.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for TwilioFrameSerializer.
|
||||
twilio_sample_rate: int = 8000 # Default Twilio rate (8kHz)
|
||||
sample_rate: Optional[int] = None # Pipeline input rate
|
||||
|
||||
Attributes:
|
||||
twilio_sample_rate: Sample rate used by Twilio, defaults to 8000 Hz.
|
||||
sample_rate: Optional override for pipeline input sample rate.
|
||||
auto_hang_up: Whether to automatically terminate call on EndFrame.
|
||||
"""
|
||||
|
||||
twilio_sample_rate: int = 8000
|
||||
sample_rate: Optional[int] = None
|
||||
auto_hang_up: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
stream_sid: str,
|
||||
call_sid: Optional[str] = None,
|
||||
account_sid: Optional[str] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
params: InputParams = InputParams(),
|
||||
):
|
||||
"""Initialize the TwilioFrameSerializer.
|
||||
|
||||
Args:
|
||||
stream_sid: The Twilio Media Stream SID.
|
||||
call_sid: The associated Twilio Call SID (optional, but required for auto hang-up).
|
||||
account_sid: Twilio account SID (required for auto hang-up).
|
||||
auth_token: Twilio auth token (required for auto hang-up).
|
||||
params: Configuration parameters.
|
||||
"""
|
||||
def __init__(self, stream_sid: str, params: InputParams = InputParams()):
|
||||
self._stream_sid = stream_sid
|
||||
self._call_sid = call_sid
|
||||
self._account_sid = account_sid
|
||||
self._auth_token = auth_token
|
||||
self._params = params
|
||||
|
||||
self._twilio_sample_rate = self._params.twilio_sample_rate
|
||||
self._sample_rate = 0 # Pipeline input rate
|
||||
|
||||
self._resampler = create_default_resampler()
|
||||
self._hangup_attempted = False
|
||||
|
||||
@property
|
||||
def type(self) -> FrameSerializerType:
|
||||
"""Gets the serializer type.
|
||||
|
||||
Returns:
|
||||
The serializer type, either TEXT or BINARY.
|
||||
"""
|
||||
return FrameSerializerType.TEXT
|
||||
|
||||
async def setup(self, frame: StartFrame):
|
||||
"""Sets up the serializer with pipeline configuration.
|
||||
|
||||
Args:
|
||||
frame: The StartFrame containing pipeline configuration.
|
||||
"""
|
||||
self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
|
||||
|
||||
async def serialize(self, frame: Frame) -> str | bytes | None:
|
||||
"""Serializes a Pipecat frame to Twilio WebSocket format.
|
||||
|
||||
Handles conversion of various frame types to Twilio WebSocket messages.
|
||||
For EndFrames, initiates call termination if auto_hang_up is enabled.
|
||||
|
||||
Args:
|
||||
frame: The Pipecat frame to serialize.
|
||||
|
||||
Returns:
|
||||
Serialized data as string or bytes, or None if the frame isn't handled.
|
||||
"""
|
||||
if (
|
||||
self._params.auto_hang_up
|
||||
and not self._hangup_attempted
|
||||
and isinstance(frame, (EndFrame, CancelFrame))
|
||||
):
|
||||
self._hangup_attempted = True
|
||||
await self._hang_up_call()
|
||||
return None
|
||||
elif isinstance(frame, StartInterruptionFrame):
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
answer = {"event": "clear", "streamSid": self._stream_sid}
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
@@ -150,70 +68,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
# Return None for unhandled frames
|
||||
return None
|
||||
|
||||
async def _hang_up_call(self):
|
||||
"""Hang up the Twilio call using Twilio's REST API."""
|
||||
try:
|
||||
import aiohttp
|
||||
|
||||
account_sid = self._account_sid
|
||||
auth_token = self._auth_token
|
||||
call_sid = self._call_sid
|
||||
|
||||
if not call_sid or not account_sid or not auth_token:
|
||||
missing = []
|
||||
if not call_sid:
|
||||
missing.append("call_sid")
|
||||
if not account_sid:
|
||||
missing.append("account_sid")
|
||||
if not auth_token:
|
||||
missing.append("auth_token")
|
||||
|
||||
logger.warning(
|
||||
f"Cannot hang up Twilio call: missing required parameters: {', '.join(missing)}"
|
||||
)
|
||||
return
|
||||
|
||||
# Twilio API endpoint for updating calls
|
||||
endpoint = (
|
||||
f"https://api.twilio.com/2010-04-01/Accounts/{account_sid}/Calls/{call_sid}.json"
|
||||
)
|
||||
|
||||
# Create basic auth from account_sid and auth_token
|
||||
auth = aiohttp.BasicAuth(account_sid, auth_token)
|
||||
|
||||
# Parameters to set the call status to "completed" (hang up)
|
||||
params = {"Status": "completed"}
|
||||
|
||||
# Make the POST request to update the call
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(endpoint, auth=auth, data=params) as response:
|
||||
if response.status == 200:
|
||||
logger.info(f"Successfully terminated Twilio call {call_sid}")
|
||||
else:
|
||||
# Get the error details for better debugging
|
||||
error_text = await response.text()
|
||||
logger.error(
|
||||
f"Failed to terminate Twilio call {call_sid}: "
|
||||
f"Status {response.status}, Response: {error_text}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to hang up Twilio call: {e}")
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
"""Deserializes Twilio WebSocket data to Pipecat frames.
|
||||
|
||||
Handles conversion of Twilio media events to appropriate Pipecat frames.
|
||||
|
||||
Args:
|
||||
data: The raw WebSocket data from Twilio.
|
||||
|
||||
Returns:
|
||||
A Pipecat frame corresponding to the Twilio event, or None if unhandled.
|
||||
"""
|
||||
message = json.loads(data)
|
||||
|
||||
if message["event"] == "media":
|
||||
|
||||
@@ -126,14 +126,31 @@ def build_elevenlabs_voice_settings(
|
||||
settings: Dictionary containing voice settings parameters
|
||||
|
||||
Returns:
|
||||
Dictionary of voice settings or None if no valid settings are provided
|
||||
Dictionary of voice settings or None if required parameters are missing
|
||||
"""
|
||||
voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
|
||||
|
||||
voice_settings = {}
|
||||
for key in voice_setting_keys:
|
||||
if key in settings and settings[key] is not None:
|
||||
voice_settings[key] = settings[key]
|
||||
if settings["stability"] is not None and settings["similarity_boost"] is not None:
|
||||
voice_settings["stability"] = settings["stability"]
|
||||
voice_settings["similarity_boost"] = settings["similarity_boost"]
|
||||
if settings["style"] is not None:
|
||||
voice_settings["style"] = settings["style"]
|
||||
if settings["use_speaker_boost"] is not None:
|
||||
voice_settings["use_speaker_boost"] = settings["use_speaker_boost"]
|
||||
if settings["speed"] is not None:
|
||||
voice_settings["speed"] = settings["speed"]
|
||||
else:
|
||||
if settings["style"] is not None:
|
||||
logger.warning(
|
||||
"'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
|
||||
)
|
||||
if settings["use_speaker_boost"] is not None:
|
||||
logger.warning(
|
||||
"'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
|
||||
)
|
||||
if settings["speed"] is not None:
|
||||
logger.warning(
|
||||
"'speed' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
|
||||
)
|
||||
|
||||
return voice_settings or None
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import warnings
|
||||
@@ -225,7 +224,6 @@ class GladiaSTTService(STTService):
|
||||
self._params = params
|
||||
self._websocket = None
|
||||
self._receive_task = None
|
||||
self._keepalive_task = None
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert pipecat Language enum to Gladia's language code."""
|
||||
@@ -289,22 +287,14 @@ class GladiaSTTService(STTService):
|
||||
self._websocket = await websockets.connect(response["url"])
|
||||
if self._websocket and not self._receive_task:
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
if self._websocket and not self._keepalive_task:
|
||||
self._keepalive_task = self.create_task(self._keepalive_task_handler())
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the Gladia STT websocket connection."""
|
||||
await super().stop(frame)
|
||||
await self._send_stop_recording()
|
||||
|
||||
if self._keepalive_task:
|
||||
await self.cancel_task(self._keepalive_task)
|
||||
self._keepalive_task = None
|
||||
|
||||
if self._websocket:
|
||||
await self._websocket.close()
|
||||
self._websocket = None
|
||||
|
||||
if self._receive_task:
|
||||
await self.wait_for_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
@@ -312,15 +302,7 @@ class GladiaSTTService(STTService):
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the Gladia STT websocket connection."""
|
||||
await super().cancel(frame)
|
||||
|
||||
if self._keepalive_task:
|
||||
await self.cancel_task(self._keepalive_task)
|
||||
self._keepalive_task = None
|
||||
|
||||
if self._websocket:
|
||||
await self._websocket.close()
|
||||
self._websocket = None
|
||||
|
||||
await self._websocket.close()
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
@@ -359,24 +341,6 @@ class GladiaSTTService(STTService):
|
||||
if self._websocket and not self._websocket.closed:
|
||||
await self._websocket.send(json.dumps({"type": "stop_recording"}))
|
||||
|
||||
async def _keepalive_task_handler(self):
|
||||
"""Send periodic empty audio chunks to keep the connection alive."""
|
||||
try:
|
||||
while True:
|
||||
# Send keepalive every 20 seconds (Gladia times out after 30 seconds)
|
||||
await asyncio.sleep(20)
|
||||
if self._websocket and not self._websocket.closed:
|
||||
# Send an empty audio chunk as keepalive
|
||||
empty_audio = b""
|
||||
await self._send_audio(empty_audio)
|
||||
else:
|
||||
logger.debug("Websocket closed, stopping keepalive")
|
||||
break
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
logger.debug("Connection closed during keepalive")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Gladia keepalive task: {e}")
|
||||
|
||||
async def _receive_task_handler(self):
|
||||
try:
|
||||
async for message in self._websocket:
|
||||
|
||||
@@ -17,8 +17,6 @@ from loguru import logger
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.auth.transport.requests import Request
|
||||
from google.oauth2 import service_account
|
||||
|
||||
@@ -102,13 +100,6 @@ class GoogleVertexLLMService(OpenAILLMService):
|
||||
creds = service_account.Credentials.from_service_account_file(
|
||||
credentials_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
else:
|
||||
try:
|
||||
creds, project_id = default(
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
except GoogleAuthError:
|
||||
pass
|
||||
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
@@ -32,8 +32,6 @@ from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
try:
|
||||
from google.api_core.client_options import ClientOptions
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.cloud import speech_v2
|
||||
from google.cloud.speech_v2.types import cloud_speech
|
||||
from google.oauth2 import service_account
|
||||
@@ -453,7 +451,6 @@ class GoogleSTTService(STTService):
|
||||
client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
|
||||
|
||||
# Extract project ID and create client
|
||||
creds: Optional[service_account.Credentials] = None
|
||||
if credentials:
|
||||
json_account_info = json.loads(credentials)
|
||||
self._project_id = json_account_info.get("project_id")
|
||||
@@ -464,16 +461,7 @@ class GoogleSTTService(STTService):
|
||||
self._project_id = json_account_info.get("project_id")
|
||||
creds = service_account.Credentials.from_service_account_file(credentials_path)
|
||||
else:
|
||||
try:
|
||||
creds, project_id = default(
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
self._project_id = project_id
|
||||
except GoogleAuthError:
|
||||
pass
|
||||
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
raise ValueError("Either credentials or credentials_path must be provided")
|
||||
|
||||
if not self._project_id:
|
||||
raise ValueError("Project ID not found in credentials")
|
||||
|
||||
@@ -27,8 +27,6 @@ from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
try:
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.cloud import texttospeech_v1
|
||||
from google.oauth2 import service_account
|
||||
|
||||
@@ -253,16 +251,6 @@ class GoogleTTSService(TTSService):
|
||||
elif credentials_path:
|
||||
# Use service account JSON file if provided
|
||||
creds = service_account.Credentials.from_service_account_file(credentials_path)
|
||||
else:
|
||||
try:
|
||||
creds, project_id = default(
|
||||
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
||||
)
|
||||
except GoogleAuthError:
|
||||
pass
|
||||
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ class GrokLLMService(OpenAILLMService):
|
||||
Args:
|
||||
api_key (str): The API key for accessing Grok's API
|
||||
base_url (str, optional): The base URL for Grok API. Defaults to "https://api.x.ai/v1"
|
||||
model (str, optional): The model identifier to use. Defaults to "grok-3-beta"
|
||||
model (str, optional): The model identifier to use. Defaults to "grok-2"
|
||||
**kwargs: Additional keyword arguments passed to OpenAILLMService
|
||||
"""
|
||||
|
||||
@@ -51,7 +51,7 @@ class GrokLLMService(OpenAILLMService):
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str = "https://api.x.ai/v1",
|
||||
model: str = "grok-3-beta",
|
||||
model: str = "grok-2",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
|
||||
|
||||
@@ -70,7 +70,7 @@ class OpenAITTSService(TTSService):
|
||||
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
|
||||
f"Current rate of {sample_rate}Hz may cause issues."
|
||||
f"Current rate of {self.sample_rate}Hz may cause issues."
|
||||
)
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
|
||||
@@ -6,14 +6,11 @@
|
||||
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Mapping, Optional
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.base_turn_analyzer import (
|
||||
BaseTurnAnalyzer,
|
||||
EndOfTurnState,
|
||||
)
|
||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
|
||||
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
|
||||
from pipecat.frames.frames import (
|
||||
BotInterruptionFrame,
|
||||
@@ -24,7 +21,6 @@ from pipecat.frames.frames import (
|
||||
FilterUpdateSettingsFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
MetricsFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
@@ -33,7 +29,6 @@ from pipecat.frames.frames import (
|
||||
UserStoppedSpeakingFrame,
|
||||
VADParamsUpdateFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
|
||||
@@ -83,7 +78,6 @@ class BaseInputTransport(FrameProcessor):
|
||||
# Configure End of turn analyzer.
|
||||
if self._params.turn_analyzer:
|
||||
self._params.turn_analyzer.set_sample_rate(self._sample_rate)
|
||||
|
||||
# Start audio filter.
|
||||
if self._params.audio_in_filter:
|
||||
await self._params.audio_in_filter.start(self._sample_rate)
|
||||
@@ -222,8 +216,9 @@ class BaseInputTransport(FrameProcessor):
|
||||
|
||||
async def _handle_end_of_turn(self):
|
||||
if self.turn_analyzer:
|
||||
state, prediction = await self.turn_analyzer.analyze_end_of_turn()
|
||||
await self._handle_prediction_result(prediction)
|
||||
state = await self.get_event_loop().run_in_executor(
|
||||
self._executor, self.turn_analyzer.analyze_end_of_turn
|
||||
)
|
||||
await self._handle_end_of_turn_complete(state)
|
||||
|
||||
async def _handle_end_of_turn_complete(self, state: EndOfTurnState):
|
||||
@@ -268,11 +263,3 @@ class BaseInputTransport(FrameProcessor):
|
||||
await self.push_frame(frame)
|
||||
|
||||
self._audio_in_queue.task_done()
|
||||
|
||||
async def _handle_prediction_result(self, result: MetricsData):
|
||||
"""Handle a prediction result event from the turn analyzer.
|
||||
|
||||
Args:
|
||||
result: The prediction result MetricsData.
|
||||
"""
|
||||
await self.push_frame(MetricsFrame(data=[result]))
|
||||
|
||||
@@ -207,12 +207,10 @@ class FastAPIWebsocketOutputTransport(BaseOutputTransport):
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
await super().stop(frame)
|
||||
await self._write_frame(frame)
|
||||
await self._client.disconnect()
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
await super().cancel(frame)
|
||||
await self._write_frame(frame)
|
||||
await self._client.disconnect()
|
||||
|
||||
async def cleanup(self):
|
||||
|
||||
@@ -157,8 +157,7 @@ class WebsocketServerInputTransport(BaseInputTransport):
|
||||
self, websocket: websockets.WebSocketServerProtocol, session_timeout: int
|
||||
):
|
||||
"""Wait for session_timeout seconds, if the websocket is still open,
|
||||
trigger timeout event.
|
||||
"""
|
||||
trigger timeout event."""
|
||||
try:
|
||||
await asyncio.sleep(session_timeout)
|
||||
if not websocket.closed:
|
||||
@@ -196,14 +195,6 @@ class WebsocketServerOutputTransport(BaseOutputTransport):
|
||||
await self._params.serializer.setup(frame)
|
||||
self._send_interval = (self._audio_chunk_size / self.sample_rate) / 2
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
await super().stop(frame)
|
||||
await self._write_frame(frame)
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
await super().cancel(frame)
|
||||
await self._write_frame(frame)
|
||||
|
||||
async def cleanup(self):
|
||||
await super().cleanup()
|
||||
await self._transport.cleanup()
|
||||
|
||||
Reference in New Issue
Block a user