Compare commits

..

2 Commits

Author SHA1 Message Date
James Hush
ac7e38719d Add local view and recording 2025-04-01 19:26:41 +08:00
James Hush
7fdf996f0e Recording demo 2025-04-01 14:46:21 +08:00
464 changed files with 15795 additions and 50819 deletions

View File

@@ -1,87 +0,0 @@
name: Bug report
description: Report a bug or unexpected behavior
type: Bug
body:
- type: markdown
attributes:
value: |
## Bug Report
Thank you for taking the time to fill out this bug report.
- type: markdown
attributes:
value: |
### Environment
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: python-version
attributes:
label: Python version
description: Which Python version are you using?
placeholder: e.g., 3.12.8
validations:
required: true
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using?
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: true
- type: textarea
id: description
attributes:
label: Issue description
description: Provide a clear description of the issue.
validations:
required: true
- type: textarea
id: repro
attributes:
label: Reproduction steps
description: List the steps to reproduce the issue.
placeholder: |
1. Do this...
2. Then do that...
3. Observe the error...
validations:
required: true
- type: textarea
id: expected
attributes:
label: Expected behavior
description: What did you expect to happen?
validations:
required: true
- type: textarea
id: actual
attributes:
label: Actual behavior
description: What actually happened?
validations:
required: true
- type: textarea
id: logs
attributes:
label: Logs
description: If applicable, include any relevant logs or error messages
render: shell
validations:
required: false

View File

@@ -1,67 +0,0 @@
name: Question
description: Ask a question or get help
type: Question
body:
- type: markdown
attributes:
value: |
## Question
Use this form to ask a question about pipecat.
- type: markdown
attributes:
value: |
### Environment (if applicable)
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using? (if applicable)
placeholder: e.g., 0.0.63
validations:
required: false
- type: input
id: python-version
attributes:
label: Python version
description: Which Python version are you using? (if applicable)
placeholder: e.g., 3.12.8
validations:
required: false
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using? (if applicable)
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: false
- type: textarea
id: question
attributes:
label: Question
description: Provide your question in detail here.
validations:
required: true
- type: textarea
id: tried
attributes:
label: What I've tried
description: Describe what you've already tried or research you've done.
placeholder: I've looked at the documentation and tried...
validations:
required: false
- type: textarea
id: context
attributes:
label: Context
description: Any additional context or information that might help others understand your question better.
validations:
required: false

View File

@@ -1,52 +0,0 @@
name: Feature request
description: Suggest an enhancement or new feature
type: Enhancement
body:
- type: markdown
attributes:
value: |
## Feature Request
Thank you for suggesting an enhancement to pipecat.
- type: textarea
id: problem
attributes:
label: Problem Statement
description: A clear description of the problem this feature would solve.
placeholder: I'm always frustrated when...
validations:
required: true
- type: textarea
id: solution
attributes:
label: Proposed Solution
description: A clear and concise description of what you want to happen.
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternative Solutions
description: Any alternative solutions or features you've considered.
validations:
required: false
- type: textarea
id: context
attributes:
label: Additional Context
description: Add any other context, mockups, or screenshots about the feature request here.
placeholder: You can drag and drop images here to include them.
validations:
required: false
- type: checkboxes
id: contribution
attributes:
label: Would you be willing to help implement this feature?
options:
- label: Yes, I'd like to contribute
- label: No, I'm just suggesting

View File

@@ -1,82 +0,0 @@
name: Service Issue
description: An issue with a third-party service
type: Service Issue
body:
- type: markdown
attributes:
value: |
## Service Issue
Use this form to report an issue with a third-party service integration.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: service-name
attributes:
label: Service Name
description: Which third-party service is having issues?
placeholder: e.g., OpenAI, ElevenLabs, Anthropic
validations:
required: true
- type: input
id: service-version
attributes:
label: Service or model version
description: Which version of the service API or model are you using?
placeholder: e.g., v1, gpt-4.1
validations:
required: false
- type: textarea
id: description
attributes:
label: Issue Description
description: Provide a clear description of the service issue.
validations:
required: true
- type: textarea
id: reproduction
attributes:
label: Reproduction Steps
description: Provide steps to reproduce the issue.
placeholder: |
1. Configure service X
2. Call method Y
3. See error Z
validations:
required: true
- type: textarea
id: expected
attributes:
label: Expected Behavior
description: What did you expect to happen?
validations:
required: true
- type: textarea
id: actual
attributes:
label: Actual Behavior
description: What actually happened?
validations:
required: true
- type: textarea
id: logs
attributes:
label: Error Logs
description: If available, include any error messages or logs.
render: shell
validations:
required: false

View File

@@ -1,56 +0,0 @@
name: New Service
description: Request to support a new third-party service
type: New Service
body:
- type: markdown
attributes:
value: |
## New Service Request
Use this form to request support for a new third-party service in pipecat.
- type: input
id: service-name
attributes:
label: Service Name
description: What is the name of the third-party service?
placeholder: e.g., NewAPI, SomeService
validations:
required: true
- type: input
id: service-website
attributes:
label: Service Website
description: Link to the service's website or documentation
placeholder: e.g., https://newapi.com
validations:
required: true
- type: textarea
id: service-description
attributes:
label: Service Description
description: Briefly describe what this service does and how it works.
validations:
required: true
- type: textarea
id: api-info
attributes:
label: API Information
description: If available, provide details about the service's API.
placeholder: |
- API documentation link
- Authentication method
- Key endpoints you'd like supported
validations:
required: false
- type: checkboxes
id: contribution
attributes:
label: Would you be willing to help implement this service?
options:
- label: Yes, I'd like to contribute
- label: No, I'm just suggesting

View File

@@ -1,74 +0,0 @@
name: Dependency Issue
description: An issue with a Pipecat dependency (not a third-party service)
type: Dependency Issue
body:
- type: markdown
attributes:
value: |
## Dependency Issue
Use this form to report an issue with a Pipecat dependency.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: dependency-name
attributes:
label: Dependency Name
description: Which Pipecat dependency is causing the issue?
placeholder: e.g., openai, anthropic, fastapi
validations:
required: true
- type: input
id: dependency-version
attributes:
label: Dependency Version
description: Which version of the dependency are you using?
placeholder: e.g., 1.2.3
validations:
required: true
- type: textarea
id: description
attributes:
label: Issue Description
description: Provide a clear description of the dependency issue.
validations:
required: true
- type: textarea
id: impact
attributes:
label: Impact
description: How is this dependency issue affecting your usage of pipecat?
validations:
required: true
- type: textarea
id: reproduction
attributes:
label: Reproduction Steps
description: If applicable, provide steps to reproduce the issue.
placeholder: |
1. Install dependency X
2. Run command Y
3. See error Z
validations:
required: false
- type: textarea
id: logs
attributes:
label: Error Logs
description: If applicable, include any relevant error messages or logs.
render: shell
validations:
required: false

View File

@@ -1,70 +0,0 @@
name: Troubleshooting
description: Help with a specific use case
type: Troubleshooting
body:
- type: markdown
attributes:
value: |
## Troubleshooting Request
Use this form to get help with a specific use case or implementation.
- type: input
id: pipecat-version
attributes:
label: pipecat version
description: Which version are you using?
placeholder: e.g., 0.0.63
validations:
required: true
- type: input
id: python-version
attributes:
label: Python version
description: Which version of Python are you using?
placeholder: e.g., 3.12.8
validations:
required: true
- type: input
id: os
attributes:
label: Operating System
description: Which OS are you using?
placeholder: e.g., Ubuntu 24.04, Windows 11, macOS 12.5
validations:
required: true
- type: textarea
id: use-case
attributes:
label: Use Case Description
description: Describe what you're trying to accomplish with pipecat.
validations:
required: true
- type: textarea
id: current-approach
attributes:
label: Current Approach
description: What have you tried so far? Include code snippets if relevant.
render: python
validations:
required: true
- type: textarea
id: errors
attributes:
label: Errors or Unexpected Behavior
description: Describe any errors or unexpected behavior you're encountering.
validations:
required: true
- type: textarea
id: additional-context
attributes:
label: Additional Context
description: Any other information that might help us understand your situation.
validations:
required: false

View File

@@ -1 +0,0 @@
blank_issues_enabled: false

30
.gitignore vendored
View File

@@ -7,7 +7,7 @@ venv
/.idea
#*#
# Distribution / Packaging
# Distribution / packaging
.Python
build/
develop-eggs/
@@ -30,24 +30,24 @@ MANIFEST
.env
fly.toml
# Examples
examples/telnyx-chatbot/templates/streams.xml
examples/twilio-chatbot/templates/streams.xml
examples/**/node_modules/
examples/**/.expo/
examples/**/dist/
examples/**/npm-debug.*
examples/**/*.jks
examples/**/*.p8
examples/**/*.p12
examples/**/*.key
examples/**/*.mobileprovision
examples/**/*.orig.*
examples/**/web-build/
# Example files
pipecat/examples/twilio-chatbot/templates/streams.xml
pipecat/examples/bot-ready-signalling/client/react-native/node_modules/
pipecat/examples/bot-ready-signalling/client/react-native/.expo/
pipecat/examples/bot-ready-signalling/client/react-native/dist/
pipecat/examples/bot-ready-signalling/client/react-native/npm-debug.*
pipecat/examples/bot-ready-signalling/client/react-native/*.jks
pipecat/examples/bot-ready-signalling/client/react-native/*.p8
pipecat/examples/bot-ready-signalling/client/react-native/*.p12
pipecat/examples/bot-ready-signalling/client/react-native/*.key
pipecat/examples/bot-ready-signalling/client/react-native/*.mobileprovision
pipecat/examples/bot-ready-signalling/client/react-native/*.orig.*
pipecat/examples/bot-ready-signalling/client/react-native/web-build/
# macOS
.DS_Store
# Documentation
docs/api/_build/
docs/api/api

View File

@@ -9,302 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `RTVIObserverParams` which allows you to configure what RTVI messages
are sent to the clients.
- Added a `context_window_compression` InputParam to
`GeminiMultimodalLiveLLMService` which allows you to enable a sliding context
window for the session as well as set the token limit of the sliding window.
- Updated `SmallWebRTCConnection` to support `ice_servers` with credentials.
- Added `VADUserStartedSpeakingFrame` and `VADUserStoppedSpeakingFrame`,
indicating when the VAD detected the user to start and stop speaking. These
events are helpful when using smart turn detection, as the user's stop time
can differ from when their turn ends (signified by UserStoppedSpeakingFrame).
- Added `TranslationFrame`, a new frame type that contains a translated
transcription.
- Added `TransportParams.audio_in_passthrough`. If set (the default), incoming
audio will be pushed downstream.
- Added `MCPClient`; a way to connect to MCP servers and use the MCP servers'
tools.
- Added `Mem0 OSS` support: in addition to Mem0 cloud, the OSS version is now
also available.
### Changed
- The `STTMuteFilter` now mutes `InterimTranscriptionFrame` and
`TranscriptionFrame` which allows the `STTMuteFilter` to be used in
conjunction with transports that generate transcripts, e.g. `DailyTransport`.
- Function calls now receive a single parameter `FunctionCallParams` instead of
`(function_name, tool_call_id, args, llm, context, result_callback)` which is
now deprecated.
- Changed the user aggregator timeout for late transcriptions from 1.0s to 0.5s
(`LLMUserAggregatorParams.aggregation_timeout`). Sometimes, the STT services
might give us more than one transcription which could come after the user
stopped speaking. We still want to include these additional transcriptions
with the first one because it's part of the user turn. This is what this
timeout is helpful with.
- Short utterances not detected by VAD while the bot is speaking are now
ignored. This reduces the amount of bot interruptions significantly providing
a more natural conversation experience.
- Updated `GladiaSTTService` to output a `TranslationFrame` when specifying a
`translation` and `translation_config`.
- STT services now passthrough audio frames by default. This allows you to add
audio recording without worrying about what's wrong in your pipeline when it
doesn't work the first time.
- Input transports now always push audio downstream unless disabled with
`TransportParams.audio_in_passthrough`. After many Pipecat releases, we
realized this is the common use case. There are use cases where the input
transport already provides STT and you also don't want recordings, in which
case there's no need to push audio to the rest of the pipeline, but this is
not a very common case.
### Deprecated
- Function calls with parameters
`(function_name, tool_call_id, args, llm, context, result_callback)` are
deprecated, use a single `FunctionCallParams` parameter instead.
- `TransportParams.camera_*` parameters are now deprecated, use
`TransportParams.video_*` instead.
- `TransportParams.vad_enabled` parameter is now deprecated, use
`TransportParams.audio_in_enabled` and `TransportParams.vad_analyzer` instead.
- `TransportParams.vad_audio_passthrough` parameter is now deprecated, use
`TransportParams.audio_in_passthrough` instead.
### Fixed
- Fixed an issue with `GeminiMultimodalLiveLLMService` where the context
contained tokens instead of words.
- Fixed an issue with HTTP Smart Turn handling, where the service returns a 500
error. Previously, this would cause an unhandled exception. Now, a 500 error
is treated as an incomplete response.
- Fixed a TTS services issue that could cause assistant output not to be
aggregated to the context when also using `TTSSpeakFrame`s.
- Fixed an issue where the `SmartTurnMetricsData` was reporting 0ms for
inference and processing time when using the `FalSmartTurnAnalyzer`.
### Other
- Added 04 foundational examples for client/server transports. Also, renamed
`29-livekit-audio-chat.py` to `04b-transports-livekit.py`.
- Added foundational example `13c-gladia-translation.py` showing how to use
`TranscriptionFrame` and `TranslationFrame`.
## [0.0.65] - 2025-04-23 "Sant Jordi's release" 🌹📕
https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
### Added
- Added automatic hangup logic to the Telnyx serializer. This feature hangs up
the Telnyx call when an `EndFrame` or `CancelFrame` is received. It is
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
- Added a keepalive task to `GladiaSTTService` to prevent the websocket from
disconnecting after 30 seconds of no audio input.
### Changed
- The `InputParams` for `ElevenLabsTTSService` and `ElevenLabsHttpTTSService`
no longer require that `stability` and `similarity_boost` be set. You can
individually set each param.
- In `TwilioFrameSerializer`, `call_sid` is Optional so as to avoid a breaking
change. `call_sid` is required to automatically hang up.
### Fixed
- Fixed an issue where `TwilioFrameSerializer` would send two hang up commands:
one for the `EndFrame` and one for the `CancelFrame`.
## [0.0.64] - 2025-04-22
### Added
- Added automatic hangup logic to the Twilio serializer. This feature hangs up
the Twilio call when an `EndFrame` or `CancelFrame` is received. It is
enabled by default and is configurable via the `auto_hang_up` `InputParam`.
- Added `SmartTurnMetricsData`, which contains end-of-turn prediction metrics,
to the `MetricsFrame`. Using `MetricsFrame`, you can now retrieve prediction
confidence scores and processing time metrics from the smart turn analyzers.
- Added support for Application Default Credentials in Google services,
`GoogleSTTService`, `GoogleTTSService`, and `GoogleVertexLLMService`.
- Added support for Smart Turn Detection via the `turn_analyzer` transport
parameter. You can now choose between `HttpSmartTurnAnalyzer()` or
`FalSmartTurnAnalyzer()` for remote inference or
`LocalCoreMLSmartTurnAnalyzer()` for on-device inference using Core ML.
- `DeepgramTTSService` accepts `base_url` argument again, allowing you to
connect to an on-prem service.
- Added `LLMUserAggregatorParams` and `LLMAssistantAggregatorParams` which allow
you to control aggregator settings. You can now pass these arguments when
creating aggregator pairs with `create_context_aggregator()`.
- Added `previous_text` context support to ElevenLabsHttpTTSService, improving
speech consistency across sentences within an LLM response.
- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
- It is now possible to disable `SoundfileMixer` when created. You can then use
`MixerEnableFrame` to dynamically enable it when necessary.
- Added `on_client_connected` and `on_client_disconnected` event handlers to
the `DailyTransport` class. These handlers map to the same underlying Daily
events as `on_participant_joined` and `on_participant_left`, respectively.
This makes it easier to write a single bot pipeline that can also use other
transports like `SmallWebRTCTransport` and `FastAPIWebsocketTransport`.
### Changed
- `GrokLLMService` now uses `grok-3-beta` as its default model.
- Daily's REST helpers now include an `eject_at_token_exp` param, which ejects
the user when their token expires. This new parameter defaults to False.
Also, the default value for `enable_prejoin_ui` changed to False and
`eject_at_room_exp` changed to False.
- `OpenAILLMService` and `OpenPipeLLMService` now use `gpt-4.1` as their
default model.
- `SoundfileMixer` constructor arguments need to be keywords.
### Deprecated
- `DeepgramSTTService` parameter `url` is now deprecated, use `base_url`
instead.
### Removed
- Parameters `user_kwargs` and `assistant_kwargs` when creating a context
aggregator pair using `create_context_aggregator()` have been removed. Use
`user_params` and `assistant_params` instead.
### Fixed
- Fixed an issue that would cause TTS websocket-based services to not cleanup
resources properly when disconnecting.
- Fixed a `TavusVideoService` issue that was causing audio choppiness.
- Fixed an issue in `SmallWebRTCTransport` where an error was thrown if the
client did not create a video transceiver.
- Fixed an issue where LLM input parameters were not being applied
correctly in `GoogleVertexLLMService`, causing unexpected behavior during
inference.
### Other
- Updated the `twilio-chatbot` example to use the auto-hangup feature.
## [0.0.63] - 2025-04-11
### Added
- Added media resolution control to `GeminiMultimodalLiveLLMService` with
`GeminiMediaResolution` enum, allowing configuration of token usage for
image processing (LOW: 64 tokens, MEDIUM: 256 tokens, HIGH: zoomed reframing
with 256 tokens).
- Added Gemini's Voice Activity Detection (VAD) configuration to
`GeminiMultimodalLiveLLMService` with `GeminiVADParams`, allowing fine
control over speech detection sensitivity and timing, including:
- Start sensitivity (how quickly speech is detected)
- End sensitivity (how quickly turns end after pauses)
- Prefix padding (milliseconds of audio to keep before speech is detected)
- Silence duration (milliseconds of silence required to end a turn)
- Added comprehensive language support to `GeminiMultimodalLiveLLMService`,
supporting over 30 languages via the `language` parameter, with proper
mapping between Pipecat's `Language` enum and Gemini's language codes.
- Added support in `SmallWebRTCTransport` to detect when remote tracks are
muted.
- Added support for image capture from a video stream to the
`SmallWebRTCTransport`.
- Added a new iOS client option to the `SmallWebRTCTransport`
**video-transform** example.
- Added new processors `ProducerProcessor` and `ConsumerProcessor`. The
producer processor processes frames from the pipeline and decides whether the
consumers should consume it or not. If so, the same frame that is received by
the producer is sent to the consumer. There can be multiple consumers per
producer. These processors can be useful to push frames from one part of a
pipeline to a different one (e.g. when using `ParallelPipeline`).
- Improvements for the `SmallWebRTCTransport`:
- Wait until the pipeline is ready before triggering the `connected` event.
- Queue messages if the data channel is not ready.
- Update the aiortc dependency to fix an issue where the 'video/rtx' MIME
type was incorrectly handled as a codec retransmission.
- Avoid initial video delays.
### Changed
- In `GeminiMultimodalLiveLLMService`, removed the `transcribe_model_audio`
parameter in favor of Gemini Live's native output transcription support. Now
text transcriptions are produced directly by the model. No configuration is
required.
- Updated `GeminiMultimodalLiveLLMService`'s default `model` to
`models/gemini-2.0-flash-live-001` and `base_url` to the `v1beta` websocket
URL.
### Fixed
- Updated `daily-python` to 0.17.0 to fix an issue that prevented it from
running on older platforms.
- Fixed an issue where `CartesiaTTSService`'s spell feature would result in
the spelled word in the context appearing as "F,O,O,B,A,R" instead of
"FOOBAR".
- Fixed an issue in the Azure TTS services where the language was being set
incorrectly.
- Fixed `SmallWebRTCTransport` to support dynamic values for
`TransportParams.audio_out_10ms_chunks`. Previously, it only worked with 20ms
chunks.
- Fixed an issue with `GeminiMultimodalLiveLLMService` where the assistant
context messages had no space between words.
- Fixed an issue where `LLMAssistantContextAggregator` would prevent a
`BotStoppedSpeakingFrame` from moving through the pipeline.
## [0.0.62] - 2025-04-01 "An April Fools' release"
### Added
- Added `TransportParams.audio_out_10ms_chunks` parameter to allow controlling
the amount of audio being sent by the output transport. It defaults to 4, so
40ms audio chunks are sent.
- Added `QwenLLMService` for Qwen integration with an OpenAI-compatible
interface. Added foundational example `14q-function-calling-qwen.py`.
@@ -349,9 +53,6 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
### Changed
- `FunctionCallResultFrame`s are now system frames. This is to prevent function
call results from being discarded during interruptions.
- Pipecat services have been reorganized into packages. Each package can have
one or more of the following modules (in the future new module names might be
needed) depending on the services implemented:
@@ -368,8 +69,15 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
be found in
`pipecat.services.[ai_service,image_service,llm_service,stt_service,vision_service]`.
- `GladiaSTTService` now uses the `solaria-1` model by default. Other params
use Gladia's default values. Added support for more language codes.
- `GladiaSTTService` now uses Gladia's default values.
### Fixed
- Fixed an issue that would cause `SegmentedSTTService` based services
(e.g. `OpenAISTTService`) to try to transcribe non-spoken audio, causing
invalid transcriptions.
- Fixed an issue where `GoogleTTSService` was emitting two `TTSStoppedFrames`.
### Deprecated
@@ -388,31 +96,6 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
- Deprecated using `GladiaSTTService.InputParams` directly. Use the new
`GladiaInputParams` class instead.
### Fixed
- Fixed a `FastAPIWebsocketTransport` and `WebsocketClientTransport` issue that
would cause the transport to be closed prematurely, preventing the internally
queued audio to be sent. The same issue could also cause an infinite loop
while using an output mixer and when sending an `EndFrame`, preventing the bot
from finishing.
- Fixed an issue that could cause the `TranscriptionUpdateFrame` being pushed
because of an interruption to be discarded.
- Fixed an issue that would cause `SegmentedSTTService` based services
(e.g. `OpenAISTTService`) to try to transcribe non-spoken audio, causing
invalid transcriptions.
- Fixed an issue where `GoogleTTSService` was emitting two `TTSStoppedFrames`.
### Performance
- Output transports now send 40ms audio chunks instead of 20ms. This should
improve performance.
- `BotSpeakingFrame`s are now sent every 200ms. If the output transport audio chunks
are higher than 200ms then they will be sent at every audio chunk.
### Other
- Added foundational example `37-mem0.py` demonstrating how to use the

View File

@@ -26,52 +26,11 @@ git commit -m "Description of your changes"
git push origin your-branch-name
```
8. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
> Important: Describe the changes you've made clearly!
9. **Submit a Pull Request (PR)**: Open a PR from your forked repository to the main branch of this repo.
> Important: Describe the changes you've made clearly!
Our maintainers will review your PR, and once everything is good, your contributions will be merged!
## Code Style and Documentation
### Python Code Style
We use Ruff for code linting and formatting. Please ensure your code passes all linting checks before submitting a PR.
### Docstring Conventions
We follow Google-style docstrings with these specific conventions:
- Class docstrings should fully document all parameters used in `__init__`
- We don't require separate docstrings for `__init__` methods when parameters are documented in the class docstring
- Property methods should have docstrings explaining their purpose and return value
Example of correctly documented class:
```python
class MyClass:
"""Class description.
Additional details about the class.
Args:
param1: Description of first parameter.
param2: Description of second parameter.
"""
def __init__(self, param1, param2):
# No docstring required here as parameters are documented above
self.param1 = param1
self.param2 = param2
@property
def some_property(self) -> str:
"""Get the formatted property value.
Returns:
A string representation of the property.
"""
return f"Property: {self.param1}"
```
# Contributor Covenant Code of Conduct
@@ -92,23 +51,23 @@ diverse, inclusive, and healthy community.
Examples of behavior that contributes to a positive environment for our
community include:
- Demonstrating empathy and kindness toward other people
- Being respectful of differing opinions, viewpoints, and experiences
- Giving and gracefully accepting constructive feedback
- Accepting responsibility and apologizing to those affected by our mistakes,
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
- Focusing on what is best not just for us as individuals, but for the overall
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
- The use of sexualized language or imagery, and sexual attention or advances of
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
- Trolling, insulting or derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or email address,
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
- Other conduct which could reasonably be considered inappropriate in a
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
@@ -203,4 +162,4 @@ For answers to common questions about this code of conduct, see the FAQ at
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations
[translations]: https://www.contributor-covenant.org/translations

237
README.md
View File

@@ -1,72 +1,43 @@
<h1><div align="center">
<img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
 <img alt="pipecat" width="300px" height="auto" src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/pipecat.png">
</div></h1>
[![PyPI](https://img.shields.io/pypi/v/pipecat-ai)](https://pypi.org/project/pipecat-ai) ![Tests](https://github.com/pipecat-ai/pipecat/actions/workflows/tests.yaml/badge.svg) [![codecov](https://codecov.io/gh/pipecat-ai/pipecat/graph/badge.svg?token=LNVUIVO4Y9)](https://codecov.io/gh/pipecat-ai/pipecat) [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.pipecat.ai) [![Discord](https://img.shields.io/discord/1239284677165056021)](https://discord.gg/pipecat)
# 🎙️ Pipecat: Real-Time Voice & Multimodal AI Agents
Pipecat is an open source Python framework for building voice and multimodal conversational agents. It handles the complex orchestration of AI services, network transport, audio processing, and multimodal interactions, letting you focus on creating engaging experiences.
**Pipecat** is an open-source Python framework for building real-time voice and multimodal conversational agents. Orchestrate audio and video, AI services, different transports, and conversation pipelines effortlessly—so you can focus on what makes your agent unique.
## What you can build
## 🚀 What You Can Build
- **Voice Assistants**: [Natural, real-time conversations with AI](https://demo.dailybots.ai/)
- **Interactive Agents**: Personal coaches and meeting assistants
- **Multimodal Apps**: Combine voice, video, images, and text
- **Creative Tools**: [Story-telling experiences](https://storytelling-chatbot.fly.dev/) and social companions
- **Business Solutions**: [Customer intake flows](https://www.youtube.com/watch?v=lDevgsp9vn0) and support bots
- **Complex conversational flows**: [Refer to Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) to learn more
- **Voice Assistants** – natural, streaming conversations with AI
- **AI Companions** – coaches, meeting assistants, characters
- **Multimodal Interfaces** – voice, video, images, and more
- **Interactive Storytelling** – creative tools with generative media
- **Business Agents** – customer intake, support bots, guided flows
- **Complex Dialog Systems** – design logic with structured conversations
🧭 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
## 🧠 Why Pipecat?
- **Voice-first**: Integrates speech recognition, text-to-speech, and conversation handling
- **Pluggable**: Supports many AI services and tools
- **Composable Pipelines**: Build complex behavior from modular components
- **Real-Time**: Ultra-low latency interaction with different transports (e.g. WebSockets or WebRTC)
## 🎬 See it in action
## See it in action
<p float="left">
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="400" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="400" /></a>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/simple-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/simple-chatbot/image.png" width="280" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/storytelling-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/storytelling-chatbot/image.png" width="280" /></a>
<br/>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="400" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="400" /></a>
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/translation-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/translation-chatbot/image.png" width="280" /></a>&nbsp;
<a href="https://github.com/pipecat-ai/pipecat/tree/main/examples/moondream-chatbot"><img src="https://raw.githubusercontent.com/pipecat-ai/pipecat/main/examples/moondream-chatbot/image.png" width="280" /></a>
</p>
## 📱 Client SDKs
## Key features
You can connect to Pipecat from any platform using our official SDKs:
- **Voice-first Design**: Built-in speech recognition, TTS, and conversation handling
- **Flexible Integration**: Works with popular AI services (OpenAI, ElevenLabs, etc.)
- **Pipeline Architecture**: Build complex apps from simple, reusable components
- **Real-time Processing**: Frame-based pipeline architecture for fluid interactions
- **Production Ready**: Enterprise-grade WebRTC and Websocket support
| Platform | SDK Repo | Description |
| -------- | ------------------------------------------------------------------------------ | -------------------------------- |
| Web | [pipecat-client-web](https://github.com/pipecat-ai/pipecat-client-web) | JavaScript and React client SDKs |
| iOS | [pipecat-client-ios](https://github.com/pipecat-ai/pipecat-client-ios) | Swift SDK for iOS |
| Android | [pipecat-client-android](https://github.com/pipecat-ai/pipecat-client-android) | Kotlin SDK for Android |
| C++ | [pipecat-client-cxx](https://github.com/pipecat-ai/pipecat-client-cxx) | C++ client SDK |
💡 Looking to build structured conversations? Check out [Pipecat Flows](https://github.com/pipecat-ai/pipecat-flows) for managing complex conversational states and transitions.
## 🧩 Available services
## Getting started
| Category | Services |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
## ⚡ Getting started
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you're ready.
You can get started with Pipecat running on your local machine, then move your agent processes to the cloud when you're ready. You can also add a 📞 telephone number, 🖼️ image output, 📺 video input, use different LLMs, and more.
```shell
# Install the module
@@ -82,71 +53,155 @@ To keep things lightweight, only the core framework is included by default. If y
pip install "pipecat-ai[option,...]"
```
## 🧪 Code examples
### Available services
| Category | Services | Install Command Example |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | `pip install "pipecat-ai[openai]"` |
| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` |
| Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | `pip install "pipecat-ai[google]"` |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | `pip install "pipecat-ai[daily]"` |
| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | `pip install "pipecat-ai[tavus,simli]"` |
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | `pip install "pipecat-ai[mem0]"` |
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | `pip install "pipecat-ai[moondream]"` |
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | `pip install "pipecat-ai[silero]"` |
| Analytics & Metrics | [Canonical AI](https://docs.pipecat.ai/server/services/analytics/canonical), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | `pip install "pipecat-ai[canonical]"` |
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
## Code examples
- [Foundational](https://github.com/pipecat-ai/pipecat/tree/main/examples/foundational) — small snippets that build on each other, introducing one or two concepts at a time
- [Example apps](https://github.com/pipecat-ai/pipecat/tree/main/examples/) — complete applications that you can use as starting points for development
## 🛠️ Hacking on the framework itself
## A simple voice agent running locally
1. Set up a virtual environment before following these instructions. From the root of the repo:
Here is a very basic Pipecat bot that greets a user when they join a real-time session. We'll use [Daily](https://daily.co) for real-time media transport, and [Cartesia](https://cartesia.ai/) for text-to-speech.
```shell
python3 -m venv venv
source venv/bin/activate
```
```python
import asyncio
2. Install the development dependencies:
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport
```shell
pip install -r dev-requirements.txt
```
async def main():
# Use Daily as a real-time media transport (WebRTC)
transport = DailyTransport(
room_url=...,
token="", # leave empty. Note: token is _not_ your api key
bot_name="Bot Name",
params=DailyParams(audio_out_enabled=True))
3. Install the git pre-commit hooks (these help ensure your code follows project rules):
# Use Cartesia for Text-to-Speech
tts = CartesiaTTSService(
api_key=...,
voice_id=...
)
```shell
pre-commit install
```
# Simple pipeline that will process text to speech and output the result
pipeline = Pipeline([tts, transport.output()])
4. Install the `pipecat-ai` package locally in editable mode:
# Create Pipecat processor that can run one or more pipeline tasks
runner = PipelineRunner()
```shell
pip install -e .
```
# Assign the task callable to run the pipeline
task = PipelineTask(pipeline)
> The `-e` or `--editable` option allows you to modify the code without reinstalling.
# Register an event handler to play audio when a
# participant joins the transport WebRTC session
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
# Queue a TextFrame that will get spoken by the TTS service (Cartesia)
await task.queue_frame(TextFrame(f"Hello there, {participant_name}!"))
5. Include optional dependencies as needed. For example:
# Register an event handler to exit the application when the user leaves.
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.cancel()
```shell
pip install -e ".[daily,deepgram,cartesia,openai,silero]"
```
# Run the pipeline task
await runner.run(task)
6. (Optional) If you want to use this package from another directory:
if __name__ == "__main__":
asyncio.run(main())
```
```shell
pip install "path_to_this_repo[option,...]"
```
### Running tests
Install the test dependencies:
Run it with:
```shell
pip install -r test-requirements.txt
python app.py
```
Daily provides a prebuilt WebRTC user interface. While the app is running, you can visit `https://<yourdomain>.daily.co/<room_url>` and listen to the bot say hello!
## WebRTC for production use
WebSockets are fine for server-to-server communication or for initial development. But for production use, you'll need client-server audio to use a protocol designed for real-time media transport. (For an explanation of the difference between WebSockets and WebRTC, see [this post.](https://www.daily.co/blog/how-to-talk-to-an-llm-with-your-voice/#webrtc))
One way to get up and running quickly with WebRTC is to sign up for a Daily developer account. Daily gives you SDKs and global infrastructure for audio (and video) routing. Every account gets 10,000 audio/video/transcription minutes free each month.
Sign up [here](https://dashboard.daily.co/u/signup) and [create a room](https://docs.daily.co/reference/rest-api/rooms) in the developer Dashboard.
## Hacking on the framework itself
_Note: You may need to set up a virtual environment before following these instructions. From the root of the repo:_
```shell
python3 -m venv venv
source venv/bin/activate
```
Install the development dependencies:
```shell
pip install -r dev-requirements.txt
```
Install the git pre-commit hooks (these help ensure your code follows project rules):
```shell
pre-commit install
```
Install the `pipecat-ai` package locally in editable mode:
```shell
pip install -e .
```
The `-e` or `--editable` option allows you to modify the code without reinstalling.
To include optional dependencies, add them to the install command. For example:
```shell
pip install -e ".[daily,deepgram,cartesia,openai,silero]" # Updated for the services you're using
```
If you want to use this package from another directory:
```shell
pip install "path_to_this_repo[option,...]"
```
### Running tests
From the root directory, run:
```shell
pytest
```
### Setting up your editor
## Setting up your editor
This project uses strict [PEP 8](https://peps.python.org/pep-0008/) formatting via [Ruff](https://github.com/astral-sh/ruff).
#### Emacs
### Emacs
You can use [use-package](https://github.com/jwiegley/use-package) to install [emacs-lazy-ruff](https://github.com/christophermadsen/emacs-lazy-ruff) package and configure `ruff` arguments:
@@ -168,7 +223,7 @@ You can use [use-package](https://github.com/jwiegley/use-package) to install [e
:hook ((python-mode . pyvenv-auto-run)))
```
#### Visual Studio Code
### Visual Studio Code
Install the
[Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension. Then edit the user settings (_Ctrl-Shift-P_ `Open User Settings (JSON)`) and set it as the default Python formatter, and enable formatting on save:
@@ -180,7 +235,7 @@ Install the
}
```
#### PyCharm
### PyCharm
`ruff` was installed in the `venv` environment described before. To enable autoformatting on save, go to `File` -> `Settings` -> `Tools` -> `File Watchers` and add a new watcher with the following settings:
@@ -190,7 +245,7 @@ Install the
4. **Arguments**: `format $FilePath$`
5. **Program**: `$PyInterpreterDirectory$/ruff`
## 🤝 Contributing
## Contributing
We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or adding new features, here's how you can help:
@@ -203,7 +258,7 @@ Before submitting a pull request, please check existing issues and PRs to avoid
We aim to review all contributions promptly and provide constructive feedback to help get your changes merged.
## 🛟 Getting help
## Getting help
➡️ [Join our Discord](https://discord.gg/pipecat)

22
docs/ISSUE_TEMPLATE.md Normal file
View File

@@ -0,0 +1,22 @@
# Description
Is this reporting a bug or feature request?
If reporting a bug, please fill out the following:
### Environment
- pipecat-ai version:
- python version:
- OS:
### Issue description
Provide a clear description of the issue.
### Repro steps
List the steps to reproduce the issue.
### Expected behavior
### Actual behavior
### Logs

View File

@@ -50,6 +50,7 @@ autodoc_mock_imports = [
"pyht.protos",
"pyht.protos.api_pb2",
"pipecat_ai_playht", # PlayHT wrapper
"vllm",
"aiortc",
"aiortc.mediastreams",
"cv2",
@@ -75,6 +76,7 @@ autodoc_mock_imports = [
"openpipe",
"simli",
"soundfile",
# Existing mocks
"pipecat_ai_krisp",
"pyaudio",
"_tkinter",
@@ -85,66 +87,6 @@ autodoc_mock_imports = [
"pydantic.Field",
"pydantic._internal._model_construction",
"pydantic._internal._fields",
# Moondream dependencies
"torch",
"transformers",
"intel_extension_for_pytorch",
# Ultravox dependencies
"huggingface_hub",
"vllm",
"vllm.engine.arg_utils",
"transformers.AutoTokenizer",
# Langchain dependencies
"langchain_core",
"langchain_core.messages",
"langchain_core.runnables",
"langchain_core.messages.AIMessageChunk",
"langchain_core.runnables.Runnable",
# LiveKit dependencies
"livekit",
"livekit.rtc",
"livekit_api",
"livekit_protocol",
"tenacity",
"tenacity.retry",
"tenacity.stop_after_attempt",
"tenacity.wait_exponential",
"rtc",
"rtc.Room",
"rtc.RoomOptions",
"rtc.AudioSource",
"rtc.LocalAudioTrack",
"rtc.TrackPublishOptions",
"rtc.TrackSource",
"rtc.AudioStream",
"rtc.AudioFrameEvent",
"rtc.AudioFrame",
"rtc.Track",
"rtc.TrackKind",
"rtc.RemoteParticipant",
"rtc.RemoteTrackPublication",
"rtc.DataPacket",
# Riva dependencies
"riva",
"riva.client",
"riva.client.Auth",
"riva.client.ASRService",
"riva.client.StreamingRecognitionConfig",
"riva.client.RecognitionConfig",
"riva.client.AudioEncoding",
"riva.client.proto.riva_tts_pb2",
"riva.client.SpeechSynthesisService",
# Local CoreML Smart Turn dependencies
"coremltools",
"coremltools.models",
"coremltools.models.MLModel",
"torch",
"torch.nn",
"torch.nn.functional",
"transformers",
"transformers.AutoFeatureExtractor",
# Also add specific classes that are imported
"AutoFeatureExtractor",
]
# HTML output settings
@@ -176,25 +118,12 @@ def verify_modules():
},
}
# Skip importing modules that are in autodoc_mock_imports
skipped_modules = set(autodoc_mock_imports)
missing = []
for category, modules in required_modules.items():
if isinstance(modules, dict):
# Handle nested structure
for subcategory, submodules in modules.items():
for module in submodules:
# Check if module is in autodoc_mock_imports
if (
f"pipecat.{category}.{subcategory}.{module}" in skipped_modules
or module in skipped_modules
):
logger.info(
f"Skipping import of mocked module: pipecat.{category}.{subcategory}.{module}"
)
continue
try:
__import__(f"pipecat.{category}.{subcategory}.{module}")
logger.info(
@@ -208,11 +137,6 @@ def verify_modules():
else:
# Handle flat structure
for module in modules:
# Check if module is in autodoc_mock_imports
if f"pipecat.{category}.{module}" in skipped_modules or module in skipped_modules:
logger.info(f"Skipping import of mocked module: pipecat.{category}.{module}")
continue
try:
__import__(f"pipecat.{category}.{module}")
logger.info(f"Successfully imported pipecat.{category}.{module}")

View File

@@ -45,10 +45,8 @@ Transport & Serialization
Utilities
~~~~~~~~~
* :mod:`Adapters <pipecat.adapters>`
* :mod:`Clocks <pipecat.clocks>`
* :mod:`Metrics <pipecat.metrics>`
* :mod:`Observers <pipecat.observers>`
* :mod:`Sync <pipecat.sync>`
* :mod:`Transcriptions <pipecat.transcriptions>`
* :mod:`Utils <pipecat.utils>`
@@ -58,12 +56,10 @@ Utilities
:caption: API Reference
:hidden:
Adapters <api/pipecat.adapters>
Audio <api/pipecat.audio>
Clocks <api/pipecat.clocks>
Frames <api/pipecat.frames>
Metrics <api/pipecat.metrics>
Observers <api/pipecat.observers>
Pipeline <api/pipecat.pipeline>
Processors <api/pipecat.processors>
Serializers <api/pipecat.serializers>

View File

@@ -26,23 +26,20 @@ pipecat-ai[grok]
pipecat-ai[groq]
# pipecat-ai[krisp] # Mocked
pipecat-ai[koala]
# pipecat-ai[langchain] # Mocked
# pipecat-ai[livekit] # Mocked
pipecat-ai[langchain]
pipecat-ai[livekit]
pipecat-ai[lmnt]
pipecat-ai[local]
# pipecat-ai[local-smart-turn] # Mocked
# pipecat-ai[mem0] # Mocked
# pipecat-ai[mlx-whisper] # Mocked
# pipecat-ai[moondream] # Mocked
pipecat-ai[moondream]
pipecat-ai[nim]
# pipecat-ai[neuphonic] # Mocked
pipecat-ai[noisereduce]
pipecat-ai[openai]
# pipecat-ai[openpipe]
# pipecat-ai[playht] # Mocked due to grpcio conflict with riva
pipecat-ai[qwen]
pipecat-ai[remote-smart-turn]
# pipecat-ai[riva] # Mocked
pipecat-ai[riva]
pipecat-ai[silero]
pipecat-ai[simli]
pipecat-ai[soundfile]

View File

@@ -92,12 +92,4 @@ ASSEMBLYAI_API_KEY=...
OPENROUTER_API_KEY=...
# Piper
PIPER_BASE_URL=...
# Smart turn
LOCAL_SMART_TURN_MODEL_PATH=
FAL_SMART_TURN_API_KEY=...
# Twilio
TWILIO_ACCOUNT_SID=
TWILIO_AUTH_TOKEN=
PIPER_BASE_URL=...

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -43,7 +43,9 @@ async def main():
DailyParams(
audio_out_enabled=True,
audio_in_enabled=True,
video_out_enabled=False,
camera_out_enabled=False,
vad_enabled=True,
vad_audio_passthrough=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
#
@@ -70,7 +72,7 @@ async def main():
# voice_id="gD1IexrzCvsXPHUuT0s3",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{

View File

@@ -66,7 +66,9 @@ async def main():
DailyParams(
audio_out_enabled=True,
audio_in_enabled=True,
video_out_enabled=False,
camera_out_enabled=False,
vad_enabled=True,
vad_audio_passthrough=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
#
@@ -93,7 +95,7 @@ async def main():
# voice_id="gD1IexrzCvsXPHUuT0s3",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{

View File

@@ -41,7 +41,8 @@ async def main(room_url: str, token: str):
api_key=daily_api_key,
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=False,
camera_out_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
),
@@ -52,7 +53,7 @@ async def main(room_url: str, token: str):
voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{

View File

@@ -32,9 +32,9 @@ async def main(room_url: str, token: str):
token,
"bot",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
@@ -43,7 +43,7 @@ async def main(room_url: str, token: str):
api_key=os.getenv("CARTESIA_API_KEY", ""), voice_id="71a7ad14-091c-4e8e-a314-022ece01c121"
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{

View File

@@ -1,4 +1,5 @@
python-dotenv==1.0.1
modal==0.71.3
pipecat-ai[daily,silero,cartesia,openai]
pipecat-ai[daily,silero,cartesia,openai]==0.0.52
fastapi==0.115.6
aiohttp==3.11.11

View File

@@ -1,178 +0,0 @@
# Handling PSTN/SIP Dial-in on Pipecat Cloud
This repository contains two server implementations for handling
the pinless dial-in workflow in Pipecat Cloud. This is the companion to the
Pipecat Cloud [pstn_sip starter image](https://github.com/daily-co/pipecat-cloud-images/tree/main/pipecat-starters/pstn_sip).
In addition you can use `/api/dial` to trigger dial-out, and
eventually, call-transfers.
1. [FastAPI Server](fastapi-webhook-server/README.md) -
A FastAPI implementation that handles PSTN (Public Switched Telephone
Network) and SIP (Session Initiation Protocol) calls using the Daily API.
2. [Next.js Serverless](nextjs-webhook-server/README.md) -
A Next.js API implementation designed for deployment on Vercel's
serverless platform.
Both implementations provide:
- HMAC signature validation for pinless webhook
- Structured logging
- Support for dial-in and dial-out settings
- Voicemail detection and call transfer functionality (coming soon)
- Test request handling
## Choosing an Implementation
- Use the **FastAPI Server** if you:
- Need a standalone server
- Prefer Python and FastAPI
- Want to deploy to traditional hosting platforms
- Use the **Next.js Serverless** implementation if you:
- Want serverless deployment
- Prefer JavaScript/TypeScript
- Already use Next.js and Vercel for other projects
- Need quick scaling and zero maintenance
## Prerequisites
### Environment Variables
Both implementations require similar environment variables:
- `PIPECAT_CLOUD_API_KEY`: Pipecat Cloud API Key, begins with pk\_\*
- `AGENT_NAME`: Your Daily agent name
- `PINLESS_HMAC_SECRET`: Your HMAC secret for request verification
- `LOG_LEVEL`: (Optional) Logging level (defaults to 'info')
See the individual README files in each implementation directory for
specific setup instructions.
### Phone number setup
You can buy a phone number through the Pipecat Cloud Dashboard:
1. Go to `Settings` > `Telephony`
2. Follow the UI to purchase a phone number
3. Configure the webhook URL to receive incoming calls (e.g. `https://my-webhook-url.com/api/dial`)
Or purchase the number using Daily's
[PhoneNumbers API](https://docs.daily.co/reference/rest-api/phone-numbers).
```bash
curl --request POST \
--url https://api.daily.co/v1/domain-dialin-config \
--header "Authorization: Bearer $TOKEN" \
--header 'Content-Type: application/json' \
--data-raw '{
"type": "pinless_dialin",
"name_prefix": "Customer1",
"phone_number": "+1PURCHASED_NUM",
"room_creation_api": "https://example.com/api/dial",
"hold_music_url": "https://example.com/static/ringtone.mp3",
"timeout_config": {
"message": "No agent is available right now"
}
}'
```
The API will return a static SIP URI (`sip_uri`) that can be called
from other SIP services.
### `room_creation_api`
To make and receive calls currently you have to host a server that
handles incoming calls. In the coming weeks, incoming calls will be
directly handled within Daily and we will expose an endpoint similar
to `{service}/start` that will manage this for you.
In the meantime, the server described below serves as the webhook
handler for the `room_creation_api`. Configure your pinless phone
number or SIP interconnect to the `ngrok` tunnel or
the actual server URL, append `/api/dial` to the webhook URL.
## Example curl commands
Note: Replace `http://localhost:3000` with your actual server URL and
phone numbers with valid values for your use case.
### Dialin Request
The server will receive a request when a call is received from Daily.
### Dialout Request
Dial a number; any purchased number will be used for the call.
```bash
curl -X POST http://localhost:3000/api/dial \
-H "Content-Type: application/json" \
-d '{
"dialout_settings": [
{
"phoneNumber": "+1234567890"
}
]
}'
```
Dial a number with callerId, which is the UUID of a purchased number.
```bash
curl -X POST http://localhost:3000/api/dial \
-H "Content-Type: application/json" \
-d '{
"dialout_settings": [
{
"phoneNumber": "+1234567890",
"callerId": "purchased_phone_uuid"
}
]
}'
```
Dial a number
```bash
curl -X POST http://localhost:3000/api/dial \
-H "Content-Type: application/json" \
-d '{
"dialout_settings": [
{
"phoneNumber": "+1234567890",
"callerId": "purchased_phone_uuid"
}
]
}'
```
### Advanced Request with Voicemail Detection
```bash
curl -X POST http://localhost:3000/api/dial \
-H "Content-Type: application/json" \
-d '{
"To": "+1234567890",
"From": "+1987654321",
"callId": "call-uuid-123",
"callDomain": "domain-uuid-456",
"dialout_settings": [
{
"phoneNumber": "+1234567890",
"callerId": "purchased_phone_uuid"
}
],
"voicemail_detection": {
"testInPrebuilt": true
},
"call_transfer": {
"mode": "dialout",
"speakSummary": true,
"storeSummary": true,
"operatorNumber": "+1234567890",
"testInPrebuilt": true
}
}'
```

View File

@@ -1,98 +0,0 @@
# FastAPI server for handling Daily PSTN/SIP Webhook
A FastAPI server that handles PSTN (Public Switched Telephone Network) and SIP (Session Initiation Protocol) calls using the Daily API.
## Setup
1. Clone the repository
2. Navigate to the `fastapi-webhook-server` directory:
```bash
cd fastapi-webhook-server
```
3. Install dependencies:
```bash
pip install -r requirements.txt
```
4. Copy `env.example` to `.env`:
```bash
cp env.example .env
```
5. Update `.env` with your credentials:
- `AGENT_NAME`: Your Daily agent name
- `PIPECAT_CLOUD_API_KEY`: Your Pipecat Cloud API key (begins with `pk_`)
- `PINLESS_HMAC_SECRET`: Your HMAC secret for request verification
## Running the Server
Start the server:
```bash
python server.py
```
The server will run on `http://localhost:7860` and you can expose it via ngrok for testing:
```bash
ngrok http 7860
```
> Tip: Use a subdomain for a consistent URL (e.g. `ngrok http -subdomain=mydomain http://localhost:7860`)
## API Endpoints
### GET /
Health check endpoint that returns a "Hello, World!" message.
### POST /api/dial
Initiates a PSTN/SIP call with the following request body format:
```json
{
"To": "+14152251493",
"From": "+14158483432",
"callId": "string-contains-uuid",
"callDomain": "string-contains-uuid",
"dialout_settings": [
{
"phoneNumber": "+14158483432",
"callerId": "+14152251493"
}
],
"voicemail_detection": {
"testInPrebuilt": true
},
"call_transfer": {
"mode": "dialout",
"speakSummary": true,
"storeSummary": true,
"operatorNumber": "+14152250006",
"testInPrebuilt": true
}
}
```
#### Response
Returns a JSON object containing:
- `status`: Success/failure status
- `data`: Response from Daily API
- `room_properties`: Properties of the created Daily room
## Error Handling
- 401: Invalid signature
- 400: Invalid authorization header (e.g. missing Daily API key in bot.py)
- 405: Method not allowed (e.g. incorrect route on the webhook URL)
- 500: Server errors (missing API key, network issues)
- Other status codes are passed through from the Daily API

View File

@@ -1,3 +0,0 @@
AGENT_NAME="your-agent-name"
PIPECAT_CLOUD_API_KEY="your-daily-api-key"
PINLESS_HMAC_SECRET="hmac-secret-pinless-dialin"

View File

@@ -1,6 +0,0 @@
fastapi
uvicorn
python-dotenv
requests
pydantic
loguru

View File

@@ -1,202 +0,0 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
# server.py
import base64 # for calculating hmac signature
import hmac
import os # for accessing environment variables
import time # for setting expiration time
from typing import Any, Dict, List, Optional
import requests
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from loguru import logger
from pydantic import BaseModel, Field
load_dotenv(override=True)
app = FastAPI()
class RoomRequest(BaseModel):
    """JSON body accepted by ``POST /api/dial``.

    Every field is optional. ``dial`` only builds dial-in settings when
    ``To``, ``From``, ``callId`` and ``callDomain`` are all present, and it
    forwards the three settings dicts to the agent unchanged.
    """

    # When this equals "test", ``dial`` replies with a success message and does nothing else.
    test: Optional[str] = Field(None, alias="Test", description="Test field")
    To: Optional[str] = Field(None, alias="to", description="Destination phone number")
    From: Optional[str] = Field(None, alias="from", description="Source phone number")
    callId: Optional[str] = Field(None, alias="call_id", description="Unique call identifier")
    callDomain: Optional[str] = Field(
        None, alias="call_domain", description="Call domain identifier"
    )
    dialout_settings: Optional[List[Dict[str, Any]]] = Field(
        None, description="An array of phone numbers or SIP URIs to dialout to"
    )
    voicemail_detection: Optional[Dict[str, Any]] = Field(
        None, description="A flag to perform voicemail or answering-machine detection"
    )
    call_transfer: Optional[Dict[str, Any]] = Field(None, description="to initiate a call transfer")

    class Config:
        # Accept each value under either its field name (e.g. "callId") or its alias ("call_id").
        populate_by_name = True
        # No automatic alias generation; only the explicit aliases above apply.
        alias_generator = None
"""
body can contain any fields, but for handling PSTN/SIP,
we recommend sending the following custom values:
dialin, dialout, voicemail detection, and call transfer
"To": "+14152251493",
"From": "+14158483432",
"callId": "string-contains-uuid",
"callDomain": "string-contains-uuid"
These need to be remapped to dialin_settings
"dialout_settings": [
{"phoneNumber": "+14158483432", "callerId": "+14152251493"},
{"sipUri": "sip:username@sip.hostname"}
],
},
voicemail_detection:{
testInPrebuilt: true
},
"call_transfer": {
"mode": "dialout",
"speakSummary": true,
"storeSummary": true,
"operatorNumber": "+14152250006",
"testInPrebuilt": true
}
"""
@app.get("/")
async def read_root():
    """Return a fixed greeting; usable as a simple liveness check."""
    return {"message": "Hello, World!"}
@app.post("/api/dial")
async def dial(request: RoomRequest, raw_request: Request):
    """Validate a dial request and start a Pipecat Cloud agent session for it.

    The handler logs the request, verifies the pinless HMAC signature when a
    secret and both signature headers are available, answers webhook test
    pings, builds the Daily room properties plus the agent payload, and POSTs
    them to the ``/start`` endpoint of the agent named by ``AGENT_NAME``.

    Args:
        request: Parsed JSON body.
        raw_request: Underlying request; supplies the headers and the exact
            body text the signature is computed over.

    Returns:
        For test requests, a short success dict. Otherwise a dict with
        ``status``, the upstream response under ``data`` and the requested room
        properties under ``room_properties``.

    Raises:
        HTTPException: 401 when the signature does not match; 500 when
            ``PIPECAT_CLOUD_API_KEY`` is unset or the upstream request fails
            without an HTTP response; otherwise the upstream HTTP error status.
    """
    logger.info("Incoming request to /dial:")
    logger.info(f"Headers: {dict(raw_request.headers)}")

    raw_body = await raw_request.body()
    raw_body_str = raw_body.decode()

    logger.info(f"Raw body: {raw_body_str}")
    logger.info(f"Parsed body: {request.dict()}")

    # calculate signature and compare/verify
    hmac_secret = os.getenv("PINLESS_HMAC_SECRET")
    timestamp = raw_request.headers.get("x-pinless-timestamp")
    signature = raw_request.headers.get("x-pinless-signature")

    if not hmac_secret:
        logger.debug("Skipping HMAC validation - PINLESS_HMAC_SECRET not set")
    elif timestamp and signature:
        message = timestamp + "." + raw_body_str
        base64_decoded_secret = base64.b64decode(hmac_secret)
        computed_signature = base64.b64encode(
            hmac.new(base64_decoded_secret, message.encode(), "sha256").digest()
        ).decode()

        # Constant-time comparison on bytes: avoids leaking how many leading
        # characters matched, and tolerates non-ASCII input in the header.
        if not hmac.compare_digest(computed_signature.encode(), signature.encode()):
            # Never log the computed value: it is the valid signature for this body.
            logger.error("Invalid signature on incoming request")
            raise HTTPException(status_code=401, detail="Invalid signature")
    else:
        # NOTE(review): with a secret configured, a caller can bypass validation
        # simply by omitting the signature headers. Confirm this is intended.
        logger.debug("Skipping HMAC validation - no signature headers present")

    if request.test == "test":
        logger.debug("Test request received")
        return {"status": "success", "message": "Test request received"}

    dialin_settings = None
    # these fields are camelCase in the request
    required_fields = ["To", "From", "callId", "callDomain"]
    if all(getattr(request, field) is not None for field in required_fields):
        # transform from camelCase to snake_case because daily-python expects snake_case
        dialin_settings = {
            "From": request.From,
            "To": request.To,
            "call_id": request.callId,
            "call_domain": request.callDomain,
        }
        logger.debug(f"Populated dialin_settings from request: {dialin_settings}")

    daily_room_properties = {
        "enable_dialout": request.dialout_settings is not None,
    }

    if dialin_settings is not None:
        sip_config = {
            "display_name": request.From,
            "sip_mode": "dial-in",
            # A second endpoint is requested when a call transfer is configured.
            "num_endpoints": 2 if request.call_transfer is not None else 1,
            "codecs": {"audio": ["OPUS"]},
        }
        daily_room_properties["sip"] = sip_config

    # Setting default expiry to 5 minutes from now
    daily_room_properties["exp"] = int(time.time()) + (5 * 60)

    logger.debug(f"Daily room properties: {daily_room_properties}")

    payload = {
        "createDailyRoom": True,
        "dailyRoomProperties": daily_room_properties,
        "body": {
            "dialin_settings": dialin_settings,
            "dialout_settings": request.dialout_settings,
            "voicemail_detection": request.voicemail_detection,
            "call_transfer": request.call_transfer,
        },
    }

    pcc_api_key = os.getenv("PIPECAT_CLOUD_API_KEY")
    agent_name = os.getenv("AGENT_NAME", "my-first-agent")

    if not pcc_api_key:
        raise HTTPException(
            status_code=500, detail="PIPECAT_CLOUD_API_KEY environment variable is not set"
        )

    headers = {"Authorization": f"Bearer {pcc_api_key}", "Content-Type": "application/json"}
    url = f"https://api.pipecat.daily.co/v1/public/{agent_name}/start"

    # The headers carry the bearer token, so they are deliberately left out of the log.
    logger.debug(f"Making API call to Daily: {url} {payload}")

    try:
        # Bound the wait so a stalled upstream cannot hang this request forever.
        # NOTE(review): requests is synchronous and blocks the event loop while
        # waiting; consider moving the call to a worker thread or an async client.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        response_data = response.json()
        logger.debug(f"Response: {response_data}")
        return {
            "status": "success",
            "data": response_data,
            "room_properties": daily_room_properties,
        }
    except requests.exceptions.HTTPError as e:
        # Pass through the status code and error details from the Daily API
        status_code = e.response.status_code
        if e.response.content:
            try:
                error_detail = e.response.json()
            except ValueError:
                # Error body was not JSON (e.g. an HTML error page); fall back to its text.
                error_detail = e.response.text
        else:
            error_detail = str(e)
        logger.error(f"HTTP error: {error_detail}")
        raise HTTPException(status_code=status_code, detail=error_detail) from e
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e
# Script entry point: serve the app with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    try:
        # Imported here so uvicorn is only required when this file is run directly.
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
    except KeyboardInterrupt:
        # Log a manual interrupt instead of surfacing a traceback.
        logger.info("Server stopped manually")

View File

@@ -1,53 +0,0 @@
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
# IDE specific files
.idea/
.vscode/
*.swp
*.swo
# Logs
logs
*.log
# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

View File

@@ -1,115 +0,0 @@
# Next.js server for handling Daily PSTN/SIP Webhook
Next.js API routes for handling Daily PSTN/SIP Pipecat requests.
## Features
- API endpoint for handling Daily PSTN/SIP Pipecat requests
- HMAC signature validation
- Structured logging with Pino
- Support for dial-in and dial-out settings
- Voicemail detection and call transfer functionality
- Test request handling
## Setup
1. Clone the repository
2. Navigate to the `nextjs-webhook-server` directory:
```bash
cd nextjs-webhook-server
```
3. Install dependencies:
```bash
npm install
```
4. Create `.env.local` file with your credentials:
```bash
cp env.local.example .env.local
```
5. Update your `.env.local` with your secrets:
```bash
PIPECAT_CLOUD_API_KEY=pk_*
AGENT_NAME=my-first-agent
PINLESS_HMAC_SECRET=your_hmac_secret
LOG_LEVEL=info
```
### Running the server
Run the development server:
```bash
npm run dev
```
The server will run on `http://localhost:7860` and you can expose it via ngrok for testing:
```bash
ngrok http 7860
```
> Tip: Use a subdomain for a consistent URL (e.g. `ngrok http -subdomain=mydomain http://localhost:7860`)
## API Endpoints
### GET /api
Returns a simple "Hello, World!" message with a cute cat emoji to verify the server is running.
### POST /api/dial
Handles dial-in and dial-out requests for Pipecat Cloud.
#### Test Requests
The endpoint handles test requests when a webhook is configured. Send a request with `"Test": "test"` to verify your setup:
```json
{
"Test": "test"
}
```
#### Production Request Format
```json
{
// for dial-in from webhook
"To": "+14152251493",
"From": "+14158483432",
"callId": "string-contains-uuid",
"callDomain": "string-contains-uuid",
// for making a dial out to a phone or SIP
"dialout_settings": [
{ "phoneNumber": "+14158483432", "callerId": "purchased_phone_uuid" },
{ "sipUri": "sip:username@sip.hostname.com" }
]
}
```
## Deployment
The application is configured for Vercel deployment:
1. Push your code to a Git repository
2. Import your project in Vercel dashboard
3. Configure environment variables:
- `PIPECAT_CLOUD_API_KEY`
- `AGENT_NAME`
- `PINLESS_HMAC_SECRET`
- `LOG_LEVEL` (optional, defaults to 'info')
4. Deploy!
## Security
- HMAC signature validation for request authentication
- Environment variables for sensitive credentials
- Method validation (POST only for /dial)

View File

@@ -1,4 +0,0 @@
AGENT_NAME=my-first-agent
PIPECAT_CLOUD_API_KEY=your_daily_api_key
PINLESS_HMAC_SECRET=your_hmac_secret
LOG_LEVEL="info"

View File

@@ -1,22 +0,0 @@
{
"name": "my-daily-app",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev -p 7860",
"build": "next build",
"start": "next start -p 7860",
"lint": "next lint"
},
"dependencies": {
"axios": "^1.6.0",
"next": "^14.0.0",
"pino": "^8.15.0",
"react": "^18.2.0",
"react-dom": "^18.2.0"
},
"devDependencies": {
"eslint": "^8.46.0",
"eslint-config-next": "^14.0.0"
}
}

View File

@@ -1,176 +0,0 @@
import { logger } from '../../lib/utils';
import axios from 'axios';
import crypto from 'crypto';
/**
 * Verify the pinless webhook HMAC signature.
 *
 * The expected value is base64(HMAC-SHA256(base64-decoded secret, `${timestamp}.${body}`)).
 *
 * @param {string} body - Request body text that was signed.
 * @param {string} signature - Value of the `x-pinless-signature` header.
 * @param {string} timestamp - Value of the `x-pinless-timestamp` header.
 * @param {string} secret - Base64-encoded HMAC secret.
 * @returns {boolean} `true` when the signature matches, or when validation is
 *   skipped because an input is missing; `false` on mismatch or on any error.
 */
const validateSignature = (body, signature, timestamp, secret) => {
  // Skip if any required fields are missing (e.g. no secret configured)
  if (!signature || !timestamp || !secret) {
    logger.warn('Missing required fields for HMAC validation');
    return true;
  }

  try {
    const decodedSecret = Buffer.from(secret, 'base64');
    const hmac = crypto.createHmac('sha256', decodedSecret);
    const signatureData = `${timestamp}.${body}`;
    const computedSignature = hmac.update(signatureData).digest('base64');

    // The computed signature is intentionally not logged: it is the valid
    // signature for this payload.
    logger.debug('Signature validation:', {
      timestamp,
      signatureData: signatureData.substring(0, 50) + '...',
      receivedSignature: signature
    });

    const expected = Buffer.from(computedSignature);
    const received = Buffer.from(String(signature));

    // timingSafeEqual throws on length mismatch, so check lengths first.
    return expected.length === received.length && crypto.timingSafeEqual(expected, received);
  } catch (error) {
    logger.error('Error validating signature:', error);
    // Fail closed: a request that cannot be verified must not be accepted.
    return false;
  }
};
/**
 * POST /api/dial: validate a dial-in/dial-out request and start a Pipecat
 * Cloud agent session for it.
 *
 * Responds 405 for non-POST methods, 401 on an invalid HMAC signature, 200
 * for webhook test pings and successful starts, the upstream status when the
 * start call returns an HTTP error, and 500 for anything else.
 *
 * @param {object} req - Incoming API request; `req.body` is the parsed JSON body.
 * @param {object} res - API response used to send the JSON reply.
 */
export default async function handler(req, res) {
  // Only allow POST requests
  if (req.method !== 'POST') {
    return res.status(405).json({ error: 'Method not allowed' });
  }

  try {
    logger.info('Incoming request to /api/dial:');
    logger.info(`Headers: ${JSON.stringify(req.headers)}`);

    // NOTE(review): req.body has already been parsed, so this is a
    // re-serialisation rather than the bytes that were signed. Validation
    // succeeds only if both serialisations are identical; confirm, or read the
    // raw request stream instead.
    const rawBody = JSON.stringify(req.body);
    logger.info(`Raw body: ${rawBody}`);

    const signature = req.headers['x-pinless-signature'];
    const timestamp = req.headers['x-pinless-timestamp'];

    if (signature && timestamp) {
      logger.info('Validating HMAC signature');
      if (!validateSignature(rawBody, signature, timestamp, process.env.PINLESS_HMAC_SECRET)) {
        logger.error('Invalid HMAC signature', { signature, timestamp });
        return res.status(401).json({
          error: 'Invalid signature',
          message: 'Invalid HMAC signature'
        });
      }
    } else {
      logger.info('Skipping HMAC validation - no signature headers present');
    }

    // Extract request data
    const {
      Test: test,
      To,
      From,
      callId,
      callDomain,
      dialout_settings,
      voicemail_detection,
      call_transfer
    } = req.body;

    // Handle test requests when a webhook is configured
    if (test === 'test') {
      logger.debug('Test request received');
      return res.status(200).json({ status: 'success', message: 'Test request received' });
    }

    // Process dialin settings
    let dialin_settings = null;
    const requiredFields = ['To', 'From', 'callId', 'callDomain'];

    if (requiredFields.every(field => req.body[field] !== undefined && req.body[field] !== null)) {
      dialin_settings = {
        // snake_case because pipecat expects this format
        From,
        To,
        call_id: callId,
        call_domain: callDomain,
      };
      logger.debug(`Populated dialin_settings from request: ${JSON.stringify(dialin_settings)}`);
    }

    // Set up Daily room properties
    const daily_room_properties = {
      enable_dialout: dialout_settings !== undefined && dialout_settings !== null,
      exp: Math.floor(Date.now() / 1000) + (5 * 60), // 5 minutes from now
    };

    // Configure SIP if dialin settings are provided
    if (dialin_settings !== null) {
      const sip_config = {
        display_name: From,
        sip_mode: 'dial-in',
        // `!= null` covers both null and undefined: an omitted call_transfer is
        // undefined after destructuring and must not request a second endpoint.
        num_endpoints: call_transfer != null ? 2 : 1,
        codecs: {"audio": ["OPUS"]},
      };
      daily_room_properties.sip = sip_config;
    }

    // Prepare payload for {service}/start API call
    const payload = {
      createDailyRoom: true,
      dailyRoomProperties: daily_room_properties,
      body: {
        dialin_settings,
        dialout_settings,
        voicemail_detection,
        call_transfer,
      },
    };

    logger.debug(`Daily room properties: ${JSON.stringify(daily_room_properties)}`);

    // Get the Pipecat Cloud API key and agent name from environment variables
    const pccApiKey = process.env.PIPECAT_CLOUD_API_KEY;
    const agentName = process.env.AGENT_NAME || 'my-first-agent';

    if (!pccApiKey) {
      throw new Error('PIPECAT_CLOUD_API_KEY environment variable is not set');
    }

    // Set up headers for the start API call
    const headers = {
      'Authorization': `Bearer ${pccApiKey}`,
      'Content-Type': 'application/json',
    };

    const url = `https://api.pipecat.daily.co/v1/public/${agentName}/start`;

    // Redact the bearer token so the API key never reaches the logs.
    const loggedHeaders = { ...headers, Authorization: 'Bearer [redacted]' };
    logger.debug(`Making API call to Daily: ${url} ${JSON.stringify(loggedHeaders)} ${JSON.stringify(payload)}`);

    try {
      const response = await axios.post(url, payload, { headers });
      logger.debug(`Response: ${JSON.stringify(response.data)}`);

      return res.status(200).json({
        status: 'success',
        data: response.data,
        room_properties: daily_room_properties,
      });
    } catch (error) {
      if (error.response) {
        // Pass through status code and error details from the upstream API
        const statusCode = error.response.status;
        const errorDetail = error.response.data || error.message;
        logger.error(`HTTP error: ${JSON.stringify(errorDetail)}`);
        return res.status(statusCode).json(errorDetail);
      } else {
        logger.error(`Request error: ${error.message}`);
        return res.status(500).json({ error: error.message });
      }
    }
  } catch (error) {
    logger.error(`Unexpected error: ${error.message}`);
    return res.status(500).json({ error: 'Internal server error', message: error.message });
  }
}
// Route-level API config: keeps the built-in body parser enabled and caps the
// request body at 1 MB. This does not expose the raw body text; the handler
// still receives the parsed object in req.body.
export const config = {
  api: {
    bodyParser: {
      sizeLimit: '1mb',
    },
  },
};

View File

@@ -1,6 +0,0 @@
import { logger } from '../../lib/utils';
/**
 * /api handler: logs the request and replies 200 with a fixed greeting,
 * which makes it usable as a liveness check. Every HTTP method gets the same reply.
 */
export default function handler(req, res) {
  logger.info('Received request to /api');
  res.status(200).json({ message: 'Hello, World! from ᓚᘏᗢ' });
}

View File

@@ -1,6 +0,0 @@
// Deployment settings exported as a plain object: build with `next build` and
// serve the output from `.next`.
// NOTE(review): presumably consumed by Vercel, judging by the key names; confirm.
module.exports = {
  version: 2,
  buildCommand: "next build",
  outputDirectory: ".next",
  cleanUrls: true
};

View File

@@ -50,9 +50,9 @@ async def main(room_url: str, token: str):
token,
"bot",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
@@ -61,7 +61,7 @@ async def main(room_url: str, token: str):
api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22"
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{

View File

@@ -1,51 +0,0 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
dist/
*.egg-info/
.installed.cfg
*.egg
.pytest_cache/
.coverage
.coverage.*
.env
.venv
env/
venv/
ENV/
.mypy_cache/
.dmypy.json
dmypy.json
# JavaScript/Node.js
node_modules/
dist/
dist-ssr/
*.local
.env.local
.env.development.local
.env.test.local
.env.production.local
# Logs
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
# Editor/IDE
.vscode/*
!.vscode/extensions.json
.idea/
*.swp
*.swo
.DS_Store
# Project specific
runpod.toml

View File

@@ -1,152 +0,0 @@
# Smart Turn Detection Demo
This demo showcases Pipecat's Smart Turn Detection feature - an advanced conversational turn detection system that uses machine learning to identify when a speaker has finished their turn in a conversation. Unlike basic Voice Activity Detection (VAD) which only detects speech vs. silence, Smart Turn detects natural conversational cues like intonation patterns, pacing, and linguistic signals.
This demo uses the [pipecat-ai/smart-turn](https://huggingface.co/pipecat-ai/smart-turn) model - an open-source, community-driven conversational turn detection model designed to provide more natural turn-taking in voice interactions. The model is being hosted on Fal's infrastructure for GPU acceleration, offering inference times between 50-60ms.
In the client UI, you can see the transcription messages along with the smart-turn model's prediction results in real-time.
## Try the demo
Try the hosted version of the demo here: https://pcc-smart-turn.vercel.app/.
## Run the demo locally
### Run the Server
1. Set up and activate your virtual environment:
```bash
python3 -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Create your .env file and set your env vars:
```bash
cp env.example .env
```
Keys to provide:
- GOOGLE_API_KEY
- CARTESIA_API_KEY
- DEEPGRAM_API_KEY
- DAILY_API_KEY
- FAL_SMART_TURN_API_KEY
4. Run the server:
```bash
LOCAL=1 python server.py
```
### Run the client
1. Open a new terminal and navigate to the client directory:
```bash
cd client
```
2. Install dependencies:
```bash
npm install
```
3. Create your .env.local file:
```bash
cp env.local.example .env.local
```
> Note: No keys need to be modified. `NEXT_PUBLIC_API_BASE_URL` is already configured for local use.
4. Start the development server:
```bash
npm run dev
```
5. Open [http://localhost:3000](http://localhost:3000) in your browser.
## Deploy the app
### Deploy the server to Pipecat Cloud
1. Navigate to server
```bash
cd server
```
2. You should already have a .env set up from running locally. If not, do that now.
3. Update your build and deploy scripts.
- In build.sh, set `DOCKER_USERNAME` and `AGENT_NAME`.
- In pcc-deploy.toml, set `image`, which specifies where your Docker image is stored.
4. Build your Docker image by running the build script:
```bash
./build.sh
```
> Note: This builds, tags and pushes your docker image and assumes Docker Hub is the container registry.
5. Make sure you have the Pipecat Cloud CLI installed:
```bash
pip install pipecatcloud
```
6. Login via the Pipecat Cloud CLI:
```bash
pcc auth login
```
> Note: If you don't have an account, sign up at https://pipecat.daily.co.
7. Add a secrets set:
```bash
pcc secrets set pcc-smart-turn-secrets --file .env
```
8. Deploy your agent:
```bash
pcc deploy
```
> Note: This uses your pcc-deploy.toml settings. Modify as needed.
### Deploy the client to Vercel
This project uses TypeScript, React, and Next.js, making it a perfect fit for [Vercel](https://vercel.com/).
- In your client directory, install Vercel's CLI tool: `npm install -g vercel`
- Verify it's installed using `vercel --version`
- Log in to your Vercel account using `vercel login`
- Deploy your client to Vercel using `vercel`
Follow the vercel prompts to deploy your project.
### Test your deployed app
Now with the client and server deployed, you can join the call using your Vercel URL.
See the debug information for the Smart Turn data. It prints a log line for each smart-turn inference:
```
Smart Turn: COMPLETE, Probability: 95.3%, Model inference: 65.23ms, Server processing: 82.09ms, End-to-end: 245.43ms
```

View File

@@ -1,41 +0,0 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# env files (can opt-in for committing if needed)
.env*
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

View File

@@ -1,3 +0,0 @@
NEXT_PUBLIC_API_BASE_URL=http://localhost:7860
PIPECAT_CLOUD_API_KEY=
AGENT_NAME=pcc-smart-turn

View File

@@ -1,16 +0,0 @@
import { dirname } from "path";
import { fileURLToPath } from "url";
import { FlatCompat } from "@eslint/eslintrc";
// Rebuild __filename/__dirname from import.meta.url so that FlatCompat can be
// given this file's directory as its base directory.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const compat = new FlatCompat({
  baseDirectory: __dirname,
});
// ESLint config array: spreads in the "next/core-web-vitals" and
// "next/typescript" presets obtained through FlatCompat's extends().
const eslintConfig = [
  ...compat.extends("next/core-web-vitals", "next/typescript"),
];
export default eslintConfig;

View File

@@ -1,7 +0,0 @@
import type { NextConfig } from "next";
// Next.js configuration for the client. Currently empty, so all defaults apply.
const nextConfig: NextConfig = {
  /* config options here */
};
export default nextConfig;

File diff suppressed because it is too large Load Diff

View File

@@ -1,28 +0,0 @@
{
"name": "my-nextjs-app",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.3.5",
"@pipecat-ai/client-react": "^0.3.5",
"@pipecat-ai/daily-transport": "^0.3.10",
"next": "15.3.1",
"react": "^19.0.0",
"react-dom": "^19.0.0"
},
"devDependencies": {
"@eslint/eslintrc": "^3",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"eslint": "^9",
"eslint-config-next": "15.2.3",
"typescript": "^5"
}
}

View File

@@ -1,7 +0,0 @@
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M3.3088 5.05615C3.64682 4.92779 4.02833 5.02411 4.26653 5.29797L7.36884 8.86461H16.6312L19.7335 5.29797C19.9717 5.02411 20.3532 4.92779 20.6912 5.05615C21.0292 5.18452 21.253 5.51072 21.253 5.87504V13.75H24V15.5H19.5181V8.19909L17.6762 10.3167C17.5115 10.506 17.2738 10.6146 17.0241 10.6146H6.9759C6.72616 10.6146 6.48854 10.506 6.32383 10.3167L4.48193 8.19909V15.5H0V13.75H2.74699V5.87504C2.74699 5.51072 2.97078 5.18452 3.3088 5.05615Z" fill="black"/>
<path d="M19.5181 17.25H24V19H19.5181V17.25Z" fill="black"/>
<path d="M0 17.25H4.48193V19H0V17.25Z" fill="black"/>
<path d="M9.25301 14.3333C9.25301 14.9777 8.73517 15.5 8.09639 15.5C7.4576 15.5 6.93976 14.9777 6.93976 14.3333C6.93976 13.689 7.4576 13.1667 8.09639 13.1667C8.73517 13.1667 9.25301 13.689 9.25301 14.3333Z" fill="black"/>
<path d="M17.0602 14.3333C17.0602 14.9777 16.5424 15.5 15.9036 15.5C15.2648 15.5 14.747 14.9777 14.747 14.3333C14.747 13.689 15.2648 13.1667 15.9036 13.1667C16.5424 13.1667 17.0602 13.689 17.0602 14.3333Z" fill="black"/>
</svg>

Before

Width:  |  Height:  |  Size: 1.1 KiB

View File

@@ -1,44 +0,0 @@
import { NextResponse, NextRequest } from 'next/server';
/**
 * Start a Pipecat Cloud agent session and return its connection details.
 *
 * Reads optional `MY_CUSTOM_DATA` from the JSON request body, asks the
 * `/start` endpoint of the agent named by `AGENT_NAME` to create a Daily room,
 * and maps the reply to `{ room_url, token }`.
 *
 * Responds 400 when the body is not valid JSON, and 500 when the server is not
 * configured or the upstream call fails.
 */
export async function POST(request: NextRequest) {
  let MY_CUSTOM_DATA: unknown;
  try {
    // Parsed inside a try so a malformed body yields a clean 400 instead of an
    // unhandled exception.
    ({ MY_CUSTOM_DATA } = await request.json());
  } catch (error) {
    console.error('Invalid request body:', error);
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  try {
    const agentName = process.env.AGENT_NAME;
    const apiKey = process.env.PIPECAT_CLOUD_API_KEY;
    if (!agentName || !apiKey) {
      // Without this check the URL would contain "undefined" and the token
      // would be "Bearer undefined".
      throw new Error('AGENT_NAME or PIPECAT_CLOUD_API_KEY is not set');
    }

    const response = await fetch(
      `https://api.pipecat.daily.co/v1/public/${agentName}/start`,
      {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          // Create Daily room
          createDailyRoom: true,
          // Optionally set Daily room properties
          dailyRoomProperties: { start_video_off: true },
          // Optionally pass custom data to the bot
          body: { MY_CUSTOM_DATA },
        }),
      }
    );

    if (!response.ok) {
      throw new Error(`API responded with status: ${response.status}`);
    }

    const data = await response.json();

    // Transform the response to match what RTVI client expects
    return NextResponse.json({
      room_url: data.dailyRoom,
      token: data.dailyToken,
    });
  } catch (error) {
    console.error('API error:', error);
    return NextResponse.json(
      { error: 'Failed to start agent' },
      { status: 500 }
    );
  }
}

View File

@@ -1,82 +0,0 @@
body {
margin: 0;
padding: 20px;
font-family: Arial, sans-serif;
background-color: #f0f0f0;
}
.app {
max-width: 1200px;
margin: 0 auto;
}
.status-bar {
display: flex;
justify-content: space-between;
align-items: center;
padding: 10px;
background-color: #fff;
border-radius: 8px;
margin-bottom: 20px;
}
.controls button {
padding: 8px 16px;
margin-left: 10px;
border: none;
border-radius: 4px;
cursor: pointer;
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.connect-btn {
background-color: #4caf50;
color: white;
}
.disconnect-btn {
background-color: #f44336;
color: white;
}
.main-content {
background-color: #fff;
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.bot-container {
display: flex;
flex-direction: column;
align-items: center;
}
.video-container {
width: 640px;
height: 360px;
background-color: #ddd;
margin-bottom: 20px;
border-radius: 8px;
overflow: hidden;
}
.video-container video {
width: 100%;
height: 100%;
object-fit: cover;
}
.mic-enabled {
background-color: #4caf50;
color: white;
}
.mic-disabled {
background-color: #f44336;
color: white;
}

View File

@@ -1,27 +0,0 @@
import './globals.css';
import { RTVIProvider } from '@/providers/RTVIProvider';
// Page metadata exported by the root layout: title, description and an SVG favicon.
export const metadata = {
  title: 'Pipecat React Client',
  description: 'Pipecat RTVI Client using Next.js',
  icons: {
    icon: [{ url: '/favicon.svg', type: 'image/svg+xml' }],
  },
};
export default function RootLayout({
children,
}: {
children: React.ReactNode;
}) {
return (
<html lang="en">
<head>
<link rel="icon" href="/favicon.svg" type="image/svg+xml" />
</head>
<body>
<RTVIProvider>{children}</RTVIProvider>
</body>
</html>
);
}

View File

@@ -1,41 +0,0 @@
'use client';
import {
RTVIClientAudio,
RTVIClientVideo,
useRTVIClientTransportState,
} from '@pipecat-ai/client-react';
import { ConnectButton } from '../components/ConnectButton';
import { StatusDisplay } from '../components/StatusDisplay';
import { DebugDisplay } from '../components/DebugDisplay';
/**
 * Bot video tile. The video element is mounted only while the transport
 * reports a state other than 'disconnected'; otherwise the container
 * stays empty.
 */
function BotVideo() {
  const state = useRTVIClientTransportState();
  const showVideo = state !== 'disconnected';
  const video = showVideo ? (
    <RTVIClientVideo participant="bot" fit="cover" />
  ) : null;

  return (
    <div className="bot-container">
      <div className="video-container">{video}</div>
    </div>
  );
}
/**
 * Main page: a status bar (transport status + connect button), the bot
 * video area, the debug log panel, and the RTVI audio element.
 */
export default function Home() {
return (
<div className="app">
<div className="status-bar">
<StatusDisplay />
<ConnectButton />
</div>
<div className="main-content">
<BotVideo />
</div>
<DebugDisplay />
<RTVIClientAudio />
</div>
);
}

View File

@@ -1,40 +0,0 @@
import {
useRTVIClient,
useRTVIClientTransportState,
} from '@pipecat-ai/client-react';
/**
 * Button that toggles the RTVI connection.
 *
 * Shows "Disconnect" while the transport is 'connected' or 'ready' and
 * "Connect" otherwise. It is disabled while there is no client or while
 * the transport is 'connecting' or 'disconnecting'.
 */
export function ConnectButton() {
  const client = useRTVIClient();
  const transportState = useRTVIClientTransportState();

  const isConnected =
    transportState === 'connected' || transportState === 'ready';
  const isTransitioning =
    transportState === 'connecting' || transportState === 'disconnecting';

  async function handleClick() {
    if (!client) {
      console.error('RTVI client is not initialized');
      return;
    }
    try {
      // Toggle: tear down an active session, otherwise start one.
      await (isConnected ? client.disconnect() : client.connect());
    } catch (error) {
      console.error('Connection error:', error);
    }
  }

  const buttonClass = isConnected ? 'disconnect-btn' : 'connect-btn';
  const buttonLabel = isConnected ? 'Disconnect' : 'Connect';

  return (
    <div className="controls">
      <button
        className={buttonClass}
        onClick={handleClick}
        disabled={!client || isTransitioning}>
        {buttonLabel}
      </button>
    </div>
  );
}

View File

@@ -1,26 +0,0 @@
.debug-panel {
background-color: #fff;
border-radius: 8px;
padding: 20px;
}
.debug-panel h3 {
margin: 0 0 10px 0;
font-size: 16px;
font-weight: bold;
}
.debug-log {
height: 200px;
overflow-y: auto;
background-color: #f8f8f8;
padding: 10px;
border-radius: 4px;
font-family: monospace;
font-size: 12px;
line-height: 1.4;
}
.debug-log div {
margin-bottom: 4px;
}

View File

@@ -1,171 +0,0 @@
import { useRef, useCallback } from 'react';
import {
Participant,
RTVIEvent,
TransportState,
TranscriptData,
BotLLMTextData,
} from '@pipecat-ai/client-js';
import { useRTVIClient, useRTVIClientEvent } from '@pipecat-ai/client-react';
import './DebugDisplay.css';
/**
 * Payload shape that the ServerMessage handler in DebugDisplay formats
 * into a "Smart Turn" log line.
 */
interface SmartTurnResultData {
type: 'smart_turn_result';
is_complete: boolean;
probability: number;
inference_time_ms: number; // Pure model inference time
server_total_time_ms: number; // Server processing time
e2e_processing_time_ms: number; // Complete end-to-end time
}
/**
 * Debug panel that appends one timestamped line to an on-page log for
 * each RTVI event it subscribes to: transport state changes, bot
 * connect/disconnect, track start/stop, bot ready, final user
 * transcripts, bot transcripts and smart-turn server messages.
 */
export function DebugDisplay() {
  const debugLogRef = useRef<HTMLDivElement>(null);
  const client = useRTVIClient();

  // Append one entry to the log element and keep it scrolled to the bottom.
  const log = useCallback((message: string) => {
    const container = debugLogRef.current;
    if (!container) return;

    const entry = document.createElement('div');
    entry.textContent = `${new Date().toISOString()} - ${message}`;

    // Add styling based on message type
    if (message.startsWith('User: ')) {
      entry.style.color = '#2196F3'; // blue for user
    } else if (message.startsWith('Bot: ')) {
      entry.style.color = '#4CAF50'; // green for bot
    } else if (message.includes('Smart Turn:')) {
      entry.style.color = '#9C27B0'; // purple for smart turn
    }

    container.appendChild(entry);
    container.scrollTop = container.scrollHeight;
  }, []);

  // Log transport state changes
  useRTVIClientEvent(
    RTVIEvent.TransportStateChanged,
    useCallback(
      (state: TransportState) => {
        log(`Transport state changed: ${state}`);
      },
      [log]
    )
  );

  // Log bot connection events
  useRTVIClientEvent(
    RTVIEvent.BotConnected,
    useCallback(
      (participant?: Participant) => {
        log(`Bot connected: ${JSON.stringify(participant)}`);
      },
      [log]
    )
  );
  useRTVIClientEvent(
    RTVIEvent.BotDisconnected,
    useCallback(
      (participant?: Participant) => {
        log(`Bot disconnected: ${JSON.stringify(participant)}`);
      },
      [log]
    )
  );

  // Log track events
  useRTVIClientEvent(
    RTVIEvent.TrackStarted,
    useCallback(
      (track: MediaStreamTrack, participant?: Participant) => {
        log(
          `Track started: ${track.kind} from ${participant?.name || 'unknown'}`
        );
      },
      [log]
    )
  );
  useRTVIClientEvent(
    RTVIEvent.TrackStopped,
    useCallback(
      (track: MediaStreamTrack, participant?: Participant) => {
        log(
          `Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
        );
      },
      [log]
    )
  );

  // Log bot ready state and report which local/bot tracks exist
  useRTVIClientEvent(
    RTVIEvent.BotReady,
    useCallback(() => {
      log(`Bot ready`);
      if (!client) return;
      const tracks = client.tracks();
      log(
        `Available tracks: ${JSON.stringify({
          local: {
            audio: !!tracks.local.audio,
            video: !!tracks.local.video,
          },
          bot: {
            audio: !!tracks.bot?.audio,
            video: !!tracks.bot?.video,
          },
        })}`
      );
    }, [client, log])
  );

  // Log transcripts
  useRTVIClientEvent(
    RTVIEvent.UserTranscript,
    useCallback(
      (data: TranscriptData) => {
        // Only log final transcripts
        if (data.final) {
          log(`User: ${data.text}`);
        }
      },
      [log]
    )
  );
  useRTVIClientEvent(
    RTVIEvent.BotTranscript,
    useCallback(
      (data: BotLLMTextData) => {
        log(`Bot: ${data.text}`);
      },
      [log]
    )
  );

  // Log smart-turn results delivered as server messages
  useRTVIClientEvent(
    RTVIEvent.ServerMessage,
    useCallback(
      (data: SmartTurnResultData) => {
        // Other server messages may arrive on this event; without this
        // check they would be printed as a bogus "Smart Turn" line with
        // a NaN probability.
        if (data?.type !== 'smart_turn_result') return;

        const formatMs = (value?: number) => value?.toFixed(2) || 'N/A';
        log(
          [
            'Smart Turn:',
            `${data.is_complete ? 'COMPLETE' : 'INCOMPLETE'},`,
            `Probability: ${(data.probability * 100).toFixed(1)}%,`,
            `Model inference: ${formatMs(data.inference_time_ms)}ms,`,
            `Server processing: ${formatMs(data.server_total_time_ms)}ms,`,
            `End-to-end: ${formatMs(data.e2e_processing_time_ms)}ms`,
          ].join('\n')
        );
      },
      [log]
    )
  );

  return (
    <div className="debug-panel">
      <h3>Debug Info</h3>
      <div ref={debugLogRef} className="debug-log" />
    </div>
  );
}

View File

@@ -1,11 +0,0 @@
import { useRTVIClientTransportState } from '@pipecat-ai/client-react';
/** Displays the current RTVI transport state as "Status: <state>". */
export function StatusDisplay() {
const transportState = useRTVIClientTransportState();
return (
<div className="status">
Status: <span>{transportState}</span>
</div>
);
}

View File

@@ -1,43 +0,0 @@
'use client';
import { RTVIClient } from '@pipecat-ai/client-js';
import { DailyTransport } from '@pipecat-ai/daily-transport';
import { RTVIClientProvider } from '@pipecat-ai/client-react';
import { PropsWithChildren, useEffect, useState } from 'react';
// Get the API base URL from environment variables
// Default to "/api" if not specified
// "/api" is the default for Next.js API routes and used
// for the Pipecat Cloud deployed agent
const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || '/api';
console.log('Using API base URL:', API_BASE_URL);
/**
 * Builds the RTVI client on a Daily transport. It is configured to call
 * `${API_BASE_URL}/connect`, with the microphone enabled and the camera
 * disabled.
 */
function createRTVIClient(): RTVIClient {
  return new RTVIClient({
    transport: new DailyTransport(),
    params: {
      baseUrl: API_BASE_URL,
      endpoints: {
        connect: '/connect',
      },
      requestData: { foo: 'bar' },
    },
    enableMic: true,
    enableCam: false,
  });
}

/**
 * Creates the RTVI client once after mount and provides it to the
 * subtree. Nothing is rendered until the client exists.
 */
export function RTVIProvider({ children }: PropsWithChildren) {
  const [client, setClient] = useState<RTVIClient | null>(null);

  useEffect(() => {
    setClient(createRTVIClient());
  }, []);

  return client ? (
    <RTVIClientProvider client={client}>{children}</RTVIClientProvider>
  ) : null;
}

View File

@@ -1,28 +0,0 @@
{
"compilerOptions": {
"target": "ES2017",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "bundler",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"plugins": [
{
"name": "next"
}
],
"paths": {
"@/components/*": ["./src/components/*"],
"@/providers/*": ["./src/providers/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"exclude": ["node_modules"]
}

View File

@@ -1,8 +0,0 @@
FROM dailyco/pipecat-base:latest
COPY ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
COPY ./assets assets
COPY ./bot.py bot.py

Binary file not shown.

Before

Width:  |  Height:  |  Size: 759 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 884 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 876 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 881 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 866 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 874 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 882 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 885 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 888 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 890 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 898 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 836 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 903 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 908 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 908 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 905 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 903 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 866 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 849 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 866 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 866 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 864 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 858 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 875 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 881 KiB

View File

@@ -1,299 +0,0 @@
#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from PIL import Image
from pipecatcloud.agent import DailySessionArguments
from pipecat.audio.turn.smart_turn.fal_smart_turn import FalSmartTurnAnalyzer
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
Frame,
MetricsFrame,
OutputImageRawFrame,
SpriteFrame,
)
from pipecat.metrics.metrics import SmartTurnMetricsData
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.frameworks.rtvi import (
RTVIConfig,
RTVIObserver,
RTVIProcessor,
RTVIServerMessageFrame,
)
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.google.llm import GoogleLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)

# Truthy when running in local development mode
LOCAL = os.getenv("LOCAL")

logger.remove()
logger.add(sys.stderr, level="DEBUG")

sprites = []
script_dir = os.path.dirname(__file__)

# Read the 25 sequential animation frames (robot01.png ... robot025.png)
# and wrap each image's raw bytes in an output frame.
for frame_number in range(1, 26):
    sprite_path = os.path.join(script_dir, f"assets/robot0{frame_number}.png")
    with Image.open(sprite_path) as sprite:
        sprites.append(
            OutputImageRawFrame(
                image=sprite.tobytes(), size=sprite.size, format=sprite.format
            )
        )

# Play the sequence forwards and then backwards so the loop is smooth.
sprites.extend(sprites[::-1])

# Static frame shown while the bot is listening
quiet_frame = sprites[0]
# Animation sequence shown while the bot is talking
talking_frame = SpriteFrame(images=sprites)
class TalkingAnimation(FrameProcessor):
    """Switch the bot's video between a still image and an animation.

    Pushes the talking animation when the bot starts speaking and the
    static frame when it stops. Every incoming frame is forwarded.
    """

    def __init__(self):
        super().__init__()
        self._is_talking = False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Update the animation state, then forward the frame.

        Args:
            frame: The incoming frame to process
            direction: The direction of frame flow in the pipeline
        """
        await super().process_frame(frame, direction)

        started = isinstance(frame, BotStartedSpeakingFrame)
        stopped = not started and isinstance(frame, BotStoppedSpeakingFrame)

        if started and not self._is_talking:
            # Bot just began speaking: start the animation once.
            await self.push_frame(talking_frame)
            self._is_talking = True
        if stopped:
            # Bot finished speaking: go back to the static frame.
            await self.push_frame(quiet_frame)
            self._is_talking = False

        await self.push_frame(frame, direction)
class SmartTurnMetricsProcessor(FrameProcessor):
    """Forward Smart Turn metrics to the client.

    For each SmartTurnMetricsData entry found in a MetricsFrame, logs it
    and pushes an RTVIServerMessageFrame carrying a "smart_turn_result"
    payload. Every incoming frame is forwarded afterwards.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Emit Smart Turn server messages, then forward the frame.

        Args:
            frame: The incoming frame to process
            direction: The direction of frame flow in the pipeline
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, MetricsFrame):
            smart_turn_entries = (
                entry for entry in frame.data if isinstance(entry, SmartTurnMetricsData)
            )
            for metrics in smart_turn_entries:
                logger.info(f"Smart Turn metrics: {metrics}")

                # Payload with the smart turn prediction data
                payload = {"type": "smart_turn_result"}
                for field in (
                    "is_complete",
                    "probability",
                    "inference_time_ms",
                    "server_total_time_ms",
                    "e2e_processing_time_ms",
                ):
                    payload[field] = getattr(metrics, field)

                # Send the data to the client via RTVI
                await self.push_frame(RTVIServerMessageFrame(data=payload))

        await self.push_frame(frame, direction)
async def main(transport: DailyTransport):
    """Assemble the voice pipeline on ``transport`` and run it to completion.

    Args:
        transport: The Daily transport providing pipeline input and output.
    """
    # STT, LLM and TTS services; API keys are read from the environment.
    # Swap out different processors or properties to customize your bot.
    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    # Initial conversation context. You can specify initial system and
    # assistant messages here.
    system_message = {
        "role": "system",
        "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
    }
    context = OpenAILLMContext([system_message])
    context_aggregator = llm.create_context_aggregator(context)

    talking_animation = TalkingAnimation()
    smart_turn_metrics = SmartTurnMetricsProcessor()

    # RTVI events for Pipecat client UI
    rtvi = RTVIProcessor(config=RTVIConfig(config=[]))

    # A core voice AI pipeline; add processors to customize the bot.
    processors = [
        transport.input(),
        rtvi,
        smart_turn_metrics,
        stt,
        context_aggregator.user(),
        llm,
        tts,
        talking_animation,
        transport.output(),
        context_aggregator.assistant(),
    ]
    task_params = PipelineParams(
        allow_interruptions=True,
        enable_metrics=True,
        enable_usage_metrics=True,
    )
    task = PipelineTask(
        Pipeline(processors),
        params=task_params,
        observers=[RTVIObserver(rtvi)],
    )

    @rtvi.event_handler("on_client_ready")
    async def on_client_ready(rtvi):
        logger.debug("Client ready event received")
        await rtvi.set_bot_ready()
        # Kick off the conversation
        opening_frame = context_aggregator.user().get_context_frame()
        await task.queue_frames([opening_frame])

    @transport.event_handler("on_first_participant_joined")
    async def on_first_participant_joined(transport, participant):
        logger.info("First participant joined: {}", participant["id"])
        # Show the static frame so the bot appears to be listening
        await task.queue_frame(quiet_frame)

    @transport.event_handler("on_participant_left")
    async def on_participant_left(transport, participant, reason):
        logger.info("Participant left: {}", participant)
        await task.cancel()

    runner = PipelineRunner(handle_sigint=False, force_gc=True)
    await runner.run(task)
async def bot(args: DailySessionArguments):
    """Run the bot for one session described by ``args``.

    Builds a Daily transport (Krisp noise filter, Silero VAD, Fal Smart
    Turn analyzer) for the session's room and runs ``main`` on it.

    Args:
        args: Session arguments; ``args.room_url`` and ``args.token`` are
            used to join the Daily room.

    Raises:
        Exception: Any error raised while running the pipeline is logged
            and re-raised.
    """
    from pipecat.audio.filters.krisp_filter import KrispFilter

    # The meeting token is a credential, so only the room URL is logged.
    logger.info(f"Bot process initialized {args.room_url}")

    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            args.room_url,
            args.token,
            "Smart Turn Bot",
            params=DailyParams(
                audio_in_enabled=True,
                audio_in_filter=KrispFilter(),
                audio_out_enabled=True,
                video_out_enabled=True,
                video_out_width=1024,
                video_out_height=576,
                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
                turn_analyzer=FalSmartTurnAnalyzer(
                    api_key=os.getenv("FAL_SMART_TURN_API_KEY"), aiohttp_session=session
                ),
            ),
        )

        try:
            await main(transport)
            logger.info("Bot process completed")
        except Exception as e:
            logger.exception(f"Error in bot process: {str(e)}")
            raise
# Local development
async def local_daily():
    """Run the bot on a Daily transport for local development.

    The room URL and token come from ``runner.configure``. Any exception
    is logged and not re-raised.
    """
    from runner import configure

    try:
        async with aiohttp.ClientSession() as http_session:
            room_url, token = await configure(http_session)

            daily_params = DailyParams(
                audio_in_enabled=True,
                audio_out_enabled=True,
                video_out_enabled=True,
                video_out_width=1024,
                video_out_height=576,
                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
                turn_analyzer=FalSmartTurnAnalyzer(
                    api_key=os.getenv("FAL_SMART_TURN_API_KEY"),
                    aiohttp_session=http_session,
                ),
            )
            transport = DailyTransport(
                room_url, token, "Smart Turn Bot", params=daily_params
            )

            await main(transport)
    except Exception as e:
        logger.exception(f"Error in local development mode: {e}")
# Local development entry point
# Run the local Daily session only when executed as a script with LOCAL set.
if __name__ == "__main__" and LOCAL:
    try:
        asyncio.run(local_daily())
    except Exception as e:
        logger.exception(f"Failed to run in local mode: {e}")

View File

@@ -1,19 +0,0 @@
#!/bin/bash
# Build the agent image for linux/arm64 and push both the versioned and
# the "latest" tag to the registry account named by DOCKER_USERNAME.
set -e

VERSION="0.1"
# Registry user/organization. Set it here or export DOCKER_USERNAME.
DOCKER_USERNAME="${DOCKER_USERNAME:-}"
AGENT_NAME="pcc-smart-turn"

# An empty username would produce the invalid tag "/pcc-smart-turn:0.1",
# so stop before building anything.
if [ -z "$DOCKER_USERNAME" ]; then
  echo "Error: DOCKER_USERNAME is not set. Edit this script or export DOCKER_USERNAME." >&2
  exit 1
fi

# Build the Docker image with the correct context
echo "Building Docker image..."
docker build --platform=linux/arm64 -t "$DOCKER_USERNAME/$AGENT_NAME:$VERSION" -t "$DOCKER_USERNAME/$AGENT_NAME:latest" .

# Push the Docker images
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:$VERSION..."
docker push "$DOCKER_USERNAME/$AGENT_NAME:$VERSION"
echo "Pushing Docker image $DOCKER_USERNAME/$AGENT_NAME:latest..."
docker push "$DOCKER_USERNAME/$AGENT_NAME:latest"

echo "Successfully built and pushed $DOCKER_USERNAME/$AGENT_NAME:$VERSION and $DOCKER_USERNAME/$AGENT_NAME:latest"

View File

@@ -1,5 +0,0 @@
GOOGLE_API_KEY=
CARTESIA_API_KEY=
DEEPGRAM_API_KEY=
DAILY_API_KEY=
FAL_SMART_TURN_API_KEY=

View File

@@ -1,7 +0,0 @@
agent_name = "pcc-smart-turn"
image = "your-username/pcc-smart-turn:0.1"
secret_set = "pcc-smart-turn-secrets"
enable_krisp = true
[scaling]
min_instances = 0

View File

@@ -1,3 +0,0 @@
pipecatcloud
pipecat-ai[google,daily,deepgram,cartesia,silero]
python-dotenv

View File

@@ -1,56 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import aiohttp
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper
async def configure(aiohttp_session: aiohttp.ClientSession):
    """Resolve the Daily room URL and create a meeting token for it.

    The room URL and API key come from the command line (``-u``/``-k``)
    or, failing that, from ``DAILY_SAMPLE_ROOM_URL`` / ``DAILY_API_KEY``.

    Args:
        aiohttp_session: HTTP session handed to the Daily REST helper.

    Returns:
        A ``(url, token)`` tuple.

    Raises:
        Exception: If no room URL or no API key is available.
    """
    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
    parser.add_argument(
        "-u", "--url", type=str, required=False, help="URL of the Daily room to join"
    )
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=False,
        help="Daily API Key (needed to create an owner token for the room)",
    )
    # Unrecognized arguments are tolerated and ignored.
    cli_args, _ = parser.parse_known_args()

    url = cli_args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
    if not url:
        raise Exception(
            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL."
        )

    key = cli_args.apikey or os.getenv("DAILY_API_KEY")
    if not key:
        raise Exception(
            "No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers."
        )

    rest_helper = DailyRESTHelper(
        daily_api_key=key,
        daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
        aiohttp_session=aiohttp_session,
    )

    # Token for the given room, expiring one hour (3600 s) from now.
    one_hour: float = 60 * 60
    token = await rest_helper.get_token(url, one_hour)

    return (url, token)

View File

@@ -1,228 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""RTVI Bot Server Implementation.
This FastAPI server manages RTVI bot instances and provides endpoints for both
direct browser access and RTVI client connections. It handles:
- Creating Daily rooms
- Managing bot processes
- Providing connection credentials
- Monitoring bot status
Requirements:
- Daily API key (set in .env file)
- Python 3.10+
- FastAPI
- Running bot implementation
"""
import argparse
import os
import subprocess
from contextlib import asynccontextmanager
from typing import Any, Dict
import aiohttp
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, RedirectResponse
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomParams
# Load environment variables from .env file
load_dotenv(override=True)
# Maximum number of bot instances allowed per room
MAX_BOTS_PER_ROOM = 1
# Dictionary to track bot processes: {pid: (process, room_url)}
bot_procs = {}
# Store Daily API helpers
daily_helpers = {}
def cleanup():
    """Terminate every tracked bot process and wait for each to exit.

    Called during server shutdown.
    """
    for proc, _room_url in bot_procs.values():
        proc.terminate()
        proc.wait()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan manager that handles startup and shutdown tasks.

    - Creates the aiohttp session
    - Initializes the Daily REST helper and stores it in ``daily_helpers``
    - Closes the session and terminates bot processes on shutdown

    Args:
        app: The FastAPI application (unused).
    """
    aiohttp_session = aiohttp.ClientSession()
    daily_helpers["rest"] = DailyRESTHelper(
        daily_api_key=os.getenv("DAILY_API_KEY", ""),
        daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
        aiohttp_session=aiohttp_session,
    )
    try:
        yield
    finally:
        # Runs even when the body exits through an exception or
        # cancellation, so the session and bot processes are not leaked.
        await aiohttp_session.close()
        cleanup()
# Initialize FastAPI app with lifespan manager
app = FastAPI(lifespan=lifespan)
# Configure CORS to allow requests from any origin
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
async def create_room_and_token() -> tuple[str, str]:
    """Create a Daily room and an access token for it.

    Returns:
        tuple[str, str]: A tuple containing (room_url, token)

    Raises:
        HTTPException: If room creation or token generation fails
    """
    rest = daily_helpers["rest"]

    room = await rest.create_room(DailyRoomParams())
    room_url = room.url
    if not room_url:
        raise HTTPException(status_code=500, detail="Failed to create room")

    token = await rest.get_token(room_url)
    if token:
        return room_url, token

    raise HTTPException(status_code=500, detail=f"Failed to get token for room: {room_url}")
@app.get("/")
async def start_agent(request: Request):
    """Endpoint for direct browser access to the bot.

    Creates a room, starts a bot instance, and redirects to the Daily room URL.

    Args:
        request: The incoming request (unused).

    Returns:
        RedirectResponse: Redirects to the Daily room URL

    Raises:
        HTTPException: If room creation, token generation, or bot startup fails
    """
    print("Creating room")
    room_url, token = await create_room_and_token()
    print(f"Room URL: {room_url}")

    # Check if there is already an existing process running in this room
    num_bots_in_room = sum(
        1 for proc in bot_procs.values() if proc[1] == room_url and proc[0].poll() is None
    )
    if num_bots_in_room >= MAX_BOTS_PER_ROOM:
        raise HTTPException(status_code=500, detail=f"Max bot limit reached for room: {room_url}")

    # Spawn a new bot process. The command is passed as an argument list
    # with no shell, so the URL and token are never interpreted as shell
    # syntax and the tracked PID is the bot itself, not a wrapper shell.
    try:
        proc = subprocess.Popen(
            ["python3", "bot.py", "-u", room_url, "-t", token],
            bufsize=1,
            cwd=os.path.dirname(os.path.abspath(__file__)),
        )
        bot_procs[proc.pid] = (proc, room_url)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")

    return RedirectResponse(room_url)
@app.post("/connect")
async def rtvi_connect(request: Request) -> Dict[Any, Any]:
    """RTVI connect endpoint that creates a room and returns connection credentials.

    This endpoint is called by RTVI clients to establish a connection.

    Args:
        request: The incoming request (unused).

    Returns:
        Dict[Any, Any]: Authentication bundle containing room_url and token

    Raises:
        HTTPException: If room creation, token generation, or bot startup fails
    """
    print("Creating room for RTVI connection")
    room_url, token = await create_room_and_token()
    print(f"Room URL: {room_url}")

    # Start the bot process. The command is passed as an argument list
    # with no shell, so the URL and token are never interpreted as shell
    # syntax and the tracked PID is the bot itself, not a wrapper shell.
    try:
        proc = subprocess.Popen(
            ["python3", "-m", "bot", "-u", room_url, "-t", token],
            bufsize=1,
            cwd=os.path.dirname(os.path.abspath(__file__)),
        )
        bot_procs[proc.pid] = (proc, room_url)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to start subprocess: {e}")

    # Return the authentication bundle in format expected by DailyTransport
    return {"room_url": room_url, "token": token}
@app.get("/status/{pid}")
def get_status(pid: int):
    """Report whether a tracked bot process is still running.

    Args:
        pid (int): Process ID of the bot

    Returns:
        JSONResponse: ``{"bot_id": pid, "status": "running" | "finished"}``

    Raises:
        HTTPException: If the specified bot process is not found
    """
    entry = bot_procs.get(pid)
    if not entry:
        raise HTTPException(status_code=404, detail=f"Bot with process id: {pid} not found")

    # poll() returns None while the subprocess has not exited.
    if entry[0].poll() is None:
        status = "running"
    else:
        status = "finished"

    return JSONResponse({"bot_id": pid, "status": status})
if __name__ == "__main__":
    import uvicorn

    # Host and port default to the HOST / FAST_API_PORT environment
    # variables and can be overridden on the command line.
    parser = argparse.ArgumentParser(description="Daily Storyteller FastAPI server")
    parser.add_argument(
        "--host", type=str, default=os.getenv("HOST", "0.0.0.0"), help="Host address"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=int(os.getenv("FAST_API_PORT", "7860")),
        help="Port number",
    )
    parser.add_argument("--reload", action="store_true", help="Reload code on change")
    config = parser.parse_args()

    # Start the FastAPI server
    uvicorn.run("server:app", host=config.host, port=config.port, reload=config.reload)

View File

@@ -4,55 +4,54 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.piper.tts import PiperTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_out_enabled=True,
),
)
# Create an HTTP session
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)
)
tts = PiperTTSService(
base_url=os.getenv("PIPER_BASE_URL"), aiohttp_session=session, sample_rate=24000
)
runner = PipelineRunner()
task = PipelineTask(Pipeline([tts, transport.output()]))
# Register an event handler so we can play the audio when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
runner = PipelineRunner(handle_sigint=False)
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await task.queue_frames(
[TTSSpeakFrame(f"Hello there, how are you today ?"), EndFrame()]
)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -1,60 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.rime.tts import RimeHttpTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
load_dotenv(override=True)
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
    """Speak a single greeting over a WebRTC connection using Rime TTS.

    Args:
        webrtc_connection: The connection the transport is built on.
        _: Parsed command-line arguments (unused).
    """
    logger.info("Starting bot")

    # Audio-output-only transport on top of the WebRTC connection
    transport_params = TransportParams(audio_out_enabled=True)
    transport = SmallWebRTCTransport(
        webrtc_connection=webrtc_connection,
        params=transport_params,
    )

    # The TTS service makes its requests through this HTTP session
    async with aiohttp.ClientSession() as http_session:
        tts = RimeHttpTTSService(
            api_key=os.getenv("RIME_API_KEY", ""),
            voice_id="rex",
            aiohttp_session=http_session,
        )

        pipeline = Pipeline([tts, transport.output()])
        task = PipelineTask(pipeline)

        # Queue the greeting, followed by an EndFrame, once a client connects
        @transport.event_handler("on_client_connected")
        async def on_client_connected(transport, client):
            greeting = TTSSpeakFrame("Hello there!")
            await task.queue_frames([greeting, EndFrame()])

        runner = PipelineRunner(handle_sigint=False)
        await runner.run(task)
# Script entry point: imports main from the `run` module and calls it.
if __name__ == "__main__":
from run import main
main()

View File

@@ -4,53 +4,56 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_out_enabled=True,
),
)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
transport = DailyTransport(
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)
)
task = PipelineTask(Pipeline([tts, transport.output()]))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# Register an event handler so we can play the audio when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
runner = PipelineRunner()
runner = PipelineRunner(handle_sigint=False)
task = PipelineTask(Pipeline([tts, transport.output()]))
await runner.run(task)
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
await task.queue_frames(
[TTSSpeakFrame(f"Hello there, {participant_name}!"), EndFrame()]
)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -4,50 +4,51 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.riva.tts import FastPitchTTSService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_out_enabled=True,
),
)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
tts = FastPitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
transport = DailyTransport(
room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)
)
task = PipelineTask(Pipeline([tts, transport.output()]))
tts = FastPitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))
# Register an event handler so we can play the audio when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
await task.queue_frames([TTSSpeakFrame(f"Hello there!"), EndFrame()])
runner = PipelineRunner()
runner = PipelineRunner(handle_sigint=False)
task = PipelineTask(Pipeline([tts, transport.output()]))
await runner.run(task)
# Register an event handler so we can play the audio when the
# participant joins.
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
participant_name = participant.get("info", {}).get("userName", "")
await task.queue_frames([TTSSpeakFrame(f"Aloha, {participant_name}!"), EndFrame()])
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -4,11 +4,14 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
@@ -16,51 +19,46 @@ from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
audio_out_enabled=True,
),
)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
transport = DailyTransport(
room_url, None, "Say One Thing From an LLM", DailyParams(audio_out_enabled=True)
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
messages = [
{
"role": "system",
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
}
]
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
task = PipelineTask(Pipeline([llm, tts, transport.output()]))
messages = [
{
"role": "system",
"content": "You are an LLM in a WebRTC session, and this is a 'hello world' demo. Say hello to the world.",
}
]
# Register an event handler so we can play the audio when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
await task.queue_frames([LLMMessagesFrame(messages), EndFrame()])
runner = PipelineRunner()
runner = PipelineRunner(handle_sigint=False)
task = PipelineTask(Pipeline([llm, tts, transport.output()]))
await runner.run(task)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await task.queue_frames([LLMMessagesFrame(messages), EndFrame()])
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -4,68 +4,59 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.fal.image import FalImageGenService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
video_out_enabled=True,
video_out_width=1024,
video_out_height=1024,
),
)
# Create an HTTP session
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
None,
"Show a still frame image",
DailyParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024),
)
imagegen = FalImageGenService(
params=FalImageGenService.InputParams(image_size="square_hd"),
aiohttp_session=session,
key=os.getenv("FAL_KEY"),
)
runner = PipelineRunner()
task = PipelineTask(Pipeline([imagegen, transport.output()]))
# Register an event handler so we can show the image when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await task.queue_frame(TextFrame("a cat in the style of picasso"))
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.cancel()
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -33,7 +33,9 @@ async def main():
transport = TkLocalTransport(
tk_root,
TkTransportParams(video_out_enabled=True, video_out_width=1024, video_out_height=1024),
TkTransportParams(
camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024
),
)
imagegen = FalImageGenService(

View File

@@ -4,68 +4,62 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.frames.frames import TextFrame
from pipecat.frames.frames import EndFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.google.image import GoogleImageGenService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
logger.info(f"Starting bot")
# Create a transport using the WebRTC connection
transport = SmallWebRTCTransport(
webrtc_connection=webrtc_connection,
params=TransportParams(
video_out_enabled=True,
video_out_width=1024,
video_out_height=1024,
),
)
async def main():
async with aiohttp.ClientSession() as session:
(room_url, _) = await configure(session)
imagegen = GoogleImageGenService(
api_key=os.getenv("GOOGLE_API_KEY"),
)
transport = DailyTransport(
room_url,
None,
"Show a still frame image",
DailyParams(camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024),
)
task = PipelineTask(
Pipeline([imagegen, transport.output()]),
params=PipelineParams(enable_metrics=True),
)
imagegen = GoogleImageGenService(
api_key=os.getenv("GOOGLE_API_KEY"),
)
# Register an event handler so we can show the images when the client joins
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
await task.queue_frame(TextFrame("a cat in the style of picasso"))
await task.queue_frame(TextFrame("a dog in the style of picasso"))
await task.queue_frame(TextFrame("a fish in the style of picasso"))
runner = PipelineRunner()
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
task = PipelineTask(
Pipeline([imagegen, transport.output()]),
params=PipelineParams(enable_metrics=True),
)
@transport.event_handler("on_client_closed")
async def on_client_closed(transport, client):
logger.info(f"Client closed connection")
await task.cancel()
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await task.queue_frame(TextFrame("a cat in the style of picasso"))
await task.queue_frame(TextFrame("a dog in the style of picasso"))
await task.queue_frame(TextFrame("a fish in the style of picasso"))
runner = PipelineRunner(handle_sigint=False)
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
await runner.run(task)
await runner.run(task)
if __name__ == "__main__":
from run import main
main()
asyncio.run(main())

View File

@@ -1,105 +0,0 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
# Load variables from a .env file (python-dotenv) before any service reads its
# API key; override=True lets the loaded values replace ones already set in
# the process environment.
load_dotenv(override=True)
async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
    """Run a voice bot (STT -> LLM -> TTS) over a single WebRTC connection.

    Builds a ``SmallWebRTCTransport`` with audio input, audio output and a
    Silero VAD analyzer, then wires Deepgram STT, an OpenAI LLM and Cartesia
    TTS into one pipeline together with the LLM context aggregators. The
    conversation is started when the transport fires ``on_client_connected``,
    and the task is cancelled when it fires ``on_client_closed``.

    Args:
        webrtc_connection: The WebRTC connection handed to the transport.
        _: Parsed command-line arguments; not used by this bot.
    """
    logger.info("Starting bot")

    transport = SmallWebRTCTransport(
        webrtc_connection=webrtc_connection,
        params=TransportParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
        ),
    )

    # Each service reads its API key from the environment.
    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
    )

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    # Initial system prompt. The list is shared with the context below and is
    # appended to again when a client connects.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = OpenAILLMContext(messages)
    context_aggregator = llm.create_context_aggregator(context)

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            allow_interruptions=True,
            enable_metrics=True,
            enable_usage_metrics=True,
            report_only_initial_ttfb=True,
        ),
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
        await task.queue_frames([context_aggregator.user().get_context_frame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        # Only logged; the task keeps running until "on_client_closed" fires.
        logger.info("Client disconnected")

    @transport.event_handler("on_client_closed")
    async def on_client_closed(transport, client):
        logger.info("Client closed connection")
        await task.cancel()

    runner = PipelineRunner(handle_sigint=False)

    await runner.run(task)
# Script entry point: hand control to ``main`` from the ``run`` module.
if __name__ == "__main__":
    # Imported here rather than at module top, so ``run`` is only required
    # when this file is executed directly.
    from run import main

    main()

Some files were not shown because too many files have changed in this diff Show More