Compare commits
180 Commits
v0.0.99
...
kompfner-p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c861beb066 | ||
|
|
8951442b8e | ||
|
|
7e6e3031e7 | ||
|
|
308829f92b | ||
|
|
82a799e63e | ||
|
|
6b5bcae86f | ||
|
|
836073849c | ||
|
|
b13b65d6e2 | ||
|
|
3d545b718d | ||
|
|
f2fa5d9733 | ||
|
|
76b774072c | ||
|
|
b6341ffaa5 | ||
|
|
29fae67c9e | ||
|
|
718ea1c15e | ||
|
|
8e09d94614 | ||
|
|
de73e28563 | ||
|
|
55250b4f7e | ||
|
|
281145a991 | ||
|
|
7bd32e2fe5 | ||
|
|
8f05d95f50 | ||
|
|
87c12f3098 | ||
|
|
9c0bf89247 | ||
|
|
6e44a2ab49 | ||
|
|
7aa7b86aed | ||
|
|
5ad9faeb4c | ||
|
|
9e8f8b45c6 | ||
|
|
0ee11ad333 | ||
|
|
124a3c35af | ||
|
|
054e504868 | ||
|
|
e85a00cc0e | ||
|
|
cc61cdbba3 | ||
|
|
62f4708d43 | ||
|
|
ba0ddb1832 | ||
|
|
eacd2a4b71 | ||
|
|
7ed110650d | ||
|
|
4a724379fc | ||
|
|
768d3958dd | ||
|
|
5f9ff8bd58 | ||
|
|
59ed422052 | ||
|
|
7e0ca113af | ||
|
|
13c52e0e6d | ||
|
|
a787fd9cd8 | ||
|
|
14495c425a | ||
|
|
461bd0a2e0 | ||
|
|
bd45ce2b4e | ||
|
|
a266644b06 | ||
|
|
03faadd7f9 | ||
|
|
bf43032652 | ||
|
|
fa6f924b31 | ||
|
|
a010a020fd | ||
|
|
655006aff5 | ||
|
|
671dc8cd9b | ||
|
|
9a718ded1e | ||
|
|
024809b39a | ||
|
|
6cf0d53d00 | ||
|
|
778dacc9a8 | ||
|
|
06b3ecd2d6 | ||
|
|
b4d143e39b | ||
|
|
c89083e72e | ||
|
|
1ac811ab32 | ||
|
|
f6359d460e | ||
|
|
f03a7175c7 | ||
|
|
aed44c863a | ||
|
|
cddd6d5b0a | ||
|
|
11cf891ac8 | ||
|
|
c89ae717fe | ||
|
|
562bdd3084 | ||
|
|
cc4c3650e1 | ||
|
|
dfc1f09b77 | ||
|
|
5fc46cc450 | ||
|
|
4a9eb82f92 | ||
|
|
990d8386e4 | ||
|
|
ce7d823770 | ||
|
|
0b93c3f900 | ||
|
|
829c5f4604 | ||
|
|
dc8ea615d9 | ||
|
|
a3d206050d | ||
|
|
f48a567873 | ||
|
|
e69ccd8ea7 | ||
|
|
11924bb980 | ||
|
|
af89154e96 | ||
|
|
1485ea0831 | ||
|
|
e22bc777d8 | ||
|
|
043403fe23 | ||
|
|
1e1160906e | ||
|
|
f7d3e63063 | ||
|
|
6fa797c8e4 | ||
|
|
473d39791b | ||
|
|
2114abb8c6 | ||
|
|
4fb4c26f55 | ||
|
|
2e8e574ea5 | ||
|
|
84c7e97be2 | ||
|
|
a6e7c99d55 | ||
|
|
ac3fa7f91f | ||
|
|
6eadad53b2 | ||
|
|
b11150f31f | ||
|
|
836cf60611 | ||
|
|
1c13ad95a5 | ||
|
|
1e8516e91d | ||
|
|
32c775311d | ||
|
|
28d0bb98de | ||
|
|
a9a9f3aeaa | ||
|
|
c2a0735975 | ||
|
|
41cb53f6c2 | ||
|
|
58552af8fd | ||
|
|
c7ab87b0cc | ||
|
|
11ecc5fdee | ||
|
|
19fb3eed9f | ||
|
|
b292b32374 | ||
|
|
63d1393bb0 | ||
|
|
37914cb062 | ||
|
|
ec40696854 | ||
|
|
2249f3d673 | ||
|
|
d2df324f29 | ||
|
|
67fdb0b659 | ||
|
|
e77bdf66f9 | ||
|
|
1b3b67779c | ||
|
|
6c7e386391 | ||
|
|
ba25b279d6 | ||
|
|
e7c83c19b6 | ||
|
|
7be7fb49a3 | ||
|
|
bcccb4cbb3 | ||
|
|
e9f1d951d3 | ||
|
|
e5632a9339 | ||
|
|
1510fb4fc0 | ||
|
|
64a1ad2649 | ||
|
|
4458ca1d24 | ||
|
|
21aaa48e62 | ||
|
|
e75c241030 | ||
|
|
60216048a8 | ||
|
|
f3c2e29fb4 | ||
|
|
ce99924be4 | ||
|
|
5de80a60d4 | ||
|
|
5753762350 | ||
|
|
885b318b04 | ||
|
|
7a22d58cf4 | ||
|
|
c8e4b462c9 | ||
|
|
30a3f42255 | ||
|
|
26ddb2de2f | ||
|
|
f60eeaa212 | ||
|
|
8cf72b36cb | ||
|
|
38c3bcef96 | ||
|
|
80604ba7b6 | ||
|
|
256c70c631 | ||
|
|
0e3532c529 | ||
|
|
9942fcfeb2 | ||
|
|
003c24ca6e | ||
|
|
ed120d014d | ||
|
|
e76a3d04f0 | ||
|
|
641d17007f | ||
|
|
9293b5f24a | ||
|
|
c1f3cbd1d4 | ||
|
|
78fa2ab65e | ||
|
|
56da2caeed | ||
|
|
a541d65255 | ||
|
|
a3d7e9eafe | ||
|
|
54933bea2a | ||
|
|
fcab9899cc | ||
|
|
be098e85db | ||
|
|
ed0ff46a87 | ||
|
|
7ae0d651d6 | ||
|
|
efd4432cfb | ||
|
|
24082b84f2 | ||
|
|
dcd5840341 | ||
|
|
9e705ce768 | ||
|
|
965466cc09 | ||
|
|
f3993f1775 | ||
|
|
e107902b14 | ||
|
|
e7b5ff49f4 | ||
|
|
e33172c44e | ||
|
|
3d858e8aa6 | ||
|
|
eab059c49a | ||
|
|
4aaff04fb3 | ||
|
|
cb364f3cab | ||
|
|
a9bfb090c3 | ||
|
|
c4ae4025f3 | ||
|
|
15067c678d | ||
|
|
5ae592f38e | ||
|
|
9cdbc56be3 | ||
|
|
1ceb01665f |
40
.claude/skills/changelog/SKILL.md
Normal file
40
.claude/skills/changelog/SKILL.md
Normal file
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: changelog
|
||||
description: Create changelog files for important commits in a PR
|
||||
---
|
||||
|
||||
Create changelog files for the important commits in this PR. The PR number is provided as an argument.
|
||||
|
||||
## Instructions
|
||||
|
||||
1. First, check what commits are on the current branch compared to main:
|
||||
```
|
||||
git log main..HEAD --oneline
|
||||
```
|
||||
|
||||
2. For each significant change, create a changelog file in the `changelog/` folder using the format:
|
||||
- `{PR_NUMBER}.added.md` - for new features
|
||||
- `{PR_NUMBER}.added.2.md`, `{PR_NUMBER}.added.3.md` - for additional new features
|
||||
- `{PR_NUMBER}.changed.md` - for changes to existing functionality
|
||||
- `{PR_NUMBER}.fixed.md` - for bug fixes
|
||||
- `{PR_NUMBER}.deprecated.md` - for deprecations
|
||||
|
||||
3. Each changelog file should at least contain a main single line starting with `- ` followed by a clear description of the change.
|
||||
|
||||
4. If the change is complicated, changelog files can have indented lines after the main line with additional details or code samples.
|
||||
|
||||
5. Use ⚠️ emoji prefix for breaking changes.
|
||||
|
||||
## Example
|
||||
|
||||
For PR #3519 with a new feature and a bug fix:
|
||||
|
||||
`changelog/3519.added.md`:
|
||||
```
|
||||
- Added `SomeNewFeature` for doing something useful.
|
||||
```
|
||||
|
||||
`changelog/3519.fixed.md`:
|
||||
```
|
||||
- Fixed an issue where something was not working correctly.
|
||||
```
|
||||
257
.claude/skills/docstring/SKILL.md
Normal file
257
.claude/skills/docstring/SKILL.md
Normal file
@@ -0,0 +1,257 @@
|
||||
---
|
||||
name: docstring
|
||||
description: Document a Python module and its classes using Google style
|
||||
---
|
||||
|
||||
Document a Python module and its classes using Google-style docstrings following project conventions. The class name is provided as an argument.
|
||||
|
||||
## Instructions
|
||||
|
||||
1. First, find the class in the codebase:
|
||||
```
|
||||
Search for "class ClassName" in src/pipecat/
|
||||
```
|
||||
|
||||
2. If multiple files contain that class name:
|
||||
- List all matches with their file paths
|
||||
- Ask the user which one they want to document
|
||||
- Wait for confirmation before proceeding
|
||||
|
||||
3. Once the file is identified, read the module to understand its structure:
|
||||
- Identify all classes, functions, and important type aliases
|
||||
- Understand the purpose of each component
|
||||
|
||||
4. Apply documentation in this order:
|
||||
- Module docstring (at top, after imports)
|
||||
- Class docstrings
|
||||
- `__init__` methods (always document constructor parameters)
|
||||
- Public methods (not starting with `_`)
|
||||
- Dataclass/config classes with field descriptions
|
||||
|
||||
5. Skip documentation for:
|
||||
- Private methods (starting with `_`)
|
||||
- Simple dunder methods (`__str__`, `__repr__`, `__post_init__`)
|
||||
- Very simple pass-through properties
|
||||
- **Already documented code** - If a class, method, or function already has a complete docstring that follows the project style, do not modify it. A docstring is complete if it has:
|
||||
- A one-line summary
|
||||
- Args section (if it has parameters)
|
||||
- Returns section (if it returns something meaningful)
|
||||
- Only add or improve documentation where it is missing or incomplete
|
||||
|
||||
## Module Docstring Format
|
||||
|
||||
```python
|
||||
"""[One-line description of module purpose].
|
||||
|
||||
[Optional: Longer explanation of functionality, key classes, or use cases.]
|
||||
"""
|
||||
```
|
||||
|
||||
Example:
|
||||
```python
|
||||
"""Neuphonic text-to-speech service implementations.
|
||||
|
||||
This module provides WebSocket and HTTP-based integrations with Neuphonic's
|
||||
text-to-speech API for real-time audio synthesis.
|
||||
"""
|
||||
```
|
||||
|
||||
## Class Docstring Format
|
||||
|
||||
```python
|
||||
class ClassName:
|
||||
"""One-line summary describing what the class does.
|
||||
|
||||
[Longer description explaining purpose, behavior, and key features.
|
||||
Use action-oriented language.]
|
||||
|
||||
[Optional: Event handlers, usage notes, or important caveats.]
|
||||
"""
|
||||
```
|
||||
|
||||
Example:
|
||||
```python
|
||||
class FrameProcessor(BaseObject):
|
||||
"""Base class for all frame processors in the pipeline.
|
||||
|
||||
Frame processors are the building blocks of Pipecat pipelines, they can be
|
||||
linked to form complex processing pipelines. They receive frames, process
|
||||
them, and pass them to the next or previous processor in the chain.
|
||||
|
||||
Event handlers available:
|
||||
|
||||
- on_before_process_frame: Called before a frame is processed
|
||||
- on_after_process_frame: Called after a frame is processed
|
||||
|
||||
Example::
|
||||
|
||||
@processor.event_handler("on_before_process_frame")
|
||||
async def on_before_process_frame(processor, frame):
|
||||
...
|
||||
|
||||
@processor.event_handler("on_after_process_frame")
|
||||
async def on_after_process_frame(processor, frame):
|
||||
...
|
||||
"""
|
||||
```
|
||||
|
||||
Note: When listing event handlers, do NOT use backticks. Include an `Example::` section (with double colon for Sphinx) showing the decorator pattern and function signature for each event.
|
||||
|
||||
## Constructor (`__init__`) Format
|
||||
|
||||
```python
|
||||
def __init__(self, *, param1: Type, param2: Type = default, **kwargs):
|
||||
"""Initialize the [ClassName].
|
||||
|
||||
Args:
|
||||
param1: Description of param1 and its purpose.
|
||||
param2: Description of param2. Defaults to [default].
|
||||
**kwargs: Additional arguments passed to parent class.
|
||||
"""
|
||||
```
|
||||
|
||||
Example:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: Optional[str] = None,
|
||||
sample_rate: Optional[int] = 22050,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Neuphonic TTS service.
|
||||
|
||||
Args:
|
||||
api_key: Neuphonic API key for authentication.
|
||||
voice_id: ID of the voice to use for synthesis.
|
||||
sample_rate: Audio sample rate in Hz. Defaults to 22050.
|
||||
**kwargs: Additional arguments passed to parent InterruptibleTTSService.
|
||||
"""
|
||||
```
|
||||
|
||||
## Method Docstring Format
|
||||
|
||||
```python
|
||||
async def method_name(self, param1: Type) -> ReturnType:
|
||||
"""One-line summary of what method does.
|
||||
|
||||
[Longer description if behavior isn't obvious.]
|
||||
|
||||
Args:
|
||||
param1: Description of param1.
|
||||
|
||||
Returns:
|
||||
Description of return value.
|
||||
|
||||
Raises:
|
||||
ExceptionType: When this exception is raised.
|
||||
"""
|
||||
```
|
||||
|
||||
Example:
|
||||
```python
|
||||
async def put(self, item: Tuple[Frame, FrameDirection, FrameCallback]):
|
||||
"""Put an item into the priority queue.
|
||||
|
||||
System frames (`SystemFrame`) have higher priority than any other
|
||||
frames. If a non-frame item is provided it will have the highest priority.
|
||||
|
||||
Args:
|
||||
item: The item to enqueue.
|
||||
"""
|
||||
```
|
||||
|
||||
## Dataclass/Config Format
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class ConfigName:
|
||||
"""One-line description of configuration.
|
||||
|
||||
[Explanation of when/how to use this config.]
|
||||
|
||||
Parameters:
|
||||
field1: Description of field1.
|
||||
field2: Description of field2. Defaults to [default].
|
||||
"""
|
||||
|
||||
field1: Type
|
||||
field2: Type = default_value
|
||||
```
|
||||
|
||||
Example:
|
||||
```python
|
||||
@dataclass
|
||||
class FrameProcessorSetup:
|
||||
"""Configuration parameters for frame processor initialization.
|
||||
|
||||
Parameters:
|
||||
clock: The clock instance for timing operations.
|
||||
task_manager: The task manager for handling async operations.
|
||||
observer: Optional observer for monitoring frame processing events.
|
||||
"""
|
||||
|
||||
clock: BaseClock
|
||||
task_manager: BaseTaskManager
|
||||
observer: Optional[BaseObserver] = None
|
||||
```
|
||||
|
||||
## Enum Documentation Format
|
||||
|
||||
```python
|
||||
class EnumName(Enum):
|
||||
"""One-line description of the enum purpose.
|
||||
|
||||
[Longer description of how the enum is used.]
|
||||
|
||||
Parameters:
|
||||
VALUE1: Description of VALUE1.
|
||||
VALUE2: Description of VALUE2.
|
||||
"""
|
||||
|
||||
VALUE1 = 1
|
||||
VALUE2 = 2
|
||||
```
|
||||
|
||||
## Writing Style Guidelines
|
||||
|
||||
- **Concise and professional** - No casual language or filler words
|
||||
- **Action-oriented** - Start with verbs: "Processes...", "Manages...", "Converts..."
|
||||
- **Purpose before implementation** - Explain WHY before HOW
|
||||
- **Clear parameter descriptions** - Include type hints, defaults, and purpose
|
||||
- **No redundant type info** - Type hints are in the signature, don't repeat in description
|
||||
- **Use backticks for code references** - Wrap class names, method names, event names, parameter names, and code snippets in backticks
|
||||
|
||||
Good: "Neuphonic API key for authentication."
|
||||
Bad: "str: The API key (string) that is used for authenticating with Neuphonic."
|
||||
|
||||
Good: "Triggers `on_speech_started` when the `VADAnalyzer` detects speech."
|
||||
Bad: "Triggers on_speech_started when the VADAnalyzer detects speech."
|
||||
|
||||
## Deprecation Notice Format
|
||||
|
||||
When documenting deprecated code:
|
||||
|
||||
```python
|
||||
"""[Description].
|
||||
|
||||
.. deprecated:: X.X.X
|
||||
`ClassName` is deprecated and will be removed in a future version.
|
||||
Use `NewClassName` instead.
|
||||
"""
|
||||
```
|
||||
|
||||
## Checklist
|
||||
|
||||
Before finishing, verify:
|
||||
|
||||
- [ ] Module has a docstring at the top (after copyright header and imports)
|
||||
- [ ] All public classes have docstrings
|
||||
- [ ] All `__init__` methods document their parameters
|
||||
- [ ] All public methods have docstrings with Args/Returns/Raises as needed
|
||||
- [ ] Dataclasses use "Parameters:" section for field descriptions
|
||||
- [ ] Enums document each value in "Parameters:" section
|
||||
- [ ] Writing is concise and action-oriented
|
||||
- [ ] No documentation added to private methods (starting with `_`)
|
||||
- [ ] Existing complete docstrings were left unchanged
|
||||
128
.claude/skills/pr-description/SKILL.md
Normal file
128
.claude/skills/pr-description/SKILL.md
Normal file
@@ -0,0 +1,128 @@
|
||||
---
|
||||
name: pr-description
|
||||
description: Update a GitHub PR description with a summary of changes
|
||||
---
|
||||
|
||||
Update a GitHub pull request description based on the changes in the PR.
|
||||
|
||||
## Arguments
|
||||
|
||||
```
|
||||
/pr-description <PR_NUMBER> [--fixes <ISSUE_NUMBERS>]
|
||||
```
|
||||
|
||||
- `PR_NUMBER` (required): The pull request number to update
|
||||
- `--fixes` (optional): Comma-separated issue numbers that this PR fixes (e.g., `--fixes 123,456`)
|
||||
|
||||
Examples:
|
||||
- `/pr-description 3534`
|
||||
- `/pr-description 3534 --fixes 123`
|
||||
- `/pr-description 3534 --fixes 123,456,789`
|
||||
|
||||
## Instructions
|
||||
|
||||
1. First, gather information about the PR:
|
||||
- Use GitHub plugin to get PR details (title, current description, base branch)
|
||||
- Use local git to get commits: `git log main..HEAD --oneline`
|
||||
- Use local git to get the diff: `git diff main..HEAD`
|
||||
- Parse any `--fixes` argument for issue numbers
|
||||
|
||||
2. Check the existing PR description:
|
||||
- If it already has a complete, accurate description that reflects the changes, do nothing
|
||||
- If it's missing sections, incomplete, or outdated compared to the actual changes, proceed to update
|
||||
- If it only has the template placeholder text, generate a full description
|
||||
|
||||
3. Analyze the changes:
|
||||
- Understand the purpose of each commit
|
||||
- Identify any breaking changes (API changes, removed features, behavior changes)
|
||||
- Look for new features, bug fixes, refactoring, or documentation changes
|
||||
- Collect issue numbers from:
|
||||
- The `--fixes` argument (if provided)
|
||||
- Commit messages (patterns like "Fixes #123", "Closes #456", "Resolves #789")
|
||||
|
||||
4. Generate or update the PR description with these sections:
|
||||
|
||||
## PR Description Format
|
||||
|
||||
### Summary (always include)
|
||||
|
||||
Brief bullet points describing what changed and why. Focus on the *purpose* and *impact*, not implementation details.
|
||||
|
||||
```markdown
|
||||
## Summary
|
||||
|
||||
- Added X to enable Y
|
||||
- Fixed bug where Z would happen
|
||||
- Refactored W for better maintainability
|
||||
```
|
||||
|
||||
### Breaking Changes (include only if applicable)
|
||||
|
||||
Document any changes that affect existing users or APIs.
|
||||
|
||||
```markdown
|
||||
## Breaking Changes
|
||||
|
||||
- `ClassName.method()` now requires a `param` argument
|
||||
- Removed deprecated `old_function()` - use `new_function()` instead
|
||||
```
|
||||
|
||||
### Testing (include when non-obvious)
|
||||
|
||||
How to verify the changes work. Skip for trivial changes.
|
||||
|
||||
```markdown
|
||||
## Testing
|
||||
|
||||
- Run `uv run pytest tests/test_feature.py` to verify the fix
|
||||
- Example usage: `uv run examples/new_feature.py`
|
||||
```
|
||||
|
||||
### Fixes (include if issues are provided or found in commits)
|
||||
|
||||
List issues this PR fixes. GitHub will automatically close these issues when the PR is merged.
|
||||
|
||||
```markdown
|
||||
## Fixes
|
||||
|
||||
- Fixes #123
|
||||
- Fixes #456
|
||||
```
|
||||
|
||||
Note: Use "Fixes #X" format (not "Closes" or "Resolves") for consistency. Each issue should be on its own line with "Fixes" to ensure GitHub auto-closes them.
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Be concise** - Reviewers should understand the PR in 30 seconds
|
||||
- **Focus on why** - The diff shows *what* changed, explain *why*
|
||||
- **Skip empty sections** - Only include sections that have content
|
||||
- **Use bullet points** - Easier to scan than paragraphs
|
||||
- **Don't duplicate the diff** - Avoid listing every file or line changed
|
||||
|
||||
## Example Output
|
||||
|
||||
```markdown
|
||||
## Summary
|
||||
|
||||
- Added `/docstring` skill for documenting Python modules with Google-style docstrings
|
||||
- Skill finds classes by name and handles conflicts when multiple matches exist
|
||||
- Skips already-documented code to avoid unnecessary changes
|
||||
|
||||
## Testing
|
||||
|
||||
/docstring ClassName
|
||||
|
||||
## Fixes
|
||||
|
||||
- Fixes #123
|
||||
```
|
||||
|
||||
## Checklist
|
||||
|
||||
Before updating the PR:
|
||||
|
||||
- [ ] Verified existing description needs updating (not already complete)
|
||||
- [ ] Summary accurately reflects the changes
|
||||
- [ ] Breaking changes are clearly documented (if any)
|
||||
- [ ] No unnecessary sections included
|
||||
- [ ] Description is concise and scannable
|
||||
2
.github/workflows/coverage.yaml
vendored
2
.github/workflows/coverage.yaml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra websocket
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra livekit --extra websocket
|
||||
|
||||
- name: Run tests with coverage
|
||||
run: |
|
||||
|
||||
2
.github/workflows/tests.yaml
vendored
2
.github/workflows/tests.yaml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra websocket
|
||||
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra livekit --extra websocket
|
||||
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
||||
16
.gitignore
vendored
16
.gitignore
vendored
@@ -4,7 +4,14 @@ __pycache__/
|
||||
*~
|
||||
venv
|
||||
.venv
|
||||
/.idea
|
||||
.idea
|
||||
.gradle
|
||||
.next
|
||||
next-env.d.ts
|
||||
local.properties
|
||||
*.log
|
||||
*.lock
|
||||
smart_turn_audio_log
|
||||
#*#
|
||||
|
||||
# Distribution / Packaging
|
||||
@@ -27,7 +34,7 @@ share/python-wheels/
|
||||
*.egg
|
||||
MANIFEST
|
||||
.DS_Store
|
||||
.env
|
||||
.env*
|
||||
fly.toml
|
||||
|
||||
# Examples
|
||||
@@ -51,4 +58,7 @@ docs/api/_build/
|
||||
docs/api/api
|
||||
|
||||
# uv
|
||||
.python-version
|
||||
.python-version
|
||||
|
||||
# Pipecat
|
||||
whisker_setup.py
|
||||
376
CHANGELOG.md
376
CHANGELOG.md
@@ -7,6 +7,129 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- towncrier release notes start -->
|
||||
|
||||
## [0.0.100] - 2026-01-20
|
||||
|
||||
### Added
|
||||
|
||||
- Added Hathora service to support Hathora-hosted TTS and STT models (only
|
||||
non-streaming)
|
||||
(PR [#3169](https://github.com/pipecat-ai/pipecat/pull/3169))
|
||||
|
||||
- Added `CambTTSService`, using Camb.ai's TTS integration with MARS models
|
||||
(mars-flash, mars-pro, mars-instruct) for high-quality text-to-speech
|
||||
synthesis.
|
||||
(PR [#3349](https://github.com/pipecat-ai/pipecat/pull/3349))
|
||||
|
||||
- Added the `additional_headers` param to `WebsocketClientParams`, allowing
|
||||
`WebsocketClientTransport` to send custom headers on connect, for cases such
|
||||
as authentication.
|
||||
(PR [#3461](https://github.com/pipecat-ai/pipecat/pull/3461))
|
||||
|
||||
- Added `UserIdleController` for detecting user idle state, integrated into
|
||||
`LLMUserAggregator` and `UserTurnProcessor` via optional `user_idle_timeout`
|
||||
parameter. Emits `on_user_turn_idle` event for application-level handling.
|
||||
Deprecated `UserIdleProcessor` in favor of the new compositional approach.
|
||||
(PR [#3482](https://github.com/pipecat-ai/pipecat/pull/3482))
|
||||
|
||||
- Added `on_user_mute_started` and `on_user_mute_stopped` event handlers to
|
||||
`LLMUserAggregator` for tracking user mute state changes.
|
||||
(PR [#3490](https://github.com/pipecat-ai/pipecat/pull/3490))
|
||||
|
||||
### Changed
|
||||
|
||||
- Enhanced interruption handling in `AsyncAITTSService` by supporting
|
||||
multi-context WebSocket sessions for more robust context management.
|
||||
(PR [#3287](https://github.com/pipecat-ai/pipecat/pull/3287))
|
||||
|
||||
- Throttle `UserSpeakingFrame` to broadcast at most every 200ms instead of on
|
||||
every audio chunk, reducing frame processing overhead during user speech.
|
||||
(PR [#3483](https://github.com/pipecat-ai/pipecat/pull/3483))
|
||||
|
||||
### Deprecated
|
||||
|
||||
- For consistency with other package names, we just deprecated
|
||||
`pipecat.turns.mute` (introduced in Pipecat 0.0.99) in favor of
|
||||
`pipecat.turns.user_mute`.
|
||||
(PR [#3479](https://github.com/pipecat-ai/pipecat/pull/3479))
|
||||
|
||||
### Fixed
|
||||
|
||||
- Corrected TTFB metric calculation in `AsyncAIHttpTTSService`.
|
||||
(PR [#3287](https://github.com/pipecat-ai/pipecat/pull/3287))
|
||||
|
||||
- Fixed an issue where the "bot-llm-text" RTVI event would not fire for
|
||||
realtime (speech-to-speech) services:
|
||||
|
||||
- `AWSNovaSonicLLMService`
|
||||
- `GeminiLiveLLMService`
|
||||
- `OpenAIRealtimeLLMService`
|
||||
- `GrokRealtimeLLMService`
|
||||
|
||||
The issue was that these services weren't pushing `LLMTextFrame`s. Now
|
||||
they do.
|
||||
(PR [#3446](https://github.com/pipecat-ai/pipecat/pull/3446))
|
||||
|
||||
- Fixed an issue where `on_user_turn_stop_timeout` could fire while a user is
|
||||
talking when using `ExternalUserTurnStrategies`.
|
||||
(PR [#3454](https://github.com/pipecat-ai/pipecat/pull/3454))
|
||||
|
||||
- Fixed an issue where user turn start strategies were not being reset after a
|
||||
user turn started, causing incorrect strategy behavior.
|
||||
(PR [#3455](https://github.com/pipecat-ai/pipecat/pull/3455))
|
||||
|
||||
- Fixed `MinWordsUserTurnStartStrategy` to not aggregate transcriptions,
|
||||
preventing incorrect turn starts when words are spoken with pauses between
|
||||
them.
|
||||
(PR [#3462](https://github.com/pipecat-ai/pipecat/pull/3462))
|
||||
|
||||
- Fixed an issue where Grok Realtime would error out when running with
|
||||
SmallWebRTC transport.
|
||||
(PR [#3480](https://github.com/pipecat-ai/pipecat/pull/3480))
|
||||
|
||||
- Fixed a `Mem0MemoryService` issue where passing `async_mode: true` was
|
||||
causing an error. See
|
||||
https://docs.mem0.ai/platform/features/async-mode-default-change.
|
||||
(PR [#3484](https://github.com/pipecat-ai/pipecat/pull/3484))
|
||||
|
||||
- Fixed `AWSNovaSonicLLMService.reset_conversation()`, which would previously
|
||||
error out. Now it successfully reconnects and "rehydrates" from the context
|
||||
object.
|
||||
(PR [#3486](https://github.com/pipecat-ai/pipecat/pull/3486))
|
||||
|
||||
- Fixed `AzureTTSService` transcript formatting issues:
|
||||
- Punctuation now appears without extra spaces (e.g., "Hello!" instead of
|
||||
"Hello !")
|
||||
- CJK languages (Chinese, Japanese, Korean) no longer have unwanted spaces
|
||||
between characters
|
||||
(PR [#3489](https://github.com/pipecat-ai/pipecat/pull/3489))
|
||||
|
||||
- Fixed an issue where `UninterruptibleFrame` frames would not be preserved in
|
||||
some cases.
|
||||
(PR [#3494](https://github.com/pipecat-ai/pipecat/pull/3494))
|
||||
|
||||
- Fixed memory leak in `LiveKitTransport` when `video_in_enabled` is `False`.
|
||||
(PR [#3499](https://github.com/pipecat-ai/pipecat/pull/3499))
|
||||
|
||||
- Fixed an issue in `AIService` where unhandled exceptions in `start()`,
|
||||
`stop()`, or `cancel()` implementations would prevent `process_frame()` to
|
||||
continue and therefore `StartFrame`, `EndFrame`, or `CancelFrame` from being
|
||||
pushed downstream, causing the pipeline to not start or stop properly.
|
||||
(PR [#3503](https://github.com/pipecat-ai/pipecat/pull/3503))
|
||||
|
||||
- Moved `NVIDIATTSService` and `NVIDIASTTService` client initialization from
|
||||
constructor to `start()` for better error handling.
|
||||
(PR [#3504](https://github.com/pipecat-ai/pipecat/pull/3504))
|
||||
|
||||
- Optimized `NVIDIATTSService` to process incoming audio frames immediately.
|
||||
(PR [#3509](https://github.com/pipecat-ai/pipecat/pull/3509))
|
||||
|
||||
- Optimized `NVIDIASTTService` by removing unnecessary queue and task.
|
||||
(PR [#3509](https://github.com/pipecat-ai/pipecat/pull/3509))
|
||||
|
||||
- Fixed a `CambTTSService` issue where client was being initialized in the
|
||||
constructor which wouldn't allow for proper Pipeline error handling.
|
||||
(PR [#3511](https://github.com/pipecat-ai/pipecat/pull/3511))
|
||||
|
||||
## [0.0.99] - 2026-01-13
|
||||
|
||||
### Added
|
||||
@@ -24,39 +147,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
A list of strategies can be specified for both strategies; strategies are
|
||||
evaluated in order until one evaluates to true.
|
||||
|
||||
Available user turn start strategies:
|
||||
- VADUserTurnStartStrategy
|
||||
- TranscriptionUserTurnStartStrategy
|
||||
- MinWordsUserTurnStartStrategy
|
||||
- ExternalUserTurnStartStrategy
|
||||
Available user turn start strategies:
|
||||
|
||||
Available user turn stop strategies:
|
||||
- TranscriptionUserTurnStopStrategy
|
||||
- TurnAnalyzerUserTurnStopStrategy
|
||||
- ExternalUserTurnStopStrategy
|
||||
- VADUserTurnStartStrategy
|
||||
- TranscriptionUserTurnStartStrategy
|
||||
- MinWordsUserTurnStartStrategy
|
||||
- ExternalUserTurnStartStrategy
|
||||
|
||||
The default strategies are:
|
||||
Available user turn stop strategies:
|
||||
|
||||
- start: [VADUserTurnStartStrategy, TranscriptionUserTurnStartStrategy]
|
||||
- stop: [TranscriptionUserTurnStopStrategy]
|
||||
- TranscriptionUserTurnStopStrategy
|
||||
- TurnAnalyzerUserTurnStopStrategy
|
||||
- ExternalUserTurnStopStrategy
|
||||
|
||||
urn strategies are configured when setting up `LLMContextAggregatorPair`.
|
||||
The default strategies are:
|
||||
|
||||
- start: [VADUserTurnStartStrategy, TranscriptionUserTurnStartStrategy]
|
||||
- stop: [TranscriptionUserTurnStopStrategy]
|
||||
|
||||
Turn strategies are configured when setting up `LLMContextAggregatorPair`.
|
||||
For example:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
)
|
||||
],
|
||||
)
|
||||
),
|
||||
)
|
||||
```
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
)
|
||||
],
|
||||
)
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
In order to use the user turn strategies you must update to the new
|
||||
universal `LLMContext` and `LLMContextAggregatorPair`.
|
||||
@@ -69,13 +193,13 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
- Added `GrokRealtimeLLMService` for xAI's Grok Voice Agent API with real-time
|
||||
voice conversations:
|
||||
|
||||
- Support for real-time audio streaming with WebSocket connection
|
||||
- Built-in server-side VAD (Voice Activity Detection)
|
||||
- Multiple voice options: Ara, Rex, Sal, Eve, Leo
|
||||
- Built-in tools support: web_search, x_search, file_search
|
||||
- Custom function calling with standard Pipecat tools schema
|
||||
- Configurable audio formats (PCM at 8kHz-48kHz)
|
||||
(PR [#3267](https://github.com/pipecat-ai/pipecat/pull/3267))
|
||||
- Support for real-time audio streaming with WebSocket connection
|
||||
- Built-in server-side VAD (Voice Activity Detection)
|
||||
- Multiple voice options: Ara, Rex, Sal, Eve, Leo
|
||||
- Built-in tools support: web_search, x_search, file_search
|
||||
- Custom function calling with standard Pipecat tools schema
|
||||
- Configurable audio formats (PCM at 8kHz-48kHz)
|
||||
(PR [#3267](https://github.com/pipecat-ai/pipecat/pull/3267))
|
||||
|
||||
- Added an approximation of TTFB for Ultravox.
|
||||
(PR [#3268](https://github.com/pipecat-ai/pipecat/pull/3268))
|
||||
@@ -86,11 +210,12 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
(PR [#3289](https://github.com/pipecat-ai/pipecat/pull/3289))
|
||||
|
||||
- `LLMUserAggregator` now exposes the following events:
|
||||
- `on_user_turn_started`: triggered when a user turn starts
|
||||
- `on_user_turn_stopped`: triggered when a user turn ends
|
||||
- `on_user_turn_stop_timeout`: triggered when a user turn does not stop
|
||||
and times out
|
||||
(PR [#3291](https://github.com/pipecat-ai/pipecat/pull/3291))
|
||||
|
||||
- `on_user_turn_started`: triggered when a user turn starts
|
||||
- `on_user_turn_stopped`: triggered when a user turn ends
|
||||
- `on_user_turn_stop_timeout`: triggered when a user turn does not stop
|
||||
and times out
|
||||
(PR [#3291](https://github.com/pipecat-ai/pipecat/pull/3291))
|
||||
|
||||
- Introducing user mute strategies. User mute strategies indicate when user
|
||||
input should be muted based on the current system state.
|
||||
@@ -104,12 +229,12 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
frame is muted if any of the configured strategies indicates it should be
|
||||
muted.
|
||||
|
||||
Available user mute strategies:
|
||||
Available user mute strategies:
|
||||
|
||||
* `FirstSpeechUserMuteStrategy`
|
||||
* `MuteUntilFirstBotCompleteUserMuteStrategy`
|
||||
* `AlwaysUserMuteStrategy`
|
||||
* `FunctionCallUserMuteStrategy`
|
||||
- `FirstSpeechUserMuteStrategy`
|
||||
- `MuteUntilFirstBotCompleteUserMuteStrategy`
|
||||
- `AlwaysUserMuteStrategy`
|
||||
- `FunctionCallUserMuteStrategy`
|
||||
|
||||
User mute strategies replace the legacy `STTMuteFilter` and provide a more
|
||||
flexible and composable approach to muting user input.
|
||||
@@ -117,16 +242,16 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
User mute strategies are configured when setting up the
|
||||
`LLMContextAggregatorPair`. For example:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_mute_strategies=[
|
||||
FirstSpeechUserMuteStrategy(),
|
||||
]
|
||||
),
|
||||
)
|
||||
```
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_mute_strategies=[
|
||||
FirstSpeechUserMuteStrategy(),
|
||||
]
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
In order to use user mute strategies you should update to the new universal
|
||||
`LLMContext` and `LLMContextAggregatorPair`.
|
||||
@@ -159,16 +284,17 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
(PR [#3357](https://github.com/pipecat-ai/pipecat/pull/3357))
|
||||
|
||||
- Added image support to `OpenAIRealtimeLLMService` via `InputImageRawFrame`:
|
||||
- New `start_video_paused` parameter to control initial video input state
|
||||
- New `video_frame_detail` parameter to set image processing quality
|
||||
("auto",
|
||||
"low", or "high"). This corresponds to OpenAI Realtime's `image_detail`
|
||||
parameter.
|
||||
- `set_video_input_paused()` method to pause/resume video input at runtime
|
||||
- `set_video_frame_detail()` method to adjust video frame quality
|
||||
dynamically
|
||||
- Automatic rate limiting (1 frame per second) to prevent API overload
|
||||
(PR [#3360](https://github.com/pipecat-ai/pipecat/pull/3360))
|
||||
|
||||
- New `start_video_paused` parameter to control initial video input state
|
||||
- New `video_frame_detail` parameter to set image processing quality
|
||||
("auto",
|
||||
"low", or "high"). This corresponds to OpenAI Realtime's `image_detail`
|
||||
parameter.
|
||||
- `set_video_input_paused()` method to pause/resume video input at runtime
|
||||
- `set_video_frame_detail()` method to adjust video frame quality
|
||||
dynamically
|
||||
- Automatic rate limiting (1 frame per second) to prevent API overload
|
||||
(PR [#3360](https://github.com/pipecat-ai/pipecat/pull/3360))
|
||||
|
||||
- Added `UserTurnProcessor`, a frame processor built on `UserTurnController`
|
||||
that pushes `UserStartedSpeakingFrame` and `UserStoppedSpeakingFrame` frames
|
||||
@@ -188,11 +314,12 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
(PR [#3374](https://github.com/pipecat-ai/pipecat/pull/3374))
|
||||
|
||||
- `LLMAssistantAggregator` now exposes the following events:
|
||||
- `on_assistant_turn_started`: triggered when the assistant turn starts
|
||||
- `on_assistant_turn_stopped`: triggered when the assistant turn ends
|
||||
- `on_assistant_thought`: triggered when there's an assistant thought
|
||||
available
|
||||
(PR [#3385](https://github.com/pipecat-ai/pipecat/pull/3385))
|
||||
|
||||
- `on_assistant_turn_started`: triggered when the assistant turn starts
|
||||
- `on_assistant_turn_stopped`: triggered when the assistant turn ends
|
||||
- `on_assistant_thought`: triggered when there's an assistant thought
|
||||
available
|
||||
(PR [#3385](https://github.com/pipecat-ai/pipecat/pull/3385))
|
||||
|
||||
- Added `KrispVivaTurn` analyzer for end of turn detection using the Krisp VIVA
|
||||
SDK (requires `krisp_audio`).
|
||||
@@ -202,13 +329,14 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
register custom pipeline task setup files by setting the
|
||||
`PIPECAT_SETUP_FILES` environment variable. This variable should contain a
|
||||
colon-separated list of Python files (e.g. `export
|
||||
PIPECAT_SETUP_FILES="setup1.py:setup.py:..."`). Each file must define a
|
||||
PIPECAT_SETUP_FILES="setup1.py:setup.py:..."`). Each file must define a
|
||||
function with the following signature:
|
||||
|
||||
```python
|
||||
async def setup_pipeline_task(task: PipelineTask):
|
||||
...
|
||||
```
|
||||
```python
|
||||
async def setup_pipeline_task(task: PipelineTask):
|
||||
...
|
||||
```
|
||||
|
||||
(PR [#3397](https://github.com/pipecat-ai/pipecat/pull/3397))
|
||||
|
||||
- Added a keepalive task for `InworldTTSService` to keep the service connected
|
||||
@@ -238,12 +366,14 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
|
||||
- Updated `ElevenLabsRealtimeSTTService` to accept the
|
||||
`include_language_detection` parameter to detect language.
|
||||
```python
|
||||
stt = ElevenLabsRealtimeSTTService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
include_language_detection=True
|
||||
)
|
||||
```
|
||||
|
||||
```python
|
||||
stt = ElevenLabsRealtimeSTTService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
include_language_detection=True
|
||||
)
|
||||
```
|
||||
|
||||
(PR [#3216](https://github.com/pipecat-ai/pipecat/pull/3216))
|
||||
|
||||
- Updated `SpeechmaticsSTTService` to use new Python Voice SDK with improved
|
||||
@@ -251,16 +381,18 @@ turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams())
|
||||
without any impact on accuracy. Use the `turn_detection_mode` parameter to control
|
||||
the endpointing of speech, with `TurnDetectionMode.EXTERNAL` (default),
|
||||
`TurnDetectionMode.ADAPTIVE`, or `TurnDetectionMode.SMART_TURN`.
|
||||
```python
|
||||
|
||||
```python
|
||||
stt = SpeechmaticsSTTService(
|
||||
api_key=os.getenv("SPEECHMATICS_API_KEY"),
|
||||
params=SpeechmaticsSTTService.InputParams(
|
||||
language=Language.EN,
|
||||
turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
speaker_active_format="<{speaker_id}>{text}</{speaker_id}>",
|
||||
),
|
||||
)
|
||||
```
|
||||
```
|
||||
|
||||
(PR [#3225](https://github.com/pipecat-ai/pipecat/pull/3225))
|
||||
|
||||
- `daily-python` updated to 0.23.0.
|
||||
@@ -273,10 +405,15 @@ turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
|
||||
- Updates to Inworld TTS services:
|
||||
|
||||
- Improved `InworldTTSService`'s websocket implementation to better flush
|
||||
and close context to better handle long inputs.
|
||||
- Improved docstrings for `InworldTTSService` and `InworldHttpTTSService`.
|
||||
(PR [#3288](https://github.com/pipecat-ai/pipecat/pull/3288))
|
||||
- Improved `InworldTTSService`'s websocket implementation to better flush
|
||||
and close context to better handle long inputs.
|
||||
- Improved docstrings for `InworldTTSService` and `InworldHttpTTSService`.
|
||||
(PR [#3288](https://github.com/pipecat-ai/pipecat/pull/3288))
|
||||
|
||||
- Improved the error handling and reconnection logic for `WebsocketServer` by
|
||||
distinguishing between errors when disconnecting and websocket communication
|
||||
errors.
|
||||
(PR [#3392](https://github.com/pipecat-ai/pipecat/pull/3392))
|
||||
|
||||
- Updated `DeepgramSTTService` to push user started/stopped speaking and
|
||||
interruption frames when `vad_enabled` is set to true. This centralizes the
|
||||
@@ -308,7 +445,8 @@ turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
- Smart Turn now takes into account `vad_start_seconds` when buffering audio,
|
||||
meaning that the start of the turn audio is not cut off. This improves
|
||||
accuracy for short utterances.
|
||||
- The default value of `pre_speech_ms` is now set to 500ms for Smart Turn.
|
||||
|
||||
- The default value of `pre_speech_ms` is now set to 500ms for Smart Turn.
|
||||
(PR [#3377](https://github.com/pipecat-ai/pipecat/pull/3377))
|
||||
|
||||
- Improved Krisp SDK management to allow `KrispVivaTurn` and `KrispVivaFilter`
|
||||
@@ -376,17 +514,18 @@ turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
From the developer's point of view, switching to using `LLMContext`
|
||||
machinery will usually be a matter of going from this:
|
||||
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
```python
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
```
|
||||
|
||||
To this:
|
||||
To this:
|
||||
|
||||
```
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
|
||||
```
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
```
|
||||
(PR [#3263](https://github.com/pipecat-ai/pipecat/pull/3263))
|
||||
|
||||
- `STTMuteFilter` is deprecated and will be removed in a future version. Use
|
||||
@@ -401,16 +540,17 @@ turn_detection_mode=SpeechmaticsSTTService.TurnDetectionMode.ADAPTIVE,
|
||||
`LLMUserAggregator`'s new parameter `user_turn_strategies` instead. For
|
||||
example, to disable interruptions but still get user turns you can do:
|
||||
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
),
|
||||
),
|
||||
)
|
||||
```
|
||||
```python
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
),
|
||||
),
|
||||
)
|
||||
```
|
||||
|
||||
(PR [#3297](https://github.com/pipecat-ai/pipecat/pull/3297))
|
||||
|
||||
- `TranscriptProcessor` and related data classes and frames
|
||||
@@ -433,7 +573,9 @@ start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
### Fixed
|
||||
|
||||
- Improved error handling in `ElevenLabsRealtimeSTTService`
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` causing an infinite loop
|
||||
(PR [#3233](https://github.com/pipecat-ai/pipecat/pull/3233))
|
||||
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` causing an infinite loop
|
||||
that blocks the process if the websocket disconnects due to an error
|
||||
(PR [#3233](https://github.com/pipecat-ai/pipecat/pull/3233))
|
||||
|
||||
@@ -446,13 +588,14 @@ start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
(PR [#3322](https://github.com/pipecat-ai/pipecat/pull/3322))
|
||||
|
||||
- Updated `SpeechmaticsSTTService` for version `0.0.99+`:
|
||||
- Fixed `SpeechmaticsSTTService` to listen for `VADUserStoppedSpeakingFrame`
|
||||
in order to finalize transcription.
|
||||
- Default to `TurnDetectionMode.FIXED` for Pipecat-controlled end of turn
|
||||
detection.
|
||||
- Only emit VAD + interruption frames if VAD is enabled within the plugin
|
||||
(modes other than `TurnDetectionMode.FIXED` or `TurnDetectionMode.EXTERNAL`).
|
||||
(PR [#3328](https://github.com/pipecat-ai/pipecat/pull/3328))
|
||||
|
||||
- Fixed `SpeechmaticsSTTService` to listen for `VADUserStoppedSpeakingFrame`
|
||||
in order to finalize transcription.
|
||||
- Default to `TurnDetectionMode.FIXED` for Pipecat-controlled end of turn
|
||||
detection.
|
||||
- Only emit VAD + interruption frames if VAD is enabled within the plugin
|
||||
(modes other than `TurnDetectionMode.FIXED` or `TurnDetectionMode.EXTERNAL`).
|
||||
(PR [#3328](https://github.com/pipecat-ai/pipecat/pull/3328))
|
||||
|
||||
- Fixed an issue with function calling where a handler failing to invoke its
|
||||
result callback could leave the context stuck in IN_PROGRESS, causing LLM
|
||||
@@ -481,6 +624,9 @@ start=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
|
||||
guard was set.
|
||||
(PR [#3400](https://github.com/pipecat-ai/pipecat/pull/3400))
|
||||
|
||||
- Fixed parallel function calling when using Gemini thinking.
|
||||
(PR [3420](https://github.com/pipecat-ai/pipecat/pull/3420))
|
||||
|
||||
- Fixed an issue in `traced_llm` where `model_name` in OpenTelemetry appears as
|
||||
`unknown`.
|
||||
(PR [#3422](https://github.com/pipecat-ai/pipecat/pull/3422))
|
||||
|
||||
@@ -73,15 +73,15 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [Hathora](https://docs.pipecat.ai/server/services/stt/hathora), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hathora](https://docs.pipecat.ai/server/services/tts/hathora), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
|
||||
|
||||
1
changelog/3510.added.2.md
Normal file
1
changelog/3510.added.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `add_reached_upstream_filter()` and `add_reached_downstream_filter()` methods to `PipelineTask` for appending frame types.
|
||||
1
changelog/3510.added.md
Normal file
1
changelog/3510.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `reached_upstream_types` and `reached_downstream_types` read-only properties to `PipelineTask` for inspecting current frame filters.
|
||||
1
changelog/3510.changed.3.md
Normal file
1
changelog/3510.changed.3.md
Normal file
@@ -0,0 +1 @@
|
||||
- Changed frame filter storage from tuples to sets in `PipelineTask`.
|
||||
1
changelog/3519.added.2.md
Normal file
1
changelog/3519.added.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `RTVIProcessor.create_rtvi_observer()` factory method for creating RTVI observers.
|
||||
1
changelog/3519.added.3.md
Normal file
1
changelog/3519.added.3.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `FrameProcessor.broadcast_frame_instance(frame)` method to broadcast a frame instance by extracting its fields and creating new instances for each direction.
|
||||
1
changelog/3519.added.md
Normal file
1
changelog/3519.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- `PipelineTask` now automatically adds `RTVIProcessor` and registers `RTVIObserver` when `enable_rtvi=True` (default), simplifying pipeline setup.
|
||||
1
changelog/3519.fixed.2.md
Normal file
1
changelog/3519.fixed.2.md
Normal file
@@ -0,0 +1 @@
|
||||
- Fixed `FrameProcessor.broadcast_frame()` to deep copy kwargs, preventing shared mutable references between the downstream and upstream frame instances.
|
||||
1
changelog/3519.fixed.md
Normal file
1
changelog/3519.fixed.md
Normal file
@@ -0,0 +1 @@
|
||||
- Transports now properly broadcast `InputTransportMessageFrame` frames both upstream and downstream instead of only pushing downstream.
|
||||
1
changelog/3520.added.md
Normal file
1
changelog/3520.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `video_out_codec` parameter to `TransportParams` allowing configuration of the preferred video codec (e.g., `"VP8"`, `"H264"`, `"H265"`) for video output in `DailyTransport`.
|
||||
1
changelog/3523.added.md
Normal file
1
changelog/3523.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added `location` parameter to Google TTS services (`GoogleHttpTTSService`, `GoogleTTSService`, `GeminiTTSService`) for regional endpoint support.
|
||||
1
changelog/3525.added.md
Normal file
1
changelog/3525.added.md
Normal file
@@ -0,0 +1 @@
|
||||
- Added new `SMART_TURN_LOG_DATA` environment variable, which causes Smart Turn input data to be saved to disk
|
||||
2
changelog/3531.changed.md
Normal file
2
changelog/3531.changed.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- Changed default Inworld TTS model from `inworld-tts-1` to
|
||||
`inworld-tts-1.5-max`.
|
||||
@@ -91,6 +91,25 @@ autodoc_mock_imports = [
|
||||
# MLX dependencies (Apple Silicon specific)
|
||||
"mlx",
|
||||
"mlx_whisper", # Note: might need underscore format too
|
||||
# Pydantic v2 compatibility issues in third-party SDKs
|
||||
"hume",
|
||||
"hume.tts",
|
||||
"hume.tts.types",
|
||||
"cartesia",
|
||||
"camb",
|
||||
"sarvamai",
|
||||
"openpipe",
|
||||
"openai.types.beta.realtime",
|
||||
"langchain_core",
|
||||
"langchain_core.messages",
|
||||
# FastAPI - Pydantic v2 compatibility issues during Sphinx autodoc
|
||||
"fastapi",
|
||||
"fastapi.applications",
|
||||
"fastapi.routing",
|
||||
"fastapi.params",
|
||||
"fastapi.middleware",
|
||||
"fastapi.responses",
|
||||
"uvicorn",
|
||||
]
|
||||
|
||||
# HTML output settings
|
||||
|
||||
@@ -31,6 +31,9 @@ AZURE_DALLE_API_KEY=...
|
||||
AZURE_DALLE_ENDPOINT=https://...
|
||||
AZURE_DALLE_MODEL=...
|
||||
|
||||
# Camb.ai
|
||||
CAMB_API_KEY=...
|
||||
|
||||
# Cartesia
|
||||
CARTESIA_API_KEY=...
|
||||
CARTESIA_VOICE_ID=...
|
||||
@@ -82,6 +85,9 @@ GROK_API_KEY=...
|
||||
# Groq
|
||||
GROQ_API_KEY=...
|
||||
|
||||
# Hathora
|
||||
HATHORA_API_KEY=...
|
||||
|
||||
# Heygen
|
||||
HEYGEN_API_KEY=...
|
||||
HEYGEN_LIVE_AVATAR_API_KEY=...
|
||||
|
||||
@@ -10,7 +10,6 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
|
||||
@@ -45,7 +45,6 @@ from pipecat.services.google.tts import GoogleTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter
|
||||
from pipecat.audio.turn.krisp_viva_turn import KrispTurnParams, KrispVivaTurn
|
||||
from pipecat.audio.turn.krisp_viva_turn import KrispVivaTurn
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
|
||||
@@ -23,7 +23,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -93,12 +92,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
rtvi,
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
@@ -115,7 +111,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
@@ -22,7 +22,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -88,12 +87,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
rtvi,
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
@@ -110,7 +106,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
@@ -22,7 +22,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -90,12 +89,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
rtvi,
|
||||
stt,
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
@@ -114,7 +110,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
observers=[
|
||||
RTVIObserver(rtvi),
|
||||
DebugLogObserver(
|
||||
frame_types={
|
||||
TTSTextFrame: (BaseOutputTransport, FrameEndpoint.SOURCE),
|
||||
@@ -123,10 +118,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
138
examples/foundational/07zg-interruptible-camb.py
Normal file
138
examples/foundational/07zg-interruptible-camb.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#
|
||||
# Copyright (c) 2024–2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.camb.tts import CambTTSService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info("Starting Camb AI TTS bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = CambTTSService(
|
||||
api_key=os.getenv("CAMB_API_KEY"),
|
||||
model="mars-flash",
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful voice assistant powered by Camb AI text-to-speech. "
|
||||
"Keep your responses concise and conversational since they will be spoken aloud. "
|
||||
"Avoid special characters, emojis, or bullet points.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
stt,
|
||||
user_aggregator,
|
||||
llm,
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
audio_out_sample_rate=22050,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info("Client connected")
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info("Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -1,18 +1,14 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
@@ -25,12 +21,10 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.filters.stt_mute_filter import STTMuteConfig, STTMuteFilter, STTMuteStrategy
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.hathora.stt import HathoraSTTService
|
||||
from pipecat.services.hathora.tts import HathoraTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
@@ -40,15 +34,6 @@ from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
async def fetch_weather_from_api(params: FunctionCallParams):
|
||||
# Add a delay to test interruption during function calls
|
||||
logger.info("Weather API call starting...")
|
||||
await asyncio.sleep(5) # 5-second delay
|
||||
logger.info("Weather API call completed")
|
||||
await params.result_callback({"conditions": "nice", "temperature": "75"})
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -74,50 +59,30 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
# Configure the mute processor with both strategies
|
||||
stt_mute_processor = STTMuteFilter(
|
||||
config=STTMuteConfig(
|
||||
strategies={
|
||||
STTMuteStrategy.MUTE_UNTIL_FIRST_BOT_COMPLETE,
|
||||
STTMuteStrategy.FUNCTION_CALL,
|
||||
}
|
||||
),
|
||||
stt = HathoraSTTService(
|
||||
model="nvidia-parakeet-tdt-0.6b-v3",
|
||||
)
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
llm.register_function("get_current_weather", fetch_weather_from_api)
|
||||
|
||||
weather_function = FunctionSchema(
|
||||
name="get_current_weather",
|
||||
description="Get the current weather",
|
||||
properties={
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "The temperature unit to use. Infer this from the user's location.",
|
||||
},
|
||||
},
|
||||
required=["location", "format"],
|
||||
tts = HathoraTTSService(
|
||||
model="hexgrad-kokoro-82m",
|
||||
)
|
||||
|
||||
# See https://models.hathora.dev/model/qwen3-30b-a3b
|
||||
llm = OpenAILLMService(
|
||||
base_url="https://app-362f7ca1-6975-4e18-a605-ab202bf2c315.app.hathora.dev/v1",
|
||||
api_key=os.getenv("HATHORA_API_KEY"),
|
||||
model=None,
|
||||
)
|
||||
tools = ToolsSchema(standard_tools=[weather_function])
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant who can check the weather. Always check the weather when a location is mentioned. Respond concisely and naturally. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages, tools)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
@@ -129,13 +94,12 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
stt_mute_processor, # Add the mute processor between STT and context aggregator
|
||||
user_aggregator, # User responses
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
assistant_aggregator, # Assistant spoken responses
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -151,13 +115,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation with a weather-related prompt
|
||||
messages.append(
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Ask the user what city they'd like to know the weather for.",
|
||||
}
|
||||
)
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
@@ -22,7 +22,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
|
||||
@@ -13,7 +13,12 @@ from loguru import logger
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import EndFrame, LLMMessagesAppendFrame, LLMRunFrame, TTSSpeakFrame
|
||||
from pipecat.frames.frames import (
|
||||
EndTaskFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMRunFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
@@ -22,7 +27,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.user_idle_processor import UserIdleProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -36,6 +41,43 @@ from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class IdleHandler:
|
||||
"""Helper class to manage user idle retry logic."""
|
||||
|
||||
def __init__(self):
|
||||
self._retry_count = 0
|
||||
|
||||
def reset(self):
|
||||
"""Reset the retry count when user becomes active."""
|
||||
self._retry_count = 0
|
||||
|
||||
async def handle_idle(self, aggregator):
|
||||
"""Handle user idle event with escalating prompts."""
|
||||
self._retry_count += 1
|
||||
|
||||
if self._retry_count == 1:
|
||||
# First attempt: Add a gentle prompt to the conversation
|
||||
message = {
|
||||
"role": "system",
|
||||
"content": "The user has been quiet. Politely and briefly ask if they're still there.",
|
||||
}
|
||||
await aggregator.push_frame(LLMMessagesAppendFrame([message], run_llm=True))
|
||||
elif self._retry_count == 2:
|
||||
# Second attempt: More direct prompt
|
||||
message = {
|
||||
"role": "system",
|
||||
"content": "The user is still inactive. Ask if they'd like to continue our conversation.",
|
||||
}
|
||||
await aggregator.push_frame(LLMMessagesAppendFrame([message], run_llm=True))
|
||||
else:
|
||||
# Third attempt: End the conversation
|
||||
await aggregator.push_frame(
|
||||
TTSSpeakFrame("It seems like you're busy right now. Have a nice day!")
|
||||
)
|
||||
await aggregator.push_frame(EndTaskFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
@@ -84,42 +126,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
user_idle_timeout=5.0, # Detect user idle after 5 seconds
|
||||
),
|
||||
)
|
||||
|
||||
async def handle_user_idle(user_idle: UserIdleProcessor, retry_count: int) -> bool:
|
||||
if retry_count == 1:
|
||||
# First attempt: Add a gentle prompt to the conversation
|
||||
message = {
|
||||
"role": "system",
|
||||
"content": "The user has been quiet. Politely and briefly ask if they're still there.",
|
||||
}
|
||||
await user_idle.push_frame(LLMMessagesAppendFrame([message], run_llm=True))
|
||||
return True
|
||||
elif retry_count == 2:
|
||||
# Second attempt: More direct prompt
|
||||
message = {
|
||||
"role": "system",
|
||||
"content": "The user is still inactive. Ask if they'd like to continue our conversation.",
|
||||
}
|
||||
await user_idle.push_frame(LLMMessagesAppendFrame([message], run_llm=True))
|
||||
return True
|
||||
else:
|
||||
# Third attempt: End the conversation
|
||||
await user_idle.push_frame(
|
||||
TTSSpeakFrame("It seems like you're busy right now. Have a nice day!")
|
||||
)
|
||||
await task.queue_frame(EndFrame())
|
||||
return False
|
||||
|
||||
user_idle = UserIdleProcessor(callback=handle_user_idle, timeout=5.0)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt,
|
||||
user_idle, # Idle user check-in
|
||||
user_aggregator,
|
||||
user_aggregator, # User aggregator with built-in idle detection
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
@@ -136,6 +151,17 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
# Set up idle handling with retry logic
|
||||
idle_handler = IdleHandler()
|
||||
|
||||
@user_aggregator.event_handler("on_user_turn_idle")
|
||||
async def on_user_turn_idle(aggregator):
|
||||
await idle_handler.handle_idle(aggregator)
|
||||
|
||||
@user_aggregator.event_handler("on_user_turn_started")
|
||||
async def on_user_turn_started(aggregator, strategy):
|
||||
idle_handler.reset()
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
|
||||
@@ -17,7 +17,7 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
|
||||
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
|
||||
@@ -22,7 +22,6 @@ from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
|
||||
@@ -114,6 +113,14 @@ async def load_conversation(params: FunctionCallParams):
|
||||
# "content": f"{AWSNovaSonicLLMService.AWAIT_TRIGGER_ASSISTANT_RESPONSE_INSTRUCTION}",
|
||||
# }
|
||||
# )
|
||||
# If the last message isn't from the user, add a message asking for a recap
|
||||
if messages and messages[-1].get("role") != "user":
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Can you catch me up on what we were talking about?",
|
||||
}
|
||||
)
|
||||
params.context.set_messages(messages)
|
||||
await params.llm.reset_conversation()
|
||||
# await params.llm.trigger_assistant_response()
|
||||
|
||||
@@ -119,7 +119,7 @@ class CompletenessCheck(FrameProcessor):
|
||||
|
||||
if isinstance(frame, TextFrame) and frame.text == "YES":
|
||||
logger.debug("Completeness check YES")
|
||||
await self.push_frame(UserStoppedSpeakingFrame())
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
await self._notifier.notify()
|
||||
elif isinstance(frame, TextFrame) and frame.text == "NO":
|
||||
logger.debug("Completeness check NO")
|
||||
|
||||
@@ -322,7 +322,7 @@ class CompletenessCheck(FrameProcessor):
|
||||
|
||||
if isinstance(frame, TextFrame) and frame.text == "YES":
|
||||
logger.debug("!!! Completeness check YES")
|
||||
await self.push_frame(UserStoppedSpeakingFrame())
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
await self._notifier.notify()
|
||||
elif isinstance(frame, TextFrame) and frame.text == "NO":
|
||||
logger.debug("!!! Completeness check NO")
|
||||
|
||||
@@ -451,7 +451,7 @@ class CompletenessCheck(FrameProcessor):
|
||||
logger.debug("Completeness check YES")
|
||||
if self._idle_task:
|
||||
await self.cancel_task(self._idle_task)
|
||||
await self.push_frame(UserStoppedSpeakingFrame())
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
await self._audio_accumulator.reset()
|
||||
await self._notifier.notify()
|
||||
elif isinstance(frame, TextFrame):
|
||||
|
||||
@@ -34,7 +34,7 @@ from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
from pipecat.turns.mute import (
|
||||
from pipecat.turns.user_mute import (
|
||||
FunctionCallUserMuteStrategy,
|
||||
MuteUntilFirstBotCompleteUserMuteStrategy,
|
||||
)
|
||||
@@ -161,6 +161,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@user_aggregator.event_handler("on_user_mute_started")
|
||||
async def on_user_mute_started(aggregator):
|
||||
logger.info(f"User mute started")
|
||||
|
||||
@user_aggregator.event_handler("on_user_mute_stopped")
|
||||
async def on_user_mute_stopped(aggregator):
|
||||
logger.info(f"User mute stopped")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2025, Daily
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024-2025, Daily
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
@@ -59,7 +59,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
@@ -255,12 +254,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
),
|
||||
)
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
rtvi,
|
||||
stt,
|
||||
user_aggregator,
|
||||
memory,
|
||||
@@ -278,12 +275,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
@task.rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
# Get personalized greeting based on user memories. Can pass agent_id and run_id as per requirement of the application to manage short term memory or agent specific memory.
|
||||
greeting = await get_initial_greeting(
|
||||
memory_client=memory.memory_client, user_id=USER_ID, agent_id=None, run_id=None
|
||||
|
||||
@@ -22,7 +22,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -87,8 +86,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
@@ -108,13 +105,11 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
@task.rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
await rtvi.set_bot_ready()
|
||||
# Kick off the conversation
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@@ -9,7 +9,6 @@ import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
import aiohttp
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2025, Daily
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
@@ -22,7 +22,6 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
@@ -125,14 +124,10 @@ async def run_bot(pipecat_transport):
|
||||
),
|
||||
)
|
||||
|
||||
# RTVI events for Pipecat client UI
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
pipecat_transport.input(),
|
||||
user_aggregator,
|
||||
rtvi,
|
||||
llm, # LLM
|
||||
EdgeDetectionProcessor(
|
||||
pipecat_transport._params.video_out_width,
|
||||
@@ -149,13 +144,11 @@ async def run_bot(pipecat_transport):
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
observers=[RTVIObserver(rtvi)],
|
||||
)
|
||||
|
||||
@rtvi.event_handler("on_client_ready")
|
||||
@task.rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi):
|
||||
logger.info("Pipecat client ready.")
|
||||
await rtvi.set_bot_ready()
|
||||
# Kick off the conversation.
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import LLMRunFrame, ThoughtTranscriptionMessage, TranscriptionMessage
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
|
||||
@@ -53,8 +53,6 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.grok.realtime.events import (
|
||||
SessionProperties,
|
||||
WebSearchTool,
|
||||
XSearchTool,
|
||||
)
|
||||
from pipecat.services.grok.realtime.llm import GrokRealtimeLLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
|
||||
@@ -44,8 +44,11 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
)
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
@@ -53,6 +56,10 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.turns.user_stop.turn_analyzer_user_turn_stop_strategy import (
|
||||
TurnAnalyzerUserTurnStopStrategy,
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
logger.info("✅ All components loaded successfully!")
|
||||
|
||||
@@ -79,20 +86,27 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
user_turn_strategies=UserTurnStrategies(
|
||||
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
|
||||
rtvi = RTVIProcessor()
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
rtvi, # RTVI processor
|
||||
stt,
|
||||
context_aggregator.user(), # User responses
|
||||
user_aggregator, # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
assistant_aggregator, # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
@@ -130,13 +144,11 @@ async def bot(runner_args: RunnerArguments):
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(),
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
agent_name = "quickstart"
|
||||
image = "your_username/quickstart:0.1"
|
||||
secret_set = "quickstart-secrets"
|
||||
agent_name = "quickstart-test"
|
||||
image = "markatdaily/quickstart-test:latest"
|
||||
secret_set = "quickstart-test-secrets"
|
||||
agent_profile = "agent-1x"
|
||||
|
||||
# RECOMMENDED: Set an image pull secret:
|
||||
# https://docs.pipecat.ai/deployment/pipecat-cloud/fundamentals/secrets#image-pull-secrets
|
||||
# image_credentials = "your_image_pull_secret"
|
||||
image_credentials = "dockerhub-access"
|
||||
|
||||
[scaling]
|
||||
min_agents = 1
|
||||
|
||||
@@ -41,8 +41,11 @@ dependencies = [
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://pipecat.ai"
|
||||
Documentation = "https://docs.pipecat.ai/"
|
||||
Source = "https://github.com/pipecat-ai/pipecat"
|
||||
Website = "https://pipecat.ai"
|
||||
Issues = "https://github.com/pipecat-ai/pipecat/issues"
|
||||
Changelog = "https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md"
|
||||
|
||||
[project.optional-dependencies]
|
||||
aic = [ "aic-sdk~=1.2.0" ]
|
||||
@@ -53,6 +56,7 @@ aws = [ "aioboto3~=15.5.0", "pipecat-ai[websockets-base]" ]
|
||||
aws-nova-sonic = [ "aws_sdk_bedrock_runtime~=0.2.0; python_version>='3.12'" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.44.0"]
|
||||
cartesia = [ "cartesia~=2.0.3", "pipecat-ai[websockets-base]" ]
|
||||
camb = [ "camb-sdk>=1.5.4" ]
|
||||
cerebras = []
|
||||
daily = [ "daily-python~=0.23.0" ]
|
||||
deepgram = [ "deepgram-sdk~=4.7.0", "pipecat-ai[websockets-base]" ]
|
||||
@@ -96,7 +100,7 @@ qwen = []
|
||||
remote-smart-turn = []
|
||||
rime = [ "pipecat-ai[websockets-base]" ]
|
||||
riva = [ "pipecat-ai[nvidia]" ]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.122.0", "pipecat-ai-small-webrtc-prebuilt>=2.0.4"]
|
||||
runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.128.0", "pipecat-ai-small-webrtc-prebuilt>=2.0.4"]
|
||||
sagemaker = ["aws_sdk_sagemaker_runtime_http2; python_version>='3.12'"]
|
||||
sambanova = []
|
||||
sarvam = [ "sarvamai==0.1.21", "pipecat-ai[websockets-base]" ]
|
||||
@@ -112,7 +116,7 @@ together = []
|
||||
tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ]
|
||||
ultravox = [ "pipecat-ai[websockets-base]" ]
|
||||
webrtc = [ "aiortc>=1.14.0,<2", "opencv-python>=4.11.0.86,<5" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.122.0" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.128.0" ]
|
||||
websockets-base = [ "websockets>=13.1,<16.0" ]
|
||||
whisper = [ "faster-whisper~=1.1.1" ]
|
||||
|
||||
|
||||
@@ -293,12 +293,13 @@ async def run_eval_pipeline(
|
||||
"You should only call the eval function if:\n"
|
||||
"- The user explicitly attempts to answer the question, AND\n"
|
||||
f"- Their answer can be cleanly evaluated using: {eval_config.eval}\n"
|
||||
"Ignore greetings, comments, non-answers, or requests for clarification."
|
||||
"Ignore greetings, comments, non-answers, or requests for clarification.\n"
|
||||
"Numerical word answers are allowed (e.g., 'five' is the same as '5').\n"
|
||||
)
|
||||
if eval_config.eval_speaks_first:
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. Numerical word answers are allowed. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}"
|
||||
else:
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. Numerical word answers are allowed. First, ask one question: {example_prompt}. {common_system_prompt}"
|
||||
system_prompt = f"You are an evaluation agent, be extremly brief. First, ask one question: {example_prompt}. {common_system_prompt}"
|
||||
|
||||
messages = [
|
||||
{
|
||||
|
||||
@@ -97,15 +97,6 @@ TESTS_07 = [
|
||||
("07-interruptible-cartesia-http.py", EVAL_SIMPLE_MATH),
|
||||
("07a-interruptible-speechmatics.py", EVAL_SIMPLE_MATH),
|
||||
("07a-interruptible-speechmatics-vad.py", EVAL_SIMPLE_MATH),
|
||||
("07aa-interruptible-soniox.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld.py", EVAL_SIMPLE_MATH),
|
||||
("07ab-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
|
||||
("07ac-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
|
||||
# Need license key to run
|
||||
# ("07ad-interruptible-aicoustics.py", EVAL_SIMPLE_MATH),
|
||||
("07ae-interruptible-hume.py", EVAL_SIMPLE_MATH),
|
||||
("07af-interruptible-gradium.py", EVAL_SIMPLE_MATH),
|
||||
("07b-interruptible-langchain.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram.py", EVAL_SIMPLE_MATH),
|
||||
("07c-interruptible-deepgram-flux.py", EVAL_SIMPLE_MATH),
|
||||
@@ -137,6 +128,17 @@ TESTS_07 = [
|
||||
("07y-interruptible-minimax.py", EVAL_SIMPLE_MATH),
|
||||
("07z-interruptible-sarvam.py", EVAL_SIMPLE_MATH),
|
||||
("07z-interruptible-sarvam-http.py", EVAL_SIMPLE_MATH),
|
||||
("07za-interruptible-soniox.py", EVAL_SIMPLE_MATH),
|
||||
("07zb-interruptible-inworld.py", EVAL_SIMPLE_MATH),
|
||||
("07zb-interruptible-inworld-http.py", EVAL_SIMPLE_MATH),
|
||||
("07zc-interruptible-asyncai.py", EVAL_SIMPLE_MATH),
|
||||
("07zc-interruptible-asyncai-http.py", EVAL_SIMPLE_MATH),
|
||||
# Need license key to run
|
||||
# ("07zd-interruptible-aicoustics.py", EVAL_SIMPLE_MATH),
|
||||
("07ze-interruptible-hume.py", EVAL_SIMPLE_MATH),
|
||||
("07zf-interruptible-gradium.py", EVAL_SIMPLE_MATH),
|
||||
("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
|
||||
("07zh-interruptible-hathora.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a Krisp license.
|
||||
|
||||
@@ -22,7 +22,7 @@ from pathlib import Path
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import soundfile as sf # noqa: F401
|
||||
from audio_file_utils import calculate_audio_stats, read_audio_file, write_audio_file
|
||||
except ImportError as e:
|
||||
print(f"Error: Missing required dependencies: {e}")
|
||||
|
||||
@@ -23,7 +23,7 @@ from pathlib import Path
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import soundfile as sf # noqa: F401
|
||||
from audio_file_utils import read_audio_file
|
||||
except ImportError as e:
|
||||
print(f"Error: Missing required dependencies: {e}")
|
||||
|
||||
@@ -10,7 +10,7 @@ import base64
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import base64
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple, TypedDict
|
||||
from typing import Any, Dict, List, Optional, TypedDict
|
||||
|
||||
from loguru import logger
|
||||
from openai import NotGiven
|
||||
|
||||
@@ -7,10 +7,8 @@
|
||||
"""OpenAI LLM adapter for Pipecat."""
|
||||
|
||||
import copy
|
||||
import json
|
||||
from typing import Any, Dict, List, TypedDict
|
||||
|
||||
from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
|
||||
from openai._types import NotGiven as OpenAINotGiven
|
||||
from openai.types.chat import (
|
||||
ChatCompletionMessageParam,
|
||||
|
||||
@@ -61,6 +61,7 @@ class KrispFilter(BaseAudioFilter):
|
||||
Provides real-time noise reduction for audio streams using Krisp's
|
||||
proprietary noise suppression algorithms. Requires a Krisp model file
|
||||
for operation.
|
||||
|
||||
.. deprecated:: 0.0.94
|
||||
The KrispFilter is deprecated and will be removed in a future version.
|
||||
Use KrispVivaFilter instead.
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
This module provides an audio filter implementation using Krisp VIVA SDK.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -16,6 +16,7 @@ import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
|
||||
from pipecat.utils.env import env_truthy
|
||||
|
||||
try:
|
||||
import onnxruntime as ort
|
||||
@@ -48,6 +49,8 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._log_data = env_truthy("SMART_TURN_LOG_DATA", default=False)
|
||||
|
||||
if not smart_turn_model_path:
|
||||
# Load bundled model
|
||||
model_name = "smart-turn-v3.2-cpu.onnx"
|
||||
@@ -81,6 +84,49 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
|
||||
logger.debug("Loaded Local Smart Turn v3.x")
|
||||
|
||||
def _write_audio_to_wav(
|
||||
self, audio_array: np.ndarray, sample_rate: int = 16000, suffix: str = ""
|
||||
) -> None:
|
||||
"""Write audio data to a WAV file in a background thread.
|
||||
|
||||
Args:
|
||||
audio_array: The audio data as a numpy array (float32, normalized to [-1, 1]).
|
||||
sample_rate: The sample rate of the audio data.
|
||||
suffix: Optional suffix to append to the filename (e.g., "_raw", "_padded").
|
||||
"""
|
||||
import os
|
||||
import threading
|
||||
import wave
|
||||
from datetime import datetime
|
||||
|
||||
# Generate filename with current timestamp (millisecond precision)
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d__%H:%M:%S.%f")[:-3]
|
||||
log_dir = "./smart_turn_audio_log"
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
filename = os.path.join(log_dir, f"{timestamp}{suffix}.wav")
|
||||
|
||||
# Make a copy of the audio data to avoid issues with the array being modified
|
||||
audio_copy = audio_array.copy()
|
||||
|
||||
def write_wav():
|
||||
try:
|
||||
# Convert float32 audio to int16 for WAV file
|
||||
audio_int16 = (audio_copy * 32767).astype(np.int16)
|
||||
|
||||
with wave.open(filename, "wb") as wav_file:
|
||||
wav_file.setnchannels(1) # Mono
|
||||
wav_file.setsampwidth(2) # 2 bytes for int16
|
||||
wav_file.setframerate(sample_rate)
|
||||
wav_file.writeframes(audio_int16.tobytes())
|
||||
|
||||
logger.debug(f"Wrote audio to {filename}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write audio to {filename}: {e}")
|
||||
|
||||
# Start background thread to write the WAV file
|
||||
thread = threading.Thread(target=write_wav, daemon=True)
|
||||
thread.start()
|
||||
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local ONNX model."""
|
||||
|
||||
@@ -95,6 +141,8 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0)
|
||||
return audio_array
|
||||
|
||||
audio_for_logging = audio_array
|
||||
|
||||
# Truncate to 8 seconds (keeping the end) or pad to 8 seconds
|
||||
audio_array = truncate_audio_to_last_n_seconds(audio_array, n_seconds=8)
|
||||
|
||||
@@ -122,6 +170,10 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
# Make prediction (1 for Complete, 0 for Incomplete)
|
||||
prediction = 1 if probability > 0.5 else 0
|
||||
|
||||
if self._log_data:
|
||||
suffix = "_complete" if prediction == 1 else "_incomplete"
|
||||
self._write_audio_to_wav(audio_for_logging, sample_rate=16000, suffix=suffix)
|
||||
|
||||
return {
|
||||
"prediction": prediction,
|
||||
"probability": probability,
|
||||
|
||||
@@ -15,7 +15,7 @@ import asyncio
|
||||
import importlib.util
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Tuple, Type
|
||||
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Set, Tuple, Type
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
@@ -49,6 +49,7 @@ from pipecat.pipeline.pipeline import Pipeline, PipelineSink, PipelineSource
|
||||
from pipecat.pipeline.task_observer import TaskObserver
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserverParams, RTVIProcessor
|
||||
from pipecat.utils.asyncio.task_manager import BaseTaskManager, TaskManager, TaskManagerParams
|
||||
from pipecat.utils.tracing.setup import is_tracing_available
|
||||
from pipecat.utils.tracing.turn_trace_observer import TurnTraceObserver
|
||||
@@ -225,9 +226,12 @@ class PipelineTask(BasePipelineTask):
|
||||
conversation_id: Optional[str] = None,
|
||||
enable_tracing: bool = False,
|
||||
enable_turn_tracking: bool = True,
|
||||
enable_rtvi: bool = True,
|
||||
idle_timeout_frames: Tuple[Type[Frame], ...] = (BotSpeakingFrame, UserSpeakingFrame),
|
||||
idle_timeout_secs: Optional[float] = IDLE_TIMEOUT_SECS,
|
||||
observers: Optional[List[BaseObserver]] = None,
|
||||
rtvi_processor: Optional[RTVIProcessor] = None,
|
||||
rtvi_observer_params: Optional[RTVIObserverParams] = None,
|
||||
task_manager: Optional[BaseTaskManager] = None,
|
||||
):
|
||||
"""Initialize the PipelineTask.
|
||||
@@ -244,6 +248,7 @@ class PipelineTask(BasePipelineTask):
|
||||
check_dangling_tasks: Whether to check for processors' tasks finishing properly.
|
||||
clock: Clock implementation for timing operations.
|
||||
conversation_id: Optional custom ID for the conversation.
|
||||
enable_rtvi: Whether to automatically add RTVI support to the pipeline.
|
||||
enable_tracing: Whether to enable tracing.
|
||||
enable_turn_tracking: Whether to enable turn tracking.
|
||||
idle_timeout_frames: A tuple with the frames that should trigger an idle
|
||||
@@ -252,6 +257,8 @@ class PipelineTask(BasePipelineTask):
|
||||
None. If a pipeline is idle the pipeline task will be cancelled
|
||||
automatically.
|
||||
observers: List of observers for monitoring pipeline execution.
|
||||
rtvi_observer_params: The RTVI observer parameter to use if RTVI is enabled.
|
||||
rtvi_processor: The RTVI processor to add if RTVI is enabled.
|
||||
task_manager: Optional task manager for handling asyncio tasks.
|
||||
"""
|
||||
super().__init__()
|
||||
@@ -306,6 +313,16 @@ class PipelineTask(BasePipelineTask):
|
||||
self._heartbeat_push_task: Optional[asyncio.Task] = None
|
||||
self._heartbeat_monitor_task: Optional[asyncio.Task] = None
|
||||
|
||||
# RTVI support
|
||||
self._rtvi = None
|
||||
if enable_rtvi:
|
||||
self._rtvi = rtvi_processor or RTVIProcessor()
|
||||
observers.append(self._rtvi.create_rtvi_observer(params=rtvi_observer_params))
|
||||
|
||||
@self.rtvi.event_handler("on_client_ready")
|
||||
async def on_client_ready(rtvi: RTVIProcessor):
|
||||
await rtvi.set_bot_ready()
|
||||
|
||||
# This is the idle event. When selected frames are pushed from any
|
||||
# processor we consider the pipeline is not idle. We use an observer
|
||||
# which will be listening any part of the pipeline.
|
||||
@@ -335,7 +352,8 @@ class PipelineTask(BasePipelineTask):
|
||||
# allows us to receive and react to downstream frames.
|
||||
source = PipelineSource(self._source_push_frame, name=f"{self}::Source")
|
||||
sink = PipelineSink(self._sink_push_frame, name=f"{self}::Sink")
|
||||
self._pipeline = Pipeline([pipeline], source=source, sink=sink)
|
||||
processors = [self._rtvi, pipeline] if self._rtvi else [pipeline]
|
||||
self._pipeline = Pipeline(processors, source=source, sink=sink)
|
||||
|
||||
# The task observer acts as a proxy to the provided observers. This way,
|
||||
# we only need to pass a single observer (using the StartFrame) which
|
||||
@@ -348,8 +366,8 @@ class PipelineTask(BasePipelineTask):
|
||||
# in. This is mainly for efficiency reason because each event handler
|
||||
# creates a task and most likely you only care about one or two frame
|
||||
# types.
|
||||
self._reached_upstream_types: Tuple[Type[Frame], ...] = ()
|
||||
self._reached_downstream_types: Tuple[Type[Frame], ...] = ()
|
||||
self._reached_upstream_types: Set[Type[Frame]] = set()
|
||||
self._reached_downstream_types: Set[Type[Frame]] = set()
|
||||
self._register_event_handler("on_frame_reached_upstream")
|
||||
self._register_event_handler("on_frame_reached_downstream")
|
||||
self._register_event_handler("on_idle_timeout")
|
||||
@@ -398,6 +416,35 @@ class PipelineTask(BasePipelineTask):
|
||||
"""
|
||||
return self._turn_trace_observer
|
||||
|
||||
@property
|
||||
def rtvi(self) -> RTVIProcessor:
|
||||
"""Get the RTVI processor if RTVI is enabled.
|
||||
|
||||
Returns:
|
||||
The RTVI processor added to the pipeline when RTVI is enabled.
|
||||
"""
|
||||
if not self._rtvi:
|
||||
raise Exception(f"{self} RTVI is not enabled.")
|
||||
return self._rtvi
|
||||
|
||||
@property
|
||||
def reached_upstream_types(self) -> Tuple[Type[Frame], ...]:
|
||||
"""Get the currently configured upstream frame type filters.
|
||||
|
||||
Returns:
|
||||
Tuple of frame types that trigger the on_frame_reached_upstream event.
|
||||
"""
|
||||
return tuple(self._reached_upstream_types)
|
||||
|
||||
@property
|
||||
def reached_downstream_types(self) -> Tuple[Type[Frame], ...]:
|
||||
"""Get the currently configured downstream frame type filters.
|
||||
|
||||
Returns:
|
||||
Tuple of frame types that trigger the on_frame_reached_downstream event.
|
||||
"""
|
||||
return tuple(self._reached_downstream_types)
|
||||
|
||||
def event_handler(self, event_name: str):
|
||||
"""Decorator for registering event handlers.
|
||||
|
||||
@@ -441,7 +488,7 @@ class PipelineTask(BasePipelineTask):
|
||||
Args:
|
||||
types: Tuple of frame types to monitor for upstream events.
|
||||
"""
|
||||
self._reached_upstream_types = types
|
||||
self._reached_upstream_types = set(types)
|
||||
|
||||
def set_reached_downstream_filter(self, types: Tuple[Type[Frame], ...]):
|
||||
"""Set which frame types trigger the on_frame_reached_downstream event.
|
||||
@@ -449,7 +496,23 @@ class PipelineTask(BasePipelineTask):
|
||||
Args:
|
||||
types: Tuple of frame types to monitor for downstream events.
|
||||
"""
|
||||
self._reached_downstream_types = types
|
||||
self._reached_downstream_types = set(types)
|
||||
|
||||
def add_reached_upstream_filter(self, types: Tuple[Type[Frame], ...]):
|
||||
"""Add frame types to trigger the on_frame_reached_upstream event.
|
||||
|
||||
Args:
|
||||
types: Tuple of frame types to add to upstream monitoring.
|
||||
"""
|
||||
self._reached_upstream_types.update(types)
|
||||
|
||||
def add_reached_downstream_filter(self, types: Tuple[Type[Frame], ...]):
|
||||
"""Add frame types to trigger the on_frame_reached_downstream event.
|
||||
|
||||
Args:
|
||||
types: Tuple of frame types to add to downstream monitoring.
|
||||
"""
|
||||
self._reached_downstream_types.update(types)
|
||||
|
||||
def has_finished(self) -> bool:
|
||||
"""Check if the pipeline task has finished execution.
|
||||
@@ -749,7 +812,7 @@ class PipelineTask(BasePipelineTask):
|
||||
pipeline to be stopped (e.g. EndTaskFrame) in which case we would send
|
||||
an EndFrame down the pipeline.
|
||||
"""
|
||||
if isinstance(frame, self._reached_upstream_types):
|
||||
if isinstance(frame, tuple(self._reached_upstream_types)):
|
||||
await self._call_event_handler("on_frame_reached_upstream", frame)
|
||||
|
||||
if isinstance(frame, EndTaskFrame):
|
||||
@@ -788,7 +851,7 @@ class PipelineTask(BasePipelineTask):
|
||||
processors have handled the EndFrame and therefore we can exit the task
|
||||
cleanly.
|
||||
"""
|
||||
if isinstance(frame, self._reached_downstream_types):
|
||||
if isinstance(frame, tuple(self._reached_downstream_types)):
|
||||
await self._call_event_handler("on_frame_reached_downstream", frame)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
|
||||
@@ -1024,10 +1024,8 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
|
||||
logger.debug(
|
||||
f"{self} FunctionCallCancelFrame: [{frame.function_name}:{frame.tool_call_id}]"
|
||||
)
|
||||
if frame.tool_call_id not in self._function_calls_in_progress:
|
||||
return
|
||||
|
||||
if self._function_calls_in_progress[frame.tool_call_id].cancel_on_interruption:
|
||||
function_call = self._function_calls_in_progress.get(frame.tool_call_id)
|
||||
if function_call and function_call.cancel_on_interruption:
|
||||
await self.handle_function_call_cancel(frame)
|
||||
del self._function_calls_in_progress[frame.tool_call_id]
|
||||
|
||||
|
||||
@@ -62,7 +62,8 @@ from pipecat.processors.aggregators.llm_context import (
|
||||
NotGiven,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.turns.mute import BaseUserMuteStrategy
|
||||
from pipecat.turns.user_idle_controller import UserIdleController
|
||||
from pipecat.turns.user_mute import BaseUserMuteStrategy
|
||||
from pipecat.turns.user_start import BaseUserTurnStartStrategy, UserTurnStartedParams
|
||||
from pipecat.turns.user_stop import BaseUserTurnStopStrategy, UserTurnStoppedParams
|
||||
from pipecat.turns.user_turn_controller import UserTurnController
|
||||
@@ -80,11 +81,16 @@ class LLMUserAggregatorParams:
|
||||
user_mute_strategies: List of user mute strategies.
|
||||
user_turn_stop_timeout: Time in seconds to wait before considering the
|
||||
user's turn finished.
|
||||
user_idle_timeout: Optional timeout in seconds for detecting user idle state.
|
||||
If set, the aggregator will emit an `on_user_turn_idle` event when the user
|
||||
has been idle (not speaking) for this duration. Set to None to disable
|
||||
idle detection.
|
||||
"""
|
||||
|
||||
user_turn_strategies: Optional[UserTurnStrategies] = None
|
||||
user_mute_strategies: List[BaseUserMuteStrategy] = field(default_factory=list)
|
||||
user_turn_stop_timeout: float = 5.0
|
||||
user_idle_timeout: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -291,11 +297,14 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
- on_user_turn_started: Called when the user turn starts
|
||||
- on_user_turn_stopped: Called when the user turn ends
|
||||
- on_user_turn_stop_timeout: Called when no user turn stop strategy triggers
|
||||
- on_user_turn_idle: Called when the user has been idle for the configured timeout
|
||||
- on_user_mute_started: Called when the user becomes muted
|
||||
- on_user_mute_stopped: Called when the user becomes unmuted
|
||||
|
||||
Example::
|
||||
|
||||
@aggregator.event_handler("on_user_turn_started")
|
||||
async def on_user_turn_started(aggregator, strategy: BaseUserTurnStartStrategy]):
|
||||
async def on_user_turn_started(aggregator, strategy: BaseUserTurnStartStrategy):
|
||||
...
|
||||
|
||||
@aggregator.event_handler("on_user_turn_stopped")
|
||||
@@ -306,6 +315,18 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
async def on_user_turn_stop_timeout(aggregator):
|
||||
...
|
||||
|
||||
@aggregator.event_handler("on_user_turn_idle")
|
||||
async def on_user_turn_idle(aggregator):
|
||||
...
|
||||
|
||||
@aggregator.event_handler("on_user_mute_started")
|
||||
async def on_user_mute_started(aggregator):
|
||||
...
|
||||
|
||||
@aggregator.event_handler("on_user_mute_stopped")
|
||||
async def on_user_mute_stopped(aggregator):
|
||||
...
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -328,6 +349,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
self._register_event_handler("on_user_turn_started")
|
||||
self._register_event_handler("on_user_turn_stopped")
|
||||
self._register_event_handler("on_user_turn_stop_timeout")
|
||||
self._register_event_handler("on_user_turn_idle")
|
||||
self._register_event_handler("on_user_mute_started")
|
||||
self._register_event_handler("on_user_mute_stopped")
|
||||
|
||||
user_turn_strategies = self._params.user_turn_strategies or UserTurnStrategies()
|
||||
|
||||
@@ -350,6 +374,16 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
"on_user_turn_stop_timeout", self._on_user_turn_stop_timeout
|
||||
)
|
||||
|
||||
# Optional user idle controller
|
||||
self._user_idle_controller: Optional[UserIdleController] = None
|
||||
if self._params.user_idle_timeout:
|
||||
self._user_idle_controller = UserIdleController(
|
||||
user_idle_timeout=self._params.user_idle_timeout
|
||||
)
|
||||
self._user_idle_controller.add_event_handler(
|
||||
"on_user_turn_idle", self._on_user_turn_idle
|
||||
)
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up processor resources."""
|
||||
await super().cleanup()
|
||||
@@ -405,6 +439,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
|
||||
await self._user_turn_controller.process_frame(frame)
|
||||
|
||||
if self._user_idle_controller:
|
||||
await self._user_idle_controller.process_frame(frame)
|
||||
|
||||
async def push_aggregation(self) -> str:
|
||||
"""Push the current aggregation."""
|
||||
if len(self._aggregation) == 0:
|
||||
@@ -420,6 +457,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
async def _start(self, frame: StartFrame):
|
||||
await self._user_turn_controller.setup(self.task_manager)
|
||||
|
||||
if self._user_idle_controller:
|
||||
await self._user_idle_controller.setup(self.task_manager)
|
||||
|
||||
for s in self._params.user_mute_strategies:
|
||||
await s.setup(self.task_manager)
|
||||
|
||||
@@ -432,6 +472,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
async def _cleanup(self):
|
||||
await self._user_turn_controller.cleanup()
|
||||
|
||||
if self._user_idle_controller:
|
||||
await self._user_idle_controller.cleanup()
|
||||
|
||||
for s in self._params.user_mute_strategies:
|
||||
await s.cleanup()
|
||||
|
||||
@@ -461,6 +504,12 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
logger.debug(f"{self}: user is now {'muted' if should_mute_next_time else 'unmuted'}")
|
||||
self._user_is_muted = should_mute_next_time
|
||||
|
||||
# Emit mute state change events
|
||||
if self._user_is_muted:
|
||||
await self._call_event_handler("on_user_mute_started")
|
||||
else:
|
||||
await self._call_event_handler("on_user_mute_stopped")
|
||||
|
||||
return should_mute_frame
|
||||
|
||||
async def _handle_llm_run(self, frame: LLMRunFrame):
|
||||
@@ -565,6 +614,9 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
async def _on_user_turn_stop_timeout(self, controller):
|
||||
await self._call_event_handler("on_user_turn_stop_timeout")
|
||||
|
||||
async def _on_user_turn_idle(self, controller):
|
||||
await self._call_event_handler("on_user_turn_idle")
|
||||
|
||||
|
||||
class LLMAssistantAggregator(LLMContextAggregator):
|
||||
"""Assistant LLM aggregator that processes bot responses and function calls.
|
||||
@@ -858,10 +910,8 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
logger.debug(
|
||||
f"{self} FunctionCallCancelFrame: [{frame.function_name}:{frame.tool_call_id}]"
|
||||
)
|
||||
if frame.tool_call_id not in self._function_calls_in_progress:
|
||||
return
|
||||
|
||||
if self._function_calls_in_progress[frame.tool_call_id].cancel_on_interruption:
|
||||
function_call = self._function_calls_in_progress.get(frame.tool_call_id)
|
||||
if function_call and function_call.cancel_on_interruption:
|
||||
# Update context with the function call cancellation
|
||||
self._update_function_call_result(frame.function_name, frame.tool_call_id, "CANCELLED")
|
||||
del self._function_calls_in_progress[frame.tool_call_id]
|
||||
|
||||
@@ -34,7 +34,6 @@ from PIL import Image
|
||||
from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import AudioRawFrame, Frame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
# JSON custom encoder to handle bytes arrays so that we can log contexts
|
||||
# with images to the console.
|
||||
|
||||
@@ -18,7 +18,7 @@ from typing import List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,9 @@ management, and frame flow control mechanisms.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import traceback
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
@@ -779,8 +781,40 @@ class FrameProcessor(BaseObject):
|
||||
frame_cls: The class of the frame to be broadcasted.
|
||||
**kwargs: Keyword arguments to be passed to the frame's constructor.
|
||||
"""
|
||||
await self.push_frame(frame_cls(**kwargs))
|
||||
await self.push_frame(frame_cls(**kwargs), FrameDirection.UPSTREAM)
|
||||
await self.push_frame(frame_cls(**deepcopy(kwargs)))
|
||||
await self.push_frame(frame_cls(**deepcopy(kwargs)), FrameDirection.UPSTREAM)
|
||||
|
||||
async def broadcast_frame_instance(self, frame: Frame):
|
||||
"""Broadcasts a frame instance upstream and downstream.
|
||||
|
||||
This method creates two new frame instances copying all fields from the
|
||||
original frame except `id` and `name`, which get fresh values.
|
||||
|
||||
Args:
|
||||
frame: The frame instance to broadcast.
|
||||
|
||||
Note:
|
||||
Prefer using `broadcast_frame()` when possible, as it is more
|
||||
efficient. This method should only be used when you are not the
|
||||
creator of the frame and need to broadcast an existing instance.
|
||||
"""
|
||||
frame_cls = type(frame)
|
||||
init_fields = {f.name: getattr(frame, f.name) for f in dataclasses.fields(frame) if f.init}
|
||||
extra_fields = {
|
||||
f.name: getattr(frame, f.name)
|
||||
for f in dataclasses.fields(frame)
|
||||
if not f.init and f.name not in ("id", "name")
|
||||
}
|
||||
|
||||
new_frame = frame_cls(**deepcopy(init_fields))
|
||||
for k, v in deepcopy(extra_fields).items():
|
||||
setattr(new_frame, k, v)
|
||||
await self.push_frame(new_frame)
|
||||
|
||||
new_frame = frame_cls(**deepcopy(init_fields))
|
||||
for k, v in deepcopy(extra_fields).items():
|
||||
setattr(new_frame, k, v)
|
||||
await self.push_frame(new_frame, FrameDirection.UPSTREAM)
|
||||
|
||||
async def __start(self, frame: StartFrame):
|
||||
"""Handle the start frame to initialize processor state.
|
||||
@@ -950,7 +984,8 @@ class FrameProcessor(BaseObject):
|
||||
# Process current queue and keep UninterruptibleFrame frames.
|
||||
while not self.__process_queue.empty():
|
||||
item = self.__process_queue.get_nowait()
|
||||
if isinstance(item, UninterruptibleFrame):
|
||||
frame = item[0]
|
||||
if isinstance(frame, UninterruptibleFrame):
|
||||
new_queue.put_nowait(item)
|
||||
self.__process_queue.task_done()
|
||||
|
||||
|
||||
@@ -1100,13 +1100,11 @@ class RTVIObserver(BaseObserver):
|
||||
|
||||
if (
|
||||
isinstance(frame, (UserStartedSpeakingFrame, UserStoppedSpeakingFrame))
|
||||
and (direction == FrameDirection.DOWNSTREAM)
|
||||
and self._params.user_speaking_enabled
|
||||
):
|
||||
await self._handle_interruptions(frame)
|
||||
elif (
|
||||
isinstance(frame, (BotStartedSpeakingFrame, BotStoppedSpeakingFrame))
|
||||
and (direction == FrameDirection.UPSTREAM)
|
||||
and self._params.bot_speaking_enabled
|
||||
):
|
||||
await self._handle_bot_speaking(frame)
|
||||
@@ -1413,6 +1411,18 @@ class RTVIProcessor(FrameProcessor):
|
||||
|
||||
self._registered_services[service.name] = service
|
||||
|
||||
def create_rtvi_observer(self, *, params: Optional[RTVIObserverParams] = None, **kwargs):
|
||||
"""Creates a new RTVI Observer.
|
||||
|
||||
Args:
|
||||
params: Settings to enable/disable specific messages.
|
||||
**kwargs: Additional arguments passed to the observer.
|
||||
|
||||
Returns:
|
||||
A new RTVI observer.
|
||||
"""
|
||||
return RTVIObserver(self, params=params, **kwargs)
|
||||
|
||||
async def set_client_ready(self):
|
||||
"""Mark the client as ready and trigger the ready event."""
|
||||
self._client_ready = True
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import warnings
|
||||
from typing import Awaitable, Callable, Union
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
@@ -26,6 +27,10 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
class UserIdleProcessor(FrameProcessor):
|
||||
"""Monitors user inactivity and triggers callbacks after timeout periods.
|
||||
|
||||
.. deprecated::
|
||||
UserIdleProcessor is deprecated in 0.0.100 and will be removed in a future version.
|
||||
Use LLMUserAggregator with user_idle_timeout parameter instead.
|
||||
|
||||
This processor tracks user activity and triggers configurable callbacks when
|
||||
users become idle. It starts monitoring only after the first conversation
|
||||
activity and supports both basic and retry-based callback patterns.
|
||||
@@ -70,6 +75,14 @@ class UserIdleProcessor(FrameProcessor):
|
||||
**kwargs: Additional arguments passed to FrameProcessor.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
warnings.warn(
|
||||
"UserIdleProcessor is deprecated in 0.0.100 and will be removed in a "
|
||||
"future version. Use LLMUserAggregator with user_idle_timeout parameter "
|
||||
"instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._callback = self._wrap_callback(callback)
|
||||
self._timeout = timeout
|
||||
self._retry_count = 0
|
||||
|
||||
@@ -263,7 +263,7 @@ def _setup_webrtc_routes(
|
||||
"""Handle WebRTC offer requests via SmallWebRTCRequestHandler."""
|
||||
|
||||
# Prepare runner arguments with the callback to run your bot
|
||||
async def webrtc_connection_callback(connection):
|
||||
async def webrtc_connection_callback(connection: SmallWebRTCConnection):
|
||||
bot_module = _get_bot_module()
|
||||
|
||||
runner_args = SmallWebRTCRunnerArguments(
|
||||
@@ -406,13 +406,7 @@ def _setup_whatsapp_routes(app: FastAPI):
|
||||
return
|
||||
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
SmallWebRTCRequest,
|
||||
SmallWebRTCRequestHandler,
|
||||
)
|
||||
from pipecat.transports.whatsapp.api import WhatsAppWebhookRequest
|
||||
from pipecat.transports.whatsapp.client import WhatsAppClient
|
||||
except ImportError as e:
|
||||
|
||||
@@ -126,7 +126,7 @@ class ProtobufFrameSerializer(FrameSerializer):
|
||||
if "pts" in args_dict:
|
||||
del args_dict["pts"]
|
||||
|
||||
# Special handling for MessageFrame -> OutputTransportMessageUrgentFrame
|
||||
# Special handling for MessageFrame -> InputTransportMessageFrame
|
||||
if class_name == MessageFrame:
|
||||
try:
|
||||
msg = json.loads(args_dict["data"])
|
||||
|
||||
@@ -34,8 +34,7 @@ class VonageFrameSerializer(FrameSerializer):
|
||||
WebSocket streaming protocol.
|
||||
|
||||
Note:
|
||||
Ref docs:
|
||||
https://developer.vonage.com/en/video/guides/audio-connector
|
||||
Ref docs: https://developer.vonage.com/en/video/guides/audio-connector
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
|
||||
@@ -148,11 +148,11 @@ class AIService(FrameProcessor):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
await self.start(frame)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self.cancel(frame)
|
||||
await self._start(frame)
|
||||
elif isinstance(frame, EndFrame):
|
||||
await self.stop(frame)
|
||||
await self._stop(frame)
|
||||
elif isinstance(frame, CancelFrame):
|
||||
await self._cancel(frame)
|
||||
|
||||
async def process_generator(self, generator: AsyncGenerator[Frame | None, None]):
|
||||
"""Process frames from an async generator.
|
||||
@@ -169,3 +169,21 @@ class AIService(FrameProcessor):
|
||||
await self.push_error_frame(f)
|
||||
else:
|
||||
await self.push_frame(f)
|
||||
|
||||
async def _start(self, frame: StartFrame):
|
||||
try:
|
||||
await self.start(frame)
|
||||
except Exception as e:
|
||||
logger.error(f"{self}: exception processing {frame}: {e}")
|
||||
|
||||
async def _stop(self, frame: EndFrame):
|
||||
try:
|
||||
await self.stop(frame)
|
||||
except Exception as e:
|
||||
logger.error(f"{self}: exception processing {frame}: {e}")
|
||||
|
||||
async def _cancel(self, frame: CancelFrame):
|
||||
try:
|
||||
await self.cancel(frame)
|
||||
except Exception as e:
|
||||
logger.error(f"{self}: exception processing {frame}: {e}")
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import uuid
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
@@ -27,7 +28,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
|
||||
from pipecat.services.tts_service import AudioContextTTSService, TTSService
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
@@ -72,7 +73,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
class AsyncAITTSService(InterruptibleTTSService):
|
||||
class AsyncAITTSService(AudioContextTTSService):
|
||||
"""Async TTS service with WebSocket streaming.
|
||||
|
||||
Provides text-to-speech using Async's streaming WebSocket API.
|
||||
@@ -148,6 +149,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
self._receive_task = None
|
||||
self._keepalive_task = None
|
||||
self._started = False
|
||||
self._context_id = None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -168,8 +170,8 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
"""
|
||||
return language_to_async_language(language)
|
||||
|
||||
def _build_msg(self, text: str = "", force: bool = False) -> str:
|
||||
msg = {"transcript": text, "force": force}
|
||||
def _build_msg(self, text: str = "", context_id: str = "", force: bool = False) -> str:
|
||||
msg = {"transcript": text, "context_id": context_id, "force": force}
|
||||
return json.dumps(msg)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
@@ -253,11 +255,16 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
if self._websocket:
|
||||
logger.debug("Disconnecting from Async")
|
||||
# Close all contexts and the socket
|
||||
if self._context_id:
|
||||
await self._websocket.send(json.dumps({"terminate": True}))
|
||||
await self._websocket.close()
|
||||
logger.debug("Disconnected from Async")
|
||||
except Exception as e:
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
finally:
|
||||
self._websocket = None
|
||||
self._context_id = None
|
||||
self._started = False
|
||||
await self._call_event_handler("on_disconnected")
|
||||
|
||||
@@ -268,10 +275,10 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
async def flush_audio(self):
|
||||
"""Flush any pending audio."""
|
||||
if not self._websocket:
|
||||
if not self._context_id or not self._websocket:
|
||||
return
|
||||
logger.trace(f"{self}: flushing audio")
|
||||
msg = self._build_msg(text=" ", force=True)
|
||||
msg = self._build_msg(text=" ", context_id=self._context_id, force=True)
|
||||
await self._websocket.send(msg)
|
||||
|
||||
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
|
||||
@@ -291,35 +298,75 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
if not msg:
|
||||
continue
|
||||
|
||||
elif msg.get("audio"):
|
||||
received_ctx_id = msg.get("context_id")
|
||||
# Handle final messages first, regardless of context availability
|
||||
# At the moment, this message is received AFTER the close_context message is
|
||||
# sent, so it doesn't serve any functional purpose. For now, we'll just log it.
|
||||
if msg.get("final") is True:
|
||||
logger.trace(f"Received final message for context {received_ctx_id}")
|
||||
continue
|
||||
|
||||
# Check if this message belongs to the current context.
|
||||
if not self.audio_context_available(received_ctx_id):
|
||||
if self._context_id == received_ctx_id:
|
||||
logger.debug(
|
||||
f"Received a delayed message, recreating the context: {self._context_id}"
|
||||
)
|
||||
await self.create_audio_context(self._context_id)
|
||||
else:
|
||||
# This can happen if a message is received _after_ we have closed a context
|
||||
# due to user interruption but _before_ the `isFinal` message for the context
|
||||
# is received.
|
||||
logger.debug(f"Ignoring message from unavailable context: {received_ctx_id}")
|
||||
continue
|
||||
|
||||
if msg.get("audio"):
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["audio"]),
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
elif msg.get("error_code"):
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
await self.stop_all_metrics()
|
||||
await self.push_error(error_msg=f"Error: {msg['message']}")
|
||||
else:
|
||||
await self.push_error(error_msg=f"Unknown message type: {msg}")
|
||||
audio = base64.b64decode(msg["audio"])
|
||||
frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
|
||||
await self.append_to_audio_context(received_ctx_id, frame)
|
||||
|
||||
async def _keepalive_task_handler(self):
|
||||
"""Send periodic keepalive messages to maintain WebSocket connection."""
|
||||
KEEPALIVE_SLEEP = 3
|
||||
KEEPALIVE_SLEEP = 10
|
||||
while True:
|
||||
await asyncio.sleep(KEEPALIVE_SLEEP)
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
keepalive_message = {"transcript": " "}
|
||||
logger.trace("Sending keepalive message")
|
||||
if self._context_id:
|
||||
keepalive_message = {
|
||||
"transcript": " ",
|
||||
"context_id": self._context_id,
|
||||
}
|
||||
logger.trace("Sending keepalive message")
|
||||
else:
|
||||
# It's possible to have a user interruption which clears the context
|
||||
# without generating a new TTS response. In this case, we'll just send
|
||||
# an empty message to keep the connection alive.
|
||||
keepalive_message = {"transcript": " "}
|
||||
logger.trace("Sending keepalive without context")
|
||||
await self._websocket.send(json.dumps(keepalive_message))
|
||||
except websockets.ConnectionClosed as e:
|
||||
logger.warning(f"{self} keepalive error: {e}")
|
||||
break
|
||||
|
||||
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
||||
"""Handle interruption by closing the current context."""
|
||||
await super()._handle_interruption(frame, direction)
|
||||
|
||||
# Close the current context when interrupted without closing the websocket
|
||||
if self._context_id and self._websocket:
|
||||
try:
|
||||
await self._websocket.send(
|
||||
json.dumps(
|
||||
{"context_id": self._context_id, "close_context": True, "transcript": ""}
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing context on interruption: {e}")
|
||||
self._context_id = None
|
||||
self._started = False
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Async API websocket endpoint.
|
||||
@@ -336,21 +383,29 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
if not self._websocket or self._websocket.state is State.CLOSED:
|
||||
await self._connect()
|
||||
|
||||
if not self._started:
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
msg = self._build_msg(text=text, force=True)
|
||||
|
||||
try:
|
||||
await self._get_websocket().send(msg)
|
||||
await self.start_tts_usage_metrics(text)
|
||||
if not self._started:
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
if not self._context_id:
|
||||
self._context_id = str(uuid.uuid4())
|
||||
if not self.audio_context_available(self._context_id):
|
||||
await self.create_audio_context(self._context_id)
|
||||
|
||||
msg = self._build_msg(text=text, force=True, context_id=self._context_id)
|
||||
await self._get_websocket().send(msg)
|
||||
await self.start_tts_usage_metrics(text)
|
||||
else:
|
||||
if self._websocket and self._context_id:
|
||||
msg = self._build_msg(text=text, force=True, context_id=self._context_id)
|
||||
await self._get_websocket().send(msg)
|
||||
|
||||
except Exception as e:
|
||||
yield ErrorFrame(error=f"Unknown error occurred: {e}")
|
||||
yield TTSStoppedFrame()
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
self._started = False
|
||||
return
|
||||
yield None
|
||||
except Exception as e:
|
||||
@@ -490,7 +545,14 @@ class AsyncAIHttpTTSService(TTSService):
|
||||
await self.push_error(error_msg=f"Async API error: {error_text}")
|
||||
raise Exception(f"Async API returned status {response.status}: {error_text}")
|
||||
|
||||
audio_data = await response.read()
|
||||
# Read streaming bytes; stop TTFB on the *first* received chunk
|
||||
buffer = bytearray()
|
||||
async for chunk in response.content.iter_chunked(64 * 1024):
|
||||
if not chunk:
|
||||
continue
|
||||
await self.stop_ttfb_metrics()
|
||||
buffer.extend(chunk)
|
||||
audio_data = bytes(buffer)
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@ from pipecat.frames.frames import (
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
TTSAudioRawFrame,
|
||||
@@ -295,6 +296,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
self._user_text_buffer = ""
|
||||
self._assistant_text_buffer = ""
|
||||
self._completed_tool_calls = set()
|
||||
self._audio_input_started = False
|
||||
|
||||
file_path = files("pipecat.services.aws.nova_sonic").joinpath("ready.wav")
|
||||
with wave.open(file_path.open("rb"), "rb") as wav_file:
|
||||
@@ -531,14 +533,30 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
if system_instruction:
|
||||
await self._send_text_event(text=system_instruction, role=Role.SYSTEM)
|
||||
|
||||
# Send conversation history
|
||||
for message in llm_connection_params["messages"]:
|
||||
# Send conversation history (except for the last message if it's from the
|
||||
# user, which we'll send as interactive after starting audio input)
|
||||
messages = llm_connection_params["messages"]
|
||||
last_user_message = None
|
||||
for i, message in enumerate(messages):
|
||||
# logger.debug(f"Seeding conversation history with message: {message}")
|
||||
await self._send_text_event(text=message.text, role=message.role)
|
||||
is_last_message = i == len(messages) - 1
|
||||
if is_last_message and message.role == Role.USER:
|
||||
# Save for sending after audio input starts
|
||||
last_user_message = message
|
||||
else:
|
||||
await self._send_text_event(text=message.text, role=message.role)
|
||||
|
||||
# Start audio input
|
||||
await self._send_audio_input_start_event()
|
||||
|
||||
# Now send the last user message as interactive to trigger bot response
|
||||
if last_user_message:
|
||||
# logger.debug(
|
||||
# f"Sending last user message as interactive to trigger bot response: {last_user_message}")
|
||||
await self._send_text_event(
|
||||
text=last_user_message.text, role=last_user_message.role, interactive=True
|
||||
)
|
||||
|
||||
# Start receiving events
|
||||
self._receive_task = self.create_task(self._receive_task_handler())
|
||||
|
||||
@@ -601,6 +619,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
self._user_text_buffer = ""
|
||||
self._assistant_text_buffer = ""
|
||||
self._completed_tool_calls = set()
|
||||
self._audio_input_started = False
|
||||
|
||||
logger.info("Finished disconnecting")
|
||||
except Exception as e:
|
||||
@@ -726,8 +745,18 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
}}
|
||||
'''
|
||||
await self._send_client_event(audio_content_start)
|
||||
self._audio_input_started = True
|
||||
|
||||
async def _send_text_event(self, text: str, role: Role):
|
||||
async def _send_text_event(self, text: str, role: Role, interactive: bool = False):
|
||||
"""Send a text event to the LLM.
|
||||
|
||||
Args:
|
||||
text: The text content to send.
|
||||
role: The role associated with the text (e.g., USER, ASSISTANT, SYSTEM).
|
||||
interactive: Whether the content is interactive. Defaults to False.
|
||||
False: conversation history or system instruction, sent prior to interactive audio
|
||||
True: text input sent during (or at the start of) interactive audio
|
||||
"""
|
||||
if not self._stream or not self._prompt_name or not text:
|
||||
return
|
||||
|
||||
@@ -740,7 +769,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
"promptName": "{self._prompt_name}",
|
||||
"contentName": "{content_name}",
|
||||
"type": "TEXT",
|
||||
"interactive": true,
|
||||
"interactive": {json.dumps(interactive)},
|
||||
"role": "{role.value}",
|
||||
"textInputConfiguration": {{
|
||||
"mediaType": "text/plain"
|
||||
@@ -778,7 +807,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
await self._send_client_event(text_content_end)
|
||||
|
||||
async def _send_user_audio_event(self, audio: bytes):
|
||||
if not self._stream:
|
||||
if not self._stream or not self._audio_input_started:
|
||||
return
|
||||
|
||||
blob = base64.b64encode(audio)
|
||||
@@ -1077,9 +1106,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
logger.debug(f"Assistant response text added: {text}")
|
||||
|
||||
# Report the text of the assistant response.
|
||||
frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
await self._push_assistant_response_text_frames(text)
|
||||
|
||||
# HACK: here we're also buffering the assistant text ourselves as a
|
||||
# backup rather than relying solely on the assistant context aggregator
|
||||
@@ -1112,11 +1139,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
# TTSTextFrame would be ignored otherwise (the interruption frame
|
||||
# would have cleared the assistant aggregator state).
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
frame = TTSTextFrame(
|
||||
self._assistant_text_buffer, aggregated_by=AggregationType.SENTENCE
|
||||
)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
await self._push_assistant_response_text_frames(self._assistant_text_buffer)
|
||||
self._may_need_repush_assistant_text = False
|
||||
|
||||
# Report the end of the assistant response.
|
||||
@@ -1128,6 +1151,25 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
# Clear out the buffered assistant text
|
||||
self._assistant_text_buffer = ""
|
||||
|
||||
async def _push_assistant_response_text_frames(self, text: str):
|
||||
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
|
||||
# proceed beyond the TTS service. Therefore, since a speech-to-speech
|
||||
# service like Nova Sonic combines both LLM and TTS functionality, you
|
||||
# would think we wouldn't need to push LLMTextFrames at all. However,
|
||||
# RTVI relies on LLMTextFrames being pushed to trigger its
|
||||
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
|
||||
# appending it to context to avoid context message duplication.
|
||||
|
||||
# Push LLMTextFrame
|
||||
llm_text_frame = LLMTextFrame(text)
|
||||
llm_text_frame.append_to_context = False
|
||||
await self.push_frame(llm_text_frame)
|
||||
|
||||
# Push TTSTextFrame
|
||||
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
|
||||
tts_text_frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(tts_text_frame)
|
||||
|
||||
#
|
||||
# user transcription reporting
|
||||
#
|
||||
@@ -1187,7 +1229,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
logger.debug(
|
||||
"Wrapping assistant response trigger transcription with upstream UserStarted/StoppedSpeakingFrames"
|
||||
)
|
||||
await self.push_frame(UserStartedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
|
||||
await self.broadcast_frame(UserStartedSpeakingFrame)
|
||||
|
||||
# Send the transcription upstream for the user context aggregator
|
||||
frame = TranscriptionFrame(
|
||||
@@ -1197,7 +1239,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
|
||||
# Finish wrapping the upstream transcription in UserStarted/StoppedSpeakingFrames if needed
|
||||
if should_wrap_in_user_started_stopped_speaking_frames:
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), direction=FrameDirection.UPSTREAM)
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
|
||||
# Clear out the buffered user text
|
||||
self._user_text_buffer = ""
|
||||
|
||||
@@ -10,7 +10,6 @@ This module provides a WebSocket-based connection to AWS Transcribe for real-tim
|
||||
speech-to-text transcription with support for multiple languages and audio formats.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
|
||||
@@ -10,7 +10,6 @@ This module provides integration with Amazon Polly for text-to-speech synthesis,
|
||||
supporting multiple languages, voices, and SSML features.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import AsyncGenerator, List, Optional
|
||||
|
||||
|
||||
@@ -17,3 +17,8 @@ with warnings.catch_warnings():
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AWSNovaSonicLLMService",
|
||||
"Params",
|
||||
]
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ import io
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from PIL import Image
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame
|
||||
|
||||
@@ -277,6 +277,8 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
|
||||
self._started = False
|
||||
self._first_chunk = True
|
||||
self._cumulative_audio_offset: float = 0.0 # Cumulative audio duration in seconds
|
||||
self._last_word: Optional[str] = None # Track last word for punctuation merging
|
||||
self._last_timestamp: Optional[float] = None # Track last timestamp
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -346,9 +348,34 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
|
||||
await self.cancel_task(self._word_processor_task)
|
||||
self._word_processor_task = None
|
||||
|
||||
def _is_cjk_language(self) -> bool:
|
||||
"""Check if the configured language is CJK (Chinese, Japanese, Korean).
|
||||
|
||||
Returns:
|
||||
True if the language is CJK, False otherwise.
|
||||
"""
|
||||
language = self._settings.get("language", "").lower()
|
||||
# Check if language starts with CJK language codes
|
||||
return language.startswith(("zh", "ja", "ko", "cmn", "yue", "wuu"))
|
||||
|
||||
def _is_punctuation_only(self, text: str) -> bool:
|
||||
"""Check if text consists only of punctuation and whitespace.
|
||||
|
||||
Args:
|
||||
text: Text to check.
|
||||
|
||||
Returns:
|
||||
True if text is only punctuation/whitespace, False otherwise.
|
||||
"""
|
||||
return text and all(not c.isalnum() for c in text)
|
||||
|
||||
def _handle_word_boundary(self, evt):
|
||||
"""Handle word boundary events from Azure SDK.
|
||||
|
||||
Azure sends punctuation as separate word boundaries, and breaks CJK text
|
||||
into individual characters/particles. This method routes to language-specific
|
||||
handlers to properly merge and emit word boundaries.
|
||||
|
||||
Args:
|
||||
evt: SpeechSynthesisWordBoundaryEventArgs from Azure Speech SDK
|
||||
containing word text and audio offset timing.
|
||||
@@ -362,13 +389,75 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
|
||||
# Add cumulative offset to get absolute timestamp across sentences
|
||||
absolute_seconds = self._cumulative_audio_offset + sentence_relative_seconds
|
||||
|
||||
# Queue word timestamp for async processing
|
||||
# Use thread-safe queue since this is called from Azure SDK thread
|
||||
if word:
|
||||
logger.trace(f"{self}: Word boundary - '{word}' at {absolute_seconds:.2f}s")
|
||||
# Put in temporary queue - will be processed by async task
|
||||
# Store as (word, timestamp_in_seconds) tuple
|
||||
self._word_boundary_queue.put_nowait((word, absolute_seconds))
|
||||
if not word:
|
||||
return
|
||||
|
||||
# Route to language-specific handler
|
||||
if self._is_cjk_language():
|
||||
self._handle_cjk_word_boundary(word, absolute_seconds)
|
||||
else:
|
||||
self._handle_non_cjk_word_boundary(word, absolute_seconds)
|
||||
|
||||
def _emit_pending_word(self):
|
||||
"""Emit the currently buffered word if one exists."""
|
||||
if self._last_word is not None:
|
||||
self._word_boundary_queue.put_nowait((self._last_word, self._last_timestamp))
|
||||
self._last_word = None
|
||||
self._last_timestamp = None
|
||||
|
||||
def _handle_cjk_word_boundary(self, word: str, timestamp: float):
|
||||
"""Handle word boundaries for CJK languages (Chinese, Japanese, Korean).
|
||||
|
||||
CJK languages don't use spaces between words, so we merge characters together
|
||||
and only emit at natural break points (punctuation or whitespace boundaries).
|
||||
Without this logic, we don't get word output for CJK languages.
|
||||
|
||||
Args:
|
||||
word: The word/character from Azure.
|
||||
timestamp: Timestamp in seconds.
|
||||
"""
|
||||
# First word: just store it
|
||||
if self._last_word is None:
|
||||
self._last_word = word
|
||||
self._last_timestamp = timestamp
|
||||
return
|
||||
|
||||
# Punctuation: merge and emit (natural break)
|
||||
if self._is_punctuation_only(word):
|
||||
self._last_word += word
|
||||
self._emit_pending_word()
|
||||
return
|
||||
|
||||
# Whitespace: emit before boundary, start new segment
|
||||
if word.strip() != word:
|
||||
self._emit_pending_word()
|
||||
self._last_word = word
|
||||
self._last_timestamp = timestamp
|
||||
return
|
||||
|
||||
# Default: continue merging CJK characters
|
||||
self._last_word += word
|
||||
|
||||
def _handle_non_cjk_word_boundary(self, word: str, timestamp: float):
|
||||
"""Handle word boundaries for non-CJK languages.
|
||||
|
||||
Non-CJK languages use spaces between words, so we emit each word separately
|
||||
after merging any trailing punctuation.
|
||||
|
||||
Args:
|
||||
word: The word from Azure.
|
||||
timestamp: Timestamp in seconds.
|
||||
"""
|
||||
# Punctuation: merge with previous word (don't emit yet)
|
||||
if self._is_punctuation_only(word) and self._last_word is not None:
|
||||
self._last_word += word
|
||||
return
|
||||
|
||||
# Regular word: emit previous, store current
|
||||
if self._last_word is not None:
|
||||
self._word_boundary_queue.put_nowait((self._last_word, self._last_timestamp))
|
||||
self._last_word = word
|
||||
self._last_timestamp = timestamp
|
||||
|
||||
async def _word_processor_task_handler(self):
|
||||
"""Process word timestamps from the queue and call add_word_timestamps."""
|
||||
@@ -397,6 +486,12 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
|
||||
Args:
|
||||
evt: Completion event from Azure Speech SDK.
|
||||
"""
|
||||
# Flush any pending word before completing
|
||||
if self._last_word is not None:
|
||||
self._word_boundary_queue.put_nowait((self._last_word, self._last_timestamp))
|
||||
self._last_word = None
|
||||
self._last_timestamp = None
|
||||
|
||||
# Update cumulative audio offset for next sentence
|
||||
if evt.result and evt.result.audio_duration:
|
||||
self._cumulative_audio_offset += evt.result.audio_duration.total_seconds()
|
||||
@@ -435,6 +530,8 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
|
||||
self._started = False
|
||||
self._first_chunk = True
|
||||
self._cumulative_audio_offset = 0.0
|
||||
self._last_word = None
|
||||
self._last_timestamp = None
|
||||
|
||||
async def flush_audio(self):
|
||||
"""Flush any pending audio data."""
|
||||
|
||||
5
src/pipecat/services/camb/__init__.py
Normal file
5
src/pipecat/services/camb/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2024–2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
330
src/pipecat/services/camb/tts.py
Normal file
330
src/pipecat/services/camb/tts.py
Normal file
@@ -0,0 +1,330 @@
|
||||
#
|
||||
# Copyright (c) 2024–2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Camb.ai MARS text-to-speech service implementation.
|
||||
|
||||
This module provides TTS functionality using Camb.ai's MARS model family,
|
||||
offering high-quality text-to-speech synthesis with streaming support.
|
||||
|
||||
Features:
|
||||
- MARS models: mars-flash (fast), mars-pro (high quality)
|
||||
- 140+ languages supported
|
||||
- Real-time streaming via official SDK
|
||||
- Model-specific sample rates: mars-pro (48kHz), mars-flash (22.05kHz)
|
||||
"""
|
||||
|
||||
from typing import Any, AsyncGenerator, Dict, Optional
|
||||
|
||||
from camb import StreamTtsOutputConfiguration
|
||||
from camb.client import AsyncCambAI
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
# Model-specific sample rates
|
||||
MODEL_SAMPLE_RATES: Dict[str, int] = {
|
||||
"mars-flash": 22050, # 22.05kHz
|
||||
"mars-pro": 48000, # 48kHz
|
||||
"mars-instruct": 22050, # 22.05kHz
|
||||
}
|
||||
|
||||
|
||||
def language_to_camb_language(language: Language) -> Optional[str]:
|
||||
"""Convert a Pipecat Language enum to Camb.ai language code.
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
Returns:
|
||||
The corresponding Camb.ai language code (BCP-47 format), or None if not supported.
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
Language.EN: "en-us",
|
||||
Language.EN_US: "en-us",
|
||||
Language.EN_GB: "en-gb",
|
||||
Language.EN_AU: "en-au",
|
||||
Language.ES: "es-es",
|
||||
Language.ES_ES: "es-es",
|
||||
Language.ES_MX: "es-mx",
|
||||
Language.FR: "fr-fr",
|
||||
Language.FR_FR: "fr-fr",
|
||||
Language.FR_CA: "fr-ca",
|
||||
Language.DE: "de-de",
|
||||
Language.DE_DE: "de-de",
|
||||
Language.IT: "it-it",
|
||||
Language.PT: "pt-pt",
|
||||
Language.PT_BR: "pt-br",
|
||||
Language.PT_PT: "pt-pt",
|
||||
Language.NL: "nl-nl",
|
||||
Language.PL: "pl-pl",
|
||||
Language.RU: "ru-ru",
|
||||
Language.JA: "ja-jp",
|
||||
Language.KO: "ko-kr",
|
||||
Language.ZH: "zh-cn",
|
||||
Language.ZH_CN: "zh-cn",
|
||||
Language.ZH_TW: "zh-tw",
|
||||
Language.AR: "ar-sa",
|
||||
Language.HI: "hi-in",
|
||||
Language.TR: "tr-tr",
|
||||
Language.VI: "vi-vn",
|
||||
Language.TH: "th-th",
|
||||
Language.ID: "id-id",
|
||||
Language.MS: "ms-my",
|
||||
Language.SV: "sv-se",
|
||||
Language.DA: "da-dk",
|
||||
Language.NO: "no-no",
|
||||
Language.FI: "fi-fi",
|
||||
Language.CS: "cs-cz",
|
||||
Language.EL: "el-gr",
|
||||
Language.HE: "he-il",
|
||||
Language.HU: "hu-hu",
|
||||
Language.RO: "ro-ro",
|
||||
Language.SK: "sk-sk",
|
||||
Language.UK: "uk-ua",
|
||||
Language.BG: "bg-bg",
|
||||
Language.HR: "hr-hr",
|
||||
Language.SR: "sr-rs",
|
||||
Language.SL: "sl-si",
|
||||
Language.CA: "ca-es",
|
||||
Language.EU: "eu-es",
|
||||
Language.GL: "gl-es",
|
||||
Language.AF: "af-za",
|
||||
Language.SW: "sw-ke",
|
||||
Language.TA: "ta-in",
|
||||
Language.TE: "te-in",
|
||||
Language.BN: "bn-in",
|
||||
Language.MR: "mr-in",
|
||||
Language.GU: "gu-in",
|
||||
Language.KN: "kn-in",
|
||||
Language.ML: "ml-in",
|
||||
Language.PA: "pa-in",
|
||||
Language.UR: "ur-pk",
|
||||
Language.FA: "fa-ir",
|
||||
Language.TL: "tl-ph",
|
||||
}
|
||||
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=True)
|
||||
|
||||
|
||||
def _get_aligned_audio(buffer: bytes) -> tuple[bytes, bytes]:
|
||||
"""Split buffer into aligned audio (2-byte samples) and remainder.
|
||||
|
||||
Args:
|
||||
buffer: Raw audio bytes to align.
|
||||
|
||||
Returns:
|
||||
Tuple of (aligned audio bytes, remaining bytes).
|
||||
"""
|
||||
aligned_size = (len(buffer) // 2) * 2
|
||||
return buffer[:aligned_size], buffer[aligned_size:]
|
||||
|
||||
|
||||
class CambTTSService(TTSService):
|
||||
"""Camb.ai MARS text-to-speech service using the official SDK.
|
||||
|
||||
Converts text to speech using Camb.ai's MARS TTS models with support for
|
||||
multiple languages.
|
||||
|
||||
Models:
|
||||
- mars-flash: Fast inference, 22.05kHz output (default)
|
||||
- mars-pro: High quality, 48kHz output
|
||||
|
||||
Example::
|
||||
|
||||
# Basic usage with mars-flash (fast)
|
||||
tts = CambTTSService(api_key="your-api-key", model="mars-flash")
|
||||
|
||||
# High quality with mars-pro
|
||||
tts = CambTTSService(
|
||||
api_key="your-api-key",
|
||||
voice_id=12345,
|
||||
model="mars-pro",
|
||||
)
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Camb.ai TTS configuration.
|
||||
|
||||
Parameters:
|
||||
language: Language for synthesis (BCP-47 format). Defaults to English.
|
||||
user_instructions: Custom instructions for mars-instruct model only.
|
||||
Ignored for other models. Max 1000 characters.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = Language.EN
|
||||
user_instructions: Optional[str] = Field(
|
||||
default=None,
|
||||
max_length=1000,
|
||||
description="Custom instructions for mars-instruct model only. "
|
||||
"Use to control tone, style, or pronunciation. Max 1000 characters.",
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_id: int = 147320,
|
||||
model: str = "mars-flash",
|
||||
timeout: float = 60.0,
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Camb.ai TTS service.
|
||||
|
||||
Args:
|
||||
api_key: Camb.ai API key for authentication.
|
||||
voice_id: Voice ID to use. Defaults to 147320.
|
||||
model: TTS model to use. Options: "mars-flash" (fast), "mars-pro" (high quality).
|
||||
Defaults to "mars-flash".
|
||||
timeout: Request timeout in seconds. Defaults to 60.0 (minimum recommended
|
||||
by Camb.ai).
|
||||
sample_rate: Audio sample rate in Hz. If None, uses model-specific default.
|
||||
params: Additional voice parameters. If None, uses defaults.
|
||||
**kwargs: Additional arguments passed to parent TTSService.
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._timeout = timeout
|
||||
|
||||
params = params or CambTTSService.InputParams()
|
||||
|
||||
# Warn if sample rate doesn't match model's supported rate
|
||||
if sample_rate and sample_rate != MODEL_SAMPLE_RATES.get(model):
|
||||
logger.warning(
|
||||
f"Camb.ai's {model} model only supports {MODEL_SAMPLE_RATES.get(model)}Hz "
|
||||
f"sample rate. Current rate of {sample_rate}Hz may cause issues."
|
||||
)
|
||||
|
||||
# Build settings
|
||||
self._settings = {
|
||||
"language": (
|
||||
self.language_to_service_language(params.language) if params.language else "en-us"
|
||||
),
|
||||
"user_instructions": params.user_instructions,
|
||||
}
|
||||
|
||||
self.set_model_name(model)
|
||||
self.set_voice(str(voice_id))
|
||||
self._voice_id = voice_id
|
||||
|
||||
self._client = None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Camb.ai service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to Camb.ai language format.
|
||||
|
||||
Args:
|
||||
language: The language to convert.
|
||||
|
||||
Returns:
|
||||
The Camb.ai-specific language code, or None if not supported.
|
||||
"""
|
||||
return language_to_camb_language(language)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the Camb.ai TTS service.
|
||||
|
||||
Args:
|
||||
frame: The start frame containing initialization parameters.
|
||||
"""
|
||||
await super().start(frame)
|
||||
|
||||
self._client = AsyncCambAI(api_key=self._api_key, timeout=self._timeout)
|
||||
|
||||
# Use model-specific sample rate if not explicitly specified
|
||||
if not self._init_sample_rate:
|
||||
self._sample_rate = MODEL_SAMPLE_RATES.get(self.model_name, 22050)
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Camb.ai's TTS API.
|
||||
|
||||
Args:
|
||||
text: The text to synthesize into speech (max 3000 characters).
|
||||
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
# Validate text length
|
||||
if len(text) > 3000:
|
||||
logger.warning("Text too long for Camb.ai TTS (max 3000 chars), truncating")
|
||||
text = text[:3000]
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
# Build SDK parameters
|
||||
tts_kwargs: Dict[str, Any] = {
|
||||
"text": text,
|
||||
"voice_id": self._voice_id,
|
||||
"language": self._settings["language"],
|
||||
"speech_model": self.model_name,
|
||||
"output_configuration": StreamTtsOutputConfiguration(format="pcm_s16le"),
|
||||
}
|
||||
|
||||
# Add user instructions if using mars-instruct model
|
||||
if self._model_name == "mars-instruct" and self._settings.get("user_instructions"):
|
||||
tts_kwargs["user_instructions"] = self._settings["user_instructions"]
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
yield TTSStartedFrame()
|
||||
|
||||
assert self._client is not None, "Camb.ai TTS service not initialized"
|
||||
|
||||
# Buffer for aligning chunks to 2-byte boundaries (16-bit PCM)
|
||||
audio_buffer = b""
|
||||
|
||||
# Stream audio chunks from SDK
|
||||
async for chunk in self._client.text_to_speech.tts(**tts_kwargs):
|
||||
if chunk:
|
||||
await self.stop_ttfb_metrics()
|
||||
audio_buffer += chunk
|
||||
|
||||
# Only yield complete 16-bit samples (2 bytes per sample)
|
||||
aligned_audio, audio_buffer = _get_aligned_audio(audio_buffer)
|
||||
if aligned_audio:
|
||||
yield TTSAudioRawFrame(
|
||||
audio=aligned_audio,
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
# Yield any remaining complete samples
|
||||
if len(audio_buffer) >= 2:
|
||||
aligned_audio, _ = _get_aligned_audio(audio_buffer)
|
||||
if aligned_audio:
|
||||
yield TTSAudioRawFrame(
|
||||
audio=aligned_audio,
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
yield ErrorFrame(error=f"Camb.ai TTS error: {e}")
|
||||
finally:
|
||||
yield TTSStoppedFrame()
|
||||
@@ -6,8 +6,6 @@
|
||||
|
||||
"""Cerebras LLM service implementation using OpenAI-compatible interface."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.services.open_ai_adapter import OpenAILLMInvocationParams
|
||||
|
||||
@@ -27,7 +27,6 @@ from pipecat.frames.frames import (
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
@@ -676,8 +675,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
|
||||
await self._handle_transcription(transcript, True, self._language)
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
await self._call_event_handler("on_end_of_turn", transcript)
|
||||
|
||||
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
|
||||
"""DeepSeek LLM service implementation using OpenAI-compatible interface."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.services.open_ai_adapter import OpenAILLMInvocationParams
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
|
||||
"""Fireworks AI service implementation using OpenAI-compatible interface."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.adapters.services.open_ai_adapter import OpenAILLMInvocationParams
|
||||
|
||||
@@ -1,2 +1,7 @@
|
||||
from .file_api import GeminiFileAPI
|
||||
from .gemini import GeminiMultimodalLiveLLMService
|
||||
|
||||
__all__ = [
|
||||
"GeminiFileAPI",
|
||||
"GeminiMultimodalLiveLLMService",
|
||||
]
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
from .file_api import GeminiFileAPI
|
||||
from .llm import GeminiLiveLLMService
|
||||
from .llm_vertex import GeminiLiveVertexLLMService
|
||||
|
||||
__all__ = [
|
||||
"GeminiFileAPI",
|
||||
"GeminiLiveLLMService",
|
||||
"GeminiLiveVertexLLMService",
|
||||
]
|
||||
|
||||
@@ -1674,7 +1674,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
# start a timeout task to flush it later
|
||||
if self._user_transcription_buffer:
|
||||
self._transcription_timeout_task = self.create_task(
|
||||
self._transcription_timeout_handler()
|
||||
await self._transcription_timeout_handler()
|
||||
)
|
||||
|
||||
async def _handle_msg_output_transcription(self, message: LiveServerMessage):
|
||||
@@ -1710,11 +1710,26 @@ class GeminiLiveLLMService(LLMService):
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
|
||||
frame = TTSTextFrame(text=text, aggregated_by=AggregationType.SENTENCE)
|
||||
# Gemini Live text already includes any necessary inter-chunk spaces
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self._push_output_transcription_text_frames(text)
|
||||
|
||||
await self.push_frame(frame)
|
||||
async def _push_output_transcription_text_frames(self, text: str):
|
||||
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
|
||||
# proceed beyond the TTS service. Therefore, since a speech-to-speech
|
||||
# service like Gemini Live combines both LLM and TTS functionality, you
|
||||
# might think we wouldn't need to push LLMTextFrames at all. However,
|
||||
# RTVI relies on LLMTextFrames being pushed to trigger its
|
||||
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
|
||||
# appending it to context to avoid context message duplication.
|
||||
|
||||
# Push LLMTextFrame
|
||||
llm_text_frame = LLMTextFrame(text)
|
||||
llm_text_frame.append_to_context = False
|
||||
await self.push_frame(llm_text_frame)
|
||||
|
||||
# Push TTSTextFrame
|
||||
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
|
||||
tts_text_frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(tts_text_frame)
|
||||
|
||||
async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
|
||||
"""Handle dedicated grounding metadata messages."""
|
||||
|
||||
@@ -40,7 +40,6 @@ from pipecat.frames.frames import (
|
||||
LLMThoughtStartFrame,
|
||||
LLMThoughtTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
OutputImageRawFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.metrics.metrics import LLMTokenUsage
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Google RTVI integration models and observer implementation.
|
||||
"""Google RTVI processor and observer implementation.
|
||||
|
||||
This module provides integration with Google's services through the RTVI framework,
|
||||
including models for search responses and an observer for handling Google-specific
|
||||
@@ -15,10 +15,8 @@ from typing import List, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import Frame
|
||||
from pipecat.observers.base_observer import FramePushed
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
|
||||
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIObserverParams, RTVIProcessor
|
||||
from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame
|
||||
|
||||
|
||||
@@ -88,4 +86,23 @@ class GoogleRTVIObserver(RTVIObserver):
|
||||
rendered_content=frame.rendered_content,
|
||||
)
|
||||
)
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
|
||||
class GoogleRTVIProcessor(RTVIProcessor):
|
||||
"""RTVI processor for Google service integration.
|
||||
|
||||
Creates a specific Google RTVI Observer.
|
||||
"""
|
||||
|
||||
def create_rtvi_observer(self, *, params: Optional[RTVIObserverParams] = None, **kwargs):
|
||||
"""Creates a new RTVI Observer.
|
||||
|
||||
Args:
|
||||
params: Settings to enable/disable specific messages.
|
||||
**kwargs: Additional arguments passed to the observer.
|
||||
|
||||
Returns:
|
||||
A new RTVI observer.
|
||||
"""
|
||||
return GoogleRTVIObserver(self)
|
||||
|
||||
@@ -29,7 +29,6 @@ from pydantic import BaseModel, Field, field_validator
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartFrame,
|
||||
|
||||
@@ -40,6 +40,7 @@ from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language, resolve_language
|
||||
|
||||
try:
|
||||
from google.api_core.client_options import ClientOptions
|
||||
from google.auth import default
|
||||
from google.auth.exceptions import GoogleAuthError
|
||||
from google.cloud import texttospeech_v1
|
||||
@@ -515,6 +516,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
*,
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
location: Optional[str] = None,
|
||||
voice_id: str = "en-US-Chirp3-HD-Charon",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
@@ -525,6 +527,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
Args:
|
||||
credentials: JSON string containing Google Cloud service account credentials.
|
||||
credentials_path: Path to Google Cloud service account JSON file.
|
||||
location: Google Cloud location for regional endpoint (e.g., "us-central1").
|
||||
voice_id: Google TTS voice identifier (e.g., "en-US-Standard-A").
|
||||
sample_rate: Audio sample rate in Hz. If None, uses default.
|
||||
params: Voice customization parameters including pitch, rate, volume, etc.
|
||||
@@ -534,6 +537,7 @@ class GoogleHttpTTSService(TTSService):
|
||||
|
||||
params = params or GoogleHttpTTSService.InputParams()
|
||||
|
||||
self._location = location
|
||||
self._settings = {
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
@@ -586,7 +590,15 @@ class GoogleHttpTTSService(TTSService):
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
||||
client_options = None
|
||||
if self._location:
|
||||
client_options = ClientOptions(
|
||||
api_endpoint=f"{self._location}-texttospeech.googleapis.com"
|
||||
)
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(
|
||||
credentials=creds, client_options=client_options
|
||||
)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -783,7 +795,15 @@ class GoogleBaseTTSService(TTSService):
|
||||
if not creds:
|
||||
raise ValueError("No valid credentials provided.")
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
||||
client_options = None
|
||||
if self._location:
|
||||
client_options = ClientOptions(
|
||||
api_endpoint=f"{self._location}-texttospeech.googleapis.com"
|
||||
)
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(
|
||||
credentials=creds, client_options=client_options
|
||||
)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -903,6 +923,7 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
*,
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
location: Optional[str] = None,
|
||||
voice_id: str = "en-US-Chirp3-HD-Charon",
|
||||
voice_cloning_key: Optional[str] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
@@ -914,6 +935,7 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
Args:
|
||||
credentials: JSON string containing Google Cloud service account credentials.
|
||||
credentials_path: Path to Google Cloud service account JSON file.
|
||||
location: Google Cloud location for regional endpoint (e.g., "us-central1").
|
||||
voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
|
||||
voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses default.
|
||||
@@ -924,6 +946,7 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
|
||||
params = params or GoogleTTSService.InputParams()
|
||||
|
||||
self._location = location
|
||||
self._settings = {
|
||||
"language": self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
@@ -1083,6 +1106,7 @@ class GeminiTTSService(GoogleBaseTTSService):
|
||||
model: str = "gemini-2.5-flash-tts",
|
||||
credentials: Optional[str] = None,
|
||||
credentials_path: Optional[str] = None,
|
||||
location: Optional[str] = None,
|
||||
voice_id: str = "Kore",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
@@ -1101,6 +1125,7 @@ class GeminiTTSService(GoogleBaseTTSService):
|
||||
"gemini-2.5-flash-tts" or "gemini-2.5-pro-tts".
|
||||
credentials: JSON string containing Google Cloud service account credentials.
|
||||
credentials_path: Path to Google Cloud service account JSON file.
|
||||
location: Google Cloud location for regional endpoint (e.g., "us-central1").
|
||||
voice_id: Voice name from the available Gemini voices.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
|
||||
params: TTS configuration parameters.
|
||||
@@ -1127,6 +1152,7 @@ class GeminiTTSService(GoogleBaseTTSService):
|
||||
if voice_id not in self.AVAILABLE_VOICES:
|
||||
logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
|
||||
|
||||
self._location = location
|
||||
self._model = model
|
||||
self._voice_id = voice_id
|
||||
self._settings = {
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
import base64
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any, AsyncGenerator, Mapping, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -33,6 +33,7 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
@@ -619,9 +620,26 @@ class GrokRealtimeLLMService(LLMService):
|
||||
async def _handle_evt_audio_transcript_delta(self, evt):
|
||||
"""Handle audio transcript delta event."""
|
||||
if evt.delta:
|
||||
frame = TTSTextFrame(evt.delta, aggregated_by=AggregationType.SENTENCE)
|
||||
frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(frame)
|
||||
await self._push_output_transcript_text_frames(evt.delta)
|
||||
|
||||
async def _push_output_transcript_text_frames(self, text: str):
|
||||
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
|
||||
# proceed beyond the TTS service. Therefore, since a speech-to-speech
|
||||
# service like Grok Realtime combines both LLM and TTS functionality,
|
||||
# you might think we wouldn't need to push LLMTextFrames at all.
|
||||
# However, RTVI relies on LLMTextFrames being pushed to trigger its
|
||||
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
|
||||
# appending it to context to avoid context message duplication.
|
||||
|
||||
# Push LLMTextFrame
|
||||
llm_text_frame = LLMTextFrame(text)
|
||||
llm_text_frame.append_to_context = False
|
||||
await self.push_frame(llm_text_frame)
|
||||
|
||||
# Push TTSTextFrame
|
||||
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
|
||||
tts_text_frame.includes_inter_frame_spaces = True
|
||||
await self.push_frame(tts_text_frame)
|
||||
|
||||
async def _handle_evt_function_call_arguments_done(self, evt):
|
||||
"""Handle function call arguments done event."""
|
||||
@@ -659,7 +677,7 @@ class GrokRealtimeLLMService(LLMService):
|
||||
"""Handle speech stopped event from VAD."""
|
||||
await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame())
|
||||
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
|
||||
async def _handle_evt_error(self, evt):
|
||||
"""Handle error event."""
|
||||
@@ -734,6 +752,14 @@ class GrokRealtimeLLMService(LLMService):
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
"""Send user audio to Grok."""
|
||||
# Don't send audio if conversation setup is still pending, as it can
|
||||
# lead to errors. For example: audio sent before conversation setup
|
||||
# will be interpreted as having Grok's default sample rate (24000),
|
||||
# and if that differs from the sample rate we eventually set through
|
||||
# the conversation setup, Grok will error out.
|
||||
if self._llm_needs_conversation_setup:
|
||||
return
|
||||
|
||||
payload = base64.b64encode(frame.audio).decode("utf-8")
|
||||
await self.send_client_event(events.InputAudioBufferAppendEvent(audio=payload))
|
||||
|
||||
|
||||
0
src/pipecat/services/hathora/__init__.py
Normal file
0
src/pipecat/services/hathora/__init__.py
Normal file
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user