Compare commits
62 Commits
hush/aggre
...
v0.0.87
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
63bc825008 | ||
|
|
e7ffde1c4c | ||
|
|
1c88565725 | ||
|
|
07a6c2fb0e | ||
|
|
e99f3bf75a | ||
|
|
f09d780413 | ||
|
|
e370d23374 | ||
|
|
b68ec14146 | ||
|
|
c567fd71b1 | ||
|
|
2ca1b2d6f8 | ||
|
|
04041a9a9a | ||
|
|
6c498dc70f | ||
|
|
32b07c1720 | ||
|
|
ad507ce23d | ||
|
|
be562cedfc | ||
|
|
089e703e1f | ||
|
|
4dc1e15a99 | ||
|
|
c7dc2e886f | ||
|
|
11bc4ea854 | ||
|
|
029d76033d | ||
|
|
924d7dea9a | ||
|
|
244e94f3ce | ||
|
|
af1f51d49e | ||
|
|
9ba3c168b8 | ||
|
|
e6ee8f7a16 | ||
|
|
2ea2bd99e0 | ||
|
|
0c2ced7c52 | ||
|
|
fb160646b8 | ||
|
|
89fed57af2 | ||
|
|
feae3b6d2d | ||
|
|
92d3be8975 | ||
|
|
0f53e1db2c | ||
|
|
d398e8cc10 | ||
|
|
e5f263d380 | ||
|
|
3a4c303c54 | ||
|
|
54a1ef47d0 | ||
|
|
149ffa4f3c | ||
|
|
e5465034d9 | ||
|
|
568c7c782d | ||
|
|
9851334221 | ||
|
|
e79c4fc99d | ||
|
|
55c321f4ff | ||
|
|
a14a53a005 | ||
|
|
a71f937e8f | ||
|
|
032032df65 | ||
|
|
d0178edad0 | ||
|
|
795c5e55d9 | ||
|
|
8f8d8ae0d8 | ||
|
|
741f192d04 | ||
|
|
a5595b82ea | ||
|
|
4d1915eb41 | ||
|
|
b3a84fc772 | ||
|
|
403d22e62c | ||
|
|
ee00ee5c57 | ||
|
|
f53fd880dc | ||
|
|
de3461e4cc | ||
|
|
7bafc3a1bb | ||
|
|
22ef61fe8d | ||
|
|
7078fb53bd | ||
|
|
33447ad6f2 | ||
|
|
6faa50ae5b | ||
|
|
889dc19a27 |
22
.github/workflows/publish.yaml
vendored
22
.github/workflows/publish.yaml
vendored
@@ -5,25 +5,25 @@ on:
|
||||
inputs:
|
||||
gitref:
|
||||
type: string
|
||||
description: "what git tag to build (e.g. v0.0.74)"
|
||||
description: 'what git tag to build (e.g. v0.0.74)'
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.inputs.gitref }}
|
||||
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -35,9 +35,9 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-pypi:
|
||||
name: "Publish to PyPI"
|
||||
name: 'Publish to PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
@@ -56,12 +56,12 @@ jobs:
|
||||
print-hash: true
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ build ]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
12
.github/workflows/publish_test.yaml
vendored
12
.github/workflows/publish_test.yaml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build and upload wheels"
|
||||
name: 'Build and upload wheels'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
@@ -15,9 +15,9 @@ jobs:
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
version: "latest"
|
||||
version: 'latest'
|
||||
- name: Set up Python
|
||||
run: uv python install 3.10
|
||||
run: uv python install 3.12
|
||||
- name: Install development dependencies
|
||||
run: uv sync --group dev
|
||||
- name: Build project
|
||||
@@ -29,12 +29,12 @@ jobs:
|
||||
path: ./dist
|
||||
|
||||
publish-to-test-pypi:
|
||||
name: "Publish to Test PyPI"
|
||||
name: 'Publish to Test PyPI'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build]
|
||||
environment:
|
||||
name: testpypi
|
||||
url: https://pypi.org/p/pipecat-ai
|
||||
url: https://test.pypi.org/p/pipecat-ai
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
with:
|
||||
name: wheels
|
||||
path: ./dist
|
||||
- name: Publish to PyPI
|
||||
- name: Publish to Test PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
verbose: true
|
||||
|
||||
53
CHANGELOG.md
53
CHANGELOG.md
@@ -5,6 +5,59 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.0.87] - 2025-10-02
|
||||
|
||||
### Added
|
||||
|
||||
- Added `WebsocketSTTService` base class for websocket-based STT services.
|
||||
Combines STT functionality with websocket connectivity, providing automatic
|
||||
error handling and reconnection capabilities with exponential backoff.
|
||||
|
||||
- Added `DeepgramFluxSTTService` for real-time speech recognition using
|
||||
Deepgram's Flux WebSocket API. Flux understands conversational flow and
|
||||
automatically handles turn-taking.
|
||||
|
||||
- Added RTVI messages for user/bot audio levels and system logs.
|
||||
|
||||
- Include OpenAI-based LLM services cached tokens to `MetricsFrame`.
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated the default model for `AnthropicLLMService` to
|
||||
`claude-sonnet-4-5-20250929`.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `DailyTransportMessageFrame` and `DailyTransportMessageUrgentFrame` are
|
||||
deprecated, use `DailyOutputTransportMessageFrame` and
|
||||
`DailyOutputTransportMessageUrgentFrame` respectively instead.
|
||||
|
||||
- `LiveKitTransportMessageFrame` and `LiveKitTransportMessageUrgentFrame` are
|
||||
deprecated, use `LiveKitOutputTransportMessageFrame` and
|
||||
`LiveKitOutputTransportMessageUrgentFrame` respectively instead.
|
||||
|
||||
- `TransportMessageFrame` and `TransportMessageUrgentFrame` are deprecated, use
|
||||
`OutputTransportMessageFrame` and `OutputTransportMessageUrgentFrame`
|
||||
respectively instead.
|
||||
|
||||
- `InputTransportMessageUrgentFrame` is deprecated, use
|
||||
`InputTransportMessageFrame` instead.
|
||||
|
||||
- `DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a
|
||||
future version. Instead, create your own custom frame and handle it in the
|
||||
`@transport.output().event_handler("on_after_push_frame")` event handler or a
|
||||
custom processor.
|
||||
|
||||
## Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where timeout exceptions weren't
|
||||
being detected.
|
||||
|
||||
- Fixed a `PipelineTask` issue that could prevent the application to exit if
|
||||
`task.cancel()` was called when the task was already finished.
|
||||
|
||||
- Fixed an issue where local SmartTurn was not being ran in a separate thread.
|
||||
|
||||
## [0.0.86] - 2025-09-24
|
||||
|
||||
### Added
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.processors.aggregators.llm_response_universal import LLMContextAggr
|
||||
from pipecat.runner.daily import configure
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.daily.transport import DailyLogLevel, DailyParams, DailyTransport
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
@@ -49,7 +49,6 @@ async def main():
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
|
||||
),
|
||||
)
|
||||
transport.set_log_level(DailyLogLevel.Info)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
|
||||
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
118
examples/foundational/07c-interruptible-deepgram-flux.py
Normal file
@@ -0,0 +1,118 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import LLMRunFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
LLMContext,
|
||||
LLMContextAggregatorPair,
|
||||
)
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
"webrtc": lambda: TransportParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramFluxSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-2-andromeda-en")
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = LLMContext(messages)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
stt, # STT
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
),
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
@@ -26,7 +26,11 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.tts import DeepgramTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams, DailyTransportMessageFrame
|
||||
from pipecat.transports.daily.transport import (
|
||||
DailyOutputTransportMessageFrame,
|
||||
DailyOutputTransportMessageUrgentFrame,
|
||||
DailyParams,
|
||||
)
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
@@ -128,14 +132,14 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.debug(f"Received latency ping app message: {message}")
|
||||
ts = message["latency-ping"]["ts"]
|
||||
# Send immediately
|
||||
transport.output().send_message(
|
||||
DailyTransportMessageFrame(
|
||||
await task.queue_frame(
|
||||
DailyOutputTransportMessageUrgentFrame(
|
||||
message={"latency-pong-msg-handler": {"ts": ts}}, participant_id=sender
|
||||
)
|
||||
)
|
||||
# And push to the pipeline for the Daily transport.output to send
|
||||
await task.queue_frame(
|
||||
DailyTransportMessageFrame(
|
||||
DailyOutputTransportMessageFrame(
|
||||
message={"latency-pong-pipeline-delivery": {"ts": ts}},
|
||||
participant_id=sender,
|
||||
)
|
||||
|
||||
@@ -206,6 +206,14 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY environment variable for this example. See https://api.nasa.gov"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -141,6 +141,14 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("MCP_RUN_SSE_URL"):
|
||||
logger.error(
|
||||
f"Please set MCP_RUN_SSE_URL environment variable for this example. See https://mcp.run"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -219,6 +219,14 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("NASA_API_KEY") or not os.getenv("MCP_RUN_SSE_URL"):
|
||||
logger.error(
|
||||
f"Please set NASA_API_KEY and MCP_RUN_SSE_URL environment variables. See https://api.nasa.gov and https://mcp.run"
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -145,6 +145,14 @@ async def bot(runner_args: RunnerArguments):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN"):
|
||||
logger.error(
|
||||
f"Please set GITHUB_PERSONAL_ACCESS_TOKEN environment variable for this example."
|
||||
)
|
||||
import sys
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
|
||||
@@ -4,7 +4,7 @@ version = "0.1.0"
|
||||
description = "Quickstart example for building voice AI bots with Pipecat"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]>=0.0.85",
|
||||
"pipecat-ai[webrtc,daily,silero,deepgram,openai,cartesia,local-smart-turn-v3,runner]>=0.0.86",
|
||||
"pipecatcloud>=0.2.4"
|
||||
]
|
||||
|
||||
|
||||
@@ -34,7 +34,8 @@ from pipecat.frames.frames import EndTaskFrame, LLMRunFrame, OutputImageRawFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
@@ -283,8 +284,8 @@ async def run_eval_pipeline(
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages, tools)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
context = LLMContext(messages, tools)
|
||||
context_aggregator = LLMContextAggregatorPair(context)
|
||||
|
||||
audio_buffer = AudioBufferProcessor()
|
||||
|
||||
|
||||
@@ -67,6 +67,7 @@ TESTS_07 = [
|
||||
("07ac-interruptible-asyncai-http.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07b-interruptible-langchain.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07c-interruptible-deepgram-flux.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07d-interruptible-elevenlabs.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
(
|
||||
"07d-interruptible-elevenlabs-http.py",
|
||||
|
||||
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.metrics.metrics import MetricsData
|
||||
|
||||
|
||||
@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
|
||||
INCOMPLETE = 2
|
||||
|
||||
|
||||
class BaseTurnParams(BaseModel):
|
||||
"""Base class for turn analyzer parameters."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class BaseTurnAnalyzer(ABC):
|
||||
"""Abstract base class for analyzing user end of turn.
|
||||
|
||||
@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def params(self):
|
||||
def params(self) -> BaseTurnParams:
|
||||
"""Get the current turn analyzer parameters.
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
|
||||
beyond simple silence-based detection.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from abc import abstractmethod
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
|
||||
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
|
||||
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
|
||||
|
||||
# Default timing parameters
|
||||
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8 # Max allowed segment duration
|
||||
USE_ONLY_LAST_VAD_SEGMENT = True
|
||||
|
||||
|
||||
class SmartTurnParams(BaseModel):
|
||||
class SmartTurnParams(BaseTurnParams):
|
||||
"""Configuration parameters for smart turn analysis.
|
||||
|
||||
Parameters:
|
||||
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
self._speech_triggered = False
|
||||
self._silence_ms = 0
|
||||
self._speech_start_time = 0
|
||||
# Thread executor that will run the model. We only need one thread per
|
||||
# analyzer because one analyzer just handles one audio stream.
|
||||
self._executor = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
@property
|
||||
def speech_triggered(self) -> bool:
|
||||
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
Tuple containing the end-of-turn state and optional metrics data
|
||||
from the ML model analysis.
|
||||
"""
|
||||
state, result = await self._process_speech_segment(self._audio_buffer)
|
||||
loop = asyncio.get_running_loop()
|
||||
state, result = await loop.run_in_executor(
|
||||
self._executor, self._process_speech_segment, self._audio_buffer
|
||||
)
|
||||
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
|
||||
self._clear(state)
|
||||
logger.debug(f"End of Turn result: {state}")
|
||||
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
self._speech_start_time = 0
|
||||
self._silence_ms = 0
|
||||
|
||||
async def _process_speech_segment(
|
||||
self, audio_buffer
|
||||
) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
|
||||
"""Process accumulated audio segment using ML model."""
|
||||
state = EndOfTurnState.INCOMPLETE
|
||||
|
||||
@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
if len(segment_audio) > 0:
|
||||
start_time = time.perf_counter()
|
||||
try:
|
||||
result = await self._predict_endpoint(segment_audio)
|
||||
result = self._predict_endpoint(segment_audio)
|
||||
state = (
|
||||
EndOfTurnState.COMPLETE
|
||||
if result["prediction"] == 1
|
||||
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
|
||||
return state, result_data
|
||||
|
||||
@abstractmethod
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using ML model from audio data."""
|
||||
pass
|
||||
|
||||
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
|
||||
logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
|
||||
raise Exception("Failed to send raw request to Daily Smart Turn.")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using remote HTTP ML service."""
|
||||
try:
|
||||
serialized_array = self._serialize_array(audio_array)
|
||||
return await self._send_raw_request(serialized_array)
|
||||
loop = asyncio.get_running_loop()
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._send_raw_request(serialized_array), loop
|
||||
)
|
||||
return future.result()
|
||||
except Exception as e:
|
||||
logger.error(f"Smart turn prediction failed: {str(e)}")
|
||||
# Return an incomplete prediction when a failure occurs
|
||||
|
||||
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
|
||||
self._turn_model.eval()
|
||||
logger.debug("Loaded Local Smart Turn")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local PyTorch model."""
|
||||
inputs = self._turn_processor(
|
||||
audio_array,
|
||||
|
||||
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
|
||||
self._turn_model.eval()
|
||||
logger.debug("Loaded Local Smart Turn v2")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local PyTorch model."""
|
||||
inputs = self._turn_processor(
|
||||
audio_array,
|
||||
|
||||
@@ -77,7 +77,7 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
|
||||
|
||||
logger.debug("Loaded Local Smart Turn v3")
|
||||
|
||||
async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
|
||||
"""Predict end-of-turn using local ONNX model."""
|
||||
|
||||
def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
|
||||
|
||||
@@ -11,7 +11,9 @@ data structures for voice activity detection in audio streams. Includes state
|
||||
management, parameter configuration, and audio analysis framework.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from abc import ABC, abstractmethod
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
@@ -84,6 +86,10 @@ class VADAnalyzer(ABC):
|
||||
self._smoothing_factor = 0.2
|
||||
self._prev_volume = 0
|
||||
|
||||
# Thread executor that will run the model. We only need one thread per
|
||||
# analyzer because one analyzer just handles one audio stream.
|
||||
self._executor = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
@property
|
||||
def sample_rate(self) -> int:
|
||||
"""Get the current sample rate.
|
||||
@@ -165,7 +171,7 @@ class VADAnalyzer(ABC):
|
||||
volume = calculate_audio_volume(audio, self.sample_rate)
|
||||
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
||||
|
||||
def analyze_audio(self, buffer) -> VADState:
|
||||
async def analyze_audio(self, buffer: bytes) -> VADState:
|
||||
"""Analyze audio buffer and return current VAD state.
|
||||
|
||||
Processes incoming audio data, maintains internal state, and determines
|
||||
@@ -177,6 +183,12 @@ class VADAnalyzer(ABC):
|
||||
Returns:
|
||||
Current VAD state after processing the buffer.
|
||||
"""
|
||||
loop = asyncio.get_running_loop()
|
||||
state = await loop.run_in_executor(self._executor, self._run_analyzer, buffer)
|
||||
return state
|
||||
|
||||
def _run_analyzer(self, buffer: bytes) -> VADState:
|
||||
"""Analyze audio buffer and return current VAD state."""
|
||||
self._vad_buffer += buffer
|
||||
|
||||
num_required_bytes = self._vad_frames_num_bytes
|
||||
|
||||
@@ -672,7 +672,7 @@ class TTSSpeakFrame(DataFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageFrame(DataFrame):
|
||||
class OutputTransportMessageFrame(DataFrame):
|
||||
"""Frame containing transport-specific message data.
|
||||
|
||||
Parameters:
|
||||
@@ -685,6 +685,32 @@ class TransportMessageFrame(DataFrame):
|
||||
return f"{self.name}(message: {self.message})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageFrame(OutputTransportMessageFrame):
|
||||
"""Frame containing transport-specific message data.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `OutputTransportMessageFrame`.
|
||||
|
||||
Parameters:
|
||||
message: The transport message payload.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"TransportMessageFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use OutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DTMFFrame:
|
||||
"""Base class for DTMF (Dual-Tone Multi-Frequency) keypad frames.
|
||||
@@ -1092,8 +1118,8 @@ class STTMuteFrame(SystemFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageUrgentFrame(SystemFrame):
|
||||
"""Frame for urgent transport messages that need immediate processing.
|
||||
class InputTransportMessageFrame(SystemFrame):
|
||||
"""Frame for transport messages received from external sources.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
@@ -1106,20 +1132,69 @@ class TransportMessageUrgentFrame(SystemFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InputTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
class InputTransportMessageUrgentFrame(InputTransportMessageFrame):
|
||||
"""Frame for transport messages received from external sources.
|
||||
|
||||
This frame wraps incoming transport messages to distinguish them from outgoing
|
||||
urgent transport messages (TransportMessageUrgentFrame), preventing infinite
|
||||
message loops in the transport layer. It inherits the message payload from
|
||||
TransportMessageFrame while marking the message as having been received
|
||||
rather than generated locally.
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `InputTransportMessageFrame`.
|
||||
|
||||
Used by transport implementations to properly handle bidirectional message
|
||||
flow without creating feedback loops.
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
pass
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"InputTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use InputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OutputTransportMessageUrgentFrame(SystemFrame):
|
||||
"""Frame for urgent transport messages that need to be sent immediately.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
message: Any
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}(message: {self.message})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages that need to be sent immediately.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `OutputTransportMessageUrgentFrame`.
|
||||
|
||||
Parameters:
|
||||
message: The urgent transport message payload.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"TransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use OutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -13,8 +13,7 @@ including heartbeats, idle detection, and observer integration.
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import Any, AsyncIterable, Deque, Dict, Iterable, List, Optional, Tuple, Type
|
||||
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
@@ -31,7 +30,6 @@ from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
HeartbeatFrame,
|
||||
InputAudioRawFrame,
|
||||
InterruptionFrame,
|
||||
InterruptionTaskFrame,
|
||||
MetricsFrame,
|
||||
@@ -132,9 +130,11 @@ class PipelineTask(BasePipelineTask):
|
||||
|
||||
- on_pipeline_finished: Called after the pipeline has reached any terminal state.
|
||||
This includes:
|
||||
|
||||
- StopFrame: pipeline was stopped (processors keep connections open)
|
||||
- EndFrame: pipeline ended normally
|
||||
- CancelFrame: pipeline was cancelled
|
||||
|
||||
Use this event for cleanup, logging, or post-processing tasks. Users can inspect
|
||||
the frame if they need to handle specific cases.
|
||||
|
||||
@@ -395,7 +395,8 @@ class PipelineTask(BasePipelineTask):
|
||||
Cancels all running tasks and stops frame processing without
|
||||
waiting for completion.
|
||||
"""
|
||||
await self._cancel()
|
||||
if not self._finished:
|
||||
await self._cancel()
|
||||
|
||||
async def run(self, params: PipelineTaskParams):
|
||||
"""Start and manage the pipeline execution until completion or cancellation.
|
||||
|
||||
@@ -13,6 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Literal, Optional, Set
|
||||
|
||||
from loguru import logger
|
||||
@@ -169,6 +170,11 @@ class LLMContextAggregator(FrameProcessor):
|
||||
"""Reset the aggregation state."""
|
||||
self._aggregation = ""
|
||||
|
||||
@abstractmethod
|
||||
async def push_aggregation(self):
|
||||
"""Push the current aggregation downstream."""
|
||||
pass
|
||||
|
||||
|
||||
class LLMUserAggregator(LLMContextAggregator):
|
||||
"""User LLM aggregator that processes speech-to-text transcriptions.
|
||||
@@ -301,7 +307,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
frame = LLMContextFrame(self._context)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def _push_aggregation(self):
|
||||
async def push_aggregation(self):
|
||||
"""Push the current aggregation based on interruption strategies and conditions."""
|
||||
if len(self._aggregation) > 0:
|
||||
if self.interruption_strategies and self._bot_speaking:
|
||||
@@ -392,7 +398,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
# pushing the aggregation as we will probably get a final transcription.
|
||||
if len(self._aggregation) > 0:
|
||||
if not self._seen_interim_results:
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
# Handles the case where both the user and the bot are not speaking,
|
||||
# and the bot was previously speaking before the user interruption.
|
||||
# So in this case we are resetting the aggregation timer
|
||||
@@ -471,7 +477,7 @@ class LLMUserAggregator(LLMContextAggregator):
|
||||
await self._maybe_emulate_user_speaking()
|
||||
except asyncio.TimeoutError:
|
||||
if not self._user_speaking:
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
|
||||
# If we are emulating VAD we still need to send the user stopped
|
||||
# speaking frame.
|
||||
@@ -607,12 +613,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
|
||||
await self._handle_user_image_frame(frame)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _push_aggregation(self):
|
||||
async def push_aggregation(self):
|
||||
"""Push the current assistant aggregation with timestamp."""
|
||||
if not self._aggregation:
|
||||
return
|
||||
@@ -644,7 +650,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_interruptions(self, frame: InterruptionFrame):
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
self._started = 0
|
||||
await self.reset()
|
||||
|
||||
@@ -778,7 +784,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
text=frame.request.context,
|
||||
)
|
||||
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
|
||||
@@ -786,7 +792,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
|
||||
async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
|
||||
self._started -= 1
|
||||
await self._push_aggregation()
|
||||
await self.push_aggregation()
|
||||
|
||||
async def _handle_text(self, frame: TextFrame):
|
||||
if not self._started:
|
||||
|
||||
@@ -12,14 +12,14 @@ in conversational pipelines.
|
||||
"""
|
||||
|
||||
from pipecat.frames.frames import TextFrame
|
||||
from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMUserAggregator
|
||||
|
||||
|
||||
class UserResponseAggregator(LLMUserContextAggregator):
|
||||
class UserResponseAggregator(LLMUserAggregator):
|
||||
"""Aggregates user responses into TextFrame objects.
|
||||
|
||||
This aggregator extends LLMUserContextAggregator to specifically handle
|
||||
This aggregator extends LLMUserAggregator to specifically handle
|
||||
user input by collecting text responses and outputting them as TextFrame
|
||||
objects when the aggregation is complete.
|
||||
"""
|
||||
@@ -28,9 +28,9 @@ class UserResponseAggregator(LLMUserContextAggregator):
|
||||
"""Initialize the user response aggregator.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional arguments passed to parent LLMUserContextAggregator.
|
||||
**kwargs: Additional arguments passed to parent LLMUserAggregator.
|
||||
"""
|
||||
super().__init__(context=OpenAILLMContext(), **kwargs)
|
||||
super().__init__(context=LLMContext(), **kwargs)
|
||||
|
||||
async def push_aggregation(self):
|
||||
"""Push the aggregated user response as a TextFrame.
|
||||
|
||||
@@ -13,6 +13,7 @@ and frame observation for the RTVI protocol.
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import (
|
||||
Any,
|
||||
@@ -29,6 +30,7 @@ from typing import (
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field, PrivateAttr, ValidationError
|
||||
|
||||
from pipecat.audio.utils import calculate_audio_volume
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
@@ -40,6 +42,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
FunctionCallResultFrame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageUrgentFrame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMConfigureOutputFrame,
|
||||
LLMContextFrame,
|
||||
@@ -48,10 +51,11 @@ from pipecat.frames.frames import (
|
||||
LLMMessagesAppendFrame,
|
||||
LLMTextFrame,
|
||||
MetricsFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
SystemFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TTSTextFrame,
|
||||
@@ -613,9 +617,9 @@ class RTVIAppendToContextData(BaseModel):
|
||||
|
||||
Contains the role, content, and whether to run the message immediately.
|
||||
|
||||
.. deprecated:: 0.0.85
|
||||
The RTVI message, append-to-context, has been deprecated. Use send-text
|
||||
or custom client and server messages instead.
|
||||
.. deprecated:: 0.0.85
|
||||
The RTVI message, append-to-context, has been deprecated. Use send-text
|
||||
or custom client and server messages instead.
|
||||
"""
|
||||
|
||||
role: Literal["user", "assistant"] | str
|
||||
@@ -839,6 +843,36 @@ class RTVIServerMessage(BaseModel):
|
||||
data: Any
|
||||
|
||||
|
||||
class RTVIAudioLevelMessageData(BaseModel):
|
||||
"""Data format for sending audio levels."""
|
||||
|
||||
value: float
|
||||
|
||||
|
||||
class RTVIUserAudioLevelMessage(BaseModel):
|
||||
"""Message indicating user audio level."""
|
||||
|
||||
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||
type: Literal["user-audio-level"] = "user-audio-level"
|
||||
data: RTVIAudioLevelMessageData
|
||||
|
||||
|
||||
class RTVIBotAudioLevelMessage(BaseModel):
|
||||
"""Message indicating bot audio level."""
|
||||
|
||||
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||
type: Literal["bot-audio-level"] = "bot-audio-level"
|
||||
data: RTVIAudioLevelMessageData
|
||||
|
||||
|
||||
class RTVISystemLogMessage(BaseModel):
|
||||
"""Message including a system log."""
|
||||
|
||||
label: RTVIMessageLiteral = RTVI_MESSAGE_LABEL
|
||||
type: Literal["system-log"] = "system-log"
|
||||
data: RTVITextMessageData
|
||||
|
||||
|
||||
@dataclass
|
||||
class RTVIServerMessageFrame(SystemFrame):
|
||||
"""A frame for sending server messages to the client.
|
||||
@@ -858,25 +892,36 @@ class RTVIServerMessageFrame(SystemFrame):
|
||||
class RTVIObserverParams:
|
||||
"""Parameters for configuring RTVI Observer behavior.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
Parameter `errors_enabled` is deprecated. Error messages are always enabled.
|
||||
|
||||
Parameters:
|
||||
bot_llm_enabled: Indicates if the bot's LLM messages should be sent.
|
||||
bot_tts_enabled: Indicates if the bot's TTS messages should be sent.
|
||||
bot_speaking_enabled: Indicates if the bot's started/stopped speaking messages should be sent.
|
||||
bot_audio_level_enabled: Indicates if bot's audio level messages should be sent.
|
||||
user_llm_enabled: Indicates if the user's LLM input messages should be sent.
|
||||
user_speaking_enabled: Indicates if the user's started/stopped speaking messages should be sent.
|
||||
user_transcription_enabled: Indicates if user's transcription messages should be sent.
|
||||
user_audio_level_enabled: Indicates if user's audio level messages should be sent.
|
||||
metrics_enabled: Indicates if metrics messages should be sent.
|
||||
errors_enabled: Indicates if errors messages should be sent.
|
||||
system_logs_enabled: Indicates if system logs should be sent.
|
||||
errors_enabled: [Deprecated] Indicates if errors messages should be sent.
|
||||
audio_level_period_secs: How often audio levels should be sent if enabled.
|
||||
"""
|
||||
|
||||
bot_llm_enabled: bool = True
|
||||
bot_tts_enabled: bool = True
|
||||
bot_speaking_enabled: bool = True
|
||||
bot_audio_level_enabled: bool = False
|
||||
user_llm_enabled: bool = True
|
||||
user_speaking_enabled: bool = True
|
||||
user_transcription_enabled: bool = True
|
||||
user_audio_level_enabled: bool = False
|
||||
metrics_enabled: bool = True
|
||||
errors_enabled: bool = True
|
||||
system_logs_enabled: bool = False
|
||||
errors_enabled: Optional[bool] = None
|
||||
audio_level_period_secs: float = 0.15
|
||||
|
||||
|
||||
class RTVIObserver(BaseObserver):
|
||||
@@ -892,7 +937,11 @@ class RTVIObserver(BaseObserver):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, rtvi: "RTVIProcessor", *, params: Optional[RTVIObserverParams] = None, **kwargs
|
||||
self,
|
||||
rtvi: Optional["RTVIProcessor"] = None,
|
||||
*,
|
||||
params: Optional[RTVIObserverParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the RTVI observer.
|
||||
|
||||
@@ -904,9 +953,50 @@ class RTVIObserver(BaseObserver):
|
||||
super().__init__(**kwargs)
|
||||
self._rtvi = rtvi
|
||||
self._params = params or RTVIObserverParams()
|
||||
self._bot_transcription = ""
|
||||
|
||||
self._frames_seen = set()
|
||||
rtvi.set_errors_enabled(self._params.errors_enabled)
|
||||
|
||||
self._bot_transcription = ""
|
||||
self._last_user_audio_level = 0
|
||||
self._last_bot_audio_level = 0
|
||||
|
||||
if self._params.system_logs_enabled:
|
||||
self._system_logger_id = logger.add(self._logger_sink)
|
||||
|
||||
if self._params.errors_enabled is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter `errors_enabled` is deprecated. Error messages are always enabled.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
async def _logger_sink(self, message):
|
||||
"""Logger sink so we cna send system logs to RTVI clients."""
|
||||
message = RTVISystemLogMessage(data=RTVITextMessageData(text=message))
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup RTVI observer resources."""
|
||||
await super().cleanup()
|
||||
if self._params.system_logs_enabled:
|
||||
logger.remove(self._system_logger_id)
|
||||
|
||||
async def send_rtvi_message(self, model: BaseModel, exclude_none: bool = True):
|
||||
"""Send an RTVI message.
|
||||
|
||||
By default, we push a transport frame. But this function can be
|
||||
overriden by subclass to send RTVI messages in different ways.
|
||||
|
||||
Args:
|
||||
model: The message to send.
|
||||
exclude_none: Whether to exclude None values from the model dump.
|
||||
|
||||
"""
|
||||
if self._rtvi:
|
||||
await self._rtvi.push_transport_message(model, exclude_none)
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Process a frame being pushed through the pipeline.
|
||||
@@ -948,52 +1038,58 @@ class RTVIObserver(BaseObserver):
|
||||
):
|
||||
await self._handle_context(frame)
|
||||
elif isinstance(frame, LLMFullResponseStartFrame) and self._params.bot_llm_enabled:
|
||||
await self.push_transport_message_urgent(RTVIBotLLMStartedMessage())
|
||||
await self.send_rtvi_message(RTVIBotLLMStartedMessage())
|
||||
elif isinstance(frame, LLMFullResponseEndFrame) and self._params.bot_llm_enabled:
|
||||
await self.push_transport_message_urgent(RTVIBotLLMStoppedMessage())
|
||||
await self.send_rtvi_message(RTVIBotLLMStoppedMessage())
|
||||
elif isinstance(frame, LLMTextFrame) and self._params.bot_llm_enabled:
|
||||
await self._handle_llm_text_frame(frame)
|
||||
elif isinstance(frame, TTSStartedFrame) and self._params.bot_tts_enabled:
|
||||
await self.push_transport_message_urgent(RTVIBotTTSStartedMessage())
|
||||
await self.send_rtvi_message(RTVIBotTTSStartedMessage())
|
||||
elif isinstance(frame, TTSStoppedFrame) and self._params.bot_tts_enabled:
|
||||
await self.push_transport_message_urgent(RTVIBotTTSStoppedMessage())
|
||||
await self.send_rtvi_message(RTVIBotTTSStoppedMessage())
|
||||
elif isinstance(frame, TTSTextFrame) and self._params.bot_tts_enabled:
|
||||
if isinstance(src, BaseOutputTransport):
|
||||
message = RTVIBotTTSTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
else:
|
||||
mark_as_seen = False
|
||||
elif isinstance(frame, MetricsFrame) and self._params.metrics_enabled:
|
||||
await self._handle_metrics(frame)
|
||||
elif isinstance(frame, RTVIServerMessageFrame):
|
||||
message = RTVIServerMessage(data=frame.data)
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
elif isinstance(frame, RTVIServerResponseFrame):
|
||||
if frame.error is not None:
|
||||
await self._send_error_response(frame)
|
||||
else:
|
||||
await self._send_server_response(frame)
|
||||
elif isinstance(frame, InputAudioRawFrame) and self._params.user_audio_level_enabled:
|
||||
curr_time = time.time()
|
||||
diff_time = curr_time - self._last_user_audio_level
|
||||
if diff_time > self._params.audio_level_period_secs:
|
||||
level = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||
message = RTVIUserAudioLevelMessage(data=RTVIAudioLevelMessageData(value=level))
|
||||
await self.send_rtvi_message(message)
|
||||
self._last_user_audio_level = curr_time
|
||||
elif isinstance(frame, TTSAudioRawFrame) and self._params.bot_audio_level_enabled:
|
||||
curr_time = time.time()
|
||||
diff_time = curr_time - self._last_bot_audio_level
|
||||
if diff_time > self._params.audio_level_period_secs:
|
||||
level = calculate_audio_volume(frame.audio, frame.sample_rate)
|
||||
message = RTVIBotAudioLevelMessage(data=RTVIAudioLevelMessageData(value=level))
|
||||
await self.send_rtvi_message(message)
|
||||
self._last_bot_audio_level = curr_time
|
||||
|
||||
if mark_as_seen:
|
||||
self._frames_seen.add(frame.id)
|
||||
|
||||
async def push_transport_message_urgent(self, model: BaseModel, exclude_none: bool = True):
|
||||
"""Push an urgent transport message to the RTVI processor.
|
||||
|
||||
Args:
|
||||
model: The message model to send.
|
||||
exclude_none: Whether to exclude None values from the model dump.
|
||||
"""
|
||||
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
|
||||
await self._rtvi.push_frame(frame)
|
||||
|
||||
async def _push_bot_transcription(self):
|
||||
"""Push accumulated bot transcription as a message."""
|
||||
if len(self._bot_transcription) > 0:
|
||||
message = RTVIBotTranscriptionMessage(
|
||||
data=RTVITextMessageData(text=self._bot_transcription)
|
||||
)
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
self._bot_transcription = ""
|
||||
|
||||
async def _handle_interruptions(self, frame: Frame):
|
||||
@@ -1005,7 +1101,7 @@ class RTVIObserver(BaseObserver):
|
||||
message = RTVIUserStoppedSpeakingMessage()
|
||||
|
||||
if message:
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _handle_bot_speaking(self, frame: Frame):
|
||||
"""Handle bot speaking event frames."""
|
||||
@@ -1016,12 +1112,12 @@ class RTVIObserver(BaseObserver):
|
||||
message = RTVIBotStoppedSpeakingMessage()
|
||||
|
||||
if message:
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _handle_llm_text_frame(self, frame: LLMTextFrame):
|
||||
"""Handle LLM text output frames."""
|
||||
message = RTVIBotLLMTextMessage(data=RTVITextMessageData(text=frame.text))
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
self._bot_transcription += frame.text
|
||||
if match_endofsentence(self._bot_transcription):
|
||||
@@ -1044,7 +1140,7 @@ class RTVIObserver(BaseObserver):
|
||||
)
|
||||
|
||||
if message:
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _handle_context(self, frame: OpenAILLMContextFrame | LLMContextFrame):
|
||||
"""Process LLM context frames to extract user messages for the RTVI client."""
|
||||
@@ -1064,7 +1160,7 @@ class RTVIObserver(BaseObserver):
|
||||
text = "".join(part.text for part in message.parts if hasattr(part, "text"))
|
||||
if text:
|
||||
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
||||
await self.push_transport_message_urgent(rtvi_message)
|
||||
await self.send_rtvi_message(rtvi_message)
|
||||
|
||||
# Handle OpenAI format (original implementation)
|
||||
elif isinstance(message, dict):
|
||||
@@ -1075,7 +1171,7 @@ class RTVIObserver(BaseObserver):
|
||||
else:
|
||||
text = content
|
||||
rtvi_message = RTVIUserLLMTextMessage(data=RTVITextMessageData(text=text))
|
||||
await self.push_transport_message_urgent(rtvi_message)
|
||||
await self.send_rtvi_message(rtvi_message)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Caught an error while trying to handle context: {e}")
|
||||
@@ -1102,7 +1198,7 @@ class RTVIObserver(BaseObserver):
|
||||
metrics["characters"].append(d.model_dump(exclude_none=True))
|
||||
|
||||
message = RTVIMetricsMessage(data=metrics)
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _send_server_response(self, frame: RTVIServerResponseFrame):
|
||||
"""Send a response to the client for a specific request."""
|
||||
@@ -1110,15 +1206,14 @@ class RTVIObserver(BaseObserver):
|
||||
id=str(frame.client_msg.msg_id),
|
||||
data=RTVIRawServerResponseData(t=frame.client_msg.type, d=frame.data),
|
||||
)
|
||||
await self.push_transport_message_urgent(message)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
async def _send_error_response(self, frame: RTVIServerResponseFrame):
|
||||
"""Send a response to the client for a specific request."""
|
||||
if self._params.errors_enabled:
|
||||
message = RTVIErrorResponse(
|
||||
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
||||
)
|
||||
await self.push_transport_message_urgent(message)
|
||||
message = RTVIErrorResponse(
|
||||
id=str(frame.client_msg.msg_id), data=RTVIErrorResponseData(error=frame.error)
|
||||
)
|
||||
await self.send_rtvi_message(message)
|
||||
|
||||
|
||||
class RTVIProcessor(FrameProcessor):
|
||||
@@ -1152,7 +1247,6 @@ class RTVIProcessor(FrameProcessor):
|
||||
# Default to 0.3.0 which is the last version before actually having a
|
||||
# "client-version".
|
||||
self._client_version = [0, 3, 0]
|
||||
self._errors_enabled = True
|
||||
self._skip_tts: bool = False # Keep in sync with llm_service.py
|
||||
|
||||
self._registered_actions: Dict[str, RTVIAction] = {}
|
||||
@@ -1222,14 +1316,6 @@ class RTVIProcessor(FrameProcessor):
|
||||
await self._update_config(self._config, False)
|
||||
await self._send_bot_ready()
|
||||
|
||||
def set_errors_enabled(self, enabled: bool):
|
||||
"""Enable or disable error message sending.
|
||||
|
||||
Args:
|
||||
enabled: Whether to send error messages.
|
||||
"""
|
||||
self._errors_enabled = enabled
|
||||
|
||||
async def interrupt_bot(self):
|
||||
"""Send a bot interruption frame upstream."""
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
@@ -1258,6 +1344,13 @@ class RTVIProcessor(FrameProcessor):
|
||||
"""
|
||||
await self._send_error_frame(ErrorFrame(error=error))
|
||||
|
||||
async def push_transport_message(self, model: BaseModel, exclude_none: bool = True):
|
||||
"""Push a transport message frame."""
|
||||
frame = OutputTransportMessageUrgentFrame(
|
||||
message=model.model_dump(exclude_none=exclude_none)
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def handle_message(self, message: RTVIMessage):
|
||||
"""Handle an incoming RTVI message.
|
||||
|
||||
@@ -1278,7 +1371,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
args=params.arguments,
|
||||
)
|
||||
message = RTVILLMFunctionCallMessage(data=fn)
|
||||
await self._push_transport_message(message, exclude_none=False)
|
||||
await self.push_transport_message(message, exclude_none=False)
|
||||
|
||||
async def handle_function_call_start(
|
||||
self, function_name: str, llm: FrameProcessor, context: OpenAILLMContext
|
||||
@@ -1305,7 +1398,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
|
||||
fn = RTVILLMFunctionCallStartMessageData(function_name=function_name)
|
||||
message = RTVILLMFunctionCallStartMessage(data=fn)
|
||||
await self._push_transport_message(message, exclude_none=False)
|
||||
await self.push_transport_message(message, exclude_none=False)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process incoming frames through the RTVI processor.
|
||||
@@ -1328,7 +1421,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
elif isinstance(frame, ErrorFrame):
|
||||
await self._send_error_frame(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TransportMessageUrgentFrame):
|
||||
elif isinstance(frame, InputTransportMessageUrgentFrame):
|
||||
await self._handle_transport_message(frame)
|
||||
# All other system frames
|
||||
elif isinstance(frame, SystemFrame):
|
||||
@@ -1377,11 +1470,6 @@ class RTVIProcessor(FrameProcessor):
|
||||
await self.cancel_task(self._message_task)
|
||||
self._message_task = None
|
||||
|
||||
async def _push_transport_message(self, model: BaseModel, exclude_none: bool = True):
|
||||
"""Push a transport message frame."""
|
||||
frame = TransportMessageUrgentFrame(message=model.model_dump(exclude_none=exclude_none))
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def _action_task_handler(self):
|
||||
"""Handle incoming action frames."""
|
||||
while True:
|
||||
@@ -1396,7 +1484,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
await self._handle_message(message)
|
||||
self._message_queue.task_done()
|
||||
|
||||
async def _handle_transport_message(self, frame: TransportMessageUrgentFrame):
|
||||
async def _handle_transport_message(self, frame: InputTransportMessageUrgentFrame):
|
||||
"""Handle an incoming transport message frame."""
|
||||
try:
|
||||
transport_message = frame.message
|
||||
@@ -1518,7 +1606,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
|
||||
services = list(self._registered_services.values())
|
||||
message = RTVIDescribeConfig(id=request_id, data=RTVIDescribeConfigData(config=services))
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _handle_describe_actions(self, request_id: str):
|
||||
"""Handle a describe-actions request."""
|
||||
@@ -1533,7 +1621,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
|
||||
actions = list(self._registered_actions.values())
|
||||
message = RTVIDescribeActions(id=request_id, data=RTVIDescribeActionsData(actions=actions))
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _handle_get_config(self, request_id: str):
|
||||
"""Handle a get-config request."""
|
||||
@@ -1547,7 +1635,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
)
|
||||
|
||||
message = RTVIConfigResponse(id=request_id, data=self._config)
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
def _update_config_option(self, service: str, config: RTVIServiceOptionConfig):
|
||||
"""Update a specific configuration option."""
|
||||
@@ -1672,7 +1760,7 @@ class RTVIProcessor(FrameProcessor):
|
||||
# action responses (such as webhooks) don't set a request_id
|
||||
if request_id:
|
||||
message = RTVIActionResponse(id=request_id, data=RTVIActionResponseData(result=result))
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _send_bot_ready(self):
|
||||
"""Send the bot-ready message to the client."""
|
||||
@@ -1683,23 +1771,21 @@ class RTVIProcessor(FrameProcessor):
|
||||
id=self._client_ready_id,
|
||||
data=RTVIBotReadyData(version=RTVI_PROTOCOL_VERSION, config=config),
|
||||
)
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _send_server_message(self, message: RTVIServerMessage | RTVIServerResponse):
|
||||
"""Send a message or response to the client."""
|
||||
await self._push_transport_message(message)
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _send_error_frame(self, frame: ErrorFrame):
|
||||
"""Send an error frame as an RTVI error message."""
|
||||
if self._errors_enabled:
|
||||
message = RTVIError(data=RTVIErrorData(error=frame.error, fatal=frame.fatal))
|
||||
await self._push_transport_message(message)
|
||||
message = RTVIError(data=RTVIErrorData(error=frame.error, fatal=frame.fatal))
|
||||
await self.push_transport_message(message)
|
||||
|
||||
async def _send_error_response(self, id: str, error: str):
|
||||
"""Send an error response message."""
|
||||
if self._errors_enabled:
|
||||
message = RTVIErrorResponse(id=id, data=RTVIErrorResponseData(error=error))
|
||||
await self._push_transport_message(message)
|
||||
message = RTVIErrorResponse(id=id, data=RTVIErrorResponseData(error=error))
|
||||
await self.push_transport_message(message)
|
||||
|
||||
def _action_id(self, service: str, action: str) -> str:
|
||||
"""Generate an action ID from service and action names."""
|
||||
|
||||
@@ -15,7 +15,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
OutputAudioRawFrame,
|
||||
TransportMessageFrame,
|
||||
UserSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
@@ -36,9 +36,9 @@ class FrameLogger(FrameProcessor):
|
||||
color: Optional[str] = None,
|
||||
ignored_frame_types: Tuple[Type[Frame], ...] = (
|
||||
BotSpeakingFrame,
|
||||
UserSpeakingFrame,
|
||||
InputAudioRawFrame,
|
||||
OutputAudioRawFrame,
|
||||
TransportMessageFrame,
|
||||
),
|
||||
):
|
||||
"""Initialize the frame logger.
|
||||
|
||||
@@ -70,7 +70,9 @@ import asyncio
|
||||
import os
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.runner.types import (
|
||||
@@ -82,7 +84,7 @@ from pipecat.runner.types import (
|
||||
try:
|
||||
import uvicorn
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import BackgroundTasks, FastAPI, Request, WebSocket
|
||||
from fastapi import BackgroundTasks, FastAPI, HTTPException, Request, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
except ImportError as e:
|
||||
@@ -166,6 +168,7 @@ def _create_server_app(
|
||||
# Set up transport-specific routes
|
||||
if transport_type == "webrtc":
|
||||
_setup_webrtc_routes(app, esp32_mode=esp32_mode, host=host)
|
||||
_setup_whatsapp_routes(app)
|
||||
elif transport_type == "daily":
|
||||
_setup_daily_routes(app)
|
||||
elif transport_type in ["twilio", "telnyx", "plivo", "exotel"]:
|
||||
@@ -221,12 +224,174 @@ def _setup_webrtc_routes(app: FastAPI, esp32_mode: bool = False, host: str = "lo
|
||||
return answer
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
async def smallwebrtc_lifespan(app: FastAPI):
|
||||
"""Manage FastAPI application lifecycle and cleanup connections."""
|
||||
yield
|
||||
await small_webrtc_handler.close()
|
||||
|
||||
app.router.lifespan_context = lifespan
|
||||
# Add the SmallWebRTC lifespan to the app
|
||||
_add_lifespan_to_app(app, smallwebrtc_lifespan)
|
||||
|
||||
|
||||
def _add_lifespan_to_app(app: FastAPI, new_lifespan):
|
||||
"""Add a new lifespan context manager to the app, combining with existing if present.
|
||||
|
||||
Args:
|
||||
app: The FastAPI application instance
|
||||
new_lifespan: The new lifespan context manager to add
|
||||
"""
|
||||
if hasattr(app.router, "lifespan_context") and app.router.lifespan_context is not None:
|
||||
# If there's already a lifespan context, combine them
|
||||
existing_lifespan = app.router.lifespan_context
|
||||
|
||||
@asynccontextmanager
|
||||
async def combined_lifespan(app: FastAPI):
|
||||
async with existing_lifespan(app):
|
||||
async with new_lifespan(app):
|
||||
yield
|
||||
|
||||
app.router.lifespan_context = combined_lifespan
|
||||
else:
|
||||
# No existing lifespan, use the new one
|
||||
app.router.lifespan_context = new_lifespan
|
||||
|
||||
|
||||
def _setup_whatsapp_routes(app: FastAPI):
|
||||
"""Set up WebRTC-specific routes."""
|
||||
try:
|
||||
from pipecat_ai_small_webrtc_prebuilt.frontend import SmallWebRTCPrebuiltUI
|
||||
|
||||
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
|
||||
from pipecat.transports.smallwebrtc.request_handler import (
|
||||
SmallWebRTCRequest,
|
||||
SmallWebRTCRequestHandler,
|
||||
)
|
||||
from pipecat.transports.whatsapp.api import WhatsAppWebhookRequest
|
||||
from pipecat.transports.whatsapp.client import WhatsAppClient
|
||||
except ImportError as e:
|
||||
logger.error(f"WebRTC transport dependencies not installed: {e}")
|
||||
return
|
||||
|
||||
WHATSAPP_TOKEN = os.getenv("WHATSAPP_TOKEN")
|
||||
WHATSAPP_PHONE_NUMBER_ID = os.getenv("WHATSAPP_PHONE_NUMBER_ID")
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN = os.getenv("WHATSAPP_WEBHOOK_VERIFICATION_TOKEN")
|
||||
|
||||
if not all(
|
||||
[
|
||||
WHATSAPP_TOKEN,
|
||||
WHATSAPP_PHONE_NUMBER_ID,
|
||||
WHATSAPP_WEBHOOK_VERIFICATION_TOKEN,
|
||||
]
|
||||
):
|
||||
logger.debug(
|
||||
"Missing required environment variables for WhatsApp transport. Keeping it disabled."
|
||||
)
|
||||
return
|
||||
|
||||
# Global WhatsApp client instance
|
||||
whatsapp_client: Optional[WhatsAppClient] = None
|
||||
|
||||
@app.get(
|
||||
"/whatsapp",
|
||||
summary="Verify WhatsApp webhook",
|
||||
description="Handles WhatsApp webhook verification requests from Meta",
|
||||
)
|
||||
async def verify_webhook(request: Request):
|
||||
"""Verify WhatsApp webhook endpoint.
|
||||
|
||||
This endpoint is called by Meta's WhatsApp Business API to verify
|
||||
the webhook URL during setup. It validates the verification token
|
||||
and returns the challenge parameter if successful.
|
||||
"""
|
||||
if whatsapp_client is None:
|
||||
logger.error("WhatsApp client is not initialized")
|
||||
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||
|
||||
params = dict(request.query_params)
|
||||
logger.debug(f"Webhook verification request received with params: {list(params.keys())}")
|
||||
|
||||
try:
|
||||
result = await whatsapp_client.handle_verify_webhook_request(
|
||||
params=params, expected_verification_token=WHATSAPP_WEBHOOK_VERIFICATION_TOKEN
|
||||
)
|
||||
logger.info("Webhook verification successful")
|
||||
return result
|
||||
except ValueError as e:
|
||||
logger.warning(f"Webhook verification failed: {e}")
|
||||
raise HTTPException(status_code=403, detail="Verification failed")
|
||||
|
||||
@app.post(
|
||||
"/whatsapp",
|
||||
summary="Handle WhatsApp webhook events",
|
||||
description="Processes incoming WhatsApp messages and call events",
|
||||
)
|
||||
async def whatsapp_webhook(body: WhatsAppWebhookRequest, background_tasks: BackgroundTasks):
|
||||
"""Handle incoming WhatsApp webhook events.
|
||||
|
||||
For call events, establishes WebRTC connections and spawns bot instances
|
||||
in the background to handle real-time communication.
|
||||
"""
|
||||
if whatsapp_client is None:
|
||||
logger.error("WhatsApp client is not initialized")
|
||||
raise HTTPException(status_code=503, detail="Service unavailable")
|
||||
|
||||
# Validate webhook object type
|
||||
if body.object != "whatsapp_business_account":
|
||||
logger.warning(f"Invalid webhook object type: {body.object}")
|
||||
raise HTTPException(status_code=400, detail="Invalid object type")
|
||||
|
||||
logger.debug(f"Processing WhatsApp webhook: {body.model_dump()}")
|
||||
|
||||
async def connection_callback(connection: SmallWebRTCConnection):
|
||||
"""Handle new WebRTC connections from WhatsApp calls.
|
||||
|
||||
Called when a WebRTC connection is established for a WhatsApp call.
|
||||
Spawns a bot instance to handle the conversation.
|
||||
|
||||
Args:
|
||||
connection: The established WebRTC connection
|
||||
"""
|
||||
bot_module = _get_bot_module()
|
||||
runner_args = SmallWebRTCRunnerArguments(webrtc_connection=connection)
|
||||
background_tasks.add_task(bot_module.bot, runner_args)
|
||||
|
||||
try:
|
||||
# Process the webhook request
|
||||
result = await whatsapp_client.handle_webhook_request(body, connection_callback)
|
||||
logger.debug(f"Webhook processed successfully: {result}")
|
||||
return {"status": "success", "message": "Webhook processed successfully"}
|
||||
except ValueError as ve:
|
||||
logger.warning(f"Invalid webhook request format: {ve}")
|
||||
raise HTTPException(status_code=400, detail=f"Invalid request: {str(ve)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Internal error processing webhook: {e}")
|
||||
raise HTTPException(status_code=500, detail="Internal server error processing webhook")
|
||||
|
||||
@asynccontextmanager
|
||||
async def whatsapp_lifespan(app: FastAPI):
|
||||
"""Manage WhatsApp client lifecycle and cleanup connections."""
|
||||
nonlocal whatsapp_client
|
||||
|
||||
# Initialize WhatsApp client with persistent HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
whatsapp_client = WhatsAppClient(
|
||||
whatsapp_token=WHATSAPP_TOKEN,
|
||||
phone_number_id=WHATSAPP_PHONE_NUMBER_ID,
|
||||
session=session,
|
||||
)
|
||||
logger.info("WhatsApp client initialized successfully")
|
||||
|
||||
try:
|
||||
yield # Run the application
|
||||
finally:
|
||||
# Cleanup all active calls on shutdown
|
||||
logger.info("Cleaning up WhatsApp client resources...")
|
||||
if whatsapp_client:
|
||||
await whatsapp_client.terminate_all_calls()
|
||||
logger.info("WhatsApp cleanup completed")
|
||||
|
||||
# Add the WhatsApp lifespan to the app
|
||||
_add_lifespan_to_app(app, whatsapp_lifespan)
|
||||
|
||||
|
||||
def _setup_daily_routes(app: FastAPI):
|
||||
|
||||
@@ -99,29 +99,41 @@ async def parse_telephony_websocket(websocket: WebSocket):
|
||||
tuple: (transport_type: str, call_data: dict)
|
||||
|
||||
call_data contains provider-specific fields:
|
||||
- Twilio: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
- Telnyx: {
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
- Plivo: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
- Exotel: {
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Twilio::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"body": dict
|
||||
}
|
||||
|
||||
- Telnyx::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_control_id": str,
|
||||
"outbound_encoding": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
- Plivo::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
}
|
||||
|
||||
- Exotel::
|
||||
|
||||
{
|
||||
"stream_id": str,
|
||||
"call_id": str,
|
||||
"account_sid": str,
|
||||
"from": str,
|
||||
"to": str,
|
||||
}
|
||||
|
||||
Example usage::
|
||||
|
||||
@@ -301,6 +313,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
Returns:
|
||||
Cleaned SDP text with filtered ICE candidates.
|
||||
"""
|
||||
logger.debug("Removing unsupported ICE candidates from SDP")
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
@@ -309,7 +322,7 @@ def _smallwebrtc_sdp_cleanup_ice_candidates(text: str, pattern: str) -> str:
|
||||
result.append(line)
|
||||
else:
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
return "\r\n".join(result) + "\r\n"
|
||||
|
||||
|
||||
def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||
@@ -321,15 +334,16 @@ def _smallwebrtc_sdp_cleanup_fingerprints(text: str) -> str:
|
||||
Returns:
|
||||
SDP text with sha-384 and sha-512 fingerprints removed.
|
||||
"""
|
||||
logger.debug("Removing unsupported fingerprints from SDP")
|
||||
result = []
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
if not re.search("sha-384", line) and not re.search("sha-512", line):
|
||||
result.append(line)
|
||||
return "\r\n".join(result)
|
||||
return "\r\n".join(result) + "\r\n"
|
||||
|
||||
|
||||
def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
||||
def smallwebrtc_sdp_munging(sdp: str, host: Optional[str]) -> str:
|
||||
"""Apply SDP modifications for SmallWebRTC compatibility.
|
||||
|
||||
Args:
|
||||
@@ -340,7 +354,8 @@ def smallwebrtc_sdp_munging(sdp: str, host: str) -> str:
|
||||
Modified SDP string with fingerprint and ICE candidate cleanup.
|
||||
"""
|
||||
sdp = _smallwebrtc_sdp_cleanup_fingerprints(sdp)
|
||||
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||
if host:
|
||||
sdp = _smallwebrtc_sdp_cleanup_ice_candidates(sdp, host)
|
||||
return sdp
|
||||
|
||||
|
||||
|
||||
@@ -21,9 +21,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -121,7 +121,7 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
return None
|
||||
|
||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -148,7 +148,7 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
# Return None for unhandled frames
|
||||
|
||||
@@ -15,11 +15,12 @@ import pipecat.frames.protobufs.frames_pb2 as frame_protos
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -82,7 +83,7 @@ class ProtobufFrameSerializer(FrameSerializer):
|
||||
Serialized frame as bytes, or None if frame type is not serializable.
|
||||
"""
|
||||
# Wrapping this messages as a JSONFrame to send
|
||||
if isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
frame = MessageFrame(
|
||||
data=json.dumps(frame.message),
|
||||
)
|
||||
@@ -134,11 +135,11 @@ class ProtobufFrameSerializer(FrameSerializer):
|
||||
if "pts" in args_dict:
|
||||
del args_dict["pts"]
|
||||
|
||||
# Special handling for MessageFrame -> TransportMessageUrgentFrame
|
||||
# Special handling for MessageFrame -> OutputTransportMessageUrgentFrame
|
||||
if class_name == MessageFrame:
|
||||
try:
|
||||
msg = json.loads(args_dict["data"])
|
||||
instance = TransportMessageUrgentFrame(message=msg)
|
||||
instance = InputTransportMessageFrame(message=msg)
|
||||
logger.debug(f"ProtobufFrameSerializer: Transport message {instance}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing MessageFrame data: {e}")
|
||||
|
||||
@@ -23,9 +23,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InputDTMFFrame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
|
||||
@@ -175,7 +175,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
}
|
||||
|
||||
return json.dumps(answer)
|
||||
elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
|
||||
elif isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
return json.dumps(frame.message)
|
||||
|
||||
# Return None for unhandled frames
|
||||
|
||||
@@ -151,7 +151,7 @@ class AnthropicLLMService(LLMService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
model: str = "claude-sonnet-4-5-20250929",
|
||||
params: Optional[InputParams] = None,
|
||||
client=None,
|
||||
retry_timeout_secs: Optional[float] = 5.0,
|
||||
@@ -162,7 +162,7 @@ class AnthropicLLMService(LLMService):
|
||||
|
||||
Args:
|
||||
api_key: Anthropic API key for authentication.
|
||||
model: Model name to use. Defaults to "claude-sonnet-4-20250514".
|
||||
model: Model name to use. Defaults to "claude-sonnet-4-5-20250929".
|
||||
params: Optional model parameters for inference.
|
||||
client: Optional custom Anthropic client instance.
|
||||
retry_timeout_secs: Request timeout in seconds for retry logic.
|
||||
|
||||
@@ -61,7 +61,6 @@ from pipecat.utils.tracing.service_decorators import traced_llm
|
||||
|
||||
try:
|
||||
import aioboto3
|
||||
import httpx
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ReadTimeoutError
|
||||
except ModuleNotFoundError as e:
|
||||
@@ -1117,7 +1116,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
# also get cancelled.
|
||||
use_completion_tokens_estimate = True
|
||||
raise
|
||||
except httpx.TimeoutException:
|
||||
except (ReadTimeoutError, asyncio.TimeoutError):
|
||||
await self._call_event_handler("on_completion_timeout")
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
|
||||
@@ -429,7 +429,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
await self._finish_connecting_if_context_available()
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._disconnect()
|
||||
await self._disconnect()
|
||||
|
||||
async def _finish_connecting_if_context_available(self):
|
||||
# We can only finish connecting once we've gotten our initial context and we're ready to
|
||||
|
||||
636
src/pipecat/services/deepgram/flux/stt.py
Normal file
636
src/pipecat/services/deepgram/flux/stt.py
Normal file
@@ -0,0 +1,636 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Deepgram Flux speech-to-text service implementation."""
|
||||
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Any, AsyncGenerator, Dict, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Deepgram Flux, you need to `pip install pipecat-ai[deepgram]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class FluxMessageType(str, Enum):
|
||||
"""Deepgram Flux WebSocket message types.
|
||||
|
||||
These are the top-level message types that can be received from the
|
||||
Deepgram Flux WebSocket connection.
|
||||
"""
|
||||
|
||||
RECEIVE_CONNECTED = "Connected"
|
||||
RECEIVE_FATAL_ERROR = "Error"
|
||||
TURN_INFO = "TurnInfo"
|
||||
|
||||
|
||||
class FluxEventType(str, Enum):
|
||||
"""Deepgram Flux TurnInfo event types.
|
||||
|
||||
These events are contained within TurnInfo messages and indicate
|
||||
different stages of speech processing and turn detection.
|
||||
"""
|
||||
|
||||
START_OF_TURN = "StartOfTurn"
|
||||
TURN_RESUMED = "TurnResumed"
|
||||
END_OF_TURN = "EndOfTurn"
|
||||
EAGER_END_OF_TURN = "EagerEndOfTurn"
|
||||
UPDATE = "Update"
|
||||
|
||||
|
||||
class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
"""Deepgram Flux speech-to-text service.
|
||||
|
||||
Provides real-time speech recognition using Deepgram's WebSocket API with Flux capabilities.
|
||||
Supports configurable models, VAD events, and various audio processing options
|
||||
including advanced turn detection and EagerEndOfTurn events for improved conversational AI performance.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for Deepgram Flux API.
|
||||
|
||||
This class defines all available connection parameters for the Deepgram Flux API
|
||||
based on the official documentation.
|
||||
|
||||
Parameters:
|
||||
eager_eot_threshold: Optional. EagerEndOfTurn/TurnResumed are off by default.
|
||||
You can turn them on by setting eager_eot_threshold to a valid value.
|
||||
Lower values = more aggressive EagerEndOfTurning (faster response, more LLM calls).
|
||||
Higher values = more conservative EagerEndOfTurning (slower response, fewer LLM calls).
|
||||
eot_threshold: Optional. End-of-turn confidence required to finish a turn (default 0.7).
|
||||
Lower values = turns end sooner (more interruptions, faster responses).
|
||||
Higher values = turns end later (fewer interruptions, more complete utterances).
|
||||
eot_timeout_ms: Optional. Time in milliseconds after speech to finish a turn
|
||||
regardless of EOT confidence (default 5000).
|
||||
keyterm: List of keyterms to boost recognition accuracy for specialized terminology.
|
||||
mip_opt_out: Optional. Opts out requests from the Deepgram Model Improvement Program
|
||||
(default False).
|
||||
tag: List of tags to label requests for identification during usage reporting.
|
||||
"""
|
||||
|
||||
eager_eot_threshold: Optional[float] = None
|
||||
eot_threshold: Optional[float] = None
|
||||
eot_timeout_ms: Optional[int] = None
|
||||
keyterm: list = []
|
||||
mip_opt_out: Optional[bool] = None
|
||||
tag: list = []
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
url: str = "wss://api.deepgram.com/v2/listen",
|
||||
sample_rate: Optional[int] = None,
|
||||
model: str = "flux-general-en",
|
||||
flux_encoding: str = "linear16",
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
api_key: Deepgram API key for authentication. Required for API access.
|
||||
url: WebSocket URL for the Deepgram Flux API. Defaults to the preview endpoint.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses the rate from params or 16000.
|
||||
model: Deepgram Flux model to use for transcription. Currently only supports "flux-general-en".
|
||||
flux_encoding: Audio encoding format required by Flux API. Must be "linear16".
|
||||
Raw signed little-endian 16-bit PCM encoding.
|
||||
params: InputParams instance containing detailed API configuration options.
|
||||
If None, default parameters will be used.
|
||||
**kwargs: Additional arguments passed to the parent WebsocketSTTService class.
|
||||
|
||||
Examples:
|
||||
Basic usage with default parameters::
|
||||
|
||||
stt = DeepgramFluxSTTService(api_key="your-api-key")
|
||||
|
||||
Advanced usage with custom parameters::
|
||||
|
||||
params = DeepgramFluxSTTService.InputParams(
|
||||
eager_eot_threshold=0.5,
|
||||
eot_threshold=0.8,
|
||||
keyterm=["AI", "machine learning", "neural network"],
|
||||
tag=["production", "voice-agent"]
|
||||
)
|
||||
stt = DeepgramFluxSTTService(
|
||||
api_key="your-api-key",
|
||||
model="flux-general-en",
|
||||
params=params
|
||||
)
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._url = url
|
||||
self._model = model
|
||||
self._params = params or DeepgramFluxSTTService.InputParams()
|
||||
self._flux_encoding = flux_encoding
|
||||
# This is the currently only supported language
|
||||
self._language = Language.EN
|
||||
self._websocket_url = None
|
||||
self._receive_task = None
|
||||
|
||||
async def _connect(self):
|
||||
"""Connect to WebSocket and start background tasks.
|
||||
|
||||
Establishes the WebSocket connection to the Deepgram Flux API and starts
|
||||
the background task for receiving transcription results.
|
||||
"""
|
||||
await self._connect_websocket()
|
||||
|
||||
if self._websocket and not self._receive_task:
|
||||
self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
|
||||
|
||||
async def _disconnect(self):
|
||||
"""Disconnect from WebSocket and clean up tasks.
|
||||
|
||||
Gracefully disconnects from the Deepgram Flux API, cancels background tasks,
|
||||
and cleans up resources to prevent memory leaks.
|
||||
"""
|
||||
try:
|
||||
# Cancel background tasks BEFORE closing websocket
|
||||
if self._receive_task:
|
||||
await self.cancel_task(self._receive_task, timeout=2.0)
|
||||
self._receive_task = None
|
||||
|
||||
# Now close the websocket
|
||||
await self._disconnect_websocket()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during disconnect: {e}")
|
||||
finally:
|
||||
# Reset state only after everything is cleaned up
|
||||
self._websocket = None
|
||||
|
||||
async def _connect_websocket(self):
|
||||
"""Establish WebSocket connection to API.
|
||||
|
||||
Creates a WebSocket connection to the Deepgram Flux API using the configured
|
||||
URL and authentication headers. Handles connection errors and reports them
|
||||
through the event handler system.
|
||||
"""
|
||||
try:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
|
||||
self._websocket = await websocket_connect(
|
||||
self._websocket_url,
|
||||
additional_headers={"Authorization": f"Token {self._api_key}"},
|
||||
)
|
||||
logger.debug("Connected to Deepgram Flux Websocket")
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
self._websocket = None
|
||||
await self._call_event_handler("on_connection_error", f"{e}")
|
||||
|
||||
async def _disconnect_websocket(self):
|
||||
"""Close WebSocket connection and clean up state.
|
||||
|
||||
Closes the WebSocket connection to the Deepgram Flux API and stops all
|
||||
metrics collection. Handles disconnection errors gracefully.
|
||||
"""
|
||||
try:
|
||||
await self.stop_all_metrics()
|
||||
|
||||
if self._websocket:
|
||||
await self._send_close_stream()
|
||||
logger.debug("Disconnecting from Deepgram Flux Websocket")
|
||||
await self._websocket.close()
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error closing websocket: {e}")
|
||||
|
||||
async def _send_close_stream(self) -> None:
|
||||
"""Sends a CloseStream control message to the Deepgram Flux WebSocket API.
|
||||
|
||||
This signals to the server that no more audio data will be sent.
|
||||
"""
|
||||
if self._websocket:
|
||||
logger.debug("Sending CloseStream message to Deepgram Flux")
|
||||
message = {"type": "CloseStream"}
|
||||
await self._websocket.send(json.dumps(message))
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as Deepgram service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the Deepgram Flux STT service.
|
||||
|
||||
Initializes the service by constructing the WebSocket URL with all configured
|
||||
parameters and establishing the connection to begin transcription processing.
|
||||
|
||||
Args:
|
||||
frame: The start frame containing initialization parameters and metadata.
|
||||
"""
|
||||
await super().start(frame)
|
||||
|
||||
url_params = [
|
||||
f"model={self._model}",
|
||||
f"sample_rate={self.sample_rate}",
|
||||
f"encoding={self._flux_encoding}",
|
||||
]
|
||||
|
||||
if self._params.eager_eot_threshold is not None:
|
||||
url_params.append(f"eager_eot_threshold={self._params.eager_eot_threshold}")
|
||||
|
||||
if self._params.eot_threshold is not None:
|
||||
url_params.append(f"eot_threshold={self._params.eot_threshold}")
|
||||
|
||||
if self._params.eot_timeout_ms is not None:
|
||||
url_params.append(f"eot_timeout_ms={self._params.eot_timeout_ms}")
|
||||
|
||||
if self._params.mip_opt_out is not None:
|
||||
url_params.append(f"mip_opt_out={str(self._params.mip_opt_out).lower()}")
|
||||
|
||||
# Add keyterm parameters (can have multiple)
|
||||
for keyterm in self._params.keyterm:
|
||||
url_params.append(f"keyterm={keyterm}")
|
||||
|
||||
# Add tag parameters (can have multiple)
|
||||
for tag_value in self._params.tag:
|
||||
url_params.append(f"tag={tag_value}")
|
||||
|
||||
self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
|
||||
await self._connect()
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
frame: The end frame.
|
||||
"""
|
||||
await super().stop(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the Deepgram Flux STT service.
|
||||
|
||||
Args:
|
||||
frame: The cancel frame.
|
||||
"""
|
||||
await super().cancel(frame)
|
||||
await self._disconnect()
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Send audio data to Deepgram Flux for transcription.
|
||||
|
||||
Transmits raw audio bytes to the Deepgram Flux API for real-time speech
|
||||
recognition. Transcription results are received asynchronously through
|
||||
WebSocket callbacks and processed in the background.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes in linear16 format (signed little-endian 16-bit PCM).
|
||||
|
||||
Yields:
|
||||
Frame: None (transcription results are delivered via WebSocket callbacks
|
||||
rather than as return values from this method).
|
||||
|
||||
Raises:
|
||||
Exception: If the WebSocket connection is not established or if there
|
||||
are issues sending the audio data.
|
||||
"""
|
||||
if not self._websocket:
|
||||
logger.error("Not connected to Deepgram Flux.")
|
||||
yield ErrorFrame("Not connected to Deepgram Flux.", fatal=True)
|
||||
return
|
||||
|
||||
try:
|
||||
await self._websocket.send(audio)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send audio to Flux: {e}")
|
||||
yield ErrorFrame(f"Failed to send audio to Flux: {e}")
|
||||
return
|
||||
|
||||
yield None
|
||||
|
||||
async def start_metrics(self):
|
||||
"""Start TTFB and processing metrics collection."""
|
||||
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
|
||||
# Ideally, TTFB should measure the time from when a user starts speaking
|
||||
# until we receive the first transcript. However, Deepgram Flux delivers
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.start_ttfb_metrics()
|
||||
await self.start_processing_metrics()
|
||||
|
||||
@traced_stt
|
||||
async def _handle_transcription(
|
||||
self, transcript: str, is_final: bool, language: Optional[Language] = None
|
||||
):
|
||||
"""Handle a transcription result with tracing."""
|
||||
pass
|
||||
|
||||
def _get_websocket(self):
|
||||
"""Get the current WebSocket connection.
|
||||
|
||||
Returns the active WebSocket connection instance, raising an exception
|
||||
if no connection is currently established.
|
||||
|
||||
Returns:
|
||||
The active WebSocket connection instance.
|
||||
|
||||
Raises:
|
||||
Exception: If no WebSocket connection is currently active.
|
||||
"""
|
||||
if self._websocket:
|
||||
return self._websocket
|
||||
raise Exception("Websocket not connected")
|
||||
|
||||
def _validate_message(self, data: Dict[str, Any]) -> bool:
|
||||
"""Validate basic message structure from Deepgram Flux.
|
||||
|
||||
Ensures the received message has the expected structure before processing.
|
||||
|
||||
Args:
|
||||
data: The parsed JSON message data to validate.
|
||||
|
||||
Returns:
|
||||
True if the message structure is valid, False otherwise.
|
||||
"""
|
||||
if not isinstance(data, dict):
|
||||
logger.warning("Message is not a dictionary")
|
||||
return False
|
||||
|
||||
if "type" not in data:
|
||||
logger.warning("Message missing 'type' field")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
async def _receive_messages(self):
|
||||
"""Receive and process messages from WebSocket.
|
||||
|
||||
Continuously receives messages from the Deepgram Flux WebSocket connection
|
||||
and processes various message types including connection status, transcription
|
||||
results, turn information, and error conditions. Handles different event types
|
||||
such as StartOfTurn, EndOfTurn, EagerEndOfTurn, and Update events.
|
||||
"""
|
||||
async for message in self._get_websocket():
|
||||
if isinstance(message, str):
|
||||
try:
|
||||
data = json.loads(message)
|
||||
await self._handle_message(data)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to decode JSON message: {e}")
|
||||
# Skip malformed messages
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message: {e}")
|
||||
# Error will be handled inside WebsocketService->_receive_task_handler
|
||||
raise
|
||||
else:
|
||||
logger.warning(f"Received non-string message: {type(message)}")
|
||||
|
||||
async def _handle_message(self, data: Dict[str, Any]):
|
||||
"""Handle a parsed WebSocket message from Deepgram Flux.
|
||||
|
||||
Routes messages to appropriate handlers based on their type. Validates
|
||||
message structure before processing.
|
||||
|
||||
Args:
|
||||
data: The parsed JSON message data from the WebSocket.
|
||||
"""
|
||||
if not self._validate_message(data):
|
||||
return
|
||||
|
||||
message_type = data.get("type")
|
||||
|
||||
try:
|
||||
flux_message_type = FluxMessageType(message_type)
|
||||
except ValueError:
|
||||
logger.debug(f"Unhandled message type: {message_type or 'unknown'}")
|
||||
return
|
||||
|
||||
match flux_message_type:
|
||||
case FluxMessageType.RECEIVE_CONNECTED:
|
||||
await self._handle_connection_established()
|
||||
case FluxMessageType.RECEIVE_FATAL_ERROR:
|
||||
await self._handle_fatal_error(data)
|
||||
case FluxMessageType.TURN_INFO:
|
||||
await self._handle_turn_info(data)
|
||||
|
||||
async def _handle_connection_established(self):
|
||||
"""Handle successful connection establishment to Deepgram Flux.
|
||||
|
||||
This event is fired when the WebSocket connection to Deepgram Flux
|
||||
is successfully established and ready to receive audio data for
|
||||
transcription processing.
|
||||
"""
|
||||
logger.info("Connected to Flux - ready to stream audio")
|
||||
|
||||
async def _handle_fatal_error(self, data: Dict[str, Any]):
|
||||
"""Handle fatal error messages from Deepgram Flux.
|
||||
|
||||
Fatal errors indicate unrecoverable issues with the connection or
|
||||
configuration that require intervention. These errors will cause
|
||||
the connection to be terminated.
|
||||
|
||||
Args:
|
||||
data: The error message data containing error details.
|
||||
|
||||
Raises:
|
||||
Exception: Always raises to trigger error handling in the parent service.
|
||||
"""
|
||||
error_msg = data.get("error", "Unknown error")
|
||||
deepgram_error = f"Fatal error: {error_msg}"
|
||||
logger.error(deepgram_error)
|
||||
# Error will be handled inside WebsocketService->_receive_task_handler
|
||||
raise Exception(deepgram_error)
|
||||
|
||||
async def _handle_turn_info(self, data: Dict[str, Any]):
|
||||
"""Handle TurnInfo events from Deepgram Flux.
|
||||
|
||||
TurnInfo messages contain various turn-based events that indicate
|
||||
the state of speech processing, including turn boundaries, interim
|
||||
results, and turn finalization events.
|
||||
|
||||
Args:
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
event = data.get("event")
|
||||
transcript = data.get("transcript", "")
|
||||
|
||||
try:
|
||||
flux_event_type = FluxEventType(event)
|
||||
except ValueError:
|
||||
logger.debug(f"Unhandled TurnInfo event: {event}")
|
||||
return
|
||||
|
||||
match flux_event_type:
|
||||
case FluxEventType.START_OF_TURN:
|
||||
await self._handle_start_of_turn(transcript)
|
||||
case FluxEventType.TURN_RESUMED:
|
||||
await self._handle_turn_resumed(event)
|
||||
case FluxEventType.END_OF_TURN:
|
||||
await self._handle_end_of_turn(transcript, data)
|
||||
case FluxEventType.EAGER_END_OF_TURN:
|
||||
await self._handle_eager_end_of_turn(transcript, data)
|
||||
case FluxEventType.UPDATE:
|
||||
await self._handle_update(transcript)
|
||||
|
||||
async def _handle_start_of_turn(self, transcript: str):
|
||||
"""Handle StartOfTurn events from Deepgram Flux.
|
||||
|
||||
StartOfTurn events are fired when Deepgram Flux detects the beginning
|
||||
of a new speaking turn. This triggers bot interruption to stop any
|
||||
ongoing speech synthesis and signals the start of user speech detection.
|
||||
|
||||
The service will:
|
||||
- Send a BotInterruptionFrame upstream to stop bot speech
|
||||
- Send a UserStartedSpeakingFrame downstream to notify other components
|
||||
- Start metrics collection for measuring response times
|
||||
|
||||
Args:
|
||||
transcript: maybe the first few words of the turn.
|
||||
"""
|
||||
logger.debug("User started speaking")
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
await self.start_metrics()
|
||||
if transcript:
|
||||
logger.trace(f"Start of turn transcript: {transcript}")
|
||||
|
||||
async def _handle_turn_resumed(self, event: str):
|
||||
"""Handle TurnResumed events from Deepgram Flux.
|
||||
|
||||
TurnResumed events indicate that speech has resumed after a brief pause
|
||||
within the same turn. This is primarily used for logging and debugging
|
||||
purposes and doesn't trigger any significant processing changes.
|
||||
|
||||
Args:
|
||||
event: The event type string for logging purposes.
|
||||
"""
|
||||
logger.trace(f"Received event TurnResumed: {event}")
|
||||
|
||||
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EndOfTurn events from Deepgram Flux.
|
||||
|
||||
EndOfTurn events are fired when Deepgram Flux determines that a speaking
|
||||
turn has concluded, either due to sufficient silence or end-of-turn
|
||||
confidence thresholds being met. This provides the final transcript
|
||||
for the completed turn.
|
||||
|
||||
The service will:
|
||||
- Create and send a final TranscriptionFrame with the complete transcript
|
||||
- Trigger transcription handling with tracing for metrics
|
||||
- Stop processing metrics collection
|
||||
- Send a UserStoppedSpeakingFrame to signal turn completion
|
||||
|
||||
Args:
|
||||
transcript: The final transcript text for the completed turn.
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
logger.debug("User stopped speaking")
|
||||
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
transcript,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
self._language,
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
await self._handle_transcription(transcript, True, self._language)
|
||||
await self.stop_processing_metrics()
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
||||
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
||||
"""Handle EagerEndOfTurn events from Deepgram Flux.
|
||||
|
||||
EagerEndOfTurn events are fired when the end-of-turn confidence reaches the
|
||||
EagerEndOfTurn threshold but hasn't yet reached the full end-of-turn threshold.
|
||||
These provide interim transcripts that can be used for faster response
|
||||
generation while still allowing the user to continue speaking.
|
||||
|
||||
EagerEndOfTurn events enable more responsive conversational AI by allowing
|
||||
the LLM to start processing likely final transcripts before the turn
|
||||
is definitively ended.
|
||||
|
||||
Args:
|
||||
transcript: The interim transcript text that triggered the EagerEndOfTurn event.
|
||||
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
||||
"""
|
||||
logger.trace(f"EagerEndOfTurn - {transcript}")
|
||||
# Deepgram's EagerEndOfTurn feature enables lower-latency voice agents by sending
|
||||
# medium-confidence transcripts before EndOfTurn certainty, allowing LLM processing to
|
||||
# begin early.
|
||||
#
|
||||
# However, if speech resumes or the transcripts differ from the final EndOfTurn, the
|
||||
# EagerEndOfTurn response should be cancelled to avoid incorrect or partial responses.
|
||||
#
|
||||
# Pipecat doesn't yet provide built-in Gate/control mechanisms to:
|
||||
# 1. Start LLM/TTS processing early on EagerEndOfTurn events
|
||||
# 2. Cancel in-flight processing when TurnResumed occurs
|
||||
#
|
||||
# By pushing EagerEndOfTurn transcripts as InterimTranscriptionFrame, we enable
|
||||
# developers to implement custom EagerEndOfTurn handling in their applications while
|
||||
# maintaining compatibility with existing interim transcription workflows.
|
||||
#
|
||||
# TODO: Implement proper EagerEndOfTurn support with cancellable processing pipeline
|
||||
# that can start response generation on EagerEndOfTurn and cancel or confirm it.
|
||||
await self.push_frame(
|
||||
InterimTranscriptionFrame(
|
||||
transcript,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
self._language,
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_update(self, transcript: str):
|
||||
"""Handle Update events from Deepgram Flux.
|
||||
|
||||
Update events provide incremental transcript updates during an ongoing
|
||||
turn. These events allow for real-time display of transcription progress
|
||||
and can be used to provide visual feedback to users about what's being
|
||||
recognized.
|
||||
|
||||
The service stops TTFB (Time To First Byte) metrics when the first
|
||||
substantial update is received, indicating successful processing start.
|
||||
|
||||
Args:
|
||||
transcript: The current partial transcript text for the ongoing turn.
|
||||
"""
|
||||
if transcript:
|
||||
logger.trace(f"Update event: {transcript}")
|
||||
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
|
||||
# Ideally, TTFB should measure the time from when a user starts speaking
|
||||
# until we receive the first transcript. However, Deepgram Flux delivers
|
||||
# both the "user started speaking" event and the first transcript simultaneously,
|
||||
# making this timing measurement meaningless in this context.
|
||||
# await self.stop_ttfb_metrics()
|
||||
@@ -337,10 +337,16 @@ class BaseOpenAILLMService(LLMService):
|
||||
|
||||
async for chunk in chunk_stream:
|
||||
if chunk.usage:
|
||||
cached_tokens = (
|
||||
chunk.usage.prompt_tokens_details.cached_tokens
|
||||
if chunk.usage.prompt_tokens_details
|
||||
else None
|
||||
)
|
||||
tokens = LLMTokenUsage(
|
||||
prompt_tokens=chunk.usage.prompt_tokens,
|
||||
completion_tokens=chunk.usage.completion_tokens,
|
||||
total_tokens=chunk.usage.total_tokens,
|
||||
cache_read_input_tokens=cached_tokens,
|
||||
)
|
||||
await self.start_llm_usage_metrics(tokens)
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
|
||||
from pipecat.frames.frames import DataFrame, FunctionCallResultFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pipecat.services.openai_realtime_beta.context import OpenAIRealtimeLLMContext
|
||||
from pipecat.services.openai_realtime.context import OpenAIRealtimeLLMContext
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -15,6 +15,7 @@ from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
STTMuteFrame,
|
||||
@@ -24,6 +25,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_service import AIService
|
||||
from pipecat.services.websocket_service import WebsocketService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
|
||||
@@ -283,3 +285,35 @@ class SegmentedSTTService(STTService):
|
||||
if not self._user_speaking and len(self._audio_buffer) > self._audio_buffer_size_1s:
|
||||
discarded = len(self._audio_buffer) - self._audio_buffer_size_1s
|
||||
self._audio_buffer = self._audio_buffer[discarded:]
|
||||
|
||||
|
||||
class WebsocketSTTService(STTService, WebsocketService):
|
||||
"""Base class for websocket-based STT services.
|
||||
|
||||
Combines STT functionality with websocket connectivity, providing automatic
|
||||
error handling and reconnection capabilities.
|
||||
|
||||
Event handlers:
|
||||
on_connection_error: Called when a websocket connection error occurs.
|
||||
|
||||
Example::
|
||||
|
||||
@stt.event_handler("on_connection_error")
|
||||
async def on_connection_error(stt: STTService, error: str):
|
||||
logger.error(f"STT connection error: {error}")
|
||||
"""
|
||||
|
||||
def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
|
||||
"""Initialize the Websocket STT service.
|
||||
|
||||
Args:
|
||||
reconnect_on_error: Whether to automatically reconnect on websocket errors.
|
||||
**kwargs: Additional arguments passed to parent classes.
|
||||
"""
|
||||
STTService.__init__(self, **kwargs)
|
||||
WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
|
||||
self._register_event_handler("on_connection_error")
|
||||
|
||||
async def _report_error(self, error: ErrorFrame):
|
||||
await self._call_event_handler("on_connection_error", error.error)
|
||||
await self.push_error(error)
|
||||
|
||||
@@ -11,7 +11,6 @@ input processing, including VAD, turn analysis, and interruption management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
@@ -79,10 +78,6 @@ class BaseInputTransport(FrameProcessor):
|
||||
# Track user speaking state for interruption logic
|
||||
self._user_speaking = False
|
||||
|
||||
# We read audio from a single queue one at a time and we then run VAD in
|
||||
# a thread. Therefore, only one thread should be necessary.
|
||||
self._executor = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
# Task to process incoming audio (VAD) and push audio frames downstream
|
||||
# if passthrough is enabled.
|
||||
self._audio_task = None
|
||||
@@ -398,9 +393,7 @@ class BaseInputTransport(FrameProcessor):
|
||||
"""Analyze audio frame for voice activity."""
|
||||
state = VADState.QUIET
|
||||
if self.vad_analyzer:
|
||||
state = await self.get_event_loop().run_in_executor(
|
||||
self._executor, self.vad_analyzer.analyze_audio, audio_frame.audio
|
||||
)
|
||||
state = await self.vad_analyzer.analyze_audio(audio_frame.audio)
|
||||
return state
|
||||
|
||||
async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState) -> VADState:
|
||||
|
||||
@@ -29,20 +29,19 @@ from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputTransportMessageUrgentFrame,
|
||||
InterruptionFrame,
|
||||
MixerControlFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputDTMFFrame,
|
||||
OutputDTMFUrgentFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
OutputTransportReadyFrame,
|
||||
SpeechOutputAudioRawFrame,
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
SystemFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
TTSAudioRawFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
@@ -178,7 +177,9 @@ class BaseOutputTransport(FrameProcessor):
|
||||
# Sending a frame indicating that the output transport is ready and able to receive frames.
|
||||
await self.push_frame(OutputTransportReadyFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message.
|
||||
|
||||
Args:
|
||||
@@ -307,9 +308,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
elif isinstance(frame, InterruptionFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
await self._handle_frame(frame)
|
||||
elif isinstance(frame, TransportMessageUrgentFrame) and not isinstance(
|
||||
frame, InputTransportMessageUrgentFrame
|
||||
):
|
||||
elif isinstance(frame, OutputTransportMessageUrgentFrame):
|
||||
await self.send_message(frame)
|
||||
elif isinstance(frame, OutputDTMFUrgentFrame):
|
||||
await self.write_dtmf(frame)
|
||||
@@ -646,7 +645,7 @@ class BaseOutputTransport(FrameProcessor):
|
||||
await self._set_video_image(frame)
|
||||
elif isinstance(frame, SpriteFrame):
|
||||
await self._set_video_images(frame.images)
|
||||
elif isinstance(frame, TransportMessageFrame):
|
||||
elif isinstance(frame, OutputTransportMessageFrame):
|
||||
await self._transport.send_message(frame)
|
||||
elif isinstance(frame, OutputDTMFFrame):
|
||||
await self._transport.write_dtmf(frame)
|
||||
|
||||
@@ -30,15 +30,15 @@ from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageUrgentFrame,
|
||||
InputTransportMessageFrame,
|
||||
InterimTranscriptionFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
TranscriptionFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
UserAudioRawFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
@@ -74,7 +74,7 @@ VAD_RESET_PERIOD_MS = 2000
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyTransportMessageFrame(TransportMessageFrame):
|
||||
class DailyOutputTransportMessageFrame(OutputTransportMessageFrame):
|
||||
"""Frame for transport messages in Daily calls.
|
||||
|
||||
Parameters:
|
||||
@@ -85,7 +85,7 @@ class DailyTransportMessageFrame(TransportMessageFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
class DailyOutputTransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages in Daily calls.
|
||||
|
||||
Parameters:
|
||||
@@ -96,7 +96,59 @@ class DailyTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyInputTransportMessageUrgentFrame(InputTransportMessageUrgentFrame):
|
||||
class DailyTransportMessageFrame(DailyOutputTransportMessageFrame):
|
||||
"""Frame for transport messages in Daily calls.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `DailyOutputTransportMessageFrame`.
|
||||
|
||||
Parameters:
|
||||
participant_id: Optional ID of the participant this message is for/from.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"DailyTransportMessageFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use DailyOutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyTransportMessageUrgentFrame(DailyOutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages in Daily calls.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `DailyOutputTransportMessageUrgentFrame`.
|
||||
|
||||
Parameters:
|
||||
participant_id: Optional ID of the participant this message is for/from.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"DailyTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use DailyOutputTransportMessageUrgentFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyInputTransportMessageFrame(InputTransportMessageFrame):
|
||||
"""Frame for input urgent transport messages in Daily calls.
|
||||
|
||||
Parameters:
|
||||
@@ -106,16 +158,61 @@ class DailyInputTransportMessageUrgentFrame(InputTransportMessageUrgentFrame):
|
||||
participant_id: Optional[str] = None
|
||||
|
||||
|
||||
class DailyInputTransportMessageUrgentFrame(DailyInputTransportMessageFrame):
|
||||
"""Frame for input urgent transport messages in Daily calls.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
This frame is deprecated and will be removed in a future version.
|
||||
Instead, use `DailyInputTransportMessageFrame`.
|
||||
|
||||
Parameters:
|
||||
participant_id: Optional ID of the participant this message is for/from.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"DailyInputTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use DailyInputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DailyUpdateRemoteParticipantsFrame(ControlFrame):
|
||||
"""Frame to update remote participants in Daily calls.
|
||||
|
||||
.. deprecated:: 0.0.87
|
||||
`DailyUpdateRemoteParticipantsFrame` is deprecated and will be removed in a future version.
|
||||
Create your own custom frame and use a custom processor to handle it or use, for example,
|
||||
`on_after_push_frame` event instead in the output transport.
|
||||
|
||||
Parameters:
|
||||
remote_participants: See https://reference-python.daily.co/api_reference.html#daily.CallClient.update_remote_participants.
|
||||
"""
|
||||
|
||||
remote_participants: Mapping[str, Any] = None
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"DailyUpdateRemoteParticipantsFrame is deprecated and will be removed in a future version."
|
||||
"Instead, create your own custom frame and handle it in the "
|
||||
'`@transport.output().event_handler("on_after_push_frame")` event handler or a '
|
||||
"custom processor.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
class WebRTCVADAnalyzer(VADAnalyzer):
|
||||
"""Voice Activity Detection analyzer using WebRTC.
|
||||
@@ -454,7 +551,9 @@ class DailyTransportClient(EventHandler):
|
||||
"""
|
||||
return self._out_sample_rate
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send an application message to participants.
|
||||
|
||||
Args:
|
||||
@@ -464,7 +563,9 @@ class DailyTransportClient(EventHandler):
|
||||
return
|
||||
|
||||
participant_id = None
|
||||
if isinstance(frame, (DailyTransportMessageFrame, DailyTransportMessageUrgentFrame)):
|
||||
if isinstance(
|
||||
frame, (DailyOutputTransportMessageFrame, DailyOutputTransportMessageUrgentFrame)
|
||||
):
|
||||
participant_id = frame.participant_id
|
||||
|
||||
future = self._get_event_loop().create_future()
|
||||
@@ -1601,7 +1702,7 @@ class DailyInputTransport(BaseInputTransport):
|
||||
message: The message data to send.
|
||||
sender: ID of the message sender.
|
||||
"""
|
||||
frame = DailyInputTransportMessageUrgentFrame(message=message, participant_id=sender)
|
||||
frame = DailyInputTransportMessageFrame(message=message, participant_id=sender)
|
||||
await self.push_frame(frame)
|
||||
|
||||
#
|
||||
@@ -1823,7 +1924,9 @@ class DailyOutputTransport(BaseOutputTransport):
|
||||
if isinstance(frame, DailyUpdateRemoteParticipantsFrame):
|
||||
await self._client.update_remote_participants(frame.remote_participants)
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message to participants.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -29,9 +29,9 @@ from pipecat.frames.frames import (
|
||||
OutputAudioRawFrame,
|
||||
OutputDTMFFrame,
|
||||
OutputDTMFUrgentFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
UserAudioRawFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
@@ -68,7 +68,7 @@ DTMF_CODE_MAP = {
|
||||
|
||||
|
||||
@dataclass
|
||||
class LiveKitTransportMessageFrame(TransportMessageFrame):
|
||||
class LiveKitOutputTransportMessageFrame(OutputTransportMessageFrame):
|
||||
"""Frame for transport messages in LiveKit rooms.
|
||||
|
||||
Parameters:
|
||||
@@ -79,7 +79,7 @@ class LiveKitTransportMessageFrame(TransportMessageFrame):
|
||||
|
||||
|
||||
@dataclass
|
||||
class LiveKitTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
class LiveKitOutputTransportMessageUrgentFrame(OutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages in LiveKit rooms.
|
||||
|
||||
Parameters:
|
||||
@@ -89,6 +89,50 @@ class LiveKitTransportMessageUrgentFrame(TransportMessageUrgentFrame):
|
||||
participant_id: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class LiveKitTransportMessageFrame(LiveKitOutputTransportMessageFrame):
|
||||
"""Frame for transport messages in LiveKit rooms.
|
||||
|
||||
Parameters:
|
||||
participant_id: Optional ID of the participant this message is for/from.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"LiveKitTransportMessageFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use LiveKitOutputTransportMessageFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LiveKitTransportMessageUrgentFrame(LiveKitOutputTransportMessageUrgentFrame):
|
||||
"""Frame for urgent transport messages in LiveKit rooms.
|
||||
|
||||
Parameters:
|
||||
participant_id: Optional ID of the participant this message is for/from.
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"LiveKitTransportMessageUrgentFrame is deprecated and will be removed in a future version. "
|
||||
"Instead, use LiveKitOutputTransportMessageUrgentFrame.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
class LiveKitParams(TransportParams):
|
||||
"""Configuration parameters for LiveKit transport.
|
||||
|
||||
@@ -310,10 +354,10 @@ class LiveKitTransportClient:
|
||||
logger.error(f"Error sending data: {e}")
|
||||
|
||||
async def send_dtmf(self, digit: str):
|
||||
"""Send DTMF tone to the room.
|
||||
r"""Send DTMF tone to the room.
|
||||
|
||||
Args:
|
||||
digit: The DTMF digit to send (0-9, *, #).
|
||||
digit: The DTMF digit to send (0-9, \*, #).
|
||||
"""
|
||||
if not self._connected:
|
||||
return
|
||||
@@ -677,7 +721,7 @@ class LiveKitInputTransport(BaseInputTransport):
|
||||
message: The message data to send.
|
||||
sender: ID of the message sender.
|
||||
"""
|
||||
frame = LiveKitTransportMessageUrgentFrame(message=message, participant_id=sender)
|
||||
frame = LiveKitOutputTransportMessageUrgentFrame(message=message, participant_id=sender)
|
||||
await self.push_frame(frame)
|
||||
|
||||
async def _audio_in_task_handler(self):
|
||||
@@ -836,7 +880,9 @@ class LiveKitOutputTransport(BaseOutputTransport):
|
||||
await super().cleanup()
|
||||
await self._transport.cleanup()
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message to participants.
|
||||
|
||||
Args:
|
||||
@@ -846,7 +892,9 @@ class LiveKitOutputTransport(BaseOutputTransport):
|
||||
if isinstance(message, dict):
|
||||
# fix message encoding for dict-like messages, e.g. RTVI messages.
|
||||
message = json.dumps(message, ensure_ascii=False)
|
||||
if isinstance(frame, (LiveKitTransportMessageFrame, LiveKitTransportMessageUrgentFrame)):
|
||||
if isinstance(
|
||||
frame, (LiveKitOutputTransportMessageFrame, LiveKitOutputTransportMessageUrgentFrame)
|
||||
):
|
||||
await self._client.send_data(message.encode(), frame.participant_id)
|
||||
else:
|
||||
await self._client.send_data(message.encode())
|
||||
@@ -1105,7 +1153,9 @@ class LiveKitTransport(BaseTransport):
|
||||
participant_id: Optional specific participant to send to.
|
||||
"""
|
||||
if self._output:
|
||||
frame = LiveKitTransportMessageFrame(message=message, participant_id=participant_id)
|
||||
frame = LiveKitOutputTransportMessageFrame(
|
||||
message=message, participant_id=participant_id
|
||||
)
|
||||
await self._output.send_message(frame)
|
||||
|
||||
async def send_message_urgent(self, message: str, participant_id: Optional[str] = None):
|
||||
@@ -1116,7 +1166,7 @@ class LiveKitTransport(BaseTransport):
|
||||
participant_id: Optional specific participant to send to.
|
||||
"""
|
||||
if self._output:
|
||||
frame = LiveKitTransportMessageUrgentFrame(
|
||||
frame = LiveKitOutputTransportMessageUrgentFrame(
|
||||
message=message, participant_id=participant_id
|
||||
)
|
||||
await self._output.send_message(frame)
|
||||
|
||||
@@ -283,7 +283,6 @@ class SmallWebRTCConnection(BaseObject):
|
||||
self._data_channel = None
|
||||
self._renegotiation_in_progress = False
|
||||
self._last_received_time = None
|
||||
self._message_queue = []
|
||||
self._pending_app_messages = []
|
||||
self._connecting_timeout_task = None
|
||||
|
||||
@@ -297,10 +296,7 @@ class SmallWebRTCConnection(BaseObject):
|
||||
# Flush queued messages once the data channel is open
|
||||
@channel.on("open")
|
||||
async def on_open():
|
||||
logger.debug("Data channel is open, flushing queued messages")
|
||||
while self._message_queue:
|
||||
message = self._message_queue.pop(0)
|
||||
self._data_channel.send(message)
|
||||
logger.debug("Data channel is open!")
|
||||
|
||||
@channel.on("message")
|
||||
async def on_message(message):
|
||||
@@ -503,7 +499,6 @@ class SmallWebRTCConnection(BaseObject):
|
||||
self._track_map.clear()
|
||||
if self._pc:
|
||||
await self._pc.close()
|
||||
self._message_queue.clear()
|
||||
self._pending_app_messages.clear()
|
||||
self._track_map = {}
|
||||
self._cancel_monitoring_connecting_state()
|
||||
@@ -669,8 +664,8 @@ class SmallWebRTCConnection(BaseObject):
|
||||
if self._data_channel and self._data_channel.readyState == "open":
|
||||
self._data_channel.send(json_message)
|
||||
else:
|
||||
logger.debug("Data channel not ready, queuing message")
|
||||
self._message_queue.append(json_message)
|
||||
# The client might choose never to create a data channel.
|
||||
logger.trace("Data channel not ready, discarding message!")
|
||||
|
||||
def ask_to_renegotiate(self):
|
||||
"""Request renegotiation of the WebRTC connection."""
|
||||
|
||||
@@ -180,7 +180,7 @@ class SmallWebRTCRequestHandler:
|
||||
|
||||
answer = pipecat_connection.get_answer()
|
||||
|
||||
if self._esp32_mode and self._host and self._host != "localhost":
|
||||
if self._esp32_mode:
|
||||
from pipecat.runner.utils import smallwebrtc_sdp_munging
|
||||
|
||||
answer["sdp"] = smallwebrtc_sdp_munging(answer["sdp"], self._host)
|
||||
|
||||
@@ -26,13 +26,13 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageUrgentFrame,
|
||||
InputTransportMessageFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputImageRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
SpriteFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
UserImageRawFrame,
|
||||
UserImageRequestFrame,
|
||||
)
|
||||
@@ -461,7 +461,9 @@ class SmallWebRTCClient:
|
||||
await self._webrtc_connection.disconnect()
|
||||
await self._handle_peer_disconnected()
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send an application message through the WebRTC connection.
|
||||
|
||||
Args:
|
||||
@@ -683,7 +685,7 @@ class SmallWebRTCInputTransport(BaseInputTransport):
|
||||
message: The application message to process.
|
||||
"""
|
||||
logger.debug(f"Received app message inside SmallWebRTCInputTransport {message}")
|
||||
frame = InputTransportMessageUrgentFrame(message=message)
|
||||
frame = InputTransportMessageFrame(message=message)
|
||||
await self.push_frame(frame)
|
||||
|
||||
# Add this method similar to DailyInputTransport.request_participant_image
|
||||
@@ -820,7 +822,9 @@ class SmallWebRTCOutputTransport(BaseOutputTransport):
|
||||
await super().cancel(frame)
|
||||
await self._client.disconnect()
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message through the WebRTC connection.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -27,9 +27,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InterruptionFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, FrameProcessorSetup
|
||||
from pipecat.transports.base_input import BaseInputTransport
|
||||
@@ -345,7 +345,9 @@ class TavusTransportClient:
|
||||
participant_id, callback, audio_source, sample_rate, callback_interval_ms
|
||||
)
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a message to participants.
|
||||
|
||||
Args:
|
||||
@@ -373,7 +375,7 @@ class TavusTransportClient:
|
||||
|
||||
async def send_interrupt_message(self) -> None:
|
||||
"""Send an interrupt message to the conversation."""
|
||||
transport_frame = TransportMessageUrgentFrame(
|
||||
transport_frame = OutputTransportMessageUrgentFrame(
|
||||
message={
|
||||
"message_type": "conversation",
|
||||
"event_type": "conversation.interrupt",
|
||||
@@ -605,7 +607,9 @@ class TavusOutputTransport(BaseOutputTransport):
|
||||
await super().cancel(frame)
|
||||
await self._client.stop()
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a message to participants.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -28,9 +28,9 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameProcessorSetup
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
@@ -385,7 +385,9 @@ class WebsocketClientOutputTransport(BaseOutputTransport):
|
||||
await super().cleanup()
|
||||
await self._transport.cleanup()
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message through the WebSocket.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -28,9 +28,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InterruptionFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
||||
@@ -402,7 +402,9 @@ class FastAPIWebsocketOutputTransport(BaseOutputTransport):
|
||||
await self._write_frame(frame)
|
||||
self._next_send_time = 0
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message frame.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -27,9 +27,9 @@ from pipecat.frames.frames import (
|
||||
InputAudioRawFrame,
|
||||
InterruptionFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
StartFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
@@ -338,7 +338,9 @@ class WebsocketServerOutputTransport(BaseOutputTransport):
|
||||
await self._write_frame(frame)
|
||||
self._next_send_time = 0
|
||||
|
||||
async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
|
||||
async def send_message(
|
||||
self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
|
||||
):
|
||||
"""Send a transport message frame to the client.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -29,7 +29,7 @@ class EventHandler:
|
||||
This data class stores the event name, a list of handlers to run for this
|
||||
event, and whether these handlers will be executed in a task.
|
||||
|
||||
Attributes:
|
||||
Parameters:
|
||||
name (str): The name of the event handler.
|
||||
handlers (List[Any]): A list of functions to be called when this event is triggered.
|
||||
is_sync (bool): Indicates whether the functions are executed in a task.
|
||||
|
||||
@@ -12,14 +12,12 @@ from dotenv import load_dotenv
|
||||
|
||||
from pipecat.adapters.schemas.function_schema import FunctionSchema
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.frames.frames import LLMContextFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.google.llm import GoogleLLMService
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.services.llm_service import FunctionCallParams, LLMService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.tests.utils import run_test
|
||||
|
||||
@@ -48,8 +46,13 @@ def standard_tools() -> ToolsSchema:
|
||||
|
||||
|
||||
async def _test_llm_function_calling(llm: LLMService):
|
||||
# Create an AsyncMock for the function
|
||||
mock_fetch_weather = AsyncMock()
|
||||
# Create a mock weather function
|
||||
call_count = 0
|
||||
|
||||
async def mock_fetch_weather(params: FunctionCallParams):
|
||||
nonlocal call_count
|
||||
call_count += 1
|
||||
pass
|
||||
|
||||
llm.register_function(None, mock_fetch_weather)
|
||||
|
||||
@@ -60,21 +63,19 @@ async def _test_llm_function_calling(llm: LLMService):
|
||||
},
|
||||
{"role": "user", "content": " How is the weather today in San Francisco, California?"},
|
||||
]
|
||||
context = OpenAILLMContext(messages, standard_tools())
|
||||
# This is done by default inside the create_context_aggregator
|
||||
context.set_llm_adapter(llm.get_llm_adapter())
|
||||
context = LLMContext(messages, standard_tools())
|
||||
|
||||
pipeline = Pipeline([llm])
|
||||
|
||||
frames_to_send = [OpenAILLMContextFrame(context)]
|
||||
frames_to_send = [LLMContextFrame(context)]
|
||||
await run_test(
|
||||
pipeline,
|
||||
frames_to_send=frames_to_send,
|
||||
expected_down_frames=None,
|
||||
)
|
||||
|
||||
# Assert that the mock function was called
|
||||
mock_fetch_weather.assert_called_once()
|
||||
# Assert that the weather function was called once
|
||||
assert call_count == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY is not set")
|
||||
|
||||
@@ -11,8 +11,8 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TextFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.filters.identity_filter import IdentityFilter
|
||||
@@ -81,7 +81,7 @@ class TestFrameProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
await self.push_frame(TransportMessageUrgentFrame(message=frame.text))
|
||||
await self.push_frame(OutputTransportMessageUrgentFrame(message=frame.text))
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -101,7 +101,7 @@ class TestFrameProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
expected_down_frames = [
|
||||
InterruptionFrame,
|
||||
InterruptionFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
EndFrame,
|
||||
]
|
||||
await run_test(
|
||||
|
||||
@@ -10,24 +10,21 @@ from langchain.prompts import ChatPromptTemplate
|
||||
from langchain_core.language_models import FakeStreamingListLLM
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
LLMContextAssistantTimestampFrame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
OpenAILLMContextAssistantTimestampFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response import (
|
||||
LLMAssistantAggregatorParams,
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from pipecat.processors.aggregators.openai_llm_context import (
|
||||
OpenAILLMContext,
|
||||
OpenAILLMContextFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
|
||||
from pipecat.processors.frame_processor import FrameProcessor
|
||||
from pipecat.processors.frameworks.langchain import LangchainProcessor
|
||||
from pipecat.tests.utils import SleepFrame, run_test
|
||||
@@ -67,13 +64,14 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase):
|
||||
proc = LangchainProcessor(chain=chain)
|
||||
self.mock_proc = self.MockProcessor("token_collector")
|
||||
|
||||
context = OpenAILLMContext()
|
||||
tma_in = LLMUserContextAggregator(context)
|
||||
tma_out = LLMAssistantContextAggregator(
|
||||
context, params=LLMAssistantAggregatorParams(expect_stripped_words=False)
|
||||
context = LLMContext()
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context, assistant_params=LLMAssistantAggregatorParams(expect_stripped_words=False)
|
||||
)
|
||||
|
||||
pipeline = Pipeline([tma_in, proc, self.mock_proc, tma_out])
|
||||
pipeline = Pipeline(
|
||||
[context_aggregator.user(), proc, self.mock_proc, context_aggregator.assistant()]
|
||||
)
|
||||
|
||||
frames_to_send = [
|
||||
UserStartedSpeakingFrame(),
|
||||
@@ -84,8 +82,8 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase):
|
||||
expected_down_frames = [
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
OpenAILLMContextFrame,
|
||||
OpenAILLMContextAssistantTimestampFrame,
|
||||
LLMContextFrame,
|
||||
LLMContextAssistantTimestampFrame,
|
||||
]
|
||||
await run_test(
|
||||
pipeline,
|
||||
@@ -94,4 +92,6 @@ class TestLangchain(unittest.IsolatedAsyncioTestCase):
|
||||
)
|
||||
|
||||
self.assertEqual("".join(self.mock_proc.token), self.expected_response)
|
||||
self.assertEqual(tma_out.messages[-1]["content"], self.expected_response)
|
||||
self.assertEqual(
|
||||
context_aggregator.assistant().messages[-1]["content"], self.expected_response
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user