Compare commits
11 Commits
v0.0.96
...
aleix/clau
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
741ec7486d | ||
|
|
2235d8f5a2 | ||
|
|
6e20a50a4b | ||
|
|
89d9ca045a | ||
|
|
4b95ee92eb | ||
|
|
d481ac6cc6 | ||
|
|
e5a91296b5 | ||
|
|
d8d10a0685 | ||
|
|
6dd9ed03b1 | ||
|
|
d486c80804 | ||
|
|
dedea7c420 |
25
CHANGELOG.md
25
CHANGELOG.md
@@ -5,10 +5,33 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
|
||||
- Updated `AICFilter` to use Quail STT as the default model
|
||||
(`AICModelType.QUAIL_STT`). Quail STT is optimized for human-to-machine
|
||||
interaction (e.g., voice agents, speech-to-text) and operates at a native
|
||||
sample rate of 16 kHz with fixed enhancement parameters.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `noise_gate_enable` parameter in `AICFilter` is deprecated and no longer
|
||||
has any effect. Noise gating is now handled automatically by the AIC VAD
|
||||
system. Use `AICFilter.create_vad_analyzer()` for VAD functionality instead.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `AWSTranscribeSTTService` where the `region` arg was
|
||||
always set to `us-east-1` when providing an AWS_REGION env var.
|
||||
|
||||
## [0.0.96] - 2025-11-26 🦃 "Happy Thanksgiving!" 🦃
|
||||
|
||||
### Added
|
||||
|
||||
- Added `AWSBedrockAgentCoreProcessor` to support invoking an AgentCore-hosted
|
||||
agent in a Pipecat pipeline.
|
||||
|
||||
- Enhanced error handling across the framework:
|
||||
|
||||
- Added `on_error` callback to `FrameProcessor` for centralized error
|
||||
@@ -280,7 +303,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue in `AWSBedrockLLMService` where the `aws_region` arg was
|
||||
always set to `us-east-1`.
|
||||
always set to `us-east-1` when providing an AWS_REGION env var.
|
||||
|
||||
- Fixed an issue with `DeepgramFluxSTTService` where it sometimes failed to reconnect.
|
||||
|
||||
|
||||
@@ -82,6 +82,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = AnthropicLLMService(
|
||||
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
||||
model="claude-3-7-sonnet-latest",
|
||||
wait_for_all=True,
|
||||
params=AnthropicLLMService.InputParams(
|
||||
max_tokens=16000,
|
||||
extra={
|
||||
"thinking": {"type": "enabled", "budget_tokens": 10000},
|
||||
},
|
||||
),
|
||||
)
|
||||
llm.register_function("get_weather", get_weather)
|
||||
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
|
||||
|
||||
@@ -45,7 +45,7 @@ Source = "https://github.com/pipecat-ai/pipecat"
|
||||
Website = "https://pipecat.ai"
|
||||
|
||||
[project.optional-dependencies]
|
||||
aic = [ "aic-sdk~=1.1.0" ]
|
||||
aic = [ "aic-sdk~=1.2.0" ]
|
||||
anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
|
||||
@@ -39,7 +39,7 @@ class AICFilter(BaseAudioFilter):
|
||||
self,
|
||||
*,
|
||||
license_key: str = "",
|
||||
model_type: AICModelType = AICModelType.QUAIL_L,
|
||||
model_type: AICModelType = AICModelType.QUAIL_STT,
|
||||
enhancement_level: Optional[float] = 1.0,
|
||||
voice_gain: Optional[float] = 1.0,
|
||||
noise_gate_enable: Optional[bool] = True,
|
||||
@@ -52,12 +52,27 @@ class AICFilter(BaseAudioFilter):
|
||||
enhancement_level: Optional overall enhancement strength (0.0..1.0).
|
||||
voice_gain: Optional linear gain applied to detected speech (0.0..4.0).
|
||||
noise_gate_enable: Optional enable/disable noise gate (default: True).
|
||||
|
||||
.. deprecated:: 1.3.0
|
||||
The `noise_gate_enable` parameter is deprecated and no longer has any effect.
|
||||
It will be removed in a future version.
|
||||
"""
|
||||
self._license_key = license_key
|
||||
self._model_type = model_type
|
||||
|
||||
self._enhancement_level = enhancement_level
|
||||
self._voice_gain = voice_gain
|
||||
if noise_gate_enable is not None:
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"Parameter `noise_gate_enable` is deprecated and no longer has any effect. "
|
||||
"It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
self._noise_gate_enable = noise_gate_enable
|
||||
|
||||
self._enabled = True
|
||||
@@ -149,10 +164,6 @@ class AICFilter(BaseAudioFilter):
|
||||
)
|
||||
if self._voice_gain is not None:
|
||||
self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain))
|
||||
if self._noise_gate_enable is not None:
|
||||
self._aic.set_parameter(
|
||||
AICParameter.NOISE_GATE_ENABLE, 1.0 if bool(self._noise_gate_enable) else 0.0
|
||||
)
|
||||
|
||||
self._aic_ready = True
|
||||
|
||||
|
||||
@@ -563,6 +563,33 @@ class LLMContextFrame(Frame):
|
||||
context: "LLMContext"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMThinkingTextFrame(DataFrame):
|
||||
"""Reasoning frame generated by LLM services."""
|
||||
|
||||
thinking: str
|
||||
|
||||
def __post_init__(self):
|
||||
super().__post_init__()
|
||||
# LLM services send text frames with all necessary spaces included
|
||||
self.includes_inter_frame_spaces = True
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, thinking: {self.thinking})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMThinkingSignatureFrame(DataFrame):
|
||||
"""Reasoning signature frame generated by LLM services."""
|
||||
|
||||
signature: str
|
||||
|
||||
def __str__(self):
|
||||
pts = format_pts(self.pts)
|
||||
return f"{self.name}(pts: {pts}, signature: {self.signature})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMMessagesFrame(DataFrame):
|
||||
"""Frame containing LLM messages for chat completion.
|
||||
|
||||
@@ -47,6 +47,8 @@ from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
LLMSetToolChoiceFrame,
|
||||
LLMSetToolsFrame,
|
||||
LLMThinkingSignatureFrame,
|
||||
LLMThinkingTextFrame,
|
||||
SpeechControlParamsFrame,
|
||||
StartFrame,
|
||||
TextFrame,
|
||||
@@ -591,6 +593,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
self._started = 0
|
||||
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
|
||||
self._context_updated_tasks: Set[asyncio.Task] = set()
|
||||
self._thinking: List[TextPartForConcatenation] = []
|
||||
|
||||
@property
|
||||
def has_function_calls_in_progress(self) -> bool:
|
||||
@@ -601,6 +604,11 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
"""
|
||||
return bool(self._function_calls_in_progress)
|
||||
|
||||
async def reset(self):
|
||||
"""Reset the aggregation state."""
|
||||
await super().reset()
|
||||
self._thinking = []
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
"""Process frames for assistant response aggregation and function call management.
|
||||
|
||||
@@ -619,6 +627,10 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
await self._handle_llm_end(frame)
|
||||
elif isinstance(frame, TextFrame):
|
||||
await self._handle_text(frame)
|
||||
elif isinstance(frame, LLMThinkingTextFrame):
|
||||
await self._handle_thinking(frame)
|
||||
elif isinstance(frame, LLMThinkingSignatureFrame):
|
||||
await self._handle_thinking_signature(frame)
|
||||
elif isinstance(frame, LLMRunFrame):
|
||||
await self._handle_llm_run(frame)
|
||||
elif isinstance(frame, LLMMessagesAppendFrame):
|
||||
@@ -663,6 +675,14 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
timestamp_frame = LLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
|
||||
await self.push_frame(timestamp_frame)
|
||||
|
||||
def thinking_string(self) -> str:
|
||||
"""Get the current thinking as a string.
|
||||
|
||||
Returns:
|
||||
The concatenated thinking string.
|
||||
"""
|
||||
return concatenate_aggregated_text(self._thinking)
|
||||
|
||||
async def _handle_llm_run(self, frame: LLMRunFrame):
|
||||
await self.push_context_frame(FrameDirection.UPSTREAM)
|
||||
|
||||
@@ -824,6 +844,35 @@ class LLMAssistantAggregator(LLMContextAggregator):
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_thinking(self, frame: LLMThinkingTextFrame):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
# Make sure we really have text (spaces count, too!)
|
||||
if len(frame.thinking) == 0:
|
||||
return
|
||||
|
||||
self._thinking.append(
|
||||
TextPartForConcatenation(
|
||||
frame.thinking, includes_inter_part_spaces=frame.includes_inter_frame_spaces
|
||||
)
|
||||
)
|
||||
|
||||
async def _handle_thinking_signature(self, frame: LLMThinkingSignatureFrame):
|
||||
if not self._started:
|
||||
return
|
||||
|
||||
thinking = self.thinking_string()
|
||||
|
||||
self._context.add_message(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "thinking", "thinking": thinking, "signature": frame.signature},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
def _context_updated_task_finished(self, task: asyncio.Task):
|
||||
self._context_updated_tasks.discard(task)
|
||||
|
||||
|
||||
@@ -302,7 +302,7 @@ def _setup_webrtc_routes(
|
||||
result: StartBotResult = {"sessionId": session_id}
|
||||
if request_data.get("enableDefaultIceServers"):
|
||||
result["iceConfig"] = IceConfig(
|
||||
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
|
||||
iceServers=[IceServer(urls=["stun:stun.l.google.com:19302"])]
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
@@ -40,6 +40,8 @@ from pipecat.frames.frames import (
|
||||
LLMFullResponseStartFrame,
|
||||
LLMMessagesFrame,
|
||||
LLMTextFrame,
|
||||
LLMThinkingSignatureFrame,
|
||||
LLMThinkingTextFrame,
|
||||
LLMUpdateSettingsFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
@@ -380,6 +382,10 @@ class AnthropicLLMService(LLMService):
|
||||
completion_tokens_estimate += self._estimate_tokens(
|
||||
event.delta.partial_json
|
||||
)
|
||||
elif hasattr(event.delta, "thinking"):
|
||||
await self.push_frame(LLMThinkingTextFrame(event.delta.thinking))
|
||||
elif hasattr(event.delta, "signature"):
|
||||
await self.push_frame(LLMThinkingSignatureFrame(event.delta.signature))
|
||||
elif event.type == "content_block_start":
|
||||
if event.content_block.type == "tool_use":
|
||||
tool_use_block = event.content_block
|
||||
|
||||
@@ -58,7 +58,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
api_key: Optional[str] = None,
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
region: Optional[str] = "us-east-1",
|
||||
region: Optional[str] = None,
|
||||
sample_rate: int = 16000,
|
||||
language: Language = Language.EN,
|
||||
**kwargs,
|
||||
@@ -69,7 +69,7 @@ class AWSTranscribeSTTService(STTService):
|
||||
api_key: AWS secret access key. If None, uses AWS_SECRET_ACCESS_KEY environment variable.
|
||||
aws_access_key_id: AWS access key ID. If None, uses AWS_ACCESS_KEY_ID environment variable.
|
||||
aws_session_token: AWS session token for temporary credentials. If None, uses AWS_SESSION_TOKEN environment variable.
|
||||
region: AWS region for the service. Defaults to "us-east-1".
|
||||
region: AWS region for the service.
|
||||
sample_rate: Audio sample rate in Hz. Must be 8000 or 16000. Defaults to 16000.
|
||||
language: Language for transcription. Defaults to English.
|
||||
**kwargs: Additional arguments passed to parent STTService class.
|
||||
|
||||
Reference in New Issue
Block a user