Compare commits

...

11 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
741ec7486d claude thinking model support 2025-12-01 12:01:56 -08:00
Aleix Conchillo Flaqué
2235d8f5a2 CHANGELOG formatting 2025-12-01 10:24:42 -08:00
Mark Backman
6e20a50a4b Merge pull request #3153 from pipecat-ai/mb/fix-aws-stt-region
fix: AWSTranscribeSTTService always set to us-east-1
2025-12-01 13:07:22 -05:00
Mark Backman
89d9ca045a fix: AWSTranscribeSTTService always set to us-east-1 2025-12-01 13:02:08 -05:00
Mark Backman
4b95ee92eb Merge pull request #3166 from pipecat-ai/mb/update-changelog-AWSBedrockAgentCoreProcessor
Retroactively add changelog to 0.0.96 for AWSBedrockAgentCoreProcessor
2025-12-01 11:51:47 -05:00
Mark Backman
d481ac6cc6 Retroactively add changelog to 0.0.96 for AWSBedrockAgentCoreProcessor 2025-12-01 11:49:00 -05:00
Mark Backman
e5a91296b5 Merge pull request #3162 from ai-coustics/add-stt-optimized-model
Add Quail STT as default model for `AICFilter`
2025-11-30 09:59:37 -05:00
Corvin Jaedicke
d8d10a0685 add changelog entry 2025-11-28 15:24:19 +01:00
Corvin Jaedicke
6dd9ed03b1 bump version to include new STT model, noise gate deprecation warning 2025-11-28 15:14:43 +01:00
Filipi da Silva Fuchter
d486c80804 Merge pull request #3151 from pipecat-ai/filipi/fix_runner_ice_servers
Fixing runner ICE servers to be compatible with what is expected by the mobile SDKs.
2025-11-27 10:24:02 -03:00
Filipi Fuchter
dedea7c420 Fixing runner ICE servers to be compatible with what is expected by the mobile SDKs. 2025-11-27 09:27:26 -03:00
9 changed files with 133 additions and 10 deletions

View File

@@ -5,10 +5,33 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Changed
- Updated `AICFilter` to use Quail STT as the default model
(`AICModelType.QUAIL_STT`). Quail STT is optimized for human-to-machine
interaction (e.g., voice agents, speech-to-text) and operates at a native
sample rate of 16 kHz with fixed enhancement parameters.
### Deprecated
- The `noise_gate_enable` parameter in `AICFilter` is deprecated and no longer
has any effect. Noise gating is now handled automatically by the AIC VAD
system. Use `AICFilter.create_vad_analyzer()` for VAD functionality instead.
### Fixed
- Fixed an issue in `AWSTranscribeSTTService` where the `region` arg was
always set to `us-east-1` when providing an AWS_REGION env var.
## [0.0.96] - 2025-11-26 🦃 "Happy Thanksgiving!" 🦃
### Added
- Added `AWSBedrockAgentCoreProcessor` to support invoking an AgentCore-hosted
agent in a Pipecat pipeline.
- Enhanced error handling across the framework:
- Added `on_error` callback to `FrameProcessor` for centralized error
@@ -280,7 +303,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed an issue in `AWSBedrockLLMService` where the `aws_region` arg was
always set to `us-east-1`.
always set to `us-east-1` when providing an AWS_REGION env var.
- Fixed an issue with `DeepgramFluxSTTService` where it sometimes failed to reconnect.

View File

@@ -82,6 +82,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = AnthropicLLMService(
api_key=os.getenv("ANTHROPIC_API_KEY"),
model="claude-3-7-sonnet-latest",
wait_for_all=True,
params=AnthropicLLMService.InputParams(
max_tokens=16000,
extra={
"thinking": {"type": "enabled", "budget_tokens": 10000},
},
),
)
llm.register_function("get_weather", get_weather)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)

View File

@@ -45,7 +45,7 @@ Source = "https://github.com/pipecat-ai/pipecat"
Website = "https://pipecat.ai"
[project.optional-dependencies]
aic = [ "aic-sdk~=1.1.0" ]
aic = [ "aic-sdk~=1.2.0" ]
anthropic = [ "anthropic~=0.49.0" ]
assemblyai = [ "pipecat-ai[websockets-base]" ]
asyncai = [ "pipecat-ai[websockets-base]" ]

View File

@@ -39,7 +39,7 @@ class AICFilter(BaseAudioFilter):
self,
*,
license_key: str = "",
model_type: AICModelType = AICModelType.QUAIL_L,
model_type: AICModelType = AICModelType.QUAIL_STT,
enhancement_level: Optional[float] = 1.0,
voice_gain: Optional[float] = 1.0,
noise_gate_enable: Optional[bool] = True,
@@ -52,12 +52,27 @@ class AICFilter(BaseAudioFilter):
enhancement_level: Optional overall enhancement strength (0.0..1.0).
voice_gain: Optional linear gain applied to detected speech (0.0..4.0).
noise_gate_enable: Optional enable/disable noise gate (default: True).
.. deprecated:: 1.3.0
The `noise_gate_enable` parameter is deprecated and no longer has any effect.
It will be removed in a future version.
"""
self._license_key = license_key
self._model_type = model_type
self._enhancement_level = enhancement_level
self._voice_gain = voice_gain
if noise_gate_enable is not None:
import warnings
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Parameter `noise_gate_enable` is deprecated and no longer has any effect. "
"It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).",
DeprecationWarning,
)
self._noise_gate_enable = noise_gate_enable
self._enabled = True
@@ -149,10 +164,6 @@ class AICFilter(BaseAudioFilter):
)
if self._voice_gain is not None:
self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain))
if self._noise_gate_enable is not None:
self._aic.set_parameter(
AICParameter.NOISE_GATE_ENABLE, 1.0 if bool(self._noise_gate_enable) else 0.0
)
self._aic_ready = True

View File

@@ -563,6 +563,33 @@ class LLMContextFrame(Frame):
context: "LLMContext"
@dataclass
class LLMThinkingTextFrame(DataFrame):
"""Reasoning frame generated by LLM services."""
thinking: str
def __post_init__(self):
super().__post_init__()
# LLM services send text frames with all necessary spaces included
self.includes_inter_frame_spaces = True
def __str__(self):
pts = format_pts(self.pts)
return f"{self.name}(pts: {pts}, thinking: {self.thinking})"
@dataclass
class LLMThinkingSignatureFrame(DataFrame):
"""Reasoning signature frame generated by LLM services."""
signature: str
def __str__(self):
pts = format_pts(self.pts)
return f"{self.name}(pts: {pts}, signature: {self.signature})"
@dataclass
class LLMMessagesFrame(DataFrame):
"""Frame containing LLM messages for chat completion.

View File

@@ -47,6 +47,8 @@ from pipecat.frames.frames import (
LLMRunFrame,
LLMSetToolChoiceFrame,
LLMSetToolsFrame,
LLMThinkingSignatureFrame,
LLMThinkingTextFrame,
SpeechControlParamsFrame,
StartFrame,
TextFrame,
@@ -591,6 +593,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
self._started = 0
self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
self._context_updated_tasks: Set[asyncio.Task] = set()
self._thinking: List[TextPartForConcatenation] = []
@property
def has_function_calls_in_progress(self) -> bool:
@@ -601,6 +604,11 @@ class LLMAssistantAggregator(LLMContextAggregator):
"""
return bool(self._function_calls_in_progress)
async def reset(self):
"""Reset the aggregation state."""
await super().reset()
self._thinking = []
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process frames for assistant response aggregation and function call management.
@@ -619,6 +627,10 @@ class LLMAssistantAggregator(LLMContextAggregator):
await self._handle_llm_end(frame)
elif isinstance(frame, TextFrame):
await self._handle_text(frame)
elif isinstance(frame, LLMThinkingTextFrame):
await self._handle_thinking(frame)
elif isinstance(frame, LLMThinkingSignatureFrame):
await self._handle_thinking_signature(frame)
elif isinstance(frame, LLMRunFrame):
await self._handle_llm_run(frame)
elif isinstance(frame, LLMMessagesAppendFrame):
@@ -663,6 +675,14 @@ class LLMAssistantAggregator(LLMContextAggregator):
timestamp_frame = LLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
await self.push_frame(timestamp_frame)
def thinking_string(self) -> str:
"""Get the current thinking as a string.
Returns:
The concatenated thinking string.
"""
return concatenate_aggregated_text(self._thinking)
async def _handle_llm_run(self, frame: LLMRunFrame):
await self.push_context_frame(FrameDirection.UPSTREAM)
@@ -824,6 +844,35 @@ class LLMAssistantAggregator(LLMContextAggregator):
)
)
async def _handle_thinking(self, frame: LLMThinkingTextFrame):
if not self._started:
return
# Make sure we really have text (spaces count, too!)
if len(frame.thinking) == 0:
return
self._thinking.append(
TextPartForConcatenation(
frame.thinking, includes_inter_part_spaces=frame.includes_inter_frame_spaces
)
)
async def _handle_thinking_signature(self, frame: LLMThinkingSignatureFrame):
if not self._started:
return
thinking = self.thinking_string()
self._context.add_message(
{
"role": "assistant",
"content": [
{"type": "thinking", "thinking": thinking, "signature": frame.signature},
],
}
)
def _context_updated_task_finished(self, task: asyncio.Task):
self._context_updated_tasks.discard(task)

View File

@@ -302,7 +302,7 @@ def _setup_webrtc_routes(
result: StartBotResult = {"sessionId": session_id}
if request_data.get("enableDefaultIceServers"):
result["iceConfig"] = IceConfig(
iceServers=[IceServer(urls="stun:stun.l.google.com:19302")]
iceServers=[IceServer(urls=["stun:stun.l.google.com:19302"])]
)
return result

View File

@@ -40,6 +40,8 @@ from pipecat.frames.frames import (
LLMFullResponseStartFrame,
LLMMessagesFrame,
LLMTextFrame,
LLMThinkingSignatureFrame,
LLMThinkingTextFrame,
LLMUpdateSettingsFrame,
UserImageRawFrame,
)
@@ -380,6 +382,10 @@ class AnthropicLLMService(LLMService):
completion_tokens_estimate += self._estimate_tokens(
event.delta.partial_json
)
elif hasattr(event.delta, "thinking"):
await self.push_frame(LLMThinkingTextFrame(event.delta.thinking))
elif hasattr(event.delta, "signature"):
await self.push_frame(LLMThinkingSignatureFrame(event.delta.signature))
elif event.type == "content_block_start":
if event.content_block.type == "tool_use":
tool_use_block = event.content_block

View File

@@ -58,7 +58,7 @@ class AWSTranscribeSTTService(STTService):
api_key: Optional[str] = None,
aws_access_key_id: Optional[str] = None,
aws_session_token: Optional[str] = None,
region: Optional[str] = "us-east-1",
region: Optional[str] = None,
sample_rate: int = 16000,
language: Language = Language.EN,
**kwargs,
@@ -69,7 +69,7 @@ class AWSTranscribeSTTService(STTService):
api_key: AWS secret access key. If None, uses AWS_SECRET_ACCESS_KEY environment variable.
aws_access_key_id: AWS access key ID. If None, uses AWS_ACCESS_KEY_ID environment variable.
aws_session_token: AWS session token for temporary credentials. If None, uses AWS_SESSION_TOKEN environment variable.
region: AWS region for the service. Defaults to "us-east-1".
region: AWS region for the service.
sample_rate: Audio sample rate in Hz. Must be 8000 or 16000. Defaults to 16000.
language: Language for transcription. Defaults to English.
**kwargs: Additional arguments passed to parent STTService class.