From e5aa3bbc20ad907bcd38852aa6aa06bb07733f5e Mon Sep 17 00:00:00 2001 From: Luke Halley Date: Wed, 1 Apr 2026 10:47:30 +0800 Subject: [PATCH] feat(aws): add prompt caching support for Bedrock ConverseStream Adds `enable_prompt_caching` setting to `AWSBedrockLLMSettings`. When enabled, appends `cachePoint` markers to system prompts and tool definitions in ConverseStream requests. This can reduce TTFT by up to 85% for multi-turn conversations where the system prompt stays constant (e.g. voice agents, chat assistants). Follows the same pattern as `AnthropicLLMService.enable_prompt_caching`. Usage: ```python llm = AWSBedrockLLMService( settings=AWSBedrockLLMSettings( model="au.anthropic.claude-haiku-4-5-20251001-v1:0", enable_prompt_caching=True, ), ) ``` See: https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html --- src/pipecat/services/aws/llm.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 722072592..83b970874 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -66,11 +66,16 @@ class AWSBedrockLLMSettings(LLMSettings): Parameters: stop_sequences: List of strings that stop generation. latency: Performance mode - "standard" or "optimized". + enable_prompt_caching: Whether to enable prompt caching by adding cachePoint + markers to system prompts and tool definitions. Can reduce TTFT by up to + 85% for multi-turn conversations. See: + https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html additional_model_request_fields: Additional model-specific parameters. """ stop_sequences: List[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) latency: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + enable_prompt_caching: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) additional_model_request_fields: Dict[str, Any] | _NotGiven = field( default_factory=lambda: NOT_GIVEN ) @@ -455,6 +460,24 @@ class AWSBedrockLLMService(LLMService): if self._settings.latency in ["standard", "optimized"]: request_params["performanceConfig"] = {"latency": self._settings.latency} + # Add cache checkpoints to system prompts and tool definitions. + # This enables prompt caching for providers that support it (e.g. + # Anthropic Claude on Bedrock), reducing TTFT by up to 85% on + # multi-turn conversations where the system prompt stays constant. + if self._settings.enable_prompt_caching: + if "system" in request_params and request_params["system"]: + system_list = request_params["system"] + if not any("cachePoint" in item for item in system_list): + system_list.append({"cachePoint": {"type": "default"}}) + if ( + "toolConfig" in request_params + and "tools" in request_params["toolConfig"] + and request_params["toolConfig"]["tools"] + ): + tools_list = request_params["toolConfig"]["tools"] + if not any("cachePoint" in t for t in tools_list): + tools_list.append({"cachePoint": {"type": "default"}}) + # Log request params with messages redacted for logging adapter = self.get_llm_adapter() messages_for_logging = adapter.get_messages_for_logging(context)