feat(aws): add prompt caching support for Bedrock ConverseStream
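Adds an `enable_prompt_caching` setting to `AWSBedrockLLMSettings`. When
enabled, a `cachePoint` marker is appended to the system prompt list and to
the tool definitions list of ConverseStream requests, provided the list is
non-empty and does not already contain a `cachePoint` entry.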
This can reduce time to first token (TTFT) by up to 85% for multi-turn
conversations where the system prompt stays constant (e.g. voice agents,
chat assistants).
Follows the same pattern as `AnthropicLLMService.enable_prompt_caching`.
Usage:
```python
llm = AWSBedrockLLMService(
    settings=AWSBedrockLLMSettings(
        model="au.anthropic.claude-haiku-4-5-20251001-v1:0",
        enable_prompt_caching=True,
    ),
)
```
See: https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
This commit is contained in:
committed by
Mark Backman
parent
d4824ffe8a
commit
e5aa3bbc20
@@ -66,11 +66,16 @@ class AWSBedrockLLMSettings(LLMSettings):
|
||||
Parameters:
|
||||
stop_sequences: List of strings that stop generation.
|
||||
latency: Performance mode - "standard" or "optimized".
|
||||
enable_prompt_caching: Whether to enable prompt caching by adding cachePoint
|
||||
markers to system prompts and tool definitions. Can reduce TTFT by up to
|
||||
85% for multi-turn conversations. See:
|
||||
https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
|
||||
additional_model_request_fields: Additional model-specific parameters.
|
||||
"""
|
||||
|
||||
stop_sequences: List[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
latency: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
enable_prompt_caching: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
additional_model_request_fields: Dict[str, Any] | _NotGiven = field(
|
||||
default_factory=lambda: NOT_GIVEN
|
||||
)
|
||||
@@ -455,6 +460,24 @@ class AWSBedrockLLMService(LLMService):
|
||||
if self._settings.latency in ["standard", "optimized"]:
|
||||
request_params["performanceConfig"] = {"latency": self._settings.latency}
|
||||
|
||||
# Add cache checkpoints to system prompts and tool definitions.
|
||||
# This enables prompt caching for providers that support it (e.g.
|
||||
# Anthropic Claude on Bedrock), reducing TTFT by up to 85% on
|
||||
# multi-turn conversations where the system prompt stays constant.
|
||||
if self._settings.enable_prompt_caching:
|
||||
if "system" in request_params and request_params["system"]:
|
||||
system_list = request_params["system"]
|
||||
if not any("cachePoint" in item for item in system_list):
|
||||
system_list.append({"cachePoint": {"type": "default"}})
|
||||
if (
|
||||
"toolConfig" in request_params
|
||||
and "tools" in request_params["toolConfig"]
|
||||
and request_params["toolConfig"]["tools"]
|
||||
):
|
||||
tools_list = request_params["toolConfig"]["tools"]
|
||||
if not any("cachePoint" in t for t in tools_list):
|
||||
tools_list.append({"cachePoint": {"type": "default"}})
|
||||
|
||||
# Log request params with messages redacted for logging
|
||||
adapter = self.get_llm_adapter()
|
||||
messages_for_logging = adapter.get_messages_for_logging(context)
|
||||
|
||||
Reference in New Issue
Block a user