refactor(async-tool-messages): replace reminder grafting with caller-supplied template
Empirical testing showed that the previous design — grafting a verbose re-invocation reminder into the payload's `description` field for started and intermediate messages — was making Nova Sonic *worse*, not better: it produced more spurious re-invocations of the same tool, not fewer. Plausibly, the long, instruction-shaped description text reads as content the model has to respond to, whereas a terse status update reads as ambient state.

Replace the reminder grafting with a caller-supplied, keyword-only `template` argument on `prepare_message_payload_for_realtime`. When `template` is `None` (the default), the payload is serialized to its canonical JSON form. When a template is provided, the output is `template.format(tool_call_id=…, status=…, result=…, description=…)`. The template is honored for all payload kinds (started / intermediate / final), so callers decide per kind whether to pass one, based on which wire channel they're using.

Nova Sonic now defines its own bracketed plain-text template (`_ASYNC_TOOL_RESULT_TEXT_TEMPLATE`) and applies it on the cross-modal user-text channel (intermediate / final). The started path stays on raw JSON, because the formal AWS tool-result channel requires valid JSON. A code comment at the template constant captures the empirical finding for the next person: short framing yields much better behavior, surprising as it sounds.

Tests are updated to cover the new template behavior across all kinds.

Also reverts a stream-tool example sleep-duration tweak (20s → 10s) and adds a commented-out alternative in the function-calling-openai-async-stream example for parallel testing.
This commit is contained in:
@@ -74,6 +74,7 @@ async def track_current_location(params: FunctionCallParams):
|
||||
|
||||
# Second update: revised city estimate.
|
||||
await asyncio.sleep(10)
|
||||
# await asyncio.sleep(20)
|
||||
gps = {"lat": 33.96003, "lng": -118.40639}
|
||||
await params.result_callback(
|
||||
{"gps": gps, "city": "Los Angeles"},
|
||||
@@ -82,6 +83,7 @@ async def track_current_location(params: FunctionCallParams):
|
||||
|
||||
# Final result: confirmed city.
|
||||
await asyncio.sleep(10)
|
||||
# await asyncio.sleep(20)
|
||||
gps = {"lat": 32.743569, "lng": -117.20466}
|
||||
await params.result_callback({"gps": gps, "city": "San Diego"})
|
||||
|
||||
|
||||
@@ -54,14 +54,14 @@ async def track_current_location(params: FunctionCallParams):
|
||||
properties=FunctionCallResultProperties(is_final=False),
|
||||
)
|
||||
|
||||
await asyncio.sleep(20)
|
||||
await asyncio.sleep(10)
|
||||
gps = {"lat": 33.96003, "lng": -118.40639}
|
||||
await params.result_callback(
|
||||
{"gps": gps, "city": "Los Angeles"},
|
||||
properties=FunctionCallResultProperties(is_final=False),
|
||||
)
|
||||
|
||||
await asyncio.sleep(20)
|
||||
await asyncio.sleep(10)
|
||||
gps = {"lat": 32.743569, "lng": -117.20466}
|
||||
await params.result_callback({"gps": gps, "city": "San Diego"})
|
||||
|
||||
|
||||
@@ -289,95 +289,82 @@ def parse_message(message: LLMStandardMessage) -> AsyncToolMessagePayload | None
|
||||
# --- Realtime preparation ----------------------------------------------------
|
||||
|
||||
|
||||
# Natural-language reminder grafted onto the ``description`` field of in-flight
|
||||
# payloads (started / intermediate) when they're sent to a realtime LLM
|
||||
# service. Realtime services receive these mid-stream while the model is
|
||||
# still talking with the user, which is the moment the model is most likely
|
||||
# to mistakenly re-issue the same tool call. Keeping this reminder out of the
|
||||
# canonical payload descriptions (and confined to the realtime path) avoids
|
||||
# influencing non-realtime consumers of the same context. We don't graft it
|
||||
# onto ``final`` payloads, because at that point the task is done and
|
||||
# re-invocation by the model is no longer a mistake.
|
||||
#
|
||||
# The reminder is appended *after* the canonical description so the model
|
||||
# first reads the protocol-level explanation of what async-tool messages are
|
||||
# and how they work, and only then encounters the behavioral directive,
|
||||
# which now flows naturally from that context.
|
||||
_REALTIME_REINVOCATION_REMINDER = (
|
||||
"While this task is in flight, do not call the same tool with the same "
|
||||
"arguments again; you would just kick off a duplicate task."
|
||||
)
|
||||
|
||||
|
||||
def prepare_message_payload_for_realtime(payload: AsyncToolMessagePayload) -> str:
|
||||
def prepare_message_payload_for_realtime(
|
||||
payload: AsyncToolMessagePayload,
|
||||
*,
|
||||
template: str | None = None,
|
||||
) -> str:
|
||||
"""Prepare an async-tool message payload for sending to a realtime LLM service.
|
||||
|
||||
Returns a wire-ready JSON string. Realtime services that fully honor the
|
||||
async-tool mechanism send the ``started`` payload via the formal
|
||||
tool-result channel and the subsequent ``intermediate`` / ``final``
|
||||
payloads as text injected mid-conversation; this function returns the
|
||||
string to send in either case, and callers route it to the appropriate
|
||||
channel.
|
||||
|
||||
The exact transformation depends on the payload kind. Each kind is
|
||||
handled by its own private helper, so per-kind tweaks can be added later
|
||||
without entangling the others. Today:
|
||||
|
||||
- ``started`` / ``intermediate``: a natural-language reminder
|
||||
discouraging the model from re-invoking the in-flight tool is grafted
|
||||
onto the ``description`` field, then the payload is re-serialized.
|
||||
Grafting into ``description`` (rather than wrapping the JSON with extra
|
||||
text) keeps the output well-formed JSON, which the formal tool-result
|
||||
channel requires.
|
||||
- ``final``: pass-through; the payload is serialized as-is. The task is
|
||||
done at this point, so re-invocation by the model (if the user asks
|
||||
again later) is no longer a mistake.
|
||||
Realtime services that fully honor the async-tool mechanism send the
|
||||
``started`` payload via the formal tool-result channel and the subsequent
|
||||
``intermediate`` / ``final`` payloads as text injected mid-conversation;
|
||||
this function returns the string to send in either case, and callers
|
||||
route it to the appropriate channel.
|
||||
|
||||
Args:
|
||||
payload: The parsed async-tool message payload.
|
||||
template: Optional format string. If provided, the rendered output is
|
||||
``template.format(tool_call_id=…, status=…, result=…, description=…)``.
|
||||
If ``None``, the payload is serialized to its canonical JSON
|
||||
form. Per-kind helpers ultimately decide what to do with the
|
||||
template, so future per-kind tweaks (e.g. raising for a kind
|
||||
that shouldn't accept templates) can be added without changing
|
||||
this signature.
|
||||
|
||||
Returns:
|
||||
The prepared JSON string, ready to be sent to the realtime service.
|
||||
The prepared string, ready to be sent to the realtime service.
|
||||
"""
|
||||
if payload.kind == "started":
|
||||
return _prepare_started_message_payload_for_realtime(payload)
|
||||
return _prepare_started_message_payload_for_realtime(payload, template=template)
|
||||
if payload.kind == "intermediate":
|
||||
return _prepare_intermediate_result_message_payload_for_realtime(payload)
|
||||
return _prepare_intermediate_result_message_payload_for_realtime(payload, template=template)
|
||||
if payload.kind == "final":
|
||||
return _prepare_final_result_message_payload_for_realtime(payload)
|
||||
return _prepare_final_result_message_payload_for_realtime(payload, template=template)
|
||||
raise ValueError(f"Unknown async-tool message payload kind: {payload.kind!r}")
|
||||
|
||||
|
||||
def _prepare_started_message_payload_for_realtime(payload: AsyncToolMessagePayload) -> str:
|
||||
return _payload_to_json(_with_reinvocation_reminder_grafted_in(payload))
|
||||
def _prepare_started_message_payload_for_realtime(
|
||||
payload: AsyncToolMessagePayload,
|
||||
*,
|
||||
template: str | None = None,
|
||||
) -> str:
|
||||
if template is None:
|
||||
return _payload_to_json(payload)
|
||||
return _format_with_template(payload, template)
|
||||
|
||||
|
||||
def _prepare_intermediate_result_message_payload_for_realtime(
|
||||
payload: AsyncToolMessagePayload,
|
||||
*,
|
||||
template: str | None = None,
|
||||
) -> str:
|
||||
return _payload_to_json(_with_reinvocation_reminder_grafted_in(payload))
|
||||
if template is None:
|
||||
return _payload_to_json(payload)
|
||||
return _format_with_template(payload, template)
|
||||
|
||||
|
||||
def _prepare_final_result_message_payload_for_realtime(payload: AsyncToolMessagePayload) -> str:
|
||||
# Pass-through, for now
|
||||
return _payload_to_json(payload)
|
||||
|
||||
|
||||
def _with_reinvocation_reminder_grafted_in(
|
||||
def _prepare_final_result_message_payload_for_realtime(
|
||||
payload: AsyncToolMessagePayload,
|
||||
) -> AsyncToolMessagePayload:
|
||||
"""Return a copy of ``payload`` with the re-invocation reminder appended to ``description``.
|
||||
*,
|
||||
template: str | None = None,
|
||||
) -> str:
|
||||
if template is None:
|
||||
return _payload_to_json(payload)
|
||||
return _format_with_template(payload, template)
|
||||
|
||||
The reminder lives inside ``description`` so the surrounding JSON
|
||||
envelope stays well-formed (which the formal tool-result channel
|
||||
requires). It's appended (rather than prefixed) so the model first
|
||||
reads the protocol-level explanation of what async-tool messages are
|
||||
and only then encounters the behavioral directive.
|
||||
|
||||
def _format_with_template(payload: AsyncToolMessagePayload, template: str) -> str:
|
||||
"""Render a payload via a caller-supplied template.
|
||||
|
||||
Available substitution keys: ``tool_call_id``, ``status``, ``result``,
|
||||
``description``. Note that ``result`` is empty for ``started`` payloads
|
||||
(no result has been produced yet); callers building templates intended
|
||||
for ``started`` should not rely on it.
|
||||
"""
|
||||
return AsyncToolMessagePayload(
|
||||
kind=payload.kind,
|
||||
return template.format(
|
||||
tool_call_id=payload.tool_call_id,
|
||||
status=payload.status,
|
||||
description=f"{payload.description} {_REALTIME_REINVOCATION_REMINDER}",
|
||||
result=payload.result,
|
||||
result=payload.result or "",
|
||||
description=payload.description,
|
||||
)
|
||||
|
||||
@@ -237,6 +237,26 @@ class AWSNovaSonicLLMSettings(LLMSettings):
|
||||
endpointing_sensitivity: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
# Bracketed plain-text template Nova Sonic uses when injecting async-tool
|
||||
# result updates onto the cross-modal user-text channel.
|
||||
#
|
||||
# Note that this template intentionally drops the payload's ``description``
|
||||
# field (the protocol-level explanation of what async-tool messages are and
|
||||
# how they work) and only carries ``tool_call_id``, ``status``, and
|
||||
# ``result``. Counterintuitively, this short framing — minus the verbose
|
||||
# protocol description, minus a JSON envelope altogether — empirically
|
||||
# yields much better Nova Sonic behavior: noticeably fewer spurious
|
||||
# re-invocations of the same tool than when the full JSON envelope (with
|
||||
# its description) was injected as text. We don't fully understand why; one
|
||||
# plausible explanation is that the model treats long, instruction-shaped
|
||||
# description text as content demanding a response, where a terse
|
||||
# bracketed status update reads more like ambient state. Worth revisiting
|
||||
# if Nova Sonic's text-channel handling changes.
|
||||
_ASYNC_TOOL_RESULT_TEXT_TEMPLATE = (
|
||||
"[Async tool update for tool_call_id={tool_call_id}, status={status}] {result}"
|
||||
)
|
||||
|
||||
|
||||
class AWSNovaSonicLLMService(LLMService[AWSNovaSonicLLMAdapter]):
|
||||
"""AWS Nova Sonic speech-to-speech LLM service.
|
||||
|
||||
@@ -686,12 +706,14 @@ class AWSNovaSonicLLMService(LLMService[AWSNovaSonicLLMAdapter]):
|
||||
)
|
||||
return
|
||||
if send_new_results:
|
||||
payload = async_tool_messages.prepare_message_payload_for_realtime(info)
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(
|
||||
info, template=_ASYNC_TOOL_RESULT_TEXT_TEMPLATE
|
||||
)
|
||||
logger.debug(
|
||||
f"{self}: async_tool send {info.kind} as text input: "
|
||||
f"tool_call_id={info.tool_call_id} text={payload!r}"
|
||||
f"tool_call_id={info.tool_call_id} text={text!r}"
|
||||
)
|
||||
await self._send_async_tool_text(payload)
|
||||
await self._send_async_tool_text(text)
|
||||
else:
|
||||
logger.trace(
|
||||
f"{self}: async_tool {info.kind} mark-handled (no send): "
|
||||
|
||||
@@ -233,54 +233,89 @@ class TestBuilders(unittest.TestCase):
|
||||
|
||||
|
||||
class TestPrepareMessagePayloadForRealtime(unittest.TestCase):
|
||||
def test_started_grafts_reminder_into_description(self):
|
||||
"""Verify the realtime preparation behavior across kinds and template usage."""
|
||||
|
||||
# --- Default (no template) → raw JSON pass-through -----------------------
|
||||
|
||||
def test_started_default_is_raw_json(self):
|
||||
msg = async_tool_messages.build_started_message("call_42")
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(info)
|
||||
# The output is well-formed JSON (the formal tool-result channel
|
||||
# requires it).
|
||||
decoded = json.loads(text)
|
||||
# The reminder lives inside the description field, not outside the
|
||||
# JSON envelope.
|
||||
assert "do not call the same tool" in decoded["description"]
|
||||
assert "duplicate task" in decoded["description"]
|
||||
# And the original description text is still present after the reminder.
|
||||
assert "asynchronous task" in decoded["description"]
|
||||
# Other payload fields are preserved.
|
||||
assert decoded["type"] == "async_tool"
|
||||
assert decoded["tool_call_id"] == "call_42"
|
||||
assert decoded["status"] == "running"
|
||||
# Started payloads have no result field.
|
||||
assert "result" not in decoded
|
||||
|
||||
def test_intermediate_grafts_reminder_into_description(self):
|
||||
def test_intermediate_default_is_raw_json(self):
|
||||
msg = async_tool_messages.build_intermediate_result_message("call_42", '"step-1"')
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(info)
|
||||
decoded = json.loads(text)
|
||||
assert "do not call the same tool" in decoded["description"]
|
||||
assert decoded["type"] == "async_tool"
|
||||
assert decoded["tool_call_id"] == "call_42"
|
||||
assert decoded["status"] == "running"
|
||||
assert decoded["result"] == '"step-1"'
|
||||
|
||||
def test_final_is_pass_through(self):
|
||||
# The task is done at this point; the re-invocation reminder no
|
||||
# longer applies, so the final payload is forwarded as-is (no
|
||||
# reminder grafted onto the description).
|
||||
def test_final_default_is_raw_json(self):
|
||||
msg = async_tool_messages.build_final_result_message("call_42", '"the answer"')
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(info)
|
||||
decoded = json.loads(text)
|
||||
assert "do not call the same tool" not in decoded["description"]
|
||||
assert decoded["type"] == "async_tool"
|
||||
assert decoded["tool_call_id"] == "call_42"
|
||||
assert decoded["status"] == "finished"
|
||||
assert decoded["result"] == '"the answer"'
|
||||
|
||||
# --- Caller-supplied template applied across kinds -----------------------
|
||||
|
||||
def test_template_applied_to_started(self):
|
||||
msg = async_tool_messages.build_started_message("call_42")
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(
|
||||
info,
|
||||
template="[{tool_call_id} {status}] {result}",
|
||||
)
|
||||
# Started has no result; substitution yields empty string after the bracket.
|
||||
assert text == "[call_42 running] "
|
||||
|
||||
def test_template_applied_to_intermediate(self):
|
||||
msg = async_tool_messages.build_intermediate_result_message("call_42", '"step-1"')
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(
|
||||
info,
|
||||
template="[{tool_call_id} {status}] {result}",
|
||||
)
|
||||
assert text == '[call_42 running] "step-1"'
|
||||
|
||||
def test_template_applied_to_final(self):
|
||||
msg = async_tool_messages.build_final_result_message("call_42", '"the answer"')
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(
|
||||
info,
|
||||
template="[{tool_call_id} {status}] {result}",
|
||||
)
|
||||
assert text == '[call_42 finished] "the answer"'
|
||||
|
||||
def test_template_can_use_description_field(self):
|
||||
msg = async_tool_messages.build_intermediate_result_message("call_42", '"step-1"')
|
||||
info = async_tool_messages.parse_message(msg)
|
||||
assert info is not None
|
||||
text = async_tool_messages.prepare_message_payload_for_realtime(
|
||||
info,
|
||||
template="{description} >> {result}",
|
||||
)
|
||||
# The intermediate description text is preserved verbatim.
|
||||
assert "intermediate result" in text
|
||||
assert text.endswith('>> "step-1"')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user