From d18fe7c39cbacc4ecbec73c0bb92bbc399e8e54a Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 6 May 2026 11:29:19 -0400 Subject: [PATCH] feat(rtvi): type UI accessibility snapshots --- .../processors/frameworks/rtvi/models.py | 83 ++++++++++++++++- tests/test_rtvi_ui.py | 91 ++++++++++++++++++- 2 files changed, 167 insertions(+), 7 deletions(-) diff --git a/src/pipecat/processors/frameworks/rtvi/models.py b/src/pipecat/processors/frameworks/rtvi/models.py index 5840ecbd6..dc16981b5 100644 --- a/src/pipecat/processors/frameworks/rtvi/models.py +++ b/src/pipecat/processors/frameworks/rtvi/models.py @@ -20,7 +20,7 @@ from typing import ( Literal, ) -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from pipecat.frames.frames import ( AggregationType, @@ -617,18 +617,91 @@ class UICommandData(BaseModel): payload: Any | None = None +class A11yNode(BaseModel): + """One node in the UI accessibility snapshot tree. + + Mirrors the client-side ``A11yNode`` wire shape. Extra fields are + allowed so clients can add platform-specific or future metadata + without breaking older servers. + + Parameters: + ref: Stable client-assigned element reference. + role: ARIA-style role for the node. + name: Optional accessible name. + value: Optional current value for inputs/progress/etc. + state: Optional short state tags (e.g. ``"focused"``, + ``"disabled"``, ``"offscreen"``). + level: Optional heading level. + colcount: Optional column count for grid-like containers. + rowcount: Optional row count for grid-like containers. + children: Optional child nodes. + """ + + model_config = ConfigDict(extra="allow") + + ref: str + role: str + name: str | None = None + value: str | None = None + state: list[str] | None = None + level: int | None = None + colcount: int | None = None + rowcount: int | None = None + children: list["A11yNode"] | None = None + + +class A11ySelection(BaseModel): + """The user's current text selection in the UI snapshot. + + Extra fields are allowed for forward compatibility with client + snapshot additions. + + Parameters: + ref: Ref of the element that carries the selection. + text: Selected text. + start_offset: Optional selection start offset. + end_offset: Optional selection end offset. + """ + + model_config = ConfigDict(extra="allow") + + ref: str + text: str + start_offset: int | None = None + end_offset: int | None = None + + +class A11ySnapshot(BaseModel): + """Client accessibility snapshot sent in a ``ui-snapshot`` message. + + Mirrors the client-side ``A11ySnapshot`` wire shape. Extra fields + are allowed so clients can add compatible metadata over time. + + Parameters: + root: Root accessibility node. + captured_at: Client-side epoch milliseconds when captured. + selection: Optional current text selection. + """ + + model_config = ConfigDict(extra="allow") + + root: A11yNode + captured_at: int + selection: A11ySelection | None = None + + class UISnapshotData(BaseModel): """Inner ``data`` for a ``ui-snapshot`` message. - The accessibility snapshot tree is opaque on the server side. - The client owns its shape; the server stores it as-is for - rendering into the LLM context. + The accessibility snapshot tree mirrors the client-side + ``A11ySnapshot`` wire shape and is kept forward-compatible by + allowing extra fields on the snapshot models. Parameters: tree: The serialized accessibility tree. """ - tree: Any | None = None + tree: A11ySnapshot class UICancelTaskData(BaseModel): diff --git a/tests/test_rtvi_ui.py b/tests/test_rtvi_ui.py index ec9e1247f..930786923 100644 --- a/tests/test_rtvi_ui.py +++ b/tests/test_rtvi_ui.py @@ -19,6 +19,9 @@ from pipecat.processors.frameworks.rtvi.frames import ( RTVIUIEventFrame, ) from pipecat.processors.frameworks.rtvi.models import ( + A11yNode, + A11ySelection, + A11ySnapshot, Click, Focus, Highlight, @@ -72,17 +75,101 @@ class TestEnvelopeMessages(unittest.TestCase): ) def test_ui_snapshot_envelope(self): - msg = UISnapshotMessage(id="m2", data=UISnapshotData(tree={"root": "..."})) + msg = UISnapshotMessage( + id="m2", + data=UISnapshotData( + tree=A11ySnapshot( + root=A11yNode( + ref="e1", + role="main", + children=[A11yNode(ref="e2", role="button", name="Save")], + ), + captured_at=42, + selection=A11ySelection(ref="e2", text="Save", start_offset=0, end_offset=4), + ) + ), + ) self.assertEqual( msg.model_dump(), { "label": "rtvi-ai", "type": "ui-snapshot", "id": "m2", - "data": {"tree": {"root": "..."}}, + "data": { + "tree": { + "root": { + "ref": "e1", + "role": "main", + "name": None, + "value": None, + "state": None, + "level": None, + "colcount": None, + "rowcount": None, + "children": [ + { + "ref": "e2", + "role": "button", + "name": "Save", + "value": None, + "state": None, + "level": None, + "colcount": None, + "rowcount": None, + "children": None, + } + ], + }, + "captured_at": 42, + "selection": { + "ref": "e2", + "text": "Save", + "start_offset": 0, + "end_offset": 4, + }, + } + }, }, ) + def test_ui_snapshot_allows_future_client_fields(self): + msg = UISnapshotMessage.model_validate( + { + "id": "m2", + "data": { + "tree": { + "root": { + "ref": "e1", + "role": "main", + "bounds": {"x": 1, "y": 2}, + "children": [ + { + "ref": "e2", + "role": "button", + "name": "Save", + "platform_state": {"pressed": False}, + } + ], + }, + "captured_at": 42, + "selection": { + "ref": "e2", + "text": "Save", + "direction": "forward", + }, + "viewport": {"width": 1024, "height": 768}, + } + }, + } + ) + + dumped = msg.model_dump() + tree = dumped["data"]["tree"] + self.assertEqual(tree["root"]["bounds"], {"x": 1, "y": 2}) + self.assertEqual(tree["root"]["children"][0]["platform_state"], {"pressed": False}) + self.assertEqual(tree["selection"]["direction"], "forward") + self.assertEqual(tree["viewport"], {"width": 1024, "height": 768}) + def test_ui_cancel_task_envelope(self): msg = UICancelTaskMessage(id="m3", data=UICancelTaskData(task_id="t-99", reason="user")) self.assertEqual(