Files
pipecat/examples/multi-worker/ui-worker/deixis/bot.py
Mark Backman f826da9ac9 Add deixis UIWorker example
A ReplyToolMixin UIWorker that grounds in the user's text selection (the
<selection> block in the snapshot) and points back via select_text — both
directions of deictic reference.
2026-05-21 23:20:40 -04:00

292 lines
10 KiB
Python

#
# Copyright (c) 2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Deixis — the UIWorker grounds in what the user just selected.
The page renders an article. The user selects a paragraph (or any span
of text) and asks "explain this", "rephrase that", "where does it talk
about RNA editing?", and so on. The client captures
``window.getSelection()`` and emits a ``<selection ref="...">selected
text</selection>`` block in the snapshot. The UIWorker reads it as a
deictic reference: "this paragraph" resolves to the selected element.
Two directions:
- **Read**: user selects text → ``<selection>`` block in ``<ui_state>``
→ the worker grounds its answer in the selected content.
- **Write**: the worker says "this paragraph" → ``select_text=ref`` puts
the page's text selection on that element → the user sees what the
worker is referring to.
Same skeleton as ``pointing``. ``DeixisWorker`` composes
``ReplyToolMixin``: the ``reply(answer, scroll_to, highlight,
select_text)`` bundle covers both pointing-style and reading-style apps.
Architecture::
Main worker (PipelineWorker, owns transport + RTVI):
transport.in → STT → user_agg → LLM → TTS → transport.out → assistant_agg
└── answer_about_screen(query) tool
└── params.pipeline_worker.job("ui", name="respond", payload={query})
DeixisWorker (ReplyToolMixin + UIWorker):
└── inherited: reply(answer, scroll_to, highlight, select_text)
Run::
uv run python bot.py
Then open the client at ``http://localhost:5173`` (see ``README.md``).
Requirements:
- OPENAI_API_KEY
- DEEPGRAM_API_KEY
- CARTESIA_API_KEY
"""
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.job_context import JobError
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.worker import PipelineParams, PipelineWorker
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
LLMContextAggregatorPair,
LLMUserAggregatorParams,
)
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.workers.ui import ReplyToolMixin, UIWorker
load_dotenv(override=True)
MAIN_NAME = "main"
transport_params = {
"daily": lambda: DailyParams(audio_in_enabled=True, audio_out_enabled=True),
"webrtc": lambda: TransportParams(audio_in_enabled=True, audio_out_enabled=True),
}
VOICE_PROMPT = """\
You are the voice layer of a screen-aware reading assistant. A \
separate UI layer sees the page (and the user's selection) and \
writes the spoken reply.
For every user utterance about the article, call \
``answer_about_screen`` with the user's request verbatim. The tool's \
response is the spoken reply, already TTS-ready.
Only respond directly for pure pleasantries (greetings, thanks, \
goodbyes). Keep direct replies to one short spoken sentence."""
# The UI wire-format guide (UI_STATE_PROMPT_GUIDE) is appended to the LLM's
# system instruction automatically by UIWorker, so this prompt only needs the
# app-specific behavior.
UI_PROMPT = """\
You help the user read and understand an article. The current \
``<ui_state>`` block is in your context, and may contain a \
``<selection>`` block when the user has highlighted text.
## Tool: reply
Every turn calls ``reply`` exactly once. One tool call per turn, no \
chaining.
``reply(answer, scroll_to=None, highlight=None, select_text=None)``:
- ``answer`` (REQUIRED): the spoken reply, plain language, two short \
sentences max. No markdown, no symbols, no quoting long passages.
- ``scroll_to`` (OPTIONAL): a snapshot ref. Set when the paragraph \
you want to point at is tagged ``[offscreen]``.
- ``highlight`` (OPTIONAL): a list of snapshot refs to flash briefly. \
Use for short emphasis: "look at this fact". Don't use it for a \
whole paragraph; ``select_text`` is better for that.
- ``select_text`` (OPTIONAL): a single snapshot ref. Sets the page's \
text selection to that element. Use this when you say "this \
paragraph" or "the section that talks about X" so the user sees \
exactly what you're referring to.
## Reading the user's selection
If ``<ui_state>`` contains a ``<selection ref="...">selected \
text</selection>`` block, the user has highlighted something. Treat \
that selection as the deictic referent for words like "this", \
"that", "this paragraph", "what I selected". Ground your answer in \
the selected content, not the article as a whole.
When answering about the user's selection, do NOT also call \
``select_text`` — they already selected it; pointing back at the \
same span is redundant.
## Decision rules
- User has a selection AND asks something deictic ("explain this", \
"rephrase that", "what does this mean") → ground in the selection. \
Just ``answer``; no visual fields.
- User asks "where does it say X?" or "show me the part about X"\
find the matching paragraph, ``answer`` briefly, set \
``select_text=ref`` to point at it, and ``scroll_to=ref`` if it's \
``[offscreen]``.
- User asks a content question without selection → ``answer`` with \
the relevant fact. Optionally set ``select_text=ref`` if the \
answer is sourced from one specific paragraph.
## Examples
(refs are illustrative; use the actual refs from the current \
``<ui_state>``)
- User selects the third paragraph, asks "explain this"\
``reply(answer="The skin acts as its own light sensor. Even though \
octopuses are colorblind, their skin can detect light directly, \
which is how they match colors so accurately.")``
- "Where does it talk about RNA editing?" (paragraph e15, offscreen) \
→ ``reply(answer="Here, in the paragraph about RNA editing.", \
scroll_to="e15", select_text="e15")``
- "How many neurons does an octopus have?" (no selection) → \
``reply(answer="About five hundred million, with two thirds of \
them in the arms.", select_text="e7")``
- "Hi, what's this article about?" (no selection) → \
``reply(answer="It's a short essay on octopus cognition. Select any \
paragraph and I'll explain it.")``"""
class DeixisWorker(ReplyToolMixin, UIWorker):
"""UIWorker that grounds in the user's selection and points back via select_text.
Composes ``ReplyToolMixin``, which exposes a single
``reply(answer, scroll_to=None, highlight=None, select_text=None, ...)``
LLM tool. The same bundle pointing apps use also covers
reading-style apps: ``select_text`` is for "this paragraph" / "the
section about X" (durable text selection), while ``highlight``
flashes briefly for short emphasis.
"""
def __init__(self):
llm = OpenAILLMService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAILLMService.Settings(system_instruction=UI_PROMPT),
)
super().__init__("ui", llm=llm)
async def answer_about_screen(params: FunctionCallParams, query: str):
"""Ask the screen-aware UI worker to answer about the article / selection.
Args:
query (str): The user's request, passed verbatim.
"""
logger.info(f"answer_about_screen('{query}')")
try:
async with params.pipeline_worker.job(
"ui", name="respond", payload={"query": query}, timeout=10
) as t:
pass
except JobError as e:
logger.warning(f"ui job failed: {e}")
await params.result_callback("Something went wrong on my side.")
return
speak = (t.response or {}).get("speak")
await params.result_callback(speak or "I'm not sure how to answer that.")
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info("Starting deixis bot")
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
stt = DeepgramSTTService(api_key=os.environ["DEEPGRAM_API_KEY"])
tts = CartesiaTTSService(
api_key=os.environ["CARTESIA_API_KEY"],
settings=CartesiaTTSService.Settings(
voice=os.getenv("CARTESIA_VOICE_ID", "71a7ad14-091c-4e8e-a314-022ece01c121"),
),
)
llm = OpenAILLMService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAILLMService.Settings(system_instruction=VOICE_PROMPT),
)
llm.register_direct_function(answer_about_screen, cancel_on_interruption=False, timeout_secs=30)
context = LLMContext(tools=ToolsSchema(standard_tools=[answer_about_screen]))
aggregators = LLMContextAggregatorPair(
context,
user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()),
)
pipeline = Pipeline(
[
transport.input(),
stt,
aggregators.user(),
llm,
tts,
transport.output(),
aggregators.assistant(),
]
)
worker = PipelineWorker(
pipeline,
name=MAIN_NAME,
params=PipelineParams(enable_metrics=True, enable_usage_metrics=True),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info("Client connected")
context.add_message(
{
"role": "developer",
"content": (
"Greet the user briefly. Tell them they can select a "
"paragraph and ask you to explain or rephrase it. One "
"short sentence."
),
}
)
await worker.queue_frame(LLMRunFrame())
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info("Client disconnected")
await runner.cancel()
await runner.launch_worker(DeixisWorker())
await runner.launch_worker(worker)
await runner.run()
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()