Add text_msg_prompt tool to DuplexPipeline and Assistants. Update DebugDrawer to handle text message prompts, including parameter validation and state management for displaying messages. Ensure integration with existing tools and maintain functionality across components.
This commit is contained in:
@@ -157,6 +157,17 @@ class DuplexPipeline:
|
||||
"required": ["msg"],
|
||||
},
|
||||
},
|
||||
"text_msg_prompt": {
|
||||
"name": "text_msg_prompt",
|
||||
"description": "Show a text message prompt dialog on client side",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"msg": {"type": "string", "description": "Message text to display"},
|
||||
},
|
||||
"required": ["msg"],
|
||||
},
|
||||
},
|
||||
}
|
||||
_DEFAULT_CLIENT_EXECUTORS = frozenset({
|
||||
"turn_on_camera",
|
||||
@@ -164,6 +175,7 @@ class DuplexPipeline:
|
||||
"increase_volume",
|
||||
"decrease_volume",
|
||||
"voice_message_prompt",
|
||||
"text_msg_prompt",
|
||||
})
|
||||
|
||||
def __init__(
|
||||
@@ -559,6 +571,10 @@ class DuplexPipeline:
|
||||
data = event.get("data")
|
||||
if not isinstance(data, dict):
|
||||
data = {}
|
||||
explicit_turn_id = str(event.get("turn_id") or "").strip() or None
|
||||
explicit_utterance_id = str(event.get("utterance_id") or "").strip() or None
|
||||
explicit_response_id = str(event.get("response_id") or "").strip() or None
|
||||
explicit_tts_id = str(event.get("tts_id") or "").strip() or None
|
||||
if self._current_turn_id:
|
||||
data.setdefault("turn_id", self._current_turn_id)
|
||||
if self._current_utterance_id:
|
||||
@@ -567,9 +583,29 @@ class DuplexPipeline:
|
||||
data.setdefault("response_id", self._current_response_id)
|
||||
if self._current_tts_id:
|
||||
data.setdefault("tts_id", self._current_tts_id)
|
||||
if explicit_turn_id:
|
||||
data["turn_id"] = explicit_turn_id
|
||||
if explicit_utterance_id:
|
||||
data["utterance_id"] = explicit_utterance_id
|
||||
if explicit_response_id:
|
||||
data["response_id"] = explicit_response_id
|
||||
if explicit_tts_id:
|
||||
data["tts_id"] = explicit_tts_id
|
||||
|
||||
for k, v in event.items():
|
||||
if k in {"type", "timestamp", "sessionId", "seq", "source", "trackId", "data"}:
|
||||
if k in {
|
||||
"type",
|
||||
"timestamp",
|
||||
"sessionId",
|
||||
"seq",
|
||||
"source",
|
||||
"trackId",
|
||||
"data",
|
||||
"turn_id",
|
||||
"utterance_id",
|
||||
"response_id",
|
||||
"tts_id",
|
||||
}:
|
||||
continue
|
||||
data.setdefault(k, v)
|
||||
|
||||
@@ -1027,25 +1063,50 @@ class DuplexPipeline:
|
||||
priority=30,
|
||||
)
|
||||
|
||||
async def _emit_llm_delta(self, text: str) -> None:
|
||||
async def _emit_llm_delta(
|
||||
self,
|
||||
text: str,
|
||||
*,
|
||||
turn_id: Optional[str] = None,
|
||||
utterance_id: Optional[str] = None,
|
||||
response_id: Optional[str] = None,
|
||||
) -> None:
|
||||
event = {
|
||||
**ev(
|
||||
"assistant.response.delta",
|
||||
trackId=self.track_audio_out,
|
||||
text=text,
|
||||
)
|
||||
}
|
||||
if turn_id:
|
||||
event["turn_id"] = turn_id
|
||||
if utterance_id:
|
||||
event["utterance_id"] = utterance_id
|
||||
if response_id:
|
||||
event["response_id"] = response_id
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"assistant.response.delta",
|
||||
trackId=self.track_audio_out,
|
||||
text=text,
|
||||
)
|
||||
},
|
||||
event,
|
||||
priority=20,
|
||||
)
|
||||
|
||||
async def _flush_pending_llm_delta(self) -> None:
|
||||
async def _flush_pending_llm_delta(
|
||||
self,
|
||||
*,
|
||||
turn_id: Optional[str] = None,
|
||||
utterance_id: Optional[str] = None,
|
||||
response_id: Optional[str] = None,
|
||||
) -> None:
|
||||
if not self._pending_llm_delta:
|
||||
return
|
||||
chunk = self._pending_llm_delta
|
||||
self._pending_llm_delta = ""
|
||||
self._last_llm_delta_emit_ms = time.monotonic() * 1000.0
|
||||
await self._emit_llm_delta(chunk)
|
||||
await self._emit_llm_delta(
|
||||
chunk,
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
|
||||
async def _outbound_loop(self) -> None:
|
||||
"""Single sender loop that enforces priority for interrupt events."""
|
||||
@@ -1761,7 +1822,9 @@ class DuplexPipeline:
|
||||
self._start_turn()
|
||||
if not self._current_utterance_id:
|
||||
self._finalize_utterance()
|
||||
self._start_response()
|
||||
turn_id = self._current_turn_id
|
||||
utterance_id = self._current_utterance_id
|
||||
response_id = self._start_response()
|
||||
# Start latency tracking
|
||||
self._turn_start_time = time.time()
|
||||
self._first_audio_sent = False
|
||||
@@ -1795,7 +1858,11 @@ class DuplexPipeline:
|
||||
|
||||
event = self._normalize_stream_event(raw_event)
|
||||
if event.type == "tool_call":
|
||||
await self._flush_pending_llm_delta()
|
||||
await self._flush_pending_llm_delta(
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
tool_call = event.tool_call if isinstance(event.tool_call, dict) else None
|
||||
if not tool_call:
|
||||
continue
|
||||
@@ -1869,7 +1936,11 @@ class DuplexPipeline:
|
||||
self._last_llm_delta_emit_ms <= 0.0
|
||||
or now_ms - self._last_llm_delta_emit_ms >= self._LLM_DELTA_THROTTLE_MS
|
||||
):
|
||||
await self._flush_pending_llm_delta()
|
||||
await self._flush_pending_llm_delta(
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
|
||||
if use_engine_sentence_split:
|
||||
while True:
|
||||
@@ -1905,7 +1976,10 @@ class DuplexPipeline:
|
||||
**ev(
|
||||
"output.audio.start",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
),
|
||||
"turn_id": turn_id,
|
||||
"utterance_id": utterance_id,
|
||||
"response_id": response_id,
|
||||
},
|
||||
priority=30,
|
||||
)
|
||||
@@ -1915,13 +1989,20 @@ class DuplexPipeline:
|
||||
sentence,
|
||||
fade_in_ms=0,
|
||||
fade_out_ms=8,
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
|
||||
if use_engine_sentence_split:
|
||||
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
||||
else:
|
||||
remaining_text = sentence_buffer.strip()
|
||||
await self._flush_pending_llm_delta()
|
||||
await self._flush_pending_llm_delta(
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
if (
|
||||
self._tts_output_enabled()
|
||||
and remaining_text
|
||||
@@ -1935,7 +2016,10 @@ class DuplexPipeline:
|
||||
**ev(
|
||||
"output.audio.start",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
),
|
||||
"turn_id": turn_id,
|
||||
"utterance_id": utterance_id,
|
||||
"response_id": response_id,
|
||||
},
|
||||
priority=30,
|
||||
)
|
||||
@@ -1944,6 +2028,9 @@ class DuplexPipeline:
|
||||
remaining_text,
|
||||
fade_in_ms=0,
|
||||
fade_out_ms=8,
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
@@ -2007,14 +2094,21 @@ class DuplexPipeline:
|
||||
]
|
||||
|
||||
if full_response and not self._interrupt_event.is_set():
|
||||
await self._flush_pending_llm_delta()
|
||||
await self._flush_pending_llm_delta(
|
||||
turn_id=turn_id,
|
||||
utterance_id=utterance_id,
|
||||
response_id=response_id,
|
||||
)
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"assistant.response.final",
|
||||
trackId=self.track_audio_out,
|
||||
text=full_response,
|
||||
)
|
||||
),
|
||||
"turn_id": turn_id,
|
||||
"utterance_id": utterance_id,
|
||||
"response_id": response_id,
|
||||
},
|
||||
priority=20,
|
||||
)
|
||||
@@ -2026,7 +2120,10 @@ class DuplexPipeline:
|
||||
**ev(
|
||||
"output.audio.end",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
),
|
||||
"turn_id": turn_id,
|
||||
"utterance_id": utterance_id,
|
||||
"response_id": response_id,
|
||||
}, priority=10)
|
||||
|
||||
# End assistant turn
|
||||
@@ -2049,7 +2146,15 @@ class DuplexPipeline:
|
||||
self._current_response_id = None
|
||||
self._current_tts_id = None
|
||||
|
||||
async def _speak_sentence(self, text: str, fade_in_ms: int = 0, fade_out_ms: int = 8) -> None:
|
||||
async def _speak_sentence(
|
||||
self,
|
||||
text: str,
|
||||
fade_in_ms: int = 0,
|
||||
fade_out_ms: int = 8,
|
||||
turn_id: Optional[str] = None,
|
||||
utterance_id: Optional[str] = None,
|
||||
response_id: Optional[str] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Synthesize and send a single sentence.
|
||||
|
||||
@@ -2086,7 +2191,10 @@ class DuplexPipeline:
|
||||
"metrics.ttfb",
|
||||
trackId=self.track_audio_out,
|
||||
latencyMs=round(ttfb_ms),
|
||||
)
|
||||
),
|
||||
"turn_id": turn_id,
|
||||
"utterance_id": utterance_id,
|
||||
"response_id": response_id,
|
||||
}, priority=25)
|
||||
|
||||
# Double-check interrupt right before sending audio
|
||||
@@ -2233,6 +2341,9 @@ class DuplexPipeline:
|
||||
self._is_bot_speaking = False
|
||||
self._drop_outbound_audio = True
|
||||
self._audio_out_frame_buffer = b""
|
||||
interrupted_turn_id = self._current_turn_id
|
||||
interrupted_utterance_id = self._current_utterance_id
|
||||
interrupted_response_id = self._current_response_id
|
||||
|
||||
# Send interrupt event to client IMMEDIATELY
|
||||
# This must happen BEFORE canceling services, so client knows to discard in-flight audio
|
||||
@@ -2240,7 +2351,10 @@ class DuplexPipeline:
|
||||
**ev(
|
||||
"response.interrupted",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
),
|
||||
"turn_id": interrupted_turn_id,
|
||||
"utterance_id": interrupted_utterance_id,
|
||||
"response_id": interrupted_response_id,
|
||||
}, priority=0)
|
||||
|
||||
# Cancel TTS
|
||||
|
||||
Reference in New Issue
Block a user