Files
engine-v5-pipecat-core/engine/text_stream.py

146 lines
5.2 KiB
Python

from __future__ import annotations
from typing import Any, Protocol
from pipecat.frames.frames import (
CancelFrame,
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMTextFrame,
OutputTransportMessageUrgentFrame,
TTSSpeakFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class _AssistantContextSync(Protocol):
@property
def context(self) -> Any: ...
def sync_streamed_assistant_context(
aggregator: _AssistantContextSync,
*,
streamed_text: str,
committed_text: str,
) -> None:
"""Align LLM context with UI text after an interrupted assistant turn.
The assistant aggregator only commits TTS-spoken text on interrupt. Replace
or append the streamed LLM text so the next turn sees what the user saw.
"""
streamed = streamed_text.strip()
if not streamed or streamed == committed_text.strip():
return
committed = committed_text.strip()
def _apply(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
updated = list(messages)
if committed and updated:
last = updated[-1]
if isinstance(last, dict) and last.get("role") == "assistant":
content = last.get("content")
if isinstance(content, str) and content.strip() == committed:
updated[-1] = {"role": "assistant", "content": streamed}
return updated
updated.append({"role": "assistant", "content": streamed})
return updated
aggregator.context.transform_messages(_apply)
class ProductTextStreamProcessor(FrameProcessor):
"""Mirrors LLM text frames as streaming protocol events.
Placed between the LLM service and the TTS service, this processor
observes the LLM's text frames as they're emitted and forwards them
downstream as ``OutputTransportMessageUrgentFrame``s that the product
serializer turns into ``response.text.{started,delta,final}`` events.
Urgent frames bypass TTS serialization and transport audio queues so text
reaches the client at least as quickly as synthesized audio.
``TTSSpeakFrame`` (used by the fixed-greeting code path, which bypasses
the LLM entirely) is also handled: the processor synthesizes a single
started/delta/final sequence for its fixed text.
"""
def __init__(self) -> None:
super().__init__()
self._aggregation: list[str] = []
self._turn_active = False
self._interrupted_stream_text: str | None = None
def take_interrupted_stream_text(self) -> str | None:
text = self._interrupted_stream_text
self._interrupted_stream_text = None
return text
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
await super().process_frame(frame, direction)
if isinstance(frame, LLMFullResponseStartFrame):
await self.push_frame(frame, direction)
await self._start_turn()
elif isinstance(frame, LLMTextFrame):
await self.push_frame(frame, direction)
if frame.text:
await self._delta(frame.text)
elif isinstance(frame, LLMFullResponseEndFrame):
await self.push_frame(frame, direction)
await self._end_turn(interrupted=False)
elif isinstance(frame, (InterruptionFrame, CancelFrame)):
await self.push_frame(frame, direction)
await self._end_turn(interrupted=True)
elif isinstance(frame, TTSSpeakFrame):
# Fixed-text / direct-speech path: there's no LLM cycle, so
# synthesize one started/delta/final sequence for the spoken text.
text = frame.text or ""
await self.push_frame(frame, direction)
await self._start_turn()
if text:
await self._delta(text)
await self._end_turn(interrupted=False)
else:
await self.push_frame(frame, direction)
async def _start_turn(self) -> None:
if self._turn_active:
return
self._turn_active = True
self._aggregation = []
await self._emit("response.text.started")
async def _delta(self, text: str) -> None:
if not self._turn_active:
# A text frame outside a turn shouldn't happen, but if it does,
# synthesize a started boundary so the client renders sensibly.
await self._start_turn()
self._aggregation.append(text)
await self._emit("response.text.delta", text=text)
async def _end_turn(self, *, interrupted: bool) -> None:
if not self._turn_active:
return
full_text = "".join(self._aggregation)
if interrupted and full_text:
self._interrupted_stream_text = full_text
self._turn_active = False
self._aggregation = []
await self._emit(
"response.text.final",
text=full_text,
interrupted=interrupted,
)
async def _emit(self, event_type: str, **payload: object) -> None:
await self.push_frame(
OutputTransportMessageUrgentFrame(
message={"type": event_type, **payload},
),
FrameDirection.DOWNSTREAM,
)