# engine-v5-pipecat-core/engine/text_stream.py

from __future__ import annotations

from dataclasses import dataclass

from loguru import logger

from pipecat.frames.frames import (
    Frame,
    InterruptionFrame,
    LLMFullResponseEndFrame,
    LLMFullResponseStartFrame,
    LLMTextFrame,
    OutputTransportMessageUrgentFrame,
    TTSSpeakFrame,
)
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.time import time_now_iso8601


@dataclass
class ProductAssistantTurnStoppedMessage:
    """Payload handed to ``on_assistant_turn_stopped`` event handlers.

    One instance is built by ``ProductTextStreamProcessor`` each time an
    active assistant turn is closed.
    """

    # Concatenation of every text delta collected during the turn
    # (empty string when the turn produced no text).
    content: str
    # True when the turn was closed by an ``InterruptionFrame``; False when it
    # ended via ``LLMFullResponseEndFrame`` or the ``TTSSpeakFrame`` path.
    interrupted: bool
    # ISO-8601 timestamp recorded when the turn started; the processor falls
    # back to the current time when no start timestamp is available.
    timestamp: str


class ProductTextStreamProcessor(FrameProcessor):
    """Mirrors LLM text frames as streaming protocol events.

    Placed between the LLM service and the TTS service, this processor
    observes the LLM's text frames as they're emitted and forwards them
    downstream as ``OutputTransportMessageUrgentFrame``s that the product
    serializer turns into ``response.text.{started,delta,final}`` events.

    Because the events are emitted before the TTS holds onto
    ``LLMFullResponseEndFrame`` to drain its audio queue, text reaches the
    client well ahead of (or at worst, alongside) the synthesized audio.

    ``TTSSpeakFrame`` (used by the fixed-greeting code path, which bypasses
    the LLM entirely) is also handled: the processor synthesizes a single
    started/delta/final sequence for its fixed text.

    When a turn closes, its accumulated text (if non-empty) is appended to
    the optional ``LLMContext`` as an assistant message, and the
    ``on_assistant_turn_stopped`` event fires with a
    ``ProductAssistantTurnStoppedMessage``. Every observed frame is always
    re-pushed unchanged in its original direction.
    """

    def __init__(self, context: LLMContext | None = None) -> None:
        """Create the processor.

        Args:
            context: Optional LLM context. When provided, the non-empty text
                of each finished turn is added to it as an
                ``{"role": "assistant", ...}`` message.
        """
        super().__init__()
        self._context = context
        # Text deltas collected for the turn currently in progress.
        self._aggregation: list[str] = []
        self._turn_active = False
        # ISO-8601 start time of the active turn; "" when no turn is active.
        self._turn_start_timestamp = ""
        self._register_event_handler("on_assistant_turn_stopped")

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Translate turn-related frames into text events, then forward the frame.

        Args:
            frame: The frame being observed.
            direction: Direction the frame is travelling; the frame is
                re-pushed in this same direction after any events are emitted.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMFullResponseStartFrame):
            await self._start_turn()
        elif isinstance(frame, LLMTextFrame):
            # Empty text frames carry nothing to mirror; skip them.
            if frame.text:
                await self._delta(frame.text)
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self._end_turn(interrupted=False)
        elif isinstance(frame, InterruptionFrame):
            # No-op unless a turn is currently active (see ``_end_turn``).
            await self._end_turn(interrupted=True)
        elif isinstance(frame, TTSSpeakFrame):
            # Fixed-text / direct-speech path: there's no LLM cycle, so
            # synthesize one started/delta/final sequence for the spoken text.
            # NOTE: if a turn is already active, ``_start_turn`` is a no-op, so
            # the spoken text is appended to that turn and the turn is
            # finalized here.
            text = frame.text or ""
            await self._start_turn()
            if text:
                await self._delta(text)
            await self._end_turn(interrupted=False)

        # Text events are pushed before the triggering frame itself.
        await self.push_frame(frame, direction)

    async def _start_turn(self) -> None:
        """Open a new turn and emit ``response.text.started``.

        Does nothing when a turn is already active.
        """
        if self._turn_active:
            return
        self._turn_active = True
        self._aggregation = []
        self._turn_start_timestamp = time_now_iso8601()
        await self._emit("response.text.started")

    async def _delta(self, text: str) -> None:
        """Record one text chunk and emit it as ``response.text.delta``.

        Args:
            text: The chunk of assistant text to append and forward.
        """
        if not self._turn_active:
            # A text frame outside a turn shouldn't happen, but if it does,
            # synthesize a started boundary so the client renders sensibly.
            await self._start_turn()
        self._aggregation.append(text)
        await self._emit("response.text.delta", text=text)

    async def _end_turn(self, *, interrupted: bool) -> None:
        """Close the active turn, commit its text, and notify listeners.

        Does nothing when no turn is active. Otherwise, in order: the
        accumulated text is added to the LLM context (when a context was
        supplied and the text is non-empty), ``response.text.final`` is
        emitted, and ``on_assistant_turn_stopped`` handlers are invoked.

        Args:
            interrupted: Whether the turn is being closed because of an
                interruption rather than a normal end of response.
        """
        if not self._turn_active:
            return
        # Snapshot and reset *all* per-turn state before the first await, so a
        # turn that begins while this coroutine is suspended can neither have
        # its start timestamp reported for this turn nor have it cleared below.
        full_text = "".join(self._aggregation)
        started_at = self._turn_start_timestamp
        self._turn_active = False
        self._aggregation = []
        self._turn_start_timestamp = ""
        # Explicit ``is not None``: the commit must not depend on whether the
        # context object happens to evaluate as truthy.
        if self._context is not None and full_text:
            self._context.add_message({"role": "assistant", "content": full_text})
            logger.info(
                "Assistant committed to LLM context before TTS: "
                f"{full_text[:120]}"
            )
        await self._emit(
            "response.text.final",
            text=full_text,
            interrupted=interrupted,
        )
        await self._call_event_handler(
            "on_assistant_turn_stopped",
            ProductAssistantTurnStoppedMessage(
                content=full_text,
                interrupted=interrupted,
                # Fall back to "now" if no start timestamp was recorded.
                timestamp=started_at or time_now_iso8601(),
            ),
        )

    async def _emit(self, event_type: str, **payload: object) -> None:
        """Push a protocol event downstream as an urgent transport message.

        Args:
            event_type: Value stored under the message's ``"type"`` key.
            **payload: Extra key/value pairs merged into the message dict.
        """
        await self.push_frame(
            OutputTransportMessageUrgentFrame(
                message={"type": event_type, **payload},
            ),
            FrameDirection.DOWNSTREAM,
        )