ai-video-fullstack/backend/services/pipecat/pipeline.py

"""管线核心:给定一个 transport + 配置,跑完整的语音闭环。

关键设计:**transport 由调用方传入**,管线本身不关心是 WebRTC 还是 WS。
这就是"同时支持多种输出"的落点——加输出方式不用动这里。

对应 dograh 的 pipeline_builder.py + run_pipeline.py(已砍掉 workflow 引擎/DB/录音/指标)。
"""

from uuid import uuid4

import config
from loguru import logger
from models import AssistantConfig
from services.brains import build_brain
from services.pipecat.service_factory import (
    create_realtime_service,
    create_stt,
    create_tts,
)
from services.workflow_engine import WorkflowEngine

from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
    BotStartedSpeakingFrame,
    BotStoppedSpeakingFrame,
    EndFrame,
    InputTransportMessageFrame,
    InterruptionFrame,
    LLMFullResponseEndFrame,
    LLMFullResponseStartFrame,
    LLMTextFrame,
    LLMMessagesAppendFrame,
    OutputTransportMessageUrgentFrame,
    TextFrame,
    TTSSpeakFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.worker import PipelineParams, PipelineWorker
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
    LLMAssistantAggregator,
    LLMUserAggregator,
    LLMUserAggregatorParams,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.turns.user_start import (
    TranscriptionUserTurnStartStrategy,
    VADUserTurnStartStrategy,
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies
from pipecat.utils.time import time_now_iso8601
from pipecat.workers.runner import WorkerRunner


def _text_input(message) -> tuple[str, bool] | None:
    """解析现有 user-text 与 RTVI send-text 两种前端文字消息。"""
    if not isinstance(message, dict):
        return None
    if message.get("type") == "user-text":
        text = str(message.get("text") or "").strip()
        return (text, True) if text else None
    if message.get("type") == "send-text":
        data = message.get("data")
        if not isinstance(data, dict):
            return None
        text = str(data.get("content") or "").strip()
        options = data.get("options")
        run_immediately = not isinstance(options, dict) or options.get(
            "run_immediately", True
        )
        return (text, bool(run_immediately)) if text else None
    return None


class TextInputProcessor(FrameProcessor):
    """Turn transport text messages into events for the cascade pipeline.

    Only ``InputTransportMessageFrame`` frames are inspected; every other
    frame is forwarded unchanged. Recognised messages are consumed (not
    forwarded) and reported through one of three events:

    * ``on_client_ready`` — a ``{"type": "client-ready"}`` message arrived.
    * ``on_text_input(text)`` — text that should run immediately. The event
      is fired first so the handler can record the text, and only then is
      ``broadcast_interruption()`` called to cut off the current speech.
    * ``on_text_append(text)`` — text sent with ``run_immediately=False``
      (RTVI ``send-text``); no interruption is broadcast.

    What happens after each event (context update, LLM run) is decided by
    the registered handlers, not by this processor.
    """

    def __init__(self):
        super().__init__()
        # Immediate text (with interruption semantics), silent append, and the
        # client-ready notification each get their own event.
        for event_name in ("on_text_input", "on_text_append", "on_client_ready"):
            self._register_event_handler(event_name)

    async def process_frame(self, frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, InputTransportMessageFrame):
            payload = frame.message

            if isinstance(payload, dict) and payload.get("type") == "client-ready":
                await self._call_event_handler("on_client_ready")
                return

            parsed = _text_input(payload)
            if parsed:
                content, immediate = parsed
                if not immediate:
                    await self._call_event_handler("on_text_append", content)
                    return
                # Register the text before interrupting, so the handler has
                # it recorded by the time the interruption is processed.
                await self._call_event_handler("on_text_input", content)
                await self.broadcast_interruption()
                return

        # Not a transport message, or a transport message we do not recognise.
        await self.push_frame(frame, direction)


class RealtimeTextInputProcessor(FrameProcessor):
    """Forward text input to a realtime service, without cascade semantics.

    Text messages found in ``InputTransportMessageFrame`` frames are consumed
    and reported as ``on_text_input(text)`` (run immediately) or
    ``on_text_append(text)`` (silent append). No interruption is broadcast
    here. All other frames are forwarded unchanged.
    """

    def __init__(self):
        super().__init__()
        for event_name in ("on_text_input", "on_text_append"):
            self._register_event_handler(event_name)

    async def process_frame(self, frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        parsed = None
        if isinstance(frame, InputTransportMessageFrame):
            parsed = _text_input(frame.message)

        if not parsed:
            # Not a text message for us: let it continue through the pipeline.
            await self.push_frame(frame, direction)
            return

        content, immediate = parsed
        if immediate:
            event_name = "on_text_input"
        else:
            event_name = "on_text_append"
        await self._call_event_handler(event_name, content)


class PassthroughLLMAssistantAggregator(LLMAssistantAggregator):
    """Aggregate the LLM reply into the context while still forwarding it to TTS.

    On top of the base aggregator this class:

    * re-pushes the LLM response start/end frames and text frames so that
      processors placed after it still receive the reply;
    * tracks the reply currently being streamed and reports it through the
      events ``on_assistant_text_start(turn_id, timestamp)``,
      ``on_assistant_text_delta(turn_id, delta)`` and
      ``on_assistant_text_end(turn_id, full_text, interrupted)``;
    * fires ``on_interruption_processed`` after an ``InterruptionFrame`` has
      gone through the base class and the open stream (if any) was closed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._register_event_handler("on_interruption_processed")
        self._register_event_handler("on_assistant_text_start")
        self._register_event_handler("on_assistant_text_delta")
        self._register_event_handler("on_assistant_text_end")
        # State of the reply currently being streamed; turn id is None when
        # no reply is open.
        self._stream_turn_id: str | None = None
        self._stream_timestamp = ""
        self._stream_text = ""

    async def process_frame(self, frame, direction: FrameDirection):
        # Let the base aggregator handle the frame first; everything below
        # runs after it has done its own processing.
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMFullResponseStartFrame):
            # A new reply begins: give it a fresh id and reset the buffer.
            self._stream_turn_id = uuid4().hex
            self._stream_timestamp = time_now_iso8601()
            self._stream_text = ""
            await self._call_event_handler(
                "on_assistant_text_start",
                self._stream_turn_id,
                self._stream_timestamp,
            )
        elif isinstance(frame, LLMTextFrame) and self._stream_turn_id:
            # Text outside an open reply is not accumulated or reported.
            self._stream_text += frame.text
            await self._call_event_handler(
                "on_assistant_text_delta",
                self._stream_turn_id,
                frame.text,
            )
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self._finish_text_stream(interrupted=False)

        # LLMAssistantAggregator consumes these frames by default. Because this
        # aggregator sits in front of TTS (so the already generated prefix can
        # be saved on interruption), they must be forwarded explicitly,
        # otherwise TTS would never receive any LLM reply.
        if isinstance(
            frame,
            (LLMFullResponseStartFrame, LLMFullResponseEndFrame, TextFrame),
        ):
            await self.push_frame(frame, direction)
        elif isinstance(frame, InterruptionFrame):
            # Close the open reply as interrupted, then tell listeners that
            # the interruption has been handled here.
            await self._finish_text_stream(interrupted=True)
            await self._call_event_handler("on_interruption_processed")

    async def _finish_text_stream(self, *, interrupted: bool):
        """Close the open reply, if any, and fire ``on_assistant_text_end``.

        Does nothing when no reply is open, so each reply is reported as
        ended at most once.
        """
        if not self._stream_turn_id:
            return
        await self._call_event_handler(
            "on_assistant_text_end",
            self._stream_turn_id,
            self._stream_text,
            interrupted,
        )
        self._stream_turn_id = None
        self._stream_timestamp = ""
        self._stream_text = ""


async def run_pipeline(transport, cfg: AssistantConfig) -> None:
    """Build and run the pipeline on the given transport until the call ends.

    When ``cfg.runtimeMode`` is ``"realtime"`` and the brain built for ``cfg``
    supports it, the call is handed to ``run_realtime_pipeline``. When the
    brain does not support realtime, ``cfg.runtimeMode`` is overwritten with
    ``"pipeline"`` and the cascade (STT -> LLM -> TTS) below is used.

    Args:
        transport: Any pipecat transport (WebRTC / WS / telephony ...); it only
            needs ``.input()``, ``.output()`` and ``event_handler``.
        cfg: Assistant configuration, passed inline with the request.
    """
    logger.info(f"启动管线: assistant={cfg.name} type={cfg.type} mode={cfg.runtimeMode}")

    # Brain: decides the LLM slot, greeting and context ownership per type.
    # One instance per call, so it may hold session state.
    brain = build_brain(cfg)
    if (
        cfg.runtimeMode == "realtime"
        and "realtime" not in brain.spec.supported_runtime_modes
    ):
        logger.warning(f"类型 {cfg.type} 不支持 realtime,回退 cascade")
        cfg.runtimeMode = "pipeline"

    if cfg.runtimeMode == "realtime":
        await run_realtime_pipeline(transport, cfg)
        return

    stt = create_stt(cfg)
    tts = create_tts(cfg)

    # ---- workflow graph engine (optional) ----
    # With a node graph the call is graph-driven: greeting and system prompt
    # come from the start node, and transitions are routed between nodes.
    engine = WorkflowEngine(cfg.graph or {})
    workflow_active = engine.has_graph()
    wf_state = {
        # The start node is itself a conversation node (own prompt, may take
        # several turns), so the call starts there.
        "current": engine.start_id if workflow_active else None,
        "ended": False,
        "turns_in_node": 0,
        # Precise hang-up timing: only hang up once the end node's own closing
        # statement has actually been spoken.
        "end_turn_id": None,   # turn_id of the end node's reply (its text_start comes after `ended`)
        "end_armed": False,    # closing text fully generated (already sent on the data channel)
        "end_speaking": False,  # closing audio has started playing
        "end_frame_queued": False,
    }
    history: list[dict] = []
    # fallback_route() does nothing until the assistant has completed this
    # many turns in the current node.
    FALLBACK_AFTER_TURNS = 2

    if workflow_active:
        greeting = engine.greeting() or cfg.greeting
        system_content = engine.system_prompt_for(wf_state["current"])
        logger.info(
            f"工作流模式启用: 起始节点={engine.name(wf_state['current'])}"
        )
    elif brain.spec.owns_context:
        greeting = cfg.greeting
        system_content = cfg.prompt
    else:
        # Externally hosted (fastgpt etc.): the greeting comes from the remote
        # backend, and the system prompt / context is not maintained here.
        greeting = await brain.greeting(cfg)
        system_content = ""

    context = LLMContext(messages=[{"role": "system", "content": system_content}])
    # The LLM slot is provided by the brain (e.g. an OpenAI-compatible service
    # for internal types, or an SDK-wrapping pseudo LLM for fastgpt).
    llm = brain.build_llm(cfg, context)
    user_aggregator = LLMUserAggregator(
        context,
        params=LLMUserAggregatorParams(
            vad_analyzer=SileroVADAnalyzer(),
            user_turn_strategies=UserTurnStrategies(
                start=[
                    VADUserTurnStartStrategy(enable_interruptions=cfg.enableInterrupt),
                    TranscriptionUserTurnStartStrategy(
                        enable_interruptions=cfg.enableInterrupt
                    ),
                ]
            ),
        ),
    )
    assistant_aggregator = PassthroughLLMAssistantAggregator(context)
    text_input = TextInputProcessor()

    # End node: wait until the closing statement has been spoken
    # (BotStoppedSpeakingFrame) before hanging up, so both its text (data
    # channel) and its audio have been delivered to the front end.
    # The worker is created after the pipeline, hence the late-bound holder.
    worker_holder: dict = {}

    class EndCallAfterSpeech(FrameProcessor):
        """Queue the hang-up once the end node's closing statement was spoken."""

        async def process_frame(self, frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            await self.push_frame(frame, direction)
            # Closing text generated (end_armed) -> its audio starts
            # (end_speaking) -> hang up only when that audio stops.
            # Pairing started/stopped keeps a "stopped" event from speech
            # before the end node (e.g. an answer given just before the
            # transition) from hanging up early and truncating the closing.
            if isinstance(frame, BotStartedSpeakingFrame) and wf_state["end_armed"]:
                wf_state["end_speaking"] = True
            elif (
                isinstance(frame, BotStoppedSpeakingFrame)
                and wf_state["end_speaking"]
                and not wf_state["end_frame_queued"]
                and worker_holder.get("worker") is not None
            ):
                wf_state["end_frame_queued"] = True
                logger.info("结束语播报完毕,挂断通话")
                # First tell the front end this is a normal completion (not a
                # connection failure), then end the pipeline gracefully.
                await worker_holder["worker"].queue_frame(
                    OutputTransportMessageUrgentFrame(
                        message={"type": "call-ended", "reason": "completed"}
                    )
                )
                await worker_holder["worker"].queue_frame(EndFrame())

    pipeline = Pipeline(
        [
            transport.input(),
            text_input,
            stt,
            user_aggregator,
            llm,
            # Aggregate the streamed LLM text before TTS. On interruption,
            # Pipecat commits the generated prefix immediately instead of
            # waiting for a TTS provider to emit spoken-text/timestamp frames.
            assistant_aggregator,
            tts,
            EndCallAfterSpeech(),
            transport.output(),
        ]
    )

    worker = PipelineWorker(
        pipeline,
        params=PipelineParams(
            enable_metrics=False,
        ),
        enable_rtvi=False,
    )
    worker_holder["worker"] = worker

    async def queue_transcript(role: str, content: str, timestamp: str) -> None:
        """Send a transcript message to the front end; empty content is skipped."""
        if content:
            await worker.queue_frame(
                OutputTransportMessageUrgentFrame(
                    message={
                        "type": "transcript",
                        "role": role,
                        "content": content,
                        "timestamp": timestamp,
                    },
                )
            )

    greeting_transcript_sent = False
    # Immediate text inputs waiting for their interruption to be processed.
    pending_text_inputs: list[str] = []

    async def emit_node_active(node_id: str | None) -> None:
        """Tell the front end which node is active so the canvas can highlight it."""
        if node_id:
            await worker.queue_frame(
                OutputTransportMessageUrgentFrame(
                    message={"type": "node-active", "nodeId": node_id}
                )
            )

    def set_system_prompt(text: str) -> None:
        """Replace the system prompt in the context (whole replacement, not append)."""
        messages = context.get_messages()
        if messages and messages[0].get("role") == "system":
            messages[0] = {"role": "system", "content": text}
        else:
            messages.insert(0, {"role": "system", "content": text})

    def apply_node(node_id: str | None) -> None:
        """Enter a node: set its system prompt and expose its outgoing edges as tools."""
        set_system_prompt(engine.system_prompt_for(node_id))
        if engine.is_end(node_id):
            context.set_tools()  # end node: no tools
            return
        schemas = [
            FunctionSchema(
                name=engine.edge_fn_name(edge),
                description=engine.edge_description(edge),
                properties={},
                required=[],
            )
            for edge in engine.outgoing(node_id)
        ]
        if schemas:
            context.set_tools(ToolsSchema(standard_tools=schemas))
        else:
            context.set_tools()  # no outgoing edges: clear the tools

    async def go_to_node(target: str) -> None:
        """Perform a transition to ``target``.

        Switches the current node, resets the per-node turn counter, notifies
        the front end, and applies the node's prompt and tools.

        For an end node the ``ended`` flag is set and ``apply_node`` clears the
        tools. The first reply that starts afterwards is treated as the
        closing statement; ``EndCallAfterSpeech`` queues the hang-up once that
        statement has been spoken.
        """
        wf_state["current"] = target
        wf_state["turns_in_node"] = 0
        if engine.is_end(target):
            wf_state["ended"] = True
        await emit_node_active(target)
        apply_node(target)

    async def speak_transition(edge: dict | None) -> None:
        """Speak the edge's optional transition phrase; it is not written to the context.

        Intended to mask the latency of switching nodes and generating the
        next reply.
        """
        speech = engine.edge_transition_speech(edge)
        if speech:
            await worker.queue_frame(TTSSpeakFrame(speech, append_to_context=False))

    def make_transition_handler(edge: dict):
        """Build the LLM function handler that follows ``edge``."""
        target = edge.get("target")

        async def handler(params):
            logger.info(f"LLM 触发转移 → {engine.name(target)}")
            # No transition phrase when entering an end node: the closing
            # statement is the wrap-up, and extra speech would disturb the
            # hang-up timing.
            if not engine.is_end(target):
                await speak_transition(edge)
            await go_to_node(target)
            # Return the tool result; generation then continues under the new
            # node's prompt and tools.
            await params.result_callback({"status": "ok"})

        return handler

    async def fallback_route() -> None:
        """Text fallback: ask the engine's router whether to transition.

        Used when the model has not called a transition tool after
        ``FALLBACK_AFTER_TURNS`` completed turns in the current node.
        """
        if not workflow_active or wf_state["ended"]:
            return
        if wf_state["turns_in_node"] < FALLBACK_AFTER_TURNS:
            return
        if not engine.outgoing(wf_state["current"]):
            return
        target = await engine.route(
            wf_state["current"],
            history,
            api_key=cfg.llm_api_key or config.LLM_API_KEY,
            base_url=cfg.llm_base_url or config.LLM_BASE_URL,
            model=cfg.model or config.LLM_MODEL,
        )
        if target and target != wf_state["current"]:
            logger.info(f"文本兜底触发转移 → {engine.name(target)}")
            if not engine.is_end(target):
                await speak_transition(engine.find_edge(wf_state["current"], target))
            # Only the node prompt/tools are switched here; the next user
            # input is then handled in the new node.
            await go_to_node(target)

    # Register every edge as an LLM-callable transition function (uniquely
    # named per edge, registered once). Which ones are visible at any moment
    # is controlled by the tools set on the context for the current node.
    if workflow_active:
        for edge in engine.edges:
            if edge.get("target"):
                llm.register_function(
                    engine.edge_fn_name(edge), make_transition_handler(edge)
                )
        apply_node(wf_state["current"])  # prompt and tools of the initial node

    async def append_user_text_to_context(text: str, *, run_llm: bool) -> None:
        """Queue a user message for the context, optionally triggering the LLM."""
        await worker.queue_frame(
            LLMMessagesAppendFrame(
                messages=[{"role": "user", "content": text}],
                run_llm=run_llm,
            )
        )

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(_aggregator, _strategy, message):
        if message.content:
            history.append({"role": "user", "content": message.content})
        await queue_transcript("user", message.content, message.timestamp)

    @assistant_aggregator.event_handler("on_assistant_text_start")
    async def on_assistant_text_start(_aggregator, turn_id, timestamp):
        # After entering the end node, the first reply that *starts* is the
        # end node's own closing statement: its text_start happens after
        # `ended` was set, so the sentence spoken before the transition is
        # not mistaken for it.
        if (
            workflow_active
            and wf_state["ended"]
            and wf_state["end_turn_id"] is None
        ):
            wf_state["end_turn_id"] = turn_id
        await worker.queue_frame(
            OutputTransportMessageUrgentFrame(
                message={
                    "type": "assistant-text-start",
                    "turn_id": turn_id,
                    "timestamp": timestamp,
                }
            )
        )

    @assistant_aggregator.event_handler("on_assistant_text_delta")
    async def on_assistant_text_delta(_aggregator, turn_id, delta):
        await worker.queue_frame(
            OutputTransportMessageUrgentFrame(
                message={
                    "type": "assistant-text-delta",
                    "turn_id": turn_id,
                    "delta": delta,
                }
            )
        )

    @assistant_aggregator.event_handler("on_assistant_text_end")
    async def on_assistant_text_end(_aggregator, turn_id, content, interrupted):
        await worker.queue_frame(
            OutputTransportMessageUrgentFrame(
                message={
                    "type": "assistant-text-end",
                    "turn_id": turn_id,
                    "content": content,
                    "interrupted": interrupted,
                }
            )
        )
        # Only replies that produced text and were not interrupted count as
        # completed: they go into the history and drive the workflow.
        completed = bool(content) and not interrupted
        if completed:
            history.append({"role": "assistant", "content": content})
        if not workflow_active:
            return

        if turn_id == wf_state["end_turn_id"]:
            if completed:
                # The closing text is fully generated (and already sent on the
                # data channel): arm the hang-up. The EndFrame itself is
                # queued by EndCallAfterSpeech when the audio finishes.
                wf_state["end_armed"] = True
            else:
                # The closing statement was interrupted or came back empty, so
                # it will never arm the hang-up. Release the slot so the next
                # reply in the end node becomes the closing statement;
                # otherwise end_turn_id stays bound to a dead turn and the
                # workflow can never hang up on its own.
                wf_state["end_turn_id"] = None
                logger.info("结束语未完整生成,等待下一条回复作为结束语")
        elif completed and not wf_state["ended"]:
            # Normally the LLM transitions by calling a transition tool
            # (go_to_node); here we only count the turn and, if needed, try
            # the text fallback router.
            wf_state["turns_in_node"] += 1
            await fallback_route()

    @text_input.event_handler("on_text_input")
    async def on_text_input(_processor, text):
        pending_text_inputs.append(text)
        history.append({"role": "user", "content": text})
        # The front-end display must not depend on what follows the
        # interruption, so the transcript is queued before interrupting.
        await queue_transcript("user", text, time_now_iso8601())

    @assistant_aggregator.event_handler("on_interruption_processed")
    async def on_interruption_processed(_aggregator):
        if not pending_text_inputs:
            return
        text = pending_text_inputs.pop(0)
        # The assistant aggregator has finished handling the interruption, so
        # the next LLM run can be started without being cancelled by it.
        await append_user_text_to_context(text, run_llm=True)

    @text_input.event_handler("on_text_append")
    async def on_text_append(_processor, text):
        # Silent append: written to the context without interrupting or
        # triggering inference; the transcript is still reported.
        history.append({"role": "user", "content": text})
        await queue_transcript("user", text, time_now_iso8601())
        await append_user_text_to_context(text, run_llm=False)

    @text_input.event_handler("on_client_ready")
    async def on_client_ready(_processor):
        nonlocal greeting_transcript_sent
        # Send the greeting transcript at most once per call.
        if greeting and not greeting_transcript_sent:
            greeting_transcript_sent = True
            await queue_transcript("assistant", greeting, time_now_iso8601())

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        if greeting:
            # For externally hosted types the context lives on the remote
            # side, so the greeting is not written to the local context.
            if brain.spec.owns_context:
                context.add_message({"role": "assistant", "content": greeting})
            await worker.queue_frame(TTSSpeakFrame(greeting, append_to_context=False))
        # Workflow: highlight the current (start) node, which is also the
        # first conversation node.
        if workflow_active:
            await emit_node_active(wf_state["current"])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
        logger.info("对端断开,结束管线")
        await worker.queue_frame(EndFrame())

    runner = WorkerRunner(handle_sigint=False)
    await runner.add_workers(worker)
    await runner.run()
    logger.info("管线已结束")


async def run_realtime_pipeline(transport, cfg: AssistantConfig) -> None:
    """Run a speech-to-speech model that owns ASR, reasoning, and synthesis.

    The pipeline is ``transport.input() -> text input -> realtime service ->
    transport.output()`` and runs until the worker finishes.

    Args:
        transport: The pipecat transport to read from and write to.
        cfg: Assistant configuration. ``cfg.realtime_values`` may carry
            ``inputSampleRate`` / ``outputSampleRate``; each falls back to
            24000 when missing or falsy.
    """
    realtime = create_realtime_service(cfg)
    text_input = RealtimeTextInputProcessor()

    stages = [
        transport.input(),
        text_input,
        realtime,
        transport.output(),
    ]
    pipeline = Pipeline(stages)

    def sample_rate(key: str) -> int:
        # Missing or falsy values fall back to 24000.
        return int(cfg.realtime_values.get(key) or 24000)

    worker_params = PipelineParams(
        enable_metrics=False,
        audio_in_sample_rate=sample_rate("inputSampleRate"),
        audio_out_sample_rate=sample_rate("outputSampleRate"),
    )
    worker = PipelineWorker(pipeline, params=worker_params, enable_rtvi=False)

    async def queue_transcript(role: str, content: str) -> None:
        # Empty content is never reported to the front end.
        if not content:
            return
        payload = {
            "type": "transcript",
            "role": role,
            "content": content,
            "timestamp": time_now_iso8601(),
        }
        await worker.queue_frame(OutputTransportMessageUrgentFrame(message=payload))

    @text_input.event_handler("on_text_input")
    async def on_text_input(_processor, text):
        # Report the text first, then cut off the current output and submit
        # the text for an immediate response.
        await queue_transcript("user", text)
        await realtime.interrupt()
        await realtime.send_text(text, run_immediately=True)

    @text_input.event_handler("on_text_append")
    async def on_text_append(_processor, text):
        # Silent append: reported and sent, but without interrupting.
        await queue_transcript("user", text)
        await realtime.send_text(text, run_immediately=False)

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        opening = cfg.greeting
        if opening:
            await realtime.speak(opening)

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
        logger.info("Realtime 对端断开,结束管线")
        await worker.queue_frame(EndFrame())

    runner = WorkerRunner(handle_sigint=False)
    await runner.add_workers(worker)
    await runner.run()
    logger.info("Realtime 管线已结束")