Merge branch 'main' of https://gitea.xiaowang.eu.org/wx44wx/engine-v5-pipecat-core

2026-05-29 12:56:47 +08:00
parent cc418e5f7e b6cb72eda7
commit 4c4e3e949f
3 changed files with 22 additions and 112 deletions
--- a/config/openai.example.json
+++ b/config/openai.example.json
@@ -1,43 +0,0 @@
-{
-  "server": {
-    "host": "0.0.0.0",
-    "port": 8000,
-    "cors_origins": ["*"]
-  },
-  "audio": {
-    "sample_rate_hz": 16000,
-    "channels": 1,
-    "frame_ms": 20
-  },
-  "session": {
-    "inactivity_timeout_sec": 60
-  },
-  "agent": {
-    "system_prompt": "You are a concise voice assistant.",
-    "greeting": "Please say hello in one short sentence.",
-    "greeting_mode": "generated"
-  },
-  "services": {
-    "stt": {
-      "provider": "openai",
-      "api_key": "",
-      "base_url": null,
-      "model": "gpt-4o-mini-transcribe",
-      "language": "en"
-    },
-    "llm": {
-      "provider": "openai",
-      "api_key": "",
-      "base_url": null,
-      "model": "gpt-4o-mini",
-      "temperature": 0.7
-    },
-    "tts": {
-      "provider": "openai",
-      "api_key": "",
-      "base_url": null,
-      "model": "gpt-4o-mini-tts",
-      "voice": "alloy"
-    }
-  }
-}
--- a/engine/pipeline.py
+++ b/engine/pipeline.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import asyncio
 import uuid

 from loguru import logger
@@ -8,9 +7,6 @@ from loguru import logger
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotStartedSpeakingFrame,
-    BotStoppedSpeakingFrame,
-    Frame,
    LLMRunFrame,
    OutputTransportMessageUrgentFrame,
    TTSSpeakFrame,
@@ -150,6 +146,7 @@ async def run_pipeline_with_serializer(
        user_params=LLMUserAggregatorParams(
            vad_analyzer=SileroVADAnalyzer(params=vad_params),
            user_turn_strategies=user_turn_strategies,
+            user_idle_timeout=config.turn.idle_prompt_timeout_sec,
        ),
    )

@@ -192,59 +189,7 @@ async def run_pipeline_with_serializer(
        ),
        idle_timeout_secs=config.session.inactivity_timeout_sec,
    )
-    task.set_reached_upstream_filter((BotStartedSpeakingFrame, BotStoppedSpeakingFrame))
    idle_prompt_count = 0
-    idle_prompt_speaking = False
-    idle_prompt_task: asyncio.Task | None = None
-
-    async def cancel_idle_prompt_timer() -> None:
-        nonlocal idle_prompt_task
-        timer = idle_prompt_task
-        idle_prompt_task = None
-        if timer and not timer.done():
-            await task.cancel_task(timer)
-
-    async def run_idle_prompt_timer() -> None:
-        nonlocal idle_prompt_count, idle_prompt_speaking, idle_prompt_task
-        try:
-            await asyncio.sleep(config.turn.idle_prompt_timeout_sec)
-
-            text = config.turn.idle_prompt_text.strip()
-            if not text or config.turn.idle_prompt_max_count <= 0:
-                return
-            if idle_prompt_count >= config.turn.idle_prompt_max_count:
-                return
-
-            idle_prompt_count += 1
-            logger.info(
-                "User idle prompt triggered "
-                f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
-            )
-            idle_prompt_speaking = True
-            await task.queue_frames([TTSSpeakFrame(text)])
-        finally:
-            if idle_prompt_task is asyncio.current_task():
-                idle_prompt_task = None
-
-    async def arm_idle_prompt_timer() -> None:
-        nonlocal idle_prompt_task
-        await cancel_idle_prompt_timer()
-
-        text = config.turn.idle_prompt_text.strip()
-        if (
-            config.turn.idle_prompt_timeout_sec <= 0
-            or config.turn.idle_prompt_max_count <= 0
-            or not text
-            or idle_prompt_count >= config.turn.idle_prompt_max_count
-        ):
-            return
-
-        logger.debug(
-            "Arming user idle prompt timer "
-            f"timeout={config.turn.idle_prompt_timeout_sec}s "
-            f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
-        )
-        idle_prompt_task = task.create_task(run_idle_prompt_timer())

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
@@ -264,26 +209,17 @@ async def run_pipeline_with_serializer(
    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
        logger.info(f"{client_label} websocket client disconnected")
-        await cancel_idle_prompt_timer()
        await task.cancel()

    @transport.event_handler("on_session_timeout")
    async def on_session_timeout(_transport, _client):
        logger.info(f"{client_label} websocket session timed out")
-        await cancel_idle_prompt_timer()
        await task.cancel()

-    @task.event_handler("on_frame_reached_upstream")
-    async def on_frame_reached_upstream(_task, frame: Frame):
-        nonlocal idle_prompt_count, idle_prompt_speaking
-        if isinstance(frame, BotStartedSpeakingFrame):
-            await cancel_idle_prompt_timer()
-        elif isinstance(frame, BotStoppedSpeakingFrame):
-            if idle_prompt_speaking:
-                idle_prompt_speaking = False
-            else:
-                idle_prompt_count = 0
-            await arm_idle_prompt_timer()
+    @user_aggregator.event_handler("on_user_turn_started")
+    async def on_user_turn_started(_aggregator, _strategy):
+        nonlocal idle_prompt_count
+        idle_prompt_count = 0

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
@@ -302,6 +238,22 @@ async def run_pipeline_with_serializer(
            )
        )

+    @user_aggregator.event_handler("on_user_turn_idle")
+    async def on_user_turn_idle(aggregator):
+        nonlocal idle_prompt_count
+        text = config.turn.idle_prompt_text.strip()
+        if not text or config.turn.idle_prompt_max_count <= 0:
+            return
+        if idle_prompt_count >= config.turn.idle_prompt_max_count:
+            return
+
+        idle_prompt_count += 1
+        logger.info(
+            "User idle prompt triggered "
+            f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
+        )
+        await aggregator.push_frame(TTSSpeakFrame(text))
+
    # NOTE: assistant turn started/final events are emitted by
    # ProductTextStreamProcessor, upstream of TTS, so text streams to the
    # client ahead of audio. This logger is kept for server-side visibility.
--- a/engine/services.py
+++ b/engine/services.py
@@ -107,6 +107,7 @@ def create_tts_service(config: TTSConfig, audio: AudioConfig):
            volume=config.volume,
            pitch=config.pitch,
            timeout=config.timeout_sec,
+            push_stop_frames=True,
        )

    if config.provider in ("xfyun_super", "xfyun_super_tts"):