Enhance voice configuration with idle prompt features and new TTS settings

- Added idle prompt timeout, maximum count, and text (`idle_prompt_timeout_sec`, `idle_prompt_max_count`, `idle_prompt_text`) to multiple voice configuration files to improve user interaction during idle periods.
- Updated greeting mode to 'fastgpt_opener' in relevant configurations for a more dynamic greeting experience.
- Introduced a new voice configuration file for xfyun TTS (`config/voice-fastgpt-state-xfyunTTS.json`), including detailed service settings and parameters.
- Refactored the pipeline to handle idle prompts and user turn events, ensuring smoother interaction flow.
- Adjusted the VAD and turn configurations to accommodate new idle prompt features.
2026-05-29 16:27:05 +08:00
parent f49212afc9
commit 13f5f44f61
9 changed files with 190 additions and 21 deletions
--- a/config/voice-fastgpt-state-xfyunSuperTTS.json
+++ b/config/voice-fastgpt-state-xfyunSuperTTS.json
@@ -42,12 +42,15 @@
      "你好",
      "在吗"
    ],
-    "user_speech_timeout_sec": 0.2
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
  },
  "agent": {
    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
-    "greeting_mode": "fixed",
+    "greeting_mode": "fastgpt_opener",
    "response_state": {
      "enabled": true,
      "tag": "state",
--- a/config/voice-fastgpt-state-xfyunTTS.json
+++ b/config/voice-fastgpt-state-xfyunTTS.json
@@ -0,0 +1,99 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.8,
+      "start_secs": 0.4,
+      "stop_secs": 0.2,
+      "min_volume": 0.8
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否",
+      "你好",
+      "在吗"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "greeting_mode": "fastgpt_opener",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "fastgpt",
+      "api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
+      "base_url": "http://localhost:3030",
+      "model": "my-voice-app",
+      "app_id": "691eddaa53e3f8d9f25f1370",
+      "chat_id": null,
+      "variables": {},
+      "detail": false,
+      "timeout_sec": 60.0
+    },
+    "tts": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://tts-api.xfyun.cn/v2/tts",
+      "voice": "x4_xiaoyan",
+      "aue": "raw",
+      "tte": "UTF8",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "source_sample_rate_hz": 16000
+    }
+  }
+}
--- a/config/voice-fastgpt-xfyunSuperTTS.json
+++ b/config/voice-fastgpt-xfyunSuperTTS.json
@@ -42,12 +42,15 @@
      "你好",
      "在吗"
    ],
-    "user_speech_timeout_sec": 0.2
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
  },
  "agent": {
    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
-    "greeting_mode": "fixed",
+    "greeting_mode": "fastgpt_opener",
    "response_state": {
      "enabled": true,
      "tag": "state",
--- a/config/voice-fastgpt-xfyunTTS.json
+++ b/config/voice-fastgpt-xfyunTTS.json
@@ -42,12 +42,15 @@
      "你好",
      "在吗"
    ],
-    "user_speech_timeout_sec": 0.2
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
  },
  "agent": {
    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
-    "greeting_mode": "fixed",
+    "greeting_mode": "fastgpt_opener",
    "response_state": {
      "enabled": true,
      "tag": "state",
@@ -82,20 +85,18 @@
      "send_system_prompt": false
    },
    "tts": {
-      "provider": "xfyun_super",
+      "provider": "xfyun",
      "app_id": "416ce125",
      "api_key": "c65342fe603126c3610031d8429bb36d",
      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
-      "base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
-      "voice": "x5_lingxiaoxuan_flow",
+      "base_url": "wss://tts-api.xfyun.cn/v2/tts",
+      "voice": "x4_xiaoyan",
      "aue": "raw",
+      "tte": "UTF8",
      "speed": 50,
      "volume": 50,
      "pitch": 50,
-      "oral_level": "mid",
-      "source_sample_rate_hz": 24000,
-      "text_aggregation_mode": "token",
-      "timeout_sec": 30.0
+      "source_sample_rate_hz": 16000
    }
  }
 }
--- a/config/voice-xfyun.json
+++ b/config/voice-xfyun.json
@@ -40,7 +40,10 @@
      "没有",
      "否"
    ],
-    "user_speech_timeout_sec": 0.2
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
  },
  "agent": {
    "system_prompt": "# 角色 你是一个高度集成、安全第一的交警AI接警员。正在收集事故人员伤亡情况，时间，地点，事故原因，事故车辆数量，收集完成之后和用户说再见",
--- a/config/voice.json
+++ b/config/voice.json
@@ -42,7 +42,10 @@
      "没有",
      "否"
    ],
-    "user_speech_timeout_sec": 0.8
+    "user_speech_timeout_sec": 0.8,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
  },
  "agent": {
    "system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
--- a/src/voice/config.py
+++ b/src/voice/config.py
@@ -67,6 +67,12 @@ class VADConfig:
 class TurnConfig:
    vad: VADConfig = field(default_factory=VADConfig)
    user_speech_timeout_sec: float = 1.0
+    idle_prompt_timeout_sec: float = 0.0
+    idle_prompt_max_count: int = 1
+    idle_prompt_text: str = (
+        "我先停在这里。你可以继续说你的想法，"
+        "或者让我根据刚才的内容帮你整理下一步。"
+    )
    interruption_min_chars: int = 3
    interruption_use_interim: bool = True
    interruption_short_replies: list[str] = field(
@@ -209,8 +215,10 @@ def config_from_dict(data: dict) -> EngineConfig:
    agent = _dict(data.get("agent"))
    if agent.get("greeting") == "":
        agent["greeting"] = None
-    if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
-        raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")
+    if agent.get("greeting_mode") not in (None, "generated", "fixed", "off", "fastgpt_opener"):
+        raise ValueError(
+            "agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener"
+        )
    response_state = ResponseStateConfig(**_dict(agent.pop("response_state")))
    if response_state.max_prefix_chars < 1:
        raise ValueError("agent.response_state.max_prefix_chars must be greater than 0")
@@ -231,6 +239,10 @@ def config_from_dict(data: dict) -> EngineConfig:
        llm["app_id"] = None
    if not isinstance(llm.get("variables"), dict):
        llm["variables"] = {}
+    if agent.get("greeting_mode") == "fastgpt_opener" and llm["provider"] != "fastgpt":
+        raise ValueError(
+            "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'"
+        )

    turn = _dict(data.get("turn"))
    vad = _dict(turn.get("vad"))
@@ -244,6 +256,15 @@ def config_from_dict(data: dict) -> EngineConfig:
            user_speech_timeout_sec=float(
                turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec)
            ),
+            idle_prompt_timeout_sec=float(
+                turn.get("idle_prompt_timeout_sec", TurnConfig().idle_prompt_timeout_sec)
+            ),
+            idle_prompt_max_count=int(
+                turn.get("idle_prompt_max_count", TurnConfig().idle_prompt_max_count)
+            ),
+            idle_prompt_text=str(
+                turn.get("idle_prompt_text", TurnConfig().idle_prompt_text)
+            ),
            interruption_min_chars=int(
                turn.get("interruption_min_chars", TurnConfig().interruption_min_chars)
            ),
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -126,6 +126,7 @@ async def run_pipeline_with_serializer(
        user_params=LLMUserAggregatorParams(
            vad_analyzer=SileroVADAnalyzer(params=vad_params),
            user_turn_strategies=user_turn_strategies,
+            user_idle_timeout=config.turn.idle_prompt_timeout_sec,
        ),
    )

@@ -167,21 +168,26 @@ async def run_pipeline_with_serializer(
        ),
        idle_timeout_secs=config.session.inactivity_timeout_sec,
    )
+    idle_prompt_count = 0

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        logger.info(f"{client_label} websocket client connected")
        if config.agent.greeting_mode == "fixed" and config.agent.greeting:
            await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
-        elif config.agent.greeting_mode == "generated":
+        elif config.agent.greeting_mode == "fastgpt_opener":
            if isinstance(llm, FastGPTLLMService):
                welcome = await llm.fetch_welcome_text()
                if welcome:
                    await task.queue_frames([TTSSpeakFrame(welcome)])
                else:
-                    await task.queue_frames([LLMRunFrame()])
+                    logger.warning("FastGPT opener requested but no opener text was returned")
            else:
-                await task.queue_frames([LLMRunFrame()])
+                raise RuntimeError(
+                    "agent.greeting_mode='fastgpt_opener' requires FastGPT LLM service"
+                )
+        elif config.agent.greeting_mode == "generated":
+            await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
@@ -193,6 +199,27 @@ async def run_pipeline_with_serializer(
        logger.info(f"{client_label} websocket session timed out")
        await task.cancel()

+    @user_aggregator.event_handler("on_user_turn_started")
+    async def on_user_turn_started(_aggregator, _strategy):
+        nonlocal idle_prompt_count
+        idle_prompt_count = 0
+
+    @user_aggregator.event_handler("on_user_turn_idle")
+    async def on_user_turn_idle(aggregator):
+        nonlocal idle_prompt_count
+        text = config.turn.idle_prompt_text.strip()
+        if not text or config.turn.idle_prompt_max_count <= 0:
+            return
+        if idle_prompt_count >= config.turn.idle_prompt_max_count:
+            return
+
+        idle_prompt_count += 1
+        logger.info(
+            "User idle prompt triggered "
+            f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
+        )
+        await aggregator.push_frame(TTSSpeakFrame(text))
+
    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
        logger.info(f"User: {message.content}")
--- a/src/voice/text_input.py
+++ b/src/voice/text_input.py
@@ -2,7 +2,13 @@ from __future__ import annotations

 from loguru import logger

-from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame
+from pipecat.frames.frames import (
+    Frame,
+    InputTransportMessageFrame,
+    LLMMessagesAppendFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


@@ -25,6 +31,8 @@ class ProductTextInputProcessor(FrameProcessor):
        if not text:
            return

+        await self.broadcast_frame(UserStartedSpeakingFrame)
+
        if message.get("interrupt", True):
            logger.info("Text input interrupting current response")
            await self.broadcast_interruption()
@@ -36,3 +44,4 @@ class ProductTextInputProcessor(FrameProcessor):
            ),
            FrameDirection.DOWNSTREAM,
        )
+        await self.broadcast_frame(UserStoppedSpeakingFrame)