Enhance voice configuration with idle prompt features and new TTS settings

- Added idle prompt timeout, maximum count, and text to multiple voice configuration files to improve user interaction during idle periods.
- Updated greeting mode to 'fastgpt_opener' in relevant configurations for a more dynamic greeting experience.
- Introduced a new voice configuration file for xfyun TTS, including detailed service settings and parameters.
- Refactored the pipeline to handle idle prompts and user turn events, ensuring smoother interaction flow.
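- Extended the turn configuration (TurnConfig and its parsing in config_from_dict) with the new idle prompt fields: idle_prompt_timeout_sec, idle_prompt_max_count, and idle_prompt_text.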
This commit is contained in:
Xin Wang
2026-05-29 16:27:05 +08:00
parent f49212afc9
commit 13f5f44f61
9 changed files with 190 additions and 21 deletions

View File

@@ -42,12 +42,15 @@
"你好",
"在吗"
],
"user_speech_timeout_sec": 0.2
"user_speech_timeout_sec": 0.2,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
"greeting_mode": "fixed",
"greeting_mode": "fastgpt_opener",
"response_state": {
"enabled": true,
"tag": "state",

View File

@@ -0,0 +1,99 @@
{
"server": {
"host": "0.0.0.0",
"port": 8000,
"cors_origins": ["*"]
},
"audio": {
"sample_rate_hz": 16000,
"channels": 1,
"frame_ms": 20
},
"session": {
"inactivity_timeout_sec": 60
},
"turn": {
"vad": {
"confidence": 0.8,
"start_secs": 0.4,
"stop_secs": 0.2,
"min_volume": 0.8
},
"interruption_min_chars": 3,
"interruption_use_interim": true,
"interruption_short_replies": [
"是",
"是的",
"对",
"对的",
"嗯",
"好",
"好的",
"行",
"可以",
"没问题",
"不是",
"不",
"不行",
"不用",
"不要",
"没有",
"否",
"你好",
"在吗"
],
"user_speech_timeout_sec": 0.2,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"greeting_mode": "fastgpt_opener",
"response_state": {
"enabled": true,
"tag": "state",
"event_type": "response.state",
"max_prefix_chars": 256
}
},
"services": {
"stt": {
"provider": "xfyun",
"app_id": "416ce125",
"api_key": "c65342fe603126c3610031d8429bb36d",
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
"language": "zh_cn",
"domain": "iat",
"accent": "mandarin",
"encoding": "raw",
"frame_size": 1280,
"timeout_sec": 10.0
},
"llm": {
"provider": "fastgpt",
"api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
"base_url": "http://localhost:3030",
"model": "my-voice-app",
"app_id": "691eddaa53e3f8d9f25f1370",
"chat_id": null,
"variables": {},
"detail": false,
"timeout_sec": 60.0
},
"tts": {
"provider": "xfyun",
"app_id": "416ce125",
"api_key": "c65342fe603126c3610031d8429bb36d",
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
"base_url": "wss://tts-api.xfyun.cn/v2/tts",
"voice": "x4_xiaoyan",
"aue": "raw",
"tte": "UTF8",
"speed": 50,
"volume": 50,
"pitch": 50,
"source_sample_rate_hz": 16000
}
}
}

View File

@@ -42,12 +42,15 @@
"你好",
"在吗"
],
"user_speech_timeout_sec": 0.2
"user_speech_timeout_sec": 0.2,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
"greeting_mode": "fixed",
"greeting_mode": "fastgpt_opener",
"response_state": {
"enabled": true,
"tag": "state",

View File

@@ -42,12 +42,15 @@
"你好",
"在吗"
],
"user_speech_timeout_sec": 0.2
"user_speech_timeout_sec": 0.2,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
"greeting_mode": "fixed",
"greeting_mode": "fastgpt_opener",
"response_state": {
"enabled": true,
"tag": "state",
@@ -82,20 +85,18 @@
"send_system_prompt": false
},
"tts": {
"provider": "xfyun_super",
"provider": "xfyun",
"app_id": "416ce125",
"api_key": "c65342fe603126c3610031d8429bb36d",
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
"base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
"voice": "x5_lingxiaoxuan_flow",
"base_url": "wss://tts-api.xfyun.cn/v2/tts",
"voice": "x4_xiaoyan",
"aue": "raw",
"tte": "UTF8",
"speed": 50,
"volume": 50,
"pitch": 50,
"oral_level": "mid",
"source_sample_rate_hz": 24000,
"text_aggregation_mode": "token",
"timeout_sec": 30.0
"source_sample_rate_hz": 16000
}
}
}

View File

@@ -40,7 +40,10 @@
"没有",
"否"
],
"user_speech_timeout_sec": 0.2
"user_speech_timeout_sec": 0.2,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"system_prompt": "# 角色 你是一个高度集成、安全第一的交警AI接警员。正在收集事故人员伤亡情况时间地点事故原因事故车辆数量收集完成之后和用户说再见",

View File

@@ -42,7 +42,10 @@
"没有",
"否"
],
"user_speech_timeout_sec": 0.8
"user_speech_timeout_sec": 0.8,
"idle_prompt_timeout_sec": 3.0,
"idle_prompt_max_count": 3,
"idle_prompt_text": "你好,请问还在吗?"
},
"agent": {
"system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",

View File

@@ -67,6 +67,12 @@ class VADConfig:
class TurnConfig:
vad: VADConfig = field(default_factory=VADConfig)
user_speech_timeout_sec: float = 1.0
idle_prompt_timeout_sec: float = 0.0
idle_prompt_max_count: int = 1
idle_prompt_text: str = (
"我先停在这里。你可以继续说你的想法,"
"或者让我根据刚才的内容帮你整理下一步。"
)
interruption_min_chars: int = 3
interruption_use_interim: bool = True
interruption_short_replies: list[str] = field(
@@ -209,8 +215,10 @@ def config_from_dict(data: dict) -> EngineConfig:
agent = _dict(data.get("agent"))
if agent.get("greeting") == "":
agent["greeting"] = None
if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")
if agent.get("greeting_mode") not in (None, "generated", "fixed", "off", "fastgpt_opener"):
raise ValueError(
"agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener"
)
response_state = ResponseStateConfig(**_dict(agent.pop("response_state")))
if response_state.max_prefix_chars < 1:
raise ValueError("agent.response_state.max_prefix_chars must be greater than 0")
@@ -231,6 +239,10 @@ def config_from_dict(data: dict) -> EngineConfig:
llm["app_id"] = None
if not isinstance(llm.get("variables"), dict):
llm["variables"] = {}
if agent.get("greeting_mode") == "fastgpt_opener" and llm["provider"] != "fastgpt":
raise ValueError(
"agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'"
)
turn = _dict(data.get("turn"))
vad = _dict(turn.get("vad"))
@@ -244,6 +256,15 @@ def config_from_dict(data: dict) -> EngineConfig:
user_speech_timeout_sec=float(
turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec)
),
idle_prompt_timeout_sec=float(
turn.get("idle_prompt_timeout_sec", TurnConfig().idle_prompt_timeout_sec)
),
idle_prompt_max_count=int(
turn.get("idle_prompt_max_count", TurnConfig().idle_prompt_max_count)
),
idle_prompt_text=str(
turn.get("idle_prompt_text", TurnConfig().idle_prompt_text)
),
interruption_min_chars=int(
turn.get("interruption_min_chars", TurnConfig().interruption_min_chars)
),

View File

@@ -126,6 +126,7 @@ async def run_pipeline_with_serializer(
user_params=LLMUserAggregatorParams(
vad_analyzer=SileroVADAnalyzer(params=vad_params),
user_turn_strategies=user_turn_strategies,
user_idle_timeout=config.turn.idle_prompt_timeout_sec,
),
)
@@ -167,21 +168,26 @@ async def run_pipeline_with_serializer(
),
idle_timeout_secs=config.session.inactivity_timeout_sec,
)
idle_prompt_count = 0
@transport.event_handler("on_client_connected")
async def on_client_connected(_transport, _client):
logger.info(f"{client_label} websocket client connected")
if config.agent.greeting_mode == "fixed" and config.agent.greeting:
await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
elif config.agent.greeting_mode == "generated":
elif config.agent.greeting_mode == "fastgpt_opener":
if isinstance(llm, FastGPTLLMService):
welcome = await llm.fetch_welcome_text()
if welcome:
await task.queue_frames([TTSSpeakFrame(welcome)])
else:
await task.queue_frames([LLMRunFrame()])
logger.warning("FastGPT opener requested but no opener text was returned")
else:
await task.queue_frames([LLMRunFrame()])
raise RuntimeError(
"agent.greeting_mode='fastgpt_opener' requires FastGPT LLM service"
)
elif config.agent.greeting_mode == "generated":
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(_transport, _client):
@@ -193,6 +199,27 @@ async def run_pipeline_with_serializer(
logger.info(f"{client_label} websocket session timed out")
await task.cancel()
@user_aggregator.event_handler("on_user_turn_started")
async def on_user_turn_started(_aggregator, _strategy):
nonlocal idle_prompt_count
idle_prompt_count = 0
@user_aggregator.event_handler("on_user_turn_idle")
async def on_user_turn_idle(aggregator):
nonlocal idle_prompt_count
text = config.turn.idle_prompt_text.strip()
if not text or config.turn.idle_prompt_max_count <= 0:
return
if idle_prompt_count >= config.turn.idle_prompt_max_count:
return
idle_prompt_count += 1
logger.info(
"User idle prompt triggered "
f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
)
await aggregator.push_frame(TTSSpeakFrame(text))
@user_aggregator.event_handler("on_user_turn_stopped")
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
logger.info(f"User: {message.content}")

View File

@@ -2,7 +2,13 @@ from __future__ import annotations
from loguru import logger
from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame
from pipecat.frames.frames import (
Frame,
InputTransportMessageFrame,
LLMMessagesAppendFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -25,6 +31,8 @@ class ProductTextInputProcessor(FrameProcessor):
if not text:
return
await self.broadcast_frame(UserStartedSpeakingFrame)
if message.get("interrupt", True):
logger.info("Text input interrupting current response")
await self.broadcast_interruption()
@@ -36,3 +44,4 @@ class ProductTextInputProcessor(FrameProcessor):
),
FrameDirection.DOWNSTREAM,
)
await self.broadcast_frame(UserStoppedSpeakingFrame)