diff --git a/config/voice-fastgpt-state-xfyunSuperTTS.json b/config/voice-fastgpt-state-xfyunSuperTTS.json index 98878a3..6426876 100644 --- a/config/voice-fastgpt-state-xfyunSuperTTS.json +++ b/config/voice-fastgpt-state-xfyunSuperTTS.json @@ -42,12 +42,15 @@ "你好", "在吗" ], - "user_speech_timeout_sec": 0.2 + "user_speech_timeout_sec": 0.2, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" }, "agent": { "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.", "greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。", - "greeting_mode": "fixed", + "greeting_mode": "fastgpt_opener", "response_state": { "enabled": true, "tag": "state", diff --git a/config/voice-fastgpt-state-xfyunTTS.json b/config/voice-fastgpt-state-xfyunTTS.json new file mode 100644 index 0000000..fa12cd7 --- /dev/null +++ b/config/voice-fastgpt-state-xfyunTTS.json @@ -0,0 +1,99 @@ +{ + "server": { + "host": "0.0.0.0", + "port": 8000, + "cors_origins": ["*"] + }, + "audio": { + "sample_rate_hz": 16000, + "channels": 1, + "frame_ms": 20 + }, + "session": { + "inactivity_timeout_sec": 60 + }, + "turn": { + "vad": { + "confidence": 0.8, + "start_secs": 0.4, + "stop_secs": 0.2, + "min_volume": 0.8 + }, + "interruption_min_chars": 3, + "interruption_use_interim": true, + "interruption_short_replies": [ + "是", + "是的", + "对", + "对的", + "嗯", + "好", + "好的", + "行", + "可以", + "没问题", + "不是", + "不", + "不行", + "不用", + "不要", + "没有", + "否", + "你好", + "在吗" + ], + "user_speech_timeout_sec": 0.2, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" + }, + "agent": { + "greeting_mode": "fastgpt_opener", + "response_state": { + "enabled": true, + "tag": "state", + "event_type": "response.state", + "max_prefix_chars": 256 + } + }, + "services": { + "stt": { + "provider": "xfyun", + "app_id": "416ce125", + "api_key": "c65342fe603126c3610031d8429bb36d", + "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4", + "base_url": "wss://iat-api.xfyun.cn/v2/iat", + "language": "zh_cn", + "domain": "iat", + "accent": "mandarin", + "encoding": "raw", + "frame_size": 1280, + "timeout_sec": 10.0 + }, + "llm": { + "provider": "fastgpt", + "api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp", + "base_url": "http://localhost:3030", + "model": "my-voice-app", + "app_id": "691eddaa53e3f8d9f25f1370", + "chat_id": null, + "variables": {}, + "detail": false, + "timeout_sec": 60.0 + }, + "tts": { + "provider": "xfyun", + "app_id": "416ce125", + "api_key": "c65342fe603126c3610031d8429bb36d", + "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4", + "base_url": "wss://tts-api.xfyun.cn/v2/tts", + "voice": "x4_xiaoyan", + "aue": "raw", + "tte": "UTF8", + "speed": 50, + "volume": 50, + "pitch": 50, + "source_sample_rate_hz": 16000 + } + } +} diff --git a/config/voice-fastgpt-xfyunSuperTTS.json b/config/voice-fastgpt-xfyunSuperTTS.json index cdca5f4..f52fa58 100644 --- a/config/voice-fastgpt-xfyunSuperTTS.json +++ b/config/voice-fastgpt-xfyunSuperTTS.json @@ -42,12 +42,15 @@ "你好", "在吗" ], - "user_speech_timeout_sec": 0.2 + "user_speech_timeout_sec": 0.2, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" }, "agent": { "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.", "greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。", - "greeting_mode": "fixed", + "greeting_mode": "fastgpt_opener", "response_state": { "enabled": true, "tag": "state", diff --git a/config/voice-fastgpt-xfyunTTS.json b/config/voice-fastgpt-xfyunTTS.json index 4fcf843..04bbda4 100644 --- a/config/voice-fastgpt-xfyunTTS.json +++ b/config/voice-fastgpt-xfyunTTS.json @@ -42,12 +42,15 @@ "你好", "在吗" ], - "user_speech_timeout_sec": 0.2 + "user_speech_timeout_sec": 0.2, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" }, "agent": { "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.", "greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。", - "greeting_mode": "fixed", + "greeting_mode": "fastgpt_opener", "response_state": { "enabled": true, "tag": "state", @@ -82,20 +85,18 @@ "send_system_prompt": false }, "tts": { - "provider": "xfyun_super", + "provider": "xfyun", "app_id": "416ce125", "api_key": "c65342fe603126c3610031d8429bb36d", "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4", - "base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6", - "voice": "x5_lingxiaoxuan_flow", + "base_url": "wss://tts-api.xfyun.cn/v2/tts", + "voice": "x4_xiaoyan", "aue": "raw", + "tte": "UTF8", "speed": 50, "volume": 50, "pitch": 50, - "oral_level": "mid", - "source_sample_rate_hz": 24000, - "text_aggregation_mode": "token", - "timeout_sec": 30.0 + "source_sample_rate_hz": 16000 } } } diff --git a/config/voice-xfyun.json b/config/voice-xfyun.json index 6143b60..026af25 100644 --- a/config/voice-xfyun.json +++ b/config/voice-xfyun.json @@ -40,7 +40,10 @@ "没有", "否" ], - "user_speech_timeout_sec": 0.2 + "user_speech_timeout_sec": 0.2, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" }, "agent": { "system_prompt": "# 角色 你是一个高度集成、安全第一的交警AI接警员。正在收集事故人员伤亡情况,时间,地点,事故原因,事故车辆数量,收集完成之后和用户说再见", diff --git a/config/voice.json b/config/voice.json index 10891ad..4ce7ba5 100644 --- a/config/voice.json +++ b/config/voice.json @@ -42,7 +42,10 @@ "没有", "否" ], - "user_speech_timeout_sec": 0.8 + "user_speech_timeout_sec": 0.8, + "idle_prompt_timeout_sec": 3.0, + "idle_prompt_max_count": 3, + "idle_prompt_text": "你好,请问还在吗?" }, "agent": { "system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.", diff --git a/src/voice/config.py b/src/voice/config.py index 3cf3c3e..8807e6e 100644 --- a/src/voice/config.py +++ b/src/voice/config.py @@ -67,6 +67,12 @@ class VADConfig: class TurnConfig: vad: VADConfig = field(default_factory=VADConfig) user_speech_timeout_sec: float = 1.0 + idle_prompt_timeout_sec: float = 0.0 + idle_prompt_max_count: int = 1 + idle_prompt_text: str = ( + "我先停在这里。你可以继续说你的想法," + "或者让我根据刚才的内容帮你整理下一步。" + ) interruption_min_chars: int = 3 interruption_use_interim: bool = True interruption_short_replies: list[str] = field( @@ -209,8 +215,10 @@ def config_from_dict(data: dict) -> EngineConfig: agent = _dict(data.get("agent")) if agent.get("greeting") == "": agent["greeting"] = None - if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"): - raise ValueError("agent.greeting_mode must be one of: generated, fixed, off") + if agent.get("greeting_mode") not in (None, "generated", "fixed", "off", "fastgpt_opener"): + raise ValueError( + "agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener" + ) response_state = ResponseStateConfig(**_dict(agent.pop("response_state"))) if response_state.max_prefix_chars < 1: raise ValueError("agent.response_state.max_prefix_chars must be greater than 0") @@ -231,6 +239,10 @@ def config_from_dict(data: dict) -> EngineConfig: llm["app_id"] = None if not isinstance(llm.get("variables"), dict): llm["variables"] = {} + if agent.get("greeting_mode") == "fastgpt_opener" and llm["provider"] != "fastgpt": + raise ValueError( + "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'" + ) turn = _dict(data.get("turn")) vad = _dict(turn.get("vad")) @@ -244,6 +256,15 @@ def config_from_dict(data: dict) -> EngineConfig: user_speech_timeout_sec=float( turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec) ), + idle_prompt_timeout_sec=float( + turn.get("idle_prompt_timeout_sec", TurnConfig().idle_prompt_timeout_sec) + ), + idle_prompt_max_count=int( + turn.get("idle_prompt_max_count", TurnConfig().idle_prompt_max_count) + ), + idle_prompt_text=str( + turn.get("idle_prompt_text", TurnConfig().idle_prompt_text) + ), interruption_min_chars=int( turn.get("interruption_min_chars", TurnConfig().interruption_min_chars) ), diff --git a/src/voice/pipeline.py b/src/voice/pipeline.py index a49e8ec..15e229b 100644 --- a/src/voice/pipeline.py +++ b/src/voice/pipeline.py @@ -126,6 +126,7 @@ async def run_pipeline_with_serializer( user_params=LLMUserAggregatorParams( vad_analyzer=SileroVADAnalyzer(params=vad_params), user_turn_strategies=user_turn_strategies, + user_idle_timeout=config.turn.idle_prompt_timeout_sec, ), ) @@ -167,21 +168,26 @@ async def run_pipeline_with_serializer( ), idle_timeout_secs=config.session.inactivity_timeout_sec, ) + idle_prompt_count = 0 @transport.event_handler("on_client_connected") async def on_client_connected(_transport, _client): logger.info(f"{client_label} websocket client connected") if config.agent.greeting_mode == "fixed" and config.agent.greeting: await task.queue_frames([TTSSpeakFrame(config.agent.greeting)]) - elif config.agent.greeting_mode == "generated": + elif config.agent.greeting_mode == "fastgpt_opener": if isinstance(llm, FastGPTLLMService): welcome = await llm.fetch_welcome_text() if welcome: await task.queue_frames([TTSSpeakFrame(welcome)]) else: - await task.queue_frames([LLMRunFrame()]) + logger.warning("FastGPT opener requested but no opener text was returned") else: - await task.queue_frames([LLMRunFrame()]) + raise RuntimeError( + "agent.greeting_mode='fastgpt_opener' requires FastGPT LLM service" + ) + elif config.agent.greeting_mode == "generated": + await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") async def on_client_disconnected(_transport, _client): @@ -193,6 +199,27 @@ async def run_pipeline_with_serializer( logger.info(f"{client_label} websocket session timed out") await task.cancel() + @user_aggregator.event_handler("on_user_turn_started") + async def on_user_turn_started(_aggregator, _strategy): + nonlocal idle_prompt_count + idle_prompt_count = 0 + + @user_aggregator.event_handler("on_user_turn_idle") + async def on_user_turn_idle(aggregator): + nonlocal idle_prompt_count + text = config.turn.idle_prompt_text.strip() + if not text or config.turn.idle_prompt_max_count <= 0: + return + if idle_prompt_count >= config.turn.idle_prompt_max_count: + return + + idle_prompt_count += 1 + logger.info( + "User idle prompt triggered " + f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}" + ) + await aggregator.push_frame(TTSSpeakFrame(text)) + @user_aggregator.event_handler("on_user_turn_stopped") async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage): logger.info(f"User: {message.content}") diff --git a/src/voice/text_input.py b/src/voice/text_input.py index a864ceb..f9b5c4e 100644 --- a/src/voice/text_input.py +++ b/src/voice/text_input.py @@ -2,7 +2,13 @@ from __future__ import annotations from loguru import logger -from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame +from pipecat.frames.frames import ( + Frame, + InputTransportMessageFrame, + LLMMessagesAppendFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, +) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor @@ -25,6 +31,8 @@ class ProductTextInputProcessor(FrameProcessor): if not text: return + await self.broadcast_frame(UserStartedSpeakingFrame) + if message.get("interrupt", True): logger.info("Text input interrupting current response") await self.broadcast_interruption() @@ -36,3 +44,4 @@ class ProductTextInputProcessor(FrameProcessor): ), FrameDirection.DOWNSTREAM, ) + await self.broadcast_frame(UserStoppedSpeakingFrame)