Merge branch 'main' of https://gitea.xiaowang.eu.org/wx44wx/ZNJJ-api-server

Refactor form update handling in API endpoints and models
- Introduced a new function to parse JSON values in endpoints.py for improved data handling.
- Updated extract_form_update_from_flow_nodes to return structured data instead of strings.
- Changed formUpdate field in ProcessResponse_chat model to use Any type with a default empty dictionary for better flexibility in handling updates.
2026-06-17 13:33:23 +08:00 · 2026-06-17 13:29:50 +08:00 · 2026-06-17 12:36:46 +08:00 · 2026-06-17 11:36:42 +08:00 · 2026-06-03 12:52:38 +08:00 · 2026-06-03 12:36:18 +08:00
50 changed files with 12560 additions and 105 deletions
--- a/.env
+++ b/.env
@@ -2,9 +2,8 @@ DATABASE_URL=sqlite:///./test.db
 SECRET_KEY=your_secret_key
 DEBUG=True

-ANALYSIS_SERVICE_URL=http://101.89.151.141:3000/api/v1/chat/completions
-ANALYSIS_AUTH_TOKEN=fastgpt-hSPnXMoBNGVAEpTLkQT3YfAnN26gQSyvLd4ABL1MRDoh68nL4RDlopFHXqmH8
-APP_ID=683ea1bc86197e19f71fc1ae
-DELETE_SESSION_URL=http://101.89.151.141:3000/api/core/chat/delHistory?chatId={chatId}&appId={appId}
-DELETE_CHAT_URL=http://101.89.151.141:3000/api/core/chat/item/delete?contentId={contentId}&chatId={chatId}&appId={appId}
-GET_CHAT_RECORDS_URL=http://101.89.151.141:3000/api/core/chat/getPaginationRecords
+ANALYSIS_SERVICE_URL=http://127.0.0.1:3030
+ANALYSIS_AUTH_TOKEN=fastgpt-r13smJwPgXfGj1HDfc4SWAvIoNrL5Wc6o0BYnezqBs7hgzPdQ7Q34hVl2FJc0R
+APP_ID=6a310def7132e9f7d592dabb
+
+VOICE_CONFIG=config/voice-fastgpt-state-xfyunSuperTTS.json
--- a/config/voice-fastgpt-state-xfyunSuperTTS.json
+++ b/config/voice-fastgpt-state-xfyunSuperTTS.json
@@ -0,0 +1,104 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.8,
+      "start_secs": 0.4,
+      "stop_secs": 0.2,
+      "min_volume": 0.8
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否",
+      "你好",
+      "在吗"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
+    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
+    "greeting_mode": "fastgpt_opener",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "fastgpt",
+      "api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
+      "base_url": "http://localhost:3030",
+      "model": "my-voice-app",
+      "app_id": "691eddaa53e3f8d9f25f1370",
+      "chat_id": null,
+      "variables": {},
+      "detail": false,
+      "timeout_sec": 60.0,
+      "send_system_prompt": false
+    },
+    "tts": {
+      "provider": "xfyun_super",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
+      "voice": "x5_lingxiaoxuan_flow",
+      "aue": "raw",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "oral_level": "mid",
+      "source_sample_rate_hz": 24000,
+      "text_aggregation_mode": "token",
+      "timeout_sec": 30.0
+    }
+  }
+}
--- a/config/voice-fastgpt-state-xfyunTTS.json
+++ b/config/voice-fastgpt-state-xfyunTTS.json
@@ -0,0 +1,99 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.8,
+      "start_secs": 0.4,
+      "stop_secs": 0.2,
+      "min_volume": 0.8
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否",
+      "你好",
+      "在吗"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "greeting_mode": "fastgpt_opener",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "fastgpt",
+      "api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
+      "base_url": "http://localhost:3030",
+      "model": "my-voice-app",
+      "app_id": "691eddaa53e3f8d9f25f1370",
+      "chat_id": null,
+      "variables": {},
+      "detail": false,
+      "timeout_sec": 60.0
+    },
+    "tts": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://tts-api.xfyun.cn/v2/tts",
+      "voice": "x4_xiaoyan",
+      "aue": "raw",
+      "tte": "UTF8",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "source_sample_rate_hz": 16000
+    }
+  }
+}
--- a/config/voice-fastgpt-xfyunSuperTTS.json
+++ b/config/voice-fastgpt-xfyunSuperTTS.json
@@ -0,0 +1,104 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.8,
+      "start_secs": 0.4,
+      "stop_secs": 0.2,
+      "min_volume": 0.8
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否",
+      "你好",
+      "在吗"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
+    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
+    "greeting_mode": "fastgpt_opener",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "fastgpt",
+      "api_key": "fastgpt-v1FljAxBz3tJeS0bH7HZU4yVGclsTcfiy9yK7V9Zr9126maDHQ97Xlo8n",
+      "base_url": "http://localhost:3030",
+      "model": "my-voice-app",
+      "app_id": "6a153aed53e3f8d9f2744905",
+      "chat_id": null,
+      "variables": {},
+      "detail": false,
+      "timeout_sec": 60.0,
+      "send_system_prompt": false
+    },
+    "tts": {
+      "provider": "xfyun_super",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
+      "voice": "x5_lingxiaoxuan_flow",
+      "aue": "raw",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "oral_level": "mid",
+      "source_sample_rate_hz": 24000,
+      "text_aggregation_mode": "token",
+      "timeout_sec": 30.0
+    }
+  }
+}
--- a/config/voice-fastgpt-xfyunTTS.json
+++ b/config/voice-fastgpt-xfyunTTS.json
@@ -0,0 +1,102 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.7,
+      "start_secs": 0.35,
+      "stop_secs": 0.2,
+      "min_volume": 0.65
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否",
+      "你好",
+      "在吗"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
+    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
+    "greeting_mode": "fastgpt_opener",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "fastgpt",
+      "api_key": "fastgpt-v1FljAxBz3tJeS0bH7HZU4yVGclsTcfiy9yK7V9Zr9126maDHQ97Xlo8n",
+      "base_url": "http://localhost:3030",
+      "model": "my-voice-app",
+      "app_id": "6a153aed53e3f8d9f2744905",
+      "chat_id": null,
+      "variables": {},
+      "detail": false,
+      "timeout_sec": 60.0,
+      "send_system_prompt": false
+    },
+    "tts": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://tts-api.xfyun.cn/v2/tts",
+      "voice": "x4_xiaoyan",
+      "aue": "raw",
+      "tte": "UTF8",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "source_sample_rate_hz": 16000
+    }
+  }
+}
--- a/config/voice-xfyun.json
+++ b/config/voice-xfyun.json
@@ -0,0 +1,95 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["*"]
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.7,
+      "start_secs": 0.35,
+      "stop_secs": 0.2,
+      "min_volume": 0.65
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否"
+    ],
+    "user_speech_timeout_sec": 0.2,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "system_prompt": "# 角色 你是一个高度集成、安全第一的交警AI接警员。正在收集事故人员伤亡情况，时间，地点，事故原因，事故车辆数量，收集完成之后和用户说再见",
+    "greeting": "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。",
+    "greeting_mode": "fixed",
+    "response_state": {
+      "enabled": true,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://iat-api.xfyun.cn/v2/iat",
+      "language": "zh_cn",
+      "domain": "iat",
+      "accent": "mandarin",
+      "encoding": "raw",
+      "frame_size": 1280,
+      "timeout_sec": 10.0
+    },
+    "llm": {
+      "provider": "openai",
+      "api_key": "sk-230701ff1b6143ecbf322b3170606016",
+      "base_url": "https://api.deepseek.com/v1",
+      "model": "deepseek-chat",
+      "temperature": 0.7
+    },
+    "tts": {
+      "provider": "xfyun",
+      "app_id": "416ce125",
+      "api_key": "c65342fe603126c3610031d8429bb36d",
+      "api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
+      "base_url": "wss://tts-api.xfyun.cn/v2/tts",
+      "voice": "x4_xiaoyan",
+      "aue": "raw",
+      "tte": "UTF8",
+      "speed": 50,
+      "volume": 50,
+      "pitch": 50,
+      "source_sample_rate_hz": 16000
+    }
+  }
+}
--- a/config/voice.json
+++ b/config/voice.json
@@ -0,0 +1,84 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["http://localhost:3000", "http://localhost:8080"],
+    "serve_webpage": true,
+    "webpage_mount": "/voice-demo"
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.7,
+      "start_secs": 0.2,
+      "stop_secs": 0.4,
+      "min_volume": 0.6
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否"
+    ],
+    "user_speech_timeout_sec": 0.8,
+    "idle_prompt_timeout_sec": 3.0,
+    "idle_prompt_max_count": 3,
+    "idle_prompt_text": "你好，请问还在吗？"
+  },
+  "agent": {
+    "system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
+    "greeting": "Please introduce yourself briefly.",
+    "greeting_mode": "generated",
+    "response_state": {
+      "enabled": false,
+      "tag": "state",
+      "event_type": "response.state",
+      "max_prefix_chars": 256
+    }
+  },
+  "services": {
+    "stt": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini-transcribe",
+      "language": "en"
+    },
+    "llm": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini",
+      "temperature": 0.7
+    },
+    "tts": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini-tts",
+      "voice": "alloy"
+    }
+  }
+}
--- a/docs/chat-stream-mode.md
+++ b/docs/chat-stream-mode.md
@@ -0,0 +1,56 @@
+# /chat 流式响应模式说明
+
+## 接口地址
+
+```
+POST http://localhost:8000/chat?stream=true
+```
+
+## 请求参数
+
+| 参数 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| sessionId | string | 是 | 会话 ID |
+| timeStamp | string | 是 | 时间戳 |
+| text | string | 是 | 用户输入文本 |
+| stream | bool | 否 | 设为 true 启用流式响应（如上方接口地址所示，通过 URL 查询参数 `?stream=true` 传递） |
+
+## SSE 事件类型
+
+| 事件类型 | 说明 | 数据格式 |
+|----------|------|----------|
+| `stage_code` | 阶段状态码 | `{"nextStageCode": "0000", "nextStage": "结束通话"}` |
+| `text_delta` | 流式文本片段 | `{"text": "您好..."}` |
+| `done` | 流式结束 | `{"status": "completed"}` |
+| `error` | 错误信息 | `{"msg": "错误描述", "code": "500"}` |
+
+## 状态码映射
+
+| 状态码 | 含义 |
+|--------|------|
+| 0000 | 结束通话 |
+| 0001 | 转接人工 |
+| 0002 | 语义无法识别转接人工 |
+| 0003 | 有人伤转接人工 |
+| 1001 | 未准备好通话 |
+| 1002 | 通话中 |
+| 2000 | 进入单车拍照 |
+| ... | ... |
+
+## 示例
+
+### 请求
+
+```bash
+python examples/stream_chat.py session-001 "发生了交通事故"
+```
+
+### 响应
+
+```
+Status: 200
+--------------------------------------------------
+[stage_code] {"nextStageCode": "1002", "nextStage": "通话中"}
+[text_delta] {"text": "您好，请问发生了什么情况？"}
+[done] {"status": "completed"}
+```
--- a/docs/voice-websocket.md
+++ b/docs/voice-websocket.md
@@ -0,0 +1,376 @@
+# Voice WebSocket 使用说明
+
+基于 `src/voice` 产品语音管线与 `static/voice-demo` 浏览器示例整理。
+
+## 概览
+
+| 项目 | 说明 |
+|------|------|
+| WebSocket 路径 | `/ws-product` |
+| 协议标识 | `va.ws.v1`（JSON + base64；音频上行也支持二进制 PCM） |
+| 默认音频 | PCM16 小端（`pcm_s16le`）、16 kHz、单声道 |
+| 会话 ID | 连接 URL 查询参数 `chatId` 或 `chat_id`；未传时服务端自动生成 |
+| 健康检查 | `GET /voice/health` |
+| 浏览器 Demo | 默认挂载于 `/voice-demo`（由 voice 配置 `server.serve_webpage` 控制） |
+
+完整 URL 示例：
+
+```
+ws://127.0.0.1:8000/ws-product?chatId=voice_abc123
+wss://your-host/ws-product?chatId=voice_abc123
+```
+
+## 连接流程
+
+```mermaid
+sequenceDiagram
+    participant Client
+    participant Server
+
+    Client->>Server: WebSocket connect (?chatId=...)
+    Server-->>Client: 101 Switching Protocols
+    Client->>Server: session.start (JSON)
+    Note over Client,Server: 可选：固定开场白 / FastGPT opener / LLM 生成问候
+    loop 会话中
+        Client->>Server: input.audio (binary 或 JSON)
+        Client->>Server: input.text
+        Server-->>Client: input.transcript.* / response.text.* / response.audio.*
+        Server-->>Client: response.state（若启用状态标签）
+    end
+    Client->>Server: session.stop
+    Server-->>Client: WebSocket close
+```
+
+推荐顺序（与 `voice-demo/app.js` 一致）：
+
+1. 建立 WebSocket 连接（建议 `binaryType = "arraybuffer"`）。
+2. 连接成功后立即发送 `session.start`。
+3. 开始推送麦克风音频（二进制帧或 `input.audio` JSON）。
+4. 处理服务端 JSON 事件（文本、转写、TTS 音频等）。
+5. 断开前发送 `session.stop`，再关闭连接。
+
+## 消息信封
+
+除二进制音频外，所有消息均为 UTF-8 JSON 对象。服务端下发事件统一包含：
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `type` | string | 事件类型 |
+| `protocol` | string | 固定为 `va.ws.v1` |
+| `seq` | number | 单调递增序号（仅服务端事件） |
+
+## 客户端 → 服务端
+
+### `session.start`
+
+开始会话，必须在发送音频或文本输入之前调用。
+
+```json
+{
+  "type": "session.start",
+  "protocol": "va.ws.v1",
+  "chatId": "voice_abc123",
+  "audio": {
+    "encoding": "pcm_s16le",
+    "sample_rate": 16000,
+    "channels": 1
+  }
+}
+```
+
+`chatId` 也可写作 `chat_id`。若省略，服务端使用 URL 查询参数或自动生成 ID。
+
+### `session.stop`
+
+正常结束会话。
+
+```json
+{
+  "type": "session.stop",
+  "reason": "client_disconnect"
+}
+```
+
+### `input.audio`（JSON 形式）
+
+```json
+{
+  "type": "input.audio",
+  "audio": "<base64 PCM16>",
+  "sample_rate": 16000,
+  "channels": 1
+}
+```
+
+`audio` 字段也可命名为 `data`。`sample_rate` / `channels` 可省略，默认与服务端配置一致。
+
+### 二进制音频（推荐）
+
+直接发送 **原始 PCM16 小端** 字节流，无需 JSON 包装。`voice-demo` 通过 AudioWorklet 每 20 ms 发送一帧（16 kHz 单声道下约 640 字节/帧）。
+
+服务端同时接受 JSON 与二进制两种上行格式。
+
+### `input.text`
+
+发送文本回合；默认会打断当前 bot 回复（`interrupt: true`）。
+
+```json
+{
+  "type": "input.text",
+  "text": "你好，我想报案",
+  "interrupt": true
+}
+```
+
+注意：文本输入**不会**以 `input.transcript.final` 回显，客户端需自行在 UI 中展示用户消息（Demo 即如此处理）。Demo 的相机步骤通过发送 `input.text`（如 `【拍摄完成】`）完成，不上传图片帧。
+
+## 服务端 → 客户端
+
+### 用户语音转写
+
+| 事件 | 说明 |
+|------|------|
+| `input.transcript.interim` | ASR 中间结果（流式识别过程中） |
+| `input.transcript.final` | 用户一句话结束后的最终转写 |
+
+```json
+{
+  "type": "input.transcript.final",
+  "protocol": "va.ws.v1",
+  "seq": 12,
+  "text": "发生了交通事故",
+  "user_id": "product-user",
+  "timestamp": "2026-06-01T10:00:00.000Z"
+}
+```
+
+### 助手文本流
+
+文本通常**早于**对应 TTS 音频到达，便于客户端先渲染字幕。
+
+| 事件 | 说明 |
+|------|------|
+| `response.text.started` | 新一轮助手回复开始 |
+| `response.text.delta` | 流式文本片段 |
+| `response.text.final` | 本轮文本结束；`interrupted: true` 表示被打断 |
+
+```json
+{
+  "type": "response.text.delta",
+  "protocol": "va.ws.v1",
+  "seq": 20,
+  "text": "您好，"
+}
+```
+
+```json
+{
+  "type": "response.text.final",
+  "protocol": "va.ws.v1",
+  "seq": 45,
+  "text": "您好，请问发生了什么情况？",
+  "interrupted": false
+}
+```
+
+### 助手语音（TTS）
+
+| 事件 | 说明 |
+|------|------|
+| `response.audio.started` | Bot 开始说话 |
+| `response.audio.delta` | PCM16 音频块（base64） |
+| `response.audio.stopped` | Bot 说完 |
+
+```json
+{
+  "type": "response.audio.delta",
+  "protocol": "va.ws.v1",
+  "seq": 30,
+  "audio": "<base64 PCM16>",
+  "bytes": 640,
+  "sample_rate": 16000,
+  "channels": 1
+}
+```
+
+客户端应将各 `delta` 块按序解码并无缝拼接播放（Demo 使用 Web Audio `AudioContext` 调度）。
+
+### 助手状态（可选）
+
+当 voice 配置启用 `agent.response_state` 时，LLM 输出开头的 `<state>...</state>` 标签会被剥离，并单独下发：
+
+```json
+{
+  "type": "response.state",
+  "protocol": "va.ws.v1",
+  "seq": 18,
+  "state": "2000"
+}
+```
+
+Demo 根据状态码展示拍照引导（如 `2000`–`2015` 等车险场景状态）。
+
+## 音频参数
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| 采样率 | 16000 Hz | 配置项 `audio.sample_rate_hz` |
+| 声道 | 1（mono） | 配置项 `audio.channels` |
+| 帧长 | 20 ms | 配置项 `audio.frame_ms`；默认 16 kHz 单声道 PCM16 下每帧 640 字节 |
+| 编码 | PCM signed 16-bit LE | 小端有符号 16 位整数 |
+
+## 会话与打断行为
+
+- **chatId**：同一 ID 用于 LLM（如 FastGPT）多轮上下文；连接时可写在 URL 或 `session.start` 中。
+- **语音回合**：VAD + 静音超时判定用户说完；说完后触发 STT 最终转写与 LLM。
+- **打断**：用户说话或 `input.text`（`interrupt: true`）可打断 bot；被打断的助手文本在 `response.text.final` 中带 `interrupted: true`。
+- **空闲超时**：长时间无活动会断开（`session.inactivity_timeout_sec`，默认 60 秒）；可配置空闲提示语。
+- **开场白**：由 `agent.greeting_mode` 控制（`fixed` / `fastgpt_opener` / `generated` 等）。
+
+## 浏览器 Demo 参考实现
+
+Demo 位于 `static/voice-demo/`，无构建步骤，核心文件：
+
+| 文件 | 职责 |
+|------|------|
+| `app.js` | WebSocket 连接、事件处理、聊天 UI、TTS 播放 |
+| `pcm-recorder.worklet.js` | 麦克风采集、重采样至 16 kHz、20 ms 二进制帧 |
+| `index.html` / `styles.css` | 页面与样式 |
+
+### 启动 Demo
+
+1. 启动 API 服务并加载 voice 配置（环境变量 `VOICE_CONFIG` 指向 JSON，默认 `config/voice.json`）。
+2. 浏览器打开 `http://127.0.0.1:8000/voice-demo/`（挂载路径见配置 `server.webpage_mount`）。
+3. 点击 **Connect** → **Enable mic** 开始对话。
+
+### Demo 关键实现要点
+
+**连接与握手**
+
+```javascript
+const ws = new WebSocket("ws://127.0.0.1:8000/ws-product?chatId=voice_xxx");
+ws.binaryType = "arraybuffer";
+
+ws.onopen = () => {
+  ws.send(JSON.stringify({
+    type: "session.start",
+    protocol: "va.ws.v1",
+    chatId: "voice_xxx",
+    audio: { encoding: "pcm_s16le", sample_rate: 16000, channels: 1 },
+  }));
+};
+```
+
+**发送麦克风（二进制，与 Demo 一致）**
+
+```javascript
+// AudioWorklet 每 20ms postMessage { type: "frame", buffer: ArrayBuffer }
+recorderNode.port.onmessage = (event) => {
+  if (event.data?.type === "frame") {
+    ws.send(event.data.buffer);
+  }
+};
+```
+
+**播放 TTS**
+
+```javascript
+function decodeBase64ToInt16(b64) {
+  const binary = atob(b64);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+  return new Int16Array(bytes.buffer);
+}
+
+// 收到 response.audio.delta 后，将 Int16 转为 Float32 并调度到 AudioContext
+```
+
+**发送文本并打断**
+
+```javascript
+ws.send(JSON.stringify({
+  type: "input.text",
+  text: "【拍摄完成】",
+  interrupt: true,
+}));
+// 客户端应停止本地 TTS 播放队列；服务端会发 response.text.final(interrupted=true)
+```
+
+### 跨域静态页
+
+若 Demo 托管在其他端口，需在 voice 配置中设置 `server.cors_origins`，并将 WebSocket URL 指向 API 主机。
+
+> 浏览器 `getUserMedia` 需要安全上下文：`https://` 或 `http://localhost` 可用；其他 HTTP 源需改用 HTTPS + `wss://`。
+
+## 最小客户端示例（伪代码）
+
+```javascript
+const ws = new WebSocket(`${location.protocol === "https:" ? "wss" : "ws"}://${location.host}/ws-product?chatId=voice_demo_1`);
+ws.binaryType = "arraybuffer";
+
+ws.onopen = () => {
+  ws.send(JSON.stringify({
+    type: "session.start",
+    protocol: "va.ws.v1",
+    audio: { encoding: "pcm_s16le", sample_rate: 16000, channels: 1 },
+  }));
+};
+
+ws.onmessage = (event) => {
+  if (typeof event.data !== "string") return;
+  const msg = JSON.parse(event.data);
+  switch (msg.type) {
+    case "input.transcript.final":
+      console.log("User:", msg.text);
+      break;
+    case "response.text.delta":
+      console.log(msg.text); // 打印助手文本片段（浏览器环境没有 process.stdout）
+      break;
+    case "response.audio.delta":
+      playPcm16(decodeBase64(msg.audio));
+      break;
+    case "response.state":
+      console.log("State:", msg.state);
+      break;
+  }
+};
+
+function disconnect() {
+  ws.send(JSON.stringify({ type: "session.stop", reason: "done" }));
+  ws.close(1000, "done");
+}
+```
+
+## 健康检查响应示例
+
+```bash
+curl http://127.0.0.1:8000/voice/health
+```
+
+```json
+{
+  "status": "healthy",
+  "config": "/path/to/config/voice.json",
+  "protocols": {
+    "/ws-product": "va.ws.v1.json_base64"
+  },
+  "features": {
+    "product_text_input": true,
+    "product_text_interrupt": true
+  },
+  "demo": "/voice-demo",
+  "llm_provider": "fastgpt",
+  "stt_provider": "xfyun",
+  "tts_provider": "xfyun"
+}
+```
+
+## 常见问题
+
+| 现象 | 可能原因 |
+|------|----------|
+| 连接后立即断开 | 未发送 `session.start`；或超过 inactivity 超时 |
+| 无 bot 语音 | 未处理 `response.audio.delta`；AudioContext 未在用户手势后 resume |
+| 回声/啸叫 | 建议使用耳机；Demo 已开启浏览器 AEC，但扬声器外放仍可能串音 |
+| 文本发送无用户气泡 | 设计如此，需客户端本地展示 `input.text` 内容 |
+| 跨域 WebSocket 失败 | 检查 `cors_origins` 与 `wss` 证书 |
--- a/examples/nostream_chat.py
+++ b/examples/nostream_chat.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+Simple CLI script to interact with /chat endpoint in non-stream mode.
+"""
+
+import asyncio
+import aiohttp
+import json
+import sys
+from datetime import datetime
+
+
+API_BASE_URL = "http://localhost:8000"
+
+
+async def chat(session_id: str, text: str):
+    """Send a non-streaming chat request."""
+    timestamp = datetime.now().isoformat()
+
+    payload = {
+        "sessionId": session_id,
+        "timeStamp": timestamp,
+        "text": text
+    }
+
+    async with aiohttp.ClientSession() as http_session:
+        async with http_session.post(
+            f"{API_BASE_URL}/chat",
+            json=payload,
+        ) as response:
+            data = await response.json()
+
+            print(f"Status: {response.status}")
+            print("-" * 50)
+            print(json.dumps(data, indent=2, ensure_ascii=False))
+
+
+async def main():
+    if len(sys.argv) < 3:
+        print("Usage: python nostream_chat.py <session_id> <message>")
+        print("Example: python nostream_chat.py test-session-123 '发生了交通事故'")
+        sys.exit(1)
+
+    session_id = sys.argv[1]
+    text = " ".join(sys.argv[2:])
+
+    print(f"Session ID: {session_id}")
+    print(f"Message: {text}")
+    print("-" * 50)
+
+    await chat(session_id, text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/stream_chat.py
+++ b/examples/stream_chat.py
@@ -1,16 +1,30 @@
 #!/usr/bin/env python3
 """
-Simple CLI script to interact with /chat endpoint in stream mode.
+Stream Chat CLI - 与 /chat 端点进行流式交互的脚本。
+
+用法:
+    python stream_chat.py <session_id> <消息>
+
+示例:
+    python stream_chat.py test-001 "发生了交通事故"
+
+输出说明:
+    - [stage_code]: 阶段状态码，如 {"nextStageCode": "0000", "nextStage": "结束通话"}
+    - [text_delta]: 流式文本片段
+    - [done]: 流式结束
+    - [error]: 错误信息
 """

 import asyncio
+import ssl
 import aiohttp
 import json
 import sys
 from datetime import datetime


-API_BASE_URL = "http://localhost:8000"
+#API_BASE_URL = "http://localhost:8000"
+API_BASE_URL = "https://101.89.108.122:8000"


 async def stream_chat(session_id: str, text: str):
@@ -23,7 +37,11 @@ async def stream_chat(session_id: str, text: str):
        "text": text
    }

-    async with aiohttp.ClientSession() as http_session:
+    ssl_ctx = ssl.create_default_context()
+    ssl_ctx.check_hostname = False
+    ssl_ctx.verify_mode = ssl.CERT_NONE
+
+    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_ctx)) as http_session:
        async with http_session.post(
            f"{API_BASE_URL}/chat",
            json=payload,
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 fastapi>=0.104.0
-uvicorn>=0.24.0
+uvicorn[standard]>=0.24.0
+pipecat-ai[websocket,openai,silero]
+websockets>=13.1,<16.0
 pydantic>=2.4.2
 python-dotenv>=1.0.0
 httpx>=0.25.0
@@ -12,7 +14,7 @@ pydantic-settings==2.1.0
 python-multipart==0.0.6
 python-jose[cryptography]==3.3.0
 passlib[bcrypt]==1.7.4
-openai==1.55.3
+openai>=1.74.0,<3
 loguru>=0.7.0
 pandas
 requests
--- a/src/.env.example
+++ b/src/.env.example
@@ -8,3 +8,6 @@ APP_ID=683ea1bc86197e19f71fc1ae
 DELETE_SESSION_URL=http://127.0.0.1:3030/api/core/chat/delHistory?chatId={chatId}&appId={appId}
 DELETE_CHAT_URL=http://127.0.0.1:3030/api/core/chat/item/delete?contentId={contentId}&chatId={chatId}&appId={appId}
 GET_CHAT_RECORDS_URL=http://127.0.0.1:3030/api/core/chat/getPaginationRecords
+
+# Voice pipeline config (JSON) for the Pipecat /ws-product endpoint. The path is relative to the project root, or absolute.
+VOICE_CONFIG=config/voice.json
--- a/src/api/endpoints.py
+++ b/src/api/endpoints.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, HTTPException, Depends
 from fastapi.responses import StreamingResponse
 from ..schemas.models import ProcessRequest_chat, ProcessResponse_chat, ProcessRequest_get, ProcessResponse_get, ProcessRequest_set, ProcessResponse_set, ProcessResponse_delete_session, ProcessRequest_delete_session
-from fastgpt_client import AsyncChatClient
+from fastgpt_client import AsyncChatClient, aiter_stream_events
 from fastgpt_client.exceptions import (
    APIError, AuthenticationError, RateLimitError, ValidationError
 )
@@ -12,6 +12,7 @@ import json
 import re

 router = APIRouter()
+FORM_EXTRACT_MODULE_NAME = "文本内容提取事故信息"
 STATUS_CODE_MAP = {
    '0000': '结束通话',
    '0001': '转接人工',
@@ -34,6 +35,19 @@ STATUS_CODE_MAP = {
    '2016': '确认双车中的车牌'
 }

+def normalize_stage_code(stage_code: str) -> str:
+    """Normalize FastGPT stage codes to external API stage codes."""
+    if stage_code in ['3001', '3002', '1002']:
+        return '1002'
+    if stage_code == '2006':
+        return '2004'
+    if stage_code == '2017':
+        return '2016'
+    if stage_code == '2020':
+        return '0002'
+    return stage_code
+
+
 def extract_state_and_content(data1: str) -> dict | None:
    """
    Extracts the state and content from a string in the format <state>STATE</state>content.
@@ -47,7 +61,7 @@ def extract_state_and_content(data1: str) -> dict | None:
    """
    data1 = data1.strip()
    regex = r"<state>(.*?)</state>(.*)"
-    match = re.search(regex, data1)
+    match = re.search(regex, data1, flags=re.DOTALL)
    
    if match:
        return {
@@ -56,6 +70,52 @@ def extract_state_and_content(data1: str) -> dict | None:
        }
    return None

+
+def parse_json_value(value):
+    """Decode a (possibly multiply-encoded) JSON string.
+
+    Up to three rounds of ``json.loads`` are applied while the value is still
+    a string. Non-string input is returned untouched, a blank string yields
+    ``{}``, and text that is not valid JSON is returned stripped.
+    """
+    current = value
+    remaining_rounds = 3
+    while remaining_rounds > 0 and isinstance(current, str):
+        remaining_rounds -= 1
+        current = current.strip()
+        if current == "":
+            return {}
+        try:
+            current = json.loads(current)
+        except json.JSONDecodeError:
+            # Not JSON (any more): hand back the stripped text as-is.
+            return current
+    return current
+
+
+def extract_form_update_from_flow_nodes(nodes):
+    """Return the form update produced by the FastGPT content-extract node.
+
+    Only the first node whose ``moduleName`` equals ``FORM_EXTRACT_MODULE_NAME``
+    is inspected. Its ``extractResult`` is read for ``formUpdate`` (falling
+    back to ``form``) and the value is decoded with ``parse_json_value``.
+    ``{}`` is returned whenever no usable value is found.
+    """
+    if not isinstance(nodes, list):
+        return {}
+
+    matching_nodes = (
+        candidate
+        for candidate in nodes
+        if isinstance(candidate, dict)
+        and candidate.get("moduleName") == FORM_EXTRACT_MODULE_NAME
+    )
+    target = next(matching_nodes, None)
+    if target is None:
+        return {}
+
+    result = target.get("extractResult", {})
+    if not isinstance(result, dict):
+        return {}
+
+    raw_update = result.get("formUpdate") or result.get("form") or ""
+    return parse_json_value(raw_update) if raw_update else {}
+
+
+def format_set_info_input(payload: dict, include_input_info: bool) -> str:
+    """Wrap ``payload`` as ``<setInfo>`` JSON text for FastGPT helper calls.
+
+    Returns an empty string when ``include_input_info`` is falsy. Non-ASCII
+    characters are kept as-is in the JSON text (``ensure_ascii=False``).
+    """
+    if include_input_info:
+        encoded = json.dumps(payload, ensure_ascii=False)
+        return f"<setInfo>{encoded}</setInfo>"
+    return ""
+
 async def delete_last_two_chat_records(
    client: AsyncChatClient,
    session_id: str
@@ -112,6 +172,8 @@ async def chat(
    """Handle chat completion request."""
    json_data = request.model_dump()
    logger.info(f"用户请求信息ProcessRequest_chat: {json_data}, stream={stream}")
+    need_form_update = json_data.get('needFormUpdate', False)
+    chat_variables = {'needFormUpdate': need_form_update}
    
    if stream:
        async def event_generator():
@@ -121,73 +183,80 @@ async def chat(
                    messages=[{"role": "user", "content": json_data['text']}],
                    chatId=json_data['sessionId'],
                    stream=True,
-                    detail=True
+                    detail=True,
+                    variables=chat_variables
                )
                
                buffer = ""
                state_code_found = False
+                module_form_sent = False
+
+                def flush_text_delta(text: str):
+                    return create_sse_event("text_delta", {"text": text})
+
+                def flush_form_update(form_update):
+                    return create_sse_event("formUpdate", form_update)
                
-                async for chunk in response.aiter_lines():
-                    if chunk.startswith('data: '):
-                        data_str = chunk[6:].strip()
-                        if data_str == '[DONE]':
-                            break
+                async for event in aiter_stream_events(response):
+                    try:
+                        if event.kind == "flowResponses" and not module_form_sent:
+                            form_update = extract_form_update_from_flow_nodes(event.data)
+                            if form_update:
+                                yield flush_form_update(form_update)
+                                module_form_sent = True
+                            continue
+
+                        if event.kind not in {"answer", "fastAnswer", "data"}:
+                            continue
+
+                        data = event.data
+                        if not isinstance(data, dict):
+                            continue
+
                        try:
-                            data = json.loads(data_str)
-                            try:
-                                delta_content = data['choices'][0]['delta'].get('content', '')
-                            except (KeyError, IndexError):
-                                delta_content = ''
-                            if delta_content:
-                                buffer += delta_content
-                                
-                                if not state_code_found:
-                                    # Check for <state>XXXX</state> pattern
-                                    match = re.search(r"<state>(.*?)</state>", buffer)
-                                    if match:
-                                        state_code = match.group(1)
-                                        
-                                        # Apply logic to map/adjust state code
-                                        nextStageCode = state_code
-                                        if nextStageCode in ['3001', '3002', '1002']:
-                                            nextStageCode = '1002'
-                                        elif nextStageCode == '2006':
-                                            nextStageCode = '2004'
-                                        elif nextStageCode == '2017':
-                                            nextStageCode = '2016'
-                                        elif nextStageCode == '2020':
-                                            nextStageCode = '0002'
-                                        nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
-                                        
-                                        # Send stage code event
-                                        yield create_sse_event("stage_code", {
-                                            "nextStageCode": nextStageCode,
-                                            "nextStage": nextStage
-                                        })
-                                        
-                                        state_code_found = True
-                                        
-                                        # Send remaining content as text_delta
-                                        remaining_content = buffer[match.end():]
-                                        if remaining_content:
-                                            yield create_sse_event("text_delta", {"text": remaining_content})
-                                        buffer = "" # Clear buffer after extracting state
-                                else:
-                                    # State code already found, just stream text
-                                    yield create_sse_event("text_delta", {"text": delta_content})
-                                    buffer = "" # Do not buffer text after state found
-                                    
-                        except json.JSONDecodeError:
-                            continue
-                        except Exception as e:
-                            print(data)
-                            logger.error(f"Error processing chunk: {e}")
+                            delta_content = data['choices'][0]['delta'].get('content', '')
+                        except (KeyError, IndexError):
+                            delta_content = ''
+                        if not delta_content:
                            continue

+                        buffer += delta_content
+
+                        if not state_code_found:
+                            # Check for <state>XXXX</state> pattern
+                            match = re.search(r"<state>(.*?)</state>", buffer, flags=re.DOTALL)
+                            if match:
+                                state_code = match.group(1)
+
+                                # Apply logic to map/adjust state code
+                                nextStageCode = normalize_stage_code(state_code)
+                                nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
+
+                                # Send stage code event
+                                yield create_sse_event("stage_code", {
+                                    "nextStageCode": nextStageCode,
+                                    "nextStage": nextStage
+                                })
+
+                                state_code_found = True
+
+                                # Send remaining content as text_delta
+                                remaining_content = buffer[match.end():]
+                                if remaining_content:
+                                    yield flush_text_delta(remaining_content)
+                                buffer = "" # Clear buffer after extracting state
+                        else:
+                            yield flush_text_delta(delta_content)
+                            buffer = ""
+
+                    except Exception as e:
+                        logger.error(f"Error processing stream event: {e}")
+                        continue
+
                # If stream ends and no state code found (unlikely if format is strict), 
                # we might want to send what we have
                if not state_code_found and buffer:
-                     yield create_sse_event("text_delta", {"text": buffer})
+                    yield create_sse_event("text_delta", {"text": buffer})

                yield create_sse_event("done", {"status": "completed"})
                
@@ -203,7 +272,8 @@ async def chat(
            messages=[{"role": "user", "content": json_data['text']}],
            chatId=json_data['sessionId'],
            stream=False,
-            detail=True
+            detail=True,
+            variables=chat_variables
        )
        response.raise_for_status()
        data = response.json()
@@ -281,28 +351,18 @@ async def chat(

        logger.debug(f"State variables: {data.get('newVariables', {})}")

-        nextStageCode = data['newVariables']['status_code']
-        
-        # 有一些情况需要调整nextStageCode
-        if nextStageCode in ['3001', '3002', '1002']:
-            nextStageCode = '1002'
-        elif nextStageCode == '2006':
-            nextStageCode = '2004'
-        elif nextStageCode == '2017':
-            nextStageCode = '2016'
-        elif nextStageCode == '2020':
-            nextStageCode = '0002'
-        nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
-
        # Parse content - sometimes content is a string, sometimes it is a list
+        content_stage_code = None
        if isinstance(content, list):
            logger.debug("content是一个list")
            content = content[0]['text']['content']
-        elif isinstance(content, str):
+        
+        if isinstance(content, str):
            logger.debug("content是一个str")
            state_and_content = extract_state_and_content(content)
            if state_and_content:
                logger.debug(f"解析后的state和content为: {state_and_content}")
+                content_stage_code = state_and_content['state']
                content = state_and_content['content']
            else:
                raise ValueError("大模型回复中的state解析失败")
@@ -310,10 +370,16 @@ async def chat(
            logger.error(f"content既不是list也不是str, type: {type(content)}")
            raise ValueError("大模型回复不是list也不是str")

+        nextStageCode = content_stage_code or data['newVariables']['status_code']
+        nextStageCode = normalize_stage_code(nextStageCode)
+        nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
+        form_update = extract_form_update_from_flow_nodes(data.get("responseData", []))
+
        return ProcessResponse_chat(
            sessionId=json_data['sessionId'],
            timeStamp=json_data['timeStamp'],
            outputText=content,
+            formUpdate=form_update,
            nextStage=nextStage,
            nextStageCode=nextStageCode,
            code="200",
@@ -340,11 +406,16 @@ async def set_info(
 ):
    """Set information in chat state."""
    json_data = request.model_dump()
+    set_info_payload = {'key': json_data['key'], 'value': json_data['value']}
+    set_info_input = format_set_info_input(
+        set_info_payload,
+        json_data.get('includeInputInfo', False)
+    )
    
    try:
        # Get current state
        response = await client.create_chat_completion(
-            messages=[{"role": "user", "content": ""}],
+            messages=[{"role": "user", "content": set_info_input}],
            chatId=json_data['sessionId'],
            stream=False,
            detail=True
@@ -382,11 +453,12 @@ async def set_info(
        key = json_data['key']
        value = json_data['value']
        current_state[key] = value
+        logger.info(f'即将设置 {key} 为 {value}')
        logger.info(f'即将上传 {current_state}')
        
        # Update state using SDK
        response = await client.create_chat_completion(
-            messages=[{"role": "user", "content": ""}],
+            messages=[{"role": "user", "content": set_info_input}],
            chatId=json_data['sessionId'],
            stream=False,
            detail=True,
@@ -420,11 +492,16 @@ async def get_info(
 ):
    """Get information from chat state."""
    json_data = request.model_dump()
+    get_info_payload = {'key': json_data['key']}
+    get_info_input = format_set_info_input(
+        get_info_payload,
+        json_data.get('includeInputInfo', False)
+    )
    
    try:
        # Get current state
        response = await client.create_chat_completion(
-            messages=[{"role": "user", "content": ""}],
+            messages=[{"role": "user", "content": get_info_input}],
            chatId=json_data['sessionId'],
            stream=False,
            detail=True
--- a/src/main.py
+++ b/src/main.py
@@ -1,8 +1,8 @@
 from fastapi import FastAPI
-import sys
 from .api.endpoints import router as api_router
 from .core.fastgpt_client import lifespan
 from .core.logging_config import setup_logging
+from .voice.routes import register_voice

 # Setup logging first
 setup_logging()
@@ -18,4 +18,5 @@ app = FastAPI(
 def read_root():
    return {"message": "Server is running."}

-app.include_router(api_router)
+app.include_router(api_router)
+register_voice(app)
--- a/src/schemas/models.py
+++ b/src/schemas/models.py
@@ -1,15 +1,17 @@
 from pydantic import BaseModel, Field
-from typing import Optional
+from typing import Any, Optional

 class ProcessRequest_chat(BaseModel):
+    # Request body consumed by the chat endpoint.
     sessionId: str = Field(..., max_length=64)
     timeStamp: str = Field(..., max_length=32)
     text: str = Field(...)
+    # Optional flag (default False); the chat handler forwards it to FastGPT
+    # as the ``needFormUpdate`` variable.
+    needFormUpdate: bool = False

 class ProcessResponse_chat(BaseModel):
    sessionId: str = Field(..., max_length=64)
    timeStamp: str = Field(..., max_length=32)
    outputText: str = Field(...)
+    formUpdate: Any = Field(default_factory=dict)
    nextStage: str = Field(..., max_length=32)
    nextStageCode: str = Field(..., max_length=4)
    code: str = Field(..., max_length=4)
@@ -19,6 +21,7 @@ class ProcessRequest_get(BaseModel):
    sessionId: str = Field(..., max_length=64)
    timeStamp: str = Field(..., max_length=32)
    key: str = Field(...)
+    includeInputInfo: bool = False

 class ProcessResponse_get(BaseModel):
    sessionId: str = Field(..., max_length=64)
@@ -32,6 +35,7 @@ class ProcessRequest_set(BaseModel):
    timeStamp: str = Field(..., max_length=32)
    key: str = Field(...)
    value: str = Field(...)
+    includeInputInfo: bool = False

 class ProcessResponse_set(BaseModel):
    sessionId: str = Field(..., max_length=64)
--- a/src/voice/__init__.py
+++ b/src/voice/__init__.py
@@ -0,0 +1 @@
+"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""
--- a/src/voice/config.py
+++ b/src/voice/config.py
@@ -0,0 +1,313 @@
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Load variables from a .env file into the environment before VOICE_CONFIG
+# is read further down in this module.
+load_dotenv()
+
+# Three ``parent`` hops up from this file; for a module located at
+# src/voice/config.py that is the repository root.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+# Fallback config location, relative to PROJECT_ROOT, used when VOICE_CONFIG
+# is unset or blank.
+DEFAULT_VOICE_CONFIG_REL = "config/voice.json"
+
+
+def resolve_voice_config_path() -> Path:
+    """Resolve the location of the voice config file.
+
+    Reads ``VOICE_CONFIG`` from the environment, falling back to
+    ``DEFAULT_VOICE_CONFIG_REL`` when it is unset or blank. A relative path
+    is anchored at ``PROJECT_ROOT``; an absolute path is returned unchanged.
+    """
+    raw = os.getenv("VOICE_CONFIG", DEFAULT_VOICE_CONFIG_REL).strip()
+    candidate = Path(raw or DEFAULT_VOICE_CONFIG_REL)
+    return candidate if candidate.is_absolute() else PROJECT_ROOT / candidate
+
+
+# Resolved once, at import time, from the environment as it is at that moment.
+DEFAULT_VOICE_CONFIG = resolve_voice_config_path()
+
+# Canonical provider names produced by alias normalization.
+SUPPORTED_LLM_PROVIDERS = frozenset({"openai", "fastgpt"})
+# Maps accepted spellings to canonical names; "llm" is an alias for "openai".
+_LLM_PROVIDER_ALIASES = {"llm": "openai", "openai": "openai", "fastgpt": "fastgpt"}
+
+
+@dataclass(frozen=True)
+class ServerConfig:
+    """Immutable settings built from the ``server`` config section."""
+    host: str = "0.0.0.0"
+    port: int = 8000
+    # Defaults to a fresh empty list per instance.
+    cors_origins: list[str] = field(default_factory=list)
+    # NOTE(review): the next two presumably control whether the demo web page
+    # is served and where it is mounted; their consumer is outside this
+    # file - confirm there.
+    serve_webpage: bool = True
+    webpage_mount: str = "/voice-demo"
+
+
+@dataclass(frozen=True)
+class AudioConfig:
+    """Immutable audio settings built from the ``audio`` config section."""
+
+    sample_rate_hz: int = 16000
+    channels: int = 1
+    frame_ms: int = 20
+
+    @property
+    def frame_bytes(self) -> int:
+        """Byte size of one frame: samples per frame x channels x 2.
+
+        The samples-per-frame count is truncated to an integer first. The
+        factor 2 presumably reflects 16-bit samples - confirm against the
+        audio pipeline.
+        """
+        samples_per_frame = int(self.sample_rate_hz * self.frame_ms / 1000)
+        interleaved_samples = samples_per_frame * self.channels
+        return interleaved_samples * 2
+
+
+@dataclass(frozen=True)
+class SessionConfig:
+    """Immutable settings built from the ``session`` config section."""
+    # Inactivity timeout; seconds, going by the field name.
+    inactivity_timeout_sec: int = 60
+
+
+@dataclass(frozen=True)
+class VADConfig:
+    """Immutable settings built from the ``turn.vad`` config section.
+
+    NOTE(review): the fields look like voice-activity-detection thresholds
+    and timings; their exact semantics are defined by the consumer, which is
+    outside this file - confirm there.
+    """
+    confidence: float = 0.7
+    start_secs: float = 0.2
+    stop_secs: float = 0.6
+    min_volume: float = 0.6
+
+
+@dataclass(frozen=True)
+class TurnConfig:
+    """Immutable turn-taking settings built from the ``turn`` config section.
+
+    ``config_from_dict`` coerces every field except ``vad`` with the matching
+    builtin (``float``, ``int``, ``str``, ``bool`` or ``list``).
+    """
+    vad: VADConfig = field(default_factory=VADConfig)
+    user_speech_timeout_sec: float = 1.0
+    idle_prompt_timeout_sec: float = 0.0
+    idle_prompt_max_count: int = 1
+    # Default prompt text, in Chinese. Roughly: "I'll pause here. You can keep
+    # sharing your thoughts, or have me organize the next step from what we
+    # just discussed."
+    idle_prompt_text: str = (
+        "我先停在这里。你可以继续说你的想法，"
+        "或者让我根据刚才的内容帮你整理下一步。"
+    )
+    interruption_min_chars: int = 3
+    interruption_use_interim: bool = True
+    # Default short affirmative/negative replies in Chinese and English.
+    # NOTE(review): how these affect interruption handling is decided by the
+    # consumer, outside this file - confirm there.
+    interruption_short_replies: list[str] = field(
+        default_factory=lambda: [
+            "是",
+            "是的",
+            "对",
+            "对的",
+            "嗯",
+            "好",
+            "好的",
+            "行",
+            "可以",
+            "没问题",
+            "不是",
+            "不",
+            "不行",
+            "不用",
+            "不要",
+            "没有",
+            "否",
+            "no",
+            "yes",
+            "ok",
+            "okay",
+        ]
+    )
+
+
+@dataclass(frozen=True)
+class ResponseStateConfig:
+    """Immutable settings built from ``agent.response_state`` in the config.
+
+    ``config_from_dict`` rejects ``max_prefix_chars`` below 1 and an empty
+    ``tag`` or ``event_type``.
+    """
+    enabled: bool = False
+    tag: str = "state"
+    event_type: str = "response.state"
+    max_prefix_chars: int = 256
+
+
+@dataclass(frozen=True)
+class AgentConfig:
+    """Immutable agent settings built from the ``agent`` config section.
+
+    ``config_from_dict`` turns an empty ``greeting`` into ``None`` and accepts
+    only ``generated``, ``fixed``, ``off`` or ``fastgpt_opener`` for
+    ``greeting_mode`` (the last one requires the ``fastgpt`` LLM provider).
+    """
+    system_prompt: str = "You are a helpful, friendly voice assistant."
+    greeting: str | None = None
+    greeting_mode: str = "generated"
+    # Default text is Chinese. Roughly: "Welcome back to the conversation;
+    # please tell me when you are ready and we will continue."
+    fastgpt_reconnect_greeting: str = "欢迎回来继续对话，请告诉我准备好了之后继续办理"
+    response_state: ResponseStateConfig = field(default_factory=ResponseStateConfig)
+
+
+@dataclass(frozen=True)
+class LLMConfig:
+    """Immutable LLM settings built from the ``services.llm`` config section.
+
+    ``config_from_dict`` normalizes ``provider`` to ``openai`` or ``fastgpt``,
+    maps empty ``chat_id``/``app_id`` to ``None``, restricts
+    ``image_input_mode`` to ``base64`` or ``upload`` and replaces a non-dict
+    ``variables`` with ``{}``.
+    """
+    provider: str = "openai"
+    api_key: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini"
+    app_id: str | None = None
+    temperature: float | None = 0.7
+    chat_id: str | None = None
+    variables: dict[str, str] = field(default_factory=dict)
+    detail: bool = False
+    timeout_sec: float = 60.0
+    image_input_mode: str = "base64"
+
+    @property
+    def is_fastgpt(self) -> bool:
+        """True when ``provider`` is ``"fastgpt"``."""
+        return self.provider == "fastgpt"
+
+    @property
+    def is_openai(self) -> bool:
+        """True when ``provider`` is ``"openai"``."""
+        return self.provider == "openai"
+
+    @property
+    def uses_local_context_history(self) -> bool:
+        """Whether the pipeline should seed and maintain local LLM context history."""
+        # True for every provider except FastGPT.
+        return not self.is_fastgpt
+
+
+@dataclass(frozen=True)
+class STTConfig:
+    """Immutable speech-to-text settings.
+
+    Built from ``services.stt``, or from ``services.asr`` when ``stt`` is
+    missing or empty. ``config_from_dict`` turns an empty ``language`` into
+    ``None``.
+
+    NOTE(review): several fields (``domain``, ``accent``, ``frame_size``,
+    ``dynamic_correction``) look provider-specific; their meaning is defined
+    by the STT client, outside this file - confirm there.
+    """
+    provider: str = "openai"
+    app_id: str = ""
+    api_key: str = ""
+    api_secret: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini-transcribe"
+    language: str | None = "en"
+    domain: str = "iat"
+    accent: str = "mandarin"
+    encoding: str = "raw"
+    frame_size: int = 1280
+    timeout_sec: float = 10.0
+    dynamic_correction: bool = False
+
+
+@dataclass(frozen=True)
+class TTSConfig:
+    """Immutable text-to-speech settings built from ``services.tts``.
+
+    NOTE(review): several fields (``aue``, ``tte``, ``oral_level``,
+    ``text_aggregation_mode``) look provider-specific; their meaning is
+    defined by the TTS client, outside this file - confirm there.
+    """
+    provider: str = "openai"
+    app_id: str = ""
+    api_key: str = ""
+    api_secret: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini-tts"
+    voice: str = "alloy"
+    aue: str = "raw"
+    tte: str = "UTF8"
+    speed: int = 50
+    volume: int = 50
+    pitch: int = 50
+    timeout_sec: float = 30.0
+    source_sample_rate_hz: int | None = None
+    oral_level: str = "mid"
+    text_aggregation_mode: str | None = None
+
+
+@dataclass(frozen=True)
+class ServicesConfig:
+    """Groups the LLM, STT and TTS settings from the ``services`` section."""
+    llm: LLMConfig = field(default_factory=LLMConfig)
+    stt: STTConfig = field(default_factory=STTConfig)
+    tts: TTSConfig = field(default_factory=TTSConfig)
+
+
+@dataclass(frozen=True)
+class EngineConfig:
+    """Top-level immutable configuration returned by ``config_from_dict``.
+
+    Every section defaults to its own dataclass defaults.
+    """
+    server: ServerConfig = field(default_factory=ServerConfig)
+    audio: AudioConfig = field(default_factory=AudioConfig)
+    session: SessionConfig = field(default_factory=SessionConfig)
+    turn: TurnConfig = field(default_factory=TurnConfig)
+    agent: AgentConfig = field(default_factory=AgentConfig)
+    services: ServicesConfig = field(default_factory=ServicesConfig)
+
+
+def load_config(path: str | Path | None = None) -> EngineConfig:
+    """Load the voice engine configuration from a JSON file.
+
+    Args:
+        path: File to read. ``None`` selects the path returned by
+            ``resolve_voice_config_path``. A relative path is anchored at
+            ``PROJECT_ROOT``.
+
+    Returns:
+        The ``EngineConfig`` built by ``config_from_dict``.
+
+    Raises:
+        ValueError: if the file's top-level JSON value is not an object.
+    """
+    if path is None:
+        config_path = resolve_voice_config_path()
+    else:
+        config_path = Path(path)
+    if not config_path.is_absolute():
+        config_path = PROJECT_ROOT / config_path
+
+    raw_text = config_path.read_text(encoding="utf-8")
+    data = json.loads(raw_text)
+    if isinstance(data, dict):
+        return config_from_dict(data)
+    raise ValueError(f"Config file must contain a JSON object: {config_path}")
+
+
+def config_from_dict(data: dict) -> EngineConfig:
+    """Build an ``EngineConfig`` from a parsed configuration mapping.
+
+    Each section (``server``, ``audio``, ``session``, ``turn``, ``agent``,
+    ``services``) is optional; a missing or non-dict section falls back to
+    the dataclass defaults. ``services.asr`` is accepted when
+    ``services.stt`` is missing or empty. Empty strings for
+    ``agent.greeting``, ``services.stt.language``, ``services.llm.chat_id``
+    and ``services.llm.app_id`` become ``None``, a non-dict
+    ``services.llm.variables`` becomes ``{}``, and the
+    ``services.llm.send_system_prompt`` key is discarded.
+
+    Raises:
+        ValueError: for an unsupported ``agent.greeting_mode``, an invalid
+            ``agent.response_state`` value, an unknown LLM provider or
+            ``image_input_mode``, or ``fastgpt_opener`` without the FastGPT
+            provider.
+    """
+    services_section = _dict(data.get("services"))
+
+    # agent: normalize the greeting and validate the greeting mode.
+    agent_section = _dict(data.get("agent"))
+    if agent_section.get("greeting") == "":
+        agent_section["greeting"] = None
+    allowed_greeting_modes = (None, "generated", "fixed", "off", "fastgpt_opener")
+    if agent_section.get("greeting_mode") not in allowed_greeting_modes:
+        raise ValueError(
+            "agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener"
+        )
+
+    # agent.response_state is split off so it can be validated on its own.
+    response_state = ResponseStateConfig(
+        **_dict(agent_section.pop("response_state", None))
+    )
+    if response_state.max_prefix_chars < 1:
+        raise ValueError("agent.response_state.max_prefix_chars must be greater than 0")
+    if not response_state.tag:
+        raise ValueError("agent.response_state.tag must not be empty")
+    if not response_state.event_type:
+        raise ValueError("agent.response_state.event_type must not be empty")
+
+    # services.stt, with services.asr accepted as an alternative key.
+    stt_section = _dict(services_section.get("stt") or services_section.get("asr"))
+    if stt_section.get("language") == "":
+        stt_section["language"] = None
+
+    # services.llm: normalize the provider and the optional string fields.
+    llm_defaults = LLMConfig()
+    llm_section = _dict(services_section.get("llm"))
+    llm_section["provider"] = _normalize_llm_provider(
+        llm_section.get("provider", llm_defaults.provider)
+    )
+    if llm_section.get("chat_id") == "":
+        llm_section["chat_id"] = None
+    llm_section.pop("send_system_prompt", None)
+    requested_mode = llm_section.get("image_input_mode", llm_defaults.image_input_mode)
+    image_input_mode = str(requested_mode).strip().lower()
+    if image_input_mode not in {"base64", "upload"}:
+        raise ValueError(
+            "services.llm.image_input_mode must be 'base64' or 'upload', "
+            f"got {llm_section.get('image_input_mode')!r}"
+        )
+    llm_section["image_input_mode"] = image_input_mode
+    if llm_section.get("app_id") == "":
+        llm_section["app_id"] = None
+    if not isinstance(llm_section.get("variables"), dict):
+        llm_section["variables"] = {}
+    wants_fastgpt_opener = agent_section.get("greeting_mode") == "fastgpt_opener"
+    if wants_fastgpt_opener and llm_section["provider"] != "fastgpt":
+        raise ValueError(
+            "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'"
+        )
+
+    # turn: every scalar falls back to the TurnConfig default and is coerced.
+    turn_section = _dict(data.get("turn"))
+    vad_section = _dict(turn_section.get("vad"))
+    turn_defaults = TurnConfig()
+
+    def turn_value(name, cast):
+        """Coerce ``turn.<name>``, falling back to the dataclass default."""
+        return cast(turn_section.get(name, getattr(turn_defaults, name)))
+
+    return EngineConfig(
+        server=ServerConfig(**_dict(data.get("server"))),
+        audio=AudioConfig(**_dict(data.get("audio"))),
+        session=SessionConfig(**_dict(data.get("session"))),
+        turn=TurnConfig(
+            vad=VADConfig(**vad_section),
+            user_speech_timeout_sec=turn_value("user_speech_timeout_sec", float),
+            idle_prompt_timeout_sec=turn_value("idle_prompt_timeout_sec", float),
+            idle_prompt_max_count=turn_value("idle_prompt_max_count", int),
+            idle_prompt_text=turn_value("idle_prompt_text", str),
+            interruption_min_chars=turn_value("interruption_min_chars", int),
+            interruption_use_interim=turn_value("interruption_use_interim", bool),
+            interruption_short_replies=turn_value("interruption_short_replies", list),
+        ),
+        agent=AgentConfig(**agent_section, response_state=response_state),
+        services=ServicesConfig(
+            llm=LLMConfig(**llm_section),
+            stt=STTConfig(**stt_section),
+            tts=TTSConfig(**_dict(services_section.get("tts"))),
+        ),
+    )
+
+
+def _dict(value: object) -> dict:
+    """Return a shallow copy of ``value`` if it is a dict, else a new ``{}``."""
+    if isinstance(value, dict):
+        return dict(value)
+    return {}
+
+
+def _normalize_llm_provider(value: object) -> str:
+    """Map a configured provider name to its canonical form.
+
+    A falsy ``value`` falls back to the ``LLMConfig`` default provider. The
+    name is lower-cased and stripped of surrounding whitespace before it is
+    looked up in ``_LLM_PROVIDER_ALIASES``.
+
+    Raises:
+        ValueError: if the name is not a known provider or alias.
+    """
+    key = str(value or LLMConfig().provider).strip().lower()
+    if key in _LLM_PROVIDER_ALIASES:
+        return _LLM_PROVIDER_ALIASES[key]
+
+    choices = ", ".join(sorted(SUPPORTED_LLM_PROVIDERS | {"llm"}))
+    raise ValueError(
+        f"services.llm.provider must be one of: {choices}; got {value!r}"
+    )
--- a/src/voice/context_sync.py
+++ b/src/voice/context_sync.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pipecat.frames.frames import Frame, InterruptionFrame, LLMMessagesAppendFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+from .text_stream import ProductTextStreamProcessor, maybe_sync_assistant_context
+
+
+class AssistantContextSyncProcessor(FrameProcessor):
+    """Repair the assistant context before a text-input turn is appended.
+
+    An ``input.text`` message with ``interrupt: true`` queues an
+    ``InterruptionFrame`` ahead of its ``LLMMessagesAppendFrame``. This
+    processor remembers that an interruption went by and, on the next append
+    frame, calls ``maybe_sync_assistant_context`` so the LLM context matches
+    the assistant text that was already urgent-streamed - after the interrupt
+    has propagated (including TTS-phase interrupts) and before the new user
+    message is appended.
+    """
+
+    def __init__(
+        self,
+        *,
+        text_stream: ProductTextStreamProcessor,
+        assistant_aggregator: Any,
+    ) -> None:
+        super().__init__()
+        self._assistant_aggregator = assistant_aggregator
+        self._text_stream = text_stream
+        # Armed by an InterruptionFrame, consumed by the next append frame.
+        self._sync_on_next_append = False
+
+    def _sync_if_pending(self, frame: Frame) -> None:
+        """Track interruptions and run the pending sync on an append frame."""
+        if isinstance(frame, InterruptionFrame):
+            self._sync_on_next_append = True
+            return
+        if not self._sync_on_next_append:
+            return
+        if isinstance(frame, LLMMessagesAppendFrame):
+            self._sync_on_next_append = False
+            maybe_sync_assistant_context(self._assistant_aggregator, self._text_stream)
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Update the sync state for ``frame``, then forward it unchanged."""
+        await super().process_frame(frame, direction)
+        self._sync_if_pending(frame)
+        await self.push_frame(frame, direction)
--- a/src/voice/fastgpt_llm.py
+++ b/src/voice/fastgpt_llm.py
@@ -0,0 +1,564 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import binascii
+import os
+import tempfile
+import uuid
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+from fastgpt_client import AsyncChatClient, FastGPTInteractiveEvent, aiter_stream_events
+from fastgpt_client.exceptions import FastGPTError
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    Frame,
+    InterruptionFrame,
+    LLMContextFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
+)
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import LLMService
+from pipecat.services.settings import LLMSettings
+
+
+def _extract_text_from_event(kind: str, payload: Any) -> str:
+    """Pull the text fragment out of one FastGPT stream event payload.
+
+    Lookup order:
+      1. For ``answer`` / ``fastAnswer`` events, a non-empty top-level ``text``.
+      2. ``choices[0].delta.content``.
+      3. ``choices[0].message.content``.
+      4. When ``choices`` is missing or empty, the top-level ``text``.
+
+    Only string values are ever returned. A non-string ``text`` (for example a
+    dict or a number) used to be passed through ``str()`` and would end up
+    being spoken verbatim; it is now treated as "no text".
+
+    Args:
+        kind: Stream event kind reported by the FastGPT client.
+        payload: Decoded event data; anything that is not a dict yields ``""``.
+
+    Returns:
+        The extracted text, or ``""`` when the event carries none.
+    """
+    if not isinstance(payload, dict):
+        return ""
+
+    top_level_text = payload.get("text")
+    if not isinstance(top_level_text, str):
+        top_level_text = ""
+
+    if kind in {"answer", "fastAnswer"} and top_level_text:
+        return top_level_text
+
+    choices = payload.get("choices")
+    if not isinstance(choices, list) or not choices:
+        return top_level_text
+
+    first_choice = choices[0] if isinstance(choices[0], dict) else {}
+    # ``delta`` is checked before ``message``, matching the original precedence.
+    for container_key in ("delta", "message"):
+        container = first_choice.get(container_key)
+        if not isinstance(container, dict):
+            continue
+        content = container.get("content")
+        if isinstance(content, str) and content:
+            return content
+
+    return ""
+
+
+def _message_text(message: dict[str, Any]) -> str:
+    """Return the plain text carried by a chat message.
+
+    A string ``content`` is returned stripped. A list ``content`` contributes
+    the stripped ``text`` of every ``{"type": "text"}`` part that is non-blank,
+    joined with single spaces. Any other ``content`` yields ``""``.
+    """
+    content = message.get("content")
+    if isinstance(content, str):
+        return content.strip()
+    if not isinstance(content, list):
+        return ""
+
+    raw_fragments = (
+        part.get("text")
+        for part in content
+        if isinstance(part, dict) and part.get("type") == "text"
+    )
+    return " ".join(
+        fragment.strip()
+        for fragment in raw_fragments
+        if isinstance(fragment, str) and fragment.strip()
+    )
+
+
+def _first_nonempty_text(*values: Any) -> str:
+    """Return the first string argument that is non-blank, stripped.
+
+    Non-string arguments are ignored. ``""`` is returned when nothing qualifies.
+    """
+    stripped_candidates = (value.strip() for value in values if isinstance(value, str))
+    return next((text for text in stripped_candidates if text), "")
+
+
+def _interactive_spoken_prompt(event: FastGPTInteractiveEvent) -> str:
+    """Choose the sentence spoken to the user for an interactive FastGPT event.
+
+    An explicit prompt wins: the first non-blank value among ``opener``,
+    ``prompt``, ``text``, ``title`` and ``description``, each looked up on the
+    event payload first and on its ``params`` second. Otherwise a prompt is
+    synthesised from the interaction type (numbered options for
+    ``userSelect``, field labels for ``userInput``), falling back to a generic
+    "please continue" sentence.
+    """
+    payload = event.data if isinstance(event.data, dict) else {}
+    params = payload.get("params")
+    if not isinstance(params, dict):
+        params = {}
+
+    explicit_candidates = [
+        source.get(key)
+        for key in ("opener", "prompt", "text", "title", "description")
+        for source in (payload, params)
+    ]
+    explicit_prompt = _first_nonempty_text(*explicit_candidates)
+    if explicit_prompt:
+        return explicit_prompt
+
+    if event.interaction_type == "userSelect":
+        raw_options = params.get("userSelectOptions")
+        if not isinstance(raw_options, list):
+            raw_options = []
+
+        option_labels: list[str] = []
+        # The number reflects the option's position, so skipped entries leave gaps.
+        for position, raw in enumerate(raw_options, start=1):
+            if isinstance(raw, str):
+                label = raw.strip()
+            elif isinstance(raw, dict):
+                label = _first_nonempty_text(raw.get("label"), raw.get("value"))
+            else:
+                label = ""
+            if label:
+                option_labels.append(f"{position}. {label}")
+
+        if not option_labels:
+            return "请选择一个选项。"
+        return "请选择：" + "，".join(option_labels)
+
+    if event.interaction_type == "userInput":
+        form_entries = params.get("inputForm")
+        if not isinstance(form_entries, list):
+            form_entries = []
+
+        field_labels: list[str] = []
+        for entry in form_entries:
+            if not isinstance(entry, dict):
+                continue
+            label = _first_nonempty_text(entry.get("label"), entry.get("name"))
+            if label:
+                field_labels.append(label)
+
+        if not field_labels:
+            return "请补充所需信息。"
+        return "请提供以下信息：" + "，".join(field_labels)
+
+    return "请继续。"
+
+
+# How an image attached to a user turn is handed to FastGPT: kept inline as a
+# base64 data URL, or uploaded first and replaced by the returned URL.
+IMAGE_INPUT_MODE_BASE64 = "base64"
+IMAGE_INPUT_MODE_UPLOAD = "upload"
+SUPPORTED_IMAGE_INPUT_MODES = frozenset({IMAGE_INPUT_MODE_BASE64, IMAGE_INPUT_MODE_UPLOAD})
+
+# Suffix given to the temporary upload file, keyed by the data URL's MIME type.
+_MIME_TO_EXT = {
+    "image/jpeg": ".jpg",
+    "image/png": ".png",
+    "image/webp": ".webp",
+}
+
+
+def _message_has_image(message: dict[str, Any]) -> bool:
+    """Tell whether a message's list ``content`` holds an ``image_url`` part."""
+    content = message.get("content")
+    if not isinstance(content, list):
+        return False
+
+    for part in content:
+        if isinstance(part, dict) and part.get("type") == "image_url":
+            return True
+    return False
+
+
+def _redact_messages_for_log(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return a log-safe view of ``messages`` with image URLs replaced.
+
+    Every ``image_url`` part whose ``image_url`` value is a dict is swapped for
+    a part whose URL is the placeholder ``<N chars>`` (N being the original URL
+    length). Messages without list content are passed through as the same
+    object; the input messages are never modified.
+    """
+
+    def _redact_part(part: Any) -> Any:
+        if not isinstance(part, dict) or part.get("type") != "image_url":
+            return part
+        image_url = part.get("image_url")
+        if not isinstance(image_url, dict):
+            return part
+        url_length = len(str(image_url.get("url") or ""))
+        return {"type": "image_url", "image_url": {"url": f"<{url_length} chars>"}}
+
+    sanitized: list[dict[str, Any]] = []
+    for message in messages:
+        content = message.get("content")
+        if isinstance(content, list):
+            sanitized.append({**message, "content": [_redact_part(part) for part in content]})
+        else:
+            sanitized.append(message)
+    return sanitized
+
+
+@dataclass
+class FastGPTLLMSettings(LLMSettings):
+    """``LLMSettings`` extended with the options specific to FastGPT requests."""
+
+    # Workflow variables sent as ``variables`` with each chat completion.
+    variables: dict[str, Any] = field(default_factory=dict)
+    # Forwarded as the ``detail`` argument of ``create_chat_completion``.
+    detail: bool = False
+
+
+def _default_fastgpt_settings(*, model: str = "fastgpt") -> FastGPTLLMSettings:
+    """Build the baseline settings used before any caller overrides are applied.
+
+    Every generic sampling/prompt field is left as ``None``; user-turn
+    filtering is off, there are no workflow variables, and ``detail`` is off.
+    """
+    unset_fields = dict.fromkeys(
+        (
+            "system_instruction",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "top_k",
+            "frequency_penalty",
+            "presence_penalty",
+            "seed",
+            "user_turn_completion_config",
+        )
+    )
+    return FastGPTLLMSettings(
+        model=model,
+        filter_incomplete_user_turns=False,
+        variables={},
+        detail=False,
+        **unset_fields,
+    )
+
+
+class FastGPTLLMService(LLMService):
+    """FastGPT LLM service using chatId server-side memory and workflow variables."""
+
+    Settings = FastGPTLLMSettings
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str,
+        chat_id: str | None = None,
+        app_id: str | None = None,
+        greeting_prompt: str | None = None,
+        timeout: float = 60.0,
+        image_input_mode: str = IMAGE_INPUT_MODE_BASE64,
+        settings: FastGPTLLMSettings | None = None,
+        **kwargs,
+    ) -> None:
+        """Create the service and its FastGPT chat client.
+
+        Args:
+            api_key: Passed to ``AsyncChatClient``.
+            base_url: Passed to ``AsyncChatClient``.
+            chat_id: Conversation id; a random ``voice_<16 hex>`` id is
+                generated when omitted or empty.
+            app_id: FastGPT app id; stored stripped, ``""`` when omitted.
+            greeting_prompt: User message sent when the context has no usable
+                user message; defaults to "你好".
+            timeout: Passed to ``AsyncChatClient``.
+            image_input_mode: ``"base64"`` or ``"upload"`` (case-insensitive).
+            settings: Optional overrides applied on top of the defaults.
+            **kwargs: Forwarded to ``LLMService.__init__``.
+
+        Raises:
+            ValueError: If ``image_input_mode`` is not a supported mode.
+        """
+        default_settings = _default_fastgpt_settings()
+        if settings is not None:
+            default_settings.apply_update(settings)
+        super().__init__(settings=default_settings, **kwargs)
+
+        self._chat_id = chat_id or f"voice_{uuid.uuid4().hex[:16]}"
+        self._app_id = (app_id or "").strip()
+        self._greeting_prompt = (greeting_prompt or "你好").strip() or "你好"
+        self._client = AsyncChatClient(
+            api_key=api_key,
+            base_url=base_url,
+            timeout=timeout,
+        )
+        # Streaming response currently being consumed, if any; closed on
+        # stop, cancel and interruption.
+        self._active_response = None
+
+        mode = (image_input_mode or IMAGE_INPUT_MODE_BASE64).strip().lower()
+        if mode not in SUPPORTED_IMAGE_INPUT_MODES:
+            raise ValueError(
+                f"Unsupported image_input_mode {image_input_mode!r}; "
+                f"expected one of {sorted(SUPPORTED_IMAGE_INPUT_MODES)}"
+            )
+        # Uploading needs an app id; without one, degrade to inline base64
+        # instead of failing construction.
+        if mode == IMAGE_INPUT_MODE_UPLOAD and not self._app_id:
+            logger.warning(
+                "FastGPT image_input_mode='upload' requires app_id; "
+                "falling back to inline base64"
+            )
+            mode = IMAGE_INPUT_MODE_BASE64
+        self._image_input_mode = mode
+
+    @property
+    def app_id(self) -> str:
+        """The configured FastGPT app id, stripped; ``""`` when none was given."""
+        return self._app_id
+
+    @property
+    def chat_id(self) -> str:
+        """The chat id sent as ``chatId`` on every FastGPT request."""
+        return self._chat_id
+
+    def set_variables(self, variables: dict[str, Any]) -> None:
+        """Merge ``variables`` into the workflow variables; new keys win.
+
+        A fresh dict is assigned, so the previously stored mapping is left
+        untouched.
+        """
+        self._settings.variables = {**self._settings.variables, **variables}
+
+    async def stop(self, frame: EndFrame) -> None:
+        """Close the in-flight response and the HTTP client, then stop the base service.
+
+        The cleanup steps are chained with ``finally`` so that a failure while
+        closing the response cannot leak the client, and a failure while
+        closing either cannot prevent ``LLMService.stop`` from running. The
+        first error raised still propagates to the caller.
+        """
+        try:
+            await self._close_active_response()
+        finally:
+            try:
+                await self._client.close()
+            finally:
+                await super().stop(frame)
+
+    async def cancel(self, frame: CancelFrame) -> None:
+        """Close the in-flight response and the HTTP client, then cancel the base service.
+
+        Cancellation is a terminal path just like ``stop`` (the pipeline is
+        cancelled when the websocket client disconnects), so the FastGPT client
+        must be closed here too or its connection pool is leaked for every
+        session. ``finally`` chaining guarantees ``LLMService.cancel`` runs
+        even if a cleanup step raises.
+        """
+        try:
+            await self._close_active_response()
+        finally:
+            try:
+                await self._client.close()
+            finally:
+                await super().cancel(frame)
+
+    async def _handle_interruptions(self, _: InterruptionFrame) -> None:
+        """Abort the in-flight FastGPT stream, then run the base interruption handling."""
+        await self._close_active_response()
+        await super()._handle_interruptions(_)
+
+    @staticmethod
+    def _welcome_text_from_init_payload(payload: Any) -> str:
+        """Search a chat-init response for the app's welcome text.
+
+        The ``app`` value, the ``data`` value and the payload itself are tried
+        in that order. For each one that is a dict, its nested ``app`` dict is
+        inspected before the container itself. Returns ``""`` if nothing is
+        found or ``payload`` is not a dict.
+        """
+        if not isinstance(payload, dict):
+            return ""
+
+        containers = [
+            candidate
+            for candidate in (payload.get("app"), payload.get("data"), payload)
+            if isinstance(candidate, dict)
+        ]
+        for container in containers:
+            for app_like in (container.get("app"), container):
+                if not isinstance(app_like, dict):
+                    continue
+                welcome = FastGPTLLMService._welcome_text_from_app(app_like)
+                if welcome:
+                    return welcome
+        return ""
+
+    @staticmethod
+    def _welcome_text_from_app(app_payload: dict[str, Any]) -> str:
+        """Return the first non-blank welcome string found on an app payload.
+
+        Precedence: ``chatConfig.welcomeText``, then the top-level
+        ``welcomeText``, ``opener`` and ``intro`` keys.
+        """
+        chat_config = app_payload.get("chatConfig")
+        configured_welcome = (
+            chat_config.get("welcomeText") if isinstance(chat_config, dict) else None
+        )
+        top_level_candidates = [
+            app_payload.get(key) for key in ("welcomeText", "opener", "intro")
+        ]
+        return _first_nonempty_text(configured_welcome, *top_level_candidates)
+
+    async def fetch_welcome_text(self) -> str | None:
+        """Return FastGPT app welcome text from chat init when ``app_id`` is configured.
+
+        Best-effort: any failure (client error, HTTP error, unexpected payload)
+        is logged as a warning and reported as ``None`` rather than raised.
+
+        Returns:
+            The welcome text, or ``None`` when no app id is configured, the
+            request fails, or the response contains no welcome text.
+        """
+        if not self._app_id:
+            return None
+
+        try:
+            response = await self._client.get_chat_init(
+                appId=self._app_id,
+                chatId=self._chat_id,
+            )
+            response.raise_for_status()
+            text = self._welcome_text_from_init_payload(response.json())
+            if text:
+                logger.info(f"FastGPT app opener loaded for appId={self._app_id}")
+            return text or None
+        except FastGPTError as exc:
+            logger.warning(f"FastGPT chat init failed: {exc}")
+        except httpx.HTTPError as exc:
+            logger.warning(f"FastGPT chat init HTTP error: {exc}")
+        except Exception as exc:
+            # Catch-all (e.g. a body that is not valid JSON): the greeting is
+            # optional, so never let it break the session.
+            logger.warning(f"FastGPT chat init error: {exc}")
+        return None
+
+    async def has_chat_history(self) -> bool:
+        """Return whether FastGPT has persisted records for this chatId.
+
+        Requests a single record (``offset=0``, ``pageSize=1``) and checks
+        whether ``data.list`` in the response is a non-empty list. Returns
+        ``False`` when no app id is configured or when anything goes wrong;
+        failures are logged as warnings, never raised.
+        """
+        if not self._app_id:
+            return False
+
+        try:
+            response = await self._client.get_chat_records(
+                appId=self._app_id,
+                chatId=self._chat_id,
+                offset=0,
+                pageSize=1,
+            )
+            response.raise_for_status()
+            data = response.json()
+            # NOTE(review): if ``data`` is present but null (or the body is not
+            # an object) this raises AttributeError, which the catch-all below
+            # turns into a warning and a ``False`` result.
+            records = data.get("data", {}).get("list", [])
+            return isinstance(records, list) and bool(records)
+        except FastGPTError as exc:
+            logger.warning(f"FastGPT chat records failed: {exc}")
+        except httpx.HTTPError as exc:
+            logger.warning(f"FastGPT chat records HTTP error: {exc}")
+        except Exception as exc:
+            logger.warning(f"FastGPT chat records error: {exc}")
+        return False
+
+    async def fetch_session_greeting_text(self, reconnect_greeting: str) -> str | None:
+        """Pick the greeting for a new connection.
+
+        A chatId without stored history gets the app's opener from chat init;
+        a chatId that already has history gets ``reconnect_greeting`` stripped
+        (``None`` when that is blank).
+        """
+        if not await self.has_chat_history():
+            logger.info(f"FastGPT chatId={self._chat_id} has no history; using app opener")
+            return await self.fetch_welcome_text()
+
+        logger.info(f"FastGPT chatId={self._chat_id} has history; using reconnect greeting")
+        greeting = reconnect_greeting.strip()
+        return greeting or None
+
+    async def _close_active_response(self) -> None:
+        """Detach the in-flight streaming response, if any, and close it.
+
+        The attribute is cleared before awaiting so the same response is not
+        picked up again by another call made while the close is in progress.
+        """
+        pending, self._active_response = self._active_response, None
+        if pending is None:
+            return
+        await pending.aclose()
+
+    def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, Any]]:
+        """Reduce the local context to the single user message sent to FastGPT.
+
+        Walks the context newest-first and returns the first user message that
+        either carries an image or has non-blank text. If none qualifies, the
+        configured greeting prompt is sent as the user message instead.
+        """
+        newest_first_user_messages = (
+            message
+            for message in reversed(context.get_messages())
+            if isinstance(message, dict) and message.get("role") == "user"
+        )
+        for message in newest_first_user_messages:
+            if _message_has_image(message):
+                # Multimodal turn: pass the OpenAI-style content list through
+                # untouched (text parts plus image_url with a base64 data
+                # URL); FastGPT's /chat/completions accepts it directly.
+                return [{"role": "user", "content": message["content"]}]
+
+            spoken_text = _message_text(message)
+            if spoken_text:
+                return [{"role": "user", "content": spoken_text}]
+
+        return [{"role": "user", "content": self._greeting_prompt}]
+
+    async def _resolve_image_inputs(
+        self, messages: list[dict[str, Any]]
+    ) -> list[dict[str, Any]]:
+        """In ``upload`` mode, replace inline base64 image data URLs with uploaded URLs.
+
+        In ``base64`` mode the messages are returned untouched (inline data URLs).
+        New message/content objects are built so the shared ``LLMContext`` messages
+        are never mutated.
+
+        Parts whose ``image_url`` value is not a dict are passed through
+        unchanged instead of raising ``AttributeError``, matching the guard
+        used when the same messages are redacted for logging.
+
+        Args:
+            messages: Messages about to be sent to FastGPT.
+
+        Returns:
+            ``messages`` itself in ``base64`` mode; otherwise a new list in
+            which every ``data:image/`` URL has been replaced by the result of
+            ``_upload_data_url`` (which falls back to the original data URL on
+            failure).
+        """
+        if self._image_input_mode != IMAGE_INPUT_MODE_UPLOAD:
+            return messages
+
+        resolved: list[dict[str, Any]] = []
+        for message in messages:
+            content = message.get("content")
+            if not isinstance(content, list):
+                resolved.append(message)
+                continue
+
+            new_content: list[Any] = []
+            for part in content:
+                url = None
+                if isinstance(part, dict) and part.get("type") == "image_url":
+                    image_url = part.get("image_url")
+                    # ``image_url`` may be malformed (string, ``None``); only a
+                    # dict can carry the ``url`` we know how to upload.
+                    if isinstance(image_url, dict):
+                        url = image_url.get("url")
+
+                if isinstance(url, str) and url.startswith("data:image/"):
+                    uploaded = await self._upload_data_url(url)
+                    new_content.append(
+                        {"type": "image_url", "image_url": {"url": uploaded}}
+                    )
+                else:
+                    new_content.append(part)
+            resolved.append({**message, "content": new_content})
+
+        return resolved
+
+    async def _upload_data_url(self, data_url: str) -> str:
+        """Upload a ``data:image/...;base64,...`` URL via FastGPT and return its URL.
+
+        Falls back to the original data URL if parsing or upload fails so the turn
+        still proceeds with inline base64.
+
+        Args:
+            data_url: Data URL to upload; the caller only passes URLs starting
+                with ``data:image/``.
+
+        Returns:
+            The uploaded image URL, or ``data_url`` unchanged on any failure.
+        """
+        # Split "data:<mime>[;params]" from the base64 payload at the first comma.
+        header, _, payload = data_url.partition(",")
+        mime_type = header[len("data:"):].split(";", 1)[0].strip() or "image/jpeg"
+        try:
+            raw = base64.b64decode(payload, validate=True)
+        except (binascii.Error, ValueError) as exc:
+            logger.warning(f"FastGPT image upload skipped; invalid base64: {exc}")
+            return data_url
+
+        suffix = _MIME_TO_EXT.get(mime_type, ".jpg")
+        tmp_path: str | None = None
+        try:
+            # The client uploads from a file path, so the bytes are spooled to
+            # a named temp file; delete=False keeps it on disk after the
+            # ``with`` block closes it, and the ``finally`` below removes it.
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+                tmp.write(raw)
+                tmp_path = tmp.name
+            result = await self._client.upload_chat_image(
+                appId=self._app_id,
+                chatId=self._chat_id,
+                file_path=tmp_path,
+            )
+            url = result.get("url") if isinstance(result, dict) else None
+            if isinstance(url, str) and url:
+                logger.info(
+                    f"FastGPT image uploaded chatId={self._chat_id} "
+                    f"bytes={len(raw)} url={url}"
+                )
+                return url
+            logger.warning("FastGPT image upload returned no url; using inline base64")
+            return data_url
+        except Exception as exc:
+            # Deliberate best-effort: any upload problem degrades to inline base64.
+            logger.warning(f"FastGPT image upload failed; using inline base64: {exc}")
+            return data_url
+        finally:
+            if tmp_path is not None:
+                try:
+                    os.unlink(tmp_path)
+                except OSError:
+                    # Temp-file cleanup is best-effort.
+                    pass
+
+    async def _process_context(self, context: LLMContext) -> None:
+        """Run one FastGPT streaming completion for ``context`` and push its output.
+
+        The message built by ``_build_fastgpt_messages`` is sent together with
+        the session ``chatId`` and the workflow variables. Text events
+        (``data`` / ``answer`` / ``fastAnswer``) are pushed as ``LLMTextFrame``;
+        an ``interactive``, ``error`` or ``done`` event ends the stream. The
+        response is always closed and detached from ``_active_response`` when
+        streaming finishes, however it finishes.
+
+        Raises:
+            httpx.TimeoutException: Propagated unchanged so the caller's
+                dedicated timeout handling can run. ``TimeoutException`` is a
+                subclass of ``httpx.HTTPError``, so without the explicit
+                re-raise the generic HTTP handler below would swallow request
+                timeouts and report them as plain HTTP errors.
+        """
+        messages = self._build_fastgpt_messages(context)
+        messages = await self._resolve_image_inputs(messages)
+        variables = self._settings.variables or None
+
+        logger.info(
+            "FastGPT chat completion "
+            f"chatId={self._chat_id} appId={self._app_id or '-'} "
+            f"variables={sorted((variables or {}).keys())} "
+            f"messages={_redact_messages_for_log(messages)!r}"
+        )
+
+        await self.start_ttfb_metrics()
+
+        try:
+            response = await self._client.create_chat_completion(
+                messages=messages,
+                stream=True,
+                chatId=self._chat_id,
+                variables=variables,
+                detail=self._settings.detail,
+            )
+        except FastGPTError as exc:
+            await self.push_error(error_msg=f"FastGPT request failed: {exc}", exception=exc)
+            return
+        except httpx.TimeoutException:
+            # Must precede the HTTPError clause: timeouts are HTTPErrors too.
+            raise
+        except httpx.HTTPError as exc:
+            await self.push_error(error_msg=f"FastGPT HTTP error: {exc}", exception=exc)
+            return
+
+        # Published so stop/cancel/interruption can abort the stream.
+        self._active_response = response
+
+        try:
+            async for event in aiter_stream_events(response):
+                if event.kind in {"data", "answer", "fastAnswer"}:
+                    text = _extract_text_from_event(event.kind, event.data)
+                    if text:
+                        await self.stop_ttfb_metrics()
+                        await self.push_frame(LLMTextFrame(text))
+                    continue
+
+                if event.kind == "interactive" and isinstance(event, FastGPTInteractiveEvent):
+                    # The workflow is waiting for user input; nothing more
+                    # will be consumed from this response.
+                    await self._handle_interactive(event)
+                    break
+
+                if event.kind == "error":
+                    payload = event.data if isinstance(event.data, dict) else {}
+                    message = _first_nonempty_text(
+                        payload.get("message"),
+                        payload.get("error"),
+                    ) or "FastGPT stream error"
+                    await self.push_error(error_msg=message)
+                    break
+
+                if event.kind == "done":
+                    break
+        finally:
+            self._active_response = None
+            await response.aclose()
+
+    async def _handle_interactive(self, event: FastGPTInteractiveEvent) -> None:
+        """Turn an interactive FastGPT event into a spoken prompt plus a client message.
+
+        The prompt from ``_interactive_spoken_prompt`` is pushed as an
+        ``LLMTextFrame``; the raw interaction (type and data) is then pushed
+        downstream as a ``response.interactive`` transport message.
+        """
+        prompt = _interactive_spoken_prompt(event)
+        if prompt:
+            await self.stop_ttfb_metrics()
+            await self.push_frame(LLMTextFrame(prompt))
+
+        await self.push_frame(
+            OutputTransportMessageFrame(
+                message={
+                    "type": "response.interactive",
+                    "interaction_type": event.interaction_type,
+                    "data": event.data,
+                }
+            ),
+            FrameDirection.DOWNSTREAM,
+        )
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Run a completion for ``LLMContextFrame``; forward every other frame.
+
+        A context frame is consumed here (not forwarded). Its response is
+        bracketed by ``LLMFullResponseStartFrame`` / ``LLMFullResponseEndFrame``,
+        and the end frame is pushed even when the completion fails. Failures
+        are reported with ``push_error`` rather than raised.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMContextFrame):
+            try:
+                await self.push_frame(LLMFullResponseStartFrame())
+                await self.start_processing_metrics()
+                await self._process_context(frame.context)
+            except httpx.TimeoutException as exc:
+                await self._call_event_handler("on_completion_timeout")
+                await self.push_error(error_msg="FastGPT completion timeout", exception=exc)
+            except Exception as exc:
+                await self.push_error(error_msg=f"FastGPT completion error: {exc}", exception=exc)
+            finally:
+                # Always close the metrics window and the response bracket.
+                await self.stop_processing_metrics()
+                await self.push_frame(LLMFullResponseEndFrame())
+        else:
+            await self.push_frame(frame, direction)
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -0,0 +1,291 @@
+from __future__ import annotations
+
+import uuid
+
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import (
+    LLMRunFrame,
+    OutputTransportMessageUrgentFrame,
+    TTSSpeakFrame,
+    UserStartedSpeakingFrame,
+)
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import (
+    AssistantTurnStoppedMessage,
+    LLMContextAggregatorPair,
+    LLMUserAggregatorParams,
+    UserTurnStoppedMessage,
+)
+from pipecat.serializers.base_serializer import FrameSerializer
+from pipecat.serializers.protobuf import ProtobufFrameSerializer
+from pipecat.transports.websocket.fastapi import (
+    FastAPIWebsocketParams,
+    FastAPIWebsocketTransport,
+)
+from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
+    SpeechTimeoutUserTurnStopStrategy,
+)
+from pipecat.turns.user_turn_strategies import UserTurnStrategies
+
+from .config import EngineConfig
+from .context_sync import AssistantContextSyncProcessor
+from .fastgpt_llm import FastGPTLLMService
+from .protocol import ProductWebsocketSerializer
+from .services import create_llm_service, create_stt_service, create_tts_service
+from .response_state import StateTagResponseProcessor
+from .text_input import ProductTextInputProcessor
+from .text_stream import ProductTextStreamProcessor, maybe_sync_assistant_context
+from .transcript_stream import ProductTranscriptStreamProcessor
+from .turn_start import InterruptionGateUserTurnStartStrategy
+
+
+def _chat_id_from_websocket(websocket) -> str | None:
+    """Read the chat id from the websocket's query string.
+
+    ``chatId`` is preferred over ``chat_id``. The first of them holding a
+    non-blank string is returned stripped; ``None`` when neither does or the
+    websocket exposes no query parameters.
+    """
+    query_params = getattr(websocket, "query_params", None)
+    if not query_params:
+        return None
+
+    raw_values = (query_params.get(name) for name in ("chatId", "chat_id"))
+    return next(
+        (
+            value.strip()
+            for value in raw_values
+            if isinstance(value, str) and value.strip()
+        ),
+        None,
+    )
+
+
+async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
+    """Run the voice pipeline for a client speaking the product JSON protocol.
+
+    Delegates to ``run_pipeline_with_serializer`` with a
+    ``ProductWebsocketSerializer`` built from the configured audio sample rate
+    and channel count.
+    """
+    await run_pipeline_with_serializer(
+        websocket,
+        config,
+        serializer=ProductWebsocketSerializer(
+            sample_rate=config.audio.sample_rate_hz,
+            channels=config.audio.channels,
+        ),
+        client_label="Product JSON",
+    )
+
+
+async def run_voice_pipeline(websocket, config: EngineConfig) -> None:
+    """Run the voice pipeline for a client speaking Pipecat's protobuf protocol.
+
+    Delegates to ``run_pipeline_with_serializer`` with a
+    ``ProtobufFrameSerializer``.
+    """
+    await run_pipeline_with_serializer(
+        websocket,
+        config,
+        serializer=ProtobufFrameSerializer(),
+        client_label="Pipecat protobuf",
+    )
+
+
+async def run_pipeline_with_serializer(
+    websocket,
+    config: EngineConfig,
+    *,
+    serializer: FrameSerializer,
+    client_label: str,
+) -> None:
+    """Build and run one voice session pipeline on ``websocket`` until it ends.
+
+    Wires transport input -> text input -> STT -> transcript stream -> context
+    sync -> user aggregator -> LLM -> (optional state-tag processor) -> text
+    stream -> TTS -> transport output -> assistant aggregator, registers the
+    session event handlers, and awaits the pipeline runner.
+
+    Args:
+        websocket: Accepted websocket connection; its query parameters may
+            carry ``chatId`` / ``chat_id``.
+        config: Engine configuration (audio, services, turn, agent, session).
+        serializer: Frame serializer used by the websocket transport.
+        client_label: Human-readable protocol name used in log messages.
+    """
+    transport = FastAPIWebsocketTransport(
+        websocket=websocket,
+        params=FastAPIWebsocketParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            audio_in_sample_rate=config.audio.sample_rate_hz,
+            audio_out_sample_rate=config.audio.sample_rate_hz,
+            audio_in_channels=config.audio.channels,
+            audio_out_channels=config.audio.channels,
+            serializer=serializer,
+            session_timeout=None,
+        ),
+    )
+
+    stt = create_stt_service(config.services.stt, config.audio)
+
+    llm_config = config.services.llm
+    # NOTE(review): the chat id is taken verbatim from the client's query
+    # string and selects the server-side conversation; confirm that callers
+    # are authorised for the chatId they present.
+    chat_id = _chat_id_from_websocket(websocket) or f"voice_{uuid.uuid4().hex[:16]}"
+    llm = create_llm_service(
+        llm_config,
+        chat_id=chat_id,
+        session_variables={"session_id": chat_id, "channel": "voice"},
+        greeting_prompt=config.agent.greeting,
+    )
+    if llm_config.is_fastgpt:
+        logger.info(f"LLM backend=fastgpt chatId={chat_id} appId={llm_config.app_id or '-'}")
+    else:
+        logger.info(f"LLM backend=openai model={llm_config.model}")
+
+    tts = create_tts_service(config.services.tts, config.audio)
+
+    # The local context is only seeded with system messages when the backend
+    # keeps history locally; otherwise it starts empty.
+    messages: list[dict[str, str]] = []
+    if llm_config.uses_local_context_history:
+        messages = [{"role": "system", "content": config.agent.system_prompt}]
+        if config.agent.greeting and config.agent.greeting_mode == "generated":
+            messages.append({"role": "system", "content": config.agent.greeting})
+
+    context = LLMContext(messages)
+
+    vad_params = VADParams(
+        confidence=config.turn.vad.confidence,
+        start_secs=config.turn.vad.start_secs,
+        stop_secs=config.turn.vad.stop_secs,
+        min_volume=config.turn.vad.min_volume,
+    )
+    # Replace pipecat's default stop strategy (Smart Turn v3) with a simple
+    # silence-timeout strategy. Smart Turn v3 was finalizing every short
+    # Chinese phrase as a complete turn, which caused one logical utterance
+    # to become several LLM calls and several user bubbles in the UI. The
+    # timeout strategy waits for `user_speech_timeout_sec` of silence
+    # (re-armed every time the user resumes speaking) before declaring the
+    # turn finished — which is what we actually want for streaming ASRs.
+    user_turn_strategies = UserTurnStrategies(
+        start=[
+            InterruptionGateUserTurnStartStrategy(
+                min_chars_when_bot_speaking=config.turn.interruption_min_chars,
+                allowed_short_replies=config.turn.interruption_short_replies,
+                use_interim=config.turn.interruption_use_interim,
+            ),
+        ],
+        stop=[
+            SpeechTimeoutUserTurnStopStrategy(
+                user_speech_timeout=config.turn.user_speech_timeout_sec,
+            ),
+        ],
+    )
+    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
+        context,
+        user_params=LLMUserAggregatorParams(
+            vad_analyzer=SileroVADAnalyzer(params=vad_params),
+            user_turn_strategies=user_turn_strategies,
+            user_idle_timeout=config.turn.idle_prompt_timeout_sec,
+        ),
+    )
+
+    text_stream = ProductTextStreamProcessor()
+    context_sync = AssistantContextSyncProcessor(
+        text_stream=text_stream,
+        assistant_aggregator=assistant_aggregator,
+    )
+
+    # Processor order is the frame path: context_sync sits before the user
+    # aggregator, and text_stream sits before TTS.
+    processors = [
+        transport.input(),
+        ProductTextInputProcessor(),
+        stt,
+        ProductTranscriptStreamProcessor(),
+        context_sync,
+        user_aggregator,
+        llm,
+    ]
+    if config.agent.response_state.enabled:
+        processors.append(StateTagResponseProcessor(config.agent.response_state))
+    processors.extend(
+        [
+            text_stream,
+            tts,
+            transport.output(),
+            assistant_aggregator,
+        ]
+    )
+    pipeline = Pipeline(processors)
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            audio_in_sample_rate=config.audio.sample_rate_hz,
+            audio_out_sample_rate=config.audio.sample_rate_hz,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+            enable_heartbeats=True,
+        ),
+        idle_timeout_secs=config.session.inactivity_timeout_sec,
+    )
+    # Ask the task to report UserStartedSpeakingFrame via
+    # on_frame_reached_upstream, which resets the idle-prompt counter below.
+    task.set_reached_upstream_filter((UserStartedSpeakingFrame,))
+    # Number of idle prompts spoken since the user was last active.
+    idle_prompt_count = 0
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(_transport, _client):
+        # Greeting depends on agent.greeting_mode: a fixed sentence, the
+        # FastGPT opener / reconnect greeting, or an LLM-generated greeting.
+        logger.info(f"{client_label} websocket client connected")
+        if config.agent.greeting_mode == "fixed" and config.agent.greeting:
+            await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
+        elif config.agent.greeting_mode == "fastgpt_opener":
+            if isinstance(llm, FastGPTLLMService):
+                welcome = await llm.fetch_session_greeting_text(
+                    config.agent.fastgpt_reconnect_greeting
+                )
+                if welcome:
+                    await task.queue_frames([TTSSpeakFrame(welcome)])
+                else:
+                    logger.warning("FastGPT opener requested but no opener text was returned")
+            else:
+                raise RuntimeError("agent.greeting_mode='fastgpt_opener' requires FastGPT LLM service")
+        elif config.agent.greeting_mode == "generated":
+            await task.queue_frames([LLMRunFrame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(_transport, _client):
+        logger.info(f"{client_label} websocket client disconnected")
+        await task.cancel()
+
+    @transport.event_handler("on_session_timeout")
+    async def on_session_timeout(_transport, _client):
+        logger.info(f"{client_label} websocket session timed out")
+        await task.cancel()
+
+    @task.event_handler("on_frame_reached_upstream")
+    async def on_frame_reached_upstream(_task, _frame: UserStartedSpeakingFrame):
+        nonlocal idle_prompt_count
+        idle_prompt_count = 0
+
+    @user_aggregator.event_handler("on_user_turn_started")
+    async def on_user_turn_started(_aggregator, _strategy):
+        nonlocal idle_prompt_count
+        idle_prompt_count = 0
+
+    @user_aggregator.event_handler("on_user_turn_stopped")
+    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
+        # Forward the final user transcript to the client; blank turns are
+        # logged but not sent.
+        logger.info(f"User: {message.content}")
+        text = (message.content or "").strip()
+        if not text:
+            return
+        await _aggregator.push_frame(
+            OutputTransportMessageUrgentFrame(
+                message={
+                    "type": "input.transcript.final",
+                    "text": text,
+                    "user_id": message.user_id,
+                    "timestamp": message.timestamp,
+                }
+            )
+        )
+
+    @assistant_aggregator.event_handler("on_assistant_turn_stopped")
+    async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage):
+        # Server-side log of the committed assistant turn, followed by context
+        # reconciliation against the text stream.
+        logger.info(f"Assistant: {message.content}")
+        maybe_sync_assistant_context(
+            _aggregator,
+            text_stream,
+            committed_text=message.content or "",
+        )
+        text_stream.take_interrupted_stream_text()
+
+    @user_aggregator.event_handler("on_user_turn_idle")
+    async def on_user_turn_idle(aggregator):
+        # Speak the idle prompt at most idle_prompt_max_count times between
+        # user activity; a blank prompt or non-positive max disables it.
+        nonlocal idle_prompt_count
+        text = config.turn.idle_prompt_text.strip()
+        if not text or config.turn.idle_prompt_max_count <= 0:
+            return
+        if idle_prompt_count >= config.turn.idle_prompt_max_count:
+            return
+
+        idle_prompt_count += 1
+        logger.info(
+            "User idle prompt triggered "
+            f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
+        )
+        await aggregator.push_frame(TTSSpeakFrame(text))
+
+    # NOTE: assistant turn started/final events are emitted by
+    # ProductTextStreamProcessor, upstream of TTS, so text streams to the
+    # client ahead of audio. The on_assistant_turn_stopped handler above is
+    # kept only for server-side logging and context reconciliation.
+
+    runner = PipelineRunner(handle_sigint=False)
+    await runner.run(task)
--- a/src/voice/protocol.py
+++ b/src/voice/protocol.py
@@ -0,0 +1,227 @@
+from __future__ import annotations
+
+import base64
+import binascii
+import json
+from typing import Any
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    EndFrame,
+    Frame,
+    InputAudioRawFrame,
+    InputTransportMessageFrame,
+    OutputAudioRawFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
+    TranscriptionFrame,
+    UserImageRawFrame,
+)
+from pipecat.serializers.base_serializer import FrameSerializer
+
+
+MAX_INPUT_IMAGE_BYTES = 8 * 1024 * 1024
+SUPPORTED_INPUT_IMAGE_MIME_TYPES = {"image/jpeg", "image/png", "image/webp"}
+
+
+class ProductWebsocketSerializer(FrameSerializer):
+    """Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport.
+
+    Outgoing frames are rendered as JSON text events that always carry ``type``,
+    ``protocol`` and a per-serializer increasing ``seq``. Incoming binary
+    messages are wrapped as raw input audio; incoming text messages must be JSON
+    objects and are dispatched on their ``type`` field. Malformed client input
+    is logged and dropped (``None``) instead of raising.
+    """
+
+    protocol = "va.ws.v1"
+
+    def __init__(self, *, sample_rate: int, channels: int):
+        """Store the audio format used when an incoming message does not specify one."""
+        super().__init__()
+        self._sample_rate = sample_rate
+        self._channels = channels
+        self._sequence = 0
+
+    async def serialize(self, frame: Frame) -> str | bytes | None:
+        """Render an outgoing frame as a JSON protocol event.
+
+        Returns:
+            The JSON text of the event, or ``None`` for frame types that are not
+            part of the product protocol.
+        """
+        if isinstance(frame, OutputAudioRawFrame):
+            return self._event(
+                "response.audio.delta",
+                audio=base64.b64encode(frame.audio).decode("ascii"),
+                bytes=len(frame.audio),
+                sample_rate=frame.sample_rate,
+                channels=frame.num_channels,
+            )
+
+        if isinstance(frame, BotStartedSpeakingFrame):
+            return self._event("response.audio.started")
+
+        if isinstance(frame, BotStoppedSpeakingFrame):
+            return self._event("response.audio.stopped")
+
+        if isinstance(frame, TranscriptionFrame):
+            return self._event(
+                "input.transcript.final",
+                text=frame.text,
+                user_id=frame.user_id,
+                timestamp=frame.timestamp,
+            )
+
+        # ProductTextStreamProcessor owns response.text.* events. TTS can also
+        # emit TextFrame subclasses internally, so serializing them here would
+        # make clients render duplicate assistant text.
+        if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
+            if self.should_ignore_frame(frame):
+                return None
+            message = frame.message
+            # Allow callers to emit a named protocol event by pushing a
+            # transport-message frame whose payload already carries a `type`.
+            if isinstance(message, dict) and isinstance(message.get("type"), str):
+                event_type = message["type"]
+                payload = {k: v for k, v in message.items() if k != "type"}
+                return self._event(event_type, **payload)
+            return self._event("transport.message", message=message)
+
+        return None
+
+    async def deserialize(self, data: str | bytes) -> Frame | None:
+        """Turn one incoming websocket message into a pipeline frame.
+
+        Args:
+            data: Binary data (treated as raw audio in the configured format) or
+                JSON text describing a protocol message.
+
+        Returns:
+            The frame for the message, or ``None`` when the message is invalid
+            or its type is unsupported (a warning is logged in both cases).
+        """
+        if isinstance(data, bytes):
+            return InputAudioRawFrame(
+                audio=data,
+                sample_rate=self._sample_rate,
+                num_channels=self._channels,
+            )
+
+        try:
+            message = json.loads(data)
+        except json.JSONDecodeError as exc:
+            logger.warning(f"Invalid product websocket JSON: {exc}")
+            return None
+
+        if not isinstance(message, dict):
+            logger.warning("Product websocket message must be a JSON object")
+            return None
+
+        message_type = message.get("type")
+        if message_type == "session.start":
+            chat_id = message.get("chatId") or message.get("chat_id")
+            return InputTransportMessageFrame(
+                message={
+                    "type": "session.started",
+                    "protocol": self.protocol,
+                    "chatId": chat_id if isinstance(chat_id, str) else None,
+                    "audio": {
+                        "encoding": "pcm_s16le",
+                        "sample_rate": self._sample_rate,
+                        "channels": self._channels,
+                    },
+                }
+            )
+
+        if message_type == "session.stop":
+            return EndFrame()
+
+        if message_type == "response.cancel":
+            return CancelFrame(reason="client_cancelled")
+
+        if message_type == "input.audio":
+            audio = message.get("audio") or message.get("data")
+            if not isinstance(audio, str):
+                logger.warning("input.audio requires base64 'audio' or 'data'")
+                return None
+            try:
+                pcm = base64.b64decode(audio)
+            except (binascii.Error, ValueError) as exc:
+                logger.warning(f"Invalid input.audio base64: {exc}")
+                return None
+            # The format fields come from the client: a non-numeric value (or
+            # JSON `Infinity`, which json.loads accepts) must not raise here.
+            try:
+                sample_rate = int(message.get("sample_rate") or self._sample_rate)
+                channels = int(message.get("channels") or self._channels)
+            except (TypeError, ValueError, OverflowError):
+                logger.warning("input.audio sample_rate and channels must be integers")
+                return None
+            if sample_rate <= 0 or channels <= 0:
+                logger.warning("input.audio requires positive sample_rate and channels")
+                return None
+            return InputAudioRawFrame(
+                audio=pcm,
+                sample_rate=sample_rate,
+                num_channels=channels,
+            )
+
+        if message_type == "input.image":
+            return self._deserialize_input_image(message)
+
+        if message_type == "input.text":
+            text = message.get("text")
+            if not isinstance(text, str) or not text.strip():
+                logger.warning("input.text requires non-empty 'text'")
+                return None
+            return InputTransportMessageFrame(
+                message={
+                    "type": "input.text",
+                    "text": text,
+                    "interrupt": bool(message.get("interrupt", True)),
+                }
+            )
+
+        if message_type == "transport.message":
+            payload = message.get("message")
+            return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message)
+
+        logger.warning(f"Unsupported product websocket message type: {message_type!r}")
+        return None
+
+    def _deserialize_input_image(self, message: dict[str, Any]) -> Frame | None:
+        """Validate an ``input.image`` message and build a ``UserImageRawFrame``.
+
+        Checks, in order: payload presence, mime type, dimensions, encoded size,
+        base64 validity, decoded size and the optional ``text`` field. Returns
+        ``None`` (after logging a warning) on the first failed check.
+        """
+        encoded = message.get("image") or message.get("data")
+        if not isinstance(encoded, str):
+            logger.warning("input.image requires base64 'image' or 'data'")
+            return None
+
+        mime_type = str(message.get("mime_type") or message.get("media_type") or "image/jpeg")
+        if mime_type not in SUPPORTED_INPUT_IMAGE_MIME_TYPES:
+            logger.warning(
+                "input.image unsupported mime_type "
+                f"{mime_type!r}; expected one of {sorted(SUPPORTED_INPUT_IMAGE_MIME_TYPES)}"
+            )
+            return None
+
+        try:
+            width = int(message.get("width") or 0)
+            height = int(message.get("height") or 0)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError covers JSON `Infinity`, which json.loads accepts.
+            logger.warning("input.image width and height must be integers")
+            return None
+
+        if width <= 0 or height <= 0:
+            logger.warning("input.image requires positive integer width and height")
+            return None
+
+        # Accept data URLs by dropping everything up to the first comma.
+        if "," in encoded and encoded.lstrip().startswith("data:"):
+            encoded = encoded.split(",", 1)[1]
+
+        # Base64 encodes every 3 bytes as 4 characters, so anything longer than
+        # this cannot decode to an image within the limit; skip the decode.
+        max_encoded_chars = 4 * ((MAX_INPUT_IMAGE_BYTES + 2) // 3)
+        if len(encoded) > max_encoded_chars:
+            logger.warning(
+                f"input.image too large: {len(encoded)} base64 characters; "
+                f"max is {MAX_INPUT_IMAGE_BYTES} bytes"
+            )
+            return None
+
+        try:
+            image = base64.b64decode(encoded, validate=True)
+        except (binascii.Error, ValueError) as exc:
+            logger.warning(f"Invalid input.image base64: {exc}")
+            return None
+
+        if len(image) > MAX_INPUT_IMAGE_BYTES:
+            logger.warning(
+                f"input.image too large: {len(image)} bytes; "
+                f"max is {MAX_INPUT_IMAGE_BYTES} bytes"
+            )
+            return None
+
+        text = message.get("text")
+        if text is not None and not isinstance(text, str):
+            logger.warning("input.image text must be a string when provided")
+            return None
+
+        return UserImageRawFrame(
+            image=image,
+            size=(width, height),
+            format=mime_type,
+            user_id=str(message.get("user_id") or "product-user"),
+            text=text or "Answer using this camera image.",
+            append_to_context=bool(message.get("append_to_context", True)),
+        )
+
+    def _event(self, event_type: str, /, **payload: Any) -> str:
+        """Build the JSON text for one outgoing protocol event.
+
+        ``event_type`` is positional-only so a payload key named ``event_type``
+        cannot collide with it. The envelope fields ``type``, ``protocol`` and
+        ``seq`` take precedence over payload keys of the same name.
+        """
+        self._sequence += 1
+        envelope: dict[str, Any] = {
+            "type": event_type,
+            "protocol": self.protocol,
+            "seq": self._sequence,
+        }
+        for key, value in payload.items():
+            envelope.setdefault(key, value)
+        return json.dumps(envelope, ensure_ascii=False)
--- a/src/voice/response_state.py
+++ b/src/voice/response_state.py
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    Frame,
+    InterruptionFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
+    OutputTransportMessageUrgentFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+from .config import ResponseStateConfig
+
+
+class StateTagResponseProcessor(FrameProcessor):
+    """Split a leading state tag off LLM output before text streaming and TTS.
+
+    The model is expected to answer in the form:
+
+        <state>some state</state>spoken response
+
+    When the tag is found, its content is sent as a product protocol event and
+    only the text after the closing tag continues downstream. When the model
+    does not produce the tag, the text is forwarded as received.
+    """
+
+    def __init__(self, config: ResponseStateConfig) -> None:
+        super().__init__()
+        self._tag = config.tag
+        self._event_type = config.event_type
+        self._max_prefix_chars = config.max_prefix_chars
+        self._opening_tag = f"<{self._tag}>"
+        self._closing_tag = f"</{self._tag}>"
+        # Response-start frame held back until the tag decision has been made.
+        self._start_frame: LLMFullResponseStartFrame | None = None
+        # Text collected while it is still undecided whether a tag is present.
+        self._buffer = ""
+        self._decided = False
+        self._in_llm_response = False
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Hold back the start of each LLM response until the tag question is settled."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMFullResponseStartFrame):
+            # Keep the start frame; it is released together with the first text.
+            self._reset()
+            self._start_frame = frame
+            self._in_llm_response = True
+        elif isinstance(frame, LLMTextFrame) and self._in_llm_response and not self._decided:
+            await self._process_initial_text(frame.text or "", direction)
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            if self._in_llm_response:
+                await self._flush_buffer(direction)
+            await self.push_frame(frame, direction)
+            self._reset()
+        elif isinstance(frame, (InterruptionFrame, CancelFrame)):
+            if self._in_llm_response:
+                await self._flush_buffer(direction)
+                self._reset()
+            await self.push_frame(frame, direction)
+        else:
+            await self.push_frame(frame, direction)
+
+    async def _process_initial_text(self, text: str, direction: FrameDirection) -> None:
+        """Buffer leading text until the tag is found or ruled out, then release it."""
+        if not text:
+            return
+
+        self._buffer += text
+        outcome = self._parse_buffer()
+        if outcome is None:
+            # Still ambiguous; wait for more text.
+            return
+
+        self._decided = True
+        state, spoken = outcome
+        if state is not None:
+            await self._emit_state(state)
+        await self._push_start(direction)
+        if spoken:
+            await self.push_frame(LLMTextFrame(spoken), direction)
+        self._buffer = ""
+
+    def _parse_buffer(self) -> tuple[str | None, str] | None:
+        """Inspect the buffered text for a leading state tag.
+
+        Returns:
+            ``None`` while more text is needed, ``(state, remainder)`` once a
+            complete tag was found, or ``(None, buffer)`` when the buffer is to
+            be forwarded without a state.
+        """
+        candidate = self._buffer.lstrip()
+        if not candidate:
+            return None
+
+        may_wait = len(self._buffer) < self._max_prefix_chars
+
+        if candidate.startswith(self._opening_tag):
+            body_start = len(self._opening_tag)
+            close_at = candidate.find(self._closing_tag, body_start)
+            if close_at != -1:
+                state = candidate[body_start:close_at].strip()
+                remainder = candidate[close_at + len(self._closing_tag):]
+                return state, remainder
+            # Opening tag seen but not closed yet: wait, unless the limit is hit.
+            return None if may_wait else (None, self._buffer)
+
+        # The buffer may still be a partial opening tag, e.g. "<sta".
+        if may_wait and self._opening_tag.startswith(candidate):
+            return None
+
+        return None, self._buffer
+
+    async def _flush_buffer(self, direction: FrameDirection) -> None:
+        """Release the held start frame and any buffered text as-is."""
+        await self._push_start(direction)
+        pending = self._buffer
+        if pending:
+            await self.push_frame(LLMTextFrame(pending), direction)
+            self._buffer = ""
+        self._decided = True
+
+    async def _push_start(self, direction: FrameDirection) -> None:
+        """Forward the held start frame, if there is one, exactly once."""
+        held = self._start_frame
+        if held:
+            await self.push_frame(held, direction)
+            self._start_frame = None
+
+    async def _emit_state(self, state: str) -> None:
+        """Send the extracted state downstream as an urgent transport message."""
+        state_event = {
+            "type": self._event_type,
+            "state": state,
+        }
+        await self.push_frame(
+            OutputTransportMessageUrgentFrame(message=state_event),
+            FrameDirection.DOWNSTREAM,
+        )
+
+    def _reset(self) -> None:
+        """Return to the idle state between LLM responses."""
+        self._start_frame = None
+        self._buffer = ""
+        self._decided = False
+        self._in_llm_response = False
--- a/src/voice/routes.py
+++ b/src/voice/routes.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+from fastapi import APIRouter, FastAPI, WebSocket
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from loguru import logger
+
+from .config import EngineConfig, load_config, resolve_voice_config_path
+from .pipeline import run_product_voice_pipeline
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+VOICE_DEMO_DIR = PROJECT_ROOT / "static" / "voice-demo"
+
+router = APIRouter(tags=["voice"])
+
+
+@lru_cache(maxsize=1)
+def get_voice_config() -> EngineConfig:
+    # Loaded once; lru_cache returns the same EngineConfig on every later call.
+    return load_config()
+
+
+@lru_cache(maxsize=1)
+def get_voice_config_path() -> Path:
+    # Resolved once; lru_cache returns the same Path on every later call.
+    return resolve_voice_config_path()
+
+
+def _normalize_mount_path(path: str) -> str:
+    """Return ``path`` as an absolute mount path without trailing slashes.
+
+    A blank path becomes ``/voice-demo``; a path made only of slashes
+    becomes ``/``.
+    """
+    cleaned = path.strip()
+    if not cleaned:
+        cleaned = "/voice-demo"
+    rooted = cleaned if cleaned.startswith("/") else "/" + cleaned
+    trimmed = rooted.rstrip("/")
+    return trimmed if trimmed else "/"
+
+
+@router.get("/voice/health")
+async def voice_health() -> dict[str, object]:
+    """Report voice status, the config path, protocol/features and service providers."""
+    config = get_voice_config()
+    server = config.server
+
+    # The demo mount is only reported when the demo page is enabled.
+    demo_mount = None
+    if server.serve_webpage:
+        demo_mount = _normalize_mount_path(server.webpage_mount)
+
+    report: dict[str, object] = {"status": "healthy"}
+    report["config"] = str(get_voice_config_path())
+    report["protocols"] = {
+        "/ws-product": "va.ws.v1.json_base64",
+    }
+    report["features"] = {
+        "product_text_input": True,
+        "product_text_interrupt": True,
+    }
+    report["demo"] = demo_mount
+    services = config.services
+    report["llm_provider"] = services.llm.provider
+    report["stt_provider"] = services.stt.provider
+    report["tts_provider"] = services.tts.provider
+    return report
+
+
+@router.websocket("/ws-product")
+async def product_websocket_endpoint(websocket: WebSocket) -> None:
+    """Accept a product websocket connection and run the voice pipeline on it."""
+    # The connection is accepted before the (cached) config is fetched.
+    await websocket.accept()
+    config = get_voice_config()
+    await run_product_voice_pipeline(websocket, config)
+
+
+def register_voice(app: FastAPI) -> None:
+    """Mount voice websocket routes and optional browser demo static files.
+
+    Does nothing except log a warning when the voice config file is missing.
+    Otherwise it includes the voice router, adds CORS middleware when origins
+    are configured, and mounts the demo page when it is enabled and present
+    on disk.
+    """
+    voice_config_path = get_voice_config_path()
+    if not voice_config_path.exists():
+        logger.warning(f"Voice config not found at {voice_config_path}; voice demo disabled")
+        return
+
+    config = get_voice_config()
+    app.include_router(router)
+    logger.info(f"Voice config loaded from {voice_config_path}")
+
+    if config.server.cors_origins:
+        # Credentials combined with a wildcard origin would let any website
+        # make credentialed cross-origin requests, so credentials are only
+        # enabled when every allowed origin is listed explicitly.
+        wildcard_origin = "*" in config.server.cors_origins
+        if wildcard_origin:
+            logger.warning(
+                "Voice CORS allows any origin; credentialed cross-origin requests are disabled"
+            )
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=config.server.cors_origins,
+            allow_credentials=not wildcard_origin,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )
+
+    if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir():
+        mount = _normalize_mount_path(config.server.webpage_mount)
+        app.mount(
+            mount,
+            StaticFiles(directory=str(VOICE_DEMO_DIR), html=True),
+            name="voice-demo",
+        )
+        logger.info(f"Voice demo mounted at {mount}")
+    else:
+        logger.info("Voice demo static page disabled or missing")
+
+    logger.info("Voice websocket registered at /ws-product")
--- a/src/voice/services.py
+++ b/src/voice/services.py
@@ -0,0 +1,220 @@
+from __future__ import annotations
+
+from collections.abc import AsyncGenerator
+
+from openai import BadRequestError
+from openai import NOT_GIVEN
+
+from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
+from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.openai.stt import OpenAISTTService
+from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService
+from pipecat.services.tts_service import TextAggregationMode
+from pipecat.transcriptions.language import Language
+
+from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig
+from .fastgpt_llm import FastGPTLLMService, FastGPTLLMSettings
+from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService
+from .xfyun_super_tts import DEFAULT_XFYUN_SUPER_TTS_URL, XfyunSuperTTSService
+from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService
+
+
+def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
+    """Build the speech-to-text service for ``config.provider``.
+
+    Supports ``xfyun`` and ``openai``; any other provider raises ``ValueError``.
+    For ``xfyun`` the sample rate comes from ``audio`` and defaults to 16000
+    when no audio config is given.
+    """
+    provider = config.provider
+    if provider != "xfyun":
+        _require_provider(provider, "openai", "stt")
+        openai_settings = OpenAISTTService.Settings(
+            model=config.model,
+            language=_language(config.language),
+        )
+        return OpenAISTTService(
+            api_key=config.api_key or None,
+            base_url=config.base_url,
+            settings=openai_settings,
+        )
+
+    input_rate = audio.sample_rate_hz if audio else 16000
+    return XfyunASRService(
+        app_id=config.app_id,
+        api_key=config.api_key or "",
+        api_secret=config.api_secret,
+        url=config.base_url or DEFAULT_XFYUN_ASR_URL,
+        language=config.language or "zh_cn",
+        domain=config.domain,
+        accent=config.accent,
+        sample_rate=input_rate,
+        encoding=config.encoding,
+        frame_size=config.frame_size,
+        open_timeout=config.timeout_sec,
+        dynamic_correction=config.dynamic_correction,
+    )
+
+
+def create_llm_service(
+    config: LLMConfig,
+    *,
+    chat_id: str | None = None,
+    session_variables: dict | None = None,
+    greeting_prompt: str | None = None,
+):
+    """Build the LLM service selected by ``config``.
+
+    FastGPT configs get a ``FastGPTLLMService`` whose variables are the config
+    variables overlaid with ``session_variables``. OpenAI configs get an
+    ``OpenAILLMService``. Anything else raises ``ValueError``.
+    """
+    if config.is_fastgpt:
+        # Session values override config values with the same key.
+        merged_variables = dict(config.variables)
+        merged_variables.update(session_variables or {})
+        fastgpt_settings = FastGPTLLMSettings(
+            model=config.model or "fastgpt",
+            variables=merged_variables,
+            detail=config.detail,
+        )
+        return FastGPTLLMService(
+            api_key=config.api_key,
+            base_url=config.base_url or "http://localhost:3000",
+            chat_id=chat_id,
+            app_id=config.app_id,
+            greeting_prompt=greeting_prompt,
+            timeout=config.timeout_sec,
+            image_input_mode=config.image_input_mode,
+            settings=fastgpt_settings,
+        )
+
+    if config.is_openai:
+        temperature = NOT_GIVEN if config.temperature is None else config.temperature
+        return OpenAILLMService(
+            api_key=config.api_key or None,
+            base_url=config.base_url,
+            settings=OpenAILLMService.Settings(
+                model=config.model,
+                temperature=temperature,
+            ),
+        )
+
+    supported = ", ".join(sorted(["openai", "fastgpt", "llm"]))
+    raise ValueError(
+        f"Unsupported llm provider {config.provider!r}; expected one of: {supported}"
+    )
+
+
+def create_tts_service(config: TTSConfig, audio: AudioConfig):
+    """Build the text-to-speech service for ``config.provider``.
+
+    Supports ``xfyun``, ``xfyun_super`` / ``xfyun_super_tts`` and ``openai``.
+    Raises ``ValueError`` for an unsupported provider or an Xfyun source
+    sample rate outside the accepted values.
+    """
+    provider = config.provider
+    output_rate = audio.sample_rate_hz
+
+    if provider == "xfyun":
+        # Without an explicit source rate, the output rate is used.
+        xfyun_rate = config.source_sample_rate_hz or output_rate
+        if xfyun_rate not in (8000, 16000):
+            raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000")
+        return XfyunTTSService(
+            app_id=config.app_id,
+            api_key=config.api_key or "",
+            api_secret=config.api_secret,
+            voice=config.voice,
+            url=config.base_url or DEFAULT_XFYUN_TTS_URL,
+            sample_rate=output_rate,
+            source_sample_rate=xfyun_rate,
+            encoding=config.aue,
+            text_encoding=config.tte,
+            speed=config.speed,
+            volume=config.volume,
+            pitch=config.pitch,
+            timeout=config.timeout_sec,
+            push_stop_frames=True,
+        )
+
+    if provider in ("xfyun_super", "xfyun_super_tts"):
+        # Without an explicit source rate, 24000 is used.
+        super_rate = config.source_sample_rate_hz or 24000
+        if super_rate not in (8000, 16000, 24000):
+            raise ValueError(
+                "Xfyun Super TTS source_sample_rate_hz must be 8000, 16000, or 24000"
+            )
+        aggregation_mode = config.text_aggregation_mode or TextAggregationMode.TOKEN
+        return XfyunSuperTTSService(
+            app_id=config.app_id,
+            api_key=config.api_key or "",
+            api_secret=config.api_secret,
+            voice=config.voice,
+            url=config.base_url or DEFAULT_XFYUN_SUPER_TTS_URL,
+            sample_rate=output_rate,
+            source_sample_rate=super_rate,
+            encoding=config.aue,
+            speed=config.speed,
+            volume=config.volume,
+            pitch=config.pitch,
+            oral_level=config.oral_level,
+            text_aggregation_mode=aggregation_mode,
+            open_timeout=config.timeout_sec,
+        )
+
+    _require_provider(provider, "openai", "tts")
+    # Voices outside VALID_VOICES go through the compatible service class.
+    if config.voice in VALID_VOICES:
+        service_class = OpenAITTSService
+    else:
+        service_class = OpenAICompatibleTTSService
+    openai_settings = OpenAITTSService.Settings(
+        model=config.model,
+        voice=config.voice,
+    )
+    return service_class(
+        api_key=config.api_key or None,
+        base_url=config.base_url,
+        sample_rate=output_rate,
+        source_sample_rate=config.source_sample_rate_hz,
+        settings=openai_settings,
+    )
+
+
+class OpenAICompatibleTTSService(OpenAITTSService):
+    """OpenAI-compatible TTS service that permits provider-specific voice ids."""
+
+    def __init__(self, *, source_sample_rate: int | None = None, **kwargs):
+        super().__init__(**kwargs)
+        # A missing (or zero) source rate falls back to OPENAI_SAMPLE_RATE.
+        self._source_sample_rate = (
+            source_sample_rate if source_sample_rate else OPENAI_SAMPLE_RATE
+        )
+
+    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
+        """Stream PCM speech for ``text``; failures are yielded as ``ErrorFrame``."""
+        settings = self._settings
+        voice = settings.voice
+        if not voice:
+            yield ErrorFrame(error="TTS voice must be specified")
+            return
+
+        try:
+            request = {
+                "input": text,
+                "model": settings.model,
+                "voice": voice,
+                "response_format": "pcm",
+            }
+            # Optional parameters are only sent when they hold a truthy value.
+            for option in ("instructions", "speed"):
+                value = getattr(settings, option)
+                if value:
+                    request[option] = value
+
+            speech = self._client.audio.speech.with_streaming_response
+            async with speech.create(**request) as response:
+                if response.status_code != 200:
+                    error = await response.text()
+                    yield ErrorFrame(
+                        error=f"TTS request failed (status: {response.status_code}, error: {error})"
+                    )
+                    return
+
+                await self.start_tts_usage_metrics(text)
+
+                async def audio_chunks():
+                    # Skip empty chunks from the response stream.
+                    async for chunk in response.iter_bytes(self.chunk_size):
+                        if chunk:
+                            yield chunk
+
+                awaiting_first_frame = True
+                async for frame in self._stream_audio_frames_from_iterator(
+                    audio_chunks(),
+                    in_sample_rate=self._source_sample_rate,
+                    context_id=context_id,
+                ):
+                    if awaiting_first_frame:
+                        await self.stop_ttfb_metrics()
+                        awaiting_first_frame = False
+                    yield frame
+        except Exception as exc:
+            # Covers BadRequestError as well; every failure gets the same error frame.
+            yield ErrorFrame(error=f"TTS request failed: {exc}")
+
+
+def _require_provider(actual: str, expected: str, service_name: str) -> None:
+    """Raise ``ValueError`` unless ``actual`` is the ``expected`` provider."""
+    if actual == expected:
+        return
+    raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}")
+
+
+def _language(value: str | None) -> Language | str | None:
+    """Look up a language code as an attribute of ``Language``.
+
+    The code is normalized by replacing ``-`` with ``_`` and upper-casing, so
+    ``zh-CN`` is looked up as ``ZH_CN``.
+
+    Returns:
+        ``None`` when ``value`` is ``None``, the matching ``Language``
+        attribute when one exists, otherwise the original string unchanged.
+    """
+    if value is None:
+        return None
+    member_name = value.replace("-", "_").upper()
+    # Unknown codes are passed through as the raw string rather than rejected.
+    return getattr(Language, member_name, value)
--- a/src/voice/text_input.py
+++ b/src/voice/text_input.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    Frame,
+    InputTransportMessageFrame,
+    LLMMessagesAppendFrame,
+    UserImageRawFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class ProductTextInputProcessor(FrameProcessor):
+    """Converts product text-input transport messages and marks image input as user activity."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Turn ``input.text`` messages into an LLM run; pass other frames through.
+
+        Image frames and text messages are both bracketed by broadcast
+        user-started/user-stopped speaking frames.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserImageRawFrame):
+            await self.broadcast_frame(UserStartedSpeakingFrame)
+            await self.push_frame(frame, direction)
+            await self.broadcast_frame(UserStoppedSpeakingFrame)
+            return
+
+        # Only transport messages shaped like {"type": "input.text", ...} are converted.
+        payload = frame.message if isinstance(frame, InputTransportMessageFrame) else None
+        if not isinstance(payload, dict) or payload.get("type") != "input.text":
+            await self.push_frame(frame, direction)
+            return
+
+        user_text = str(payload.get("text") or "").strip()
+        if not user_text:
+            # Blank text messages are dropped.
+            return
+
+        await self.broadcast_frame(UserStartedSpeakingFrame)
+
+        if payload.get("interrupt", True):
+            logger.info("Text input interrupting current response")
+            await self.broadcast_interruption()
+
+        append_and_run = LLMMessagesAppendFrame(
+            messages=[{"role": "user", "content": user_text}],
+            run_llm=True,
+        )
+        await self.push_frame(append_and_run, FrameDirection.DOWNSTREAM)
+        await self.broadcast_frame(UserStoppedSpeakingFrame)
--- a/src/voice/text_stream.py
+++ b/src/voice/text_stream.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+from typing import Any, Protocol
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    Frame,
+    InterruptionFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
+    OutputTransportMessageUrgentFrame,
+    TTSSpeakFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class _AssistantContextSync(Protocol):
+    # Structural type for the aggregator arguments in this module: anything
+    # that exposes a `context` property.
+    @property
+    def context(self) -> Any: ...
+
+
+def _committed_assistant_content(context: Any) -> str:
+    """Return the stripped text of the last message if it is an assistant string message.
+
+    Returns an empty string when there are no messages, the last message is not
+    an assistant dict, or its content is not a string.
+    """
+    history = context.get_messages()
+    tail = history[-1] if history else None
+    if isinstance(tail, dict) and tail.get("role") == "assistant":
+        body = tail.get("content")
+        if isinstance(body, str):
+            return body.strip()
+    return ""
+
+
+def sync_streamed_assistant_context(
+    aggregator: _AssistantContextSync,
+    *,
+    streamed_text: str,
+    committed_text: str,
+) -> None:
+    """Align LLM context with urgent-streamed UI text.
+
+    The assistant aggregator commits TTS-spoken text; ``ProductTextStreamProcessor``
+    mirrors the LLM stream to the client. Replace or insert the streamed text so
+    the next turn sees what the user read on screen.
+
+    Nothing happens when the streamed text is blank or already equals the
+    committed text (both compared after stripping whitespace).
+
+    Args:
+        aggregator: Object whose ``context.transform_messages`` applies the edit.
+        streamed_text: Text that was streamed to the client.
+        committed_text: Text that was committed to the context.
+    """
+    streamed = streamed_text.strip()
+    if not streamed or streamed == committed_text.strip():
+        return
+
+    def _apply(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        # Work on a copy; the input list is never mutated.
+        updated = list(messages)
+        assistant_message = {"role": "assistant", "content": streamed}
+        if not updated:
+            return [assistant_message]
+
+        last = updated[-1]
+        if isinstance(last, dict) and last.get("role") == "assistant":
+            # Replace a trailing assistant string message that differs from the
+            # streamed text; non-string content is left untouched.
+            content = last.get("content")
+            if isinstance(content, str) and content.strip() != streamed:
+                updated[-1] = assistant_message
+            return updated
+
+        if len(updated) >= 2 and isinstance(last, dict) and last.get("role") == "user":
+            prev = updated[-2]
+            if isinstance(prev, dict) and prev.get("role") == "user":
+                # Two user messages in a row: the assistant text goes between them.
+                updated.insert(len(updated) - 1, assistant_message)
+                return updated
+
+        updated.append(assistant_message)
+        return updated
+
+    aggregator.context.transform_messages(_apply)
+
+
+def maybe_sync_assistant_context(
+    aggregator: _AssistantContextSync,
+    text_stream: "ProductTextStreamProcessor",
+    *,
+    committed_text: str | None = None,
+) -> None:
+    """Sync the context with the last streamed assistant text, if there is any.
+
+    When ``committed_text`` is not given, the committed text is taken from the
+    trailing assistant message of the aggregator's context.
+    """
+    if committed_text is None:
+        committed = _committed_assistant_content(aggregator.context)
+    else:
+        committed = committed_text.strip()
+
+    streamed = text_stream.last_assistant_stream_text()
+    if streamed:
+        sync_streamed_assistant_context(
+            aggregator,
+            streamed_text=streamed,
+            committed_text=committed,
+        )
+
+
+class ProductTextStreamProcessor(FrameProcessor):
+    """Mirrors LLM text frames as streaming protocol events.
+
+    Sitting between the LLM service and the TTS service, this processor
+    forwards every frame unchanged and additionally emits
+    ``OutputTransportMessageUrgentFrame``s that the product serializer turns
+    into ``response.text.{started,delta,final}`` events.
+
+    Urgent frames bypass TTS serialization and transport audio queues so text
+    reaches the client at least as quickly as synthesized audio.
+
+    ``TTSSpeakFrame`` (used by the fixed-greeting code path, which bypasses
+    the LLM entirely) is handled too: a single started/delta/final sequence
+    is produced for its fixed text.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Text pieces of the turn currently being streamed.
+        self._aggregation: list[str] = []
+        self._turn_active = False
+        # Full text of the most recent turn that produced any text.
+        self._last_assistant_stream_text = ""
+        # Text recorded at an interruption until it is taken.
+        self._interrupted_stream_text: str | None = None
+
+    def last_assistant_stream_text(self) -> str:
+        """Return the full text of the most recent non-empty streamed turn."""
+        return self._last_assistant_stream_text
+
+    def take_interrupted_stream_text(self) -> str | None:
+        """Return the recorded interrupted text (or ``None``) and clear it."""
+        taken, self._interrupted_stream_text = self._interrupted_stream_text, None
+        return taken
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Forward ``frame`` first, then emit the matching text-stream events."""
+        await super().process_frame(frame, direction)
+
+        # Every frame is passed on before any protocol event is emitted for it.
+        await self.push_frame(frame, direction)
+
+        if isinstance(frame, LLMFullResponseStartFrame):
+            await self._start_turn()
+        elif isinstance(frame, LLMTextFrame):
+            if frame.text:
+                await self._delta(frame.text)
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            await self._end_turn(interrupted=False)
+        elif isinstance(frame, (InterruptionFrame, CancelFrame)):
+            await self._handle_interrupt()
+        elif isinstance(frame, TTSSpeakFrame):
+            # Fixed-text / direct-speech path: there's no LLM cycle, so
+            # synthesize one started/delta/final sequence for the spoken text.
+            spoken = frame.text or ""
+            await self._start_turn()
+            if spoken:
+                await self._delta(spoken)
+            await self._end_turn(interrupted=False)
+
+    async def _start_turn(self) -> None:
+        """Open a turn and emit ``response.text.started`` unless one is already open."""
+        if not self._turn_active:
+            self._turn_active = True
+            self._aggregation = []
+            await self._emit("response.text.started")
+
+    async def _delta(self, text: str) -> None:
+        """Record ``text`` for the current turn and emit ``response.text.delta``."""
+        if not self._turn_active:
+            # A text frame outside a turn shouldn't happen, but if it does,
+            # synthesize a started boundary so the client renders sensibly.
+            await self._start_turn()
+        self._aggregation.append(text)
+        await self._emit("response.text.delta", text=text)
+
+    async def _handle_interrupt(self) -> None:
+        """End an open turn as interrupted, or record the last text if none is open."""
+        if self._turn_active:
+            await self._end_turn(interrupted=True)
+        elif self._last_assistant_stream_text:
+            self._interrupted_stream_text = self._last_assistant_stream_text
+
+    async def _end_turn(self, *, interrupted: bool) -> None:
+        """Close the open turn and emit ``response.text.final`` with its full text."""
+        if not self._turn_active:
+            return
+
+        complete = "".join(self._aggregation)
+        if complete:
+            self._last_assistant_stream_text = complete
+            if interrupted:
+                self._interrupted_stream_text = complete
+
+        self._turn_active = False
+        self._aggregation = []
+        await self._emit("response.text.final", text=complete, interrupted=interrupted)
+
+    async def _emit(self, event_type: str, **payload: object) -> None:
+        """Push a named protocol event downstream as an urgent transport message."""
+        event = {"type": event_type, **payload}
+        await self.push_frame(
+            OutputTransportMessageUrgentFrame(message=event),
+            FrameDirection.DOWNSTREAM,
+        )
--- a/src/voice/transcript_stream.py
+++ b/src/voice/transcript_stream.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from pipecat.frames.frames import (
+    Frame,
+    InterimTranscriptionFrame,
+    OutputTransportMessageUrgentFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class ProductTranscriptStreamProcessor(FrameProcessor):
+    """Mirrors interim STT frames to the product websocket protocol."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, InterimTranscriptionFrame):
+            await self.push_frame(
+                OutputTransportMessageUrgentFrame(
+                    message={
+                        "type": "input.transcript.interim",
+                        "text": frame.text,
+                        "user_id": frame.user_id,
+                        "timestamp": frame.timestamp,
+                    }
+                ),
+                FrameDirection.DOWNSTREAM,
+            )
+
+        await self.push_frame(frame, direction)
--- a/src/voice/turn_start.py
+++ b/src/voice/turn_start.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import re
+
+from loguru import logger
+from pipecat.frames.frames import (
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    TranscriptionFrame,
+)
+from pipecat.turns.types import ProcessFrameResult
+from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy
+
+
+_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
+
+
+class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
+    """Starts user turns only after likely intentional speech.
+
+    When the assistant is speaking, short background speech should not barge in
+    unless it is a common answer to a yes/no style question. When the assistant
+    is not speaking, any non-empty transcript can start a normal user turn.
+    """
+
+    def __init__(
+        self,
+        *,
+        min_chars_when_bot_speaking: int,
+        allowed_short_replies: list[str],
+        use_interim: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._min_chars_when_bot_speaking = min_chars_when_bot_speaking
+        self._allowed_short_replies = {
+            self._normalize_text(reply) for reply in allowed_short_replies if reply.strip()
+        }
+        self._use_interim = use_interim
+        self._bot_speaking = False
+
+    async def reset(self):
+        await super().reset()
+
+    async def process_frame(self, frame: Frame) -> ProcessFrameResult:
+        if isinstance(frame, BotStartedSpeakingFrame):
+            self._bot_speaking = True
+            return ProcessFrameResult.CONTINUE
+        if isinstance(frame, BotStoppedSpeakingFrame):
+            self._bot_speaking = False
+            return ProcessFrameResult.CONTINUE
+        if isinstance(frame, InterimTranscriptionFrame) and self._use_interim:
+            return await self._handle_transcription(frame.text, interim=True)
+        if isinstance(frame, TranscriptionFrame):
+            return await self._handle_transcription(frame.text, interim=False)
+
+        return ProcessFrameResult.CONTINUE
+
+    async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult:
+        normalized = self._normalize_text(text)
+        if not normalized:
+            return ProcessFrameResult.CONTINUE
+
+        if not self._bot_speaking:
+            await self.trigger_user_turn_started()
+            return ProcessFrameResult.STOP
+
+        should_interrupt = self._should_interrupt(normalized)
+        logger.debug(
+            f"{self} interruption_gate text={text!r} normalized={normalized!r} "
+            f"should_interrupt={should_interrupt} interim={interim}"
+        )
+
+        if should_interrupt:
+            await self.trigger_user_turn_started()
+            return ProcessFrameResult.STOP
+
+        await self.trigger_reset_aggregation()
+        return ProcessFrameResult.CONTINUE
+
+    def _should_interrupt(self, normalized: str) -> bool:
+        return (
+            normalized in self._allowed_short_replies
+            or len(normalized) >= self._min_chars_when_bot_speaking
+        )
+
+    @staticmethod
+    def _normalize_text(text: str) -> str:
+        return "".join(_COUNTABLE_TEXT_RE.findall(text.lower()))
--- a/src/voice/xfyun_asr.py
+++ b/src/voice/xfyun_asr.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import hashlib
+import hmac
+import json
+import os
+from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from typing import Any
+from urllib.parse import urlencode, urlparse
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+    VADUserStartedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.settings import STTSettings
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from websockets.asyncio.client import connect as websocket_connect
+from websockets.protocol import State
+
+
+DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat"
+
+
+class XfyunASRService(STTService):
+    """iFlytek/Xfyun streaming voice dictation service for Pipecat."""
+
+    def __init__(
+        self,
+        *,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        url: str | None = None,
+        language: str = "zh_cn",
+        domain: str = "iat",
+        accent: str = "mandarin",
+        sample_rate: int = 16000,
+        encoding: str = "raw",
+        frame_size: int = 1280,
+        open_timeout: float = 10.0,
+        dynamic_correction: bool = False,
+        **kwargs,
+    ) -> None:
+        """Store configuration; no WebSocket connection is opened here.
+
+        Args:
+            app_id: Xfyun application id; falls back to ``XFYUN_APP_ID`` when empty.
+            api_key: Xfyun API key; falls back to ``XFYUN_API_KEY`` when empty.
+            api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET`` when empty.
+            url: WebSocket endpoint; ``DEFAULT_XFYUN_ASR_URL`` is used when not given.
+            language: Sent as the ``language`` business parameter and used to
+                tag emitted transcription frames.
+            domain: Sent as the ``domain`` business parameter.
+            accent: Sent as the ``accent`` business parameter.
+            sample_rate: Forwarded to ``STTService``. The effective rate is
+                validated (8000 or 16000) when an utterance starts.
+            encoding: Sent as the ``encoding`` field of every audio message.
+            frame_size: Number of buffered bytes sent per audio message.
+            open_timeout: Passed as ``open_timeout`` to the WebSocket connect call.
+            dynamic_correction: When true, ``dwa=wpgs`` is requested and
+                replacement results are applied to earlier partials.
+            **kwargs: Forwarded to ``STTService``.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            settings=STTSettings(model=None, language=language),
+            **kwargs,
+        )
+        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
+        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
+        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
+        self._url = url or DEFAULT_XFYUN_ASR_URL
+        self._language = language
+        self._domain = domain
+        self._accent = accent
+        self._encoding = encoding
+        self._frame_size = frame_size
+        self._open_timeout = open_timeout
+        self._dynamic_correction = dynamic_correction
+
+        # Per-utterance connection and recognition state.
+        self._websocket = None
+        self._receive_task = None
+        self._audio_buffer = bytearray()
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+        self._finalizing_turn = False
+        self._partials: list[str] = []
+        self._last_text = ""
+
+    async def cleanup(self) -> None:
+        """Close any open utterance, then run the base class cleanup."""
+        await self._close_utterance()
+        await super().cleanup()
+
+    async def stop(self, frame: EndFrame) -> None:
+        """Close any open utterance, then run the base class stop handling."""
+        await self._close_utterance()
+        await super().stop(frame)
+
+    async def cancel(self, frame: CancelFrame) -> None:
+        """Close any open utterance, then run the base class cancel handling."""
+        await self._close_utterance()
+        await super().cancel(frame)
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        """Run base processing, then map turn frames to utterance start/finish.
+
+        A ``UserStoppedSpeakingFrame`` finishes the current utterance; a
+        ``VADUserStartedSpeakingFrame`` starts one.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            # Aggregator-level turn end (broadcast once per logical user turn).
+            # This is the only boundary that finalizes/closes the xfyun
+            # websocket, so brief VAD pauses do not restart the ASR session.
+            await self._finish_utterance()
+        elif isinstance(frame, VADUserStartedSpeakingFrame):
+            await self._start_utterance()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]:
+        """Buffer ``audio`` and send every complete frame to Xfyun.
+
+        Always yields a single ``None``: transcription frames are pushed by the
+        receive task, not returned from this generator.
+
+        Args:
+            audio: Raw audio bytes; empty input is ignored.
+        """
+        if not audio:
+            yield None
+            return
+
+        # Open a session on demand when none is currently open.
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            await self._start_utterance()
+
+        self._audio_buffer.extend(audio)
+        await self._flush_audio_buffer(final=False)
+        yield None
+
+    async def _start_utterance(self) -> None:
+        if self._websocket and self._websocket.state is State.OPEN:
+            return
+
+        if not self._app_id or not self._api_key or not self._api_secret:
+            await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret")
+            return
+
+        if self.sample_rate not in (8000, 16000):
+            await self.push_error("Xfyun ASR sample rate must be 8000 or 16000")
+            return
+
+        self._audio_buffer.clear()
+        self._partials = []
+        self._last_text = ""
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+
+        auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
+        try:
+            self._websocket = await websocket_connect(
+                auth_url,
+                max_size=None,
+                open_timeout=self._open_timeout,
+            )
+        except Exception as exc:
+            await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc)
+            self._websocket = None
+            return
+
+        self._receive_task = self.create_task(
+            self._receive_messages(),
+            name="xfyun_asr_receive",
+        )
+
+    async def _finish_utterance(self) -> None:
+        """Send remaining audio and the terminal ``status: 2`` message.
+
+        Does nothing when no WebSocket is open. If no audio was ever sent for
+        this utterance, the session is simply closed.
+        """
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            return
+
+        await self._flush_audio_buffer(final=True)
+        if not self._sent_first_frame:
+            # Nothing was sent, so there is nothing for the server to finalize.
+            await self._close_utterance()
+            return
+
+        if not self._sent_final_frame:
+            # From here on interim results are suppressed in _process_response.
+            self._finalizing_turn = True
+            await self._send_payload({"data": {"status": 2}})
+            self.request_finalize()
+            self._sent_final_frame = True
+
+    async def _close_utterance(self) -> None:
+        current_task = asyncio.current_task()
+        if self._receive_task and self._receive_task is not current_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        websocket = self._websocket
+        self._websocket = None
+        if websocket and websocket.state is State.OPEN:
+            try:
+                await websocket.close()
+            except Exception:
+                pass
+
+        self._audio_buffer.clear()
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+        self._finalizing_turn = False
+
+    async def _flush_audio_buffer(self, *, final: bool) -> None:
+        while len(self._audio_buffer) >= self._frame_size:
+            chunk = bytes(self._audio_buffer[: self._frame_size])
+            del self._audio_buffer[: self._frame_size]
+            await self._send_audio_chunk(chunk, status=1)
+
+        if final and self._audio_buffer:
+            chunk = bytes(self._audio_buffer)
+            self._audio_buffer.clear()
+            await self._send_audio_chunk(chunk, status=1)
+
+    async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None:
+        if not audio:
+            return
+
+        if not self._sent_first_frame:
+            business = {
+                "language": self._language,
+                "domain": self._domain,
+                "accent": self._accent,
+            }
+            if self._dynamic_correction:
+                business["dwa"] = "wpgs"
+
+            payload = {
+                "common": {"app_id": self._app_id},
+                "business": business,
+                "data": {
+                    "status": 0,
+                    "format": f"audio/L16;rate={self.sample_rate}",
+                    "encoding": self._encoding,
+                    "audio": base64.b64encode(audio).decode("utf-8"),
+                },
+            }
+            self._sent_first_frame = True
+        else:
+            payload = {
+                "data": {
+                    "status": status,
+                    "format": f"audio/L16;rate={self.sample_rate}",
+                    "encoding": self._encoding,
+                    "audio": base64.b64encode(audio).decode("utf-8"),
+                }
+            }
+
+        await self._send_payload(payload)
+
+    async def _send_payload(self, payload: dict[str, Any]) -> None:
+        """Send ``payload`` as JSON; silently skipped when no socket is open."""
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            return
+        await self._websocket.send(json.dumps(payload, ensure_ascii=False))
+
+    async def _receive_messages(self) -> None:
+        websocket = self._websocket
+        if not websocket:
+            return
+
+        try:
+            async for message in websocket:
+                await self._process_response(json.loads(message))
+        except Exception as exc:
+            if self._websocket is websocket:
+                await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc)
+        finally:
+            if self._websocket is websocket:
+                self._websocket = None
+            self._receive_task = None
+
+    async def _process_response(self, payload: dict[str, Any]) -> None:
+        """Handle one decoded server message.
+
+        A non-zero ``code`` is reported as an error. Otherwise the recognition
+        result is merged into the running text; an interim frame is pushed when
+        the text changed and the turn is not being finalized. A message with
+        ``data.status == 2`` pushes the final transcription (if any text was
+        recognized) and closes the utterance.
+        """
+        code = payload.get("code", -1)
+        if code != 0:
+            message = payload.get("message", "unknown error")
+            sid = payload.get("sid")
+            await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}")
+            return
+
+        data = payload.get("data")
+        if not isinstance(data, dict):
+            return
+
+        is_final_response = data.get("status") == 2
+        recognition = data.get("result")
+        if isinstance(recognition, dict):
+            text = self._apply_recognition_result(recognition)
+            if text and text != self._last_text:
+                self._last_text = text
+                # Interim frames are skipped once finalization has begun; the
+                # final frame below carries the text instead.
+                if not self._finalizing_turn and not is_final_response:
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            text,
+                            self._user_id,
+                            time_now_iso8601(),
+                            _language_or_none(self._language),
+                            result=payload,
+                        )
+                    )
+
+        if is_final_response:
+            final_text = self._last_text
+            if final_text:
+                self.confirm_finalize()
+                await self.push_frame(
+                    TranscriptionFrame(
+                        final_text,
+                        self._user_id,
+                        time_now_iso8601(),
+                        _language_or_none(self._language),
+                        result=payload,
+                    )
+                )
+            await self._close_utterance()
+
+    def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
+        partial = _extract_text_from_result(recognition)
+        if not partial:
+            return self._last_text
+
+        if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
+            start, end = recognition["rg"]
+            if 1 <= start <= len(self._partials):
+                self._partials[start - 1 : end] = [partial]
+            else:
+                logger.debug(f"Ignoring out-of-range Xfyun replacement rg={recognition['rg']}")
+        else:
+            self._partials.append(partial)
+
+        return "".join(self._partials)
+
+
+def _extract_text_from_result(result: dict[str, Any]) -> str:
+    words: list[str] = []
+    for item in result.get("ws", []):
+        for candidate in item.get("cw", []):
+            word = candidate.get("w")
+            if word:
+                words.append(word)
+    return "".join(words)
+
+
+def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
+    parsed = urlparse(url)
+    host = parsed.netloc
+    path = parsed.path or "/v2/iat"
+    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
+    request_line = f"GET {path} HTTP/1.1"
+    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
+    signature_sha = hmac.new(
+        api_secret.encode("utf-8"),
+        signature_origin.encode("utf-8"),
+        digestmod=hashlib.sha256,
+    ).digest()
+    signature = base64.b64encode(signature_sha).decode("utf-8")
+    authorization_origin = (
+        f'api_key="{api_key}", algorithm="hmac-sha256", '
+        f'headers="host date request-line", signature="{signature}"'
+    )
+    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
+    query = urlencode({"authorization": authorization, "date": date, "host": host})
+    return f"{url}?{query}"
+
+
+def _language_or_none(value: str) -> Language | None:
+    try:
+        return Language(value)
+    except ValueError:
+        return None
--- a/src/voice/xfyun_super_tts.py
+++ b/src/voice/xfyun_super_tts.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import hashlib
+import hmac
+import json
+import os
+from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from typing import Any
+from urllib.parse import urlencode, urlparse
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    StartFrame,
+    TTSAudioRawFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.settings import TTSSettings
+from pipecat.services.tts_service import TextAggregationMode, WebsocketTTSService
+from pipecat.utils.tracing.service_decorators import traced_tts
+
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as exc:
+    logger.error(f"Exception: {exc}")
+    logger.error("In order to use Xfyun Super TTS, install the websockets package.")
+    raise Exception(f"Missing module: {exc}") from exc
+
+from .xfyun_tts import _sanitize_text_for_tts
+
+
+DEFAULT_XFYUN_SUPER_TTS_URL = "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6"
+VALID_SAMPLE_RATES = {8000, 16000, 24000}
+
+
+class XfyunSuperTTSService(WebsocketTTSService):
+    """iFlytek/Xfyun Super Smart TTS using bidirectional WebSocket streaming.
+
+    The service keeps one Xfyun synthesis session open for a Pipecat turn. Each
+    ``run_tts`` call sends a text segment with status 0/1, while ``flush_audio``
+    sends the terminal status 2 frame. Audio arrives on the receive task and is
+    appended to the Pipecat audio context.
+    """
+
+    def __init__(
+        self,
+        *,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        voice: str,
+        url: str | None = None,
+        sample_rate: int = 16000,
+        source_sample_rate: int = 24000,
+        encoding: str = "raw",
+        speed: int = 50,
+        volume: int = 50,
+        pitch: int = 50,
+        oral_level: str = "mid",
+        text_aggregation_mode: TextAggregationMode | str | None = TextAggregationMode.TOKEN,
+        open_timeout: float = 30.0,
+        **kwargs,
+    ) -> None:
+        """Store configuration; the connection is opened later in ``start``.
+
+        Args:
+            app_id: Xfyun application id; falls back to ``XFYUN_APP_ID`` when empty.
+            api_key: Xfyun API key; falls back to ``XFYUN_API_KEY`` when empty.
+            api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET`` when empty.
+            voice: Sent as the ``vcn`` parameter of every request.
+            url: WebSocket endpoint; ``DEFAULT_XFYUN_SUPER_TTS_URL`` when not given.
+            sample_rate: Output sample rate forwarded to the base service.
+            source_sample_rate: Rate requested from Xfyun; received audio is
+                resampled when it differs from the output rate.
+            encoding: Requested audio encoding; ``start`` rejects anything but ``"raw"``.
+            speed: Sent as the ``speed`` parameter.
+            volume: Sent as the ``volume`` parameter.
+            pitch: Sent as the ``pitch`` parameter.
+            oral_level: Sent as the ``oral_level`` parameter.
+            text_aggregation_mode: Aggregation mode for the base service; a
+                string is converted to ``TextAggregationMode``.
+            open_timeout: Passed as ``open_timeout`` to the WebSocket connect call.
+            **kwargs: Forwarded to ``WebsocketTTSService``.
+        """
+        if isinstance(text_aggregation_mode, str):
+            text_aggregation_mode = TextAggregationMode(text_aggregation_mode)
+
+        super().__init__(
+            text_aggregation_mode=text_aggregation_mode,
+            push_text_frames=True,
+            push_stop_frames=False,
+            push_start_frame=True,
+            pause_frame_processing=False,
+            sample_rate=sample_rate,
+            settings=TTSSettings(model=None, voice=voice, language=None),
+            **kwargs,
+        )
+        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
+        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
+        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
+        self._voice = voice
+        self._url = url or DEFAULT_XFYUN_SUPER_TTS_URL
+        self._source_sample_rate = source_sample_rate
+        self._encoding = encoding
+        self._speed = speed
+        self._volume = volume
+        self._pitch = pitch
+        self._oral_level = oral_level
+        self._open_timeout = open_timeout
+
+        # Per-stream bookkeeping, keyed by Pipecat audio context id.
+        self._receive_task: asyncio.Task | None = None
+        self._active_context_id: str | None = None
+        self._started_contexts: set[str] = set()
+        self._seq_by_context: dict[str, int] = {}
+        self._sent_text_bytes_by_context: dict[str, int] = {}
+        self._stream_completed = False
+
+    def can_generate_metrics(self) -> bool:
+        """Report that this service produces metrics."""
+        return True
+
+    async def start(self, frame: StartFrame) -> None:
+        """Validate the configuration and open the WebSocket connection.
+
+        Pushes an error and returns without connecting when credentials are
+        missing, the encoding is not ``"raw"``, or ``source_sample_rate`` is
+        not in ``VALID_SAMPLE_RATES``.
+        """
+        await super().start(frame)
+        if not self._app_id or not self._api_key or not self._api_secret:
+            await self.push_error(
+                error_msg="Xfyun Super TTS requires app_id, api_key, and api_secret"
+            )
+            return
+        if self._encoding != "raw":
+            await self.push_error(error_msg="Xfyun Super TTS must use raw PCM audio in Pipecat")
+            return
+        if self._source_sample_rate not in VALID_SAMPLE_RATES:
+            await self.push_error(
+                error_msg=(
+                    "Xfyun Super TTS source_sample_rate must be one of "
+                    f"{sorted(VALID_SAMPLE_RATES)}"
+                )
+            )
+            return
+        await self._connect()
+
+    async def stop(self, frame: EndFrame) -> None:
+        """Run the base class stop handling, then disconnect."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame) -> None:
+        """Run the base class cancel handling, then disconnect."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def flush_audio(self, context_id: str | None = None) -> None:
+        """Send the terminal ``status: 2`` frame for a started stream.
+
+        Args:
+            context_id: Stream to flush; the active audio context is used when
+                omitted. Nothing is sent when there is no socket or the
+                context never sent a first frame.
+        """
+        flush_id = context_id or self.get_active_audio_context_id()
+        if not flush_id or not self._websocket:
+            return
+        if flush_id not in self._started_contexts:
+            return
+
+        logger.trace(f"{self}: flushing Xfyun Super TTS stream {flush_id}")
+        # NOTE(review): a send failure here propagates to the caller — confirm
+        # that is intended, since run_tts handles the same failure itself.
+        await self._send_request_frame(flush_id, "", status=2)
+
+    async def on_audio_context_interrupted(self, context_id: str) -> None:
+        """Drop the interrupted stream and reconnect with a fresh socket.
+
+        Stops metrics, clears the context's bookkeeping, cycles the connection
+        and then delegates to the base class handler.
+        """
+        await self.stop_all_metrics()
+        await self._reset_context(context_id)
+        await self._disconnect()
+        await self._connect()
+        await super().on_audio_context_interrupted(context_id)
+
+    async def _connect(self) -> None:
+        """Open the WebSocket and start the receive task if none is running."""
+        await super()._connect()
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self) -> None:
+        """Cancel the receive task, then close the WebSocket."""
+        await super()._disconnect()
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self) -> None:
+        """Open a signed WebSocket connection unless one is already open.
+
+        On failure the socket is cleared, an error is pushed and the
+        ``on_connection_error`` handler is called; no exception is raised.
+        """
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+            logger.debug("Connecting to Xfyun Super TTS")
+            auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
+            self._websocket = await websocket_connect(
+                auth_url,
+                max_size=None,
+                open_timeout=self._open_timeout,
+            )
+            await self._call_event_handler("on_connected")
+        except Exception as exc:
+            self._websocket = None
+            await self.push_error(
+                error_msg=f"Unable to connect to Xfyun Super TTS: {exc}",
+                exception=exc,
+            )
+            await self._call_event_handler("on_connection_error", f"{exc}")
+
+    async def _disconnect_websocket(self) -> None:
+        """Close the WebSocket and clear all stream bookkeeping.
+
+        A close failure is pushed as an error. The cleanup in ``finally`` runs
+        either way and ends by calling the ``on_disconnected`` handler.
+        """
+        try:
+            await self.stop_all_metrics()
+            if self._websocket:
+                logger.debug("Disconnecting from Xfyun Super TTS")
+                await self._websocket.close()
+        except Exception as exc:
+            await self.push_error(
+                error_msg=f"Error closing Xfyun Super TTS websocket: {exc}",
+                exception=exc,
+            )
+        finally:
+            # Stream state is tied to the socket, so it is dropped with it.
+            await self.remove_active_audio_context()
+            self._websocket = None
+            self._active_context_id = None
+            self._started_contexts.clear()
+            self._seq_by_context.clear()
+            self._sent_text_bytes_by_context.clear()
+            self._stream_completed = False
+            await self._call_event_handler("on_disconnected")
+
+    def _get_websocket(self):
+        """Return the current WebSocket.
+
+        Raises:
+            Exception: If no WebSocket is set.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
+
+    async def _receive_messages(self) -> None:
+        """Consume server messages and feed audio into the active context.
+
+        Non-JSON messages are logged and skipped. A non-zero header code is
+        reported as an error and ends the active context. Audio payloads are
+        base64-decoded, resampled when the source rate differs from the output
+        rate, and appended to the active audio context. A status of 2 in the
+        audio object or header ends the context and marks the stream completed.
+        """
+        async for raw_message in self._get_websocket():
+            try:
+                message = json.loads(raw_message)
+            except json.JSONDecodeError:
+                logger.warning(f"{self}: received non-JSON Xfyun Super TTS message: {raw_message!r}")
+                continue
+
+            header = message.get("header") or {}
+            code = header.get("code", -1)
+            sid = header.get("sid")
+            # Responses are attributed to whichever context is currently active.
+            context_id = self._active_context_id
+
+            if code != 0:
+                error_message = header.get("message", "unknown error")
+                await self.push_error(
+                    error_msg=f"Xfyun Super TTS error code={code}, sid={sid}: {error_message}"
+                )
+                if context_id and self.audio_context_available(context_id):
+                    await self.append_to_audio_context(
+                        context_id, TTSStoppedFrame(context_id=context_id)
+                    )
+                    await self.remove_audio_context(context_id)
+                if context_id:
+                    await self._reset_context(context_id)
+                continue
+
+            audio_obj = (message.get("payload") or {}).get("audio") or {}
+            audio_b64 = audio_obj.get("audio")
+            if audio_b64 and context_id and self.audio_context_available(context_id):
+                await self.stop_ttfb_metrics()
+                audio = base64.b64decode(audio_b64)
+                if self._source_sample_rate != self.sample_rate:
+                    audio = await self._resampler.resample(
+                        audio, self._source_sample_rate, self.sample_rate
+                    )
+                frame = TTSAudioRawFrame(audio, self.sample_rate, 1, context_id=context_id)
+                await self.append_to_audio_context(context_id, frame)
+
+            audio_status = audio_obj.get("status")
+            header_status = header.get("status")
+            if audio_status == 2 or header_status == 2:
+                if context_id and self.audio_context_available(context_id):
+                    await self.append_to_audio_context(
+                        context_id, TTSStoppedFrame(context_id=context_id)
+                    )
+                    await self.remove_audio_context(context_id)
+                if context_id:
+                    await self._reset_context(context_id)
+                # run_tts reconnects before the next stream once this is set.
+                self._stream_completed = True
+
+    @traced_tts
+    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame | None, None]:
+        """Send one text segment of the stream identified by ``context_id``.
+
+        The first segment of a context is sent with status 0 and later ones
+        with status 1. Audio is delivered by the receive task, so on success
+        this generator yields only ``None``.
+
+        Args:
+            text: Segment to synthesize; it is sanitized first and skipped when
+                nothing remains.
+            context_id: Pipecat audio context the segment belongs to.
+
+        Yields:
+            An ``ErrorFrame`` when another context is still active; an
+            ``ErrorFrame`` followed by a ``TTSStoppedFrame`` when sending
+            fails; otherwise ``None``.
+        """
+        sanitized = _sanitize_text_for_tts(text)
+        if not sanitized:
+            return
+
+        if not self._is_streaming_tokens:
+            logger.debug(f"{self}: Generating Xfyun Super TTS [{sanitized}]")
+        else:
+            logger.trace(f"{self}: Generating Xfyun Super TTS [{sanitized}]")
+
+        # A finished stream's socket is not reused: reconnect first.
+        if self._stream_completed and self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+        # NOTE(review): only CLOSED triggers a reconnect here; a socket in the
+        # CLOSING state falls through to the send below — confirm intended.
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+        if self._active_context_id and self._active_context_id != context_id:
+            yield ErrorFrame(
+                error=(
+                    "Xfyun Super TTS supports one active synthesis stream per WebSocket; "
+                    f"active={self._active_context_id}, new={context_id}"
+                )
+            )
+            return
+
+        try:
+            status = 0 if context_id not in self._started_contexts else 1
+            await self._send_request_frame(context_id, sanitized, status=status)
+            await self.start_tts_usage_metrics(sanitized)
+        except Exception as exc:
+            yield ErrorFrame(error=f"Xfyun Super TTS request failed: {exc}")
+            yield TTSStoppedFrame(context_id=context_id)
+            # Cycle the connection so the next request starts from clean state.
+            await self._disconnect()
+            await self._connect()
+            return
+
+        yield None
+
+    async def _send_request_frame(self, context_id: str, text: str, *, status: int) -> None:
+        """Send one request frame and update the context's counters.
+
+        Args:
+            context_id: Stream the frame belongs to.
+            text: Text segment; may be empty for the terminal frame.
+            status: 0 for the first frame, 1 for a continuation, 2 for the end.
+
+        Raises:
+            ValueError: If the stream's cumulative UTF-8 text size would exceed 65536 bytes.
+            Exception: If no WebSocket is connected.
+        """
+        if status == 0:
+            # Mark the stream active before sending so that responses can be
+            # attributed to it by the receive task.
+            self._active_context_id = context_id
+            self._started_contexts.add(context_id)
+
+        seq = self._seq_by_context.get(context_id, 0)
+        text_bytes = text.encode("utf-8")
+        total_bytes = self._sent_text_bytes_by_context.get(context_id, 0) + len(text_bytes)
+        if total_bytes > 65536:
+            raise ValueError("Xfyun Super TTS text must not exceed 64K UTF-8 bytes per stream")
+
+        frame = self._build_request_frame(text, status=status, seq=seq)
+        await self._get_websocket().send(json.dumps(frame, ensure_ascii=False))
+
+        # Counters advance only after a successful send.
+        self._seq_by_context[context_id] = seq + 1
+        self._sent_text_bytes_by_context[context_id] = total_bytes
+
+    def _build_request_frame(self, text: str, *, status: int, seq: int) -> dict[str, Any]:
+        return {
+            "header": {
+                "app_id": self._app_id,
+                "status": status,
+            },
+            "parameter": {
+                "oral": {
+                    "oral_level": self._oral_level,
+                },
+                "tts": {
+                    "vcn": self._voice,
+                    "speed": self._speed,
+                    "volume": self._volume,
+                    "pitch": self._pitch,
+                    "bgs": 0,
+                    "reg": 0,
+                    "rdn": 0,
+                    "rhy": 0,
+                    "audio": {
+                        "encoding": self._encoding,
+                        "sample_rate": self._source_sample_rate,
+                        "channels": 1,
+                        "bit_depth": 16,
+                        "frame_size": 0,
+                    },
+                },
+            },
+            "payload": {
+                "text": {
+                    "encoding": "utf8",
+                    "compress": "raw",
+                    "format": "plain",
+                    "status": status,
+                    "seq": seq,
+                    "text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
+                },
+            },
+        }
+
+    async def _reset_context(self, context_id: str) -> None:
+        self._started_contexts.discard(context_id)
+        self._seq_by_context.pop(context_id, None)
+        self._sent_text_bytes_by_context.pop(context_id, None)
+        if self._active_context_id == context_id:
+            self._active_context_id = None
+
+
+def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
+    parsed = urlparse(url)
+    if parsed.scheme not in {"ws", "wss"} or not parsed.hostname:
+        raise ValueError(f"invalid Xfyun Super TTS WebSocket URL: {url}")
+
+    host = parsed.hostname
+    path = parsed.path or "/"
+    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
+    request_line = f"GET {path} HTTP/1.1"
+    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
+    signature_sha = hmac.new(
+        api_secret.encode("utf-8"),
+        signature_origin.encode("utf-8"),
+        digestmod=hashlib.sha256,
+    ).digest()
+    signature = base64.b64encode(signature_sha).decode("utf-8")
+    authorization_origin = (
+        f'api_key="{api_key}", algorithm="hmac-sha256", '
+        f'headers="host date request-line", signature="{signature}"'
+    )
+    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
+    query = urlencode({"authorization": authorization, "date": date, "host": host})
+    return f"{url}?{query}"
--- a/src/voice/xfyun_tts.py
+++ b/src/voice/xfyun_tts.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+import base64
+import hashlib
+import hmac
+import json
+import os
+import re
+import unicodedata
+from collections.abc import AsyncGenerator, AsyncIterator
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from typing import Any
+from urllib.parse import urlencode, urlparse
+
+from loguru import logger
+
+from pipecat.frames.frames import ErrorFrame, Frame
+from pipecat.services.settings import TTSSettings
+from pipecat.services.tts_service import TTSService
+from websockets.asyncio.client import connect
+
+
+# Endpoint used when the service is constructed without an explicit URL.
+DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
+
+# Strip characters Xfyun's online TTS cannot synthesize. The engine silently
+# rejects (or returns empty audio for) text containing emoji and similar
+# pictographic symbols, which surfaces as "request finished without audio data".
+# The character class covers astral-plane emoji blocks as well as BMP ranges
+# (misc symbols, dingbats, variation selectors and the zero-width joiner).
+_EMOJI_AND_SYMBOL_RE = re.compile(
+    "["
+    "\U0001F300-\U0001FAFF"  # misc pictographs, emoji, symbols, transport, etc.
+    "\U00002600-\U000027BF"  # misc symbols and dingbats
+    "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
+    "\uFE00-\uFE0F"           # variation selectors
+    "\u200D"                  # zero-width joiner
+    "]",
+    flags=re.UNICODE,
+)
+
+
+class XfyunTTSService(TTSService):
+    """iFlytek/Xfyun online TTS service for Pipecat.
+
+    Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
+    receives one JSON request per synthesis, and streams text WebSocket
+    messages containing base64-encoded audio chunks. This service requests
+    raw PCM so the chunks can become Pipecat audio frames without MP3 decode.
+    """
+
+    def __init__(
+        self,
+        *,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        voice: str,
+        url: str | None = None,
+        sample_rate: int = 16000,
+        source_sample_rate: int = 16000,
+        encoding: str = "raw",
+        text_encoding: str = "UTF8",
+        speed: int = 50,
+        volume: int = 50,
+        pitch: int = 50,
+        timeout: float = 30.0,
+        **kwargs,
+    ) -> None:
+        """Store the synthesis configuration and initialize the base service.
+
+        Args:
+            app_id: Xfyun application id; falls back to ``XFYUN_APP_ID`` when empty.
+            api_key: Xfyun API key; falls back to ``XFYUN_API_KEY`` when empty.
+            api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET`` when empty.
+            voice: Voice name, sent as the ``vcn`` business parameter.
+            url: WebSocket endpoint; defaults to ``DEFAULT_XFYUN_TTS_URL``.
+            sample_rate: Sample rate passed to the ``TTSService`` base class.
+            source_sample_rate: Rate requested from Xfyun (``auf``) and declared
+                as the input rate of the received audio chunks.
+            encoding: Audio encoding sent as ``aue``; ``run_tts`` only accepts ``"raw"``.
+            text_encoding: Text encoding label sent as ``tte``.
+            speed: Sent as the ``speed`` business parameter.
+            volume: Sent as the ``volume`` business parameter.
+            pitch: Sent as the ``pitch`` business parameter.
+            timeout: Passed as ``open_timeout`` when opening the WebSocket.
+            **kwargs: Forwarded to ``TTSService.__init__``.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            settings=TTSSettings(model=None, voice=voice, language=None),
+            **kwargs,
+        )
+        # Explicit arguments win; environment variables are only a fallback
+        # for empty values. Missing credentials are reported later by run_tts.
+        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
+        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
+        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
+        self._voice = voice
+        self._url = url or DEFAULT_XFYUN_TTS_URL
+        self._source_sample_rate = source_sample_rate
+        self._encoding = encoding
+        self._text_encoding = text_encoding
+        self._speed = speed
+        self._volume = volume
+        self._pitch = pitch
+        self._timeout = timeout
+        # Diagnostic summary written by _iter_audio_chunks when a request
+        # completes without audio; read by run_tts to build its error message.
+        self._last_failure_detail: str | None = None
+
+    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
+        """Synthesize ``text`` and yield the resulting frames.
+
+        Yields a single ``ErrorFrame`` instead of audio when credentials are
+        missing, the sanitized text is 8000 UTF-8 bytes or longer, the
+        configured encoding is not ``"raw"``, the request raises, or the
+        stream ends without producing any frame. Text that is empty before or
+        after sanitization yields nothing.
+
+        Args:
+            text: Text to speak.
+            context_id: Passed through to ``_stream_audio_frames_from_iterator``.
+        """
+        if not text:
+            return
+
+        if not self._app_id or not self._api_key or not self._api_secret:
+            yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
+            return
+
+        sanitized = _sanitize_text_for_tts(text)
+        if not sanitized:
+            logger.debug(
+                f"{self}: skipping Xfyun TTS, text became empty after sanitization "
+                f"(original={text!r})"
+            )
+            return
+
+        if sanitized != text:
+            logger.debug(
+                f"{self}: sanitized Xfyun TTS text "
+                f"(original={text!r}, sanitized={sanitized!r})"
+            )
+
+        # The size limit is checked on the UTF-8 byte length of the sanitized text.
+        if len(sanitized.encode("utf-8")) >= 8000:
+            yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes")
+            return
+
+        if self._encoding != "raw":
+            yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
+            return
+
+        try:
+            await self.start_tts_usage_metrics(sanitized)
+
+            first_frame = True
+            async for frame in self._stream_audio_frames_from_iterator(
+                self._iter_audio_chunks(sanitized),
+                in_sample_rate=self._source_sample_rate,
+                context_id=context_id,
+            ):
+                if first_frame:
+                    # Stop the TTFB metric once, when the first frame arrives.
+                    await self.stop_ttfb_metrics()
+                    first_frame = False
+                yield frame
+
+            if first_frame:
+                # No frame was produced at all: report it, using the detail
+                # recorded by _iter_audio_chunks when available.
+                detail = self._last_failure_detail or "no audio frames received"
+                yield ErrorFrame(
+                    error=(
+                        f"Xfyun TTS request finished without audio data ({detail}); "
+                        f"text={sanitized!r}"
+                    )
+                )
+        except Exception as exc:
+            # Any failure (connection, protocol, server error code) is surfaced
+            # as an ErrorFrame rather than propagated to the caller.
+            yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")
+
+    async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
+        """Open a signed WebSocket, send one request, and yield decoded audio.
+
+        Each received message is parsed as JSON. A non-zero (or missing)
+        ``code`` raises ``RuntimeError``; messages whose ``data`` is not a dict
+        are skipped; ``data["audio"]`` is base64-decoded and yielded; iteration
+        stops after a message with ``data["status"] == 2``. If no audio bytes
+        were received, a summary is stored in ``self._last_failure_detail`` and
+        logged as a warning.
+
+        Args:
+            text: Already-sanitized text to synthesize.
+
+        Raises:
+            RuntimeError: When a message carries a ``code`` other than 0.
+        """
+        request = self._build_request_frame(text)
+        auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
+
+        self._last_failure_detail = None
+        frames_received = 0
+        audio_bytes_received = 0
+        last_status: int | None = None
+        last_sid: str | None = None
+        saw_status_2 = False
+
+        # NOTE(review): self._timeout only bounds the connection open; no
+        # receive timeout is applied in this method - confirm whether a
+        # stalled stream should be bounded as well.
+        async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
+            await websocket.send(json.dumps(request, ensure_ascii=False))
+
+            async for raw_message in websocket:
+                frames_received += 1
+                payload = json.loads(raw_message)
+                # A message without "code" is treated as an error (-1).
+                code = payload.get("code", -1)
+                sid = payload.get("sid")
+                if sid:
+                    last_sid = sid
+                if code != 0:
+                    err_msg = payload.get("message", "unknown error")
+                    raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")
+
+                data = payload.get("data")
+                if not isinstance(data, dict):
+                    continue
+
+                last_status = data.get("status", last_status)
+
+                audio_b64 = data.get("audio")
+                if audio_b64:
+                    audio_bytes = base64.b64decode(audio_b64)
+                    audio_bytes_received += len(audio_bytes)
+                    yield audio_bytes
+
+                # Audio carried by the status-2 message is yielded above
+                # before the loop exits.
+                if data.get("status") == 2:
+                    saw_status_2 = True
+                    break
+
+        if audio_bytes_received == 0:
+            self._last_failure_detail = (
+                f"frames={frames_received}, audio_bytes=0, "
+                f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}"
+            )
+            logger.warning(
+                f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
+            )
+
+    def _build_request_frame(self, text: str) -> dict[str, Any]:
+        """Build the JSON-serializable request for one synthesis.
+
+        Args:
+            text: Text to synthesize; it is UTF-8 encoded and then
+                base64-encoded into ``data.text``.
+
+        Returns:
+            A dict with ``common`` (app id), ``business`` (audio and voice
+            parameters) and ``data`` (status 2 plus the encoded text).
+        """
+        # NOTE(review): the text is always UTF-8 encoded below regardless of
+        # self._text_encoding ("tte") - confirm non-UTF8 tte values are unsupported.
+        business: dict[str, Any] = {
+            "aue": self._encoding,
+            "auf": f"audio/L16;rate={self._source_sample_rate}",
+            "vcn": self._voice,
+            "speed": self._speed,
+            "volume": self._volume,
+            "pitch": self._pitch,
+            "tte": self._text_encoding,
+        }
+
+        return {
+            "common": {"app_id": self._app_id},
+            "business": business,
+            "data": {
+                # The whole text goes out in this single message.
+                "status": 2,
+                "text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
+            },
+        }
+
+
+def _sanitize_text_for_tts(text: str) -> str:
+    """Strip characters Xfyun's online TTS cannot synthesize.
+
+    The Xfyun ``/v2/tts`` engine silently drops or rejects emoji, pictographs,
+    dingbats, regional-indicator flags, variation selectors, and zero-width
+    joiners.  When such characters appear in the input the synthesis can
+    finish without any audio data ("Xfyun TTS request finished without audio
+    data").  We also drop control characters (other than common whitespace)
+    and "Symbol, Other" codepoints, then collapse runs of whitespace.
+    """
+    if not text:
+        return text
+
+    cleaned = _EMOJI_AND_SYMBOL_RE.sub("", text)
+    filtered: list[str] = []
+    for ch in cleaned:
+        category = unicodedata.category(ch)
+        if category == "So":
+            continue
+        if category.startswith("C") and ch not in ("\n", "\r", "\t"):
+            continue
+        filtered.append(ch)
+    return re.sub(r"\s+", " ", "".join(filtered)).strip()
+
+
+def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
+    parsed = urlparse(url)
+    host = parsed.netloc
+    path = parsed.path or "/v2/tts"
+    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
+    request_line = f"GET {path} HTTP/1.1"
+    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
+    signature_sha = hmac.new(
+        api_secret.encode("utf-8"),
+        signature_origin.encode("utf-8"),
+        digestmod=hashlib.sha256,
+    ).digest()
+    signature = base64.b64encode(signature_sha).decode("utf-8")
+    authorization_origin = (
+        f'api_key="{api_key}", algorithm="hmac-sha256", '
+        f'headers="host date request-line", signature="{signature}"'
+    )
+    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
+    query = urlencode({"authorization": authorization, "date": date, "host": host})
+    return f"{url}?{query}"
--- a/static/voice-demo/README.md
+++ b/static/voice-demo/README.md
@@ -0,0 +1,106 @@
+# Webpage Example — Realtime Voice Chat
+
+A self-contained browser client for the engine's product websocket
+(`/ws-product`, protocol `va.ws.v1`).
+
+## Features
+
+- **Connect / Disconnect** to any `ws://` or `wss://` URL.
+- **Microphone selector + mic on/off toggle** — available input devices
+  are listed with `enumerateDevices`, and getUserMedia is requested with
+  `echoCancellation`, `noiseSuppression`, and `autoGainControl` so the
+  browser handles AEC against the bot's voice.
+- **Text composer** — type a message and press <kbd>Enter</kbd> to send
+  an `input.text` event (Shift+Enter for newline). Sending interrupts
+  any in-flight bot audio so the next reply is heard cleanly.
+- **Chat history** rendered from `input.transcript.final` (you, when
+  spoken), streamed `response.text.delta` / `response.text.final`
+  (assistant — deltas arrive ahead of the synthesized audio), and locally
+  for text you submit (the engine doesn't echo text input back as a
+  transcript).
+- **WebSocket log** panel for connection state and compact send/receive
+  events. Audio chunks are summarized so the UI does not flood.
+- **Gapless TTS playback** by scheduling each `response.audio.delta`
+  chunk back-to-back on the AudioContext.
+- **Live VU meter** + mic and bot activity indicators.
+- **Clear** button to reset history.
+
+No build step, no dependencies — just three files plus an AudioWorklet.
+
+## Layout
+
+```text
+static/voice-demo/
+├── index.html
+├── styles.css
+├── app.js
+├── pcm-recorder.worklet.js
+└── samples/
+```
+
+## Run
+
+1. Start the engine (default port `8000`):
+
+   ```bash
+   cd AI-VideoAssistant-engine-v5-pipecat-minimal
+   source .venv/bin/activate
+   export OPENAI_API_KEY=...
+   uvicorn engine.main:app --host 127.0.0.1 --port 8000
+   ```
+
+2. Open the demo page served by the same process:
+
+   ```text
+   http://127.0.0.1:8000/voice-demo/
+   ```
+
+   The default websocket URL is derived from the page host
+   (`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a
+   microphone if needed, click **Enable mic**, and start speaking.
+
+   Mount path and on/off are controlled in `config.json`:
+
+   ```json
+   "server": {
+     "serve_webpage": true,
+     "webpage_mount": "/voice-demo"
+   }
+   ```
+
+   Set `"serve_webpage": false` in production if you serve the UI elsewhere.
+
+### Standalone static server (optional)
+
+You can still serve the files from another port for UI-only iteration.
+Add that origin to `server.cors_origins` in `config.json` if needed:
+
+```bash
+cd static/voice-demo
+python -m http.server 8080
+```
+
+Then open <http://localhost:8080> and point the URL field at
+`ws://127.0.0.1:8000/ws-product`.
+
+> The browser's mic API requires a secure context. `http://localhost`
+> qualifies; if you serve from another host, use HTTPS and a `wss://`
+> URL.
+
+## Audio details
+
+- Input: mono Float32 from `getUserMedia` is resampled in the
+  AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and
+  sent as **binary** websocket messages (the server accepts either
+  binary or the JSON+base64 form).
+- Output: each `response.audio.delta` carries base64-encoded PCM16 @
+  16 kHz; chunks are decoded and scheduled back-to-back through Web
+  Audio. The browser handles resampling to the device rate.
+
+## Notes
+
+- Use headphones if you still hear echo despite browser AEC; the bot's
+  voice leaking back into the open mic is the most common cause of
+  feedback loops.
+- The engine's session has an inactivity timeout
+  (`session.inactivity_timeout_sec` in `config.json`). If the bot
+  doesn't respond after a long silence, reconnect.
--- a/static/voice-demo/app.js
+++ b/static/voice-demo/app.js
--- a/static/voice-demo/index.html
+++ b/static/voice-demo/index.html
@@ -0,0 +1,288 @@
+<!doctype html>
+<html lang="zh-CN">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>VA Voice Chat &mdash; /ws-product</title>
+    <link rel="stylesheet" href="./styles.css" />
+  </head>
+  <body>
+    <main class="app">
+      <header class="app__header">
+        <div class="brand">
+          <span class="brand__dot" aria-hidden="true"></span>
+          <h1>VA Voice Chat</h1>
+        </div>
+
+        <div class="connection">
+          <label class="connection__field">
+            <span>服务器地址</span>
+            <input
+              id="ws-url"
+              type="text"
+              placeholder="ws://host/ws-product"
+              spellcheck="false"
+              autocomplete="off"
+            />
+          </label>
+          <label class="connection__field connection__field--chat">
+            <span>会话 ID</span>
+            <div class="chat-id-control">
+              <input
+                id="chat-id"
+                type="text"
+                placeholder="可选"
+                spellcheck="false"
+                autocomplete="off"
+              />
+              <button
+                id="copy-chat-id-btn"
+                class="chat-id-control__copy"
+                type="button"
+                disabled
+                title="复制会话 ID"
+                aria-label="复制会话 ID"
+              >
+                <svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
+                  <rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/>
+                  <path d="M3 11V3.5A1.5 1.5 0 0 1 4.5 2H11" stroke="currentColor" stroke-width="1.4" stroke-linecap="round"/>
+                </svg>
+                <svg class="copy-icon copy-icon--check" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
+                  <path d="M3 8.5l3.5 3.5 6.5-7" stroke="currentColor" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/>
+                </svg>
+              </button>
+            </div>
+          </label>
+          <button id="connect-btn" class="btn btn--primary" type="button">
+            连接
+          </button>
+        </div>
+
+        <div class="status">
+          <span id="status-dot" class="status__dot status__dot--idle"></span>
+          <span id="status-text" class="status__text">未连接</span>
+        </div>
+      </header>
+
+      <div class="app__body">
+        <div class="app__main">
+          <div id="conversation" class="conversation">
+            <aside
+              id="camera-drawer"
+              class="camera-drawer"
+              aria-label="拍照步骤"
+              aria-hidden="true"
+            >
+              <div class="camera-drawer__panel">
+                <div class="camera-drawer__header">
+                  <div>
+                    <p class="camera-drawer__eyebrow">拍照</p>
+                    <h2>拍照步骤</h2>
+                  </div>
+                  <span id="camera-state" class="camera-drawer__state">状态 -</span>
+                </div>
+
+                <div id="camera-preview" class="camera-drawer__preview">
+                  <video
+                    id="camera-video"
+                    class="camera-drawer__video"
+                    playsinline
+                    muted
+                    autoplay
+                  ></video>
+                  <img
+                    id="camera-photo"
+                    class="camera-drawer__photo"
+                    alt="已选择图片预览"
+                  />
+                  <span class="camera-drawer__corner camera-drawer__corner--tl"></span>
+                  <span class="camera-drawer__corner camera-drawer__corner--tr"></span>
+                  <span class="camera-drawer__corner camera-drawer__corner--bl"></span>
+                  <span class="camera-drawer__corner camera-drawer__corner--br"></span>
+                  <span class="camera-drawer__lens"></span>
+                  <span class="camera-drawer__scan"></span>
+                  <span id="camera-placeholder" class="camera-drawer__placeholder">
+                    打开摄像头实时拍摄，或从下方选择 / 上传图片
+                  </span>
+                </div>
+
+                <p id="camera-question" class="camera-drawer__question"></p>
+
+                <div
+                  id="camera-samples"
+                  class="camera-drawer__samples"
+                  aria-label="示例图片，点击选择"
+                ></div>
+
+                <div class="camera-drawer__sources">
+                  <label
+                    class="btn btn--ghost camera-drawer__source"
+                  >
+                    上传图片
+                    <input
+                      id="camera-upload"
+                      type="file"
+                      accept="image/*"
+                      hidden
+                    />
+                  </label>
+                  <button
+                    id="camera-start-btn"
+                    class="btn btn--ghost camera-drawer__source"
+                    type="button"
+                    title="打开摄像头"
+                  >
+                    使用摄像头
+                  </button>
+                </div>
+
+                <label
+                  id="camera-device-row"
+                  class="device-picker camera-drawer__device-row"
+                  hidden
+                >
+                  <span class="device-picker__label">选择摄像头</span>
+                  <select
+                    id="camera-device-select"
+                    class="device-picker__select"
+                    disabled
+                  >
+                    <option value="">默认摄像头</option>
+                  </select>
+                </label>
+
+                <button
+                  id="camera-done-btn"
+                  class="btn btn--primary camera-drawer__button"
+                  type="button"
+                  disabled
+                >
+                  拍摄完成
+                </button>
+                <canvas id="camera-canvas" hidden></canvas>
+              </div>
+            </aside>
+
+            <section class="chat" aria-label="对话记录">
+              <div id="chat-log" class="chat__log" role="log" aria-live="polite">
+                <div class="chat__empty">
+                  <p>连接服务、开启麦克风后即可开始对话。</p>
+                  <p class="chat__hint">
+                    音频通过 <code>/ws-product</code> 以 PCM16 单声道 16&nbsp;kHz
+                    传输。
+                  </p>
+                </div>
+              </div>
+            </section>
+          </div>
+
+          <footer class="controls" aria-label="操作栏">
+            <div class="meter" aria-hidden="true">
+              <div id="meter-fill" class="meter__fill"></div>
+            </div>
+
+            <form id="composer" class="composer" autocomplete="off">
+              <textarea
+                id="text-input"
+                class="composer__input"
+                rows="1"
+                placeholder="输入消息，或使用麦克风…"
+                disabled
+              ></textarea>
+              <button
+                id="send-btn"
+                class="btn btn--primary composer__send"
+                type="submit"
+                disabled
+                title="发送消息 (Enter)"
+              >
+                发送
+              </button>
+            </form>
+
+            <div class="controls__row">
+              <label class="device-picker">
+                <span class="device-picker__label">麦克风</span>
+                <select id="mic-select" class="device-picker__select" disabled>
+                  <option value="">默认麦克风</option>
+                </select>
+              </label>
+
+              <button
+                id="mic-btn"
+                class="mic-btn"
+                type="button"
+                disabled
+                aria-pressed="false"
+                title="麦克风已关闭"
+              >
+                <svg
+                  class="mic-btn__icon"
+                  viewBox="0 0 24 24"
+                  width="24"
+                  height="24"
+                  aria-hidden="true"
+                >
+                  <path
+                    d="M12 14a3 3 0 0 0 3-3V6a3 3 0 1 0-6 0v5a3 3 0 0 0 3 3Z"
+                    fill="currentColor"
+                  />
+                  <path
+                    d="M19 11a1 1 0 1 0-2 0 5 5 0 0 1-10 0 1 1 0 1 0-2 0 7 7 0 0 0 6 6.92V21a1 1 0 1 0 2 0v-3.08A7 7 0 0 0 19 11Z"
+                    fill="currentColor"
+                  />
+                </svg>
+                <span class="mic-btn__label">开启麦克风</span>
+              </button>
+
+              <div class="indicators">
+                <span id="mic-indicator" class="indicator">
+                  <span class="indicator__dot indicator__dot--mic"></span>
+                  <span class="indicator__label">麦克风</span>
+                </span>
+                <span id="bot-indicator" class="indicator">
+                  <span class="indicator__dot indicator__dot--bot"></span>
+                  <span class="indicator__label">助手</span>
+                </span>
+                <span id="state-indicator" class="indicator indicator--state">
+                  <span class="indicator__dot indicator__dot--state"></span>
+                  <span id="state-label" class="indicator__label">状态 -</span>
+                </span>
+              </div>
+
+              <button id="clear-btn" class="btn btn--ghost" type="button">
+                清空
+              </button>
+            </div>
+
+            <p class="hint">
+              按 <kbd>Enter</kbd> 发送，<kbd>Shift</kbd>+<kbd>Enter</kbd>
+              换行。发送文字会打断正在说话的助手。
+              浏览器回声消除已开启，如有回音请使用耳机。
+            </p>
+          </footer>
+        </div>
+
+        <section class="ws-log" aria-label="WebSocket 日志">
+          <div class="ws-log__header">
+            <div class="ws-log__header-left">
+              <h2>WebSocket 日志</h2>
+              <div class="ws-log__legend" aria-hidden="true">
+                <span class="ws-log__legend-item ws-log__legend-item--send">发送</span>
+                <span class="ws-log__legend-item ws-log__legend-item--recv">接收</span>
+              </div>
+            </div>
+            <button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
+              清空日志
+            </button>
+          </div>
+          <div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
+            <div class="ws-log__empty">暂无 WebSocket 事件。</div>
+          </div>
+        </section>
+      </div>
+    </main>
+
+    <script type="module" src="./app.js"></script>
+  </body>
+</html>
--- a/static/voice-demo/pcm-recorder.worklet.js
+++ b/static/voice-demo/pcm-recorder.worklet.js
@@ -0,0 +1,104 @@
+/**
+ * PCM Recorder AudioWorklet.
+ *
+ * Captures mono Float32 mic samples at the AudioContext's native rate,
+ * resamples them to a target sample rate (default 16 kHz) with linear
+ * interpolation, then ships PCM16 frames of a fixed duration (default 20 ms)
+ * to the main thread via `port.postMessage(ArrayBuffer)`.
+ *
+ * It also computes a simple RMS level per frame for the UI VU meter so the
+ * main thread doesn't have to re-process the audio.
+ */
+
+class PcmRecorderProcessor extends AudioWorkletProcessor {
+  constructor(options) {
+    super();
+
+    const opts = (options && options.processorOptions) || {};
+    this._targetSampleRate = opts.targetSampleRate || 16000;
+    this._frameMs = opts.frameMs || 20;
+    this._frameSamples = Math.round(
+      (this._targetSampleRate * this._frameMs) / 1000,
+    );
+
+    // Resampling state.
+    // `ratio` is input samples per output sample.
+    this._ratio = sampleRate / this._targetSampleRate;
+    this._inputBuffer = new Float32Array(0);
+    // Float position in `_inputBuffer` for the next output sample.
+    this._inputOffset = 0;
+
+    // Output framing state.
+    this._frameBuffer = new Int16Array(this._frameSamples);
+    this._frameIndex = 0;
+
+    // VU meter accumulator.
+    this._rmsSumSquares = 0;
+    this._rmsCount = 0;
+  }
+
+  process(inputs) {
+    const input = inputs[0];
+    if (!input || input.length === 0) return true;
+    const channel = input[0];
+    if (!channel || channel.length === 0) return true;
+
+    // Append new samples to the input buffer.
+    const merged = new Float32Array(this._inputBuffer.length + channel.length);
+    merged.set(this._inputBuffer, 0);
+    merged.set(channel, this._inputBuffer.length);
+    this._inputBuffer = merged;
+
+    const ratio = this._ratio;
+    const inLen = this._inputBuffer.length;
+    let pos = this._inputOffset;
+
+    while (pos + 1 < inLen) {
+      const lo = Math.floor(pos);
+      const hi = lo + 1;
+      const w = pos - lo;
+      const sample =
+        this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w;
+
+      this._rmsSumSquares += sample * sample;
+      this._rmsCount += 1;
+
+      let s = sample;
+      if (s > 1) s = 1;
+      else if (s < -1) s = -1;
+      this._frameBuffer[this._frameIndex++] =
+        s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
+
+      if (this._frameIndex === this._frameSamples) {
+        const frame = new Int16Array(this._frameSamples);
+        frame.set(this._frameBuffer);
+        const rms =
+          this._rmsCount > 0
+            ? Math.sqrt(this._rmsSumSquares / this._rmsCount)
+            : 0;
+        this.port.postMessage(
+          { type: "frame", buffer: frame.buffer, rms },
+          [frame.buffer],
+        );
+        this._frameIndex = 0;
+        this._rmsSumSquares = 0;
+        this._rmsCount = 0;
+      }
+
+      pos += ratio;
+    }
+
+    // Trim consumed samples from the input buffer; keep at least the last
+    // sample we still need to interpolate against on the next call.
+    const consumed = Math.floor(pos);
+    if (consumed > 0) {
+      this._inputBuffer = this._inputBuffer.slice(consumed);
+      pos -= consumed;
+    }
+    this._inputOffset = pos;
+
+    return true;
+  }
+}
+
+registerProcessor("pcm-recorder", PcmRecorderProcessor);
--- a/static/voice-demo/samples/.DS_Store
+++ b/static/voice-demo/samples/.DS_Store
--- a/static/voice-demo/samples/damage1.png
+++ b/static/voice-demo/samples/damage1.png
--- a/static/voice-demo/samples/damage2.png
+++ b/static/voice-demo/samples/damage2.png
--- a/static/voice-demo/samples/plate1.jpg
+++ b/static/voice-demo/samples/plate1.jpg
--- a/static/voice-demo/samples/plate2.jpg
+++ b/static/voice-demo/samples/plate2.jpg
--- a/static/voice-demo/samples/user1.jpg
+++ b/static/voice-demo/samples/user1.jpg
--- a/static/voice-demo/samples/user2.jpg
+++ b/static/voice-demo/samples/user2.jpg
--- a/static/voice-demo/styles.css
+++ b/static/voice-demo/styles.css
--- a/test/api/fastapi.http
+++ b/test/api/fastapi.http
@@ -1,29 +1,30 @@
-@baseUrl = http://127.0.0.1:8080
-###
+@baseUrl = http://101.89.108.122:8000
+
 GET http://127.0.0.1:8080

 HTTP/1.1 200  - OK
-connection: close
+date: Wed, 17 Jun 2026 00:37:02 GMT
+server: uvicorn
 content-length: 32
 content-type: application/json
-date: Thu, 08 Jan 2026 08:58:09 GMT
-server: uvicorn
+connection: close
 ###
 POST http://127.0.0.1:8080/chat
 content-type: application/json

 {
-    "sessionId": "a1002",
+    "sessionId": "a1100",
    "timeStamp": "202503310303",
-    "text": "【拍摄完成】"
+    "text": "继续",
+    "needFormTags": true
 }

 HTTP/1.1 200  - OK
-connection: close
-content-length: 205
-content-type: application/json
-date: Thu, 08 Jan 2026 08:59:37 GMT
+date: Wed, 17 Jun 2026 00:37:26 GMT
 server: uvicorn
+content-length: 274
+content-type: application/json
+connection: close
 ###
 POST http://127.0.0.1:8080/get_info
 content-type: application/json
@@ -35,11 +36,11 @@ content-type: application/json
 }

 HTTP/1.1 200  - OK
-connection: close
-content-length: 97
-content-type: application/json
-date: Thu, 08 Jan 2026 09:27:05 GMT
+date: Wed, 17 Jun 2026 00:27:12 GMT
 server: uvicorn
+content-length: 108
+content-type: application/json
+connection: close
 ###
 POST http://127.0.0.1:8080/set_info
 content-type: application/json
--- a/workflow/20250419/事故信息采集20250419.json
+++ b/workflow/20250419/事故信息采集20250419.json
--- a/workflow/20250419/单车拍照流程插件.json
+++ b/workflow/20250419/单车拍照流程插件.json
--- a/workflow/20250419/单车插件.json
+++ b/workflow/20250419/单车插件.json
--- a/workflow/20250419/双车拍照流程插件.json
+++ b/workflow/20250419/双车拍照流程插件.json
--- a/workflow/20251108/事故信息采集20251108.json
+++ b/workflow/20251108/事故信息采集20251108.json
Author	SHA1	Message	Date
Xin Wang	7666759121	Merge branch 'main' of https://gitea.xiaowang.eu.org/wx44wx/ZNJJ-api-server	2026-06-17 13:33:23 +08:00
Xin Wang	6ff23e433b	Refactor form update handling in API endpoints and models - Introduced a new function to parse JSON values in endpoints.py for improved data handling. - Updated extract_form_update_from_flow_nodes to return structured data instead of strings. - Changed formUpdate field in ProcessResponse_chat model to use Any type with a default empty dictionary for better flexibility in handling updates.	2026-06-17 13:29:50 +08:00
Xin Wang	edf85e21cc	Update base URL in API configuration for deployment	2026-06-17 12:36:46 +08:00
Xin Wang	ffd3bf0385	Update environment configuration and enhance API endpoints - Changed ANALYSIS_SERVICE_URL to localhost for local development. - Updated ANALYSIS_AUTH_TOKEN and APP_ID for improved security. - Added new functions in endpoints.py for form extraction and stage code normalization. - Enhanced chat handling to support form updates and improved event streaming. - Updated models to include new fields for form update handling.	2026-06-17 11:36:42 +08:00
Xin Wang	1ea1d86d5a	Update continue greeting	2026-06-03 12:52:38 +08:00
Xin Wang	705a63dd25	Sync with engine v5	2026-06-03 12:36:18 +08:00
Xin Wang	056a8a4ad8	Update example for ssl problem	2026-06-02 17:28:49 +08:00
Xin Wang	30c413b6d4	Refactor voice websocket documentation to clarify input handling and remove deprecated sections. Updated the demo process to indicate that image uploads are no longer required, streamlining user interaction with the voice service.	2026-06-01 11:24:31 +08:00
Xin Wang	0ef5de399a	Add voice ws docs	2026-06-01 11:18:41 +08:00
Xin Wang	00c1bbdc6b	Sync voice chatId session handling	2026-06-01 10:08:15 +08:00
Xin Wang	6df6c16e1d	Implement UserStartedSpeakingFrame handling in voice pipeline - Added support for UserStartedSpeakingFrame to enhance user interaction tracking. - Updated the pipeline to reset idle prompt count when a user starts speaking, improving responsiveness during conversations. - Integrated new event handlers for better management of user turn events and upstream frame processing.	2026-05-29 16:51:51 +08:00
Xin Wang	13f5f44f61	Enhance voice configuration with idle prompt features and new TTS settings - Added idle prompt timeout, maximum count, and text to multiple voice configuration files to improve user interaction during idle periods. - Updated greeting mode to 'fastgpt_opener' in relevant configurations for a more dynamic greeting experience. - Introduced a new voice configuration file for xfyun TTS, including detailed service settings and parameters. - Refactored the pipeline to handle idle prompts and user turn events, ensuring smoother interaction flow. - Adjusted the VAD and turn configurations to accommodate new idle prompt features.	2026-05-29 16:27:05 +08:00
Xin Wang	f49212afc9	Add camera functionality to voice demo with UI updates and state management - Introduced a camera drawer for capturing images during the conversation flow. - Added prompts for various camera states to guide users through the photo capture process. - Updated HTML structure to include camera-related elements and integrated them with existing chat functionality. - Enhanced JavaScript logic to manage camera state and button enabling/disabling based on connection status. - Updated CSS for styling the camera drawer and its components, ensuring responsive design across devices. - Adjusted README to reflect the new demo URL for voice functionality.	2026-05-28 15:13:54 +08:00
Xin Wang	ed9621dfe0	Update VOICE_CONFIG to use new FastGPT state-enabled configuration and add TEXT_DELTA log group for websocket responses.	2026-05-28 13:46:33 +08:00
Xin Wang	9e2374f492	Add voice state tags, SuperTTS configs, and demo WS log groups. Parse leading <state> tags from LLM replies and emit response.state over the product websocket while stripping tags from TTS/text streams. Add FastGPT+Xfyun voice configs (including state-enabled preset), SuperTTS support, and context sync for interrupted turns. Refresh the voice demo with a state indicator and collapsible audio delta websocket log groups. Co-authored-by: Cursor <cursoragent@cursor.com>	2026-05-28 11:32:20 +08:00
Xin Wang	b14ef64665	Add configuration files for FastGPT and Xfyun voice services, enhancing LLM service capabilities. Update LLMConfig to include chat_id, variables, detail, and timeout settings. Refactor create_llm_service to support FastGPT integration and adjust pipeline to handle chat_id and greeting prompts. Implement context synchronization for interrupted assistant turns in text streaming.	2026-05-26 10:56:38 +08:00
Xin Wang	e4e47f637e	Refactor pipeline to correctly include assistant_aggregator in the run_pipeline_with_serializer function.	2026-05-23 00:19:34 +08:00
Xin Wang	c7f2f4e5f3	update default env	2026-05-22 16:41:24 +08:00
Xin Wang	a10f0a586b	Add VOICE_CONFIG env var to select the voice pipeline config file. Defaults to config/voice.json; relative paths resolve from project root. Co-authored-by: Cursor <cursoragent@cursor.com>	2026-05-22 16:29:27 +08:00
Xin Wang	bc2aa5b133	Integrate product-ws voice demo on port 8000 alongside REST API. Add src/voice Pipecat pipeline, browser demo at /voice-demo, and config/voice.json. Co-authored-by: Cursor <cursoragent@cursor.com>	2026-05-22 16:26:06 +08:00
Xin Wang	0b6b40aba4	Add set_info logging for key/value updates Co-authored-by: Cursor <cursoragent@cursor.com>	2026-05-22 14:29:42 +08:00
Xin Wang	87e616ab55	提示词添加车辆预先提取和手机号多次拼接输入	2026-02-03 17:33:45 +08:00
Xin Wang	34848dd6a0	add nostream chat example	2026-02-02 18:16:51 +08:00
				`@@ -0,0 +1 @@`
				`"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""`