Compare commits
23 Commits
fastgpt-py
...
7666759121
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7666759121 | ||
|
|
6ff23e433b | ||
|
|
edf85e21cc | ||
|
|
ffd3bf0385 | ||
|
|
1ea1d86d5a | ||
|
|
705a63dd25 | ||
|
|
056a8a4ad8 | ||
|
|
30c413b6d4 | ||
|
|
0ef5de399a | ||
|
|
00c1bbdc6b | ||
|
|
6df6c16e1d | ||
|
|
13f5f44f61 | ||
|
|
f49212afc9 | ||
|
|
ed9621dfe0 | ||
|
|
9e2374f492 | ||
|
|
b14ef64665 | ||
|
|
e4e47f637e | ||
|
|
c7f2f4e5f3 | ||
|
|
a10f0a586b | ||
|
|
bc2aa5b133 | ||
|
|
0b6b40aba4 | ||
|
|
87e616ab55 | ||
|
|
34848dd6a0 |
11
.env
11
.env
@@ -2,9 +2,8 @@ DATABASE_URL=sqlite:///./test.db
|
||||
SECRET_KEY=your_secret_key
|
||||
DEBUG=True
|
||||
|
||||
ANALYSIS_SERVICE_URL=http://101.89.151.141:3000/api/v1/chat/completions
|
||||
ANALYSIS_AUTH_TOKEN=fastgpt-hSPnXMoBNGVAEpTLkQT3YfAnN26gQSyvLd4ABL1MRDoh68nL4RDlopFHXqmH8
|
||||
APP_ID=683ea1bc86197e19f71fc1ae
|
||||
DELETE_SESSION_URL=http://101.89.151.141:3000/api/core/chat/delHistory?chatId={chatId}&appId={appId}
|
||||
DELETE_CHAT_URL=http://101.89.151.141:3000/api/core/chat/item/delete?contentId={contentId}&chatId={chatId}&appId={appId}
|
||||
GET_CHAT_RECORDS_URL=http://101.89.151.141:3000/api/core/chat/getPaginationRecords
|
||||
ANALYSIS_SERVICE_URL=http://127.0.0.1:3030
|
||||
ANALYSIS_AUTH_TOKEN=fastgpt-r13smJwPgXfGj1HDfc4SWAvIoNrL5Wc6o0BYnezqBs7hgzPdQ7Q34hVl2FJc0R
|
||||
APP_ID=6a310def7132e9f7d592dabb
|
||||
|
||||
VOICE_CONFIG=config/voice-fastgpt-state-xfyunSuperTTS.json
|
||||
|
||||
104
config/voice-fastgpt-state-xfyunSuperTTS.json
Normal file
104
config/voice-fastgpt-state-xfyunSuperTTS.json
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["*"]
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.8,
|
||||
"start_secs": 0.4,
|
||||
"stop_secs": 0.2,
|
||||
"min_volume": 0.8
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否",
|
||||
"你好",
|
||||
"在吗"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.2,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
|
||||
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
|
||||
"greeting_mode": "fastgpt_opener",
|
||||
"response_state": {
|
||||
"enabled": true,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
|
||||
"language": "zh_cn",
|
||||
"domain": "iat",
|
||||
"accent": "mandarin",
|
||||
"encoding": "raw",
|
||||
"frame_size": 1280,
|
||||
"timeout_sec": 10.0
|
||||
},
|
||||
"llm": {
|
||||
"provider": "fastgpt",
|
||||
"api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
|
||||
"base_url": "http://localhost:3030",
|
||||
"model": "my-voice-app",
|
||||
"app_id": "691eddaa53e3f8d9f25f1370",
|
||||
"chat_id": null,
|
||||
"variables": {},
|
||||
"detail": false,
|
||||
"timeout_sec": 60.0,
|
||||
"send_system_prompt": false
|
||||
},
|
||||
"tts": {
|
||||
"provider": "xfyun_super",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
|
||||
"voice": "x5_lingxiaoxuan_flow",
|
||||
"aue": "raw",
|
||||
"speed": 50,
|
||||
"volume": 50,
|
||||
"pitch": 50,
|
||||
"oral_level": "mid",
|
||||
"source_sample_rate_hz": 24000,
|
||||
"text_aggregation_mode": "token",
|
||||
"timeout_sec": 30.0
|
||||
}
|
||||
}
|
||||
}
|
||||
99
config/voice-fastgpt-state-xfyunTTS.json
Normal file
99
config/voice-fastgpt-state-xfyunTTS.json
Normal file
@@ -0,0 +1,99 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["*"]
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.8,
|
||||
"start_secs": 0.4,
|
||||
"stop_secs": 0.2,
|
||||
"min_volume": 0.8
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否",
|
||||
"你好",
|
||||
"在吗"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.2,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"greeting_mode": "fastgpt_opener",
|
||||
"response_state": {
|
||||
"enabled": true,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
|
||||
"language": "zh_cn",
|
||||
"domain": "iat",
|
||||
"accent": "mandarin",
|
||||
"encoding": "raw",
|
||||
"frame_size": 1280,
|
||||
"timeout_sec": 10.0
|
||||
},
|
||||
"llm": {
|
||||
"provider": "fastgpt",
|
||||
"api_key": "fastgpt-zlLjYtWZWN0uhQHs3ZOFHG4KLGMIdr2CkbZLCSfqGm5vcdx5xIZbp",
|
||||
"base_url": "http://localhost:3030",
|
||||
"model": "my-voice-app",
|
||||
"app_id": "691eddaa53e3f8d9f25f1370",
|
||||
"chat_id": null,
|
||||
"variables": {},
|
||||
"detail": false,
|
||||
"timeout_sec": 60.0
|
||||
},
|
||||
"tts": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://tts-api.xfyun.cn/v2/tts",
|
||||
"voice": "x4_xiaoyan",
|
||||
"aue": "raw",
|
||||
"tte": "UTF8",
|
||||
"speed": 50,
|
||||
"volume": 50,
|
||||
"pitch": 50,
|
||||
"source_sample_rate_hz": 16000
|
||||
}
|
||||
}
|
||||
}
|
||||
104
config/voice-fastgpt-xfyunSuperTTS.json
Normal file
104
config/voice-fastgpt-xfyunSuperTTS.json
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["*"]
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.8,
|
||||
"start_secs": 0.4,
|
||||
"stop_secs": 0.2,
|
||||
"min_volume": 0.8
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否",
|
||||
"你好",
|
||||
"在吗"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.2,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
|
||||
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
|
||||
"greeting_mode": "fastgpt_opener",
|
||||
"response_state": {
|
||||
"enabled": true,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
|
||||
"language": "zh_cn",
|
||||
"domain": "iat",
|
||||
"accent": "mandarin",
|
||||
"encoding": "raw",
|
||||
"frame_size": 1280,
|
||||
"timeout_sec": 10.0
|
||||
},
|
||||
"llm": {
|
||||
"provider": "fastgpt",
|
||||
"api_key": "fastgpt-v1FljAxBz3tJeS0bH7HZU4yVGclsTcfiy9yK7V9Zr9126maDHQ97Xlo8n",
|
||||
"base_url": "http://localhost:3030",
|
||||
"model": "my-voice-app",
|
||||
"app_id": "6a153aed53e3f8d9f2744905",
|
||||
"chat_id": null,
|
||||
"variables": {},
|
||||
"detail": false,
|
||||
"timeout_sec": 60.0,
|
||||
"send_system_prompt": false
|
||||
},
|
||||
"tts": {
|
||||
"provider": "xfyun_super",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6",
|
||||
"voice": "x5_lingxiaoxuan_flow",
|
||||
"aue": "raw",
|
||||
"speed": 50,
|
||||
"volume": 50,
|
||||
"pitch": 50,
|
||||
"oral_level": "mid",
|
||||
"source_sample_rate_hz": 24000,
|
||||
"text_aggregation_mode": "token",
|
||||
"timeout_sec": 30.0
|
||||
}
|
||||
}
|
||||
}
|
||||
102
config/voice-fastgpt-xfyunTTS.json
Normal file
102
config/voice-fastgpt-xfyunTTS.json
Normal file
@@ -0,0 +1,102 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["*"]
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.7,
|
||||
"start_secs": 0.35,
|
||||
"stop_secs": 0.2,
|
||||
"min_volume": 0.65
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否",
|
||||
"你好",
|
||||
"在吗"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.2,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "FastGPT app owns the system prompt when send_system_prompt is false.",
|
||||
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
|
||||
"greeting_mode": "fastgpt_opener",
|
||||
"response_state": {
|
||||
"enabled": true,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
|
||||
"language": "zh_cn",
|
||||
"domain": "iat",
|
||||
"accent": "mandarin",
|
||||
"encoding": "raw",
|
||||
"frame_size": 1280,
|
||||
"timeout_sec": 10.0
|
||||
},
|
||||
"llm": {
|
||||
"provider": "fastgpt",
|
||||
"api_key": "fastgpt-v1FljAxBz3tJeS0bH7HZU4yVGclsTcfiy9yK7V9Zr9126maDHQ97Xlo8n",
|
||||
"base_url": "http://localhost:3030",
|
||||
"model": "my-voice-app",
|
||||
"app_id": "6a153aed53e3f8d9f2744905",
|
||||
"chat_id": null,
|
||||
"variables": {},
|
||||
"detail": false,
|
||||
"timeout_sec": 60.0,
|
||||
"send_system_prompt": false
|
||||
},
|
||||
"tts": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://tts-api.xfyun.cn/v2/tts",
|
||||
"voice": "x4_xiaoyan",
|
||||
"aue": "raw",
|
||||
"tte": "UTF8",
|
||||
"speed": 50,
|
||||
"volume": 50,
|
||||
"pitch": 50,
|
||||
"source_sample_rate_hz": 16000
|
||||
}
|
||||
}
|
||||
}
|
||||
95
config/voice-xfyun.json
Normal file
95
config/voice-xfyun.json
Normal file
@@ -0,0 +1,95 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["*"]
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.7,
|
||||
"start_secs": 0.35,
|
||||
"stop_secs": 0.2,
|
||||
"min_volume": 0.65
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.2,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "# 角色 你是一个高度集成、安全第一的交警AI接警员。正在收集事故人员伤亡情况,时间,地点,事故原因,事故车辆数量,收集完成之后和用户说再见",
|
||||
"greeting": "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。",
|
||||
"greeting_mode": "fixed",
|
||||
"response_state": {
|
||||
"enabled": true,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://iat-api.xfyun.cn/v2/iat",
|
||||
"language": "zh_cn",
|
||||
"domain": "iat",
|
||||
"accent": "mandarin",
|
||||
"encoding": "raw",
|
||||
"frame_size": 1280,
|
||||
"timeout_sec": 10.0
|
||||
},
|
||||
"llm": {
|
||||
"provider": "openai",
|
||||
"api_key": "sk-230701ff1b6143ecbf322b3170606016",
|
||||
"base_url": "https://api.deepseek.com/v1",
|
||||
"model": "deepseek-chat",
|
||||
"temperature": 0.7
|
||||
},
|
||||
"tts": {
|
||||
"provider": "xfyun",
|
||||
"app_id": "416ce125",
|
||||
"api_key": "c65342fe603126c3610031d8429bb36d",
|
||||
"api_secret": "MzkyYmI5OWEyODQzN2FiN2VhN2UzYzU4",
|
||||
"base_url": "wss://tts-api.xfyun.cn/v2/tts",
|
||||
"voice": "x4_xiaoyan",
|
||||
"aue": "raw",
|
||||
"tte": "UTF8",
|
||||
"speed": 50,
|
||||
"volume": 50,
|
||||
"pitch": 50,
|
||||
"source_sample_rate_hz": 16000
|
||||
}
|
||||
}
|
||||
}
|
||||
84
config/voice.json
Normal file
84
config/voice.json
Normal file
@@ -0,0 +1,84 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["http://localhost:3000", "http://localhost:8080"],
|
||||
"serve_webpage": true,
|
||||
"webpage_mount": "/voice-demo"
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.7,
|
||||
"start_secs": 0.2,
|
||||
"stop_secs": 0.4,
|
||||
"min_volume": 0.6
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.8,
|
||||
"idle_prompt_timeout_sec": 3.0,
|
||||
"idle_prompt_max_count": 3,
|
||||
"idle_prompt_text": "你好,请问还在吗?"
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
|
||||
"greeting": "Please introduce yourself briefly.",
|
||||
"greeting_mode": "generated",
|
||||
"response_state": {
|
||||
"enabled": false,
|
||||
"tag": "state",
|
||||
"event_type": "response.state",
|
||||
"max_prefix_chars": 256
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini-transcribe",
|
||||
"language": "en"
|
||||
},
|
||||
"llm": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini",
|
||||
"temperature": 0.7
|
||||
},
|
||||
"tts": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini-tts",
|
||||
"voice": "alloy"
|
||||
}
|
||||
}
|
||||
}
|
||||
56
docs/chat-stream-mode.md
Normal file
56
docs/chat-stream-mode.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# /chat 流式响应模式说明
|
||||
|
||||
## 接口地址
|
||||
|
||||
```
|
||||
POST http://localhost:8000/chat?stream=true
|
||||
```
|
||||
|
||||
## 请求参数
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| sessionId | string | 是 | 会话 ID |
|
||||
| timeStamp | string | 是 | 时间戳 |
|
||||
| text | string | 是 | 用户输入文本 |
|
||||
| stream | bool | 否 | 设为 true 启用流式响应(如上方接口地址所示,通过 URL 查询参数 `?stream=true` 传入) |
|
||||
|
||||
## SSE 事件类型
|
||||
|
||||
| 事件类型 | 说明 | 数据格式 |
|
||||
|----------|------|----------|
|
||||
| `stage_code` | 阶段状态码 | `{"nextStageCode": "0000", "nextStage": "结束通话"}` |
|
||||
| `text_delta` | 流式文本片段 | `{"text": "您好..."}` |
|
||||
| `done` | 流式结束 | `{"status": "completed"}` |
|
||||
| `error` | 错误信息 | `{"msg": "错误描述", "code": "500"}` |
|
||||
|
||||
## 状态码映射
|
||||
|
||||
| 状态码 | 含义 |
|
||||
|--------|------|
|
||||
| 0000 | 结束通话 |
|
||||
| 0001 | 转接人工 |
|
||||
| 0002 | 语义无法识别转接人工 |
|
||||
| 0003 | 有人伤转接人工 |
|
||||
| 1001 | 未准备好通话 |
|
||||
| 1002 | 通话中 |
|
||||
| 2000 | 进入单车拍照 |
|
||||
| ... | ... |
|
||||
|
||||
## 示例
|
||||
|
||||
### 请求
|
||||
|
||||
```bash
|
||||
python examples/stream_chat.py session-001 "发生了交通事故"
|
||||
```
|
||||
|
||||
### 响应
|
||||
|
||||
```
|
||||
Status: 200
|
||||
--------------------------------------------------
|
||||
[stage_code] {"nextStageCode": "1002", "nextStage": "通话中"}
|
||||
[text_delta] {"text": "您好,请问发生了什么情况?"}
|
||||
[done] {"status": "completed"}
|
||||
```
|
||||
376
docs/voice-websocket.md
Normal file
376
docs/voice-websocket.md
Normal file
@@ -0,0 +1,376 @@
|
||||
# Voice WebSocket 使用说明
|
||||
|
||||
基于 `src/voice` 产品语音管线与 `static/voice-demo` 浏览器示例整理。
|
||||
|
||||
## 概览
|
||||
|
||||
| 项目 | 说明 |
|
||||
|------|------|
|
||||
| WebSocket 路径 | `/ws-product` |
|
||||
| 协议标识 | `va.ws.v1`(JSON + base64;音频上行也支持二进制 PCM) |
|
||||
| 默认音频 | PCM16 小端(`pcm_s16le`)、16 kHz、单声道 |
|
||||
| 会话 ID | 连接 URL 查询参数 `chatId` 或 `chat_id`;未传时服务端自动生成 |
|
||||
| 健康检查 | `GET /voice/health` |
|
||||
| 浏览器 Demo | 默认挂载于 `/voice-demo`(由 voice 配置 `server.serve_webpage` 控制) |
|
||||
|
||||
完整 URL 示例:
|
||||
|
||||
```
|
||||
ws://127.0.0.1:8000/ws-product?chatId=voice_abc123
|
||||
wss://your-host/ws-product?chatId=voice_abc123
|
||||
```
|
||||
|
||||
## 连接流程
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Client
|
||||
participant Server
|
||||
|
||||
Client->>Server: WebSocket connect (?chatId=...)
|
||||
Server-->>Client: 101 Switching Protocols
|
||||
Client->>Server: session.start (JSON)
|
||||
Note over Client,Server: 可选:固定开场白 / FastGPT opener / LLM 生成问候
|
||||
loop 会话中
|
||||
Client->>Server: input.audio (binary 或 JSON)
|
||||
Client->>Server: input.text
|
||||
Server-->>Client: input.transcript.* / response.text.* / response.audio.*
|
||||
Server-->>Client: response.state(若启用状态标签)
|
||||
end
|
||||
Client->>Server: session.stop
|
||||
Server-->>Client: WebSocket close
|
||||
```
|
||||
|
||||
推荐顺序(与 `voice-demo/app.js` 一致):
|
||||
|
||||
1. 建立 WebSocket 连接(建议 `binaryType = "arraybuffer"`)。
|
||||
2. 连接成功后立即发送 `session.start`。
|
||||
3. 开始推送麦克风音频(二进制帧或 `input.audio` JSON)。
|
||||
4. 处理服务端 JSON 事件(文本、转写、TTS 音频等)。
|
||||
5. 断开前发送 `session.stop`,再关闭连接。
|
||||
|
||||
## 消息信封
|
||||
|
||||
除二进制音频外,所有消息均为 UTF-8 JSON 对象。服务端下发事件统一包含:
|
||||
|
||||
| 字段 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `type` | string | 事件类型 |
|
||||
| `protocol` | string | 固定为 `va.ws.v1` |
|
||||
| `seq` | number | 单调递增序号(仅服务端事件) |
|
||||
|
||||
## 客户端 → 服务端
|
||||
|
||||
### `session.start`
|
||||
|
||||
开始会话,必须在发送音频或文本输入之前调用。
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "session.start",
|
||||
"protocol": "va.ws.v1",
|
||||
"chatId": "voice_abc123",
|
||||
"audio": {
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate": 16000,
|
||||
"channels": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`chatId` 也可写作 `chat_id`。若省略,服务端使用 URL 查询参数或自动生成 ID。
|
||||
|
||||
### `session.stop`
|
||||
|
||||
正常结束会话。
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "session.stop",
|
||||
"reason": "client_disconnect"
|
||||
}
|
||||
```
|
||||
|
||||
### `input.audio`(JSON 形式)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "input.audio",
|
||||
"audio": "<base64 PCM16>",
|
||||
"sample_rate": 16000,
|
||||
"channels": 1
|
||||
}
|
||||
```
|
||||
|
||||
`audio` 字段也可命名为 `data`。`sample_rate` / `channels` 可省略,默认与服务端配置一致。
|
||||
|
||||
### 二进制音频(推荐)
|
||||
|
||||
直接发送 **原始 PCM16 小端** 字节流,无需 JSON 包装。`voice-demo` 通过 AudioWorklet 每 20 ms 发送一帧(16 kHz 单声道下约 640 字节/帧)。
|
||||
|
||||
服务端同时接受 JSON 与二进制两种上行格式。
|
||||
|
||||
### `input.text`
|
||||
|
||||
发送文本回合;默认会打断当前 bot 回复(`interrupt: true`)。
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "input.text",
|
||||
"text": "你好,我想报案",
|
||||
"interrupt": true
|
||||
}
|
||||
```
|
||||
|
||||
注意:文本输入**不会**以 `input.transcript.final` 回显,客户端需自行在 UI 中展示用户消息(Demo 即如此处理)。Demo 的相机步骤通过发送 `input.text`(如 `【拍摄完成】`)完成,不上传图片帧。
|
||||
|
||||
## 服务端 → 客户端
|
||||
|
||||
### 用户语音转写
|
||||
|
||||
| 事件 | 说明 |
|
||||
|------|------|
|
||||
| `input.transcript.interim` | ASR 中间结果(流式识别过程中) |
|
||||
| `input.transcript.final` | 用户一句话结束后的最终转写 |
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "input.transcript.final",
|
||||
"protocol": "va.ws.v1",
|
||||
"seq": 12,
|
||||
"text": "发生了交通事故",
|
||||
"user_id": "product-user",
|
||||
"timestamp": "2026-06-01T10:00:00.000Z"
|
||||
}
|
||||
```
|
||||
|
||||
### 助手文本流
|
||||
|
||||
文本通常**早于**对应 TTS 音频到达,便于客户端先渲染字幕。
|
||||
|
||||
| 事件 | 说明 |
|
||||
|------|------|
|
||||
| `response.text.started` | 新一轮助手回复开始 |
|
||||
| `response.text.delta` | 流式文本片段 |
|
||||
| `response.text.final` | 本轮文本结束;`interrupted: true` 表示被打断 |
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "response.text.delta",
|
||||
"protocol": "va.ws.v1",
|
||||
"seq": 20,
|
||||
"text": "您好,"
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "response.text.final",
|
||||
"protocol": "va.ws.v1",
|
||||
"seq": 45,
|
||||
"text": "您好,请问发生了什么情况?",
|
||||
"interrupted": false
|
||||
}
|
||||
```
|
||||
|
||||
### 助手语音(TTS)
|
||||
|
||||
| 事件 | 说明 |
|
||||
|------|------|
|
||||
| `response.audio.started` | Bot 开始说话 |
|
||||
| `response.audio.delta` | PCM16 音频块(base64) |
|
||||
| `response.audio.stopped` | Bot 说完 |
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "response.audio.delta",
|
||||
"protocol": "va.ws.v1",
|
||||
"seq": 30,
|
||||
"audio": "<base64 PCM16>",
|
||||
"bytes": 640,
|
||||
"sample_rate": 16000,
|
||||
"channels": 1
|
||||
}
|
||||
```
|
||||
|
||||
客户端应将各 `delta` 块按序解码并无缝拼接播放(Demo 使用 Web Audio `AudioContext` 调度)。
|
||||
|
||||
### 助手状态(可选)
|
||||
|
||||
当 voice 配置启用 `agent.response_state` 时,LLM 输出开头的 `<state>...</state>` 标签会被剥离,并单独下发:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "response.state",
|
||||
"protocol": "va.ws.v1",
|
||||
"seq": 18,
|
||||
"state": "2000"
|
||||
}
|
||||
```
|
||||
|
||||
Demo 根据状态码展示拍照引导(如 `2000`–`2015` 等车险场景状态)。
|
||||
|
||||
## 音频参数
|
||||
|
||||
| 参数 | 默认值 | 说明 |
|
||||
|------|--------|------|
|
||||
| 采样率 | 16000 Hz | 配置项 `audio.sample_rate_hz` |
|
||||
| 声道 | 1(mono) | 配置项 `audio.channels` |
|
||||
| 帧长 | 20 ms | 配置项 `audio.frame_ms`;每帧 640 字节 |
|
||||
| 编码 | PCM signed 16-bit LE | 小端有符号 16 位整数 |
|
||||
|
||||
## 会话与打断行为
|
||||
|
||||
- **chatId**:同一 ID 用于 LLM(如 FastGPT)多轮上下文;连接时可写在 URL 或 `session.start` 中。
|
||||
- **语音回合**:VAD + 静音超时判定用户说完;说完后触发 STT 最终转写与 LLM。
|
||||
- **打断**:用户说话或 `input.text`(`interrupt: true`)可打断 bot;被打断的助手文本在 `response.text.final` 中带 `interrupted: true`。
|
||||
- **空闲超时**:长时间无活动会断开(`session.inactivity_timeout_sec`,默认 60 秒);可配置空闲提示语。
|
||||
- **开场白**:由 `agent.greeting_mode` 控制(`fixed` / `fastgpt_opener` / `generated` 等)。
|
||||
|
||||
## 浏览器 Demo 参考实现
|
||||
|
||||
Demo 位于 `static/voice-demo/`,无构建步骤,核心文件:
|
||||
|
||||
| 文件 | 职责 |
|
||||
|------|------|
|
||||
| `app.js` | WebSocket 连接、事件处理、聊天 UI、TTS 播放 |
|
||||
| `pcm-recorder.worklet.js` | 麦克风采集、重采样至 16 kHz、20 ms 二进制帧 |
|
||||
| `index.html` / `styles.css` | 页面与样式 |
|
||||
|
||||
### 启动 Demo
|
||||
|
||||
1. 启动 API 服务并加载 voice 配置(环境变量 `VOICE_CONFIG` 指向 JSON,默认 `config/voice.json`)。
|
||||
2. 浏览器打开 `http://127.0.0.1:8000/voice-demo/`(挂载路径见配置 `server.webpage_mount`)。
|
||||
3. 点击 **Connect** → **Enable mic** 开始对话。
|
||||
|
||||
### Demo 关键实现要点
|
||||
|
||||
**连接与握手**
|
||||
|
||||
```javascript
|
||||
const ws = new WebSocket("ws://127.0.0.1:8000/ws-product?chatId=voice_xxx");
|
||||
ws.binaryType = "arraybuffer";
|
||||
|
||||
ws.onopen = () => {
|
||||
ws.send(JSON.stringify({
|
||||
type: "session.start",
|
||||
protocol: "va.ws.v1",
|
||||
chatId: "voice_xxx",
|
||||
audio: { encoding: "pcm_s16le", sample_rate: 16000, channels: 1 },
|
||||
}));
|
||||
};
|
||||
```
|
||||
|
||||
**发送麦克风(二进制,与 Demo 一致)**
|
||||
|
||||
```javascript
|
||||
// AudioWorklet 每 20ms postMessage { type: "frame", buffer: ArrayBuffer }
|
||||
recorderNode.port.onmessage = (event) => {
|
||||
if (event.data?.type === "frame") {
|
||||
ws.send(event.data.buffer);
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
**播放 TTS**
|
||||
|
||||
```javascript
|
||||
function decodeBase64ToInt16(b64) {
|
||||
const binary = atob(b64);
|
||||
const bytes = new Uint8Array(binary.length);
|
||||
for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
|
||||
return new Int16Array(bytes.buffer);
|
||||
}
|
||||
|
||||
// 收到 response.audio.delta 后,将 Int16 转为 Float32 并调度到 AudioContext
|
||||
```
|
||||
|
||||
**发送文本并打断**
|
||||
|
||||
```javascript
|
||||
ws.send(JSON.stringify({
|
||||
type: "input.text",
|
||||
text: "【拍摄完成】",
|
||||
interrupt: true,
|
||||
}));
|
||||
// 客户端应停止本地 TTS 播放队列;服务端会发 response.text.final(interrupted=true)
|
||||
```
|
||||
|
||||
### 跨域静态页
|
||||
|
||||
若 Demo 托管在其他端口,需在 voice 配置中设置 `server.cors_origins`,并将 WebSocket URL 指向 API 主机。
|
||||
|
||||
> 浏览器 `getUserMedia` 需要安全上下文:`https://` 或 `http://localhost` 可用;其他 HTTP 源需改用 HTTPS + `wss://`。
|
||||
|
||||
## 最小客户端示例(伪代码)
|
||||
|
||||
```javascript
|
||||
const ws = new WebSocket(`${location.protocol === "https:" ? "wss" : "ws"}://${location.host}/ws-product?chatId=voice_demo_1`);
|
||||
ws.binaryType = "arraybuffer";
|
||||
|
||||
ws.onopen = () => {
|
||||
ws.send(JSON.stringify({
|
||||
type: "session.start",
|
||||
protocol: "va.ws.v1",
|
||||
audio: { encoding: "pcm_s16le", sample_rate: 16000, channels: 1 },
|
||||
}));
|
||||
};
|
||||
|
||||
ws.onmessage = (event) => {
|
||||
if (typeof event.data !== "string") return;
|
||||
const msg = JSON.parse(event.data);
|
||||
switch (msg.type) {
|
||||
case "input.transcript.final":
|
||||
console.log("User:", msg.text);
|
||||
break;
|
||||
case "response.text.delta":
|
||||
      console.log(msg.text); // 流式打印助手文本(浏览器环境没有 process 对象)
|
||||
break;
|
||||
case "response.audio.delta":
|
||||
playPcm16(decodeBase64(msg.audio));
|
||||
break;
|
||||
case "response.state":
|
||||
console.log("State:", msg.state);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
function disconnect() {
|
||||
ws.send(JSON.stringify({ type: "session.stop", reason: "done" }));
|
||||
ws.close(1000, "done");
|
||||
}
|
||||
```
|
||||
|
||||
## 健康检查响应示例
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:8000/voice/health
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"config": "/path/to/config/voice.json",
|
||||
"protocols": {
|
||||
"/ws-product": "va.ws.v1.json_base64"
|
||||
},
|
||||
"features": {
|
||||
"product_text_input": true,
|
||||
"product_text_interrupt": true
|
||||
},
|
||||
"demo": "/voice-demo",
|
||||
"llm_provider": "fastgpt",
|
||||
"stt_provider": "xfyun",
|
||||
"tts_provider": "xfyun"
|
||||
}
|
||||
```
|
||||
|
||||
## 常见问题
|
||||
|
||||
| 现象 | 可能原因 |
|
||||
|------|----------|
|
||||
| 连接后立即断开 | 未发送 `session.start`;或长时间无活动,触发 `session.inactivity_timeout_sec` 空闲超时(默认 60 秒) |
|
||||
| 无 bot 语音 | 未处理 `response.audio.delta`;AudioContext 未在用户手势后 resume |
|
||||
| 回声/啸叫 | 建议使用耳机;Demo 已开启浏览器 AEC,但扬声器外放仍可能串音 |
|
||||
| 文本发送无用户气泡 | 设计如此,需客户端本地展示 `input.text` 内容 |
|
||||
| 跨域 WebSocket 失败 | 检查 `cors_origins` 与 `wss` 证书 |
|
||||
55
examples/nostream_chat.py
Normal file
55
examples/nostream_chat.py
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple CLI script to interact with /chat endpoint in non-stream mode.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
API_BASE_URL = "http://localhost:8000"
|
||||
|
||||
|
||||
async def chat(session_id: str, text: str):
    """Send one non-streaming request to the /chat endpoint and print the reply.

    Args:
        session_id: Value sent as ``sessionId`` in the JSON payload.
        text: Value sent as ``text`` in the JSON payload.

    Prints the HTTP status, a separator line, and then the response body:
    pretty-printed when it parses as JSON, otherwise the raw text as received.
    """
    payload = {
        "sessionId": session_id,
        # Local wall-clock time in ISO 8601 form (no UTC offset attached).
        "timeStamp": datetime.now().isoformat(),
        "text": text,
    }

    async with aiohttp.ClientSession() as http_session:
        async with http_session.post(f"{API_BASE_URL}/chat", json=payload) as response:
            # Read the body as text instead of calling response.json():
            # response.json() raises ContentTypeError when the reply is not
            # labelled application/json (e.g. an HTML error page), which would
            # abort the script before the status line is ever printed.
            raw_body = await response.text()

            print(f"Status: {response.status}")
            print("-" * 50)
            try:
                data = json.loads(raw_body)
            except json.JSONDecodeError:
                # Not JSON: show the body exactly as the server sent it.
                print(raw_body)
            else:
                print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
async def main():
    """Parse the command line and forward the message to chat().

    Expects ``<session_id> <message...>``. Every argument after the session
    ID is joined with single spaces into one message. When fewer than two
    arguments are supplied, usage help is printed and the process exits
    with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        for usage_line in (
            "Usage: python nostream_chat.py <session_id> <message>",
            "Example: python nostream_chat.py test-session-123 '发生了交通事故'",
        ):
            print(usage_line)
        sys.exit(1)

    # First argument is the session ID; the rest form the message text.
    session_id, *message_words = cli_args
    text = " ".join(message_words)

    print(f"Session ID: {session_id}")
    print(f"Message: {text}")
    print("-" * 50)

    await chat(session_id, text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the async entry point only when executed as a script, not on import.
    asyncio.run(main())
|
||||
@@ -1,16 +1,30 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple CLI script to interact with /chat endpoint in stream mode.
|
||||
Stream Chat CLI - 与 /chat 端点进行流式交互的脚本。
|
||||
|
||||
用法:
|
||||
python stream_chat.py <session_id> <消息>
|
||||
|
||||
示例:
|
||||
python stream_chat.py test-001 "发生了交通事故"
|
||||
|
||||
输出说明:
|
||||
- [stage_code]: 阶段状态码,如 {"nextStageCode": "0000", "nextStage": "结束通话"}
|
||||
- [text_delta]: 流式文本片段
|
||||
- [done]: 流式结束
|
||||
- [error]: 错误信息
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import ssl
|
||||
import aiohttp
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
API_BASE_URL = "http://localhost:8000"
|
||||
#API_BASE_URL = "http://localhost:8000"
|
||||
API_BASE_URL = "https://101.89.108.122:8000"
|
||||
|
||||
|
||||
async def stream_chat(session_id: str, text: str):
|
||||
@@ -23,7 +37,11 @@ async def stream_chat(session_id: str, text: str):
|
||||
"text": text
|
||||
}
|
||||
|
||||
async with aiohttp.ClientSession() as http_session:
|
||||
ssl_ctx = ssl.create_default_context()
|
||||
ssl_ctx.check_hostname = False
|
||||
ssl_ctx.verify_mode = ssl.CERT_NONE
|
||||
|
||||
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_ctx)) as http_session:
|
||||
async with http_session.post(
|
||||
f"{API_BASE_URL}/chat",
|
||||
json=payload,
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
uvicorn[standard]>=0.24.0
|
||||
pipecat-ai[websocket,openai,silero]
|
||||
websockets>=13.1,<16.0
|
||||
pydantic>=2.4.2
|
||||
python-dotenv>=1.0.0
|
||||
httpx>=0.25.0
|
||||
@@ -12,7 +14,7 @@ pydantic-settings==2.1.0
|
||||
python-multipart==0.0.6
|
||||
python-jose[cryptography]==3.3.0
|
||||
passlib[bcrypt]==1.7.4
|
||||
openai==1.55.3
|
||||
openai>=1.74.0,<3
|
||||
loguru>=0.7.0
|
||||
pandas
|
||||
requests
|
||||
|
||||
@@ -8,3 +8,6 @@ APP_ID=683ea1bc86197e19f71fc1ae
|
||||
DELETE_SESSION_URL=http://127.0.0.1:3030/api/core/chat/delHistory?chatId={chatId}&appId={appId}
|
||||
DELETE_CHAT_URL=http://127.0.0.1:3030/api/core/chat/item/delete?contentId={contentId}&chatId={chatId}&appId={appId}
|
||||
GET_CHAT_RECORDS_URL=http://127.0.0.1:3030/api/core/chat/getPaginationRecords
|
||||
|
||||
# Voice demo (Pipecat /ws-product). Relative to project root, or an absolute path.
|
||||
VOICE_CONFIG=config/voice.json
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from fastapi.responses import StreamingResponse
|
||||
from ..schemas.models import ProcessRequest_chat, ProcessResponse_chat, ProcessRequest_get, ProcessResponse_get, ProcessRequest_set, ProcessResponse_set, ProcessResponse_delete_session, ProcessRequest_delete_session
|
||||
from fastgpt_client import AsyncChatClient
|
||||
from fastgpt_client import AsyncChatClient, aiter_stream_events
|
||||
from fastgpt_client.exceptions import (
|
||||
APIError, AuthenticationError, RateLimitError, ValidationError
|
||||
)
|
||||
@@ -12,6 +12,7 @@ import json
|
||||
import re
|
||||
|
||||
router = APIRouter()
|
||||
FORM_EXTRACT_MODULE_NAME = "文本内容提取事故信息"
|
||||
STATUS_CODE_MAP = {
|
||||
'0000': '结束通话',
|
||||
'0001': '转接人工',
|
||||
@@ -34,6 +35,19 @@ STATUS_CODE_MAP = {
|
||||
'2016': '确认双车中的车牌'
|
||||
}
|
||||
|
||||
def normalize_stage_code(stage_code: str) -> str:
    """Translate a FastGPT stage code into the code exposed by the external API.

    Args:
        stage_code: Stage code as produced by FastGPT.

    Returns:
        The external code for the aliases listed below; any other value is
        returned unchanged.
    """
    # (FastGPT codes, external code) pairs, checked top to bottom.
    alias_rules = (
        (('3001', '3002', '1002'), '1002'),
        (('2006',), '2004'),
        (('2017',), '2016'),
        (('2020',), '0002'),
    )
    for fastgpt_codes, external_code in alias_rules:
        if stage_code in fastgpt_codes:
            return external_code
    return stage_code
|
||||
|
||||
|
||||
def extract_state_and_content(data1: str) -> dict | None:
|
||||
"""
|
||||
Extracts the state and content from a string in the format <state>STATE</state>content.
|
||||
@@ -47,7 +61,7 @@ def extract_state_and_content(data1: str) -> dict | None:
|
||||
"""
|
||||
data1 = data1.strip()
|
||||
regex = r"<state>(.*?)</state>(.*)"
|
||||
match = re.search(regex, data1)
|
||||
match = re.search(regex, data1, flags=re.DOTALL)
|
||||
|
||||
if match:
|
||||
return {
|
||||
@@ -56,6 +70,52 @@ def extract_state_and_content(data1: str) -> dict | None:
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def parse_json_value(value):
    """Decode a value that may be JSON text, possibly encoded more than once.

    At most three decoding passes are made, so a JSON document that was itself
    serialised into a JSON string (and so on) is unwrapped.

    Args:
        value: Any object. Non-string values are returned untouched.

    Returns:
        * the value itself as soon as it is not a string;
        * ``{}`` when a string is empty after stripping whitespace;
        * the stripped string when it is not valid JSON;
        * otherwise the decoded result after up to three passes.
    """
    current = value
    passes_left = 3
    while passes_left > 0 and isinstance(current, str):
        passes_left -= 1
        stripped = current.strip()
        if stripped == "":
            return {}
        try:
            current = json.loads(stripped)
        except json.JSONDecodeError:
            # Not JSON: hand back the plain (stripped) text.
            return stripped
    return current
|
||||
|
||||
|
||||
def extract_form_update_from_flow_nodes(nodes):
    """Pull the form-update payload out of a list of FastGPT flow nodes.

    Only the first dict entry whose ``moduleName`` equals
    ``FORM_EXTRACT_MODULE_NAME`` is inspected. Its ``extractResult`` is read
    and the ``formUpdate`` value (falling back to ``form``) is passed through
    ``parse_json_value``.

    Args:
        nodes: Expected to be a list of node dicts; anything else yields ``{}``.

    Returns:
        The parsed form update, or ``{}`` when there is no matching node, the
        node's ``extractResult`` is not a dict, or the form value is empty.
    """
    if not isinstance(nodes, list):
        return {}

    extract_node = next(
        (
            candidate
            for candidate in nodes
            if isinstance(candidate, dict)
            and candidate.get("moduleName") == FORM_EXTRACT_MODULE_NAME
        ),
        None,
    )
    if extract_node is None:
        return {}

    result = extract_node.get("extractResult", {})
    if not isinstance(result, dict):
        return {}

    raw_form = result.get("formUpdate") or result.get("form") or ""
    return parse_json_value(raw_form) if raw_form else {}
|
||||
|
||||
|
||||
def format_set_info_input(payload: dict, include_input_info: bool) -> str:
    """Render the optional ``<setInfo>`` message body for a FastGPT helper call.

    Args:
        payload: Data serialised as JSON (non-ASCII characters kept as-is).
        include_input_info: When false, no input text is produced.

    Returns:
        ``"<setInfo>{json}</setInfo>"`` when ``include_input_info`` is truthy,
        otherwise the empty string.
    """
    if include_input_info:
        serialized = json.dumps(payload, ensure_ascii=False)
        return "<setInfo>" + serialized + "</setInfo>"
    return ""
|
||||
|
||||
async def delete_last_two_chat_records(
|
||||
client: AsyncChatClient,
|
||||
session_id: str
|
||||
@@ -112,6 +172,8 @@ async def chat(
|
||||
"""Handle chat completion request."""
|
||||
json_data = request.model_dump()
|
||||
logger.info(f"用户请求信息ProcessRequest_chat: {json_data}, stream={stream}")
|
||||
need_form_update = json_data.get('needFormUpdate', False)
|
||||
chat_variables = {'needFormUpdate': need_form_update}
|
||||
|
||||
if stream:
|
||||
async def event_generator():
|
||||
@@ -121,73 +183,80 @@ async def chat(
|
||||
messages=[{"role": "user", "content": json_data['text']}],
|
||||
chatId=json_data['sessionId'],
|
||||
stream=True,
|
||||
detail=True
|
||||
detail=True,
|
||||
variables=chat_variables
|
||||
)
|
||||
|
||||
buffer = ""
|
||||
state_code_found = False
|
||||
module_form_sent = False
|
||||
|
||||
def flush_text_delta(text: str):
|
||||
return create_sse_event("text_delta", {"text": text})
|
||||
|
||||
def flush_form_update(form_update):
|
||||
return create_sse_event("formUpdate", form_update)
|
||||
|
||||
async for chunk in response.aiter_lines():
|
||||
if chunk.startswith('data: '):
|
||||
data_str = chunk[6:].strip()
|
||||
if data_str == '[DONE]':
|
||||
break
|
||||
async for event in aiter_stream_events(response):
|
||||
try:
|
||||
if event.kind == "flowResponses" and not module_form_sent:
|
||||
form_update = extract_form_update_from_flow_nodes(event.data)
|
||||
if form_update:
|
||||
yield flush_form_update(form_update)
|
||||
module_form_sent = True
|
||||
continue
|
||||
|
||||
if event.kind not in {"answer", "fastAnswer", "data"}:
|
||||
continue
|
||||
|
||||
data = event.data
|
||||
if not isinstance(data, dict):
|
||||
continue
|
||||
|
||||
try:
|
||||
data = json.loads(data_str)
|
||||
try:
|
||||
delta_content = data['choices'][0]['delta'].get('content', '')
|
||||
except (KeyError, IndexError):
|
||||
delta_content = ''
|
||||
if delta_content:
|
||||
buffer += delta_content
|
||||
|
||||
if not state_code_found:
|
||||
# Check for <state>XXXX</state> pattern
|
||||
match = re.search(r"<state>(.*?)</state>", buffer)
|
||||
if match:
|
||||
state_code = match.group(1)
|
||||
|
||||
# Apply logic to map/adjust state code
|
||||
nextStageCode = state_code
|
||||
if nextStageCode in ['3001', '3002', '1002']:
|
||||
nextStageCode = '1002'
|
||||
elif nextStageCode == '2006':
|
||||
nextStageCode = '2004'
|
||||
elif nextStageCode == '2017':
|
||||
nextStageCode = '2016'
|
||||
elif nextStageCode == '2020':
|
||||
nextStageCode = '0002'
|
||||
nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
|
||||
|
||||
# Send stage code event
|
||||
yield create_sse_event("stage_code", {
|
||||
"nextStageCode": nextStageCode,
|
||||
"nextStage": nextStage
|
||||
})
|
||||
|
||||
state_code_found = True
|
||||
|
||||
# Send remaining content as text_delta
|
||||
remaining_content = buffer[match.end():]
|
||||
if remaining_content:
|
||||
yield create_sse_event("text_delta", {"text": remaining_content})
|
||||
buffer = "" # Clear buffer after extracting state
|
||||
else:
|
||||
# State code already found, just stream text
|
||||
yield create_sse_event("text_delta", {"text": delta_content})
|
||||
buffer = "" # Do not buffer text after state found
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(data)
|
||||
logger.error(f"Error processing chunk: {e}")
|
||||
delta_content = data['choices'][0]['delta'].get('content', '')
|
||||
except (KeyError, IndexError):
|
||||
delta_content = ''
|
||||
if not delta_content:
|
||||
continue
|
||||
|
||||
buffer += delta_content
|
||||
|
||||
if not state_code_found:
|
||||
# Check for <state>XXXX</state> pattern
|
||||
match = re.search(r"<state>(.*?)</state>", buffer, flags=re.DOTALL)
|
||||
if match:
|
||||
state_code = match.group(1)
|
||||
|
||||
# Apply logic to map/adjust state code
|
||||
nextStageCode = normalize_stage_code(state_code)
|
||||
nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
|
||||
|
||||
# Send stage code event
|
||||
yield create_sse_event("stage_code", {
|
||||
"nextStageCode": nextStageCode,
|
||||
"nextStage": nextStage
|
||||
})
|
||||
|
||||
state_code_found = True
|
||||
|
||||
# Send remaining content as text_delta
|
||||
remaining_content = buffer[match.end():]
|
||||
if remaining_content:
|
||||
yield flush_text_delta(remaining_content)
|
||||
buffer = "" # Clear buffer after extracting state
|
||||
else:
|
||||
yield flush_text_delta(delta_content)
|
||||
buffer = ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing stream event: {e}")
|
||||
continue
|
||||
|
||||
# If stream ends and no state code found (unlikely if format is strict),
|
||||
# we might want to send what we have
|
||||
if not state_code_found and buffer:
|
||||
yield create_sse_event("text_delta", {"text": buffer})
|
||||
yield create_sse_event("text_delta", {"text": buffer})
|
||||
|
||||
yield create_sse_event("done", {"status": "completed"})
|
||||
|
||||
@@ -203,7 +272,8 @@ async def chat(
|
||||
messages=[{"role": "user", "content": json_data['text']}],
|
||||
chatId=json_data['sessionId'],
|
||||
stream=False,
|
||||
detail=True
|
||||
detail=True,
|
||||
variables=chat_variables
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
@@ -281,28 +351,18 @@ async def chat(
|
||||
|
||||
logger.debug(f"State variables: {data.get('newVariables', {})}")
|
||||
|
||||
nextStageCode = data['newVariables']['status_code']
|
||||
|
||||
# 有一些情况需要调整nextStageCode
|
||||
if nextStageCode in ['3001', '3002', '1002']:
|
||||
nextStageCode = '1002'
|
||||
elif nextStageCode == '2006':
|
||||
nextStageCode = '2004'
|
||||
elif nextStageCode == '2017':
|
||||
nextStageCode = '2016'
|
||||
elif nextStageCode == '2020':
|
||||
nextStageCode = '0002'
|
||||
nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
|
||||
|
||||
# Parse content - sometimes content is a string, sometimes it is a list
|
||||
content_stage_code = None
|
||||
if isinstance(content, list):
|
||||
logger.debug("content是一个list")
|
||||
content = content[0]['text']['content']
|
||||
elif isinstance(content, str):
|
||||
|
||||
if isinstance(content, str):
|
||||
logger.debug("content是一个str")
|
||||
state_and_content = extract_state_and_content(content)
|
||||
if state_and_content:
|
||||
logger.debug(f"解析后的state和content为: {state_and_content}")
|
||||
content_stage_code = state_and_content['state']
|
||||
content = state_and_content['content']
|
||||
else:
|
||||
raise ValueError("大模型回复中的state解析失败")
|
||||
@@ -310,10 +370,16 @@ async def chat(
|
||||
logger.error(f"content既不是list也不是str, type: {type(content)}")
|
||||
raise ValueError("大模型回复不是list也不是str")
|
||||
|
||||
nextStageCode = content_stage_code or data['newVariables']['status_code']
|
||||
nextStageCode = normalize_stage_code(nextStageCode)
|
||||
nextStage = STATUS_CODE_MAP.get(nextStageCode, '')
|
||||
form_update = extract_form_update_from_flow_nodes(data.get("responseData", []))
|
||||
|
||||
return ProcessResponse_chat(
|
||||
sessionId=json_data['sessionId'],
|
||||
timeStamp=json_data['timeStamp'],
|
||||
outputText=content,
|
||||
formUpdate=form_update,
|
||||
nextStage=nextStage,
|
||||
nextStageCode=nextStageCode,
|
||||
code="200",
|
||||
@@ -340,11 +406,16 @@ async def set_info(
|
||||
):
|
||||
"""Set information in chat state."""
|
||||
json_data = request.model_dump()
|
||||
set_info_payload = {'key': json_data['key'], 'value': json_data['value']}
|
||||
set_info_input = format_set_info_input(
|
||||
set_info_payload,
|
||||
json_data.get('includeInputInfo', False)
|
||||
)
|
||||
|
||||
try:
|
||||
# Get current state
|
||||
response = await client.create_chat_completion(
|
||||
messages=[{"role": "user", "content": ""}],
|
||||
messages=[{"role": "user", "content": set_info_input}],
|
||||
chatId=json_data['sessionId'],
|
||||
stream=False,
|
||||
detail=True
|
||||
@@ -382,11 +453,12 @@ async def set_info(
|
||||
key = json_data['key']
|
||||
value = json_data['value']
|
||||
current_state[key] = value
|
||||
logger.info(f'即将设置 {key} 为 {value}')
|
||||
logger.info(f'即将上传 {current_state}')
|
||||
|
||||
# Update state using SDK
|
||||
response = await client.create_chat_completion(
|
||||
messages=[{"role": "user", "content": ""}],
|
||||
messages=[{"role": "user", "content": set_info_input}],
|
||||
chatId=json_data['sessionId'],
|
||||
stream=False,
|
||||
detail=True,
|
||||
@@ -420,11 +492,16 @@ async def get_info(
|
||||
):
|
||||
"""Get information from chat state."""
|
||||
json_data = request.model_dump()
|
||||
get_info_payload = {'key': json_data['key']}
|
||||
get_info_input = format_set_info_input(
|
||||
get_info_payload,
|
||||
json_data.get('includeInputInfo', False)
|
||||
)
|
||||
|
||||
try:
|
||||
# Get current state
|
||||
response = await client.create_chat_completion(
|
||||
messages=[{"role": "user", "content": ""}],
|
||||
messages=[{"role": "user", "content": get_info_input}],
|
||||
chatId=json_data['sessionId'],
|
||||
stream=False,
|
||||
detail=True
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from fastapi import FastAPI
|
||||
import sys
|
||||
from .api.endpoints import router as api_router
|
||||
from .core.fastgpt_client import lifespan
|
||||
from .core.logging_config import setup_logging
|
||||
from .voice.routes import register_voice
|
||||
|
||||
# Setup logging first
|
||||
setup_logging()
|
||||
@@ -18,4 +18,5 @@ app = FastAPI(
|
||||
def read_root():
|
||||
return {"message": "Server is running."}
|
||||
|
||||
app.include_router(api_router)
|
||||
app.include_router(api_router)
|
||||
register_voice(app)
|
||||
@@ -1,15 +1,17 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
class ProcessRequest_chat(BaseModel):
|
||||
sessionId: str = Field(..., max_length=64)
|
||||
timeStamp: str = Field(..., max_length=32)
|
||||
text: str = Field(...)
|
||||
needFormUpdate: bool = False
|
||||
|
||||
class ProcessResponse_chat(BaseModel):
|
||||
sessionId: str = Field(..., max_length=64)
|
||||
timeStamp: str = Field(..., max_length=32)
|
||||
outputText: str = Field(...)
|
||||
formUpdate: Any = Field(default_factory=dict)
|
||||
nextStage: str = Field(..., max_length=32)
|
||||
nextStageCode: str = Field(..., max_length=4)
|
||||
code: str = Field(..., max_length=4)
|
||||
@@ -19,6 +21,7 @@ class ProcessRequest_get(BaseModel):
|
||||
sessionId: str = Field(..., max_length=64)
|
||||
timeStamp: str = Field(..., max_length=32)
|
||||
key: str = Field(...)
|
||||
includeInputInfo: bool = False
|
||||
|
||||
class ProcessResponse_get(BaseModel):
|
||||
sessionId: str = Field(..., max_length=64)
|
||||
@@ -32,6 +35,7 @@ class ProcessRequest_set(BaseModel):
|
||||
timeStamp: str = Field(..., max_length=32)
|
||||
key: str = Field(...)
|
||||
value: str = Field(...)
|
||||
includeInputInfo: bool = False
|
||||
|
||||
class ProcessResponse_set(BaseModel):
|
||||
sessionId: str = Field(..., max_length=64)
|
||||
|
||||
1
src/voice/__init__.py
Normal file
1
src/voice/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""
|
||||
313
src/voice/config.py
Normal file
313
src/voice/config.py
Normal file
@@ -0,0 +1,313 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
DEFAULT_VOICE_CONFIG_REL = "config/voice.json"
|
||||
|
||||
|
||||
def resolve_voice_config_path() -> Path:
    """Locate the voice config file.

    The ``VOICE_CONFIG`` environment variable is used when set; a missing or
    blank value falls back to ``DEFAULT_VOICE_CONFIG_REL``. Relative paths are
    anchored at ``PROJECT_ROOT``.

    Returns:
        The resulting path (absolute whenever ``PROJECT_ROOT`` is absolute).
    """
    raw_setting = os.getenv("VOICE_CONFIG", DEFAULT_VOICE_CONFIG_REL).strip()
    candidate = Path(raw_setting or DEFAULT_VOICE_CONFIG_REL)
    return candidate if candidate.is_absolute() else PROJECT_ROOT / candidate
|
||||
|
||||
|
||||
DEFAULT_VOICE_CONFIG = resolve_voice_config_path()
|
||||
|
||||
SUPPORTED_LLM_PROVIDERS = frozenset({"openai", "fastgpt"})
|
||||
_LLM_PROVIDER_ALIASES = {"llm": "openai", "openai": "openai", "fastgpt": "fastgpt"}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ServerConfig:
    """Immutable settings loaded from the ``server`` section of the voice config."""

    host: str = "0.0.0.0"
    port: int = 8000
    # Each instance gets its own empty list when none is supplied.
    cors_origins: list[str] = field(default_factory=lambda: [])
    serve_webpage: bool = True
    webpage_mount: str = "/voice-demo"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AudioConfig:
    """Immutable settings loaded from the ``audio`` section of the voice config."""

    sample_rate_hz: int = 16000
    channels: int = 1
    frame_ms: int = 20

    @property
    def frame_bytes(self) -> int:
        """Size of one frame: truncated samples-per-frame x channels x 2."""
        samples_per_frame = int(self.sample_rate_hz * self.frame_ms / 1000)
        # NOTE(review): the factor 2 presumably means 16-bit samples — confirm.
        return samples_per_frame * self.channels * 2
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SessionConfig:
    """Immutable settings loaded from the ``session`` section of the voice config."""

    # Inactivity timeout, expressed in seconds per the field name.
    inactivity_timeout_sec: int = 60
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VADConfig:
    """Immutable settings loaded from the ``turn.vad`` section of the voice config.

    NOTE(review): the exact meaning of each threshold is defined by whatever
    consumes this object; it is not visible in this file.
    """

    confidence: float = 0.7
    start_secs: float = 0.2
    stop_secs: float = 0.6
    min_volume: float = 0.6
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TurnConfig:
    """Immutable settings loaded from the ``turn`` section of the voice config.

    Covers the nested VAD settings, speech/idle timeouts, the idle prompt and
    the interruption-related options.
    """

    vad: VADConfig = field(default_factory=VADConfig)
    user_speech_timeout_sec: float = 1.0
    idle_prompt_timeout_sec: float = 0.0
    idle_prompt_max_count: int = 1
    idle_prompt_text: str = (
        "我先停在这里。你可以继续说你的想法，"
        "或者让我根据刚才的内容帮你整理下一步。"
    )
    interruption_min_chars: int = 3
    interruption_use_interim: bool = True
    # Default short replies; a fresh list is built for every instance.
    interruption_short_replies: list[str] = field(
        default_factory=lambda: [
            "是", "是的", "对", "对的", "嗯", "好", "好的", "行", "可以", "没问题",
            "不是", "不", "不行", "不用", "不要", "没有", "否",
            "no", "yes", "ok", "okay",
        ]
    )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResponseStateConfig:
    """Immutable settings loaded from ``agent.response_state`` in the voice config."""

    enabled: bool = False
    tag: str = "state"
    event_type: str = "response.state"
    max_prefix_chars: int = 256
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AgentConfig:
    """Immutable settings loaded from the ``agent`` section of the voice config."""

    system_prompt: str = "You are a helpful, friendly voice assistant."
    greeting: str | None = None
    # The loader accepts: generated, fixed, off, fastgpt_opener.
    greeting_mode: str = "generated"
    fastgpt_reconnect_greeting: str = "欢迎回来继续对话，请告诉我准备好了之后继续办理"
    response_state: ResponseStateConfig = field(default_factory=ResponseStateConfig)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LLMConfig:
    """Immutable settings loaded from ``services.llm`` in the voice config."""

    provider: str = "openai"
    api_key: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini"
    app_id: str | None = None
    temperature: float | None = 0.7
    chat_id: str | None = None
    # Each instance gets its own empty mapping when none is supplied.
    variables: dict[str, str] = field(default_factory=lambda: {})
    detail: bool = False
    timeout_sec: float = 60.0
    # The loader accepts "base64" or "upload".
    image_input_mode: str = "base64"

    @property
    def is_fastgpt(self) -> bool:
        """True when ``provider`` is exactly ``"fastgpt"``."""
        return self.provider == "fastgpt"

    @property
    def is_openai(self) -> bool:
        """True when ``provider`` is exactly ``"openai"``."""
        return self.provider == "openai"

    @property
    def uses_local_context_history(self) -> bool:
        """Whether the pipeline keeps local LLM context history (every provider but FastGPT)."""
        return not self.is_fastgpt
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class STTConfig:
    """Immutable settings loaded from ``services.stt`` (or ``services.asr``).

    NOTE(review): ``domain``, ``accent``, ``encoding``, ``frame_size`` and
    ``dynamic_correction`` look provider-specific — confirm against the STT
    service that consumes them.
    """

    provider: str = "openai"
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini-transcribe"
    language: str | None = "en"
    domain: str = "iat"
    accent: str = "mandarin"
    encoding: str = "raw"
    frame_size: int = 1280
    timeout_sec: float = 10.0
    dynamic_correction: bool = False
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TTSConfig:
    """Immutable settings loaded from ``services.tts`` in the voice config.

    NOTE(review): ``aue``, ``tte``, ``speed``, ``volume``, ``pitch`` and
    ``oral_level`` look provider-specific — confirm ranges and meaning against
    the TTS service that consumes them.
    """

    provider: str = "openai"
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini-tts"
    voice: str = "alloy"
    aue: str = "raw"
    tte: str = "UTF8"
    speed: int = 50
    volume: int = 50
    pitch: int = 50
    timeout_sec: float = 30.0
    source_sample_rate_hz: int | None = None
    oral_level: str = "mid"
    text_aggregation_mode: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ServicesConfig:
    """Immutable bundle of the LLM, STT and TTS service settings."""

    llm: LLMConfig = field(default_factory=LLMConfig)
    stt: STTConfig = field(default_factory=STTConfig)
    tts: TTSConfig = field(default_factory=TTSConfig)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EngineConfig:
    """Immutable root of the voice configuration; one field per config section."""

    server: ServerConfig = field(default_factory=ServerConfig)
    audio: AudioConfig = field(default_factory=AudioConfig)
    session: SessionConfig = field(default_factory=SessionConfig)
    turn: TurnConfig = field(default_factory=TurnConfig)
    agent: AgentConfig = field(default_factory=AgentConfig)
    services: ServicesConfig = field(default_factory=ServicesConfig)
|
||||
|
||||
|
||||
def load_config(path: str | Path | None = None) -> EngineConfig:
    """Read a JSON voice config file and build an ``EngineConfig`` from it.

    Args:
        path: Config file location. ``None`` defers to
            ``resolve_voice_config_path()``; relative paths are anchored at
            ``PROJECT_ROOT``.

    Returns:
        The configuration produced by ``config_from_dict``.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
    """
    config_path = resolve_voice_config_path() if path is None else Path(path)
    if not config_path.is_absolute():
        config_path = PROJECT_ROOT / config_path

    raw_text = config_path.read_text(encoding="utf-8")
    data = json.loads(raw_text)
    if isinstance(data, dict):
        return config_from_dict(data)
    raise ValueError(f"Config file must contain a JSON object: {config_path}")
|
||||
|
||||
|
||||
def config_from_dict(data: dict) -> EngineConfig:
    """Validate a parsed voice config mapping and build an ``EngineConfig``.

    Sections that are missing or not dicts are treated as empty. The input
    mapping is not modified: every section is copied before normalisation.

    Normalisation performed here:
        * empty-string ``agent.greeting``, ``services.stt.language``,
          ``services.llm.chat_id`` and ``services.llm.app_id`` become ``None``;
        * ``services.asr`` is used when ``services.stt`` is missing or empty;
        * ``services.llm.provider`` is resolved through
          ``_normalize_llm_provider``;
        * ``services.llm.send_system_prompt`` is dropped;
        * ``services.llm.image_input_mode`` is stripped and lower-cased;
        * a non-dict ``services.llm.variables`` is replaced by ``{}``.

    Args:
        data: Parsed configuration mapping.

    Returns:
        The assembled configuration.

    Raises:
        ValueError: For an unknown ``agent.greeting_mode``, invalid
            ``agent.response_state`` values, an unsupported LLM provider or
            image input mode, or ``fastgpt_opener`` used with a non-FastGPT
            provider.
    """
    services_section = _dict(data.get("services"))
    agent_section = _dict(data.get("agent"))

    # ---- agent -----------------------------------------------------------
    if agent_section.get("greeting") == "":
        agent_section["greeting"] = None
    greeting_mode = agent_section.get("greeting_mode")
    if greeting_mode not in (None, "generated", "fixed", "off", "fastgpt_opener"):
        raise ValueError(
            "agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener"
        )

    # response_state is built separately, so remove it from the agent kwargs.
    response_state = ResponseStateConfig(**_dict(agent_section.pop("response_state", None)))
    if response_state.max_prefix_chars < 1:
        raise ValueError("agent.response_state.max_prefix_chars must be greater than 0")
    if not response_state.tag:
        raise ValueError("agent.response_state.tag must not be empty")
    if not response_state.event_type:
        raise ValueError("agent.response_state.event_type must not be empty")

    # ---- services.stt (alias: services.asr) --------------------------------
    stt_section = _dict(services_section.get("stt") or services_section.get("asr"))
    if stt_section.get("language") == "":
        stt_section["language"] = None

    # ---- services.llm ------------------------------------------------------
    llm_defaults = LLMConfig()
    llm_section = _dict(services_section.get("llm"))
    llm_section["provider"] = _normalize_llm_provider(
        llm_section.get("provider", llm_defaults.provider)
    )
    # Not an LLMConfig field; drop it so the constructor does not reject it.
    llm_section.pop("send_system_prompt", None)

    image_input_mode = str(
        llm_section.get("image_input_mode", llm_defaults.image_input_mode)
    ).strip().lower()
    if image_input_mode not in {"base64", "upload"}:
        raise ValueError(
            "services.llm.image_input_mode must be 'base64' or 'upload', "
            f"got {llm_section.get('image_input_mode')!r}"
        )
    llm_section["image_input_mode"] = image_input_mode

    for blank_to_none_key in ("chat_id", "app_id"):
        if llm_section.get(blank_to_none_key) == "":
            llm_section[blank_to_none_key] = None
    if not isinstance(llm_section.get("variables"), dict):
        llm_section["variables"] = {}

    if greeting_mode == "fastgpt_opener" and llm_section["provider"] != "fastgpt":
        raise ValueError(
            "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'"
        )

    # ---- turn --------------------------------------------------------------
    turn_section = _dict(data.get("turn"))
    vad_section = _dict(turn_section.get("vad"))
    turn_defaults = TurnConfig()

    def turn_setting(name: str):
        """Configured turn value, or the TurnConfig default for ``name``."""
        return turn_section.get(name, getattr(turn_defaults, name))

    # ---- assemble, in the same order as the EngineConfig fields ------------
    server_cfg = ServerConfig(**_dict(data.get("server")))
    audio_cfg = AudioConfig(**_dict(data.get("audio")))
    session_cfg = SessionConfig(**_dict(data.get("session")))
    turn_cfg = TurnConfig(
        vad=VADConfig(**vad_section),
        user_speech_timeout_sec=float(turn_setting("user_speech_timeout_sec")),
        idle_prompt_timeout_sec=float(turn_setting("idle_prompt_timeout_sec")),
        idle_prompt_max_count=int(turn_setting("idle_prompt_max_count")),
        idle_prompt_text=str(turn_setting("idle_prompt_text")),
        interruption_min_chars=int(turn_setting("interruption_min_chars")),
        interruption_use_interim=bool(turn_setting("interruption_use_interim")),
        interruption_short_replies=list(turn_setting("interruption_short_replies")),
    )
    agent_cfg = AgentConfig(**agent_section, response_state=response_state)
    services_cfg = ServicesConfig(
        llm=LLMConfig(**llm_section),
        stt=STTConfig(**stt_section),
        tts=TTSConfig(**_dict(services_section.get("tts"))),
    )

    return EngineConfig(
        server=server_cfg,
        audio=audio_cfg,
        session=session_cfg,
        turn=turn_cfg,
        agent=agent_cfg,
        services=services_cfg,
    )
|
||||
|
||||
|
||||
def _dict(value: object) -> dict:
|
||||
return dict(value) if isinstance(value, dict) else {}
|
||||
|
||||
|
||||
def _normalize_llm_provider(value: object) -> str:
    """Resolve a configured LLM provider name to its canonical form.

    A falsy ``value`` falls back to the ``LLMConfig`` default provider. The
    name is stringified, stripped and lower-cased, then looked up in
    ``_LLM_PROVIDER_ALIASES``.

    Raises:
        ValueError: If the name has no entry in the alias table.
    """
    lookup_key = str(value or LLMConfig().provider).strip().lower()
    canonical = _LLM_PROVIDER_ALIASES.get(lookup_key)
    if canonical is not None:
        return canonical

    accepted_names = sorted(SUPPORTED_LLM_PROVIDERS | {"llm"})
    raise ValueError(
        f"services.llm.provider must be one of: {', '.join(accepted_names)}; got {value!r}"
    )
|
||||
40
src/voice/context_sync.py
Normal file
40
src/voice/context_sync.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pipecat.frames.frames import Frame, InterruptionFrame, LLMMessagesAppendFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
from .text_stream import ProductTextStreamProcessor, maybe_sync_assistant_context
|
||||
|
||||
|
||||
class AssistantContextSyncProcessor(FrameProcessor):
    """Repair the assistant context before a text-input turn that follows an interrupt.

    ``input.text`` with ``interrupt: true`` queues an ``InterruptionFrame``
    ahead of the ``LLMMessagesAppendFrame``. Seeing the interruption arms this
    processor; the next append frame then triggers
    ``maybe_sync_assistant_context`` before the new user message moves on, so
    the context reflects the assistant text that was urgent-streamed.

    Every frame is forwarded unchanged in its original direction.
    """

    def __init__(
        self,
        *,
        text_stream: ProductTextStreamProcessor,
        assistant_aggregator: Any,
    ) -> None:
        """Store the collaborators handed to ``maybe_sync_assistant_context``."""
        super().__init__()
        self._text_stream = text_stream
        self._assistant_aggregator = assistant_aggregator
        # Armed by an InterruptionFrame, cleared by the next append frame.
        self._sync_on_next_append = False

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Track interrupts, sync context on the following append, then forward ``frame``."""
        await super().process_frame(frame, direction)

        if isinstance(frame, InterruptionFrame):
            self._sync_on_next_append = True
        elif self._sync_on_next_append and isinstance(frame, LLMMessagesAppendFrame):
            self._sync_on_next_append = False
            maybe_sync_assistant_context(self._assistant_aggregator, self._text_stream)

        await self.push_frame(frame, direction)
|
||||
564
src/voice/fastgpt_llm.py
Normal file
564
src/voice/fastgpt_llm.py
Normal file
@@ -0,0 +1,564 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import binascii
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastgpt_client import AsyncChatClient, FastGPTInteractiveEvent, aiter_stream_events
|
||||
from fastgpt_client.exceptions import FastGPTError
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMContextFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.services.settings import LLMSettings
|
||||
|
||||
|
||||
def _extract_text_from_event(kind: str, payload: Any) -> str:
|
||||
if not isinstance(payload, dict):
|
||||
return ""
|
||||
|
||||
if kind in {"answer", "fastAnswer"}:
|
||||
text = payload.get("text")
|
||||
if isinstance(text, str) and text:
|
||||
return text
|
||||
|
||||
choices = payload.get("choices") if isinstance(payload.get("choices"), list) else []
|
||||
if not choices:
|
||||
return str(payload.get("text") or "")
|
||||
|
||||
first_choice = choices[0] if isinstance(choices[0], dict) else {}
|
||||
delta = first_choice.get("delta") if isinstance(first_choice.get("delta"), dict) else {}
|
||||
content = delta.get("content")
|
||||
if isinstance(content, str) and content:
|
||||
return content
|
||||
|
||||
message = first_choice.get("message") if isinstance(first_choice.get("message"), dict) else {}
|
||||
message_content = message.get("content")
|
||||
if isinstance(message_content, str) and message_content:
|
||||
return message_content
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def _message_text(message: dict[str, Any]) -> str:
|
||||
content = message.get("content")
|
||||
if isinstance(content, str):
|
||||
return content.strip()
|
||||
if isinstance(content, list):
|
||||
parts: list[str] = []
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") == "text":
|
||||
text = part.get("text")
|
||||
if isinstance(text, str) and text.strip():
|
||||
parts.append(text.strip())
|
||||
return " ".join(parts)
|
||||
return ""
|
||||
|
||||
|
||||
def _first_nonempty_text(*values: Any) -> str:
|
||||
for value in values:
|
||||
if isinstance(value, str):
|
||||
text = value.strip()
|
||||
if text:
|
||||
return text
|
||||
return ""
|
||||
|
||||
|
||||
def _interactive_spoken_prompt(event: FastGPTInteractiveEvent) -> str:
    """Build the sentence to speak for a FastGPT interactive event.

    Resolution order:
        1. the first non-blank string among ``opener``, ``prompt``, ``text``,
           ``title`` and ``description`` — for each key the event payload is
           consulted before ``payload["params"]``;
        2. for ``userSelect`` events, a numbered list of the option labels;
        3. for ``userInput`` events, the list of requested field labels;
        4. a generic "please continue" prompt.

    Args:
        event: Object exposing ``data`` and ``interaction_type``.

    Returns:
        The prompt text; never empty.
    """
    payload = event.data if isinstance(event.data, dict) else {}
    raw_params = payload.get("params")
    params = raw_params if isinstance(raw_params, dict) else {}

    candidates: list[Any] = []
    for key in ("opener", "prompt", "text", "title", "description"):
        candidates.append(payload.get(key))
        candidates.append(params.get(key))
    explicit_prompt = _first_nonempty_text(*candidates)
    if explicit_prompt:
        return explicit_prompt

    interaction = event.interaction_type

    if interaction == "userSelect":
        options = params.get("userSelectOptions")
        if not isinstance(options, list):
            options = []
        numbered: list[str] = []
        # Numbering follows the option's position, so skipped options leave gaps.
        for position, option in enumerate(options, start=1):
            if isinstance(option, str):
                label = option.strip()
            elif isinstance(option, dict):
                label = _first_nonempty_text(option.get("label"), option.get("value"))
            else:
                label = ""
            if label:
                numbered.append(f"{position}. {label}")
        if numbered:
            return "请选择：" + "，".join(numbered)
        return "请选择一个选项。"

    if interaction == "userInput":
        form_entries = params.get("inputForm")
        if not isinstance(form_entries, list):
            form_entries = []
        requested: list[str] = []
        for entry in form_entries:
            if not isinstance(entry, dict):
                continue
            entry_label = _first_nonempty_text(entry.get("label"), entry.get("name"))
            if entry_label:
                requested.append(entry_label)
        if requested:
            return "请提供以下信息：" + "，".join(requested)
        return "请补充所需信息。"

    return "请继续。"
|
||||
|
||||
|
||||
IMAGE_INPUT_MODE_BASE64 = "base64"
|
||||
IMAGE_INPUT_MODE_UPLOAD = "upload"
|
||||
SUPPORTED_IMAGE_INPUT_MODES = frozenset({IMAGE_INPUT_MODE_BASE64, IMAGE_INPUT_MODE_UPLOAD})
|
||||
|
||||
_MIME_TO_EXT = {
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
|
||||
|
||||
def _message_has_image(message: dict[str, Any]) -> bool:
|
||||
content = message.get("content")
|
||||
if not isinstance(content, list):
|
||||
return False
|
||||
return any(
|
||||
isinstance(part, dict) and part.get("type") == "image_url"
|
||||
for part in content
|
||||
)
|
||||
|
||||
|
||||
def _redact_messages_for_log(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
"""Replace base64 image data URLs with a short placeholder for logging."""
|
||||
redacted: list[dict[str, Any]] = []
|
||||
for message in messages:
|
||||
content = message.get("content")
|
||||
if not isinstance(content, list):
|
||||
redacted.append(message)
|
||||
continue
|
||||
parts: list[Any] = []
|
||||
for part in content:
|
||||
if (
|
||||
isinstance(part, dict)
|
||||
and part.get("type") == "image_url"
|
||||
and isinstance(part.get("image_url"), dict)
|
||||
):
|
||||
url = str(part["image_url"].get("url") or "")
|
||||
parts.append({"type": "image_url", "image_url": {"url": f"<{len(url)} chars>"}})
|
||||
else:
|
||||
parts.append(part)
|
||||
redacted.append({**message, "content": parts})
|
||||
return redacted
|
||||
|
||||
|
||||
@dataclass
class FastGPTLLMSettings(LLMSettings):
    """LLM settings extended with the FastGPT-specific request options."""

    # Workflow variables forwarded as ``variables=`` on each chat completion.
    variables: dict[str, Any] = field(default_factory=dict)
    # Forwarded as ``detail=`` on each chat completion request.
    detail: bool = False
|
||||
|
||||
|
||||
def _default_fastgpt_settings(*, model: str = "fastgpt") -> FastGPTLLMSettings:
    """Build the baseline settings object used before caller overrides apply.

    All generic sampling/instruction fields are left as ``None``; turn
    filtering is off, ``variables`` starts as a fresh empty dict and
    ``detail`` is ``False``.
    """
    unset_fields = (
        "system_instruction",
        "temperature",
        "max_tokens",
        "top_p",
        "top_k",
        "frequency_penalty",
        "presence_penalty",
        "seed",
        "user_turn_completion_config",
    )
    options: dict[str, Any] = dict.fromkeys(unset_fields)
    options.update(
        model=model,
        filter_incomplete_user_turns=False,
        variables={},
        detail=False,
    )
    return FastGPTLLMSettings(**options)
|
||||
|
||||
|
||||
class FastGPTLLMService(LLMService):
    """FastGPT LLM service using chatId server-side memory and workflow variables.

    Each completion sends only the most recent user message (or the greeting
    prompt when the context holds none); earlier turns are not replayed from
    the local context because every request is tied to a stable ``chatId``.
    """

    Settings = FastGPTLLMSettings

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str,
        chat_id: str | None = None,
        app_id: str | None = None,
        greeting_prompt: str | None = None,
        timeout: float = 60.0,
        image_input_mode: str = IMAGE_INPUT_MODE_BASE64,
        settings: FastGPTLLMSettings | None = None,
        **kwargs,
    ) -> None:
        """Create the service and its FastGPT HTTP client.

        Args:
            api_key: FastGPT API key passed to the client.
            base_url: FastGPT base URL passed to the client.
            chat_id: Conversation id; a random ``voice_<hex>`` id is generated
                when omitted.
            app_id: FastGPT app id; required for chat-init, chat-record and
                image-upload calls.
            greeting_prompt: Text sent as the user message when the context has
                no usable user turn. Defaults to "你好".
            timeout: Client timeout passed through to ``AsyncChatClient``.
            image_input_mode: ``"base64"`` or ``"upload"`` (case-insensitive).
            settings: Optional overrides applied on top of the defaults.
            **kwargs: Forwarded to ``LLMService.__init__``.

        Raises:
            ValueError: If ``image_input_mode`` is not a supported mode.
        """
        default_settings = _default_fastgpt_settings()
        if settings is not None:
            default_settings.apply_update(settings)
        super().__init__(settings=default_settings, **kwargs)

        self._chat_id = chat_id or f"voice_{uuid.uuid4().hex[:16]}"
        self._app_id = (app_id or "").strip()
        self._greeting_prompt = (greeting_prompt or "你好").strip() or "你好"

        # Validate the image mode before creating the HTTP client so that an
        # invalid mode cannot leave an unclosed client behind.
        mode = (image_input_mode or IMAGE_INPUT_MODE_BASE64).strip().lower()
        if mode not in SUPPORTED_IMAGE_INPUT_MODES:
            raise ValueError(
                f"Unsupported image_input_mode {image_input_mode!r}; "
                f"expected one of {sorted(SUPPORTED_IMAGE_INPUT_MODES)}"
            )
        if mode == IMAGE_INPUT_MODE_UPLOAD and not self._app_id:
            logger.warning(
                "FastGPT image_input_mode='upload' requires app_id; "
                "falling back to inline base64"
            )
            mode = IMAGE_INPUT_MODE_BASE64
        self._image_input_mode = mode

        self._client = AsyncChatClient(
            api_key=api_key,
            base_url=base_url,
            timeout=timeout,
        )
        self._client_closed = False
        self._active_response = None

    @property
    def app_id(self) -> str:
        """FastGPT app id (empty string when not configured)."""
        return self._app_id

    @property
    def chat_id(self) -> str:
        """Conversation id sent as ``chatId`` on every request."""
        return self._chat_id

    def set_variables(self, variables: dict[str, Any]) -> None:
        """Merge ``variables`` into the workflow variables (new keys win)."""
        merged = dict(self._settings.variables)
        merged.update(variables)
        self._settings.variables = merged

    async def stop(self, frame: EndFrame) -> None:
        """Close the in-flight response and the HTTP client, then stop."""
        await self._close_active_response()
        await self._close_client()
        await super().stop(frame)

    async def cancel(self, frame: CancelFrame) -> None:
        """Close the in-flight response and the HTTP client, then cancel.

        The client is closed here as well as in ``stop`` because a cancelled
        service never receives ``stop``; without this the client would leak.
        """
        await self._close_active_response()
        await self._close_client()
        await super().cancel(frame)

    async def _handle_interruptions(self, _: InterruptionFrame) -> None:
        """Abort the streaming response when the user interrupts."""
        await self._close_active_response()
        await super()._handle_interruptions(_)

    async def _close_client(self) -> None:
        """Close the FastGPT client once; later calls are no-ops.

        Close errors are logged instead of raised so that the base-class
        ``stop``/``cancel`` still runs.
        """
        if self._client_closed:
            return
        self._client_closed = True
        try:
            await self._client.close()
        except Exception as exc:
            logger.warning(f"FastGPT client close failed: {exc}")

    @staticmethod
    def _welcome_text_from_init_payload(payload: Any) -> str:
        """Extract the opener text from a chat-init response payload.

        Looks in ``payload["app"]``, ``payload["data"]`` and ``payload`` itself
        (in that order); within each container a nested ``"app"`` dict is tried
        before the container. Returns ``""`` when nothing is found.
        """
        if not isinstance(payload, dict):
            return ""

        for container in (payload.get("app"), payload.get("data"), payload):
            if not isinstance(container, dict):
                continue
            nested_app = container.get("app")
            if isinstance(nested_app, dict):
                text = FastGPTLLMService._welcome_text_from_app(nested_app)
                if text:
                    return text
            text = FastGPTLLMService._welcome_text_from_app(container)
            if text:
                return text
        return ""

    @staticmethod
    def _welcome_text_from_app(app_payload: dict[str, Any]) -> str:
        """Return the first non-empty opener-like field of an app payload."""
        chat_config = (
            app_payload.get("chatConfig")
            if isinstance(app_payload.get("chatConfig"), dict)
            else {}
        )
        return _first_nonempty_text(
            chat_config.get("welcomeText"),
            app_payload.get("welcomeText"),
            app_payload.get("opener"),
            app_payload.get("intro"),
        )

    async def fetch_welcome_text(self) -> str | None:
        """Return FastGPT app welcome text from chat init when ``app_id`` is configured.

        Any failure is logged as a warning and reported as ``None``.
        """
        if not self._app_id:
            return None

        try:
            response = await self._client.get_chat_init(
                appId=self._app_id,
                chatId=self._chat_id,
            )
            response.raise_for_status()
            text = self._welcome_text_from_init_payload(response.json())
            if text:
                logger.info(f"FastGPT app opener loaded for appId={self._app_id}")
            return text or None
        except FastGPTError as exc:
            logger.warning(f"FastGPT chat init failed: {exc}")
        except httpx.HTTPError as exc:
            logger.warning(f"FastGPT chat init HTTP error: {exc}")
        except Exception as exc:
            logger.warning(f"FastGPT chat init error: {exc}")
        return None

    async def has_chat_history(self) -> bool:
        """Return whether FastGPT has persisted records for this chatId.

        Returns ``False`` when no ``app_id`` is configured, when the request
        fails (logged as a warning), or when the payload has no record list.
        """
        if not self._app_id:
            return False

        try:
            response = await self._client.get_chat_records(
                appId=self._app_id,
                chatId=self._chat_id,
                offset=0,
                pageSize=1,
            )
            response.raise_for_status()
            data = response.json()
            # Tolerate a null or non-dict "data" field instead of relying on
            # the broad except below to absorb an AttributeError.
            container = data.get("data") if isinstance(data, dict) else None
            records = container.get("list") if isinstance(container, dict) else None
            return isinstance(records, list) and bool(records)
        except FastGPTError as exc:
            logger.warning(f"FastGPT chat records failed: {exc}")
        except httpx.HTTPError as exc:
            logger.warning(f"FastGPT chat records HTTP error: {exc}")
        except Exception as exc:
            logger.warning(f"FastGPT chat records error: {exc}")
        return False

    async def fetch_session_greeting_text(self, reconnect_greeting: str) -> str | None:
        """Use opener for a new chatId and a fixed greeting for reconnects.

        Args:
            reconnect_greeting: Greeting spoken when the chat already has
                history; a blank or missing value yields ``None``.
        """
        if await self.has_chat_history():
            logger.info(f"FastGPT chatId={self._chat_id} has history; using reconnect greeting")
            return (reconnect_greeting or "").strip() or None

        logger.info(f"FastGPT chatId={self._chat_id} has no history; using app opener")
        return await self.fetch_welcome_text()

    async def _close_active_response(self) -> None:
        """Detach and close the currently streaming response, if any."""
        response = self._active_response
        self._active_response = None
        if response is not None:
            await response.aclose()

    def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, Any]]:
        """Build the single-message payload for the next completion.

        Walks the context backwards to the most recent user message with
        usable content; falls back to the greeting prompt when there is none.
        """
        raw_messages = context.get_messages()

        for message in reversed(raw_messages):
            if not isinstance(message, dict) or message.get("role") != "user":
                continue
            if _message_has_image(message):
                # Multimodal turn: forward the OpenAI-style content list as-is
                # (text parts + image_url with a base64 data URL). FastGPT's
                # /chat/completions accepts this directly.
                return [{"role": "user", "content": message["content"]}]
            text = _message_text(message)
            if text:
                return [{"role": "user", "content": text}]

        return [{"role": "user", "content": self._greeting_prompt}]

    async def _resolve_image_inputs(
        self, messages: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """In ``upload`` mode, replace inline base64 image data URLs with uploaded URLs.

        In ``base64`` mode the messages are returned untouched (inline data URLs).
        New message/content objects are built so the shared ``LLMContext`` messages
        are never mutated.
        """
        if self._image_input_mode != IMAGE_INPUT_MODE_UPLOAD:
            return messages

        resolved: list[dict[str, Any]] = []
        for message in messages:
            content = message.get("content")
            if not isinstance(content, list):
                resolved.append(message)
                continue

            new_content: list[Any] = []
            for part in content:
                url = None
                if isinstance(part, dict) and part.get("type") == "image_url":
                    image = part.get("image_url")
                    # A non-dict image_url is passed through unchanged rather
                    # than raising AttributeError on ``.get``.
                    if isinstance(image, dict):
                        url = image.get("url")
                if isinstance(url, str) and url.startswith("data:image/"):
                    uploaded = await self._upload_data_url(url)
                    new_content.append(
                        {"type": "image_url", "image_url": {"url": uploaded}}
                    )
                else:
                    new_content.append(part)
            resolved.append({**message, "content": new_content})

        return resolved

    async def _upload_data_url(self, data_url: str) -> str:
        """Upload a ``data:image/...;base64,...`` URL via FastGPT and return its URL.

        Falls back to the original data URL if parsing or upload fails so the turn
        still proceeds with inline base64.
        """
        header, _, payload = data_url.partition(",")
        mime_type = header[len("data:"):].split(";", 1)[0].strip() or "image/jpeg"
        try:
            raw = base64.b64decode(payload, validate=True)
        except (binascii.Error, ValueError) as exc:
            logger.warning(f"FastGPT image upload skipped; invalid base64: {exc}")
            return data_url

        suffix = _MIME_TO_EXT.get(mime_type, ".jpg")
        tmp_path: str | None = None
        try:
            # The client uploads from a file path, so the bytes are written to
            # a temp file that is closed before the upload and removed after.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(raw)
                tmp_path = tmp.name
            result = await self._client.upload_chat_image(
                appId=self._app_id,
                chatId=self._chat_id,
                file_path=tmp_path,
            )
            url = result.get("url") if isinstance(result, dict) else None
            if isinstance(url, str) and url:
                logger.info(
                    f"FastGPT image uploaded chatId={self._chat_id} "
                    f"bytes={len(raw)} url={url}"
                )
                return url
            logger.warning("FastGPT image upload returned no url; using inline base64")
            return data_url
        except Exception as exc:
            logger.warning(f"FastGPT image upload failed; using inline base64: {exc}")
            return data_url
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

    async def _process_context(self, context: LLMContext) -> None:
        """Run one streamed chat completion and push its output as frames.

        Text events become ``LLMTextFrame``s; an interactive event is handed
        to ``_handle_interactive`` and ends the stream; an error event is
        pushed as an error and ends the stream.

        Raises:
            httpx.TimeoutException: Re-raised so ``process_frame`` can fire
                ``on_completion_timeout``.
        """
        messages = self._build_fastgpt_messages(context)
        messages = await self._resolve_image_inputs(messages)
        variables = self._settings.variables or None

        logger.info(
            "FastGPT chat completion "
            f"chatId={self._chat_id} appId={self._app_id or '-'} "
            f"variables={sorted((variables or {}).keys())} "
            f"messages={_redact_messages_for_log(messages)!r}"
        )

        await self.start_ttfb_metrics()

        try:
            response = await self._client.create_chat_completion(
                messages=messages,
                stream=True,
                chatId=self._chat_id,
                variables=variables,
                detail=self._settings.detail,
            )
        except httpx.TimeoutException:
            # TimeoutException is a subclass of HTTPError; let it propagate so
            # the timeout handling in process_frame is not bypassed.
            raise
        except FastGPTError as exc:
            await self.push_error(error_msg=f"FastGPT request failed: {exc}", exception=exc)
            return
        except httpx.HTTPError as exc:
            await self.push_error(error_msg=f"FastGPT HTTP error: {exc}", exception=exc)
            return

        # Published so stop/cancel/interruption can abort the stream.
        self._active_response = response

        try:
            async for event in aiter_stream_events(response):
                if event.kind in {"data", "answer", "fastAnswer"}:
                    text = _extract_text_from_event(event.kind, event.data)
                    if text:
                        await self.stop_ttfb_metrics()
                        await self.push_frame(LLMTextFrame(text))
                    continue

                if event.kind == "interactive" and isinstance(event, FastGPTInteractiveEvent):
                    await self._handle_interactive(event)
                    break

                if event.kind == "error":
                    payload = event.data if isinstance(event.data, dict) else {}
                    message = _first_nonempty_text(
                        payload.get("message"),
                        payload.get("error"),
                    ) or "FastGPT stream error"
                    await self.push_error(error_msg=message)
                    break

                if event.kind == "done":
                    break
        finally:
            self._active_response = None
            await response.aclose()

    async def _handle_interactive(self, event: FastGPTInteractiveEvent) -> None:
        """Speak the interactive prompt and forward the event to the client."""
        prompt = _interactive_spoken_prompt(event)
        if prompt:
            await self.stop_ttfb_metrics()
            await self.push_frame(LLMTextFrame(prompt))

        await self.push_frame(
            OutputTransportMessageFrame(
                message={
                    "type": "response.interactive",
                    "interaction_type": event.interaction_type,
                    "data": event.data,
                }
            ),
            FrameDirection.DOWNSTREAM,
        )

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Run a completion for ``LLMContextFrame``; pass other frames through.

        The completion is bracketed by full-response start/end frames and
        processing metrics; failures are reported with ``push_error`` rather
        than raised.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMContextFrame):
            try:
                await self.push_frame(LLMFullResponseStartFrame())
                await self.start_processing_metrics()
                await self._process_context(frame.context)
            except httpx.TimeoutException as exc:
                await self._call_event_handler("on_completion_timeout")
                await self.push_error(error_msg="FastGPT completion timeout", exception=exc)
            except Exception as exc:
                await self.push_error(error_msg=f"FastGPT completion error: {exc}", exception=exc)
            finally:
                await self.stop_processing_metrics()
                await self.push_frame(LLMFullResponseEndFrame())
        else:
            await self.push_frame(frame, direction)
|
||||
291
src/voice/pipeline.py
Normal file
291
src/voice/pipeline.py
Normal file
@@ -0,0 +1,291 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TTSSpeakFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
AssistantTurnStoppedMessage,
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
UserTurnStoppedMessage,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
from pipecat.serializers.protobuf import ProtobufFrameSerializer
|
||||
from pipecat.transports.websocket.fastapi import (
|
||||
FastAPIWebsocketParams,
|
||||
FastAPIWebsocketTransport,
|
||||
)
|
||||
from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
|
||||
SpeechTimeoutUserTurnStopStrategy,
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
from .config import EngineConfig
|
||||
from .context_sync import AssistantContextSyncProcessor
|
||||
from .fastgpt_llm import FastGPTLLMService
|
||||
from .protocol import ProductWebsocketSerializer
|
||||
from .services import create_llm_service, create_stt_service, create_tts_service
|
||||
from .response_state import StateTagResponseProcessor
|
||||
from .text_input import ProductTextInputProcessor
|
||||
from .text_stream import ProductTextStreamProcessor, maybe_sync_assistant_context
|
||||
from .transcript_stream import ProductTranscriptStreamProcessor
|
||||
from .turn_start import InterruptionGateUserTurnStartStrategy
|
||||
|
||||
|
||||
def _chat_id_from_websocket(websocket) -> str | None:
|
||||
query_params = getattr(websocket, "query_params", None)
|
||||
if not query_params:
|
||||
return None
|
||||
|
||||
for name in ("chatId", "chat_id"):
|
||||
value = query_params.get(name)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip()
|
||||
return None
|
||||
|
||||
|
||||
async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline using the product JSON websocket protocol.

    The serializer is configured with the sample rate and channel count from
    ``config.audio``.
    """
    audio = config.audio
    product_serializer = ProductWebsocketSerializer(
        sample_rate=audio.sample_rate_hz,
        channels=audio.channels,
    )
    await run_pipeline_with_serializer(
        websocket,
        config,
        serializer=product_serializer,
        client_label="Product JSON",
    )
|
||||
|
||||
|
||||
async def run_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline using Pipecat's protobuf frame serializer."""
    protobuf_serializer = ProtobufFrameSerializer()
    await run_pipeline_with_serializer(
        websocket,
        config,
        serializer=protobuf_serializer,
        client_label="Pipecat protobuf",
    )
|
||||
|
||||
|
||||
async def run_pipeline_with_serializer(
    websocket,
    config: EngineConfig,
    *,
    serializer: FrameSerializer,
    client_label: str,
) -> None:
    """Assemble and run the voice pipeline for one websocket connection.

    Builds the websocket transport, the STT/LLM/TTS services, the user and
    assistant context aggregators and the product stream processors, wires
    the event handlers, and then waits for the pipeline task to finish.

    Args:
        websocket: Connection handed to ``FastAPIWebsocketTransport``; its
            query parameters may carry a ``chatId``/``chat_id`` to reuse.
        config: Engine configuration (audio, services, agent, turn, session).
        serializer: Frame serializer used by the transport.
        client_label: Short label used only in log messages.
    """
    transport = FastAPIWebsocketTransport(
        websocket=websocket,
        params=FastAPIWebsocketParams(
            audio_in_enabled=True,
            audio_out_enabled=True,
            audio_in_sample_rate=config.audio.sample_rate_hz,
            audio_out_sample_rate=config.audio.sample_rate_hz,
            audio_in_channels=config.audio.channels,
            audio_out_channels=config.audio.channels,
            serializer=serializer,
            session_timeout=None,
        ),
    )

    stt = create_stt_service(config.services.stt, config.audio)

    llm_config = config.services.llm
    # Reuse the chat id supplied in the query string, else mint a new one.
    chat_id = _chat_id_from_websocket(websocket) or f"voice_{uuid.uuid4().hex[:16]}"
    llm = create_llm_service(
        llm_config,
        chat_id=chat_id,
        session_variables={"session_id": chat_id, "channel": "voice"},
        greeting_prompt=config.agent.greeting,
    )
    if llm_config.is_fastgpt:
        logger.info(f"LLM backend=fastgpt chatId={chat_id} appId={llm_config.app_id or '-'}")
    else:
        logger.info(f"LLM backend=openai model={llm_config.model}")

    tts = create_tts_service(config.services.tts, config.audio)

    # The local context is seeded with system messages only when the backend
    # relies on locally kept history; otherwise it starts empty.
    messages: list[dict[str, str]] = []
    if llm_config.uses_local_context_history:
        messages = [{"role": "system", "content": config.agent.system_prompt}]
        if config.agent.greeting and config.agent.greeting_mode == "generated":
            messages.append({"role": "system", "content": config.agent.greeting})

    context = LLMContext(messages)

    vad_params = VADParams(
        confidence=config.turn.vad.confidence,
        start_secs=config.turn.vad.start_secs,
        stop_secs=config.turn.vad.stop_secs,
        min_volume=config.turn.vad.min_volume,
    )
    # Replace pipecat's default stop strategy (Smart Turn v3) with a simple
    # silence-timeout strategy. Smart Turn v3 was finalizing every short
    # Chinese phrase as a complete turn, which caused one logical utterance
    # to become several LLM calls and several user bubbles in the UI. The
    # timeout strategy waits for `user_speech_timeout_sec` of silence
    # (re-armed every time the user resumes speaking) before declaring the
    # turn finished — which is what we actually want for streaming ASRs.
    user_turn_strategies = UserTurnStrategies(
        start=[
            InterruptionGateUserTurnStartStrategy(
                min_chars_when_bot_speaking=config.turn.interruption_min_chars,
                allowed_short_replies=config.turn.interruption_short_replies,
                use_interim=config.turn.interruption_use_interim,
            ),
        ],
        stop=[
            SpeechTimeoutUserTurnStopStrategy(
                user_speech_timeout=config.turn.user_speech_timeout_sec,
            ),
        ],
    )
    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(
            vad_analyzer=SileroVADAnalyzer(params=vad_params),
            user_turn_strategies=user_turn_strategies,
            user_idle_timeout=config.turn.idle_prompt_timeout_sec,
        ),
    )

    text_stream = ProductTextStreamProcessor()
    context_sync = AssistantContextSyncProcessor(
        text_stream=text_stream,
        assistant_aggregator=assistant_aggregator,
    )

    # Processor order: input -> text input -> STT -> transcript stream ->
    # context sync -> user aggregator -> LLM -> (optional state-tag
    # processor) -> text stream -> TTS -> output -> assistant aggregator.
    processors = [
        transport.input(),
        ProductTextInputProcessor(),
        stt,
        ProductTranscriptStreamProcessor(),
        context_sync,
        user_aggregator,
        llm,
    ]
    if config.agent.response_state.enabled:
        processors.append(StateTagResponseProcessor(config.agent.response_state))
    processors.extend(
        [
            text_stream,
            tts,
            transport.output(),
            assistant_aggregator,
        ]
    )
    pipeline = Pipeline(processors)

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            audio_in_sample_rate=config.audio.sample_rate_hz,
            audio_out_sample_rate=config.audio.sample_rate_hz,
            enable_metrics=True,
            enable_usage_metrics=True,
            enable_heartbeats=True,
        ),
        idle_timeout_secs=config.session.inactivity_timeout_sec,
    )
    # Restrict on_frame_reached_upstream (below) to UserStartedSpeakingFrame.
    task.set_reached_upstream_filter((UserStartedSpeakingFrame,))
    # Number of idle prompts spoken since the user last started speaking.
    idle_prompt_count = 0

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        # Greet the client according to agent.greeting_mode: a fixed TTS
        # line, the FastGPT opener/reconnect greeting, or an LLM-generated one.
        logger.info(f"{client_label} websocket client connected")
        if config.agent.greeting_mode == "fixed" and config.agent.greeting:
            await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
        elif config.agent.greeting_mode == "fastgpt_opener":
            if isinstance(llm, FastGPTLLMService):
                welcome = await llm.fetch_session_greeting_text(
                    config.agent.fastgpt_reconnect_greeting
                )
                if welcome:
                    await task.queue_frames([TTSSpeakFrame(welcome)])
                else:
                    logger.warning("FastGPT opener requested but no opener text was returned")
            else:
                raise RuntimeError("agent.greeting_mode='fastgpt_opener' requires FastGPT LLM service")
        elif config.agent.greeting_mode == "generated":
            await task.queue_frames([LLMRunFrame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
        logger.info(f"{client_label} websocket client disconnected")
        await task.cancel()

    # NOTE(review): session_timeout is None above, so this handler presumably
    # never fires with the current parameters — confirm before relying on it.
    @transport.event_handler("on_session_timeout")
    async def on_session_timeout(_transport, _client):
        logger.info(f"{client_label} websocket session timed out")
        await task.cancel()

    @task.event_handler("on_frame_reached_upstream")
    async def on_frame_reached_upstream(_task, _frame: UserStartedSpeakingFrame):
        # The user spoke again, so the idle-prompt budget starts over.
        nonlocal idle_prompt_count
        idle_prompt_count = 0

    @user_aggregator.event_handler("on_user_turn_started")
    async def on_user_turn_started(_aggregator, _strategy):
        nonlocal idle_prompt_count
        idle_prompt_count = 0

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
        # Log the finished user turn and, when it has text, send the final
        # transcript to the client as an urgent transport message.
        logger.info(f"User: {message.content}")
        text = (message.content or "").strip()
        if not text:
            return
        await _aggregator.push_frame(
            OutputTransportMessageUrgentFrame(
                message={
                    "type": "input.transcript.final",
                    "text": text,
                    "user_id": message.user_id,
                    "timestamp": message.timestamp,
                }
            )
        )

    @assistant_aggregator.event_handler("on_assistant_turn_stopped")
    async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage):
        logger.info(f"Assistant: {message.content}")
        maybe_sync_assistant_context(
            _aggregator,
            text_stream,
            committed_text=message.content or "",
        )
        # Return value intentionally unused; presumably this clears the
        # processor's buffered interrupted text — verify in text_stream.
        text_stream.take_interrupted_stream_text()

    @user_aggregator.event_handler("on_user_turn_idle")
    async def on_user_turn_idle(aggregator):
        # Speak the configured idle prompt, at most idle_prompt_max_count
        # times between user turns.
        nonlocal idle_prompt_count
        text = config.turn.idle_prompt_text.strip()
        if not text or config.turn.idle_prompt_max_count <= 0:
            return
        if idle_prompt_count >= config.turn.idle_prompt_max_count:
            return

        idle_prompt_count += 1
        logger.info(
            "User idle prompt triggered "
            f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
        )
        await aggregator.push_frame(TTSSpeakFrame(text))

    # NOTE: assistant turn started/final events are emitted by
    # ProductTextStreamProcessor, upstream of TTS, so text streams to the
    # client ahead of audio. The on_assistant_turn_stopped handler above only
    # logs the turn and syncs the assistant context.

    runner = PipelineRunner(handle_sigint=False)
    await runner.run(task)
|
||||
227
src/voice/protocol.py
Normal file
227
src/voice/protocol.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TranscriptionFrame,
|
||||
UserImageRawFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
|
||||
|
||||
# Upper bound (8 MiB) on the decoded size of an ``input.image`` payload.
MAX_INPUT_IMAGE_BYTES = 8 * 1024 * 1024
# MIME types accepted for ``input.image`` messages.
SUPPORTED_INPUT_IMAGE_MIME_TYPES = {"image/jpeg", "image/png", "image/webp"}
|
||||
|
||||
|
||||
class ProductWebsocketSerializer(FrameSerializer):
|
||||
"""Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport."""
|
||||
|
||||
protocol = "va.ws.v1"
|
||||
|
||||
def __init__(self, *, sample_rate: int, channels: int):
    """Remember the audio format used for frames built from client input.

    Args:
        sample_rate: Sample rate applied to incoming audio that does not
            state its own.
        channels: Channel count applied to incoming audio that does not
            state its own.
    """
    super().__init__()
    self._sample_rate, self._channels = sample_rate, channels
    # Starts at zero; presumably advanced by the event builder — verify there.
    self._sequence = 0
|
||||
|
||||
async def serialize(self, frame: Frame) -> str | bytes | None:
    """Turn an outgoing frame into a protocol event, or ``None`` to drop it.

    Handled frames: output audio (base64 ``response.audio.delta``), bot
    started/stopped speaking, transcriptions, and transport messages. A
    transport message that is a dict with a string ``type`` is emitted as
    that event with its remaining keys as fields; any other transport
    message is wrapped in a ``transport.message`` event.
    """
    if isinstance(frame, OutputAudioRawFrame):
        pcm = frame.audio
        return self._event(
            "response.audio.delta",
            audio=base64.b64encode(pcm).decode("ascii"),
            bytes=len(pcm),
            sample_rate=frame.sample_rate,
            channels=frame.num_channels,
        )

    # Speaking markers carry no payload, only the event name.
    for marker_type, event_name in (
        (BotStartedSpeakingFrame, "response.audio.started"),
        (BotStoppedSpeakingFrame, "response.audio.stopped"),
    ):
        if isinstance(frame, marker_type):
            return self._event(event_name)

    if isinstance(frame, TranscriptionFrame):
        return self._event(
            "input.transcript.final",
            text=frame.text,
            user_id=frame.user_id,
            timestamp=frame.timestamp,
        )

    # ProductTextStreamProcessor owns response.text.* events. TTS can also
    # emit TextFrame subclasses internally, so serializing them here would
    # make clients render duplicate assistant text.
    if not isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
        return None
    if self.should_ignore_frame(frame):
        return None

    body = frame.message
    # Allow callers to emit a named protocol event by pushing a
    # transport-message frame whose payload already carries a `type`.
    is_named_event = isinstance(body, dict) and isinstance(body.get("type"), str)
    if not is_named_event:
        return self._event("transport.message", message=body)

    fields = dict(body)
    event_name = fields.pop("type")
    return self._event(event_name, **fields)
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
    """Turn one websocket message from the client into a pipeline frame.

    Binary messages are treated as raw audio in the configured format. Text
    messages must be JSON objects with a ``type`` of ``session.start``,
    ``session.stop``, ``response.cancel``, ``input.audio``, ``input.image``,
    ``input.text`` or ``transport.message``.

    Returns:
        The frame for the message, or ``None`` (after logging a warning) when
        the message is malformed or its type is unsupported.
    """
    if isinstance(data, bytes):
        return InputAudioRawFrame(
            audio=data,
            sample_rate=self._sample_rate,
            num_channels=self._channels,
        )

    try:
        message = json.loads(data)
    except json.JSONDecodeError as exc:
        logger.warning(f"Invalid product websocket JSON: {exc}")
        return None

    if not isinstance(message, dict):
        logger.warning("Product websocket message must be a JSON object")
        return None

    message_type = message.get("type")
    if message_type == "session.start":
        chat_id = message.get("chatId") or message.get("chat_id")
        return InputTransportMessageFrame(
            message={
                "type": "session.started",
                "protocol": self.protocol,
                "chatId": chat_id if isinstance(chat_id, str) else None,
                "audio": {
                    "encoding": "pcm_s16le",
                    "sample_rate": self._sample_rate,
                    "channels": self._channels,
                },
            }
        )

    if message_type == "session.stop":
        return EndFrame()

    if message_type == "response.cancel":
        return CancelFrame(reason="client_cancelled")

    if message_type == "input.audio":
        audio = message.get("audio") or message.get("data")
        if not isinstance(audio, str):
            logger.warning("input.audio requires base64 'audio' or 'data'")
            return None
        try:
            pcm = base64.b64decode(audio)
        except (binascii.Error, ValueError) as exc:
            logger.warning(f"Invalid input.audio base64: {exc}")
            return None
        # The format fields come from the client, so a bad value must be
        # rejected here instead of raising out of the deserializer.
        # OverflowError covers a non-standard ``Infinity`` number in the JSON.
        try:
            sample_rate = int(message.get("sample_rate") or self._sample_rate)
            channels = int(message.get("channels") or self._channels)
        except (TypeError, ValueError, OverflowError):
            logger.warning("input.audio sample_rate and channels must be integers")
            return None
        if sample_rate <= 0 or channels <= 0:
            logger.warning("input.audio requires positive sample_rate and channels")
            return None
        return InputAudioRawFrame(
            audio=pcm,
            sample_rate=sample_rate,
            num_channels=channels,
        )

    if message_type == "input.image":
        return self._deserialize_input_image(message)

    if message_type == "input.text":
        text = message.get("text")
        if not isinstance(text, str) or not text.strip():
            logger.warning("input.text requires non-empty 'text'")
            return None
        return InputTransportMessageFrame(
            message={
                "type": "input.text",
                "text": text,
                "interrupt": bool(message.get("interrupt", True)),
            }
        )

    if message_type == "transport.message":
        # Unwrap the nested payload when it is an object; otherwise forward
        # the whole envelope.
        payload = message.get("message")
        return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message)

    logger.warning(f"Unsupported product websocket message type: {message_type!r}")
    return None
|
||||
|
||||
def _deserialize_input_image(self, message: dict[str, Any]) -> Frame | None:
    """Convert an ``input.image`` client message into a ``UserImageRawFrame``.

    The base64 payload is read from ``image`` (or ``data``), optionally wrapped
    in a ``data:`` URI. ``mime_type`` / ``media_type`` defaults to
    ``image/jpeg`` and must be one of ``SUPPORTED_INPUT_IMAGE_MIME_TYPES``.
    ``width`` and ``height`` must convert to positive integers, and the decoded
    payload must not exceed ``MAX_INPUT_IMAGE_BYTES``.

    Args:
        message: Decoded client message.

    Returns:
        The image frame, or ``None`` (after logging a warning) when any
        validation step fails.
    """
    encoded = message.get("image") or message.get("data")
    if not isinstance(encoded, str):
        logger.warning("input.image requires base64 'image' or 'data'")
        return None

    mime_type = str(message.get("mime_type") or message.get("media_type") or "image/jpeg")
    if mime_type not in SUPPORTED_INPUT_IMAGE_MIME_TYPES:
        logger.warning(
            "input.image unsupported mime_type "
            f"{mime_type!r}; expected one of {sorted(SUPPORTED_INPUT_IMAGE_MIME_TYPES)}"
        )
        return None

    try:
        width = int(message.get("width") or 0)
        height = int(message.get("height") or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of a float infinity, which json.loads yields
        # for inputs such as `Infinity` or `1e999`. Treat it like any other
        # malformed dimension instead of letting it escape the deserializer.
        logger.warning("input.image width and height must be integers")
        return None

    if width <= 0 or height <= 0:
        logger.warning("input.image requires positive integer width and height")
        return None

    # Accept "data:<mime>;base64,<payload>" by keeping only the payload part.
    if "," in encoded and encoded.lstrip().startswith("data:"):
        encoded = encoded.split(",", 1)[1]

    try:
        image = base64.b64decode(encoded, validate=True)
    except (binascii.Error, ValueError) as exc:
        logger.warning(f"Invalid input.image base64: {exc}")
        return None

    if len(image) > MAX_INPUT_IMAGE_BYTES:
        logger.warning(
            f"input.image too large: {len(image)} bytes; "
            f"max is {MAX_INPUT_IMAGE_BYTES} bytes"
        )
        return None

    text = message.get("text")
    if text is not None and not isinstance(text, str):
        logger.warning("input.image text must be a string when provided")
        return None

    return UserImageRawFrame(
        image=image,
        size=(width, height),
        format=mime_type,
        user_id=str(message.get("user_id") or "product-user"),
        # A missing or empty prompt falls back to a fixed instruction.
        text=text or "Answer using this camera image.",
        append_to_context=bool(message.get("append_to_context", True)),
    )
|
||||
|
||||
def _event(self, event_type: str, **payload: Any) -> str:
|
||||
self._sequence += 1
|
||||
return json.dumps(
|
||||
{
|
||||
"type": event_type,
|
||||
"protocol": self.protocol,
|
||||
"seq": self._sequence,
|
||||
**payload,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
136
src/voice/response_state.py
Normal file
136
src/voice/response_state.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
from .config import ResponseStateConfig
|
||||
|
||||
|
||||
class StateTagResponseProcessor(FrameProcessor):
    """Split a leading state tag off streamed LLM text before it goes downstream.

    The model is expected to answer in the form::

        <state>some state</state>spoken response

    The text between the tags is pushed as its own transport message, and only
    the text after the closing tag continues downstream. When the response
    does not begin with the tag, the buffered text is forwarded as received.
    """

    def __init__(self, config: ResponseStateConfig) -> None:
        super().__init__()
        self._tag = config.tag
        self._event_type = config.event_type
        self._max_prefix_chars = config.max_prefix_chars
        self._opening_tag = f"<{self._tag}>"
        self._closing_tag = f"</{self._tag}>"
        # Start frame held back until the tag decision has been made.
        self._start_frame: LLMFullResponseStartFrame | None = None
        self._buffer = ""
        self._decided = False
        self._in_llm_response = False

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Buffer the head of each LLM response; pass everything else through."""
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMFullResponseStartFrame):
            # Swallow the start frame for now; it is re-emitted by _push_start.
            self._start_frame = frame
            self._buffer = ""
            self._decided = False
            self._in_llm_response = True
        elif isinstance(frame, LLMTextFrame) and self._in_llm_response and not self._decided:
            await self._process_initial_text(frame.text or "", direction)
        elif isinstance(frame, LLMFullResponseEndFrame):
            if self._in_llm_response:
                await self._flush_buffer(direction)
            await self.push_frame(frame, direction)
            self._reset()
        elif isinstance(frame, (InterruptionFrame, CancelFrame)):
            if self._in_llm_response:
                # Release whatever was buffered before the interruption passes.
                await self._flush_buffer(direction)
                self._reset()
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

    async def _process_initial_text(self, text: str, direction: FrameDirection) -> None:
        """Accumulate text until the buffer can be classified, then release it."""
        if not text:
            return

        self._buffer += text
        outcome = self._parse_buffer()
        if outcome is None:
            # Still ambiguous; wait for more text.
            return

        self._decided = True
        state, spoken = outcome
        if state is not None:
            await self._emit_state(state)
        await self._push_start(direction)
        if spoken:
            await self.push_frame(LLMTextFrame(spoken), direction)
        self._buffer = ""

    def _parse_buffer(self) -> tuple[str | None, str] | None:
        """Classify the buffered text.

        Returns ``None`` while more text is needed, ``(state, rest)`` once a
        complete tag has been read, or ``(None, buffer)`` when the buffer
        should be forwarded untouched.
        """
        candidate = self._buffer.lstrip()
        if not candidate:
            return None

        may_wait = len(self._buffer) < self._max_prefix_chars

        if candidate.startswith(self._opening_tag):
            body_start = len(self._opening_tag)
            body_end = candidate.find(self._closing_tag, body_start)
            if body_end >= 0:
                rest_start = body_end + len(self._closing_tag)
                return candidate[body_start:body_end].strip(), candidate[rest_start:]
            if may_wait:
                return None
            return None, self._buffer

        # The buffer may be a partial opening tag such as "<sta".
        if may_wait and self._opening_tag.startswith(candidate):
            return None

        return None, self._buffer

    async def _flush_buffer(self, direction: FrameDirection) -> None:
        """Emit the held start frame and any buffered text, then mark as decided."""
        await self._push_start(direction)
        pending = self._buffer
        if pending:
            await self.push_frame(LLMTextFrame(pending), direction)
        self._buffer = ""
        self._decided = True

    async def _push_start(self, direction: FrameDirection) -> None:
        """Forward the held start frame at most once."""
        held = self._start_frame
        if not held:
            return
        await self.push_frame(held, direction)
        self._start_frame = None

    async def _emit_state(self, state: str) -> None:
        """Push the extracted state downstream as an urgent transport message."""
        event = OutputTransportMessageUrgentFrame(
            message={
                "type": self._event_type,
                "state": state,
            }
        )
        await self.push_frame(event, FrameDirection.DOWNSTREAM)

    def _reset(self) -> None:
        """Clear all per-response state."""
        self._start_frame = None
        self._buffer = ""
        self._decided = False
        self._in_llm_response = False
|
||||
100
src/voice/routes.py
Normal file
100
src/voice/routes.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, FastAPI, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from loguru import logger
|
||||
|
||||
from .config import EngineConfig, load_config, resolve_voice_config_path
|
||||
from .pipeline import run_product_voice_pipeline
|
||||
|
||||
# Directory three levels above this module file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Location of the static browser demo, relative to the project root.
VOICE_DEMO_DIR = PROJECT_ROOT.joinpath("static", "voice-demo")

# Router that collects the voice HTTP and websocket endpoints.
router = APIRouter(tags=["voice"])
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_voice_config() -> EngineConfig:
    """Return the voice engine configuration, loading it only on the first call."""
    engine_config = load_config()
    return engine_config
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_voice_config_path() -> Path:
    """Return the resolved voice config path, resolving it only on the first call."""
    config_path = resolve_voice_config_path()
    return config_path
|
||||
|
||||
|
||||
def _normalize_mount_path(path: str) -> str:
|
||||
normalized = path.strip() or "/voice-demo"
|
||||
if not normalized.startswith("/"):
|
||||
normalized = f"/{normalized}"
|
||||
return normalized.rstrip("/") or "/"
|
||||
|
||||
|
||||
@router.get("/voice/health")
async def voice_health() -> dict[str, object]:
    """Report voice status, the config path, protocols, features and providers."""
    config = get_voice_config()

    # The demo mount is only reported when the static page is enabled.
    demo_mount = None
    if config.server.serve_webpage:
        demo_mount = _normalize_mount_path(config.server.webpage_mount)

    supported_protocols = {
        "/ws-product": "va.ws.v1.json_base64",
    }
    enabled_features = {
        "product_text_input": True,
        "product_text_interrupt": True,
    }
    return {
        "status": "healthy",
        "config": str(get_voice_config_path()),
        "protocols": supported_protocols,
        "features": enabled_features,
        "demo": demo_mount,
        "llm_provider": config.services.llm.provider,
        "stt_provider": config.services.stt.provider,
        "tts_provider": config.services.tts.provider,
    }
|
||||
|
||||
|
||||
@router.websocket("/ws-product")
async def product_websocket_endpoint(websocket: WebSocket) -> None:
    """Accept the websocket, then hand it to the product voice pipeline."""
    await websocket.accept()
    await run_product_voice_pipeline(websocket, get_voice_config())
|
||||
|
||||
|
||||
def register_voice(app: FastAPI) -> None:
    """Mount voice websocket routes and optional browser demo static files.

    When the resolved voice config file does not exist, a warning is logged
    and nothing is registered. Otherwise the voice router is included, CORS
    middleware is installed if any origins are configured, and the static
    demo page is mounted when it is enabled and its directory exists.

    Args:
        app: The FastAPI application to extend in place.
    """
    voice_config_path = get_voice_config_path()
    if not voice_config_path.exists():
        logger.warning(f"Voice config not found at {voice_config_path}; voice demo disabled")
        return

    config = get_voice_config()
    app.include_router(router)
    logger.info(f"Voice config loaded from {voice_config_path}")

    if config.server.cors_origins:
        # A wildcard origin must not be combined with credentials: it would let
        # any site issue credentialed cross-origin requests. Credentials are
        # therefore only allowed when every origin is listed explicitly.
        wildcard_origin = "*" in config.server.cors_origins
        if wildcard_origin:
            logger.warning(
                "CORS origins contain '*'; credentialed cross-origin requests are disabled"
            )
        app.add_middleware(
            CORSMiddleware,
            allow_origins=config.server.cors_origins,
            allow_credentials=not wildcard_origin,
            allow_methods=["*"],
            allow_headers=["*"],
        )

    if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir():
        mount = _normalize_mount_path(config.server.webpage_mount)
        app.mount(
            mount,
            StaticFiles(directory=str(VOICE_DEMO_DIR), html=True),
            name="voice-demo",
        )
        logger.info(f"Voice demo mounted at {mount}")
    else:
        logger.info("Voice demo static page disabled or missing")

    logger.info("Voice websocket registered at /ws-product")
|
||||
220
src/voice/services.py
Normal file
220
src/voice/services.py
Normal file
@@ -0,0 +1,220 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from openai import BadRequestError
|
||||
from openai import NOT_GIVEN
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
|
||||
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService
|
||||
from pipecat.services.tts_service import TextAggregationMode
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig
|
||||
from .fastgpt_llm import FastGPTLLMService, FastGPTLLMSettings
|
||||
from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService
|
||||
from .xfyun_super_tts import DEFAULT_XFYUN_SUPER_TTS_URL, XfyunSuperTTSService
|
||||
from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService
|
||||
|
||||
|
||||
def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
    """Build the speech-to-text service selected by ``config.provider``.

    ``"xfyun"`` yields an ``XfyunASRService``; ``"openai"`` yields an
    ``OpenAISTTService``. Any other provider raises ``ValueError``.
    """
    if config.provider != "xfyun":
        _require_provider(config.provider, "openai", "stt")
        openai_settings = OpenAISTTService.Settings(
            model=config.model,
            language=_language(config.language),
        )
        return OpenAISTTService(
            api_key=config.api_key or None,
            base_url=config.base_url,
            settings=openai_settings,
        )

    # Without an audio config the Xfyun recognizer is set up for 16 kHz input.
    if audio:
        sample_rate = audio.sample_rate_hz
    else:
        sample_rate = 16000
    return XfyunASRService(
        app_id=config.app_id,
        api_key=config.api_key or "",
        api_secret=config.api_secret,
        url=config.base_url or DEFAULT_XFYUN_ASR_URL,
        language=config.language or "zh_cn",
        domain=config.domain,
        accent=config.accent,
        sample_rate=sample_rate,
        encoding=config.encoding,
        frame_size=config.frame_size,
        open_timeout=config.timeout_sec,
        dynamic_correction=config.dynamic_correction,
    )
|
||||
|
||||
|
||||
def create_llm_service(
    config: LLMConfig,
    *,
    chat_id: str | None = None,
    session_variables: dict | None = None,
    greeting_prompt: str | None = None,
):
    """Build the LLM service selected by ``config``.

    A FastGPT config yields a ``FastGPTLLMService`` whose variables are the
    configured ones overlaid with ``session_variables``. An OpenAI config
    yields an ``OpenAILLMService``. Anything else raises ``ValueError``.
    """
    if config.is_fastgpt:
        # Session values win over configured defaults on key collisions.
        merged_variables = {**config.variables, **(session_variables or {})}
        fastgpt_settings_kwargs = {
            "model": config.model or "fastgpt",
            "variables": merged_variables,
            "detail": config.detail,
        }
        return FastGPTLLMService(
            api_key=config.api_key,
            base_url=config.base_url or "http://localhost:3000",
            chat_id=chat_id,
            app_id=config.app_id,
            greeting_prompt=greeting_prompt,
            timeout=config.timeout_sec,
            image_input_mode=config.image_input_mode,
            settings=FastGPTLLMSettings(**fastgpt_settings_kwargs),
        )

    if config.is_openai:
        # Leave temperature unset (NOT_GIVEN) when the config has none.
        temperature = NOT_GIVEN
        if config.temperature is not None:
            temperature = config.temperature
        return OpenAILLMService(
            api_key=config.api_key or None,
            base_url=config.base_url,
            settings=OpenAILLMService.Settings(
                model=config.model,
                temperature=temperature,
            ),
        )

    supported = ", ".join(sorted(("openai", "fastgpt", "llm")))
    raise ValueError(
        f"Unsupported llm provider {config.provider!r}; expected one of: {supported}"
    )
|
||||
|
||||
|
||||
def create_tts_service(config: TTSConfig, audio: AudioConfig):
    """Build the text-to-speech service selected by ``config.provider``.

    Supported providers are ``"xfyun"``, ``"xfyun_super"`` /
    ``"xfyun_super_tts"`` and ``"openai"``. A ``ValueError`` is raised for an
    unsupported provider or an unsupported Xfyun source sample rate.
    """
    provider = config.provider

    if provider == "xfyun":
        # Default the synthesis rate to the pipeline's audio rate.
        xfyun_rate = config.source_sample_rate_hz or audio.sample_rate_hz
        if xfyun_rate not in (8000, 16000):
            raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000")
        return XfyunTTSService(
            app_id=config.app_id,
            api_key=config.api_key or "",
            api_secret=config.api_secret,
            voice=config.voice,
            url=config.base_url or DEFAULT_XFYUN_TTS_URL,
            sample_rate=audio.sample_rate_hz,
            source_sample_rate=xfyun_rate,
            encoding=config.aue,
            text_encoding=config.tte,
            speed=config.speed,
            volume=config.volume,
            pitch=config.pitch,
            timeout=config.timeout_sec,
            push_stop_frames=True,
        )

    if provider in ("xfyun_super", "xfyun_super_tts"):
        super_rate = config.source_sample_rate_hz or 24000
        if super_rate not in (8000, 16000, 24000):
            raise ValueError(
                "Xfyun Super TTS source_sample_rate_hz must be 8000, 16000, or 24000"
            )
        aggregation_mode = config.text_aggregation_mode or TextAggregationMode.TOKEN
        return XfyunSuperTTSService(
            app_id=config.app_id,
            api_key=config.api_key or "",
            api_secret=config.api_secret,
            voice=config.voice,
            url=config.base_url or DEFAULT_XFYUN_SUPER_TTS_URL,
            sample_rate=audio.sample_rate_hz,
            source_sample_rate=super_rate,
            encoding=config.aue,
            speed=config.speed,
            volume=config.volume,
            pitch=config.pitch,
            oral_level=config.oral_level,
            text_aggregation_mode=aggregation_mode,
            open_timeout=config.timeout_sec,
        )

    _require_provider(provider, "openai", "tts")
    # Voices outside OpenAI's own list go through the compatible subclass.
    if config.voice in VALID_VOICES:
        service_class = OpenAITTSService
    else:
        service_class = OpenAICompatibleTTSService
    voice_settings = OpenAITTSService.Settings(
        model=config.model,
        voice=config.voice,
    )
    return service_class(
        api_key=config.api_key or None,
        base_url=config.base_url,
        sample_rate=audio.sample_rate_hz,
        source_sample_rate=config.source_sample_rate_hz,
        settings=voice_settings,
    )
|
||||
|
||||
|
||||
class OpenAICompatibleTTSService(OpenAITTSService):
    """OpenAI-compatible TTS service that permits provider-specific voice ids."""

    def __init__(self, *, source_sample_rate: int | None = None, **kwargs):
        """Store the provider's PCM sample rate; other arguments go to the base class."""
        super().__init__(**kwargs)
        self._source_sample_rate = source_sample_rate or OPENAI_SAMPLE_RATE

    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Request PCM speech for ``text`` and yield the resulting audio frames.

        An ``ErrorFrame`` is yielded instead when no voice is configured, the
        endpoint answers with a non-200 status, or the request raises.
        """
        voice_id = self._settings.voice
        if not voice_id:
            yield ErrorFrame(error="TTS voice must be specified")
            return

        try:
            request = {
                "input": text,
                "model": self._settings.model,
                "voice": voice_id,
                "response_format": "pcm",
            }
            # Optional fields are only sent when configured.
            if self._settings.instructions:
                request["instructions"] = self._settings.instructions
            if self._settings.speed:
                request["speed"] = self._settings.speed

            speech = self._client.audio.speech.with_streaming_response
            async with speech.create(**request) as response:
                if response.status_code != 200:
                    error = await response.text()
                    yield ErrorFrame(
                        error=f"TTS request failed (status: {response.status_code}, error: {error})"
                    )
                    return

                await self.start_tts_usage_metrics(text)

                async def audio_chunks():
                    # Skip empty chunks from the byte stream.
                    async for piece in response.iter_bytes(self.chunk_size):
                        if piece:
                            yield piece

                awaiting_first_frame = True
                audio_frames = self._stream_audio_frames_from_iterator(
                    audio_chunks(),
                    in_sample_rate=self._source_sample_rate,
                    context_id=context_id,
                )
                async for audio_frame in audio_frames:
                    if awaiting_first_frame:
                        # Time-to-first-byte ends when the first frame is ready.
                        await self.stop_ttfb_metrics()
                        awaiting_first_frame = False
                    yield audio_frame
        except Exception as exc:
            # Covers BadRequestError as well; it is reported the same way.
            yield ErrorFrame(error=f"TTS request failed: {exc}")
|
||||
|
||||
|
||||
def _require_provider(actual: str, expected: str, service_name: str) -> None:
|
||||
if actual != expected:
|
||||
raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}")
|
||||
|
||||
|
||||
def _language(value: str | None) -> Language | None:
    """Map a language code onto a ``Language`` member.

    Hyphens become underscores and the result is upper-cased before the
    attribute lookup. ``None`` passes through. When no attribute of that name
    exists, the original string is returned unchanged.
    """
    if value is None:
        return None
    underscored = value.replace("-", "_")
    member_name = underscored.upper()
    return getattr(Language, member_name, value)
|
||||
54
src/voice/text_input.py
Normal file
54
src/voice/text_input.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InputTransportMessageFrame,
|
||||
LLMMessagesAppendFrame,
|
||||
UserImageRawFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ProductTextInputProcessor(FrameProcessor):
    """Turns ``input.text`` transport messages into LLM user messages.

    User image frames are passed through wrapped in user started/stopped
    speaking frames. Every other frame is forwarded unchanged.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, UserImageRawFrame):
            # Bracket the image with speaking markers.
            await self.broadcast_frame(UserStartedSpeakingFrame)
            await self.push_frame(frame, direction)
            await self.broadcast_frame(UserStoppedSpeakingFrame)
            return

        payload = frame.message if isinstance(frame, InputTransportMessageFrame) else None
        is_text_input = isinstance(payload, dict) and payload.get("type") == "input.text"
        if not is_text_input:
            await self.push_frame(frame, direction)
            return

        user_text = str(payload.get("text") or "").strip()
        if not user_text:
            # Blank text input is dropped.
            return

        await self.broadcast_frame(UserStartedSpeakingFrame)

        # Interrupting is the default unless the message opts out.
        if payload.get("interrupt", True):
            logger.info("Text input interrupting current response")
            await self.broadcast_interruption()

        append_frame = LLMMessagesAppendFrame(
            messages=[{"role": "user", "content": user_text}],
            run_llm=True,
        )
        await self.push_frame(append_frame, FrameDirection.DOWNSTREAM)
        await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||
215
src/voice/text_stream.py
Normal file
215
src/voice/text_stream.py
Normal file
@@ -0,0 +1,215 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Protocol
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class _AssistantContextSync(Protocol):
|
||||
@property
|
||||
def context(self) -> Any: ...
|
||||
|
||||
|
||||
def _committed_assistant_content(context: Any) -> str:
|
||||
"""Return trailing assistant text only when the last context message is assistant."""
|
||||
messages = context.get_messages()
|
||||
if not messages:
|
||||
return ""
|
||||
last = messages[-1]
|
||||
if not isinstance(last, dict) or last.get("role") != "assistant":
|
||||
return ""
|
||||
content = last.get("content")
|
||||
if isinstance(content, str):
|
||||
return content.strip()
|
||||
return ""
|
||||
|
||||
|
||||
def sync_streamed_assistant_context(
    aggregator: _AssistantContextSync,
    *,
    streamed_text: str,
    committed_text: str,
) -> None:
    """Make the context's trailing assistant message match the streamed text.

    Nothing happens when the stripped streamed text is empty or equals the
    stripped committed text. Otherwise the context's messages are transformed:

    * no messages: the streamed text is added as an assistant message;
    * last message is assistant: string content that differs is replaced;
    * last two messages are both user: the assistant message is inserted
      between them;
    * anything else: the assistant message is appended.
    """
    streamed = streamed_text.strip()
    if not streamed or streamed == committed_text.strip():
        return

    def _apply(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        updated = list(messages)
        if not updated:
            updated.append({"role": "assistant", "content": streamed})
            return updated

        last = updated[-1]
        last_role = last.get("role") if isinstance(last, dict) else None

        if last_role == "assistant":
            content = last.get("content")
            # Non-string content (e.g. structured parts) is left untouched.
            if isinstance(content, str) and content.strip() != streamed:
                updated[-1] = {"role": "assistant", "content": streamed}
            return updated

        if last_role == "user" and len(updated) >= 2:
            prev = updated[-2]
            if isinstance(prev, dict) and prev.get("role") == "user":
                # Two user messages in a row: the reply belongs between them.
                updated.insert(len(updated) - 1, {"role": "assistant", "content": streamed})
                return updated

        updated.append({"role": "assistant", "content": streamed})
        return updated

    aggregator.context.transform_messages(_apply)
|
||||
|
||||
|
||||
def maybe_sync_assistant_context(
    aggregator: _AssistantContextSync,
    text_stream: "ProductTextStreamProcessor",
    *,
    committed_text: str | None = None,
) -> None:
    """Sync the context with the text stream's last streamed assistant text.

    The committed text is ``committed_text`` (stripped) when given, otherwise
    it is read from the tail of the aggregator's context. Nothing is synced
    when the text stream has no streamed text.
    """
    if committed_text is None:
        committed = _committed_assistant_content(aggregator.context)
    else:
        committed = committed_text.strip()

    streamed = text_stream.last_assistant_stream_text()
    if streamed:
        sync_streamed_assistant_context(
            aggregator,
            streamed_text=streamed,
            committed_text=committed,
        )
|
||||
|
||||
|
||||
class ProductTextStreamProcessor(FrameProcessor):
    """Mirrors assistant text to the client as streaming protocol events.

    Every frame is forwarded unchanged. In addition, LLM response frames are
    echoed downstream as ``OutputTransportMessageUrgentFrame`` messages of
    type ``response.text.started``, ``response.text.delta`` and
    ``response.text.final``.

    A ``TTSSpeakFrame`` (spoken text that did not come from an LLM response
    cycle) produces a single started/delta/final sequence for its text.
    """

    def __init__(self) -> None:
        super().__init__()
        self._aggregation: list[str] = []
        self._turn_active = False
        self._last_assistant_stream_text = ""
        self._interrupted_stream_text: str | None = None

    def last_assistant_stream_text(self) -> str:
        """Return the full text of the most recent non-empty streamed turn."""
        return self._last_assistant_stream_text

    def take_interrupted_stream_text(self) -> str | None:
        """Return and clear the text recorded for the last interruption."""
        taken, self._interrupted_stream_text = self._interrupted_stream_text, None
        return taken

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        await super().process_frame(frame, direction)

        # Capture spoken text before forwarding, as the frame is handed on first.
        spoken = (frame.text or "") if isinstance(frame, TTSSpeakFrame) else None

        # The frame itself always goes out before any mirrored event.
        await self.push_frame(frame, direction)

        if isinstance(frame, LLMFullResponseStartFrame):
            await self._start_turn()
        elif isinstance(frame, LLMTextFrame):
            if frame.text:
                await self._delta(frame.text)
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self._end_turn(interrupted=False)
        elif isinstance(frame, (InterruptionFrame, CancelFrame)):
            await self._handle_interrupt()
        elif spoken is not None:
            # No LLM cycle here: synthesize one complete event sequence.
            await self._start_turn()
            if spoken:
                await self._delta(spoken)
            await self._end_turn(interrupted=False)

    async def _start_turn(self) -> None:
        """Open a turn and announce it, unless one is already open."""
        if self._turn_active:
            return
        self._turn_active = True
        self._aggregation = []
        await self._emit("response.text.started")

    async def _delta(self, text: str) -> None:
        """Record one text chunk and mirror it to the client."""
        if not self._turn_active:
            # Text outside a turn is unexpected; open one so the client still
            # receives a started event before the delta.
            await self._start_turn()
        self._aggregation.append(text)
        await self._emit("response.text.delta", text=text)

    async def _handle_interrupt(self) -> None:
        """Close an open turn as interrupted, or flag the last finished text."""
        if self._turn_active:
            await self._end_turn(interrupted=True)
        elif self._last_assistant_stream_text:
            self._interrupted_stream_text = self._last_assistant_stream_text

    async def _end_turn(self, *, interrupted: bool) -> None:
        """Close the open turn and emit its final text."""
        if not self._turn_active:
            return

        full_text = "".join(self._aggregation)
        if full_text:
            self._last_assistant_stream_text = full_text
            if interrupted:
                self._interrupted_stream_text = full_text

        self._turn_active = False
        self._aggregation = []
        await self._emit(
            "response.text.final",
            text=full_text,
            interrupted=interrupted,
        )

    async def _emit(self, event_type: str, **payload: object) -> None:
        """Push one protocol event downstream as an urgent transport message."""
        event = OutputTransportMessageUrgentFrame(
            message={"type": event_type, **payload},
        )
        await self.push_frame(event, FrameDirection.DOWNSTREAM)
|
||||
30
src/voice/transcript_stream.py
Normal file
30
src/voice/transcript_stream.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ProductTranscriptStreamProcessor(FrameProcessor):
    """Mirrors interim STT frames to the product websocket protocol."""

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Emit an ``input.transcript.interim`` event, then forward the frame."""
        await super().process_frame(frame, direction)

        if isinstance(frame, InterimTranscriptionFrame):
            interim_event = {
                "type": "input.transcript.interim",
                "text": frame.text,
                "user_id": frame.user_id,
                "timestamp": frame.timestamp,
            }
            await self.push_frame(
                OutputTransportMessageUrgentFrame(message=interim_event),
                FrameDirection.DOWNSTREAM,
            )

        # The original frame always continues in its own direction.
        await self.push_frame(frame, direction)
|
||||
91
src/voice/turn_start.py
Normal file
91
src/voice/turn_start.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from loguru import logger
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.turns.types import ProcessFrameResult
|
||||
from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy
|
||||
|
||||
|
||||
_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
|
||||
|
||||
|
||||
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
    """Starts user turns only after likely intentional speech.

    While the bot is speaking, a transcript starts a user turn only if its
    normalized form is one of the allowed short replies or is at least
    ``min_chars_when_bot_speaking`` characters long. While the bot is silent,
    any transcript with countable characters starts a turn.
    """

    def __init__(
        self,
        *,
        min_chars_when_bot_speaking: int,
        allowed_short_replies: list[str],
        use_interim: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._min_chars_when_bot_speaking = min_chars_when_bot_speaking
        # Replies are stored normalized so they compare against normalized input.
        self._allowed_short_replies = {
            self._normalize_text(reply) for reply in allowed_short_replies if reply.strip()
        }
        self._use_interim = use_interim
        self._bot_speaking = False

    async def reset(self):
        """Delegate to the base class reset."""
        await super().reset()

    async def process_frame(self, frame: Frame) -> ProcessFrameResult:
        """Track bot speech and evaluate transcripts as possible turn starts."""
        if isinstance(frame, BotStartedSpeakingFrame):
            self._bot_speaking = True
        elif isinstance(frame, BotStoppedSpeakingFrame):
            self._bot_speaking = False
        elif isinstance(frame, InterimTranscriptionFrame) and self._use_interim:
            return await self._handle_transcription(frame.text, interim=True)
        elif isinstance(frame, TranscriptionFrame):
            return await self._handle_transcription(frame.text, interim=False)

        return ProcessFrameResult.CONTINUE

    async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult:
        """Decide whether a transcript should start a user turn."""
        normalized = self._normalize_text(text)
        if not normalized:
            # Nothing countable in the transcript.
            return ProcessFrameResult.CONTINUE

        if self._bot_speaking:
            should_interrupt = self._should_interrupt(normalized)
            logger.debug(
                f"{self} interruption_gate text={text!r} normalized={normalized!r} "
                f"should_interrupt={should_interrupt} interim={interim}"
            )
            if not should_interrupt:
                # Discard the rejected transcript from the aggregation.
                await self.trigger_reset_aggregation()
                return ProcessFrameResult.CONTINUE

        await self.trigger_user_turn_started()
        return ProcessFrameResult.STOP

    def _should_interrupt(self, normalized: str) -> bool:
        """Return True for an allowed short reply or a long enough transcript."""
        if normalized in self._allowed_short_replies:
            return True
        return len(normalized) >= self._min_chars_when_bot_speaking

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Lower-case ``text`` and keep only its countable characters."""
        lowered = text.lower()
        return "".join(match.group(0) for match in _COUNTABLE_TEXT_RE.finditer(lowered))
|
||||
353
src/voice/xfyun_asr.py
Normal file
353
src/voice/xfyun_asr.py
Normal file
@@ -0,0 +1,353 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
from collections.abc import AsyncGenerator
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import format_datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlencode, urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.settings import STTSettings
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
|
||||
|
||||
# Endpoint XfyunASRService falls back to when it is constructed without a url.
DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat"
|
||||
|
||||
|
||||
class XfyunASRService(STTService):
    """iFlytek/Xfyun streaming voice dictation service for Pipecat.

    A signed WebSocket session is opened when VAD reports speech (or lazily on
    the first audio chunk) and is finalized when ``UserStoppedSpeakingFrame``
    arrives. Interim and final transcripts are pushed from a background
    receive task; the session is closed once the final result (or a server
    error) has been received.
    """

    def __init__(
        self,
        *,
        app_id: str,
        api_key: str,
        api_secret: str,
        url: str | None = None,
        language: str = "zh_cn",
        domain: str = "iat",
        accent: str = "mandarin",
        sample_rate: int = 16000,
        encoding: str = "raw",
        frame_size: int = 1280,
        open_timeout: float = 10.0,
        dynamic_correction: bool = False,
        **kwargs,
    ) -> None:
        """Create the service.

        Args:
            app_id: Xfyun application id; falls back to ``XFYUN_APP_ID``.
            api_key: Xfyun API key; falls back to ``XFYUN_API_KEY``.
            api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET``.
            url: WebSocket endpoint; defaults to ``DEFAULT_XFYUN_ASR_URL``.
            language: Value sent as the ``language`` business parameter.
            domain: Value sent as the ``domain`` business parameter.
            accent: Value sent as the ``accent`` business parameter.
            sample_rate: Audio sample rate; must be 8000 or 16000 at start time.
            encoding: Value sent as the audio ``encoding`` field.
            frame_size: Number of buffered bytes sent per audio message.
            open_timeout: WebSocket open timeout in seconds.
            dynamic_correction: When True, requests ``dwa=wpgs`` and applies
                the server's replacement ranges to earlier segments.
            **kwargs: Forwarded to ``STTService``.
        """
        super().__init__(
            sample_rate=sample_rate,
            settings=STTSettings(model=None, language=language),
            **kwargs,
        )
        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
        self._url = url or DEFAULT_XFYUN_ASR_URL
        self._language = language
        self._domain = domain
        self._accent = accent
        self._encoding = encoding
        self._frame_size = frame_size
        self._open_timeout = open_timeout
        self._dynamic_correction = dynamic_correction

        self._websocket = None
        self._receive_task = None
        self._audio_buffer = bytearray()
        self._sent_first_frame = False
        self._sent_final_frame = False
        self._finalizing_turn = False
        # Recognized segments keyed by the result sequence number ("sn"), so
        # that replacement ranges ("rg") keep addressing the right segments
        # even after earlier segments were replaced or came back empty.
        self._partials: dict[int, str] = {}
        self._last_text = ""

    async def cleanup(self) -> None:
        """Close any open session, then run the base cleanup."""
        await self._close_utterance()
        await super().cleanup()

    async def stop(self, frame: EndFrame) -> None:
        """Close any open session, then run the base stop handling."""
        await self._close_utterance()
        await super().stop(frame)

    async def cancel(self, frame: CancelFrame) -> None:
        """Close any open session, then run the base cancel handling."""
        await self._close_utterance()
        await super().cancel(frame)

    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
        """Open a session on VAD speech start and finalize it on turn end."""
        await super().process_frame(frame, direction)

        if isinstance(frame, UserStoppedSpeakingFrame):
            # Aggregator-level turn end (broadcast once per logical user turn).
            # This is the only boundary that finalizes/closes the xfyun
            # websocket, so brief VAD pauses do not restart the ASR session.
            await self._finish_utterance()
        elif isinstance(frame, VADUserStartedSpeakingFrame):
            await self._start_utterance()

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]:
        """Buffer ``audio`` and forward full frames to the open session.

        Transcripts are pushed by the receive task, so this generator only
        ever yields ``None``.
        """
        if not audio:
            yield None
            return

        socket_open = bool(self._websocket and self._websocket.state is State.OPEN)
        if socket_open and self._sent_final_frame:
            # The end-of-stream frame (status 2) is already out; the session
            # only waits for its final result. Audio sent now would follow the
            # terminal frame, so it is dropped instead.
            yield None
            return

        if not socket_open:
            await self._start_utterance()

        self._audio_buffer.extend(audio)
        await self._flush_audio_buffer(final=False)
        yield None

    async def _start_utterance(self) -> None:
        """Open a new recognition session unless one is already open."""
        if self._websocket and self._websocket.state is State.OPEN:
            return

        if not self._app_id or not self._api_key or not self._api_secret:
            await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret")
            return

        if self.sample_rate not in (8000, 16000):
            await self.push_error("Xfyun ASR sample rate must be 8000 or 16000")
            return

        self._audio_buffer.clear()
        self._partials = {}
        self._last_text = ""
        self._sent_first_frame = False
        self._sent_final_frame = False
        # A previous session may have ended without _close_utterance() running
        # (e.g. the server dropped the connection while finalizing); without
        # this reset every interim result of the new session would be muted.
        self._finalizing_turn = False

        try:
            auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
            self._websocket = await websocket_connect(
                auth_url,
                max_size=None,
                open_timeout=self._open_timeout,
            )
        except Exception as exc:
            await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc)
            self._websocket = None
            return

        self._receive_task = self.create_task(
            self._receive_messages(),
            name="xfyun_asr_receive",
        )

    async def _finish_utterance(self) -> None:
        """Send remaining audio and the end-of-stream frame for this turn."""
        if not self._websocket or self._websocket.state is not State.OPEN:
            return

        await self._flush_audio_buffer(final=True)
        if not self._sent_first_frame:
            # Nothing was ever sent, so there is no result to wait for.
            await self._close_utterance()
            return

        if not self._sent_final_frame:
            self._finalizing_turn = True
            await self._send_payload({"data": {"status": 2}})
            self.request_finalize()
            self._sent_final_frame = True

    async def _close_utterance(self) -> None:
        """Cancel the receive task, close the socket and reset session state."""
        current_task = asyncio.current_task()
        # When called from the receive task itself, it must not cancel itself.
        if self._receive_task and self._receive_task is not current_task:
            await self.cancel_task(self._receive_task)
        self._receive_task = None

        websocket = self._websocket
        self._websocket = None
        if websocket and websocket.state is State.OPEN:
            try:
                await websocket.close()
            except Exception:
                # Best-effort close; the session is being discarded anyway.
                pass

        self._audio_buffer.clear()
        self._sent_first_frame = False
        self._sent_final_frame = False
        self._finalizing_turn = False

    async def _flush_audio_buffer(self, *, final: bool) -> None:
        """Send buffered audio in ``frame_size`` chunks.

        Args:
            final: When True, a trailing chunk shorter than ``frame_size`` is
                sent as well instead of staying buffered.
        """
        while len(self._audio_buffer) >= self._frame_size:
            chunk = bytes(self._audio_buffer[: self._frame_size])
            del self._audio_buffer[: self._frame_size]
            await self._send_audio_chunk(chunk, status=1)

        if final and self._audio_buffer:
            chunk = bytes(self._audio_buffer)
            self._audio_buffer.clear()
            await self._send_audio_chunk(chunk, status=1)

    async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None:
        """Send one audio message; the first one also carries session parameters.

        The first message of a session is always sent with status 0 and
        includes the ``common`` and ``business`` sections; ``status`` is used
        for every later message.
        """
        if not audio:
            return

        is_first = not self._sent_first_frame
        data = {
            "status": 0 if is_first else status,
            "format": f"audio/L16;rate={self.sample_rate}",
            "encoding": self._encoding,
            "audio": base64.b64encode(audio).decode("utf-8"),
        }

        if is_first:
            business = {
                "language": self._language,
                "domain": self._domain,
                "accent": self._accent,
            }
            if self._dynamic_correction:
                business["dwa"] = "wpgs"
            payload = {
                "common": {"app_id": self._app_id},
                "business": business,
                "data": data,
            }
            self._sent_first_frame = True
        else:
            payload = {"data": data}

        await self._send_payload(payload)

    async def _send_payload(self, payload: dict[str, Any]) -> None:
        """Serialize and send ``payload``; silently skipped if no socket is open."""
        if not self._websocket or self._websocket.state is not State.OPEN:
            return
        await self._websocket.send(json.dumps(payload, ensure_ascii=False))

    async def _receive_messages(self) -> None:
        """Receive loop: decode each server message and process it."""
        websocket = self._websocket
        if not websocket:
            return

        try:
            async for message in websocket:
                await self._process_response(json.loads(message))
        except Exception as exc:
            # Only report failures of the session that is still current.
            if self._websocket is websocket:
                await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc)
        finally:
            if self._websocket is websocket:
                self._websocket = None
                self._receive_task = None

    async def _process_response(self, payload: dict[str, Any]) -> None:
        """Turn one server message into interim/final transcription frames."""
        code = payload.get("code", -1)
        if code != 0:
            message = payload.get("message", "unknown error")
            sid = payload.get("sid")
            await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}")
            # Discard the failed session so the next audio chunk opens a fresh
            # one instead of feeding a session that already reported an error.
            await self._close_utterance()
            return

        data = payload.get("data")
        if not isinstance(data, dict):
            return

        is_final_response = data.get("status") == 2
        recognition = data.get("result")
        if isinstance(recognition, dict):
            text = self._apply_recognition_result(recognition)
            if text and text != self._last_text:
                self._last_text = text
                # Interim frames stop once the turn is being finalized.
                if not self._finalizing_turn and not is_final_response:
                    await self.push_frame(
                        InterimTranscriptionFrame(
                            text,
                            self._user_id,
                            time_now_iso8601(),
                            _language_or_none(self._language),
                            result=payload,
                        )
                    )

        if is_final_response:
            final_text = self._last_text
            if final_text:
                self.confirm_finalize()
                await self.push_frame(
                    TranscriptionFrame(
                        final_text,
                        self._user_id,
                        time_now_iso8601(),
                        _language_or_none(self._language),
                        result=payload,
                    )
                )
            await self._close_utterance()

    def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
        """Merge one recognition result and return the full transcript so far.

        Segments are stored under the result's ``sn``. With dynamic correction
        enabled, a result marked ``pgs == "rpl"`` first removes the segments
        whose sequence numbers fall inside its inclusive ``rg`` range.

        Returns:
            All stored segments joined in sequence-number order, or the
            previous transcript when the result carries no text.
        """
        partial = _extract_text_from_result(recognition)
        if not partial:
            return self._last_text

        sequence = recognition.get("sn")
        if isinstance(sequence, bool) or not isinstance(sequence, int):
            # No usable sequence number: treat the result as a plain append.
            sequence = max(self._partials, default=0) + 1

        if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
            try:
                start, end = recognition["rg"]
                for stale in range(int(start), int(end) + 1):
                    self._partials.pop(stale, None)
            except (TypeError, ValueError):
                logger.debug(f"Ignoring malformed Xfyun replacement rg={recognition['rg']}")

        self._partials[sequence] = partial
        return "".join(self._partials[key] for key in sorted(self._partials))
|
||||
|
||||
|
||||
def _extract_text_from_result(result: dict[str, Any]) -> str:
|
||||
words: list[str] = []
|
||||
for item in result.get("ws", []):
|
||||
for candidate in item.get("cw", []):
|
||||
word = candidate.get("w")
|
||||
if word:
|
||||
words.append(word)
|
||||
return "".join(words)
|
||||
|
||||
|
||||
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
host = parsed.netloc
|
||||
path = parsed.path or "/v2/iat"
|
||||
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
|
||||
request_line = f"GET {path} HTTP/1.1"
|
||||
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
|
||||
signature_sha = hmac.new(
|
||||
api_secret.encode("utf-8"),
|
||||
signature_origin.encode("utf-8"),
|
||||
digestmod=hashlib.sha256,
|
||||
).digest()
|
||||
signature = base64.b64encode(signature_sha).decode("utf-8")
|
||||
authorization_origin = (
|
||||
f'api_key="{api_key}", algorithm="hmac-sha256", '
|
||||
f'headers="host date request-line", signature="{signature}"'
|
||||
)
|
||||
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
|
||||
query = urlencode({"authorization": authorization, "date": date, "host": host})
|
||||
return f"{url}?{query}"
|
||||
|
||||
|
||||
def _language_or_none(value: str) -> Language | None:
    """Map a language code to a Pipecat ``Language``, or ``None`` if unknown.

    The value is tried as given first. Xfyun-style codes such as ``zh_cn`` are
    then retried in ``zh-CN`` form (lower-case language, upper-case region).
    NOTE(review): this assumes ``Language`` values use the ``xx-YY`` form -
    confirm against the pipecat enum. An unmatched code still yields ``None``.

    Args:
        value: Language code as configured for the service.

    Returns:
        The matching ``Language`` member, or ``None`` when nothing matches.
    """
    candidates = [value]
    if isinstance(value, str):
        language, separator, region = value.replace("_", "-").partition("-")
        if separator and region:
            candidates.append(f"{language.lower()}-{region.upper()}")

    for candidate in candidates:
        try:
            return Language(candidate)
        except ValueError:
            continue
    return None
|
||||
391
src/voice/xfyun_super_tts.py
Normal file
391
src/voice/xfyun_super_tts.py
Normal file
@@ -0,0 +1,391 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
from collections.abc import AsyncGenerator
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import format_datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlencode, urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.settings import TTSSettings
|
||||
from pipecat.services.tts_service import TextAggregationMode, WebsocketTTSService
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
except ModuleNotFoundError as exc:
|
||||
logger.error(f"Exception: {exc}")
|
||||
logger.error("In order to use Xfyun Super TTS, install the websockets package.")
|
||||
raise Exception(f"Missing module: {exc}") from exc
|
||||
|
||||
from .xfyun_tts import _sanitize_text_for_tts
|
||||
|
||||
|
||||
# Endpoint XfyunSuperTTSService falls back to when it is constructed without a url.
DEFAULT_XFYUN_SUPER_TTS_URL = "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6"
|
||||
# source_sample_rate values accepted by XfyunSuperTTSService.start().
VALID_SAMPLE_RATES = {8000, 16000, 24000}
|
||||
|
||||
|
||||
class XfyunSuperTTSService(WebsocketTTSService):
    """iFlytek/Xfyun Super Smart TTS using bidirectional WebSocket streaming.

    The service keeps one Xfyun synthesis session open for a Pipecat turn. Each
    ``run_tts`` call sends a text segment with status 0/1, while ``flush_audio``
    sends the terminal status 2 frame. Audio arrives on the receive task and is
    appended to the Pipecat audio context. Once a stream has completed or
    failed, the next ``run_tts`` call reconnects before sending.
    """

    def __init__(
        self,
        *,
        app_id: str,
        api_key: str,
        api_secret: str,
        voice: str,
        url: str | None = None,
        sample_rate: int = 16000,
        source_sample_rate: int = 24000,
        encoding: str = "raw",
        speed: int = 50,
        volume: int = 50,
        pitch: int = 50,
        oral_level: str = "mid",
        text_aggregation_mode: TextAggregationMode | str | None = TextAggregationMode.TOKEN,
        open_timeout: float = 30.0,
        **kwargs,
    ) -> None:
        """Create the service.

        Args:
            app_id: Xfyun application id; falls back to ``XFYUN_APP_ID``.
            api_key: Xfyun API key; falls back to ``XFYUN_API_KEY``.
            api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET``.
            voice: Voice name sent as ``vcn``.
            url: WebSocket endpoint; defaults to ``DEFAULT_XFYUN_SUPER_TTS_URL``.
            sample_rate: Sample rate of the audio frames pushed into Pipecat.
            source_sample_rate: Sample rate requested from Xfyun; audio is
                resampled when it differs from ``sample_rate``.
            encoding: Requested audio encoding; ``start`` rejects anything
                other than ``"raw"``.
            speed: Value sent as ``speed``.
            volume: Value sent as ``volume``.
            pitch: Value sent as ``pitch``.
            oral_level: Value sent as ``oral_level``.
            text_aggregation_mode: Aggregation mode; a string is converted to
                ``TextAggregationMode``.
            open_timeout: WebSocket open timeout in seconds.
            **kwargs: Forwarded to ``WebsocketTTSService``.
        """
        if isinstance(text_aggregation_mode, str):
            text_aggregation_mode = TextAggregationMode(text_aggregation_mode)

        super().__init__(
            text_aggregation_mode=text_aggregation_mode,
            push_text_frames=True,
            push_stop_frames=False,
            push_start_frame=True,
            pause_frame_processing=False,
            sample_rate=sample_rate,
            settings=TTSSettings(model=None, voice=voice, language=None),
            **kwargs,
        )
        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
        self._voice = voice
        self._url = url or DEFAULT_XFYUN_SUPER_TTS_URL
        self._source_sample_rate = source_sample_rate
        self._encoding = encoding
        self._speed = speed
        self._volume = volume
        self._pitch = pitch
        self._oral_level = oral_level
        self._open_timeout = open_timeout

        self._receive_task: asyncio.Task | None = None
        # Context whose stream is currently open on the websocket, if any.
        self._active_context_id: str | None = None
        # Contexts for which a status-0 frame has already been sent.
        self._started_contexts: set[str] = set()
        # Next text sequence number per context.
        self._seq_by_context: dict[str, int] = {}
        # UTF-8 bytes of text sent so far per context (64K cap per stream).
        self._sent_text_bytes_by_context: dict[str, int] = {}
        # True once the current session finished or failed; the next run_tts
        # then reconnects before sending.
        self._stream_completed = False

    def can_generate_metrics(self) -> bool:
        """Report that this service produces metrics."""
        return True

    async def start(self, frame: StartFrame) -> None:
        """Validate the configuration and open the websocket."""
        await super().start(frame)
        if not self._app_id or not self._api_key or not self._api_secret:
            await self.push_error(
                error_msg="Xfyun Super TTS requires app_id, api_key, and api_secret"
            )
            return
        if self._encoding != "raw":
            await self.push_error(error_msg="Xfyun Super TTS must use raw PCM audio in Pipecat")
            return
        if self._source_sample_rate not in VALID_SAMPLE_RATES:
            await self.push_error(
                error_msg=(
                    "Xfyun Super TTS source_sample_rate must be one of "
                    f"{sorted(VALID_SAMPLE_RATES)}"
                )
            )
            return
        await self._connect()

    async def stop(self, frame: EndFrame) -> None:
        """Run the base stop handling, then disconnect."""
        await super().stop(frame)
        await self._disconnect()

    async def cancel(self, frame: CancelFrame) -> None:
        """Run the base cancel handling, then disconnect."""
        await super().cancel(frame)
        await self._disconnect()

    async def flush_audio(self, context_id: str | None = None) -> None:
        """Send the terminal status-2 frame for a started stream.

        Does nothing when there is no context to flush, no websocket, or the
        context never sent a status-0 frame.
        """
        flush_id = context_id or self.get_active_audio_context_id()
        if not flush_id or not self._websocket:
            return
        if flush_id not in self._started_contexts:
            return

        logger.trace(f"{self}: flushing Xfyun Super TTS stream {flush_id}")
        await self._send_request_frame(flush_id, "", status=2)

    async def on_audio_context_interrupted(self, context_id: str) -> None:
        """Drop the interrupted stream and start over on a fresh connection."""
        await self.stop_all_metrics()
        await self._reset_context(context_id)
        await self._disconnect()
        await self._connect()
        await super().on_audio_context_interrupted(context_id)

    async def _connect(self) -> None:
        """Open the websocket and make sure the receive task is running."""
        await super()._connect()
        await self._connect_websocket()
        if self._websocket and not self._receive_task:
            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))

    async def _disconnect(self) -> None:
        """Cancel the receive task and close the websocket."""
        await super()._disconnect()
        if self._receive_task:
            await self.cancel_task(self._receive_task)
            self._receive_task = None
        await self._disconnect_websocket()

    async def _connect_websocket(self) -> None:
        """Connect with a freshly signed URL unless a socket is already open."""
        try:
            if self._websocket and self._websocket.state is State.OPEN:
                return
            logger.debug("Connecting to Xfyun Super TTS")
            auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
            self._websocket = await websocket_connect(
                auth_url,
                max_size=None,
                open_timeout=self._open_timeout,
            )
            await self._call_event_handler("on_connected")
        except Exception as exc:
            self._websocket = None
            await self.push_error(
                error_msg=f"Unable to connect to Xfyun Super TTS: {exc}",
                exception=exc,
            )
            await self._call_event_handler("on_connection_error", f"{exc}")

    async def _disconnect_websocket(self) -> None:
        """Close the websocket and clear all per-stream state."""
        try:
            await self.stop_all_metrics()
            if self._websocket:
                logger.debug("Disconnecting from Xfyun Super TTS")
                await self._websocket.close()
        except Exception as exc:
            await self.push_error(
                error_msg=f"Error closing Xfyun Super TTS websocket: {exc}",
                exception=exc,
            )
        finally:
            await self.remove_active_audio_context()
            self._websocket = None
            self._active_context_id = None
            self._started_contexts.clear()
            self._seq_by_context.clear()
            self._sent_text_bytes_by_context.clear()
            self._stream_completed = False
            await self._call_event_handler("on_disconnected")

    def _get_websocket(self):
        """Return the current websocket.

        Raises:
            Exception: If no websocket is connected.
        """
        if self._websocket:
            return self._websocket
        raise Exception("Websocket not connected")

    async def _finish_stream(self, context_id: str | None) -> None:
        """End the stream for ``context_id`` and mark the session as used up.

        Appends a ``TTSStoppedFrame`` and removes the audio context when it is
        still available, clears the per-context bookkeeping, and flags the
        session so that the next ``run_tts`` reconnects before sending.
        """
        if context_id and self.audio_context_available(context_id):
            await self.append_to_audio_context(
                context_id, TTSStoppedFrame(context_id=context_id)
            )
            await self.remove_audio_context(context_id)
        if context_id:
            await self._reset_context(context_id)
        self._stream_completed = True

    async def _receive_messages(self) -> None:
        """Receive loop: route audio to the active context and end streams."""
        async for raw_message in self._get_websocket():
            try:
                message = json.loads(raw_message)
            except json.JSONDecodeError:
                logger.warning(f"{self}: received non-JSON Xfyun Super TTS message: {raw_message!r}")
                continue

            header = message.get("header") or {}
            code = header.get("code", -1)
            sid = header.get("sid")
            context_id = self._active_context_id

            if code != 0:
                error_message = header.get("message", "unknown error")
                await self.push_error(
                    error_msg=f"Xfyun Super TTS error code={code}, sid={sid}: {error_message}"
                )
                # A failed session must not be reused: finishing the stream
                # also forces a reconnect before the next request, instead of
                # sending a new status-0 frame on the errored session.
                await self._finish_stream(context_id)
                continue

            audio_obj = (message.get("payload") or {}).get("audio") or {}
            audio_b64 = audio_obj.get("audio")
            if audio_b64 and context_id and self.audio_context_available(context_id):
                await self.stop_ttfb_metrics()
                audio = base64.b64decode(audio_b64)
                if self._source_sample_rate != self.sample_rate:
                    audio = await self._resampler.resample(
                        audio, self._source_sample_rate, self.sample_rate
                    )
                frame = TTSAudioRawFrame(audio, self.sample_rate, 1, context_id=context_id)
                await self.append_to_audio_context(context_id, frame)

            audio_status = audio_obj.get("status")
            header_status = header.get("status")
            if audio_status == 2 or header_status == 2:
                await self._finish_stream(context_id)

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame | None, None]:
        """Send one text segment of ``context_id`` to the synthesis stream.

        Audio is delivered by the receive task, so on success this generator
        yields only ``None``. On failure it yields an ``ErrorFrame`` (plus a
        ``TTSStoppedFrame`` when the send itself failed) and reconnects.
        """
        sanitized = _sanitize_text_for_tts(text)
        if not sanitized:
            return

        if not self._is_streaming_tokens:
            logger.debug(f"{self}: Generating Xfyun Super TTS [{sanitized}]")
        else:
            logger.trace(f"{self}: Generating Xfyun Super TTS [{sanitized}]")

        # A finished or failed session cannot take another stream.
        if self._stream_completed and self._websocket:
            await self._disconnect()
            await self._connect()

        # Anything other than OPEN (including CLOSING) cannot carry a request.
        if not self._websocket or self._websocket.state is not State.OPEN:
            await self._connect()

        if self._active_context_id and self._active_context_id != context_id:
            yield ErrorFrame(
                error=(
                    "Xfyun Super TTS supports one active synthesis stream per WebSocket; "
                    f"active={self._active_context_id}, new={context_id}"
                )
            )
            return

        try:
            status = 0 if context_id not in self._started_contexts else 1
            await self._send_request_frame(context_id, sanitized, status=status)
            await self.start_tts_usage_metrics(sanitized)
        except Exception as exc:
            yield ErrorFrame(error=f"Xfyun Super TTS request failed: {exc}")
            yield TTSStoppedFrame(context_id=context_id)
            await self._disconnect()
            await self._connect()
            return

        yield None

    async def _send_request_frame(self, context_id: str, text: str, *, status: int) -> None:
        """Send one request frame and advance the per-context counters.

        Args:
            context_id: Stream the frame belongs to.
            text: Text segment (may be empty for the terminal frame).
            status: 0 for the first frame, 1 for a continuation, 2 to finish.

        Raises:
            ValueError: If the stream's total text would exceed 64K UTF-8 bytes.
            Exception: If no websocket is connected.
        """
        if status == 0:
            self._active_context_id = context_id
            self._started_contexts.add(context_id)

        seq = self._seq_by_context.get(context_id, 0)
        text_bytes = text.encode("utf-8")
        total_bytes = self._sent_text_bytes_by_context.get(context_id, 0) + len(text_bytes)
        if total_bytes > 65536:
            raise ValueError("Xfyun Super TTS text must not exceed 64K UTF-8 bytes per stream")

        frame = self._build_request_frame(text, status=status, seq=seq)
        await self._get_websocket().send(json.dumps(frame, ensure_ascii=False))

        # Counters advance only after the frame was actually sent.
        self._seq_by_context[context_id] = seq + 1
        self._sent_text_bytes_by_context[context_id] = total_bytes

    def _build_request_frame(self, text: str, *, status: int, seq: int) -> dict[str, Any]:
        """Build the JSON-serializable request for one text segment."""
        return {
            "header": {
                "app_id": self._app_id,
                "status": status,
            },
            "parameter": {
                "oral": {
                    "oral_level": self._oral_level,
                },
                "tts": {
                    "vcn": self._voice,
                    "speed": self._speed,
                    "volume": self._volume,
                    "pitch": self._pitch,
                    "bgs": 0,
                    "reg": 0,
                    "rdn": 0,
                    "rhy": 0,
                    "audio": {
                        "encoding": self._encoding,
                        "sample_rate": self._source_sample_rate,
                        "channels": 1,
                        "bit_depth": 16,
                        "frame_size": 0,
                    },
                },
            },
            "payload": {
                "text": {
                    "encoding": "utf8",
                    "compress": "raw",
                    "format": "plain",
                    "status": status,
                    "seq": seq,
                    "text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
                },
            },
        }

    async def _reset_context(self, context_id: str) -> None:
        """Forget all bookkeeping kept for ``context_id``."""
        self._started_contexts.discard(context_id)
        self._seq_by_context.pop(context_id, None)
        self._sent_text_bytes_by_context.pop(context_id, None)
        if self._active_context_id == context_id:
            self._active_context_id = None
|
||||
|
||||
|
||||
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme not in {"ws", "wss"} or not parsed.hostname:
|
||||
raise ValueError(f"invalid Xfyun Super TTS WebSocket URL: {url}")
|
||||
|
||||
host = parsed.hostname
|
||||
path = parsed.path or "/"
|
||||
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
|
||||
request_line = f"GET {path} HTTP/1.1"
|
||||
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
|
||||
signature_sha = hmac.new(
|
||||
api_secret.encode("utf-8"),
|
||||
signature_origin.encode("utf-8"),
|
||||
digestmod=hashlib.sha256,
|
||||
).digest()
|
||||
signature = base64.b64encode(signature_sha).decode("utf-8")
|
||||
authorization_origin = (
|
||||
f'api_key="{api_key}", algorithm="hmac-sha256", '
|
||||
f'headers="host date request-line", signature="{signature}"'
|
||||
)
|
||||
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
|
||||
query = urlencode({"authorization": authorization, "date": date, "host": host})
|
||||
return f"{url}?{query}"
|
||||
257
src/voice/xfyun_tts.py
Normal file
257
src/voice/xfyun_tts.py
Normal file
@@ -0,0 +1,257 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import format_datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlencode, urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame
|
||||
from pipecat.services.settings import TTSSettings
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from websockets.asyncio.client import connect
|
||||
|
||||
|
||||
# Endpoint XfyunTTSService falls back to when it is constructed without a url.
DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
|
||||
|
||||
# Strip characters Xfyun's online TTS cannot synthesize. The engine silently
|
||||
# rejects (or returns empty audio for) text containing emoji and other
|
||||
# non-BMP symbols, which surfaces as "request finished without audio data".
|
||||
_EMOJI_AND_SYMBOL_RE = re.compile(
|
||||
"["
|
||||
"\U0001F300-\U0001FAFF" # misc pictographs, emoji, symbols, transport, etc.
|
||||
"\U00002600-\U000027BF" # misc symbols and dingbats
|
||||
"\U0001F1E6-\U0001F1FF" # regional indicators (flags)
|
||||
"\uFE00-\uFE0F" # variation selectors
|
||||
"\u200D" # zero-width joiner
|
||||
"]",
|
||||
flags=re.UNICODE,
|
||||
)
|
||||
|
||||
|
||||
class XfyunTTSService(TTSService):
|
||||
"""iFlytek/Xfyun online TTS service for Pipecat.
|
||||
|
||||
Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
|
||||
receives one JSON request per synthesis, and streams text WebSocket
|
||||
messages containing base64-encoded audio chunks. This service requests
|
||||
raw PCM so the chunks can become Pipecat audio frames without MP3 decode.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    *,
    app_id: str,
    api_key: str,
    api_secret: str,
    voice: str,
    url: str | None = None,
    sample_rate: int = 16000,
    source_sample_rate: int = 16000,
    encoding: str = "raw",
    text_encoding: str = "UTF8",
    speed: int = 50,
    volume: int = 50,
    pitch: int = 50,
    timeout: float = 30.0,
    **kwargs,
) -> None:
    """Create the service.

    Args:
        app_id: Xfyun application id; falls back to ``XFYUN_APP_ID``.
        api_key: Xfyun API key; falls back to ``XFYUN_API_KEY``.
        api_secret: Xfyun API secret; falls back to ``XFYUN_API_SECRET``.
        voice: Voice name, also recorded in the TTS settings.
        url: WebSocket endpoint; defaults to ``DEFAULT_XFYUN_TTS_URL``.
        sample_rate: Sample rate passed to the base ``TTSService``.
        source_sample_rate: Sample rate of the audio received from Xfyun.
        encoding: Audio encoding; ``run_tts`` rejects anything but ``"raw"``.
        text_encoding: Stored text-encoding setting.
        speed: Stored speed setting.
        volume: Stored volume setting.
        pitch: Stored pitch setting.
        timeout: Used as the WebSocket open timeout, in seconds.
        **kwargs: Forwarded to ``TTSService``.
    """
    super().__init__(
        sample_rate=sample_rate,
        settings=TTSSettings(model=None, voice=voice, language=None),
        **kwargs,
    )

    # Credentials: explicit arguments win, environment variables fill gaps.
    env = os.environ
    self._app_id = app_id or env.get("XFYUN_APP_ID", "")
    self._api_key = api_key or env.get("XFYUN_API_KEY", "")
    self._api_secret = api_secret or env.get("XFYUN_API_SECRET", "")

    # Endpoint and connection behaviour.
    self._url = url or DEFAULT_XFYUN_TTS_URL
    self._timeout = timeout

    # Synthesis parameters.
    self._voice = voice
    self._source_sample_rate = source_sample_rate
    self._encoding = encoding
    self._text_encoding = text_encoding
    self._speed = speed
    self._volume = volume
    self._pitch = pitch

    # Set to None here and again at the start of every request; read by
    # run_tts when a request ends without producing audio.
    self._last_failure_detail: str | None = None
|
||||
|
||||
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
    """Synthesize ``text`` and yield the resulting audio frames.

    The text is sanitized first; nothing is yielded when the original or the
    sanitized text is empty. An ``ErrorFrame`` is yielded when credentials
    are missing, the sanitized text is 8000 UTF-8 bytes or longer, the
    encoding is not ``"raw"``, the request raises, or no audio came back.

    Args:
        text: Text to speak.
        context_id: Audio context the produced frames belong to.
    """
    if not text:
        return

    if not (self._app_id and self._api_key and self._api_secret):
        yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
        return

    sanitized = _sanitize_text_for_tts(text)
    if not sanitized:
        logger.debug(
            f"{self}: skipping Xfyun TTS, text became empty after sanitization "
            f"(original={text!r})"
        )
        return
    if sanitized != text:
        logger.debug(
            f"{self}: sanitized Xfyun TTS text "
            f"(original={text!r}, sanitized={sanitized!r})"
        )

    encoded_size = len(sanitized.encode("utf-8"))
    if encoded_size >= 8000:
        yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes")
        return

    if self._encoding != "raw":
        yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
        return

    try:
        await self.start_tts_usage_metrics(sanitized)

        produced_audio = False
        frames = self._stream_audio_frames_from_iterator(
            self._iter_audio_chunks(sanitized),
            in_sample_rate=self._source_sample_rate,
            context_id=context_id,
        )
        async for frame in frames:
            if not produced_audio:
                # The first audio frame marks the end of time-to-first-byte.
                await self.stop_ttfb_metrics()
                produced_audio = True
            yield frame

        if not produced_audio:
            detail = self._last_failure_detail or "no audio frames received"
            yield ErrorFrame(
                error=(
                    f"Xfyun TTS request finished without audio data ({detail}); "
                    f"text={sanitized!r}"
                )
            )
    except Exception as exc:
        yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")
|
||||
|
||||
async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
    """Send one synthesis request over a websocket and yield decoded audio.

    Each server message is parsed as JSON. A non-zero ``code`` raises
    ``RuntimeError``; otherwise any base64 ``data.audio`` payload is decoded
    and yielded. Reading stops when a message carries ``data.status == 2``
    or the connection ends. If no audio bytes were received, a diagnostic
    summary is stored in ``self._last_failure_detail`` and logged.

    Args:
        text: Text to synthesize.

    Raises:
        RuntimeError: If a message reports a non-zero ``code``.
    """
    request = self._build_request_frame(text)
    auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)

    # Reset the diagnostic left over from any previous request.
    self._last_failure_detail = None
    message_count = 0
    pcm_byte_total = 0
    latest_status: int | None = None
    latest_sid: str | None = None
    finished = False

    async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
        await websocket.send(json.dumps(request, ensure_ascii=False))

        async for raw_message in websocket:
            message_count += 1
            payload = json.loads(raw_message)
            code = payload.get("code", -1)
            sid = payload.get("sid")
            if sid:
                latest_sid = sid
            if code != 0:
                err_msg = payload.get("message", "unknown error")
                raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")

            data = payload.get("data")
            if not isinstance(data, dict):
                continue

            latest_status = data.get("status", latest_status)

            encoded_audio = data.get("audio")
            if encoded_audio:
                chunk = base64.b64decode(encoded_audio)
                pcm_byte_total += len(chunk)
                yield chunk

            if data.get("status") == 2:
                finished = True
                break

    if pcm_byte_total == 0:
        self._last_failure_detail = (
            f"frames={message_count}, audio_bytes=0, "
            f"last_status={latest_status}, saw_status_2={finished}, sid={latest_sid}"
        )
        logger.warning(
            f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
        )
|
||||
|
||||
def _build_request_frame(self, text: str) -> dict[str, Any]:
|
||||
business: dict[str, Any] = {
|
||||
"aue": self._encoding,
|
||||
"auf": f"audio/L16;rate={self._source_sample_rate}",
|
||||
"vcn": self._voice,
|
||||
"speed": self._speed,
|
||||
"volume": self._volume,
|
||||
"pitch": self._pitch,
|
||||
"tte": self._text_encoding,
|
||||
}
|
||||
|
||||
return {
|
||||
"common": {"app_id": self._app_id},
|
||||
"business": business,
|
||||
"data": {
|
||||
"status": 2,
|
||||
"text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _sanitize_text_for_tts(text: str) -> str:
    """Remove characters Xfyun's online TTS cannot synthesize.

    Per the original author's note, the Xfyun ``/v2/tts`` engine silently
    drops or rejects emoji, pictographs, dingbats, regional-indicator flags,
    variation selectors, and zero-width joiners, and such input can make a
    synthesis finish without any audio data.

    Processing steps:
      1. Delete everything matched by ``_EMOJI_AND_SYMBOL_RE``.
      2. Drop "Symbol, Other" (``So``) codepoints and every codepoint whose
         Unicode category starts with ``C``, except newline, carriage
         return, and tab.
      3. Collapse each run of whitespace into one space and strip the ends.

    Falsy input is returned unchanged.
    """
    if not text:
        return text

    def _is_speakable(ch: str) -> bool:
        category = unicodedata.category(ch)
        if category == "So":
            return False
        # Keep common whitespace even though it falls in a "C" category.
        return ch in ("\n", "\r", "\t") or not category.startswith("C")

    without_emoji = _EMOJI_AND_SYMBOL_RE.sub("", text)
    kept = "".join(ch for ch in without_emoji if _is_speakable(ch))
    return re.sub(r"\s+", " ", kept).strip()
|
||||
|
||||
|
||||
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
host = parsed.netloc
|
||||
path = parsed.path or "/v2/tts"
|
||||
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
|
||||
request_line = f"GET {path} HTTP/1.1"
|
||||
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
|
||||
signature_sha = hmac.new(
|
||||
api_secret.encode("utf-8"),
|
||||
signature_origin.encode("utf-8"),
|
||||
digestmod=hashlib.sha256,
|
||||
).digest()
|
||||
signature = base64.b64encode(signature_sha).decode("utf-8")
|
||||
authorization_origin = (
|
||||
f'api_key="{api_key}", algorithm="hmac-sha256", '
|
||||
f'headers="host date request-line", signature="{signature}"'
|
||||
)
|
||||
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
|
||||
query = urlencode({"authorization": authorization, "date": date, "host": host})
|
||||
return f"{url}?{query}"
|
||||
106
static/voice-demo/README.md
Normal file
106
static/voice-demo/README.md
Normal file
@@ -0,0 +1,106 @@
|
||||
# Webpage Example — Realtime Voice Chat
|
||||
|
||||
A self-contained browser client for the engine's product websocket
|
||||
(`/ws-product`, protocol `va.ws.v1`).
|
||||
|
||||
## Features
|
||||
|
||||
- **Connect / Disconnect** to any `ws://` or `wss://` URL.
|
||||
- **Microphone selector + mic on/off toggle** — available input devices
|
||||
are listed with `enumerateDevices`, and `getUserMedia` is requested with
|
||||
`echoCancellation`, `noiseSuppression`, and `autoGainControl` so the
|
||||
browser handles AEC against the bot's voice.
|
||||
- **Text composer** — type a message and press <kbd>Enter</kbd> to send
|
||||
an `input.text` event (Shift+Enter for newline). Sending interrupts
|
||||
any in-flight bot audio so the next reply is heard cleanly.
|
||||
- **Chat history** rendered from `input.transcript.final` (you, when
|
||||
spoken), streamed `response.text.delta` / `response.text.final`
|
||||
(assistant — deltas arrive ahead of the synthesized audio), and locally
|
||||
for text you submit (the engine doesn't echo text input back as a
|
||||
transcript).
|
||||
- **WebSocket log** panel for connection state and compact send/receive
|
||||
events. Audio chunks are summarized so the UI does not flood.
|
||||
- **Gapless TTS playback** by scheduling each `response.audio.delta`
|
||||
chunk back-to-back on the AudioContext.
|
||||
- **Live VU meter** + mic and bot activity indicators.
|
||||
- **Clear** button to reset history.
|
||||
|
||||
No build step, no dependencies — just three files plus an AudioWorklet.
|
||||
|
||||
## Layout
|
||||
|
||||
```text
|
||||
examples/webpage/
|
||||
├── index.html
|
||||
├── styles.css
|
||||
├── app.js
|
||||
└── pcm-recorder.worklet.js
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
1. Start the engine (default port `8000`):
|
||||
|
||||
```bash
|
||||
cd AI-VideoAssistant-engine-v5-pipecat-minimal
|
||||
source .venv/bin/activate
|
||||
export OPENAI_API_KEY=...
|
||||
uvicorn engine.main:app --host 127.0.0.1 --port 8000
|
||||
```
|
||||
|
||||
2. Open the demo page served by the same process:
|
||||
|
||||
```text
|
||||
http://127.0.0.1:8000/voice-demo/
|
||||
```
|
||||
|
||||
The default websocket URL is derived from the page host
|
||||
(`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a
|
||||
microphone if needed, click **Enable mic**, and start speaking.
|
||||
|
||||
Mount path and on/off are controlled in `config.json`:
|
||||
|
||||
```json
|
||||
"server": {
|
||||
"serve_webpage": true,
|
||||
"webpage_mount": "/voice-demo"
|
||||
}
|
||||
```
|
||||
|
||||
Set `"serve_webpage": false` in production if you serve the UI elsewhere.
|
||||
|
||||
### Standalone static server (optional)
|
||||
|
||||
You can still serve the files from another port for UI-only iteration.
|
||||
Add that origin to `server.cors_origins` in `config.json` if needed:
|
||||
|
||||
```bash
|
||||
cd AI-VideoAssistant-engine-v5-pipecat-minimal/examples/webpage
|
||||
python -m http.server 8080
|
||||
```
|
||||
|
||||
Then open <http://localhost:8080> and point the URL field at
|
||||
`ws://127.0.0.1:8000/ws-product`.
|
||||
|
||||
> The browser's mic API requires a secure context. `http://localhost`
|
||||
> qualifies; if you serve from another host, use HTTPS and a `wss://`
|
||||
> URL.
|
||||
|
||||
## Audio details
|
||||
|
||||
- Input: mono Float32 from `getUserMedia` is resampled in the
|
||||
AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and
|
||||
sent as **binary** websocket messages (the server accepts either
|
||||
binary or the JSON+base64 form).
|
||||
- Output: each `response.audio.delta` carries base64-encoded PCM16 @
|
||||
16 kHz; chunks are decoded and scheduled back-to-back through Web
|
||||
Audio. The browser handles resampling to the device rate.
|
||||
|
||||
## Notes
|
||||
|
||||
- Use headphones if you still hear echo despite browser AEC; the bot's
|
||||
voice leaking back into the open mic is the most common cause of
|
||||
feedback loops.
|
||||
- The engine's session has an inactivity timeout
|
||||
(`session.inactivity_timeout_sec` in `config.json`). If the bot
|
||||
doesn't respond after a long silence, reconnect.
|
||||
1544
static/voice-demo/app.js
Normal file
1544
static/voice-demo/app.js
Normal file
File diff suppressed because it is too large
Load Diff
288
static/voice-demo/index.html
Normal file
288
static/voice-demo/index.html
Normal file
@@ -0,0 +1,288 @@
|
||||
<!doctype html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>VA Voice Chat — /ws-product</title>
|
||||
<link rel="stylesheet" href="./styles.css" />
|
||||
</head>
|
||||
<body>
|
||||
<main class="app">
|
||||
<header class="app__header">
|
||||
<div class="brand">
|
||||
<span class="brand__dot" aria-hidden="true"></span>
|
||||
<h1>VA Voice Chat</h1>
|
||||
</div>
|
||||
|
||||
<div class="connection">
|
||||
<label class="connection__field">
|
||||
<span>服务器地址</span>
|
||||
<input
|
||||
id="ws-url"
|
||||
type="text"
|
||||
placeholder="ws://host/ws-product"
|
||||
spellcheck="false"
|
||||
autocomplete="off"
|
||||
/>
|
||||
</label>
|
||||
<label class="connection__field connection__field--chat">
|
||||
<span>会话 ID</span>
|
||||
<div class="chat-id-control">
|
||||
<input
|
||||
id="chat-id"
|
||||
type="text"
|
||||
placeholder="可选"
|
||||
spellcheck="false"
|
||||
autocomplete="off"
|
||||
/>
|
||||
<button
|
||||
id="copy-chat-id-btn"
|
||||
class="chat-id-control__copy"
|
||||
type="button"
|
||||
disabled
|
||||
title="复制会话 ID"
|
||||
aria-label="复制会话 ID"
|
||||
>
|
||||
<svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
|
||||
<rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/>
|
||||
<path d="M3 11V3.5A1.5 1.5 0 0 1 4.5 2H11" stroke="currentColor" stroke-width="1.4" stroke-linecap="round"/>
|
||||
</svg>
|
||||
<svg class="copy-icon copy-icon--check" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
|
||||
<path d="M3 8.5l3.5 3.5 6.5-7" stroke="currentColor" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
</label>
|
||||
<button id="connect-btn" class="btn btn--primary" type="button">
|
||||
连接
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div class="status">
|
||||
<span id="status-dot" class="status__dot status__dot--idle"></span>
|
||||
<span id="status-text" class="status__text">未连接</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<div class="app__body">
|
||||
<div class="app__main">
|
||||
<div id="conversation" class="conversation">
|
||||
<aside
|
||||
id="camera-drawer"
|
||||
class="camera-drawer"
|
||||
aria-label="拍照步骤"
|
||||
aria-hidden="true"
|
||||
>
|
||||
<div class="camera-drawer__panel">
|
||||
<div class="camera-drawer__header">
|
||||
<div>
|
||||
<p class="camera-drawer__eyebrow">拍照</p>
|
||||
<h2>拍照步骤</h2>
|
||||
</div>
|
||||
<span id="camera-state" class="camera-drawer__state">状态 -</span>
|
||||
</div>
|
||||
|
||||
<div id="camera-preview" class="camera-drawer__preview">
|
||||
<video
|
||||
id="camera-video"
|
||||
class="camera-drawer__video"
|
||||
playsinline
|
||||
muted
|
||||
autoplay
|
||||
></video>
|
||||
<img
|
||||
id="camera-photo"
|
||||
class="camera-drawer__photo"
|
||||
alt="已选择图片预览"
|
||||
/>
|
||||
<span class="camera-drawer__corner camera-drawer__corner--tl"></span>
|
||||
<span class="camera-drawer__corner camera-drawer__corner--tr"></span>
|
||||
<span class="camera-drawer__corner camera-drawer__corner--bl"></span>
|
||||
<span class="camera-drawer__corner camera-drawer__corner--br"></span>
|
||||
<span class="camera-drawer__lens"></span>
|
||||
<span class="camera-drawer__scan"></span>
|
||||
<span id="camera-placeholder" class="camera-drawer__placeholder">
|
||||
打开摄像头实时拍摄,或从下方选择 / 上传图片
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<p id="camera-question" class="camera-drawer__question"></p>
|
||||
|
||||
<div
|
||||
id="camera-samples"
|
||||
class="camera-drawer__samples"
|
||||
aria-label="示例图片,点击选择"
|
||||
></div>
|
||||
|
||||
<div class="camera-drawer__sources">
|
||||
<label
|
||||
class="btn btn--ghost camera-drawer__source"
|
||||
>
|
||||
上传图片
|
||||
<input
|
||||
id="camera-upload"
|
||||
type="file"
|
||||
accept="image/*"
|
||||
hidden
|
||||
/>
|
||||
</label>
|
||||
<button
|
||||
id="camera-start-btn"
|
||||
class="btn btn--ghost camera-drawer__source"
|
||||
type="button"
|
||||
title="打开摄像头"
|
||||
>
|
||||
使用摄像头
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<label
|
||||
id="camera-device-row"
|
||||
class="device-picker camera-drawer__device-row"
|
||||
hidden
|
||||
>
|
||||
<span class="device-picker__label">选择摄像头</span>
|
||||
<select
|
||||
id="camera-device-select"
|
||||
class="device-picker__select"
|
||||
disabled
|
||||
>
|
||||
<option value="">默认摄像头</option>
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<button
|
||||
id="camera-done-btn"
|
||||
class="btn btn--primary camera-drawer__button"
|
||||
type="button"
|
||||
disabled
|
||||
>
|
||||
拍摄完成
|
||||
</button>
|
||||
<canvas id="camera-canvas" hidden></canvas>
|
||||
</div>
|
||||
</aside>
|
||||
|
||||
<section class="chat" aria-label="对话记录">
|
||||
<div id="chat-log" class="chat__log" role="log" aria-live="polite">
|
||||
<div class="chat__empty">
|
||||
<p>连接服务、开启麦克风后即可开始对话。</p>
|
||||
<p class="chat__hint">
|
||||
音频通过 <code>/ws-product</code> 以 PCM16 单声道 16 kHz
|
||||
传输。
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<footer class="controls" aria-label="操作栏">
|
||||
<div class="meter" aria-hidden="true">
|
||||
<div id="meter-fill" class="meter__fill"></div>
|
||||
</div>
|
||||
|
||||
<form id="composer" class="composer" autocomplete="off">
|
||||
<textarea
|
||||
id="text-input"
|
||||
class="composer__input"
|
||||
rows="1"
|
||||
placeholder="输入消息,或使用麦克风…"
|
||||
disabled
|
||||
></textarea>
|
||||
<button
|
||||
id="send-btn"
|
||||
class="btn btn--primary composer__send"
|
||||
type="submit"
|
||||
disabled
|
||||
title="发送消息 (Enter)"
|
||||
>
|
||||
发送
|
||||
</button>
|
||||
</form>
|
||||
|
||||
<div class="controls__row">
|
||||
<label class="device-picker">
|
||||
<span class="device-picker__label">麦克风</span>
|
||||
<select id="mic-select" class="device-picker__select" disabled>
|
||||
<option value="">默认麦克风</option>
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<button
|
||||
id="mic-btn"
|
||||
class="mic-btn"
|
||||
type="button"
|
||||
disabled
|
||||
aria-pressed="false"
|
||||
title="麦克风已关闭"
|
||||
>
|
||||
<svg
|
||||
class="mic-btn__icon"
|
||||
viewBox="0 0 24 24"
|
||||
width="24"
|
||||
height="24"
|
||||
aria-hidden="true"
|
||||
>
|
||||
<path
|
||||
d="M12 14a3 3 0 0 0 3-3V6a3 3 0 1 0-6 0v5a3 3 0 0 0 3 3Z"
|
||||
fill="currentColor"
|
||||
/>
|
||||
<path
|
||||
d="M19 11a1 1 0 1 0-2 0 5 5 0 0 1-10 0 1 1 0 1 0-2 0 7 7 0 0 0 6 6.92V21a1 1 0 1 0 2 0v-3.08A7 7 0 0 0 19 11Z"
|
||||
fill="currentColor"
|
||||
/>
|
||||
</svg>
|
||||
<span class="mic-btn__label">开启麦克风</span>
|
||||
</button>
|
||||
|
||||
<div class="indicators">
|
||||
<span id="mic-indicator" class="indicator">
|
||||
<span class="indicator__dot indicator__dot--mic"></span>
|
||||
<span class="indicator__label">麦克风</span>
|
||||
</span>
|
||||
<span id="bot-indicator" class="indicator">
|
||||
<span class="indicator__dot indicator__dot--bot"></span>
|
||||
<span class="indicator__label">助手</span>
|
||||
</span>
|
||||
<span id="state-indicator" class="indicator indicator--state">
|
||||
<span class="indicator__dot indicator__dot--state"></span>
|
||||
<span id="state-label" class="indicator__label">状态 -</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<button id="clear-btn" class="btn btn--ghost" type="button">
|
||||
清空
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<p class="hint">
|
||||
按 <kbd>Enter</kbd> 发送,<kbd>Shift</kbd>+<kbd>Enter</kbd>
|
||||
换行。发送文字会打断正在说话的助手。
|
||||
浏览器回声消除已开启,如有回音请使用耳机。
|
||||
</p>
|
||||
</footer>
|
||||
</div>
|
||||
|
||||
<section class="ws-log" aria-label="WebSocket 日志">
|
||||
<div class="ws-log__header">
|
||||
<div class="ws-log__header-left">
|
||||
<h2>WebSocket 日志</h2>
|
||||
<div class="ws-log__legend" aria-hidden="true">
|
||||
<span class="ws-log__legend-item ws-log__legend-item--send">发送</span>
|
||||
<span class="ws-log__legend-item ws-log__legend-item--recv">接收</span>
|
||||
</div>
|
||||
</div>
|
||||
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
|
||||
清空日志
|
||||
</button>
|
||||
</div>
|
||||
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
|
||||
<div class="ws-log__empty">暂无 WebSocket 事件。</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script type="module" src="./app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
104
static/voice-demo/pcm-recorder.worklet.js
Normal file
104
static/voice-demo/pcm-recorder.worklet.js
Normal file
@@ -0,0 +1,104 @@
|
||||
/**
|
||||
* PCM Recorder AudioWorklet.
|
||||
*
|
||||
* Captures mono Float32 mic samples at the AudioContext's native rate,
|
||||
* resamples them to a target sample rate (default 16 kHz) with linear
|
||||
* interpolation, then ships PCM16 frames of a fixed duration (default 20 ms)
|
||||
* to the main thread via `port.postMessage(ArrayBuffer)`.
|
||||
*
|
||||
* It also computes a simple RMS level per frame for the UI VU meter so the
|
||||
* main thread doesn't have to re-process the audio.
|
||||
*/
|
||||
|
||||
class PcmRecorderProcessor extends AudioWorkletProcessor {
  /**
   * @param {object} [options] AudioWorkletNode options. `processorOptions`
   *   may carry `targetSampleRate` (default 16000) and `frameMs`
   *   (default 20).
   */
  constructor(options) {
    super();

    const opts = (options && options.processorOptions) || {};
    this._targetSampleRate = opts.targetSampleRate || 16000;
    this._frameMs = opts.frameMs || 20;
    this._frameSamples = Math.round(
      (this._targetSampleRate * this._frameMs) / 1000,
    );

    // Resampling state.
    // `ratio` is input samples per output sample. `sampleRate` is the
    // global provided by the AudioWorklet scope.
    this._ratio = sampleRate / this._targetSampleRate;
    this._inputBuffer = new Float32Array(0);
    // Float position in `_inputBuffer` for the next output sample. It may
    // point past the end of the buffer, i.e. into samples not yet received.
    this._inputOffset = 0;

    // Output framing state.
    this._frameBuffer = new Int16Array(this._frameSamples);
    this._frameIndex = 0;

    // VU meter accumulator.
    this._rmsSumSquares = 0;
    this._rmsCount = 0;
  }

  /**
   * Resample the first channel of the first input to the target rate with
   * linear interpolation, convert to PCM16, and post each completed frame
   * to the main thread as `{ type: "frame", buffer, rms }` (the buffer is
   * transferred, not copied).
   *
   * @param {Float32Array[][]} inputs Audio inputs for this render quantum.
   * @returns {boolean} Always `true`, to keep the processor alive.
   */
  process(inputs) {
    const input = inputs[0];
    if (!input || input.length === 0) return true;
    const channel = input[0];
    if (!channel || channel.length === 0) return true;

    // Append new samples to the input buffer.
    const merged = new Float32Array(this._inputBuffer.length + channel.length);
    merged.set(this._inputBuffer, 0);
    merged.set(channel, this._inputBuffer.length);
    this._inputBuffer = merged;

    const ratio = this._ratio;
    const inLen = this._inputBuffer.length;
    let pos = this._inputOffset;

    // An output sample needs both neighbours (lo, lo + 1) to be available.
    while (pos + 1 < inLen) {
      const lo = Math.floor(pos);
      const hi = lo + 1;
      const w = pos - lo;
      const sample =
        this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w;

      this._rmsSumSquares += sample * sample;
      this._rmsCount += 1;

      // Clamp to [-1, 1] and scale asymmetrically to the int16 range.
      let s = sample;
      if (s > 1) s = 1;
      else if (s < -1) s = -1;
      this._frameBuffer[this._frameIndex++] =
        s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);

      if (this._frameIndex === this._frameSamples) {
        const frame = new Int16Array(this._frameSamples);
        frame.set(this._frameBuffer);
        const rms =
          this._rmsCount > 0
            ? Math.sqrt(this._rmsSumSquares / this._rmsCount)
            : 0;
        this.port.postMessage(
          { type: "frame", buffer: frame.buffer, rms },
          [frame.buffer],
        );
        this._frameIndex = 0;
        this._rmsSumSquares = 0;
        this._rmsCount = 0;
      }

      pos += ratio;
    }

    // Trim consumed samples from the input buffer; keep at least the last
    // sample we still need to interpolate against on the next call.
    // When downsampling, `pos` can step past the end of the buffer. Only
    // samples that actually exist can be dropped, so clamp to `inLen`;
    // the remaining offset carries over into the next call. Subtracting an
    // unclamped value would discard that overshoot and make the output run
    // slightly fast.
    const consumed = Math.min(Math.floor(pos), inLen);
    if (consumed > 0) {
      this._inputBuffer = this._inputBuffer.slice(consumed);
      pos -= consumed;
    }
    this._inputOffset = pos;

    return true;
  }
}
|
||||
|
||||
registerProcessor("pcm-recorder", PcmRecorderProcessor);
|
||||
BIN
static/voice-demo/samples/.DS_Store
vendored
Normal file
BIN
static/voice-demo/samples/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
static/voice-demo/samples/damage1.png
Normal file
BIN
static/voice-demo/samples/damage1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 273 KiB |
BIN
static/voice-demo/samples/damage2.png
Normal file
BIN
static/voice-demo/samples/damage2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 323 KiB |
BIN
static/voice-demo/samples/plate1.jpg
Normal file
BIN
static/voice-demo/samples/plate1.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.5 KiB |
BIN
static/voice-demo/samples/plate2.jpg
Normal file
BIN
static/voice-demo/samples/plate2.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 229 KiB |
BIN
static/voice-demo/samples/user1.jpg
Normal file
BIN
static/voice-demo/samples/user1.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
BIN
static/voice-demo/samples/user2.jpg
Normal file
BIN
static/voice-demo/samples/user2.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 105 KiB |
1240
static/voice-demo/styles.css
Normal file
1240
static/voice-demo/styles.css
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,29 +1,30 @@
|
||||
@baseUrl = http://127.0.0.1:8080
|
||||
###
|
||||
@baseUrl = http://101.89.108.122:8000
|
||||
|
||||
GET http://127.0.0.1:8080
|
||||
|
||||
HTTP/1.1 200 - OK
|
||||
connection: close
|
||||
date: Wed, 17 Jun 2026 00:37:02 GMT
|
||||
server: uvicorn
|
||||
content-length: 32
|
||||
content-type: application/json
|
||||
date: Thu, 08 Jan 2026 08:58:09 GMT
|
||||
server: uvicorn
|
||||
connection: close
|
||||
###
|
||||
POST http://127.0.0.1:8080/chat
|
||||
content-type: application/json
|
||||
|
||||
{
|
||||
"sessionId": "a1002",
|
||||
"sessionId": "a1100",
|
||||
"timeStamp": "202503310303",
|
||||
"text": "【拍摄完成】"
|
||||
"text": "继续",
|
||||
"needFormTags": true
|
||||
}
|
||||
|
||||
HTTP/1.1 200 - OK
|
||||
connection: close
|
||||
content-length: 205
|
||||
content-type: application/json
|
||||
date: Thu, 08 Jan 2026 08:59:37 GMT
|
||||
date: Wed, 17 Jun 2026 00:37:26 GMT
|
||||
server: uvicorn
|
||||
content-length: 274
|
||||
content-type: application/json
|
||||
connection: close
|
||||
###
|
||||
POST http://127.0.0.1:8080/get_info
|
||||
content-type: application/json
|
||||
@@ -35,11 +36,11 @@ content-type: application/json
|
||||
}
|
||||
|
||||
HTTP/1.1 200 - OK
|
||||
connection: close
|
||||
content-length: 97
|
||||
content-type: application/json
|
||||
date: Thu, 08 Jan 2026 09:27:05 GMT
|
||||
date: Wed, 17 Jun 2026 00:27:12 GMT
|
||||
server: uvicorn
|
||||
content-length: 108
|
||||
content-type: application/json
|
||||
connection: close
|
||||
###
|
||||
POST http://127.0.0.1:8080/set_info
|
||||
content-type: application/json
|
||||
|
||||
4710
workflow/20251108/事故信息采集20251108.json
Normal file
4710
workflow/20251108/事故信息采集20251108.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user