Add DashScope agent configuration files for VAD, LLM, TTS, and ASR services

- Introduced new YAML configuration files for DashScope and Volcengine, detailing agent behavior settings for VAD, LLM, TTS, and ASR.
- Configured parameters including model paths, API keys, and service URLs for real-time processing.
- Ensured compatibility with existing agent-side behavior management while providing specific settings for DashScope integration.
2026-03-08 23:28:08 +08:00
parent aeeeee20d1
commit e41d34fe23
2 changed files with 115 additions and 0 deletions
--- a/engine/config/agents/dashscope.yaml
+++ b/engine/config/agents/dashscope.yaml
@@ -0,0 +1,47 @@
+# Agent behavior configuration for DashScope realtime ASR/TTS.
+# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
+# Infra/server/network settings should stay in .env.
+
+agent:
+  vad:
+    type: 'silero'
+    model_path: 'data/vad/silero_vad.onnx'
+    threshold: 0.5
+    min_speech_duration_ms: 100
+    eou_threshold_ms: 800
+
+  llm:
+    # Accepted provider values: openai | openai_compatible | siliconflow
+    provider: 'openai_compatible'
+    model: 'deepseek-v3'
+    temperature: 0.7
+    api_key: 'your_llm_api_key'  # placeholder; replace with a real key
+    api_url: 'https://api.qnaigc.com/v1'
+
+  tts:
+    provider: 'dashscope'
+    api_key: 'your_tts_api_key'  # placeholder; replace with a real key
+    api_url: 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
+    model: 'qwen3-tts-flash-realtime'
+    voice: 'Cherry'
+    dashscope_mode: 'commit'
+    speed: 1.0
+
+  asr:
+    provider: 'dashscope'
+    api_key: 'your_asr_api_key'  # placeholder; replace with a real key
+    api_url: 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
+    model: 'qwen3-asr-flash-realtime'
+    interim_interval_ms: 500
+    min_audio_ms: 300
+    start_min_speech_ms: 160
+    pre_speech_ms: 240
+    final_tail_ms: 120
+
+  duplex:
+    enabled: true
+    system_prompt: 'You are a helpful, friendly voice assistant. Keep your responses concise and conversational.'
+
+  barge_in:
+    min_duration_ms: 200
+    silence_tolerance_ms: 60
--- a/engine/config/agents/volcengine.yaml
+++ b/engine/config/agents/volcengine.yaml
@@ -0,0 +1,68 @@
+# Agent behavior configuration (safe to edit per profile)
+# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
+# Infra/server/network settings should stay in .env.
+
+agent:
+  vad:
+    type: silero
+    model_path: data/vad/silero_vad.onnx
+    threshold: 0.5
+    min_speech_duration_ms: 100
+    eou_threshold_ms: 800
+
+  llm:
+    # provider: openai | openai_compatible | siliconflow
+    provider: openai_compatible
+    model: deepseek-v3
+    temperature: 0.7
+    # Required: no fallback. You can still reference env explicitly.
+    api_key: your_llm_api_key
+    # Optional for OpenAI-compatible endpoints:
+    api_url: https://api.qnaigc.com/v1
+
+  tts:
+    # provider: edge | openai_compatible | siliconflow | dashscope | volcengine
+    # dashscope defaults (if omitted):
+    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
+    #   model: qwen3-tts-flash-realtime
+    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
+    #   note: dashscope_mode/mode is ONLY used when provider=dashscope.
+    # volcengine settings for this profile are set explicitly below:
+    provider: volcengine
+    api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional
+    resource_id: seed-tts-2.0
+    app_id: your_tts_app_id
+    api_key: your_tts_api_key
+    speed: 1.1
+    voice: zh_female_vv_uranus_bigtts
+
+  asr:
+    # provider=volcengine; app_id and api_key below are placeholders to replace.
+    provider: volcengine
+    api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
+    app_id: your_asr_app_id
+    api_key: your_asr_api_key
+    resource_id: volc.bigasr.sauc.duration
+    uid: caller-1
+    model: bigmodel
+    request_params:
+      end_window_size: 800
+      force_to_speech_time: 1000
+      enable_punc: true
+      enable_itn: false
+      enable_ddc: false
+      show_utterance: true
+      result_type: single
+    interim_interval_ms: 500
+    min_audio_ms: 300
+    start_min_speech_ms: 160
+    pre_speech_ms: 240
+    final_tail_ms: 120
+
+  duplex:
+    enabled: true
+    system_prompt: 你是一个人工智能助手，你用简答语句回答，避免使用标点符号和emoji。
+
+  barge_in:
+    min_duration_ms: 200
+    silence_tolerance_ms: 60