# AI-VideoAssistant/engine/config/agents/volcengine.yaml

# Agent behavior configuration (safe to edit per profile)
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.

agent:
  vad:
    # Voice-activity-detection (VAD) engine and its model file.
    type: 'silero'
    model_path: 'data/vad/silero_vad.onnx'
    # NOTE(review): presumably the speech-probability cutoff — confirm
    # against the engine before tuning.
    threshold: 0.5
    # Durations below are in milliseconds, per the `_ms` suffix.
    min_speech_duration_ms: 100
    # NOTE(review): "eou" presumably stands for end-of-utterance — confirm.
    eou_threshold_ms: 800

  llm:
    # Provider options: openai | openai_compatible | siliconflow
    provider: 'openai_compatible'
    model: 'deepseek-v3'
    temperature: 0.7
    # Required; there is no fallback value. An environment variable may
    # still be referenced here explicitly.
    # The value below is a placeholder; do not commit a real key.
    api_key: 'your_llm_api_key'
    # Optional; only needed for OpenAI-compatible endpoints.
    api_url: 'https://api.qnaigc.com/v1'

  tts:
    # Provider options listed by the original comment:
    #   edge | openai_compatible | siliconflow | dashscope
    # This profile selects volcengine (see `provider` below).
    #
    # dashscope defaults (applied when the key is omitted):
    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
    #   model: qwen3-tts-flash-realtime
    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
    #   note: dashscope_mode/mode is ONLY used when provider=dashscope.
    #
    # volcengine settings, all set explicitly for this profile:
    provider: 'volcengine'
    api_url: 'https://openspeech.bytedance.com/api/v3/tts/unidirectional'
    resource_id: 'seed-tts-2.0'
    # Placeholders; replace with real credentials and do not commit them.
    app_id: 'your_tts_app_id'
    api_key: 'your_tts_api_key'
    speed: 1.1
    voice: 'zh_female_vv_uranus_bigtts'

  # Speech-recognition (ASR) provider settings.
  # The `asr` key must appear exactly once: duplicate mapping keys are
  # invalid YAML and are either rejected or silently resolved last-wins.
  asr:
    provider: 'volcengine'
    api_url: 'wss://openspeech.bytedance.com/api/v3/sauc/bigmodel'
    # Placeholders; replace with real credentials and do not commit them.
    app_id: 'your_asr_app_id'
    api_key: 'your_asr_api_key'
    resource_id: 'volc.bigasr.sauc.duration'
    uid: 'caller-1'
    model: 'bigmodel'
    # NOTE(review): presumably forwarded to the provider's request payload
    # as-is — confirm against the engine's volcengine ASR client.
    request_params:
      end_window_size: 800
      force_to_speech_time: 1000
      enable_punc: true
      enable_itn: false
      enable_ddc: false
      show_utterance: true
      result_type: 'single'
    # Timing knobs below are in milliseconds, per the `_ms` suffix.
    interim_interval_ms: 500
    min_audio_ms: 300
    start_min_speech_ms: 160
    pre_speech_ms: 240
    final_tail_ms: 120

  duplex:
    enabled: true
    # The prompt (in Chinese) tells the model it is an AI assistant, to
    # answer in brief sentences, and to avoid punctuation and emoji.
    # NOTE(review): "简答" may be a typo for "简短" — confirm before changing,
    # since this text is sent to the model verbatim.
    system_prompt: '你是一个人工智能助手，你用简答语句回答，避免使用标点符号和emoji。'

  barge_in:
    # Both values are in milliseconds, per the `_ms` suffix.
    # NOTE(review): presumably how long caller speech must last before it
    # interrupts playback — confirm against the engine.
    min_duration_ms: 200
    # NOTE(review): presumably the silence gap tolerated inside that speech
    # — confirm.
    silence_tolerance_ms: 60