# -----------------------------------------------------------------------------
# Engine .env example (safe template)
# Notes:
# - Never commit real API keys.
# - Start with defaults below, then tune from logs.
# - One KEY=value per line; comments go on their own line starting with '#'.
# -----------------------------------------------------------------------------

# Server
# NOTE(review): 0.0.0.0 presumably binds all interfaces - confirm for production.
HOST=0.0.0.0
PORT=8000
# EXTERNAL_IP=1.2.3.4

# Backend bridge (optional)
BACKEND_URL=http://127.0.0.1:8100
BACKEND_TIMEOUT_SEC=10
HISTORY_DEFAULT_USER_ID=1

# Audio
SAMPLE_RATE=16000
# 20ms is recommended for VAD stability and latency.
# 100ms works but usually worsens start-of-speech accuracy.
CHUNK_SIZE_MS=20
DEFAULT_CODEC=pcm
MAX_AUDIO_BUFFER_SECONDS=30

# VAD / EOU
VAD_TYPE=silero
VAD_MODEL_PATH=data/vad/silero_vad.onnx
# Higher = stricter speech detection (fewer false positives, more misses).
VAD_THRESHOLD=0.5
# Require this much continuous speech before an utterance can be valid.
VAD_MIN_SPEECH_DURATION_MS=100
# Silence duration required to finalize one user turn.
VAD_EOU_THRESHOLD_MS=800

# LLM
# Placeholder value: replace it in your local .env only.
OPENAI_API_KEY=your_openai_api_key_here
# Optional for OpenAI-compatible providers.
# OPENAI_API_URL=https://api.openai.com/v1
LLM_MODEL=gpt-4o-mini
LLM_TEMPERATURE=0.7

# TTS
# edge: no API key needed
# openai_compatible: compatible with SiliconFlow-style endpoints
TTS_PROVIDER=openai_compatible
TTS_VOICE=anna
TTS_SPEED=1.0

# SiliconFlow (used by TTS and/or ASR when provider=openai_compatible)
# Placeholder value: replace it in your local .env only.
SILICONFLOW_API_KEY=your_siliconflow_api_key_here
SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall

# ASR
ASR_PROVIDER=openai_compatible
# Interim cadence and minimum audio before interim decode.
ASR_INTERIM_INTERVAL_MS=500
ASR_MIN_AUDIO_MS=300
# ASR start gate: ignore micro-noise, then commit to one turn once started.
ASR_START_MIN_SPEECH_MS=160
# Pre-roll protects beginning phonemes.
ASR_PRE_SPEECH_MS=240
# Tail silence protects ending phonemes.
ASR_FINAL_TAIL_MS=120

# Duplex behavior
DUPLEX_ENABLED=true
# DUPLEX_GREETING=Hello! How can I help you today?
# Keep the prompt on a single line; the value runs to the end of the line.
DUPLEX_SYSTEM_PROMPT=You are a helpful, friendly voice assistant. Keep your responses concise and conversational.

# Barge-in (user interrupting assistant)
# Min user speech duration needed to interrupt assistant audio.
BARGE_IN_MIN_DURATION_MS=200
# Allowed silence during potential barge-in (ms) before reset.
BARGE_IN_SILENCE_TOLERANCE_MS=60

# Logging
LOG_LEVEL=INFO
# json is better for production/observability; text is easier locally.
LOG_FORMAT=json

# WebSocket behavior
INACTIVITY_TIMEOUT_SEC=60
HEARTBEAT_INTERVAL_SEC=50
WS_PROTOCOL_VERSION=v1
# NOTE(review): WS_API_KEY is presumably required when WS_REQUIRE_AUTH=true - confirm.
# WS_API_KEY=replace_with_shared_secret
WS_REQUIRE_AUTH=false

# CORS / ICE (JSON strings; keep each value on a single line)
CORS_ORIGINS=["http://localhost:3000","http://localhost:8080"]
ICE_SERVERS=[{"urls":"stun:stun.l.google.com:19302"}]