Compare commits
2 Commits
838c19bf9c
...
98207936ae
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
98207936ae | ||
|
|
35bd83767e |
@@ -1,53 +1,92 @@
|
||||
# Server Configuration
|
||||
# -----------------------------------------------------------------------------
|
||||
# Engine .env example (safe template)
|
||||
# Notes:
|
||||
# - Never commit real API keys.
|
||||
# - Start with defaults below, then tune from logs.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Server
|
||||
HOST=0.0.0.0
|
||||
PORT=8000
|
||||
# EXTERNAL_IP=1.2.3.4
|
||||
|
||||
# Backend bridge (optional)
|
||||
BACKEND_URL=http://127.0.0.1:8100
|
||||
BACKEND_TIMEOUT_SEC=10
|
||||
HISTORY_DEFAULT_USER_ID=1
|
||||
|
||||
# Audio Configuration
|
||||
# Audio
|
||||
SAMPLE_RATE=16000
|
||||
# 20ms is recommended for VAD stability and latency.
|
||||
# 100ms works but usually worsens start-of-speech accuracy.
|
||||
CHUNK_SIZE_MS=20
|
||||
DEFAULT_CODEC=pcm
|
||||
MAX_AUDIO_BUFFER_SECONDS=30
|
||||
|
||||
# VAD Configuration
|
||||
# VAD / EOU
|
||||
VAD_TYPE=silero
|
||||
VAD_MODEL_PATH=data/vad/silero_vad.onnx
|
||||
# Higher = stricter speech detection (fewer false positives, more misses).
|
||||
VAD_THRESHOLD=0.5
|
||||
VAD_EOU_THRESHOLD_MS=600
|
||||
VAD_MIN_SPEECH_DURATION_MS=160
|
||||
# Require this much continuous speech before utterance can be valid.
|
||||
VAD_MIN_SPEECH_DURATION_MS=100
|
||||
# Silence duration required to finalize one user turn.
|
||||
VAD_EOU_THRESHOLD_MS=800
|
||||
|
||||
# OpenAI / LLM Configuration (required for duplex voice)
|
||||
OPENAI_API_KEY=sk-REDACTED  # SECURITY: real credential removed from this diff view — rotate this key immediately; secrets in Git history remain exposed even after replacement
|
||||
OPENAI_API_URL=https://api.qnaigc.com/v1 # Optional: for Azure or compatible APIs
|
||||
LLM_MODEL=deepseek-v3
|
||||
# LLM
|
||||
OPENAI_API_KEY=your_openai_api_key_here
|
||||
# Optional for OpenAI-compatible providers.
|
||||
# OPENAI_API_URL=https://api.openai.com/v1
|
||||
LLM_MODEL=gpt-4o-mini
|
||||
LLM_TEMPERATURE=0.7
|
||||
|
||||
# TTS Configuration
|
||||
# TTS
|
||||
# edge: no SiliconFlow key needed
|
||||
# siliconflow: requires SILICONFLOW_API_KEY
|
||||
TTS_PROVIDER=siliconflow
|
||||
TTS_VOICE=anna
|
||||
TTS_SPEED=1.0
|
||||
|
||||
# SiliconFlow Configuration (for TTS and ASR)
|
||||
SILICONFLOW_API_KEY=sk-REDACTED  # SECURITY: real credential removed from this diff view — rotate this key immediately; secrets in Git history remain exposed even after replacement
|
||||
# SiliconFlow (used by TTS and/or ASR when provider=siliconflow)
|
||||
SILICONFLOW_API_KEY=your_siliconflow_api_key_here
|
||||
SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
|
||||
|
||||
# ASR Configuration
|
||||
ASR_PROVIDER=siliconflow
|
||||
SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall
|
||||
|
||||
# ASR
|
||||
ASR_PROVIDER=siliconflow
|
||||
# Interim cadence and minimum audio before interim decode.
|
||||
ASR_INTERIM_INTERVAL_MS=500
|
||||
ASR_MIN_AUDIO_MS=300
|
||||
# ASR start gate: ignore micro-noise, then commit to one turn once started.
|
||||
ASR_START_MIN_SPEECH_MS=160
|
||||
# Pre-roll protects beginning phonemes.
|
||||
ASR_PRE_SPEECH_MS=240
|
||||
# Tail silence protects ending phonemes.
|
||||
ASR_FINAL_TAIL_MS=120
|
||||
|
||||
# Duplex Pipeline Configuration
|
||||
# Duplex behavior
|
||||
DUPLEX_ENABLED=true
|
||||
# DUPLEX_GREETING=Hello! How can I help you today?
|
||||
DUPLEX_SYSTEM_PROMPT=You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
|
||||
|
||||
# Barge-in Configuration
|
||||
# Minimum speech duration (ms) to trigger interruption - filters out brief noises
|
||||
# Lower = more sensitive (50-100ms recommended), Higher = filters more noise
|
||||
BARGE_IN_MIN_DURATION_MS=100
|
||||
|
||||
ASR_START_MIN_SPEECH_MS=100
|
||||
ASR_PRE_SPEECH_MS=320
|
||||
# Barge-in (user interrupting assistant)
|
||||
# Min user speech duration needed to interrupt assistant audio.
|
||||
BARGE_IN_MIN_DURATION_MS=200
|
||||
# Allowed silence during potential barge-in (ms) before reset.
|
||||
BARGE_IN_SILENCE_TOLERANCE_MS=60
|
||||
|
||||
# Logging
|
||||
LOG_LEVEL=INFO
|
||||
LOG_FORMAT=text
|
||||
# json is better for production/observability; text is easier locally.
|
||||
LOG_FORMAT=json
|
||||
|
||||
# WebSocket behavior
|
||||
INACTIVITY_TIMEOUT_SEC=60
|
||||
HEARTBEAT_INTERVAL_SEC=50
|
||||
WS_PROTOCOL_VERSION=v1
|
||||
# WS_API_KEY=replace_with_shared_secret
|
||||
WS_REQUIRE_AUTH=false
|
||||
|
||||
# CORS / ICE (JSON strings)
|
||||
CORS_ORIGINS=["http://localhost:3000","http://localhost:8080"]
|
||||
ICE_SERVERS=[{"urls":"stun:stun.l.google.com:19302"}]
|
||||
|
||||
@@ -25,6 +25,10 @@ class Settings(BaseSettings):
|
||||
sample_rate: int = Field(default=16000, description="Audio sample rate in Hz")
|
||||
chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds")
|
||||
default_codec: str = Field(default="pcm", description="Default audio codec")
|
||||
max_audio_buffer_seconds: int = Field(
|
||||
default=30,
|
||||
description="Maximum buffered user audio duration kept in memory for current turn"
|
||||
)
|
||||
|
||||
# VAD Configuration
|
||||
vad_type: str = Field(default="silero", description="VAD algorithm type")
|
||||
@@ -79,6 +83,10 @@ class Settings(BaseSettings):
|
||||
default=200,
|
||||
description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive."
|
||||
)
|
||||
barge_in_silence_tolerance_ms: int = Field(
|
||||
default=60,
|
||||
description="How much silence (ms) is tolerated during potential barge-in before reset"
|
||||
)
|
||||
|
||||
# Logging
|
||||
log_level: str = Field(default="INFO", description="Logging level")
|
||||
|
||||
@@ -228,21 +228,19 @@ class DuplexPipeline:
|
||||
self._is_bot_speaking = False
|
||||
self._current_turn_task: Optional[asyncio.Task] = None
|
||||
self._audio_buffer: bytes = b""
|
||||
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
||||
max_buffer_seconds = settings.max_audio_buffer_seconds
|
||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||
self._asr_start_min_speech_ms: int = (
|
||||
settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160
|
||||
)
|
||||
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
|
||||
self._asr_capture_active: bool = False
|
||||
self._pending_speech_audio: bytes = b""
|
||||
# Keep a short rolling pre-speech window so VAD transition latency
|
||||
# does not clip the first phoneme/character sent to ASR.
|
||||
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
|
||||
pre_speech_ms = settings.asr_pre_speech_ms
|
||||
self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
|
||||
self._pre_speech_buffer: bytes = b""
|
||||
# Add a tiny trailing silence tail before final ASR to avoid
|
||||
# clipping the last phoneme at utterance boundaries.
|
||||
asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120
|
||||
asr_final_tail_ms = settings.asr_final_tail_ms
|
||||
self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
|
||||
self._last_vad_status: str = "Silence"
|
||||
self._process_lock = asyncio.Lock()
|
||||
@@ -261,10 +259,10 @@ class DuplexPipeline:
|
||||
|
||||
# Barge-in filtering - require minimum speech duration to interrupt
|
||||
self._barge_in_speech_start_time: Optional[float] = None
|
||||
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50
|
||||
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms
|
||||
self._barge_in_silence_tolerance_ms: int = settings.barge_in_silence_tolerance_ms
|
||||
self._barge_in_speech_frames: int = 0 # Count speech frames
|
||||
self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in
|
||||
self._barge_in_silence_tolerance: int = 3 # Allow up to 3 silence frames (60ms at 20ms chunks)
|
||||
|
||||
# Runtime overrides injected from session.start metadata
|
||||
self._runtime_llm: Dict[str, Any] = {}
|
||||
@@ -415,6 +413,11 @@ class DuplexPipeline:
|
||||
return self._runtime_barge_in_min_duration_ms
|
||||
return self._barge_in_min_duration_ms
|
||||
|
||||
def _barge_in_silence_tolerance_frames(self) -> int:
|
||||
"""Convert silence tolerance from ms to frame count using current chunk size."""
|
||||
chunk_ms = max(1, settings.chunk_size_ms)
|
||||
return max(1, int(np.ceil(self._barge_in_silence_tolerance_ms / chunk_ms)))
|
||||
|
||||
async def _generate_runtime_greeting(self) -> Optional[str]:
|
||||
if not self.llm_service:
|
||||
return None
|
||||
@@ -679,7 +682,7 @@ class DuplexPipeline:
|
||||
if self._barge_in_speech_start_time is not None:
|
||||
self._barge_in_silence_frames += 1
|
||||
# Allow brief silence gaps (VAD flickering)
|
||||
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
|
||||
if self._barge_in_silence_frames > self._barge_in_silence_tolerance_frames():
|
||||
# Too much silence - reset barge-in tracking
|
||||
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
|
||||
self._barge_in_speech_start_time = None
|
||||
@@ -927,9 +930,6 @@ class DuplexPipeline:
|
||||
fn = item.get("function")
|
||||
if isinstance(fn, dict) and fn.get("name"):
|
||||
fn_name = str(fn.get("name"))
|
||||
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
|
||||
if executor in {"client", "server"}:
|
||||
self._runtime_tool_executor[fn_name] = executor
|
||||
schemas.append(
|
||||
{
|
||||
"type": "function",
|
||||
@@ -943,10 +943,6 @@ class DuplexPipeline:
|
||||
continue
|
||||
|
||||
if item.get("name"):
|
||||
fn_name = str(item.get("name"))
|
||||
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
|
||||
if executor in {"client", "server"}:
|
||||
self._runtime_tool_executor[fn_name] = executor
|
||||
schemas.append(
|
||||
{
|
||||
"type": "function",
|
||||
|
||||
Reference in New Issue
Block a user