Compare commits
2 Commits
838c19bf9c
...
98207936ae
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
98207936ae | ||
|
|
35bd83767e |
@@ -1,53 +1,92 @@
|
|||||||
# Server Configuration
|
# -----------------------------------------------------------------------------
|
||||||
|
# Engine .env example (safe template)
|
||||||
|
# Notes:
|
||||||
|
# - Never commit real API keys.
|
||||||
|
# - Start with defaults below, then tune from logs.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Server
|
||||||
HOST=0.0.0.0
|
HOST=0.0.0.0
|
||||||
PORT=8000
|
PORT=8000
|
||||||
|
# EXTERNAL_IP=1.2.3.4
|
||||||
|
|
||||||
|
# Backend bridge (optional)
|
||||||
BACKEND_URL=http://127.0.0.1:8100
|
BACKEND_URL=http://127.0.0.1:8100
|
||||||
|
BACKEND_TIMEOUT_SEC=10
|
||||||
|
HISTORY_DEFAULT_USER_ID=1
|
||||||
|
|
||||||
# Audio Configuration
|
# Audio
|
||||||
SAMPLE_RATE=16000
|
SAMPLE_RATE=16000
|
||||||
|
# 20ms is recommended for VAD stability and latency.
|
||||||
|
# 100ms works but usually worsens start-of-speech accuracy.
|
||||||
CHUNK_SIZE_MS=20
|
CHUNK_SIZE_MS=20
|
||||||
|
DEFAULT_CODEC=pcm
|
||||||
|
MAX_AUDIO_BUFFER_SECONDS=30
|
||||||
|
|
||||||
# VAD Configuration
|
# VAD / EOU
|
||||||
|
VAD_TYPE=silero
|
||||||
|
VAD_MODEL_PATH=data/vad/silero_vad.onnx
|
||||||
|
# Higher = stricter speech detection (fewer false positives, more misses).
|
||||||
VAD_THRESHOLD=0.5
|
VAD_THRESHOLD=0.5
|
||||||
VAD_EOU_THRESHOLD_MS=600
|
# Require this much continuous speech before utterance can be valid.
|
||||||
VAD_MIN_SPEECH_DURATION_MS=160
|
VAD_MIN_SPEECH_DURATION_MS=100
|
||||||
|
# Silence duration required to finalize one user turn.
|
||||||
|
VAD_EOU_THRESHOLD_MS=800
|
||||||
|
|
||||||
# OpenAI / LLM Configuration (required for duplex voice)
|
# LLM
|
||||||
OPENAI_API_KEY=sk-REDACTED  # SECURITY: real credential was committed here and is burned — rotate/revoke it at the provider; removing it in a later commit does not un-leak it from history
|
OPENAI_API_KEY=your_openai_api_key_here
|
||||||
OPENAI_API_URL=https://api.qnaigc.com/v1 # Optional: for Azure or compatible APIs
|
# Optional for OpenAI-compatible providers.
|
||||||
LLM_MODEL=deepseek-v3
|
# OPENAI_API_URL=https://api.openai.com/v1
|
||||||
|
LLM_MODEL=gpt-4o-mini
|
||||||
LLM_TEMPERATURE=0.7
|
LLM_TEMPERATURE=0.7
|
||||||
|
|
||||||
# TTS Configuration
|
# TTS
|
||||||
|
# edge: no SiliconFlow key needed
|
||||||
|
# siliconflow: requires SILICONFLOW_API_KEY
|
||||||
TTS_PROVIDER=siliconflow
|
TTS_PROVIDER=siliconflow
|
||||||
TTS_VOICE=anna
|
TTS_VOICE=anna
|
||||||
TTS_SPEED=1.0
|
TTS_SPEED=1.0
|
||||||
|
|
||||||
# SiliconFlow Configuration (for TTS and ASR)
|
# SiliconFlow (used by TTS and/or ASR when provider=siliconflow)
|
||||||
SILICONFLOW_API_KEY=sk-REDACTED  # SECURITY: real credential was committed here and is burned — rotate/revoke it at the provider; history still contains it
|
SILICONFLOW_API_KEY=your_siliconflow_api_key_here
|
||||||
SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
|
SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
|
||||||
|
|
||||||
# ASR Configuration
|
|
||||||
ASR_PROVIDER=siliconflow
|
|
||||||
SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall
|
SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall
|
||||||
|
|
||||||
|
# ASR
|
||||||
|
ASR_PROVIDER=siliconflow
|
||||||
|
# Interim cadence and minimum audio before interim decode.
|
||||||
ASR_INTERIM_INTERVAL_MS=500
|
ASR_INTERIM_INTERVAL_MS=500
|
||||||
ASR_MIN_AUDIO_MS=300
|
ASR_MIN_AUDIO_MS=300
|
||||||
|
# ASR start gate: ignore micro-noise, then commit to one turn once started.
|
||||||
|
ASR_START_MIN_SPEECH_MS=160
|
||||||
|
# Pre-roll protects beginning phonemes.
|
||||||
|
ASR_PRE_SPEECH_MS=240
|
||||||
|
# Tail silence protects ending phonemes.
|
||||||
|
ASR_FINAL_TAIL_MS=120
|
||||||
|
|
||||||
# Duplex Pipeline Configuration
|
# Duplex behavior
|
||||||
DUPLEX_ENABLED=true
|
DUPLEX_ENABLED=true
|
||||||
# DUPLEX_GREETING=Hello! How can I help you today?
|
# DUPLEX_GREETING=Hello! How can I help you today?
|
||||||
DUPLEX_SYSTEM_PROMPT=You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
|
DUPLEX_SYSTEM_PROMPT=You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
|
||||||
|
|
||||||
# Barge-in Configuration
|
# Barge-in (user interrupting assistant)
|
||||||
# Minimum speech duration (ms) to trigger interruption - filters out brief noises
|
# Min user speech duration needed to interrupt assistant audio.
|
||||||
# Lower = more sensitive (50-100ms recommended), Higher = filters more noise
|
BARGE_IN_MIN_DURATION_MS=200
|
||||||
BARGE_IN_MIN_DURATION_MS=100
|
# Allowed silence during potential barge-in (ms) before reset.
|
||||||
|
BARGE_IN_SILENCE_TOLERANCE_MS=60
|
||||||
ASR_START_MIN_SPEECH_MS=100
|
|
||||||
ASR_PRE_SPEECH_MS=320
|
|
||||||
|
|
||||||
# Logging
|
# Logging
|
||||||
LOG_LEVEL=INFO
|
LOG_LEVEL=INFO
|
||||||
LOG_FORMAT=text
|
# json is better for production/observability; text is easier locally.
|
||||||
|
LOG_FORMAT=json
|
||||||
|
|
||||||
|
# WebSocket behavior
|
||||||
|
INACTIVITY_TIMEOUT_SEC=60
|
||||||
|
HEARTBEAT_INTERVAL_SEC=50
|
||||||
|
WS_PROTOCOL_VERSION=v1
|
||||||
|
# WS_API_KEY=replace_with_shared_secret
|
||||||
|
WS_REQUIRE_AUTH=false
|
||||||
|
|
||||||
|
# CORS / ICE (JSON strings)
|
||||||
|
CORS_ORIGINS=["http://localhost:3000","http://localhost:8080"]
|
||||||
|
ICE_SERVERS=[{"urls":"stun:stun.l.google.com:19302"}]
|
||||||
|
|||||||
@@ -25,6 +25,10 @@ class Settings(BaseSettings):
|
|||||||
sample_rate: int = Field(default=16000, description="Audio sample rate in Hz")
|
sample_rate: int = Field(default=16000, description="Audio sample rate in Hz")
|
||||||
chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds")
|
chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds")
|
||||||
default_codec: str = Field(default="pcm", description="Default audio codec")
|
default_codec: str = Field(default="pcm", description="Default audio codec")
|
||||||
|
max_audio_buffer_seconds: int = Field(
|
||||||
|
default=30,
|
||||||
|
description="Maximum buffered user audio duration kept in memory for current turn"
|
||||||
|
)
|
||||||
|
|
||||||
# VAD Configuration
|
# VAD Configuration
|
||||||
vad_type: str = Field(default="silero", description="VAD algorithm type")
|
vad_type: str = Field(default="silero", description="VAD algorithm type")
|
||||||
@@ -79,6 +83,10 @@ class Settings(BaseSettings):
|
|||||||
default=200,
|
default=200,
|
||||||
description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive."
|
description="Minimum speech duration (ms) required to trigger barge-in. Lower=more sensitive."
|
||||||
)
|
)
|
||||||
|
barge_in_silence_tolerance_ms: int = Field(
|
||||||
|
default=60,
|
||||||
|
description="How much silence (ms) is tolerated during potential barge-in before reset"
|
||||||
|
)
|
||||||
|
|
||||||
# Logging
|
# Logging
|
||||||
log_level: str = Field(default="INFO", description="Logging level")
|
log_level: str = Field(default="INFO", description="Logging level")
|
||||||
|
|||||||
@@ -228,21 +228,19 @@ class DuplexPipeline:
|
|||||||
self._is_bot_speaking = False
|
self._is_bot_speaking = False
|
||||||
self._current_turn_task: Optional[asyncio.Task] = None
|
self._current_turn_task: Optional[asyncio.Task] = None
|
||||||
self._audio_buffer: bytes = b""
|
self._audio_buffer: bytes = b""
|
||||||
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
|
max_buffer_seconds = settings.max_audio_buffer_seconds
|
||||||
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
|
||||||
self._asr_start_min_speech_ms: int = (
|
self._asr_start_min_speech_ms: int = settings.asr_start_min_speech_ms
|
||||||
settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160
|
|
||||||
)
|
|
||||||
self._asr_capture_active: bool = False
|
self._asr_capture_active: bool = False
|
||||||
self._pending_speech_audio: bytes = b""
|
self._pending_speech_audio: bytes = b""
|
||||||
# Keep a short rolling pre-speech window so VAD transition latency
|
# Keep a short rolling pre-speech window so VAD transition latency
|
||||||
# does not clip the first phoneme/character sent to ASR.
|
# does not clip the first phoneme/character sent to ASR.
|
||||||
pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
|
pre_speech_ms = settings.asr_pre_speech_ms
|
||||||
self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
|
self._asr_pre_speech_bytes = int(settings.sample_rate * 2 * (pre_speech_ms / 1000.0))
|
||||||
self._pre_speech_buffer: bytes = b""
|
self._pre_speech_buffer: bytes = b""
|
||||||
# Add a tiny trailing silence tail before final ASR to avoid
|
# Add a tiny trailing silence tail before final ASR to avoid
|
||||||
# clipping the last phoneme at utterance boundaries.
|
# clipping the last phoneme at utterance boundaries.
|
||||||
asr_final_tail_ms = settings.asr_final_tail_ms if hasattr(settings, "asr_final_tail_ms") else 120
|
asr_final_tail_ms = settings.asr_final_tail_ms
|
||||||
self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
|
self._asr_final_tail_bytes = int(settings.sample_rate * 2 * (asr_final_tail_ms / 1000.0))
|
||||||
self._last_vad_status: str = "Silence"
|
self._last_vad_status: str = "Silence"
|
||||||
self._process_lock = asyncio.Lock()
|
self._process_lock = asyncio.Lock()
|
||||||
@@ -261,10 +259,10 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
# Barge-in filtering - require minimum speech duration to interrupt
|
# Barge-in filtering - require minimum speech duration to interrupt
|
||||||
self._barge_in_speech_start_time: Optional[float] = None
|
self._barge_in_speech_start_time: Optional[float] = None
|
||||||
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50
|
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms
|
||||||
|
self._barge_in_silence_tolerance_ms: int = settings.barge_in_silence_tolerance_ms
|
||||||
self._barge_in_speech_frames: int = 0 # Count speech frames
|
self._barge_in_speech_frames: int = 0 # Count speech frames
|
||||||
self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in
|
self._barge_in_silence_frames: int = 0 # Count silence frames during potential barge-in
|
||||||
self._barge_in_silence_tolerance: int = 3 # Allow up to 3 silence frames (60ms at 20ms chunks)
|
|
||||||
|
|
||||||
# Runtime overrides injected from session.start metadata
|
# Runtime overrides injected from session.start metadata
|
||||||
self._runtime_llm: Dict[str, Any] = {}
|
self._runtime_llm: Dict[str, Any] = {}
|
||||||
@@ -415,6 +413,11 @@ class DuplexPipeline:
|
|||||||
return self._runtime_barge_in_min_duration_ms
|
return self._runtime_barge_in_min_duration_ms
|
||||||
return self._barge_in_min_duration_ms
|
return self._barge_in_min_duration_ms
|
||||||
|
|
||||||
|
def _barge_in_silence_tolerance_frames(self) -> int:
|
||||||
|
"""Convert silence tolerance from ms to frame count using current chunk size."""
|
||||||
|
chunk_ms = max(1, settings.chunk_size_ms)
|
||||||
|
return max(1, int(np.ceil(self._barge_in_silence_tolerance_ms / chunk_ms)))
|
||||||
|
|
||||||
async def _generate_runtime_greeting(self) -> Optional[str]:
|
async def _generate_runtime_greeting(self) -> Optional[str]:
|
||||||
if not self.llm_service:
|
if not self.llm_service:
|
||||||
return None
|
return None
|
||||||
@@ -679,7 +682,7 @@ class DuplexPipeline:
|
|||||||
if self._barge_in_speech_start_time is not None:
|
if self._barge_in_speech_start_time is not None:
|
||||||
self._barge_in_silence_frames += 1
|
self._barge_in_silence_frames += 1
|
||||||
# Allow brief silence gaps (VAD flickering)
|
# Allow brief silence gaps (VAD flickering)
|
||||||
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
|
if self._barge_in_silence_frames > self._barge_in_silence_tolerance_frames():
|
||||||
# Too much silence - reset barge-in tracking
|
# Too much silence - reset barge-in tracking
|
||||||
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
|
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
|
||||||
self._barge_in_speech_start_time = None
|
self._barge_in_speech_start_time = None
|
||||||
@@ -927,9 +930,6 @@ class DuplexPipeline:
|
|||||||
fn = item.get("function")
|
fn = item.get("function")
|
||||||
if isinstance(fn, dict) and fn.get("name"):
|
if isinstance(fn, dict) and fn.get("name"):
|
||||||
fn_name = str(fn.get("name"))
|
fn_name = str(fn.get("name"))
|
||||||
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
|
|
||||||
if executor in {"client", "server"}:
|
|
||||||
self._runtime_tool_executor[fn_name] = executor
|
|
||||||
schemas.append(
|
schemas.append(
|
||||||
{
|
{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
@@ -943,10 +943,6 @@ class DuplexPipeline:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if item.get("name"):
|
if item.get("name"):
|
||||||
fn_name = str(item.get("name"))
|
|
||||||
executor = str(item.get("executor") or item.get("run_on") or "").strip().lower()
|
|
||||||
if executor in {"client", "server"}:
|
|
||||||
self._runtime_tool_executor[fn_name] = executor
|
|
||||||
schemas.append(
|
schemas.append(
|
||||||
{
|
{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
|
|||||||
Reference in New Issue
Block a user