from __future__ import annotations

import json
from dataclasses import dataclass, field
from pathlib import Path

# Engine configuration: frozen dataclasses describing each config section, plus
# helpers that build an ``EngineConfig`` from a JSON file or an already-parsed
# dict. Every field has a default, so an empty JSON object is a valid config.


@dataclass(frozen=True)
class ServerConfig:
    """Values read from the ``server`` section of the config."""

    host: str = "0.0.0.0"
    port: int = 8000
    cors_origins: list[str] = field(default_factory=list)
    serve_webpage: bool = True
    webpage_mount: str = "/demo"


@dataclass(frozen=True)
class AudioConfig:
    """Values read from the ``audio`` section of the config."""

    sample_rate_hz: int = 16000
    channels: int = 1
    frame_ms: int = 20

    @property
    def frame_bytes(self) -> int:
        """Return the byte size of one ``frame_ms``-long frame.

        Computed as samples per frame (truncated to an int) times
        ``channels`` times 2.
        """
        # NOTE(review): the hard-coded 2 is presumably bytes per sample
        # (16-bit PCM) -- confirm against the audio pipeline.
        samples_per_frame = int(self.sample_rate_hz * self.frame_ms / 1000)
        return samples_per_frame * self.channels * 2


@dataclass(frozen=True)
class SessionConfig:
    """Values read from the ``session`` section of the config."""

    inactivity_timeout_sec: int = 60


@dataclass(frozen=True)
class VADConfig:
    """Voice Activity Detection thresholds for the Silero analyzer.

    These map directly to ``pipecat.audio.vad.vad_analyzer.VADParams``.
    Defaults are tuned a touch more conservative than upstream pipecat so
    short pauses in continuous speech don't end the user turn prematurely.
    """

    confidence: float = 0.7
    start_secs: float = 0.2
    stop_secs: float = 0.6
    min_volume: float = 0.6


@dataclass(frozen=True)
class TurnConfig:
    """User-turn segmentation policy.

    ``user_speech_timeout_sec`` is the grace window (in seconds) after VAD
    has confirmed silence during which the user is allowed to resume
    speaking before the aggregator finalizes the turn. Used by
    ``SpeechTimeoutUserTurnStopStrategy``. Higher = more tolerant of natural
    mid-sentence pauses; lower = snappier turn-taking.

    The combined "user pause before turn ends" budget is roughly
    ``vad.stop_secs + user_speech_timeout_sec``.
    """

    vad: VADConfig = field(default_factory=VADConfig)
    user_speech_timeout_sec: float = 1.0
    # The interruption_* fields are only stored here; how they are applied is
    # decided by code outside this module.
    interruption_min_chars: int = 3
    interruption_use_interim: bool = True
    interruption_short_replies: list[str] = field(
        default_factory=lambda: [
            "是",
            "是的",
            "对",
            "对的",
            "嗯",
            "好",
            "好的",
            "行",
            "可以",
            "没问题",
            "不是",
            "不",
            "不行",
            "不用",
            "不要",
            "没有",
            "否",
            "no",
            "yes",
            "ok",
            "okay",
        ]
    )


@dataclass(frozen=True)
class AgentConfig:
    """Values read from the ``agent`` section of the config.

    ``config_from_dict`` restricts ``greeting_mode`` to ``"generated"``,
    ``"fixed"`` or ``"off"`` (or JSON ``null``); constructing this class
    directly performs no validation.
    """

    system_prompt: str = "You are a helpful, friendly voice assistant."
    greeting: str | None = None
    greeting_mode: str = "generated"


@dataclass(frozen=True)
class LLMConfig:
    """Values read from ``services.llm``."""

    provider: str = "openai"
    api_key: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini"
    temperature: float | None = 0.7


@dataclass(frozen=True)
class STTConfig:
    """Values read from ``services.stt`` (or its ``services.asr`` alias).

    NOTE(review): which fields are honored presumably depends on
    ``provider``; that mapping is not defined in this module.
    """

    provider: str = "openai"
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini-transcribe"
    language: str | None = "en"
    domain: str = "iat"
    accent: str = "mandarin"
    encoding: str = "raw"
    frame_size: int = 1280
    timeout_sec: float = 10.0
    dynamic_correction: bool = False


@dataclass(frozen=True)
class TTSConfig:
    """Values read from ``services.tts``.

    NOTE(review): which fields are honored presumably depends on
    ``provider``; that mapping is not defined in this module.
    """

    provider: str = "openai"
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini-tts"
    voice: str = "alloy"
    aue: str = "raw"
    tte: str = "UTF8"
    speed: int = 50
    volume: int = 50
    pitch: int = 50
    timeout_sec: float = 30.0
    source_sample_rate_hz: int | None = None


@dataclass(frozen=True)
class ServicesConfig:
    """Groups the LLM, STT and TTS service settings."""

    llm: LLMConfig = field(default_factory=LLMConfig)
    stt: STTConfig = field(default_factory=STTConfig)
    tts: TTSConfig = field(default_factory=TTSConfig)


@dataclass(frozen=True)
class EngineConfig:
    """Top-level configuration object aggregating every section."""

    server: ServerConfig = field(default_factory=ServerConfig)
    audio: AudioConfig = field(default_factory=AudioConfig)
    session: SessionConfig = field(default_factory=SessionConfig)
    turn: TurnConfig = field(default_factory=TurnConfig)
    agent: AgentConfig = field(default_factory=AgentConfig)
    services: ServicesConfig = field(default_factory=ServicesConfig)


def load_config(path: str | Path = "config.json") -> EngineConfig:
    """Read a JSON config file and return the resulting ``EngineConfig``.

    Args:
        path: Location of the JSON file. When it is exactly ``"config.json"``
            and that file does not exist, ``config.json`` in the parent of
            this module's directory is tried instead.

    Returns:
        The configuration built by ``config_from_dict``.

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist).
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If the top-level JSON value is not an object, or if
            ``config_from_dict`` rejects a value.
    """
    config_path = Path(path)
    if not config_path.exists() and str(path) == "config.json":
        # Only the default name gets the fallback; an explicit custom path
        # that is missing is left alone so read_text() reports it.
        config_path = Path(__file__).resolve().parent.parent / "config.json"

    data = json.loads(config_path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(f"Config file must contain a JSON object: {config_path}")
    return config_from_dict(data)


def _turn_value(turn: dict, key: str, default: object) -> object:
    """Return ``turn[key]``, or ``default`` when the key is absent or ``None``."""
    value = turn.get(key)
    return default if value is None else value


def config_from_dict(data: dict) -> EngineConfig:
    """Build an ``EngineConfig`` from an already-parsed config mapping.

    Missing sections, and sections that are not dicts, are treated as empty,
    so every field falls back to its dataclass default. Normalizations:

    * ``agent.greeting == ""`` and ``services.stt.language == ""`` become
      ``None``.
    * ``services.asr`` is used when ``services.stt`` is missing or empty.
    * In ``turn``, a ``null`` for ``user_speech_timeout_sec``,
      ``interruption_min_chars`` or ``interruption_short_replies`` means
      "use the default". ``interruption_use_interim`` is coerced with
      ``bool()``, so ``null`` there yields ``False``.

    Keys of ``turn`` other than ``vad`` and the four above are ignored; an
    unknown key in any other section is passed to the dataclass constructor.

    Args:
        data: Parsed configuration, typically the result of ``json.loads``.

    Returns:
        The populated ``EngineConfig``.

    Raises:
        ValueError: If ``agent.greeting_mode`` is not one of ``generated``,
            ``fixed``, ``off`` (or ``null``), or a ``turn`` number cannot be
            converted.
        TypeError: If a section other than ``turn`` contains a key its
            dataclass does not define, or a ``turn`` value has an
            unconvertible type.
    """
    services = _dict(data.get("services"))

    agent = _dict(data.get("agent"))
    if agent.get("greeting") == "":
        agent["greeting"] = None
    if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
        raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")

    stt = _dict(services.get("stt") or services.get("asr"))
    if stt.get("language") == "":
        stt["language"] = None

    turn = _dict(data.get("turn"))
    vad = _dict(turn.get("vad"))
    # Build the defaults once instead of once per field.
    turn_defaults = TurnConfig()

    return EngineConfig(
        server=ServerConfig(**_dict(data.get("server"))),
        audio=AudioConfig(**_dict(data.get("audio"))),
        session=SessionConfig(**_dict(data.get("session"))),
        turn=TurnConfig(
            vad=VADConfig(**vad),
            user_speech_timeout_sec=float(
                _turn_value(
                    turn,
                    "user_speech_timeout_sec",
                    turn_defaults.user_speech_timeout_sec,
                )
            ),
            interruption_min_chars=int(
                _turn_value(
                    turn,
                    "interruption_min_chars",
                    turn_defaults.interruption_min_chars,
                )
            ),
            # Kept as a plain bool() coercion: null already worked here
            # (as False), so its meaning is left unchanged.
            interruption_use_interim=bool(
                turn.get(
                    "interruption_use_interim",
                    turn_defaults.interruption_use_interim,
                )
            ),
            # list() copies, so the result never aliases the caller's list.
            interruption_short_replies=list(
                _turn_value(
                    turn,
                    "interruption_short_replies",
                    turn_defaults.interruption_short_replies,
                )
            ),
        ),
        agent=AgentConfig(**agent),
        services=ServicesConfig(
            llm=LLMConfig(**_dict(services.get("llm"))),
            stt=STTConfig(**stt),
            tts=TTSConfig(**_dict(services.get("tts"))),
        ),
    )


def _dict(value: object) -> dict:
    """Return a shallow copy of ``value`` if it is a dict, else an empty dict."""
    return dict(value) if isinstance(value, dict) else {}