- Added idle prompt timeout, maximum count, and text to multiple voice configuration files to improve user interaction during idle periods. - Updated greeting mode to 'fastgpt_opener' in relevant configurations for a more dynamic greeting experience. - Introduced a new voice configuration file for xfyun TTS, including detailed service settings and parameters. - Refactored the pipeline to handle idle prompts and user turn events, ensuring smoother interaction flow. - Adjusted the VAD and turn configurations to accommodate new idle prompt features.
303 lines
9.2 KiB
Python
303 lines
9.2 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
# Runs at import time, before any os.getenv() lookup in this module, so values
# from a .env file (python-dotenv) are visible when VOICE_CONFIG is read.
load_dotenv()
|
|
|
|
# Directory three levels above this file's resolved location; relative config
# paths in this module are anchored here.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
|
# Fallback voice config location used when VOICE_CONFIG is unset or blank;
# it is relative, so it gets joined onto PROJECT_ROOT.
DEFAULT_VOICE_CONFIG_REL = "config/voice.json"
|
|
|
|
|
|
def resolve_voice_config_path() -> Path:
    """Compute the path of the voice configuration file.

    The ``VOICE_CONFIG`` environment variable is consulted first. When it is
    unset, empty, or whitespace-only, ``DEFAULT_VOICE_CONFIG_REL`` is used
    instead. The file's existence is not checked.

    Returns:
        The chosen path unchanged if it is absolute, otherwise the path
        joined onto ``PROJECT_ROOT``.
    """
    raw = os.getenv("VOICE_CONFIG", "").strip()
    candidate = Path(raw or DEFAULT_VOICE_CONFIG_REL)
    return candidate if candidate.is_absolute() else PROJECT_ROOT / candidate
|
|
|
|
|
|
# Snapshot of the resolved voice config path taken once at import time; later
# changes to the VOICE_CONFIG environment variable do not update this constant.
DEFAULT_VOICE_CONFIG = resolve_voice_config_path()
|
|
|
|
# Canonical LLM provider names; used to build the "must be one of" error text
# when an unknown provider is configured.
SUPPORTED_LLM_PROVIDERS = frozenset({"openai", "fastgpt"})
|
|
# Accepted provider spellings -> canonical provider name. "llm" is an alias
# for "openai"; the canonical names map to themselves.
_LLM_PROVIDER_ALIASES = {"llm": "openai", "openai": "openai", "fastgpt": "fastgpt"}
|
|
|
|
|
|
@dataclass(frozen=True)
class ServerConfig:
    """Immutable settings built from the ``server`` section of the config."""

    # Default host is the all-interfaces IPv4 address.
    host: str = "0.0.0.0"
    port: int = 8000
    # default_factory gives every instance its own empty list.
    cors_origins: list[str] = field(default_factory=list)
    # NOTE(review): presumably toggles serving a demo page mounted at
    # ``webpage_mount`` -- confirm against the server module.
    serve_webpage: bool = True
    webpage_mount: str = "/voice-demo"
|
|
|
|
|
|
@dataclass(frozen=True)
class AudioConfig:
    """Immutable settings built from the ``audio`` section of the config."""

    sample_rate_hz: int = 16000
    channels: int = 1
    frame_ms: int = 20

    @property
    def frame_bytes(self) -> int:
        """Return the size in bytes of one audio frame.

        The per-channel sample count is ``sample_rate_hz * frame_ms / 1000``
        truncated to an integer; it is then multiplied by ``channels`` and
        by 2.
        """
        samples_per_channel = int(self.sample_rate_hz * self.frame_ms / 1000)
        # NOTE(review): the constant 2 is presumably bytes per 16-bit sample
        # -- confirm against the audio pipeline.
        return samples_per_channel * self.channels * 2
|
|
|
|
|
|
@dataclass(frozen=True)
class SessionConfig:
    """Immutable settings built from the ``session`` section of the config."""

    # Unit is seconds, per the field name.
    # NOTE(review): presumably the idle period after which a session is
    # closed -- confirm against the session handling code.
    inactivity_timeout_sec: int = 60
|
|
|
|
|
|
@dataclass(frozen=True)
class VADConfig:
    """Immutable settings built from the ``turn.vad`` section of the config.

    NOTE(review): the fields look like voice-activity-detection thresholds
    (confidence, start/stop durations in seconds, minimum volume) -- confirm
    their exact meaning against the VAD component that consumes them.
    """

    confidence: float = 0.7
    start_secs: float = 0.2
    stop_secs: float = 0.6
    min_volume: float = 0.6
|
|
|
|
|
|
@dataclass(frozen=True)
class TurnConfig:
    """Immutable settings built from the ``turn`` section of the config."""

    vad: VADConfig = field(default_factory=VADConfig)
    user_speech_timeout_sec: float = 1.0

    # Idle-prompt settings.
    # NOTE(review): a timeout of 0.0 presumably disables idle prompts and
    # ``idle_prompt_max_count`` presumably caps how many are spoken --
    # confirm against the pipeline that consumes these values.
    idle_prompt_timeout_sec: float = 0.0
    idle_prompt_max_count: int = 1
    # Two adjacent literals joined into one string at compile time.
    idle_prompt_text: str = (
        "我先停在这里。你可以继续说你的想法,"
        "或者让我根据刚才的内容帮你整理下一步。"
    )

    # Interruption settings.
    # NOTE(review): semantics (minimum transcript length, use of interim STT
    # results, short replies treated specially) are inferred from the names
    # only -- confirm against the interruption logic.
    interruption_min_chars: int = 3
    interruption_use_interim: bool = True
    # The lambda builds a fresh list for every instance.
    interruption_short_replies: list[str] = field(
        default_factory=lambda: [
            "是", "是的", "对", "对的", "嗯",
            "好", "好的", "行", "可以", "没问题",
            "不是", "不", "不行", "不用", "不要", "没有", "否",
            "no", "yes", "ok", "okay",
        ]
    )
|
|
|
|
|
|
@dataclass(frozen=True)
class ResponseStateConfig:
    """Immutable settings built from ``agent.response_state`` in the config.

    The dataclass itself performs no validation; ``config_from_dict`` rejects
    an empty ``tag``, an empty ``event_type`` and ``max_prefix_chars`` below 1.
    """

    enabled: bool = False
    tag: str = "state"
    event_type: str = "response.state"
    max_prefix_chars: int = 256
|
|
|
|
|
|
@dataclass(frozen=True)
class AgentConfig:
    """Immutable settings built from the ``agent`` section of the config."""

    system_prompt: str = "You are a helpful, friendly voice assistant."
    # ``config_from_dict`` turns an empty-string greeting into None.
    greeting: str | None = None
    # ``config_from_dict`` accepts: generated, fixed, off, fastgpt_opener.
    greeting_mode: str = "generated"
    response_state: ResponseStateConfig = field(default_factory=ResponseStateConfig)
|
|
|
|
|
|
@dataclass(frozen=True)
class LLMConfig:
    """Immutable settings built from the ``services.llm`` section of the config."""

    provider: str = "openai"
    api_key: str = ""
    base_url: str | None = None
    model: str = "gpt-4o-mini"
    app_id: str | None = None
    temperature: float | None = 0.7
    chat_id: str | None = None
    # default_factory gives every instance its own empty dict.
    variables: dict[str, str] = field(default_factory=dict)
    detail: bool = False
    timeout_sec: float = 60.0
    send_system_prompt: bool = False

    @property
    def is_fastgpt(self) -> bool:
        """Whether ``provider`` equals ``"fastgpt"``."""
        return self.provider == "fastgpt"

    @property
    def is_openai(self) -> bool:
        """Whether ``provider`` equals ``"openai"``."""
        return self.provider == "openai"

    @property
    def uses_local_context_history(self) -> bool:
        """Whether the pipeline should seed and maintain local LLM context history.

        Always true for non-FastGPT providers; for FastGPT the answer is the
        value of ``send_system_prompt``.
        """
        if not self.is_fastgpt:
            return True
        return self.send_system_prompt
|
|
|
|
|
|
@dataclass(frozen=True)
class STTConfig:
    """Immutable settings built from ``services.stt`` (or ``services.asr``).

    NOTE(review): ``app_id``/``api_secret``/``domain``/``accent``/``encoding``/
    ``frame_size``/``dynamic_correction`` look provider-specific (not used by
    every STT backend) -- confirm which provider reads which field.
    """

    provider: str = "openai"

    # Credentials and endpoint.
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None

    # Recognition parameters.
    model: str = "gpt-4o-mini-transcribe"
    # ``config_from_dict`` turns an empty-string language into None.
    language: str | None = "en"
    domain: str = "iat"
    accent: str = "mandarin"
    encoding: str = "raw"
    frame_size: int = 1280
    timeout_sec: float = 10.0
    dynamic_correction: bool = False
|
|
|
|
|
|
@dataclass(frozen=True)
class TTSConfig:
    """Immutable settings built from the ``services.tts`` section of the config.

    NOTE(review): ``app_id``/``api_secret``/``aue``/``tte``/``oral_level`` look
    provider-specific (not used by every TTS backend) -- confirm which
    provider reads which field and the valid range of speed/volume/pitch.
    """

    provider: str = "openai"

    # Credentials and endpoint.
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""
    base_url: str | None = None

    # Synthesis parameters.
    model: str = "gpt-4o-mini-tts"
    voice: str = "alloy"
    aue: str = "raw"
    tte: str = "UTF8"
    speed: int = 50
    volume: int = 50
    pitch: int = 50
    timeout_sec: float = 30.0
    source_sample_rate_hz: int | None = None
    oral_level: str = "mid"
    text_aggregation_mode: str | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
class ServicesConfig:
    """Immutable bundle of the LLM, STT and TTS service settings.

    Each field defaults to a freshly constructed default instance of its
    config class.
    """

    llm: LLMConfig = field(default_factory=LLMConfig)
    stt: STTConfig = field(default_factory=STTConfig)
    tts: TTSConfig = field(default_factory=TTSConfig)
|
|
|
|
|
|
@dataclass(frozen=True)
class EngineConfig:
    """Immutable root configuration object returned by the loaders.

    One field per top-level config section; each defaults to a freshly
    constructed default instance of its section class.
    """

    server: ServerConfig = field(default_factory=ServerConfig)
    audio: AudioConfig = field(default_factory=AudioConfig)
    session: SessionConfig = field(default_factory=SessionConfig)
    turn: TurnConfig = field(default_factory=TurnConfig)
    agent: AgentConfig = field(default_factory=AgentConfig)
    services: ServicesConfig = field(default_factory=ServicesConfig)
|
|
|
|
|
|
def load_config(path: str | Path | None = None) -> EngineConfig:
    """Read a JSON config file and build an ``EngineConfig`` from it.

    Args:
        path: Location of the config file. ``None`` defers to
            ``resolve_voice_config_path()``. A relative path is joined onto
            ``PROJECT_ROOT``.

    Returns:
        The result of ``config_from_dict`` applied to the parsed JSON object.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
            Errors from reading the file, parsing the JSON, or from
            ``config_from_dict`` propagate unchanged.
    """
    if path is None:
        target = resolve_voice_config_path()
    else:
        target = Path(path)
    if not target.is_absolute():
        target = PROJECT_ROOT / target

    raw_text = target.read_text(encoding="utf-8")
    payload = json.loads(raw_text)
    if isinstance(payload, dict):
        return config_from_dict(payload)
    raise ValueError(f"Config file must contain a JSON object: {target}")
|
|
|
|
|
|
def config_from_dict(data: dict) -> EngineConfig:
    """Build an ``EngineConfig`` from an already-parsed config mapping.

    Any section that is missing or is not a dict is treated as empty, so the
    corresponding dataclass defaults apply. Empty strings for
    ``agent.greeting``, ``services.stt.language``, ``services.llm.chat_id``
    and ``services.llm.app_id`` are normalised to ``None``. ``services.asr``
    is used when ``services.stt`` is missing or falsy. A non-dict
    ``services.llm.variables`` is replaced by an empty dict.

    Args:
        data: Parsed configuration with optional ``server``, ``audio``,
            ``session``, ``turn``, ``agent`` and ``services`` sections.

    Returns:
        The assembled, frozen ``EngineConfig``.

    Raises:
        ValueError: If ``agent.greeting_mode`` is not an accepted value, if
            ``agent.response_state`` has ``max_prefix_chars`` below 1 or an
            empty ``tag``/``event_type``, if ``services.llm.provider`` is not
            recognised, or if ``greeting_mode`` is ``fastgpt_opener`` while
            the provider is not ``fastgpt``.
        TypeError: If a section other than ``turn`` contains a key its
            dataclass does not declare (raised by the dataclass constructor).
    """
    services = _dict(data.get("services"))

    # --- agent -----------------------------------------------------------
    agent = _dict(data.get("agent"))
    if agent.get("greeting") == "":
        agent["greeting"] = None
    if agent.get("greeting_mode") not in (None, "generated", "fixed", "off", "fastgpt_opener"):
        raise ValueError(
            "agent.greeting_mode must be one of: generated, fixed, off, fastgpt_opener"
        )
    # The section is optional, so pop with a default instead of raising
    # KeyError. It still has to be removed from `agent`, otherwise
    # AgentConfig(**agent, response_state=...) would receive it twice.
    response_state = ResponseStateConfig(**_dict(agent.pop("response_state", None)))
    if response_state.max_prefix_chars < 1:
        raise ValueError("agent.response_state.max_prefix_chars must be greater than 0")
    if not response_state.tag:
        raise ValueError("agent.response_state.tag must not be empty")
    if not response_state.event_type:
        raise ValueError("agent.response_state.event_type must not be empty")

    # --- speech-to-text ("asr" is accepted as a fallback key) -------------
    stt = _dict(services.get("stt") or services.get("asr"))
    if stt.get("language") == "":
        stt["language"] = None

    # --- LLM ---------------------------------------------------------------
    llm = _dict(services.get("llm"))
    llm["provider"] = _normalize_llm_provider(llm.get("provider", LLMConfig().provider))
    if llm.get("chat_id") == "":
        llm["chat_id"] = None
    if llm.get("app_id") == "":
        llm["app_id"] = None
    if not isinstance(llm.get("variables"), dict):
        llm["variables"] = {}
    if agent.get("greeting_mode") == "fastgpt_opener" and llm["provider"] != "fastgpt":
        raise ValueError(
            "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'"
        )

    # --- turn --------------------------------------------------------------
    turn = _dict(data.get("turn"))
    vad = _dict(turn.get("vad"))
    # One default instance supplies every fallback value below. Turn fields
    # are picked by name, so unknown keys in the section are ignored.
    turn_defaults = TurnConfig()

    return EngineConfig(
        server=ServerConfig(**_dict(data.get("server"))),
        audio=AudioConfig(**_dict(data.get("audio"))),
        session=SessionConfig(**_dict(data.get("session"))),
        turn=TurnConfig(
            vad=VADConfig(**vad),
            user_speech_timeout_sec=float(
                turn.get("user_speech_timeout_sec", turn_defaults.user_speech_timeout_sec)
            ),
            idle_prompt_timeout_sec=float(
                turn.get("idle_prompt_timeout_sec", turn_defaults.idle_prompt_timeout_sec)
            ),
            idle_prompt_max_count=int(
                turn.get("idle_prompt_max_count", turn_defaults.idle_prompt_max_count)
            ),
            idle_prompt_text=str(
                turn.get("idle_prompt_text", turn_defaults.idle_prompt_text)
            ),
            interruption_min_chars=int(
                turn.get("interruption_min_chars", turn_defaults.interruption_min_chars)
            ),
            interruption_use_interim=bool(
                turn.get("interruption_use_interim", turn_defaults.interruption_use_interim)
            ),
            interruption_short_replies=list(
                turn.get(
                    "interruption_short_replies",
                    turn_defaults.interruption_short_replies,
                )
            ),
        ),
        agent=AgentConfig(**agent, response_state=response_state),
        services=ServicesConfig(
            llm=LLMConfig(**llm),
            stt=STTConfig(**stt),
            tts=TTSConfig(**_dict(services.get("tts"))),
        ),
    )
|
|
|
|
|
|
def _dict(value: object) -> dict:
|
|
return dict(value) if isinstance(value, dict) else {}
|
|
|
|
|
|
def _normalize_llm_provider(value: object) -> str:
    """Map a configured LLM provider name onto its canonical form.

    A falsy *value* is replaced by the default ``LLMConfig().provider``. The
    value is then stringified, stripped, lower-cased and looked up in
    ``_LLM_PROVIDER_ALIASES``.

    Args:
        value: Raw provider value taken from the config.

    Returns:
        The canonical provider name.

    Raises:
        ValueError: If the normalised name has no entry in the alias table.
    """
    key = str(value or LLMConfig().provider).strip().lower()
    canonical = _LLM_PROVIDER_ALIASES.get(key)
    if canonical is not None:
        return canonical

    # The error text lists the canonical names plus the "llm" alias.
    accepted = ", ".join(sorted(SUPPORTED_LLM_PROVIDERS.union({"llm"})))
    raise ValueError(
        f"services.llm.provider must be one of: {accepted}; got {value!r}"
    )
|