Merge branch 'engine-v3'

This commit is contained in:
Xin Wang
2026-03-11 11:42:29 +08:00
174 changed files with 17997 additions and 9024 deletions

5
.gitignore vendored
View File

@@ -1,6 +1,3 @@
# OS artifacts
.DS_Store
Thumbs.db
# Workspace runtime data
data/
Thumbs.db

View File

@@ -127,11 +127,13 @@ class Assistant(Base):
speed: Mapped[float] = mapped_column(Float, default=1.0)
hotwords: Mapped[dict] = mapped_column(JSON, default=list)
tools: Mapped[dict] = mapped_column(JSON, default=list)
asr_interim_enabled: Mapped[bool] = mapped_column(default=False)
bot_cannot_be_interrupted: Mapped[bool] = mapped_column(default=False)
interruption_sensitivity: Mapped[int] = mapped_column(Integer, default=500)
config_mode: Mapped[str] = mapped_column(String(32), default="platform")
api_url: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
api_key: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
app_id: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
# 模型关联
llm_model_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
asr_model_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)

View File

@@ -1,6 +1,14 @@
import asyncio
import base64
import io
import json
import os
import sys
import threading
import time
from typing import List, Optional
import wave
from array import array
from typing import Any, Dict, List, Optional, Tuple
import httpx
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
@@ -17,6 +25,32 @@ from ..schemas import (
router = APIRouter(prefix="/asr", tags=["ASR Models"])
OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
DASHSCOPE_DEFAULT_ASR_MODEL = "qwen3-asr-flash-realtime"
DASHSCOPE_DEFAULT_BASE_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
# The DashScope realtime SDK is an optional dependency. When it cannot be
# imported, every SDK symbol is replaced by a placeholder so this module still
# loads, and the failure reason is kept for later error messages.
try:
    import dashscope
    from dashscope.audio.qwen_omni import MultiModality, OmniRealtimeCallback, OmniRealtimeConversation

    try:
        from dashscope.audio.qwen_omni import TranscriptionParams
    except ImportError:
        # Fall back to the submodule path when the package root does not export it.
        from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams
except Exception as exc:
    DASHSCOPE_SDK_AVAILABLE = False
    DASHSCOPE_IMPORT_ERROR = f"{type(exc).__name__}: {exc}"
    dashscope = None  # type: ignore[assignment]
    MultiModality = None  # type: ignore[assignment]
    OmniRealtimeConversation = None  # type: ignore[assignment]
    TranscriptionParams = None  # type: ignore[assignment]

    class OmniRealtimeCallback:  # type: ignore[no-redef]
        """Fallback callback base when DashScope SDK is unavailable."""

        pass
else:
    DASHSCOPE_SDK_AVAILABLE = True
    DASHSCOPE_IMPORT_ERROR = ""
def _is_openai_compatible_vendor(vendor: str) -> bool:
@@ -29,12 +63,377 @@ def _is_openai_compatible_vendor(vendor: str) -> bool:
}
def _is_dashscope_vendor(vendor: str) -> bool:
return (vendor or "").strip().lower() == "dashscope"
def _default_asr_model(vendor: str) -> str:
    """Pick the model name to use when an ASR model record does not specify one.

    OpenAI-compatible vendors are checked first, then DashScope; any other
    vendor falls back to ``"whisper-1"``.
    """
    if _is_openai_compatible_vendor(vendor):
        return OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL
    return DASHSCOPE_DEFAULT_ASR_MODEL if _is_dashscope_vendor(vendor) else "whisper-1"
def _dashscope_language(language: Optional[str]) -> Optional[str]:
normalized = (language or "").strip().lower()
if not normalized or normalized in {"multi-lingual", "multilingual", "multi_lingual", "auto"}:
return None
if normalized.startswith("zh"):
return "zh"
if normalized.startswith("en"):
return "en"
return normalized
class _DashScopePreviewCallback(OmniRealtimeCallback):
    """Collect DashScope ASR websocket events for preview/test flows.

    The ``on_*`` hooks record what happened (socket opened, session ready,
    transcript text, error) and the ``wait_for_*`` / ``read_text`` /
    ``raise_if_error`` methods let the calling code block on and inspect that
    state. Signalling uses ``threading.Event`` objects; transcript text is
    guarded by a lock.
    """

    def __init__(self) -> None:
        super().__init__()
        self._lock = threading.Lock()
        self._open_event = threading.Event()
        self._session_ready_event = threading.Event()
        self._done_event = threading.Event()
        self._final_text = ""
        self._last_interim_text = ""
        self._error_message: Optional[str] = None

    def _record_failure(self, message: str) -> None:
        """Store *message* as the error and release every waiter."""
        self._error_message = message
        self._done_event.set()
        # Also unblock wait_for_session_ready() so setup does not sit out its timeout.
        self._session_ready_event.set()

    def on_open(self) -> None:
        """Mark the websocket as opened."""
        self._open_event.set()

    def on_close(self, code: int, reason: str) -> None:
        """Treat a close that arrives before completion as an error."""
        if not self._done_event.is_set():
            self._record_failure(f"DashScope websocket closed unexpectedly: {code} {reason}")

    def on_error(self, message: Any) -> None:
        """Record an SDK-reported error."""
        self._record_failure(str(message))

    def on_event(self, response: Any) -> None:
        """Dispatch one server event by its ``type`` field; unknown types are ignored."""
        payload = _coerce_dashscope_event(response)
        event_type = str(payload.get("type") or "").strip()
        if not event_type:
            return

        if event_type in ("session.created", "session.updated"):
            self._session_ready_event.set()
        elif event_type == "error" or event_type.endswith(".failed"):
            self._record_failure(_format_dashscope_error_event(payload))
        elif event_type == "conversation.item.input_audio_transcription.text":
            # Interim result: prefer the "stash" field, then "text", then "transcript".
            interim = _extract_dashscope_text(payload, keys=("stash", "text", "transcript"))
            if interim:
                with self._lock:
                    self._last_interim_text = interim
        elif event_type == "conversation.item.input_audio_transcription.completed":
            # Final result: prefer "transcript"; completion is signalled even when empty.
            final = _extract_dashscope_text(payload, keys=("transcript", "text", "stash"))
            if final:
                with self._lock:
                    self._final_text = final
            self._done_event.set()
        elif event_type in ("response.done", "session.finished"):
            self._done_event.set()

    def wait_for_open(self, timeout: float = 10.0) -> None:
        """Block until the socket opens; raise ``TimeoutError`` after *timeout* seconds."""
        opened = self._open_event.wait(timeout)
        if not opened:
            raise TimeoutError("DashScope websocket open timeout")

    def wait_for_session_ready(self, timeout: float = 6.0) -> bool:
        """Wait for a session event (or a failure); return False on timeout."""
        return self._session_ready_event.wait(timeout)

    def wait_for_done(self, timeout: float = 20.0) -> None:
        """Block until transcription finishes or fails; raise ``TimeoutError`` on timeout."""
        finished = self._done_event.wait(timeout)
        if not finished:
            raise TimeoutError("DashScope transcription timeout")

    def raise_if_error(self) -> None:
        """Raise ``RuntimeError`` carrying the recorded error message, if any."""
        if self._error_message:
            raise RuntimeError(self._error_message)

    def read_text(self) -> str:
        """Return the final transcript, falling back to the last interim text."""
        with self._lock:
            return self._final_text or self._last_interim_text
def _coerce_dashscope_event(response: Any) -> Dict[str, Any]:
if isinstance(response, dict):
return response
if isinstance(response, str):
try:
parsed = json.loads(response)
if isinstance(parsed, dict):
return parsed
except json.JSONDecodeError:
pass
return {"type": "raw", "message": str(response)}
def _format_dashscope_error_event(payload: Dict[str, Any]) -> str:
error = payload.get("error")
if isinstance(error, dict):
code = str(error.get("code") or "").strip()
message = str(error.get("message") or "").strip()
if code and message:
return f"{code}: {message}"
return message or str(error)
return str(error or "DashScope realtime ASR error")
def _extract_dashscope_text(payload: Dict[str, Any], *, keys: Tuple[str, ...]) -> str:
for key in keys:
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
if isinstance(value, dict):
nested = _extract_dashscope_text(value, keys=keys)
if nested:
return nested
for value in payload.values():
if isinstance(value, dict):
nested = _extract_dashscope_text(value, keys=keys)
if nested:
return nested
return ""
def _create_dashscope_realtime_client(
    *,
    model: str,
    callback: _DashScopePreviewCallback,
    url: str,
    api_key: str,
) -> Any:
    """Instantiate the SDK's realtime conversation client.

    The constructor is first called with ``api_key``. If that raises a
    ``TypeError`` whose message mentions ``api_key``, it is retried without
    that argument. Any other ``TypeError`` propagates.

    Raises:
        RuntimeError: when the SDK class is unavailable.
    """
    if OmniRealtimeConversation is None:
        raise RuntimeError("DashScope SDK unavailable")

    base_kwargs = {
        "model": model,
        "callback": callback,
        "url": url,
    }
    try:
        return OmniRealtimeConversation(api_key=api_key, **base_kwargs)  # type: ignore[misc]
    except TypeError as exc:
        api_key_rejected = "api_key" in str(exc)
        if not api_key_rejected:
            raise
    return OmniRealtimeConversation(**base_kwargs)  # type: ignore[misc]
def _close_dashscope_client(client: Any) -> None:
finish_fn = getattr(client, "finish", None)
if callable(finish_fn):
try:
finish_fn()
except Exception:
pass
close_fn = getattr(client, "close", None)
if callable(close_fn):
try:
close_fn()
except Exception:
pass
def _configure_dashscope_session(
    *,
    client: Any,
    callback: _DashScopePreviewCallback,
    sample_rate: int,
    language: Optional[str],
) -> None:
    """Send ``session.update`` to put the realtime session into transcription mode.

    Up to three argument sets are tried, from richest to minimal, and the
    function returns after the first one that neither raises nor leaves an
    error on *callback*. The result of ``wait_for_session_ready()`` is not
    checked, so a timeout there counts as success.

    Raises:
        RuntimeError: if the client has no ``update_session`` method, or if
            every attempt failed (the last error is included in the message).
    """
    update_session = getattr(client, "update_session", None)
    if not callable(update_session):
        raise RuntimeError("DashScope ASR SDK missing update_session method")

    # Use the SDK enum member when it exists, otherwise the plain string.
    modality: Any = "text"
    if MultiModality is not None and hasattr(MultiModality, "TEXT"):
        modality = MultiModality.TEXT

    language_hint = _dashscope_language(language)
    transcription_params: Optional[Any] = None
    if TranscriptionParams is not None:
        params_kwargs: Dict[str, Any] = {
            "sample_rate": sample_rate,
            "input_audio_format": "pcm",
        }
        if language_hint:
            params_kwargs["language"] = language_hint
        try:
            transcription_params = TranscriptionParams(**params_kwargs)
        except Exception:
            # Unsupported constructor arguments: proceed without explicit params.
            transcription_params = None

    attempts = [
        {
            "output_modalities": [modality],
            "enable_turn_detection": False,
            "enable_input_audio_transcription": True,
            "transcription_params": transcription_params,
        },
        {
            "output_modalities": [modality],
            "enable_turn_detection": False,
            "enable_input_audio_transcription": True,
        },
        {
            "output_modalities": [modality],
        },
    ]

    last_error: Optional[Exception] = None
    for attempt in attempts:
        # Never pass transcription_params=None to the SDK.
        if attempt.get("transcription_params") is None:
            attempt = {name: value for name, value in attempt.items() if name != "transcription_params"}
        try:
            update_session(**attempt)
            callback.wait_for_session_ready()
            callback.raise_if_error()
        except Exception as exc:
            # Covers signature mismatches (TypeError) and server-side rejections alike.
            last_error = exc
            continue
        return

    raise RuntimeError(f"DashScope ASR session.update failed: {last_error}")
def _load_wav_pcm16_mono(audio_bytes: bytes) -> Tuple[bytes, int]:
try:
with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
channel_count = wav_file.getnchannels()
sample_width = wav_file.getsampwidth()
sample_rate = wav_file.getframerate()
compression = wav_file.getcomptype()
pcm_frames = wav_file.readframes(wav_file.getnframes())
except wave.Error as exc:
raise RuntimeError("DashScope preview currently supports WAV audio. Record in browser or upload a .wav file.") from exc
if compression != "NONE":
raise RuntimeError("DashScope preview requires uncompressed PCM WAV audio.")
if sample_width != 2:
raise RuntimeError("DashScope preview requires 16-bit PCM WAV audio.")
if not pcm_frames:
raise RuntimeError("Uploaded WAV file is empty")
if channel_count <= 1:
return pcm_frames, sample_rate
samples = array("h")
samples.frombytes(pcm_frames)
if sys.byteorder == "big":
samples.byteswap()
mono_samples = array(
"h",
(
int(sum(samples[index:index + channel_count]) / channel_count)
for index in range(0, len(samples), channel_count)
),
)
if sys.byteorder == "big":
mono_samples.byteswap()
return mono_samples.tobytes(), sample_rate
def _probe_dashscope_asr_connection(*, api_key: str, base_url: str, model: str, language: Optional[str]) -> None:
    """Check that a DashScope realtime ASR session can be opened and configured.

    Connects, waits for the websocket to open, applies the session
    configuration at a 16 kHz sample rate, and always closes the client. No
    audio is sent. Returning normally means the probe succeeded.

    Raises:
        RuntimeError: if the SDK is not importable or session setup fails.
        TimeoutError: if the websocket does not open in time.
    """
    if not DASHSCOPE_SDK_AVAILABLE:
        # The requirement is quoted so that copying the command into a shell
        # does not interpret ">" as an output redirection.
        hint = f'`{sys.executable} -m pip install "dashscope>=1.25.11"`'
        detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else ""
        raise RuntimeError(f"dashscope package not installed; install with {hint}{detail}")

    callback = _DashScopePreviewCallback()
    if dashscope is not None:
        # NOTE(review): this sets a module-wide SDK attribute — presumably a
        # fallback for SDK versions that ignore the constructor api_key; confirm.
        dashscope.api_key = api_key
    client = _create_dashscope_realtime_client(
        model=model,
        callback=callback,
        url=base_url,
        api_key=api_key,
    )

    try:
        client.connect()
        callback.wait_for_open()
        _configure_dashscope_session(
            client=client,
            callback=callback,
            sample_rate=16000,
            language=language,
        )
    finally:
        _close_dashscope_client(client)
def _transcribe_dashscope_preview(
    *,
    audio_bytes: bytes,
    api_key: str,
    base_url: str,
    model: str,
    language: Optional[str],
) -> Dict[str, Any]:
    """Transcribe one uploaded WAV clip through a DashScope realtime session.

    The WAV is decoded to mono 16-bit PCM, a session is opened and configured
    at the clip's sample rate, the whole clip is appended as a single base64
    chunk and committed, and the function blocks until the callback reports
    completion. The client is always closed.

    Returns:
        A dict with ``transcript`` (final text, or the last interim text),
        ``language`` (the normalized hint, or ``"Multi-lingual"`` when there is
        none) and ``confidence`` (always ``None``).

    Raises:
        RuntimeError: if the SDK is missing, the audio is not usable WAV, the
            SDK client lacks a required method, or the service reports an error.
        TimeoutError: if the socket does not open or transcription does not
            finish in time.
    """
    if not DASHSCOPE_SDK_AVAILABLE:
        # The requirement is quoted so that copying the command into a shell
        # does not interpret ">" as an output redirection.
        hint = f'`{sys.executable} -m pip install "dashscope>=1.25.11"`'
        detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else ""
        raise RuntimeError(f"dashscope package not installed; install with {hint}{detail}")

    pcm_audio, sample_rate = _load_wav_pcm16_mono(audio_bytes)
    callback = _DashScopePreviewCallback()
    if dashscope is not None:
        # NOTE(review): this sets a module-wide SDK attribute — presumably a
        # fallback for SDK versions that ignore the constructor api_key; confirm.
        dashscope.api_key = api_key
    client = _create_dashscope_realtime_client(
        model=model,
        callback=callback,
        url=base_url,
        api_key=api_key,
    )

    try:
        client.connect()
        callback.wait_for_open()
        _configure_dashscope_session(
            client=client,
            callback=callback,
            sample_rate=sample_rate,
            language=language,
        )

        # Look the methods up defensively so a missing one gives a clear error.
        append_fn = getattr(client, "append_audio", None)
        if not callable(append_fn):
            raise RuntimeError("DashScope ASR SDK missing append_audio method")
        commit_fn = getattr(client, "commit", None)
        if not callable(commit_fn):
            raise RuntimeError("DashScope ASR SDK missing commit method")

        append_fn(base64.b64encode(pcm_audio).decode("ascii"))
        commit_fn()
        callback.wait_for_done()
        callback.raise_if_error()
        return {
            "transcript": callback.read_text(),
            "language": _dashscope_language(language) or "Multi-lingual",
            "confidence": None,
        }
    finally:
        _close_dashscope_client(client)
# ============ ASR Models CRUD ============
@router.get("")
def list_asr_models(
@@ -132,6 +531,27 @@ def test_asr_model(
start_time = time.time()
try:
if _is_dashscope_vendor(model.vendor):
effective_api_key = (model.api_key or "").strip() or os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("ASR_API_KEY", "").strip()
if not effective_api_key:
return ASRTestResponse(success=False, error=f"API key is required for ASR model: {model.name}")
base_url = (model.base_url or "").strip() or DASHSCOPE_DEFAULT_BASE_URL
selected_model = (model.model_name or "").strip() or _default_asr_model(model.vendor)
_probe_dashscope_asr_connection(
api_key=effective_api_key,
base_url=base_url,
model=selected_model,
language=model.language,
)
latency_ms = int((time.time() - start_time) * 1000)
return ASRTestResponse(
success=True,
language=model.language,
latency_ms=latency_ms,
message="DashScope realtime ASR connected",
)
# 连接性测试优先,避免依赖真实音频输入
headers = {"Authorization": f"Bearer {model.api_key}"}
with httpx.Client(timeout=60.0) as client:
@@ -246,7 +666,7 @@ async def preview_asr_model(
api_key: Optional[str] = Form(None),
db: Session = Depends(get_db),
):
"""预览 ASR上传音频并调用 OpenAI-compatible /audio/transcriptions"""
"""预览 ASR根据供应商调用 OpenAI-compatible 或 DashScope 实时识别"""
model = db.query(ASRModel).filter(ASRModel.id == id).first()
if not model:
raise HTTPException(status_code=404, detail="ASR Model not found")
@@ -264,18 +684,50 @@ async def preview_asr_model(
raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
effective_api_key = (api_key or "").strip() or (model.api_key or "").strip()
if not effective_api_key and _is_openai_compatible_vendor(model.vendor):
effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
if not effective_api_key:
if _is_openai_compatible_vendor(model.vendor):
effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
elif _is_dashscope_vendor(model.vendor):
effective_api_key = os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("ASR_API_KEY", "").strip()
if not effective_api_key:
raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}")
base_url = (model.base_url or "").strip().rstrip("/")
if _is_dashscope_vendor(model.vendor) and not base_url:
base_url = DASHSCOPE_DEFAULT_BASE_URL
if not base_url:
raise HTTPException(status_code=400, detail=f"Base URL is required for ASR model: {model.name}")
selected_model = (model.model_name or "").strip() or _default_asr_model(model.vendor)
data = {"model": selected_model}
effective_language = (language or "").strip() or None
start_time = time.time()
if _is_dashscope_vendor(model.vendor):
try:
payload = await asyncio.to_thread(
_transcribe_dashscope_preview,
audio_bytes=audio_bytes,
api_key=effective_api_key,
base_url=base_url,
model=selected_model,
language=effective_language or model.language,
)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"DashScope ASR request failed: {exc}") from exc
transcript = str(payload.get("transcript") or "")
response_language = str(payload.get("language") or effective_language or model.language)
latency_ms = int((time.time() - start_time) * 1000)
return ASRTestResponse(
success=bool(transcript),
transcript=transcript,
language=response_language,
confidence=None,
latency_ms=latency_ms,
message=None if transcript else "No transcript in response",
)
data = {"model": selected_model}
if effective_language:
data["language"] = effective_language
if model.hotwords:
@@ -284,7 +736,6 @@ async def preview_asr_model(
headers = {"Authorization": f"Bearer {effective_api_key}"}
files = {"file": (filename, audio_bytes, content_type)}
start_time = time.time()
try:
with httpx.Client(timeout=90.0) as client:
response = client.post(

View File

@@ -126,6 +126,12 @@ def _ensure_assistant_schema(db: Session) -> None:
if "manual_opener_tool_calls" not in columns:
db.execute(text("ALTER TABLE assistants ADD COLUMN manual_opener_tool_calls JSON"))
altered = True
if "asr_interim_enabled" not in columns:
db.execute(text("ALTER TABLE assistants ADD COLUMN asr_interim_enabled BOOLEAN DEFAULT 0"))
altered = True
if "app_id" not in columns:
db.execute(text("ALTER TABLE assistants ADD COLUMN app_id VARCHAR(255)"))
altered = True
if altered:
db.commit()
@@ -294,7 +300,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
config_mode = str(assistant.config_mode or "platform").strip().lower()
if config_mode in {"dify", "fastgpt"}:
if config_mode == "dify":
metadata["services"]["llm"] = {
"provider": "openai",
"model": "",
@@ -305,6 +311,19 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
warnings.append(f"External LLM API URL is empty for mode: {assistant.config_mode}")
if not (assistant.api_key or "").strip():
warnings.append(f"External LLM API key is empty for mode: {assistant.config_mode}")
elif config_mode == "fastgpt":
metadata["services"]["llm"] = {
"provider": "fastgpt",
"model": "fastgpt",
"apiKey": assistant.api_key,
"baseUrl": assistant.api_url,
}
if (assistant.app_id or "").strip():
metadata["services"]["llm"]["appId"] = assistant.app_id
if not (assistant.api_url or "").strip():
warnings.append(f"FastGPT API URL is empty for mode: {assistant.config_mode}")
if not (assistant.api_key or "").strip():
warnings.append(f"FastGPT API key is empty for mode: {assistant.config_mode}")
elif assistant.llm_model_id:
llm = db.query(LLMModel).filter(LLMModel.id == assistant.llm_model_id).first()
if llm:
@@ -317,18 +336,27 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
else:
warnings.append(f"LLM model not found: {assistant.llm_model_id}")
asr_runtime: Dict[str, Any] = {
"enableInterim": bool(assistant.asr_interim_enabled),
}
if assistant.asr_model_id:
asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first()
if asr:
asr_provider = "openai_compatible" if _is_openai_compatible_vendor(asr.vendor) else "buffered"
metadata["services"]["asr"] = {
if _is_dashscope_vendor(asr.vendor):
asr_provider = "dashscope"
elif _is_openai_compatible_vendor(asr.vendor):
asr_provider = "openai_compatible"
else:
asr_provider = "buffered"
asr_runtime.update({
"provider": asr_provider,
"model": asr.model_name or asr.name,
"apiKey": asr.api_key if asr_provider == "openai_compatible" else None,
"baseUrl": asr.base_url if asr_provider == "openai_compatible" else None,
}
"apiKey": asr.api_key if asr_provider in {"openai_compatible", "dashscope"} else None,
"baseUrl": asr.base_url if asr_provider in {"openai_compatible", "dashscope"} else None,
})
else:
warnings.append(f"ASR model not found: {assistant.asr_model_id}")
metadata["services"]["asr"] = asr_runtime
if not assistant.voice_output_enabled:
metadata["services"]["tts"] = {"enabled": False}
@@ -432,11 +460,13 @@ def assistant_to_dict(assistant: Assistant) -> dict:
"speed": assistant.speed,
"hotwords": assistant.hotwords or [],
"tools": _normalize_assistant_tool_ids(assistant.tools),
"asrInterimEnabled": bool(assistant.asr_interim_enabled),
"botCannotBeInterrupted": bool(assistant.bot_cannot_be_interrupted),
"interruptionSensitivity": assistant.interruption_sensitivity,
"configMode": assistant.config_mode,
"apiUrl": assistant.api_url,
"apiKey": assistant.api_key,
"appId": assistant.app_id,
"llmModelId": assistant.llm_model_id,
"asrModelId": assistant.asr_model_id,
"embeddingModelId": assistant.embedding_model_id,
@@ -452,12 +482,14 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None:
"firstTurnMode": "first_turn_mode",
"manualOpenerToolCalls": "manual_opener_tool_calls",
"interruptionSensitivity": "interruption_sensitivity",
"asrInterimEnabled": "asr_interim_enabled",
"botCannotBeInterrupted": "bot_cannot_be_interrupted",
"configMode": "config_mode",
"voiceOutputEnabled": "voice_output_enabled",
"generatedOpenerEnabled": "generated_opener_enabled",
"apiUrl": "api_url",
"apiKey": "api_key",
"appId": "app_id",
"llmModelId": "llm_model_id",
"asrModelId": "asr_model_id",
"embeddingModelId": "embedding_model_id",
@@ -646,11 +678,13 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)):
speed=data.speed,
hotwords=data.hotwords,
tools=_normalize_assistant_tool_ids(data.tools),
asr_interim_enabled=data.asrInterimEnabled,
bot_cannot_be_interrupted=data.botCannotBeInterrupted,
interruption_sensitivity=data.interruptionSensitivity,
config_mode=data.configMode,
api_url=data.apiUrl,
api_key=data.apiKey,
app_id=data.appId,
llm_model_id=data.llmModelId,
asr_model_id=data.asrModelId,
embedding_model_id=data.embeddingModelId,

View File

@@ -191,6 +191,7 @@ class ASRModelCreate(ASRModelBase):
class ASRModelUpdate(BaseModel):
name: Optional[str] = None
vendor: Optional[str] = None
language: Optional[str] = None
base_url: Optional[str] = None
api_key: Optional[str] = None
@@ -291,11 +292,13 @@ class AssistantBase(BaseModel):
speed: float = 1.0
hotwords: List[str] = []
tools: List[str] = []
asrInterimEnabled: bool = False
botCannotBeInterrupted: bool = False
interruptionSensitivity: int = 500
configMode: str = "platform"
apiUrl: Optional[str] = None
apiKey: Optional[str] = None
appId: Optional[str] = None
# 模型关联
llmModelId: Optional[str] = None
asrModelId: Optional[str] = None
@@ -322,11 +325,13 @@ class AssistantUpdate(BaseModel):
speed: Optional[float] = None
hotwords: Optional[List[str]] = None
tools: Optional[List[str]] = None
asrInterimEnabled: Optional[bool] = None
botCannotBeInterrupted: Optional[bool] = None
interruptionSensitivity: Optional[int] = None
configMode: Optional[str] = None
apiUrl: Optional[str] = None
apiKey: Optional[str] = None
appId: Optional[str] = None
llmModelId: Optional[str] = None
asrModelId: Optional[str] = None
embeddingModelId: Optional[str] = None

View File

@@ -34,6 +34,7 @@ SEED_LLM_IDS = {
SEED_ASR_IDS = {
"sensevoice_small": short_id("asr"),
"telespeech_asr": short_id("asr"),
"dashscope_realtime": short_id("asr"),
}
SEED_ASSISTANT_IDS = {
@@ -408,6 +409,20 @@ def init_default_asr_models():
enable_normalization=True,
enabled=True,
),
ASRModel(
id=SEED_ASR_IDS["dashscope_realtime"],
user_id=1,
name="DashScope Realtime ASR",
vendor="DashScope",
language="Multi-lingual",
base_url=DASHSCOPE_REALTIME_URL,
api_key="YOUR_API_KEY",
model_name="qwen3-asr-flash-realtime",
hotwords=[],
enable_punctuation=True,
enable_normalization=True,
enabled=True,
),
]
seed_if_empty(db, ASRModel, asr_models, "✅ 默认ASR模型已初始化")

View File

@@ -1,8 +1,21 @@
"""Tests for ASR Model API endpoints"""
import io
import wave
import pytest
from unittest.mock import patch, MagicMock
def _make_wav_bytes(sample_rate: int = 16000) -> bytes:
with io.BytesIO() as buffer:
with wave.open(buffer, "wb") as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2)
wav_file.setframerate(sample_rate)
wav_file.writeframes(b"\x00\x00" * sample_rate)
return buffer.getvalue()
class TestASRModelAPI:
"""Test cases for ASR Model endpoints"""
@@ -75,6 +88,24 @@ class TestASRModelAPI:
assert data["language"] == "en"
assert data["enable_punctuation"] == False
def test_update_asr_model_vendor(self, client, sample_asr_model_data):
    """Test updating ASR vendor metadata."""
    created = client.post("/api/asr", json=sample_asr_model_data)
    model_id = created.json()["id"]

    changes = {
        "vendor": "DashScope",
        "model_name": "qwen3-asr-flash-realtime",
        "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
    }
    response = client.put(f"/api/asr/{model_id}", json=changes)

    assert response.status_code == 200
    updated = response.json()
    assert updated["vendor"] == "DashScope"
    assert updated["model_name"] == "qwen3-asr-flash-realtime"
def test_delete_asr_model(self, client, sample_asr_model_data):
"""Test deleting an ASR model"""
# Create first
@@ -234,6 +265,28 @@ class TestASRModelAPI:
response = client.post(f"/api/asr/{model_id}/test")
assert response.status_code == 200
def test_test_asr_model_dashscope(self, client, sample_asr_model_data, monkeypatch):
    """Test DashScope ASR connectivity probe."""
    from app.routers import asr as asr_router

    sample_asr_model_data.update({
        "vendor": "DashScope",
        "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
        "model_name": "qwen3-asr-flash-realtime",
    })
    created = client.post("/api/asr", json=sample_asr_model_data)
    model_id = created.json()["id"]

    def fake_probe(**kwargs):
        # Stands in for the real websocket probe and checks what the route passes in.
        assert kwargs["api_key"] == sample_asr_model_data["api_key"]
        assert kwargs["model"] == "qwen3-asr-flash-realtime"

    monkeypatch.setattr(asr_router, "_probe_dashscope_asr_connection", fake_probe)

    response = client.post(f"/api/asr/{model_id}/test")
    assert response.status_code == 200
    result = response.json()
    assert result["success"] is True
    assert result["message"] == "DashScope realtime ASR connected"
@patch('httpx.Client')
def test_test_asr_model_failure(self, mock_client_class, client, sample_asr_model_data):
"""Test testing an ASR model with failed connection"""
@@ -274,7 +327,7 @@ class TestASRModelAPI:
def test_different_asr_vendors(self, client):
"""Test creating ASR models with different vendors"""
vendors = ["SiliconFlow", "OpenAI", "Azure"]
vendors = ["SiliconFlow", "OpenAI", "Azure", "DashScope"]
for vendor in vendors:
data = {
"id": f"asr-vendor-{vendor.lower()}",
@@ -345,3 +398,33 @@ class TestASRModelAPI:
)
assert response.status_code == 400
assert "Only audio files are supported" in response.text
def test_preview_asr_model_dashscope(self, client, sample_asr_model_data, monkeypatch):
    """Test ASR preview endpoint with DashScope realtime helper."""
    from app.routers import asr as asr_router

    sample_asr_model_data.update({
        "vendor": "DashScope",
        "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
        "model_name": "qwen3-asr-flash-realtime",
    })
    created = client.post("/api/asr", json=sample_asr_model_data)
    model_id = created.json()["id"]

    def fake_preview(**kwargs):
        # Stands in for the real transcription helper and checks what the route passes in.
        assert kwargs["base_url"] == sample_asr_model_data["base_url"]
        assert kwargs["model"] == sample_asr_model_data["model_name"]
        return {
            "transcript": "你好,这是实时识别",
            "language": "zh",
            "confidence": None,
        }

    monkeypatch.setattr(asr_router, "_transcribe_dashscope_preview", fake_preview)

    upload = {"file": ("sample.wav", _make_wav_bytes(), "audio/wav")}
    response = client.post(f"/api/asr/{model_id}/preview", files=upload)

    assert response.status_code == 200
    result = response.json()
    assert result["success"] is True
    assert result["transcript"] == "你好,这是实时识别"

View File

@@ -27,7 +27,9 @@ class TestAssistantAPI:
assert data["voiceOutputEnabled"] is True
assert data["firstTurnMode"] == "bot_first"
assert data["generatedOpenerEnabled"] is False
assert data["asrInterimEnabled"] is False
assert data["botCannotBeInterrupted"] is False
assert data["appId"] is None
assert "id" in data
assert data["callCount"] == 0
@@ -37,6 +39,7 @@ class TestAssistantAPI:
response = client.post("/api/assistants", json=data)
assert response.status_code == 200
assert response.json()["name"] == "Minimal Assistant"
assert response.json()["asrInterimEnabled"] is False
def test_get_assistant_by_id(self, client, sample_assistant_data):
"""Test getting a specific assistant by ID"""
@@ -68,6 +71,7 @@ class TestAssistantAPI:
"prompt": "You are an updated assistant.",
"speed": 1.5,
"voiceOutputEnabled": False,
"asrInterimEnabled": True,
"manualOpenerToolCalls": [
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
],
@@ -79,6 +83,7 @@ class TestAssistantAPI:
assert data["prompt"] == "You are an updated assistant."
assert data["speed"] == 1.5
assert data["voiceOutputEnabled"] is False
assert data["asrInterimEnabled"] is True
assert data["manualOpenerToolCalls"] == [
{"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}}
]
@@ -213,6 +218,7 @@ class TestAssistantAPI:
"prompt": "runtime prompt",
"opener": "runtime opener",
"manualOpenerToolCalls": [{"toolName": "text_msg_prompt", "arguments": {"msg": "欢迎"}}],
"asrInterimEnabled": True,
"speed": 1.1,
})
assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
@@ -232,6 +238,7 @@ class TestAssistantAPI:
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
assert metadata["services"]["asr"]["enableInterim"] is True
expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}"
assert metadata["services"]["tts"]["voice"] == expected_tts_voice
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
@@ -309,6 +316,7 @@ class TestAssistantAPI:
assert runtime_resp.status_code == 200
metadata = runtime_resp.json()["sessionStartMetadata"]
assert metadata["output"]["mode"] == "text"
assert metadata["services"]["asr"]["enableInterim"] is False
assert metadata["services"]["tts"]["enabled"] is False
def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data):
@@ -343,6 +351,48 @@ class TestAssistantAPI:
assert tts["apiKey"] == "dashscope-key"
assert tts["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
def test_runtime_config_dashscope_asr_provider(self, client, sample_assistant_data):
    """DashScope ASR models should map to dashscope asr provider in runtime metadata."""
    realtime_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    asr_model = {
        "name": "DashScope Realtime ASR",
        "vendor": "DashScope",
        "language": "zh",
        "base_url": realtime_url,
        "api_key": "dashscope-asr-key",
        "model_name": "qwen3-asr-flash-realtime",
        "hotwords": [],
        "enable_punctuation": True,
        "enable_normalization": True,
        "enabled": True,
    }
    asr_resp = client.post("/api/asr", json=asr_model)
    assert asr_resp.status_code == 200

    # Attach the new ASR model to the assistant before creating it.
    sample_assistant_data.update({"asrModelId": asr_resp.json()["id"]})
    assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
    assert assistant_resp.status_code == 200
    assistant_id = assistant_resp.json()["id"]

    runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
    assert runtime_resp.status_code == 200
    asr = runtime_resp.json()["sessionStartMetadata"]["services"]["asr"]
    assert asr["provider"] == "dashscope"
    assert asr["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    assert asr["enableInterim"] is False
def test_runtime_config_defaults_asr_interim_disabled_without_asr_model(self, client, sample_assistant_data):
    """The runtime ASR section should report enableInterim=False by default."""
    created = client.post("/api/assistants", json=sample_assistant_data)
    assert created.status_code == 200
    assistant_id = created.json()["id"]

    runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
    assert runtime_resp.status_code == 200
    asr = runtime_resp.json()["sessionStartMetadata"]["services"]["asr"]
    assert asr["enableInterim"] is False
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
sample_assistant_data.update({
"firstTurnMode": "user_first",
@@ -370,3 +420,21 @@ class TestAssistantAPI:
assert metadata["greeting"] == ""
assert metadata["bargeIn"]["enabled"] is False
assert metadata["bargeIn"]["minDurationMs"] == 900
def test_fastgpt_app_id_persists_and_flows_to_runtime(self, client, sample_assistant_data):
    """Check that a FastGPT ``appId`` is stored and surfaces in the runtime config.

    An assistant is created in ``fastgpt`` config mode with an ``appId``.
    The create response must echo the ``appId`` back, and the runtime
    config must report the ``fastgpt`` LLM provider together with that id.
    """
    fastgpt_fields = {
        "configMode": "fastgpt",
        "apiUrl": "https://cloud.fastgpt.cn/api",
        "apiKey": "fastgpt-key",
        "appId": "app-fastgpt-123",
    }
    sample_assistant_data.update(fastgpt_fields)

    created = client.post("/api/assistants", json=sample_assistant_data)
    assert created.status_code == 200
    created_body = created.json()
    assistant_id = created_body["id"]
    # The create response itself must reflect the persisted appId.
    assert created_body["appId"] == "app-fastgpt-123"

    runtime = client.get(f"/api/assistants/{assistant_id}/runtime-config")
    assert runtime.status_code == 200
    llm_service = runtime.json()["sessionStartMetadata"]["services"]["llm"]
    assert llm_service["provider"] == "fastgpt"
    assert llm_service["appId"] == "app-fastgpt-123"

View File

@@ -163,4 +163,4 @@
- [自动化测试](autotest.md) - 批量测试助手
- [历史记录](history.md) - 查看对话详情
- [提示词指南](../assistants/prompts.md) - 优化提示词
- [提示词指南](../concepts/assistants/prompts.md) - 优化提示词

View File

@@ -1,4 +1,4 @@
# API 参考
# API 参考
本节提供 Realtime Agent Studio (RAS) 的完整 API 文档。
@@ -163,6 +163,8 @@ WebSocket API 使用双向消息通信:
## SDK
> 下面的 SDK 包名和类名沿用当前包标识;产品名称在文档中统一使用 Realtime Agent Studio(RAS)。
### JavaScript SDK
```bash
@@ -230,3 +232,4 @@ async with client.connect(assistant.id) as conv:
- [WebSocket 协议](websocket.md) - 实时对话协议详解
- [错误码](errors.md) - 错误处理参考
- [快速开始](../quickstart/index.md) - 快速创建助手

View File

@@ -1,218 +1,8 @@
# 配置选项
# 配置选项(旧入口)
助手配置界面包含多个标签页,每个标签页负责不同方面的配置。
本页保留旧链接,用于承接历史导航或外部引用。助手配置的正式文档已经迁移到:
## 全局设置
- [配置选项](../concepts/assistants/configuration.md) - 助手配置界面与运行时配置层说明
- [助手概念](../concepts/assistants.md) - 先理解助手对象、会话与动态变量
全局设置定义助手的核心对话能力。
| 配置项 | 说明 | 建议值 |
|-------|------|--------|
| 助手名称 | 用于标识和管理 | 简洁明确 |
| 系统提示词 | 定义角色、任务和约束 | 详见[提示词指南](prompts.md) |
| 开场白 | 对话开始时的问候语 | 简短友好 |
| 温度参数 | 控制回复随机性 | 0.7(通用)/ 0.3(严谨) |
| 上下文长度 | 保留的历史消息数 | 10-20 |
### 高级选项
- **首轮模式** - 设置首次对话的触发方式
- **打断检测** - 用户打断时的处理策略
- **超时设置** - 无响应时的处理
## 语音配置
配置语音识别和语音合成参数。
### TTS 语音合成
| 配置 | 说明 |
|------|------|
| TTS 引擎 | 选择语音合成服务(阿里/火山/Minimax) |
| 音色 | 选择语音风格和性别 |
| 语速 | 语音播放速度(0.5-2.0) |
| 音量 | 语音输出音量(0-100) |
| 音调 | 语音音调高低(0.5-2.0) |
### ASR 语音识别
| 配置 | 说明 |
|------|------|
| ASR 引擎 | 选择语音识别服务 |
| 语言 | 识别语言(中文/英文/多语言) |
| 热词 | 提高特定词汇识别准确率 |
## 工具绑定
配置助手可调用的外部工具。
### 可用工具类型
| 工具 | 说明 |
|------|------|
| 搜索工具 | 网络搜索获取信息 |
| 天气查询 | 查询天气预报 |
| 计算器 | 数学计算 |
| 知识库检索 | RAG 知识检索 |
| 自定义工具 | HTTP 回调外部 API |
### 配置步骤
1. 在工具列表中勾选需要的工具
2. 配置工具参数(如有)
3. 测试工具调用是否正常
## 知识关联
关联 RAG 知识库,让助手能够回答专业领域问题。
### 配置参数
| 参数 | 说明 | 建议值 |
|------|------|--------|
| 知识库 | 选择要关联的知识库 | - |
| 相似度阈值 | 低于此分数不返回 | 0.7 |
| 返回数量 | 单次检索返回条数 | 3 |
| 检索策略 | 混合/向量/关键词 | 混合 |
### 多知识库
支持关联多个知识库,系统会自动合并检索结果。
## 外部链接
配置第三方服务集成和 Webhook 回调。
### Webhook 配置
| 字段 | 说明 |
|------|------|
| 回调 URL | 接收事件的 HTTP 端点 |
| 事件类型 | 订阅的事件(对话开始/结束/工具调用等) |
| 认证方式 | API Key / Bearer Token / 无 |
### 支持的事件
- `conversation.started` - 对话开始
- `conversation.ended` - 对话结束
- `tool.called` - 工具被调用
- `human.transfer` - 转人工
## 配置持久化与运行时覆盖
助手配置分为两层:
1. **数据库持久化配置(基线配置)**:通过助手管理 API 保存,后续会话默认读取这一层。
2. **会话级覆盖配置(runtime overrides)**:仅对当前 WebSocket 会话生效,不会写回数据库。
### 哪些配置会存到数据库
以下字段会持久化在 `assistants` / `assistant_opener_audio` 等表中(通过创建/更新助手写入):
| 类别 | 典型字段 |
|------|---------|
| 对话行为 | `name`、`prompt`、`opener`、`firstTurnMode`、`generatedOpenerEnabled` |
| 输出与打断 | `voiceOutputEnabled`、`voice`、`speed`、`botCannotBeInterrupted`、`interruptionSensitivity` |
| 工具与知识库 | `tools`、`knowledgeBaseId` |
| 模型与外部模式 | `configMode`、`apiUrl`、`apiKey`、`llmModelId`、`asrModelId`、`embeddingModelId`、`rerankModelId` |
| 开场音频 | `openerAudioEnabled` 及音频文件状态(`ready`、`durationMs` 等) |
> 引擎在连接时通过 `assistant_id` 从后端读取该助手的 `sessionStartMetadata` 作为默认运行配置。
### 哪些配置可以在会话中覆盖
客户端可在 `session.start.metadata.overrides` 中覆盖以下白名单字段(仅当前会话有效):
- `systemPrompt`
- `greeting`
- `firstTurnMode`
- `generatedOpenerEnabled`
- `output`
- `bargeIn`
- `knowledgeBaseId`
- `knowledge`
- `tools`
- `openerAudio`
以下字段不能由客户端覆盖:
- `services`(模型 provider / apiKey / baseUrl 等)
- `assistantId` / `appId` / `configVersionId`(及下划线变体)
- 包含密钥语义的字段(如 `apiKey`、`token`、`secret`、`password`、`authorization`)
### 覆盖示例(代码)
下面示例展示「数据库基线配置 + 会话 overrides」的最终效果。
```json
// 1) 数据库存储的基线配置(示意)
// GET /api/v1/assistants/asst_demo/config -> sessionStartMetadata
{
"systemPrompt": "你是电商客服助手,回答要简洁。",
"greeting": "你好,我是你的客服助手。",
"firstTurnMode": "bot_first",
"output": { "mode": "audio" },
"knowledgeBaseId": "kb_orders",
"tools": [
{ "type": "function", "function": { "name": "query_order" } }
]
}
```
```json
// 2) 客户端发起会话时的覆盖
{
"type": "session.start",
"metadata": {
"channel": "web",
"history": { "userId": 1001 },
"overrides": {
"greeting": "你好,我来帮你查订单进度。",
"output": { "mode": "text" },
"knowledgeBaseId": "kb_vip_orders",
"tools": [
{ "type": "function", "function": { "name": "query_vip_order" } }
]
}
}
}
```
```json
// 3) 引擎合并后的有效配置(示意)
{
"assistantId": "asst_demo",
"systemPrompt": "你是电商客服助手,回答要简洁。",
"greeting": "你好,我来帮你查订单进度。",
"firstTurnMode": "bot_first",
"output": { "mode": "text" },
"knowledgeBaseId": "kb_vip_orders",
"tools": [
{ "type": "function", "function": { "name": "query_vip_order" } }
],
"channel": "web",
"history": { "userId": 1001 }
}
```
合并规则可简化为:
```python
effective = {**db_session_start_metadata, **metadata.overrides}
```
当 `WS_EMIT_CONFIG_RESOLVED=true` 时,服务端会返回 `config.resolved`(公开、安全裁剪后的快照)用于前端调试当前生效配置。
## 配置导入导出
### 导出配置
1. 在助手详情页点击 **更多**
2. 选择 **导出配置**
3. 下载 JSON 格式的配置文件
### 导入配置
1. 点击 **新建助手**
2. 选择 **从配置导入**
3. 上传配置文件
如果你是从创建路径进入,也可以直接回到 [快速开始](../quickstart/index.md)。

View File

@@ -1,57 +1,10 @@
# 助手管理
# 助手管理(旧入口)
助手是 Realtime Agent Studio (RAS) 的核心模块,用于创建和配置智能对话机器人。每个助手都可以独立配置提示词、语音、知识库和工具。
本页保留旧链接,用于承接历史导航或外部引用。助手相关内容已经拆分到更明确的文档中:
## 概述
- [助手概念](../concepts/assistants.md) - 了解助手是什么、由哪些部分组成,以及会话如何运行
- [配置选项](../concepts/assistants/configuration.md) - 查看控制台和运行时配置项的分工
- [提示词指南](../concepts/assistants/prompts.md) - 编写高质量系统提示词
- [测试调试](../concepts/assistants/testing.md) - 验证助手行为并排查问题
![助手管理](../images/assistants.png)
## 助手能力
| 能力 | 说明 |
|------|------|
| **智能对话** | 基于 LLM 的自然语言理解和生成 |
| **语音交互** | 支持语音识别和语音合成 |
| **知识检索** | 关联知识库回答专业问题 |
| **工具调用** | 调用外部 API 执行操作 |
| **工作流** | 支持复杂的多轮对话流程 |
## 创建助手
### 步骤
1. 进入 **助手管理** 页面
2. 点击 **新建助手** 按钮
3. 填写基本信息
4. 配置各项参数
5. 保存并发布
### 基本信息
| 配置项 | 说明 |
|-------|------|
| 助手名称 | 唯一标识,用于区分不同助手 |
| 提示词 | 定义助手的角色和行为 |
| 温度参数 | 控制回复的随机性(0-1) |
## 调试助手
在助手详情页可进行实时调试:
- **文本对话测试** - 快速验证回复质量
- **语音输入测试** - 测试 ASR 识别效果
- **工具调用验证** - 确认工具正常执行
## 发布助手
配置完成后:
1. 点击 **保存** - 保存当前配置
2. 点击 **发布** - 发布到生产环境
3. 获取 API 调用地址 - 用于集成
## 下一步
- [配置选项](configuration.md) - 详细的配置标签页说明
- [提示词指南](prompts.md) - 如何编写高质量的系统提示词
- [测试调试](testing.md) - 助手测试与问题排查
如果你是第一次上手,建议直接从 [快速开始](../quickstart/index.md) 进入。

View File

@@ -1,184 +1,8 @@
# 提示词指南
# 提示词指南(旧入口)
系统提示词(System Prompt)是定义助手行为的核心配置。本指南介绍如何编写高质量的提示词。
本页保留旧链接,用于承接历史导航或外部引用。提示词的正式文档已经迁移到:
## 提示词结构
- [提示词指南](../concepts/assistants/prompts.md) - 设计角色、任务、限制与风格
- [助手概念](../concepts/assistants.md) - 理解提示词在助手体系中的位置
一个完整的系统提示词通常包含以下部分:
```
[角色定义]
[任务描述]
[行为约束]
[输出格式]
[示例(可选)]
```
## 编写原则
### 1. 明确角色
告诉助手它是谁:
```
你是一个专业的技术支持工程师,专门负责解答产品使用问题。
```
### 2. 定义任务
明确助手需要完成什么:
```
你的主要任务是:
1. 解答用户关于产品功能的问题
2. 提供使用指导和最佳实践
3. 帮助用户排查常见故障
```
### 3. 设置约束
限制不希望出现的行为:
```
请注意:
- 不要讨论与产品无关的话题
- 不要编造不存在的功能
- 如果不确定答案,请建议用户联系人工客服
```
### 4. 指定风格
定义回复的语气和风格:
```
回复风格要求:
- 使用友好、专业的语气
- 回答简洁明了,避免冗长
- 适当使用列表和步骤说明
```
## 提示词模板
### 客服助手
```
你是 [公司名称] 的智能客服助手。
## 你的职责
- 解答用户关于产品和服务的问题
- 处理常见的投诉和建议
- 引导用户完成操作流程
## 回复要求
- 保持友好和耐心
- 回答简洁,一般不超过 3 句话
- 如果问题复杂,建议转接人工客服
## 禁止行为
- 不要讨论竞争对手
- 不要承诺无法兑现的事项
- 不要透露内部信息
```
### 技术支持
```
你是一个技术支持工程师,专门帮助用户解决技术问题。
## 工作流程
1. 首先了解用户遇到的具体问题
2. 询问必要的环境信息(系统版本、错误信息等)
3. 提供分步骤的解决方案
4. 确认问题是否解决
## 回复格式
- 使用编号列表说明操作步骤
- 提供代码示例时使用代码块
- 复杂问题可以分多次回复
```
### 销售顾问
```
你是一个产品销售顾问,帮助用户了解产品并做出购买决策。
## 沟通策略
- 先了解用户需求,再推荐合适的产品
- 突出产品优势,但不贬低竞品
- 提供真实的价格和优惠信息
## 目标
- 帮助用户找到最适合的方案
- 解答购买相关的疑问
- 促进成交但不过度推销
```
## 动态变量
提示词支持动态变量,使用 `{{变量名}}` 语法:
```
你好 {{customer_name}},欢迎来到 {{company_name}}。
你当前的会员等级是 {{membership_tier}}。
```
在 `session.start` 时通过 `dynamicVariables` 传入:
```json
{
"type": "session.start",
"metadata": {
"dynamicVariables": {
"customer_name": "张三",
"company_name": "AI 公司",
"membership_tier": "黄金会员"
}
}
}
```
## 常见问题
### 回复太长
在提示词中明确限制:
```
回复长度要求:
- 一般问题:1-2 句话
- 复杂问题:不超过 5 句话
- 避免重复和冗余内容
```
### 答非所问
增加任务边界说明:
```
重要提示:
- 只回答与 [产品/服务] 相关的问题
- 对于无关问题,礼貌地拒绝并引导回正题
```
### 编造信息
强调诚实原则:
```
信息准确性要求:
- 只提供你确定的信息
- 不确定时说"我不太确定,建议您..."
- 绝对不要编造数据或功能
```
## 最佳实践
1. **迭代优化** - 根据实际对话效果持续调整
2. **测试覆盖** - 用各种场景测试提示词效果
3. **版本管理** - 保存历史版本,便于回退
4. **定期复盘** - 分析对话记录,发现改进点
## 下一步
- [测试调试](testing.md) - 验证提示词效果
- [知识库配置](../customization/knowledge-base.md) - 补充专业知识
如果你想先完成最小可用配置,请从 [快速开始](../quickstart/index.md) 继续。

View File

@@ -1,162 +1,8 @@
# 测试调试
# 测试调试(旧入口)
本指南介绍如何测试和调试 AI 助手,确保其行为符合预期。
本页保留旧链接,用于承接历史导航或外部引用。测试调试的正式文档已经迁移到:
## 测试面板
- [测试调试](../concepts/assistants/testing.md) - 验证助手行为、事件流和常见问题定位
- [故障排查](../resources/troubleshooting.md) - 进入更细的链路排查步骤
在助手详情页,点击 **测试** 按钮打开测试面板。
### 功能介绍
| 功能 | 说明 |
|------|------|
| 文本对话 | 直接输入文字进行测试 |
| 语音测试 | 使用麦克风进行语音对话 |
| 查看日志 | 实时查看系统日志 |
| 事件追踪 | 查看 WebSocket 事件流 |
## 测试用例设计
### 基础功能测试
| 测试项 | 输入 | 预期结果 |
|--------|------|---------|
| 问候响应 | "你好" | 友好的问候回复 |
| 功能介绍 | "你能做什么?" | 准确描述能力范围 |
| 开场白 | 连接后自动 | 播放配置的开场白 |
### 业务场景测试
根据助手定位设计测试用例:
```
场景:产品咨询助手
测试用例 1:常见问题
- 输入:"产品有哪些功能?"
- 预期:准确列出主要功能
测试用例 2:价格询问
- 输入:"多少钱?"
- 预期:提供价格信息或引导方式
测试用例 3:超出范围
- 输入:"帮我写一首诗"
- 预期:礼貌拒绝并引导回业务话题
```
### 边界测试
| 测试项 | 输入 | 预期结果 |
|--------|------|---------|
| 空输入 | "" | 提示用户输入内容 |
| 超长输入 | 1000+ 字符 | 正常处理或提示过长 |
| 特殊字符 | "<script>alert(1)</script>" | 安全处理,不执行 |
| 敏感内容 | 不当言论 | 拒绝回复并提示 |
## 日志分析
### 查看日志
在测试面板的 **日志** 标签页,可以看到:
- ASR 识别结果
- LLM 推理过程
- TTS 合成状态
- 工具调用记录
### 常见日志
```
[ASR] transcript.final: "你好,请问有什么可以帮你"
[LLM] request: messages=[...]
[LLM] response: "您好!我是..."
[TTS] synthesizing: "您好!我是..."
[TTS] audio.start
[TTS] audio.end
```
## 事件追踪
在 **事件** 标签页查看完整的 WebSocket 事件流:
```json
{"type": "session.started", "timestamp": 1704067200000}
{"type": "input.speech_started", "timestamp": 1704067201000}
{"type": "transcript.delta", "data": {"text": "你"}}
{"type": "transcript.delta", "data": {"text": "好"}}
{"type": "transcript.final", "data": {"text": "你好"}}
{"type": "assistant.response.delta", "data": {"text": "您"}}
{"type": "assistant.response.final", "data": {"text": "您好!..."}}
{"type": "output.audio.start"}
{"type": "output.audio.end"}
```
## 性能指标
关注以下性能指标:
| 指标 | 说明 | 建议值 |
|------|------|--------|
| TTFB | 首字节时间 | < 500ms |
| 识别延迟 | ASR 处理时间 | < 1s |
| 回复延迟 | LLM 推理时间 | < 2s |
| 合成延迟 | TTS 处理时间 | < 500ms |
## 常见问题排查
### 助手不响应
1. **检查连接状态**
- 确认 WebSocket 连接成功
- 查看是否收到 `session.started` 事件
2. **检查模型配置**
- 确认 LLM 模型 API Key 有效
- 测试模型连接是否正常
3. **查看错误日志**
- 打开浏览器开发者工具
- 检查 Console 和 Network 标签
### 回复质量差
1. **优化提示词**
- 增加更明确的指令
- 添加示例和约束
2. **调整温度参数**
- 降低 temperature 提高一致性
- 适当值通常在 0.3-0.7
3. **补充知识库**
- 上传相关文档
- 提高检索相关性
### 语音问题
1. **ASR 识别不准**
- 检查麦克风权限
- 尝试更换 ASR 引擎
- 添加热词提高识别率
2. **TTS 不播放**
- 检查浏览器自动播放限制
- 确认 TTS 配置正确
## 自动化测试
使用自动化测试功能进行批量测试:
1. 进入 **自动化测试** 页面
2. 创建测试任务
3. 配置测试用例
4. 运行测试并查看报告
详见 [自动化测试](../analysis/autotest.md)。
## 下一步
- [自动化测试](../analysis/autotest.md) - 批量测试
- [历史记录](../analysis/history.md) - 查看对话记录
- [效果评估](../analysis/evaluation.md) - 评估对话质量
如果你还没创建助手,请先完成 [快速开始](../quickstart/index.md)。

View File

@@ -1,68 +1,7 @@
# 工作流配置选项(TODO 版本)
# 工作流配置(旧入口)
本文档是工作流配置页的第一版草稿,后续会根据实际能力继续细化。
本页保留旧链接,用于承接早期草稿和历史引用。工作流的正式文档已收敛到:
## 配置目标
- 将多步骤对话拆分为可编排节点
- 为不同分支定义独立提示词和工具权限
- 在会话中按条件切换节点并透传上下文
## 基础配置项(建议)
| 配置项 | 说明 | 建议值 |
|---|---|---|
| 工作流名称 | 用于区分业务流程 | 简洁、业务语义明确 |
| 入口节点 | 用户进入后的首个节点 | 固定单入口 |
| 全局提示词 | 对所有节点生效的共性约束 | 保持简短,避免与节点提示词冲突 |
| 节点提示词 | 当前节点的任务说明 | 单一职责,明确输入/输出 |
| 节点工具白名单 | 当前节点可调用工具集合 | 最小权限原则 |
| 节点超时 | 节点等待超时处理 | 3-10 秒 |
| 失败回退节点 | 异常时兜底节点 | 建议统一到人工或澄清节点 |
## 节点建议类型
- 意图识别节点:判断用户诉求并路由
- 信息收集节点:收集订单号、手机号等关键信息
- 处理节点:执行查询、计算、调用工具
- 回复节点:组织最终答复
- 结束节点:输出结束语并关闭会话
## 配置示例
```yaml
workflow:
name: "订单咨询流程"
entry: "intent_router"
global_prompt: "优先给出可执行步骤,必要时先澄清信息。"
nodes:
- id: "intent_router"
type: "router"
prompt: "识别用户意图:查订单、退款、投诉"
next:
- when: "intent == query_order"
to: "collect_order_id"
- when: "intent == refund"
to: "refund_policy"
- id: "collect_order_id"
type: "collect"
prompt: "请用户提供订单号"
tools: ["query_order"]
fallback: "human_handoff"
- id: "human_handoff"
type: "end"
prompt: "转人工处理"
```
## 已知限制(当前)
- 不支持在文档中完整定义所有表达式语法
- 不同执行引擎的节点字段可能存在差异
- 可视化编排与 YAML 字段暂未完全一一对应
## 后续计划
- 补充节点字段的完整 Schema
- 补充路由条件表达式规范
- 增加“调试与回放”章节
- [工作流](../customization/workflows.md) - 了解工作流的定位、节点结构、设计建议和当前边界
如果你正在配置助手中的流程能力,请优先阅读上述页面,再结合 [工具](../customization/tools.md) 与 [助手概念](../concepts/assistants.md) 一起使用。

View File

@@ -1,4 +1,4 @@
# 更新日志
# 更新日志
本文档记录 Realtime Agent Studio 的所有重要变更。
@@ -29,7 +29,7 @@
- **OpenAI 兼容接口** - 支持 OpenAI Compatible 的 ASR/TTS 服务
- **DashScope TTS** - 阿里云语音合成服务适配
#### 智能体配置
#### 助手配置
- **系统提示词** - 支持角色定义和动态变量 `{{variable}}`
- **模型管理** - LLM/ASR/TTS 模型统一管理界面

View File

@@ -1,253 +1,147 @@
# 助手概念详解
深入了解助手(Assistant)的设计理念和配置细节。
助手(Assistant)是 Realtime Agent Studio(RAS)中最核心的配置单元,也是控制台和 API 对外暴露能力的基本对象。
---
## 什么是助手
## 什么是助手
**助手**是 RAS 中的核心实体,代表一个具有特定角色、能力和行为的 AI 对话智能体。每个助手都是独立配置的,可以服务于不同的业务场景。
一个助手代表一个可接入、可测试、可发布的实时 AI 入口。它回答三个问题:
### 助手的组成
- **它是谁**:角色、语气、目标、限制、开场方式、静默时候的行动(比如静默时候的询问 Ask-on-Idle)
- **它能做什么**:语言模型能力、语音模型能力(ASR、TTS)、用户打断灵敏度(Barge-in)、语句端点设置(End-of-Utterance)、知识库、记忆、工具(Webhook、客户端工具、系统工具、MCP)、输出模式
- **它在一次会话中如何运行**:通过 `assistant_id` 载入配置,并在运行时接收动态变量、对话过程中的上下文更新
```mermaid
flowchart TB
subgraph Assistant["助手"]
Identity[身份定义]
Models[模型配置]
Capabilities[能力扩展]
Behavior[行为控制]
end
如果把引擎理解为“运行时”,那么助手就是“运行时要执行的那份定义”。
subgraph Identity
Name[名称]
Prompt[系统提示词]
Language[语言]
end
## 助手由哪些部分组成
subgraph Models
LLM[LLM 模型]
ASR[ASR 模型]
TTS[TTS 声音]
end
| 层次 | 负责什么 | 典型内容 |
|------|----------|----------|
| **身份层** | 定义助手角色和交互风格 | 系统提示词、限制、开场白、静默处理 |
| **模型层** | 决定理解与生成能力 | LLM、ASR、TTS、引擎类型、用户打断、语句端点 |
| **能力层** | 扩展知识和执行能力 | 知识库、工具、记忆 |
| **会话层** | 决定运行时上下文如何注入 | `assistant_id`、动态变量 |
subgraph Capabilities
Tools[工具调用]
KB[知识库]
end
## 身份层
subgraph Behavior
Greeting[开场白]
Interruption[打断设置]
Output[输出模式]
end
```
---
## 身份定义
助手首先是一个“被约束的角色”,而不是一段孤立的模型调用。
### 系统提示词
系统提示词是助手最重要的配置,它定义了:
系统提示词定义助手的角色、任务、边界和风格,是所有能力组合的基础。
| 要素 | 说明 | 示例 |
| 要素 | 作用 | 示例 |
|------|------|------|
| **角色** | 助手扮演什么身份 | "你是一名专业的医疗咨询顾问" |
| **能力** | 助手能做什么 | "你可以回答健康问题,但不能开具处方" |
| **限制** | 助手不能做什么 | "不要讨论政治话题" |
| **风格** | 回复的语气和格式 | "保持友好专业,回答简洁" |
| **角色** | 告诉模型“自己是谁” | 客服助手、销售顾问、培训教练 |
| **任务** | 指定要完成的结果 | 解答咨询、收集信息、调用工具处理业务 |
| **限制** | 明确哪些事不能做 | 不承诺超权限优惠、不输出未经验证的结论 |
| **风格** | 约束回答节奏和措辞 | 简洁、口语化、每次 2-3 句 |
### 提示词模板
### 开场白
```markdown
## 角色
你是{{company}}的智能客服助手"小智"。
一个助手还要定义会话应该如何开始,以及用户静默时候如何处理,包括:
## 任务
- 回答用户关于产品和服务的问题
- 协助处理订单查询和售后问题
- 收集用户反馈
- **首轮模式**:助手先说、用户先说或者机器先说
- **开场白**:使用固定开场白或者 AI 生成开场白
## 限制
- 不讨论竞争对手产品
- 不承诺超出权限的优惠
- 遇到复杂问题引导用户联系人工客服
### 静默处理
## 风格
- 语气友好亲切
- 回答简洁明了,每次 2-3 句话
- 适当使用语气词使对话更自然
```
用户静默时候是否询问用户是否在线
---
## 模型层
## 模型配置
模型决定助手的基础理解、推理和表达能力,但不是助手定义的全部。
### LLM 模型
- **LLM** 决定对话推理与文本生成能力
- **ASR** 决定语音输入如何被实时转写
- **TTS** 决定文本回复如何转成可播放语音
- **引擎类型** 决定运行链路是分段可控还是端到端低延迟
- **VAD** 声音活动模型,判断用户是否在说话
- **EOU** 语句端点模型,判断用户是否完成一段语句等待回复
- **Barge In** 由于用户声音活动或者手动请求,是否打断助手当前的回复
大语言模型是助手的"大脑",负责理解用户意图和生成回复。
## 能力层
| 参数 | 说明 | 建议值 |
|------|------|--------|
| **温度** | 回复随机性,越高越发散 | 0.7 (对话) / 0.3 (问答) |
| **最大 Token** | 单次回复长度上限 | 256-512 |
| **上下文长度** | 记忆的对话轮数 | 10-20 轮 |
### 知识库
### ASR 模型
语音识别模型将用户语音转为文字。
| 配置 | 说明 |
|------|------|
| **语言** | 识别语言,如中文、英文 |
| **热词** | 提高特定词汇识别率 |
| **标点** | 是否自动添加标点 |
### TTS 声音
语音合成将助手回复转为语音输出。
| 配置 | 说明 |
|------|------|
| **音色** | 选择声音角色 |
| **语速** | 说话速度0.5-2.0 |
| **音调** | 声音高低 |
---
## 能力扩展
### 工具调用
通过工具让助手能够执行外部操作:
知识库用于补充私有领域知识,让助手回答超出基础模型常识之外的问题。
```mermaid
flowchart LR
User[用户] -->|"查询订单"| Assistant[助手]
Assistant -->|调用工具| API[订单 API]
API -->|返回数据| Assistant
Assistant -->|回复| User
```
**工具定义示例:**
```json
{
"name": "get_order_status",
"description": "查询用户订单状态",
"parameters": {
"type": "object",
"properties": {
"order_id": {
"type": "string",
"description": "订单编号"
}
},
"required": ["order_id"]
}
}
```
### 知识库关联
让助手基于私有文档回答问题:
```mermaid
flowchart LR
Question[用户问题] --> Search[知识检索]
Search --> KB[(知识库)]
KB --> Context[相关内容]
Question[用户问题] --> Retrieval[检索]
Retrieval --> KB[(知识库)]
KB --> Context[相关片段]
Context --> LLM[LLM]
LLM --> Answer[回答]
```
---
知识库适合承载政策、产品资料、流程说明、FAQ 和内部文档,而不是把所有业务知识堆进系统提示词。
## 行为控制
### 工具
### 开场白设置
| 模式 | 说明 |
|------|------|
| **助手先说** | 连接后助手主动问候 |
| **用户先说** | 等待用户开口 |
| **静默** | 不自动开场 |
### 打断设置
| 选项 | 说明 |
|------|------|
| **允许打断** | 用户可随时插话 |
| **禁止打断** | 助手说完才能输入 |
| **灵敏度** | 打断触发的敏感程度 |
### 输出模式
| 模式 | 说明 |
|------|------|
| **语音** | TTS 语音输出 |
| **文本** | 纯文本输出 |
| **混合** | 同时输出语音和文本 |
---
## 助手版本管理
### 草稿与发布
工具让助手从“会说”变成“能做事”。
```mermaid
gitGraph
commit id: "创建助手"
commit id: "配置提示词"
commit id: "添加工具"
branch published
checkout published
commit id: "发布 v1"
checkout main
commit id: "修改提示词"
commit id: "调整参数"
checkout published
merge main id: "发布 v2"
flowchart LR
User[用户] --> Assistant[助手]
Assistant --> Tool[工具 / 外部系统]
Tool --> Assistant
Assistant --> User
```
- **草稿**: 可随时修改,仅供测试
- **发布**: 正式上线,用于生产环境
适合用工具处理的任务包括:订单查询、预约、外部搜索、写入业务系统、调用客户端能力等。
### 配置导入导出
## 会话层
支持以 JSON 格式导入导出助手配置,便于:
### `assistant_id` 的作用
- 备份和恢复
- 跨环境迁移
- 团队共享模板
在接入层面,客户端通过 `assistant_id` 指定要加载哪一个助手。引擎据此读取默认配置,并把同一份助手定义应用到当前会话。
---
### 会话生命周期
## 最佳实践
```mermaid
stateDiagram-v2
[*] --> Connecting: WebSocket 连接
Connecting --> Started: session.started
Started --> Active: config.resolved / 开始对话
Active --> Active: 多轮交互
Active --> Stopped: session.stop 或连接关闭
Stopped --> [*]
```
### 1. 提示词工程
一次会话通常会沉淀以下信息:
- **明确角色**: 清晰定义助手身份
- **设定边界**: 明确能做什么、不能做什么
- **控制长度**: 语音场景下回复要简短
- 用户与助手消息时间线
- 音频流、转写结果和模型输出
- 工具调用记录与中间事件
- 自定义 metadata、渠道和业务上下文
### 2. 模型选择
- **平衡成本与效果**: 不一定需要最强模型
- **测试不同供应商**: 找到最适合场景的组合
- **考虑延迟**: 语音交互对延迟敏感
### 动态变量与会话级覆盖
### 3. 工具设计
助手的默认配置不需要为每个用户都重新复制一份。RAS 提供两种常见的运行时注入方式:
- **单一职责**: 每个工具做一件事
- **清晰描述**: 让 LLM 正确理解何时调用
- **错误处理**: 工具失败时优雅降级
- **动态变量**:在提示词中使用 `{{variable}}` 占位,并在会话开始时传入具体值
- **会话级覆盖**:仅对当前会话覆盖部分运行时参数,不回写助手基线配置
---
```json
{
"type": "session.start",
"metadata": {
"dynamicVariables": {
"company_name": "ABC 公司",
"customer_name": "张三",
"tier": "VIP"
}
}
}
```
这种设计让你既能复用标准助手,又能在每次接入时注入渠道、用户、订单或上下文信息。
## 相关文档
- [助手配置](../assistants/configuration.md) - 配置界面详解
- [提示词指南](../assistants/prompts.md) - 编写高质量提示词
- [工具集成](../customization/tools.md) - 工具配置详情
- [配置选项](assistants/configuration.md) - 查看助手在控制台和运行时有哪些配置层
- [提示词指南](assistants/prompts.md) - 设计角色、任务、限制和语气
- [测试调试](assistants/testing.md) - 验证助手质量并定位问题

View File

@@ -0,0 +1,218 @@
# 配置选项
助手配置界面包含多个标签页,每个标签页负责不同方面的配置。
## 全局设置
全局设置定义助手的核心对话能力。
| 配置项 | 说明 | 建议值 |
|-------|------|--------|
| 助手名称 | 用于标识和管理 | 简洁明确 |
| 系统提示词 | 定义角色、任务和约束 | 详见[提示词指南](prompts.md) |
| 开场白 | 对话开始时的问候语 | 简短友好 |
| 温度参数 | 控制回复随机性 | 0.7(通用)/ 0.3(严谨) |
| 上下文长度 | 保留的历史消息数 | 10-20 |
### 高级选项
- **首轮模式** - 设置首次对话的触发方式
- **打断检测** - 用户打断时的处理策略
- **超时设置** - 无响应时的处理
## 语音配置
配置语音识别和语音合成参数。
### TTS 语音合成
| 配置 | 说明 |
|------|------|
| TTS 引擎 | 选择语音合成服务(阿里/火山/Minimax) |
| 音色 | 选择语音风格和性别 |
| 语速 | 语音播放速度(0.5-2.0) |
| 音量 | 语音输出音量(0-100) |
| 音调 | 语音音调高低(0.5-2.0) |
### ASR 语音识别
| 配置 | 说明 |
|------|------|
| ASR 引擎 | 选择语音识别服务 |
| 语言 | 识别语言(中文/英文/多语言) |
| 热词 | 提高特定词汇识别准确率 |
## 工具绑定
配置助手可调用的外部工具。
### 可用工具类型
| 工具 | 说明 |
|------|------|
| 搜索工具 | 网络搜索获取信息 |
| 天气查询 | 查询天气预报 |
| 计算器 | 数学计算 |
| 知识库检索 | RAG 知识检索 |
| 自定义工具 | HTTP 回调外部 API |
### 配置步骤
1. 在工具列表中勾选需要的工具
2. 配置工具参数(如有)
3. 测试工具调用是否正常
## 知识关联
关联 RAG 知识库,让助手能够回答专业领域问题。
### 配置参数
| 参数 | 说明 | 建议值 |
|------|------|--------|
| 知识库 | 选择要关联的知识库 | - |
| 相似度阈值 | 低于此分数不返回 | 0.7 |
| 返回数量 | 单次检索返回条数 | 3 |
| 检索策略 | 混合/向量/关键词 | 混合 |
### 多知识库
支持关联多个知识库,系统会自动合并检索结果。
## 外部链接
配置第三方服务集成和 Webhook 回调。
### Webhook 配置
| 字段 | 说明 |
|------|------|
| 回调 URL | 接收事件的 HTTP 端点 |
| 事件类型 | 订阅的事件(对话开始/结束/工具调用等) |
| 认证方式 | API Key / Bearer Token / 无 |
### 支持的事件
- `conversation.started` - 对话开始
- `conversation.ended` - 对话结束
- `tool.called` - 工具被调用
- `human.transfer` - 转人工
## 配置持久化与运行时覆盖
助手配置分为两层:
1. **数据库持久化配置(基线配置)**:通过助手管理 API 保存,后续会话默认读取这一层。
2. **会话级覆盖配置(runtime overrides)**:仅对当前 WebSocket 会话生效,不会写回数据库。
### 哪些配置会存到数据库
以下字段会持久化在 `assistants` / `assistant_opener_audio` 等表中(通过创建/更新助手写入):
| 类别 | 典型字段 |
|------|---------|
| 对话行为 | `name`、`prompt`、`opener`、`firstTurnMode`、`generatedOpenerEnabled` |
| 输出与打断 | `voiceOutputEnabled`、`voice`、`speed`、`botCannotBeInterrupted`、`interruptionSensitivity` |
| 工具与知识库 | `tools`、`knowledgeBaseId` |
| 模型与外部模式 | `configMode`、`apiUrl`、`apiKey`、`llmModelId`、`asrModelId`、`embeddingModelId`、`rerankModelId` |
| 开场音频 | `openerAudioEnabled` 及音频文件状态(`ready`、`durationMs` 等) |
> 引擎在连接时通过 `assistant_id` 从后端读取该助手的 `sessionStartMetadata` 作为默认运行配置。
### 哪些配置可以在会话中覆盖
客户端可在 `session.start.metadata.overrides` 中覆盖以下白名单字段(仅当前会话有效):
- `systemPrompt`
- `greeting`
- `firstTurnMode`
- `generatedOpenerEnabled`
- `output`
- `bargeIn`
- `knowledgeBaseId`
- `knowledge`
- `tools`
- `openerAudio`
以下字段不能由客户端覆盖:
- `services`(模型 provider / apiKey / baseUrl 等)
- `assistantId` / `appId` / `configVersionId`(及下划线变体)
- 包含密钥语义的字段(如 `apiKey`、`token`、`secret`、`password`、`authorization`)
### 覆盖示例(代码)
下面示例展示「数据库基线配置 + 会话 overrides」的最终效果。
```json
// 1) 数据库存储的基线配置(示意)
// GET /api/v1/assistants/asst_demo/config -> sessionStartMetadata
{
"systemPrompt": "你是电商客服助手,回答要简洁。",
"greeting": "你好,我是你的客服助手。",
"firstTurnMode": "bot_first",
"output": { "mode": "audio" },
"knowledgeBaseId": "kb_orders",
"tools": [
{ "type": "function", "function": { "name": "query_order" } }
]
}
```
```json
// 2) 客户端发起会话时的覆盖
{
"type": "session.start",
"metadata": {
"channel": "web",
"history": { "userId": 1001 },
"overrides": {
"greeting": "你好,我来帮你查订单进度。",
"output": { "mode": "text" },
"knowledgeBaseId": "kb_vip_orders",
"tools": [
{ "type": "function", "function": { "name": "query_vip_order" } }
]
}
}
}
```
```json
// 3) 引擎合并后的有效配置(示意)
{
"assistantId": "asst_demo",
"systemPrompt": "你是电商客服助手,回答要简洁。",
"greeting": "你好,我来帮你查订单进度。",
"firstTurnMode": "bot_first",
"output": { "mode": "text" },
"knowledgeBaseId": "kb_vip_orders",
"tools": [
{ "type": "function", "function": { "name": "query_vip_order" } }
],
"channel": "web",
"history": { "userId": 1001 }
}
```
合并规则可简化为:
```python
effective = {**db_session_start_metadata, **metadata.overrides}
```
当 `WS_EMIT_CONFIG_RESOLVED=true` 时,服务端会返回 `config.resolved`(公开、安全裁剪后的快照)用于前端调试当前生效配置。
## 配置导入导出
### 导出配置
1. 在助手详情页点击 **更多**
2. 选择 **导出配置**
3. 下载 JSON 格式的配置文件
### 导入配置
1. 点击 **新建助手**
2. 选择 **从配置导入**
3. 上传配置文件

View File

@@ -0,0 +1,184 @@
# 提示词指南
系统提示词(System Prompt)是定义助手行为的核心配置。本指南介绍如何编写高质量的提示词。
## 提示词结构
一个完整的系统提示词通常包含以下部分:
```
[角色定义]
[任务描述]
[行为约束]
[输出格式]
[示例(可选)]
```
## 编写原则
### 1. 明确角色
告诉助手它是谁:
```
你是一个专业的技术支持工程师,专门负责解答产品使用问题。
```
### 2. 定义任务
明确助手需要完成什么:
```
你的主要任务是:
1. 解答用户关于产品功能的问题
2. 提供使用指导和最佳实践
3. 帮助用户排查常见故障
```
### 3. 设置约束
限制不希望出现的行为:
```
请注意:
- 不要讨论与产品无关的话题
- 不要编造不存在的功能
- 如果不确定答案,请建议用户联系人工客服
```
### 4. 指定风格
定义回复的语气和风格:
```
回复风格要求:
- 使用友好、专业的语气
- 回答简洁明了,避免冗长
- 适当使用列表和步骤说明
```
## 提示词模板
### 客服助手
```
你是 [公司名称] 的智能客服助手。
## 你的职责
- 解答用户关于产品和服务的问题
- 处理常见的投诉和建议
- 引导用户完成操作流程
## 回复要求
- 保持友好和耐心
- 回答简洁,一般不超过 3 句话
- 如果问题复杂,建议转接人工客服
## 禁止行为
- 不要讨论竞争对手
- 不要承诺无法兑现的事项
- 不要透露内部信息
```
### 技术支持
```
你是一个技术支持工程师,专门帮助用户解决技术问题。
## 工作流程
1. 首先了解用户遇到的具体问题
2. 询问必要的环境信息(系统版本、错误信息等)
3. 提供分步骤的解决方案
4. 确认问题是否解决
## 回复格式
- 使用编号列表说明操作步骤
- 提供代码示例时使用代码块
- 复杂问题可以分多次回复
```
### 销售顾问
```
你是一个产品销售顾问,帮助用户了解产品并做出购买决策。
## 沟通策略
- 先了解用户需求,再推荐合适的产品
- 突出产品优势,但不贬低竞品
- 提供真实的价格和优惠信息
## 目标
- 帮助用户找到最适合的方案
- 解答购买相关的疑问
- 促进成交但不过度推销
```
## 动态变量
提示词支持动态变量,使用 `{{变量名}}` 语法:
```
你好 {{customer_name}},欢迎来到 {{company_name}}。
你当前的会员等级是 {{membership_tier}}。
```
在 `session.start` 时通过 `dynamicVariables` 传入:
```json
{
"type": "session.start",
"metadata": {
"dynamicVariables": {
"customer_name": "张三",
"company_name": "AI 公司",
"membership_tier": "黄金会员"
}
}
}
```
## 常见问题
### 回复太长
在提示词中明确限制:
```
回复长度要求:
- 一般问题:1-2 句话
- 复杂问题:不超过 5 句话
- 避免重复和冗余内容
```
### 答非所问
增加任务边界说明:
```
重要提示:
- 只回答与 [产品/服务] 相关的问题
- 对于无关问题,礼貌地拒绝并引导回正题
```
### 编造信息
强调诚实原则:
```
信息准确性要求:
- 只提供你确定的信息
- 不确定时说"我不太确定,建议您..."
- 绝对不要编造数据或功能
```
## 最佳实践
1. **迭代优化** - 根据实际对话效果持续调整
2. **测试覆盖** - 用各种场景测试提示词效果
3. **版本管理** - 保存历史版本,便于回退
4. **定期复盘** - 分析对话记录,发现改进点
## 下一步
- [测试调试](testing.md) - 验证提示词效果
- [知识库配置](../../customization/knowledge-base.md) - 补充专业知识

View File

@@ -0,0 +1,162 @@
# 测试调试
本指南介绍如何测试和调试 AI 助手,确保其行为符合预期。
## 测试面板
在助手详情页,点击 **测试** 按钮打开测试面板。
### 功能介绍
| 功能 | 说明 |
|------|------|
| 文本对话 | 直接输入文字进行测试 |
| 语音测试 | 使用麦克风进行语音对话 |
| 查看日志 | 实时查看系统日志 |
| 事件追踪 | 查看 WebSocket 事件流 |
## 测试用例设计
### 基础功能测试
| 测试项 | 输入 | 预期结果 |
|--------|------|---------|
| 问候响应 | "你好" | 友好的问候回复 |
| 功能介绍 | "你能做什么?" | 准确描述能力范围 |
| 开场白 | 连接后自动 | 播放配置的开场白 |
### 业务场景测试
根据助手定位设计测试用例:
```
场景:产品咨询助手
测试用例 1:常见问题
- 输入:"产品有哪些功能?"
- 预期:准确列出主要功能
测试用例 2:价格询问
- 输入:"多少钱?"
- 预期:提供价格信息或引导方式
测试用例 3:超出范围
- 输入:"帮我写一首诗"
- 预期:礼貌拒绝并引导回业务话题
```
### 边界测试
| 测试项 | 输入 | 预期结果 |
|--------|------|---------|
| 空输入 | "" | 提示用户输入内容 |
| 超长输入 | 1000+ 字符 | 正常处理或提示过长 |
| 特殊字符 | "<script>alert(1)</script>" | 安全处理,不执行 |
| 敏感内容 | 不当言论 | 拒绝回复并提示 |
## 日志分析
### 查看日志
在测试面板的 **日志** 标签页,可以看到:
- ASR 识别结果
- LLM 推理过程
- TTS 合成状态
- 工具调用记录
### 常见日志
```
[ASR] transcript.final: "你好,请问有什么可以帮你"
[LLM] request: messages=[...]
[LLM] response: "您好!我是..."
[TTS] synthesizing: "您好!我是..."
[TTS] audio.start
[TTS] audio.end
```
## 事件追踪
在 **事件** 标签页查看完整的 WebSocket 事件流:
```json
{"type": "session.started", "timestamp": 1704067200000}
{"type": "input.speech_started", "timestamp": 1704067201000}
{"type": "transcript.delta", "data": {"text": "你"}}
{"type": "transcript.delta", "data": {"text": "好"}}
{"type": "transcript.final", "data": {"text": "你好"}}
{"type": "assistant.response.delta", "data": {"text": "您"}}
{"type": "assistant.response.final", "data": {"text": "您好!..."}}
{"type": "output.audio.start"}
{"type": "output.audio.end"}
```
## 性能指标
关注以下性能指标:
| 指标 | 说明 | 建议值 |
|------|------|--------|
| TTFB | 首字节时间 | < 500ms |
| 识别延迟 | ASR 处理时间 | < 1s |
| 回复延迟 | LLM 推理时间 | < 2s |
| 合成延迟 | TTS 处理时间 | < 500ms |
## 常见问题排查
### 助手不响应
1. **检查连接状态**
- 确认 WebSocket 连接成功
- 查看是否收到 `session.started` 事件
2. **检查模型配置**
- 确认 LLM 模型 API Key 有效
- 测试模型连接是否正常
3. **查看错误日志**
- 打开浏览器开发者工具
- 检查 Console 和 Network 标签
### 回复质量差
1. **优化提示词**
- 增加更明确的指令
- 添加示例和约束
2. **调整温度参数**
- 降低 temperature 提高一致性
- 适当值通常在 0.3-0.7
3. **补充知识库**
- 上传相关文档
- 提高检索相关性
### 语音问题
1. **ASR 识别不准**
- 检查麦克风权限
- 尝试更换 ASR 引擎
- 添加热词提高识别率
2. **TTS 不播放**
- 检查浏览器自动播放限制
- 确认 TTS 配置正确
## 自动化测试
使用自动化测试功能进行批量测试:
1. 进入 **自动化测试** 页面
2. 创建测试任务
3. 配置测试用例
4. 运行测试并查看报告
详见 [自动化测试](../../analysis/autotest.md)。
## 下一步
- [自动化测试](../../analysis/autotest.md) - 批量测试
- [历史记录](../../analysis/history.md) - 查看对话记录
- [效果评估](../../analysis/evaluation.md) - 评估对话质量

View File

@@ -1,349 +1,107 @@
# 引擎架构详解
# 引擎架构
深入了解 RAS 的两种引擎架构:管线式引擎和多模态引擎。
RAS 提供两类实时运行时:**Pipeline 引擎** 和 **Realtime 引擎**。本页只回答一个问题:你的助手应该跑在哪种引擎上。
---
## 引擎概述
## 先记住这条判断标准
引擎是 RAS 的核心,负责处理实时语音交互。根据不同需求,可以选择两种架构:
- 如果你优先考虑 **可控性、可替换性、成本管理、工具 / 知识 / 流程编排**,优先选 **Pipeline 引擎**
- 如果你优先考虑 **超低延迟、更自然的端到端语音体验**,优先选 **Realtime 引擎**
| 架构 | 特点 | 适用场景 |
|------|------|---------|
| **管线式** | 灵活、可定制、成本可控 | 大多数场景 |
| **多模态** | 低延迟、自然、简单 | 高端体验场景 |
## 两类引擎的区别
---
| 维度 | Pipeline 引擎 | Realtime 引擎 |
|------|---------------|---------------|
| **交互路径** | VAD → ASR → TD → LLM → TTS | 端到端实时模型 |
| **可控性** | 高,每个环节可替换 | 中,更多依赖模型供应商 |
| **延迟** | 中等,通常由多环节累加 | 低,链路更短 |
| **能力编排** | 更适合接入工具、知识库、工作流 | 也可接工具,但流程可控性较弱 |
| **成本结构** | 可按环节优化 | 往往更依赖单一供应商定价 |
| **适合场景** | 企业客服、流程型助手、电话场景、知识问答 | 高拟真语音助手、多模态入口、高自然度体验 |
## 管线式引擎 (Pipeline)
## Pipeline 引擎是什么
### 架构设计
管线式引擎包含 **声音活动检测(VAD)**、**语音识别(ASR)**、**回合检测(TD)**、**大语言模型(LLM)**、**语音合成(TTS)**,各环节可对接**外部服务**(OpenAI、SiliconFlow、DashScope、本地模型)。LLM 可连接**工具**(Webhook、客户端工具、内建工具)。
Pipeline 引擎把实时语音拆成多个明确环节:
```mermaid
flowchart LR
subgraph Input["输入处理"]
Audio[用户音频] --> VAD[声音活动检测 VAD]
VAD --> ASR[语音识别 ASR]
ASR --> Text[转写文本]
Text --> TD[回合检测 TD]
end
subgraph Process["语义处理"]
TD --> LLM[大语言模型 LLM]
LLM --> Response[回复文本]
LLM --> Tools[工具]
end
subgraph Output["输出生成"]
Response --> TTS[语音合成 TTS]
TTS --> OutputAudio[助手音频]
end
VAD[VAD] --> ASR[ASR]
ASR --> TD[回合检测]
TD --> LLM[LLM]
LLM --> TTS[TTS]
```
### 数据流详解
这样做的好处是:
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant ASR as ASR 服务
participant LLM as LLM 服务
participant TTS as TTS 服务
- 你可以分别选择 ASR、LLM、TTS 的供应商
- 你可以单独优化某一个环节,而不是整体替换
- 工具、知识库和工作流更容易插入到链路中
U->>E: 音频帧 (PCM 16kHz)
Note over E: VAD 检测语音活动
E->>E: 累积音频缓冲
Note over E: 回合检测 (TD) 确定可送 LLM 的输入
E->>ASR: 发送音频
ASR-->>E: 转写文本 (流式)
E-->>U: transcript.delta
E-->>U: transcript.final
E->>LLM: 发送对话历史 + 用户输入
LLM-->>E: 回复文本 (流式)
E-->>U: assistant.response.delta
loop 流式合成
E->>TTS: 文本片段
TTS-->>E: 音频片段
E-->>U: 音频帧
end
E-->>U: assistant.response.final
```
代价是:
### 延迟分析
- 延迟会累加
- 系统集成更复杂
- 你需要同时管理多类外部依赖
管线式引擎的延迟由各环节累加:
## Realtime 引擎是什么
| 环节 | 典型延迟 | 优化方向 |
|------|---------|---------|
| VAD/EOU | 200-500ms | 调整灵敏度 |
| ASR | 100-300ms | 选择快速模型 |
| LLM TTFT | 200-500ms | 选择低延迟模型 |
| TTS | 100-200ms | 流式合成 |
| **总计** | **600-1500ms** | - |
### 流式优化
为降低感知延迟,采用流式处理:
```mermaid
gantt
title 非流式 vs 流式处理
dateFormat X
axisFormat %s
section 非流式
ASR完成 :a1, 0, 300ms
LLM完成 :a2, after a1, 800ms
TTS完成 :a3, after a2, 500ms
播放 :a4, after a3, 500ms
section 流式
ASR :b1, 0, 300ms
LLM开始 :b2, after b1, 200ms
TTS开始 :b3, after b2, 100ms
边生成边播放 :b4, after b3, 600ms
```
---
## 实时交互引擎与多模态
### 实时交互引擎连接
实时交互引擎可连接**实时交互引擎**后端,包括:
| 后端 | 说明 |
|------|------|
| **OpenAI Realtime** | OpenAI 实时语音模型 |
| **Gemini Live** | Google 实时多模态 |
| **Doubao 实时交互引擎** | 豆包实时交互 |
实时交互引擎与管线式引擎中的 LLM 一样,均可连接**工具**(Webhook、客户端工具、内建工具)。
### 多模态引擎架构
多模态引擎使用端到端模型,直接处理音频输入输出:
Realtime 引擎直接连接端到端实时模型,让模型同时处理输入、理解、生成与打断。
```mermaid
flowchart LR
subgraph Client["客户端"]
Mic[麦克风] --> AudioIn[音频输入]
AudioOut[音频输出] --> Speaker[扬声器]
end
subgraph Engine["引擎"]
AudioIn --> RT[Realtime Model]
RT --> AudioOut
RT --> Tools[工具]
end
subgraph Model["实时交互引擎"]
RT --> GPT4o[OpenAI Realtime]
RT --> Gemini[Gemini Live]
RT --> Doubao[Doubao 实时]
end
Input[音频 / 视频 / 文本输入] --> RT[Realtime Model]
RT --> Output[音频 / 文本输出]
RT --> Tools[工具]
```
### 数据流详解
这样做的好处是:
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant RT as Realtime Model
- 链路更短,延迟更低
- 全双工与打断通常更自然
- 接入路径更简单,适合强调体验的入口
U->>E: 音频帧
E->>RT: 转发音频
Note over RT: 端到端处理
RT-->>E: 音频响应 (流式)
E-->>U: 播放音频
Note over U,RT: 支持全双工<br/>用户可随时打断
```
代价是:
### 外部服务(管线式)
- 更依赖特定模型供应商
- 对 ASR / TTS / 回合检测的独立控制更弱
- 成本和能力边界受实时模型限制更大
管线式引擎各环节可选用以下**外部服务**
## 怎么选
| 服务 | 说明 |
|------|------|
| **OpenAI** | LLM / ASR / TTS 等 |
| **SiliconFlow** | 国内 API 服务 |
| **DashScope** | 阿里云灵积 |
| **本地模型** | 私有化部署模型 |
### 适合选择 Pipeline 的情况
### 支持的实时交互模型
- 你要接入特定 ASR 或 TTS 供应商
- 你需要知识库、工具、工作流形成稳定业务流程
- 你更在意可解释性、观测和分段优化
- 你需要把成本按环节精细控制
| 模型 | 供应商 | 特点 |
|------|--------|------|
| **OpenAI Realtime** | OpenAI | 最自然的语音,延迟极低 |
| **Gemini Live** | Google | 多模态能力强 |
| **Doubao 实时交互** | 字节跳动 | 国内可用,中文优化 |
### 适合选择 Realtime 的情况
### 延迟对比
- 你把“自然对话感”放在首位
- 你需要更低的首响和更顺滑的打断体验
- 你可以接受对某个模型供应商的依赖
- 你的场景更接近语音助手、陪练、虚拟角色或多模态入口
```mermaid
xychart-beta
title "端到端延迟对比"
x-axis ["管线式 (普通)", "管线式 (优化)", "多模态"]
y-axis "延迟 (ms)" 0 --> 1500
bar [1200, 700, 300]
```
## 简化决策表
---
| 场景 | 推荐引擎 | 原因 |
|------|----------|------|
| 企业客服 / 电话机器人 | Pipeline | 可控、可审计、易接工具与业务系统 |
| 知识问答 / 业务流程助手 | Pipeline | 更适合接知识库与工作流 |
| 高拟真语音助手 | Realtime | 更自然、更低延迟 |
| 多模态入口 | Realtime | 端到端处理音频 / 视频 / 文本 |
| 预算敏感场景 | Pipeline | 更容易逐环节优化成本 |
## 智能打断机制
## 智能打断的差异
两种引擎都支持智能打断,但实现方式不同:
两类引擎都支持打断,但边界不同:
### 管线式引擎打断
- **Pipeline**:由 VAD / 回合检测与 TTS 停止逻辑协同实现,行为更可控
- **Realtime**:更多由实时模型内部完成,体验更自然,但可解释性更低
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant TTS as TTS
## 继续阅读
Note over E,TTS: TTS 正在合成播放
E->>U: 音频帧...
U->>E: 用户说话 (检测到 VAD)
E->>E: 判断是否有效打断
alt 有效打断
E->>TTS: 停止合成
E->>E: 清空音频缓冲
E-->>U: output.audio.interrupted
Note over E: 处理新输入
else 噪音/误触发
Note over E: 继续播放
end
```
### 多模态引擎打断
多模态模型原生支持全双工,打断由模型内部处理:
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant RT as Realtime Model
Note over RT: 模型正在输出
RT-->>E: 音频流...
E-->>U: 播放
U->>E: 用户说话
E->>RT: 转发用户音频
Note over RT: 模型检测到打断<br/>自动停止输出
RT-->>E: 新的响应
E-->>U: 播放新响应
```
---
## 引擎选择指南
### 决策流程
```mermaid
flowchart TD
Start[选择引擎] --> Q1{延迟要求?}
Q1 -->|< 500ms| Q2{预算充足?}
Q1 -->|> 500ms 可接受| Pipeline[管线式引擎]
Q2 -->|是| Q3{模型可用?}
Q2 -->|否| Pipeline
Q3 -->|GPT-4o/Gemini 可用| Multimodal[多模态引擎]
Q3 -->|国内环境受限| Q4{Step Audio?}
Q4 -->|可用| Multimodal
Q4 -->|不可用| Pipeline
```
### 场景推荐
| 场景 | 推荐引擎 | 理由 |
|------|---------|------|
| **企业客服** | 管线式 | 成本可控,可定制 ASR |
| **高端虚拟人** | 多模态 | 最自然的交互体验 |
| **电话机器人** | 管线式 | 可对接电信 ASR |
| **语音助手** | 多模态 | 低延迟,自然对话 |
| **口语练习** | 管线式 | 需要精确的 ASR 评分 |
### 混合方案
也可以根据用户等级使用不同引擎:
```mermaid
flowchart LR
User[用户请求] --> Router{路由判断}
Router -->|VIP 用户| Multimodal[多模态引擎]
Router -->|普通用户| Pipeline[管线式引擎]
Multimodal --> Response[响应]
Pipeline --> Response
```
---
## 配置示例
### 管线式引擎配置
```json
{
"engine": "pipeline",
"asr": {
"provider": "openai-compatible",
"model": "FunAudioLLM/SenseVoiceSmall",
"language": "zh"
},
"llm": {
"provider": "openai",
"model": "gpt-4o-mini",
"temperature": 0.7
},
"tts": {
"provider": "openai-compatible",
"model": "FunAudioLLM/CosyVoice2-0.5B",
"voice": "anna"
}
}
```
### 多模态引擎配置
```json
{
"engine": "multimodal",
"model": {
"provider": "openai",
"model": "gpt-4o-realtime-preview",
"voice": "alloy"
}
}
```
---
## 相关文档
- [系统架构](../overview/architecture.md) - 整体架构设计
- [WebSocket 协议](../api-reference/websocket.md) - 协议详情
- [部署指南](../deployment/index.md) - 引擎部署配置
- [Pipeline 引擎](pipeline-engine.md) - 查看分段链路、延迟构成与配置示例
- [Realtime 引擎](realtime-engine.md) - 查看端到端实时模型的交互路径
- [系统架构](../overview/architecture.md) - 从服务边界理解引擎在整体系统中的位置

View File

@@ -1,286 +1,49 @@
# 核心概念
# 核心概念
本章节介绍 Realtime Agent Studio 中的核心概念,帮助你更好地理解和使用平台。
本章节只解释 Realtime Agent Studio 的关键心智模型,不重复环境部署或助手构建的操作细节。
---
## 概念总览
## 先建立这三个概念
```mermaid
flowchart TB
subgraph Platform["RAS 平台"]
Assistant[助手 Assistant]
subgraph Resources["资源库"]
LLM[LLM 模型]
ASR[ASR 模型]
TTS[TTS 声音]
KB[知识库]
end
subgraph Engine["交互引擎"]
Pipeline[管线式引擎]
Multimodal[多模态引擎]
end
Session[会话 Session]
end
### 1. 助手是“对外提供能力的配置单元”
Assistant --> LLM
Assistant --> ASR
Assistant --> TTS
Assistant --> KB
Assistant --> Engine
Engine --> Session
```
助手决定了一个实时 AI 入口对外表现成什么角色:它使用什么提示词、哪些模型、能访问哪些知识和工具、会话如何开始以及运行时如何被覆盖。
- [助手概念](assistants.md) — 统一理解助手、会话、动态变量与能力边界
- [配置选项](assistants/configuration.md) — 了解界面层和运行时配置项如何分工
- [提示词指南](assistants/prompts.md) — 学会定义助手的角色、任务、风格与约束
- [测试调试](assistants/testing.md) — 理解如何验证助手行为和定位问题
### 2. 引擎是“承载实时交互的运行时”
RAS 同时提供 Pipeline 引擎与 Realtime 引擎。它们都能驱动实时助手,但在延迟、可控性、成本和可替换性上各有取舍。
- [引擎概览](engines.md) — 两类引擎的能力边界与选择建议
- [Pipeline 引擎](pipeline-engine.md) — VAD/ASR/TD/LLM/TTS 串联的可组合链路
- [Realtime 引擎](realtime-engine.md) — 面向端到端实时模型的低延迟交互路径
### 3. 工作流是“把复杂业务拆成步骤和分支的方法”
当单一提示词不足以稳定处理多步骤、多条件、多工具的业务流程时,应使用工作流来显式编排节点、路由和回退策略。
- [工作流](../customization/workflows.md) — 了解何时需要工作流、它由哪些部分组成、如何设计可维护的流程
---
## 助手 (Assistant)
## 本章节不负责什么
**助手**是 RAS 的核心实体,代表一个可对话的 AI 智能体。
以下内容属于“如何搭建和使用”,不在本章节展开说明:
### 助手配置
- 助手搭建、模型/知识库/工具/工作流配置:从 [助手概览](assistants.md) 进入构建链路
- 部署与环境变量:见 [环境与部署](../getting-started/index.md)
- 第一个助手的最短操作路径:见 [快速开始](../quickstart/index.md)
- 事件格式与接入协议:见 [API 参考](../api-reference/index.md)
每个助手包含以下配置:
## 建议阅读顺序
| 配置项 | 说明 |
|-------|------|
| **名称** | 助手的显示名称 |
| **系统提示词** | 定义助手角色、行为、限制 |
| **LLM 模型** | 选择用于生成回复的大语言模型 |
| **ASR 模型** | 选择用于语音识别的模型 |
| **TTS 声音** | 选择用于语音合成的音色 |
| **工具** | 配置助手可调用的外部工具 |
| **知识库** | 关联的知识库(用于 RAG) |
1. 先读 [助手概念](assistants.md),明确你要配置的对象到底是什么
2. 再读 [引擎概览](engines.md),决定应该选择 Pipeline 还是 Realtime
3. 如果场景涉及多步骤流程,再读 [工作流](../customization/workflows.md)
4. 最后回到 [快速开始](../quickstart/index.md) 或 [助手概览](assistants.md) 开始具体配置
### 助手生命周期
```mermaid
stateDiagram-v2
[*] --> Draft: 创建
Draft --> Draft: 编辑配置
Draft --> Published: 发布
Published --> Draft: 取消发布
Published --> Published: 更新配置
Published --> [*]: 删除
```
---
## 会话 (Session)
**会话**代表一次完整的对话交互,从用户连接到断开。
### 会话状态
```mermaid
stateDiagram-v2
[*] --> Connecting: WebSocket 连接
Connecting --> Started: session.started
Started --> Active: 对话中
Active --> Active: 多轮对话
Active --> Stopped: session.stop
Stopped --> [*]: 连接关闭
```
### 会话数据
每个会话记录包含:
- **基本信息** - ID、时长、时间戳
- **音频数据** - 用户和助手的音频记录
- **转写文本** - ASR 识别结果
- **LLM 交互** - 输入输出和工具调用
- **元数据** - 渠道、来源、自定义变量
---
## 管线式引擎 vs 多模态引擎
RAS 支持两种引擎架构,适用于不同场景。
### 管线式引擎 (Pipeline)
将语音交互拆分为多个环节,包含 **VAD(声音活动检测)**、**ASR(语音识别)**、**TD(回合检测)**、**LLM(大语言模型)**、**TTS(语音合成)**。外部服务可选 **OpenAI**、**SiliconFlow**、**DashScope**、**本地模型**。LLM 与实时交互引擎均可连接**工具**(Webhook、客户端工具、内建工具)。
```
用户语音 → [VAD] → [ASR] → [TD] → 文本 → [LLM] → 回复 → [TTS] → 助手语音
```
**优点:**
- 灵活选择各环节供应商(OpenAI、SiliconFlow、DashScope、本地模型)
- 可独立优化 VAD、ASR、TD、LLM、TTS 每个环节
- 成本可控
**缺点:**
- 延迟较高(累加延迟)
- 需要协调多个服务
### 实时交互引擎与多模态 (Realtime / Multimodal)
实时交互引擎可连接 **OpenAI Realtime**、**Gemini Live**、**Doubao 实时交互引擎** 等,同样可连接工具。使用端到端模型直接处理:
```
用户语音 → [Realtime Model] → 助手语音
```
**优点:**
- 更低延迟
- 更自然的语音
- 架构简单
**缺点:**
- 依赖特定供应商
- 成本较高
- 可定制性有限
### 选择建议
| 场景 | 推荐引擎 |
|------|---------|
| 成本敏感 | 管线式 |
| 延迟敏感 | 多模态 |
| 需要特定 ASR/TTS | 管线式 |
| 追求最自然体验 | 多模态 |
---
## 智能打断 (Barge-in)
**智能打断**是指用户在助手说话时可以随时插话,系统能够:
1. 检测用户开始说话
2. 立即停止 TTS 播放
3. 处理用户新的输入
### 打断检测方式
| 方式 | 说明 |
|------|------|
| **VAD** | Voice Activity Detection,检测到声音活动即打断 |
| **语义** | 基于语音内容判断是否有意义的打断 |
| **混合** | VAD + 语义结合,减少误触发 |
### 打断流程
```mermaid
sequenceDiagram
participant User as 用户
participant Engine as 引擎
participant TTS as TTS
Note over Engine,TTS: 助手正在播放回复
Engine->>User: 音频流...
User->>Engine: 开始说话 (VAD 触发)
Engine->>Engine: 打断判断
Engine->>TTS: 停止合成
Engine->>User: output.audio.interrupted
Note over Engine: 处理新输入
```
---
## 工具调用 (Tool Calling)
助手可以通过**工具**扩展能力,访问外部系统或执行特定操作。
### 工具类型
管线式引擎中的 LLM 与实时交互引擎均可连接**工具**,包括:
| 类型 | 说明 | 示例 |
|------|------|------|
| **Webhook** | 调用外部 HTTP API | 查询订单、预约日程 |
| **客户端工具** | 由客户端执行的操作 | 打开页面、显示表单 |
| **内建工具** | 平台提供的工具 | 代码执行、计算器 |
### 工具调用流程
```mermaid
sequenceDiagram
participant User as 用户
participant LLM as LLM
participant Tool as 工具
User->>LLM: "帮我查一下订单状态"
LLM->>LLM: 决定调用工具
LLM->>Tool: get_order_status(order_id)
Tool-->>LLM: {status: "已发货"}
LLM->>User: "您的订单已发货"
```
---
## 知识库 (Knowledge Base)
**知识库**让助手能够基于私有文档回答问题,实现 RAG(检索增强生成)。
### 工作原理
```mermaid
flowchart LR
subgraph Indexing["索引阶段"]
Doc[文档] --> Chunk[分块]
Chunk --> Embed[向量化]
Embed --> Store[(向量数据库)]
end
subgraph Query["查询阶段"]
Q[用户问题] --> QEmbed[问题向量化]
QEmbed --> Search[相似度搜索]
Store --> Search
Search --> Context[相关上下文]
Context --> LLM[LLM 生成回答]
end
```
### 支持的文档格式
- PDF
- Word (.docx)
- Markdown
- 纯文本
- HTML
---
## 动态变量
**动态变量**允许在运行时向助手注入上下文信息。
### 使用方式
在系统提示词中使用 `{{variable}}` 占位符:
```
你是{{company_name}}的客服助手。
当前用户是{{customer_name}},会员等级为{{tier}}。
```
连接时通过 `dynamicVariables` 传入:
```json
{
"type": "session.start",
"metadata": {
"dynamicVariables": {
"company_name": "ABC 公司",
"customer_name": "张三",
"tier": "VIP"
}
}
}
```
---
## 下一步
- [快速开始](../quickstart/index.md) - 创建第一个助手
- [助手配置](../assistants/configuration.md) - 详细配置说明
- [WebSocket 协议](../api-reference/websocket.md) - API 接口详情

View File

@@ -0,0 +1,137 @@
# Pipeline 引擎
Pipeline 引擎把实时对话拆成多个清晰环节,适合需要高可控性、可替换外部能力和复杂业务编排的场景。
---
## 运行链路
```mermaid
flowchart LR
subgraph Input["输入处理"]
Audio[用户音频] --> VAD[声音活动检测 VAD]
VAD --> ASR[语音识别 ASR]
ASR --> TD[回合检测 TD]
end
subgraph Reasoning["语义处理"]
TD --> LLM[大语言模型 LLM]
LLM --> Tools[工具]
LLM --> Text[回复文本]
end
subgraph Output["输出生成"]
Text --> TTS[语音合成 TTS]
TTS --> AudioOut[助手音频]
end
```
Pipeline 的关键价值不在于“环节多”,而在于每个环节都可以被单独选择、单独优化、单独观测。
## 它适合什么场景
- 需要接特定 ASR / TTS 供应商
- 需要稳定接入知识库、工具和工作流
- 需要把问题定位到具体环节,而不是只看到整体失败
- 需要按延迟、成本、质量对不同环节分别优化
## 数据流
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant ASR as ASR 服务
participant LLM as LLM 服务
participant TTS as TTS 服务
U->>E: 音频帧 (PCM)
E->>E: VAD / 回合检测
E->>ASR: 发送可识别音频
ASR-->>E: transcript.delta / transcript.final
E->>LLM: 发送对话历史与当前输入
LLM-->>E: assistant.response.delta
E->>TTS: 文本片段
TTS-->>E: 音频片段
E-->>U: 音频流与事件
```
## 延迟来自哪里
| 环节 | 典型影响 | 常见优化点 |
|------|----------|------------|
| **VAD / EoU** | 用户说完后多久触发回复 | 调整静音阈值和最短语音门限 |
| **ASR** | 语音转写速度和准确率 | 选择合适模型、热词和语言设置 |
| **LLM** | 首个 token 返回速度 | 选择低延迟模型、优化上下文 |
| **TTS** | 文字到音频的生成速度 | 选择流式 TTS,缩短单次回复 |
Pipeline 的总延迟通常不是单点问题,而是链路总和。因此更适合做“逐环节调优”。
## EoU(用户说完)为什么重要
Pipeline 必须决定“什么时候把当前轮输入正式交给 LLM”。这个判断通常由 **EoU** 完成。
- 阈值小:响应更快,但更容易把用户停顿误判为说完
- 阈值大:更稳,但首次响应会更慢
你可以把它理解为 Pipeline 中最直接影响“对话节奏感”的参数之一。
## 工具、知识库和工作流如何插入
Pipeline 特别适合把业务能力插入到对话中:
- **知识库**:在 LLM 生成前补充领域事实
- **工具**:在需要外部信息或动作时调用系统能力
- **工作流**:在多步骤、多分支流程中决定接下来走哪个节点
这也是它在企业客服、流程助手和知识问答场景中更常见的原因。
## 智能打断
在 Pipeline 中,打断通常由 VAD 检测和 TTS 停止逻辑协同完成:
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant TTS as TTS
Note over E,TTS: 正在播放回复
E->>U: 音频流...
U->>E: 用户开始说话
E->>E: 判定是否触发打断
E->>TTS: 停止合成 / 播放
E-->>U: output.audio.interrupted
```
相比端到端实时模型,这种方式更容易解释“为什么打断”以及“在哪个环节发生了问题”。
## 配置示例
```json
{
"engine": "pipeline",
"asr": {
"provider": "openai-compatible",
"model": "FunAudioLLM/SenseVoiceSmall",
"language": "zh"
},
"llm": {
"provider": "openai",
"model": "gpt-4o-mini",
"temperature": 0.7
},
"tts": {
"provider": "openai-compatible",
"model": "FunAudioLLM/CosyVoice2-0.5B",
"voice": "anna"
}
}
```
## 相关文档
- [引擎架构](engines.md) - 回到选择指南
- [Realtime 引擎](realtime-engine.md) - 对比端到端实时模型路径
- [工具](../customization/tools.md) - 设计可被 LLM 安全调用的工具
- [知识库](../customization/knowledge-base.md) - 在对话中补充领域知识

View File

@@ -0,0 +1,97 @@
# Realtime 引擎
Realtime 引擎直接连接端到端实时模型,适合把低延迟和自然语音体验放在第一位的场景。
---
## 运行链路
```mermaid
flowchart LR
Input[音频 / 视频 / 文本输入] --> RT[Realtime Model]
RT --> Output[音频 / 文本输出]
RT --> Tools[工具]
```
与 Pipeline 不同,Realtime 引擎不会把 ASR、回合检测、LLM、TTS 作为独立阶段暴露出来,而是更多依赖实时模型整体处理。
## 常见后端
| 后端 | 特点 |
|------|------|
| **OpenAI Realtime** | 语音交互自然,延迟低 |
| **Gemini Live** | 多模态能力强 |
| **Doubao 实时交互** | 更适合国内环境与中文场景 |
## 它适合什么场景
- 语音助手、陪练、虚拟角色等高自然度体验场景
- 对首响和连续打断体验要求高的入口
- 希望减少链路拼装复杂度,直接接入端到端模型的团队
## 数据流
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant RT as Realtime Model
U->>E: 音频 / 视频 / 文本输入
E->>RT: 转发实时流
RT-->>E: 流式文本 / 音频输出
E-->>U: 播放或渲染结果
```
## Realtime 的优势
- **延迟更低**:链路更短,用户感知更自然
- **全双工更顺滑**:用户插话时,模型更容易在内部处理打断
- **多模态更直接**:适合音频、视频、文本混合输入输出场景
## Realtime 的取舍
- 更依赖实时模型供应商的能力边界
- 不容易对 ASR / TTS / 回合检测做独立替换
- 成本和可观测性往往不如 Pipeline 那样可逐环节拆分
## 智能打断
Realtime 模型通常原生支持全双工和打断:
```mermaid
sequenceDiagram
participant U as 用户
participant E as 引擎
participant RT as Realtime Model
Note over RT: 模型正在输出
RT-->>E: 音频流...
E-->>U: 播放
U->>E: 用户开始说话
E->>RT: 转发新输入
Note over RT: 模型内部处理中断并切换回复
RT-->>E: 新的响应
E-->>U: 播放新响应
```
这种方式更自然,但你通常只能看到模型的整体行为,而不是每个中间阶段的细节。
## 配置示例
```json
{
"engine": "multimodal",
"model": {
"provider": "openai",
"model": "gpt-4o-realtime-preview",
"voice": "alloy"
}
}
```
## 相关文档
- [引擎架构](engines.md) - 回到两类引擎的选择指南
- [Pipeline 引擎](pipeline-engine.md) - 查看分段可控的运行路径
- [WebSocket 协议](../api-reference/websocket.md) - 了解客户端如何与引擎建立会话

View File

@@ -1,6 +1,21 @@
# 语音识别
# 语音识别
语音识别(ASR)负责将用户音频实时转写为文本,供对话引擎理解。
语音识别(ASR)负责将用户音频实时转写为文本,供引擎继续理解和处理。
## 关键配置项
| 配置项 | 说明 |
|--------|------|
| **ASR 引擎** | 选择语音识别服务提供商或自建服务 |
| **模型** | 实际使用的识别模型名称 |
| **语言** | 中文、英文或多语言 |
| **热词** | 提高业务词汇、品牌词、专有名词识别率 |
| **标点与规范化** | 自动补全标点、规范数字和日期等 |
## 模式
- `offline`:引擎本地缓冲音频后触发识别(适用于 OpenAI-compatible / SiliconFlow)
- `streaming`:音频分片实时发送到服务端,服务端持续返回转写事件(适用于 DashScope Realtime ASR、Volcengine BigASR)
## 配置项
@@ -8,17 +23,31 @@
|---|---|
| ASR 引擎 | 选择语音识别服务提供商 |
| 模型 | 识别模型名称 |
| `enable_interim` | 是否开启离线 ASR 中间结果(默认 `false`,仅离线模式生效) |
| `app_id` / `resource_id` | Volcengine 等厂商的应用标识与资源标识 |
| `request_params` | 厂商原生请求参数透传,例如 `end_window_size`、`force_to_speech_time`、`context` |
| 语言 | 中文/英文/多语言 |
| 热词 | 提升特定词汇识别准确率 |
| 标点与规范化 | 是否自动补全标点、文本规范化 |
## 建议
## 选择建议
- 客服场景建议开启热词并维护业务词表
- 多语言场景建议按会话入口显式指定语言
- 对延迟敏感场景优先选择流式识别模型
- 客服、外呼等业务场景建议维护热词表,并按业务线持续更新
- 多语言入口建议显式指定语言,避免模型自动判断带来的波动
- 对延迟敏感场景优先选择流式识别模型
- 对准确率敏感的场景,先评估专有名词、数字、地址等样本的识别表现
## 运行建议
- 使用与接入端一致的采样率和编码方式,减少额外转换
- 在测试阶段准备固定样本,便于对比不同模型或参数的变化
- 把“识别准确率”和“识别延迟”一起看,不要只看其中一项
## 相关文档
- [语音配置总览](voices.md)
- [声音资源](voices.md) - 完整语音输入输出链路中的 TTS 侧配置
- [快速开始](../quickstart/index.md) - 以任务路径接入第一个 ASR 资源
- 客服场景建议开启热词并维护业务词表
- 多语言场景建议按会话入口显式指定语言
- 对延迟敏感场景优先选择流式识别模型
- 当前支持提供商:`openai_compatible`、`siliconflow`、`dashscope`、`volcengine`、`buffered`(回退)

View File

@@ -1,53 +1,86 @@
# 知识库
# 知识库
知识库基于 RAG(检索增强生成)技术,让 AI 能够回答私有领域问题。
知识库负责承载助手需要引用的私有事实、业务资料和长文档内容,是 RAG(检索增强生成)能力的正式说明页。
## 概述
## 什么时候应该用知识库
![知识库](../images/knowledge.png)
当问题答案主要来自“稳定文档”而不是实时外部动作时,优先使用知识库:
## 创建知识库
- 产品说明、政策条款、操作流程、培训材料
- 内部手册、FAQ、规范文档
- 需要被多位助手复用的领域知识
### 步骤
如果任务本质上是“查状态、写数据、执行动作”,那通常更适合 [工具](tools.md),而不是知识库。
1. 进入 **知识库** 页面
2. 点击 **新建知识库**
3. 填写知识库名称
4. 上传文档
## 工作原理
### 支持格式
```mermaid
flowchart LR
subgraph Indexing["索引阶段"]
Doc[文档] --> Chunk[分块]
Chunk --> Embed[向量化]
Embed --> Store[(向量数据库)]
end
| 格式 | 说明 |
|------|------|
| Markdown | 最佳选择,格式清晰 |
| PDF | 自动提取文本 |
| TXT | 纯文本支持 |
| Word | 需转换为其他格式 |
subgraph Query["查询阶段"]
Q[用户问题] --> Search[相似度检索]
Store --> Search
Search --> Context[相关片段]
Context --> LLM[LLM 生成回答]
end
```
### 文档上传
核心原则很简单:把长文档转成可检索的片段,在用户提问时只把最相关的内容送给模型。
- 拖拽上传或点击选择
- 单文件大小限制 10MB
- 建议单文档不超过 50000 字
## 适合放进知识库的内容
## 配置检索参数
| 适合 | 不适合 |
|------|--------|
| 稳定规则、标准答案、产品文档 | 高频变化的实时状态 |
| 领域术语、说明手册、培训材料 | 需要外部系统写入或变更的动作 |
| 需要跨助手复用的内容 | 只在单次会话里临时生成的数据 |
| 参数 | 说明 | 默认值 |
|------|------|--------|
| 相似度阈值 | 低于此分数的结果不返回 | 0.7 |
| 返回数量 | 单次检索返回的结果数 | 3 |
| 分块大小 | 文档分块的最大长度 | 500 |
## 内容准备建议
## 管理知识库
- 优先上传结构清晰、主题明确的文档
- 对超长文档按主题拆分,减少一次索引的噪声
- 标题、章节名和表格说明对召回质量很重要,不要全部删掉格式信息
- 与其堆很多相近文档,不如先清理重复、过期和相互冲突的内容
- **查看文档** - 浏览已上传的文件
- **删除文档** - 移除不需要的内容
- **更新文档** - 重新上传覆盖
- **测试检索** - 验证知识库效果
## 常见配置项
## 关联助手
| 配置项 | 作用 | 常见做法 |
|--------|------|----------|
| **相似度阈值** | 过滤弱相关结果 | 从保守值起步,再按误召回调 |
| **返回数量** | 控制一次送给模型的候选片段数 | 先少后多,避免上下文污染 |
| **分块大小** | 决定每个文档片段的长度 | 按文档类型和问题粒度调整 |
在助手配置的 **知识** 标签页中:
1. 选择要关联的知识库
2. 设置检索策略
3. 保存配置
## 创建与维护
### 最小流程
1. 新建知识库
2. 上传文档
3. 完成索引
4. 用典型问题测试召回结果
5. 绑定到目标助手
### 日常维护
- 删除过期或互相矛盾的文档
- 当业务口径变化时,优先更新知识库而不是只改提示词
- 为关键问题准备固定测试问句,观察召回是否稳定
## 与助手的关系
知识库不是独立产品入口,而是助手的能力层:
- 助手决定是否、何时、以什么风格使用知识
- 知识库决定能够提供哪些事实片段
- 工作流和工具可以与知识库并用,但承担不同职责
## 相关文档
- [助手概念](../concepts/assistants.md) - 知识库在助手能力层中的位置
- [LLM 模型](models.md) - 为知识库准备嵌入或重排模型
- [工具](tools.md) - 当任务需要执行动作时,优先考虑工具而不是知识库

View File

@@ -1,44 +1,53 @@
# 模型配置
# LLM 模型
## LLM 模型库
本页是资源库中 LLM 模型的正式说明页,聚焦文本生成、嵌入和重排模型的接入与选择。
![LLM模型库](../images/llms.png)
## 这页负责什么
### 支持的模型
当你需要为助手配置“理解与生成能力”时,请从这里开始决定:
| 供应商 | 模型 | 特点 |
- 使用哪个供应商或模型家族
- 该模型负责文本生成、嵌入还是重排
- 接口地址、认证信息和默认参数如何设置
语音识别和语音合成分别由 [语音识别](asr.md) 与 [声音资源](voices.md) 说明,不在本页重复。
## 模型类型
| 类型 | 用途 | 常见场景 |
|------|------|----------|
| **文本模型** | 生成回复、总结、分类、规划 | 助手主对话、工具调用决策 |
| **嵌入模型** | 向量化文档或查询 | 知识库检索 |
| **重排模型** | 对检索结果再次排序 | 提升知识召回质量 |
## 配置清单
| 配置项 | 说明 | 建议 |
|--------|------|------|
| **OpenAI** | GPT-4 / GPT-3.5 | 通用能力强 |
| **DeepSeek** | DeepSeek Chat | 高性价比 |
| **SiliconFlow** | 多种开源模型 | 本地部署友好 |
| **Google** | Gemini Pro | 多模态支持 |
| **供应商** | OpenAI 兼容、托管平台或自建服务 | 用统一命名规范区分环境 |
| **模型名称** | 控制台中的显示名称 | 体现厂商、用途和环境 |
| **模型标识** | 请求中实际使用的 model 名称 | 保持与供应商文档一致 |
| **Base URL** | 接口地址 | 为不同环境分别配置 |
| **API Key / Token** | 鉴权凭证 | 与显示名称配套管理 |
| **默认参数** | Temperature、Max Tokens、上下文长度等 | 按业务场景收敛默认值 |
### 配置步骤
## 选择建议
1. 进入 **LLM 库** 页面
2. 点击 **添加模型**
3. 选择供应商
4. 填写 API Key 和 Endpoint
5. 设置默认参数
- **先按用途选模型,再按成本和延迟筛选供应商**
- **文本模型不要承担知识库检索职责**:检索应交给嵌入与重排模型
- **为不同环境建立清晰命名**:如 `prod-gpt4o-mini`、`staging-qwen-text`
- **默认参数要保守**:让助手默认稳定,再在单个场景内按需调优
### 参数说明
## 常见组合
| 参数 | 说明 | 建议值 |
|------|------|--------|
| Temperature | 随机性 | 0.7 |
| Max Tokens | 最大输出长度 | 2048 |
| Top P | 核采样 | 0.9 |
| 目标 | 推荐组合 |
|------|----------|
| **通用对话助手** | 1 个文本模型 |
| **知识问答助手** | 文本模型 + 嵌入模型 |
| **高质量知识召回** | 文本模型 + 嵌入模型 + 重排模型 |
## ASR 语音识别
## 下一步
### 支持引擎
- **Whisper** - OpenAI 通用语音识别
- **SenseVoice** - 高精度中文语音识别
### 配置方法
1. 进入 **ASR 库** 页面
2. 选择识别引擎
3. 配置音频参数(采样率、编码)
4. 测试识别效果
- [语音识别](asr.md) - 为语音输入选择 ASR
- [声音资源](voices.md) - 为语音输出准备 TTS 资源
- [知识库](knowledge-base.md) - 把嵌入 / 重排模型接入 RAG 链路

View File

@@ -1,38 +1,60 @@
# 工具集成
# 工具
工具(Tools)让助手能够执行外部操作,如查询天气、搜索信息、调用 API 等。
工具让助手从“会回答”扩展成“能执行动作”。本页是工具能力的正式说明页。
## 概述
## 什么时候应该用工具
工具是助手能力的扩展。当用户请求需要外部数据或操作时,助手会调用相应的工具
当用户请求需要依赖外部系统、实时数据或执行某个动作时,应该使用工具,而不是只靠提示词或知识库
## 内置工具
典型场景包括:
| 工具 | 说明 | 参数 |
|------|------|------|
| `search` | 网络搜索 | query: 搜索关键词 |
| `weather` | 天气查询 | city: 城市名称 |
| `calculator` | 数学计算 | expression: 计算表达式 |
| `knowledge` | 知识库检索 | query: 查询内容 |
- 查询订单、库存、物流、天气等实时信息
- 创建预约、提交表单、写入业务系统
- 获取客户端环境能力,如定位、相机、权限确认
### 启用内置工具
如果问题本质上是“查阅稳定资料”,优先用 [知识库](knowledge-base.md);如果问题是“执行动作或读写实时状态”,优先用工具。
在助手配置的 **工具** 标签页:
## 工具类型
1. 勾选需要启用的工具
2. 配置工具参数(如有)
3. 保存配置
| 类型 | 说明 | 常见场景 |
|------|------|----------|
| **Webhook 工具** | 调用外部 HTTP API | 订单查询、CRM 写入、预约服务 |
| **客户端工具** | 由接入端在本地执行 | 获取定位、打开相机、请求用户授权 |
| **内建工具** | 平台或运行时直接提供 | 搜索、计算、知识检索等 |
## 自定义工具
## 工具调用的基本过程
支持通过 HTTP 回调实现自定义工具。
```mermaid
sequenceDiagram
participant User as 用户
participant Assistant as 助手 / 模型
participant Tool as 工具
### 定义工具
User->>Assistant: 发起请求
Assistant->>Assistant: 判断是否需要工具
Assistant->>Tool: 发起工具调用
Tool-->>Assistant: 返回结构化结果
Assistant->>User: 组织最终回复
```
关键点不是“模型会不会调用工具”,而是“工具的定义是否足够清晰,能让模型在正确时机调用”。
## 如何定义一个好工具
| 要素 | 为什么重要 |
|------|------------|
| **清晰名称** | 让模型知道它是做什么的,而不是猜用途 |
| **明确描述** | 告诉模型何时调用、何时不要调用 |
| **完整参数定义** | 降低缺参、错参和歧义调用 |
| **稳定返回结构** | 让模型更容易根据结果组织回复 |
| **明确错误语义** | 让失败时也能安全退回用户对话 |
## Webhook 工具示例
```json
{
"name": "query_order",
"description": "查询用户订单信息",
"description": "根据订单号查询当前订单状态,仅用于用户已提供订单号的场景。",
"parameters": {
"type": "object",
"properties": {
@@ -42,188 +64,45 @@
}
},
"required": ["order_id"]
},
"endpoint": {
"url": "https://api.example.com/orders",
"method": "GET",
"headers": {
"Authorization": "Bearer {{api_key}}"
}
}
}
```
### 工具字段说明
## 客户端工具的作用
| 字段 | 说明 |
|------|------|
| name | 工具名称(英文标识符) |
| description | 工具描述(LLM 用于理解工具用途) |
| parameters | 参数定义(JSON Schema 格式) |
| endpoint | HTTP 调用配置 |
某些动作必须在接入端执行,例如:
### 参数映射
- 获取当前位置
- 请求麦克风或相机权限
- 打开特定页面或原生能力
工具参数自动映射到 HTTP 请求:
这类工具通常通过事件流和客户端配合完成,而不是由后端直接执行。
- **GET 请求**:参数作为 query string
- **POST 请求**:参数作为 JSON body
## 工具设计建议
## 客户端工具
- **一工具一职责**:不要把多个业务动作塞进同一个工具
- **名称与描述写给模型看**:必须明确何时用、何时不用
- **先设计错误返回**:失败时模型应该知道如何解释给用户
- **减少高权限工具暴露面**:不是每个助手、每个工作流节点都需要全部工具
- **把业务规则放回系统**:工具负责执行,提示词负责决策边界
某些工具需要在客户端执行(如获取地理位置)。
## 与知识库、工作流的分工
### 工作流程
- **知识库**:提供稳定事实
- **工具**:执行动作或读取实时状态
- **工作流**:决定何时进入某个步骤、调用哪个工具、失败如何回退
1. 助手返回 `assistant.tool_call` 事件
2. 客户端执行工具并获取结果
3. 客户端发送 `tool_call.results` 消息
4. 助手继续生成回复
当一个助手开始涉及多步骤、多系统调用时,工具通常应与 [工作流](workflows.md) 一起设计,而不是孤立配置。
### 服务端事件
## 安全与治理
```json
{
"type": "assistant.tool_call",
"data": {
"tool_call_id": "call_abc123",
"tool_name": "get_location",
"arguments": {}
}
}
```
- 校验输入,不直接信任模型生成的参数
- 为工具设置最小权限和清晰的可见范围
- 记录调用日志,便于审计和回放
- 对外部接口增加超时、重试和速率限制策略
### 客户端响应
## 相关文档
```json
{
"type": "tool_call.results",
"results": [
{
"tool_call_id": "call_abc123",
"name": "get_location",
"output": {
"latitude": 39.9042,
"longitude": 116.4074,
"city": "北京"
},
"status": {
"code": 200,
"message": "ok"
}
}
]
}
```
## 工具调用示例
### 天气查询
用户:"北京今天天气怎么样?"
助手调用工具:
```json
{
"tool_name": "weather",
"arguments": {
"city": "北京"
}
}
```
工具返回:
```json
{
"temperature": 25,
"condition": "晴",
"humidity": 40
}
```
助手回复:"北京今天天气晴朗,气温 25 度,湿度 40%。"
### 订单查询
用户:"帮我查一下订单 12345"
助手调用工具:
```json
{
"tool_name": "query_order",
"arguments": {
"order_id": "12345"
}
}
```
工具返回:
```json
{
"order_id": "12345",
"status": "已发货",
"tracking": "SF1234567890"
}
```
助手回复:"您的订单 12345 已发货,快递单号是 SF1234567890。"
## 工具配置最佳实践
### 1. 清晰的描述
工具描述应该让 LLM 准确理解何时使用:
```
好的描述:
"查询指定城市的实时天气信息,包括温度、天气状况和湿度"
不好的描述:
"天气工具"
```
### 2. 完整的参数定义
```json
{
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "城市名称,如 '北京'、'上海'"
},
"date": {
"type": "string",
"description": "日期,格式 YYYY-MM-DD可选默认今天"
}
},
"required": ["city"]
}
}
```
### 3. 错误处理
工具应返回清晰的错误信息:
```json
{
"status": {
"code": 404,
"message": "未找到该城市的天气数据"
}
}
```
## 安全注意事项
1. **验证输入** - 不要直接信任用户输入
2. **限制权限** - 工具只应有必要的权限
3. **审计日志** - 记录所有工具调用
4. **速率限制** - 防止滥用
## 下一步
- [知识库配置](knowledge-base.md) - 让助手具备专业知识
- [工作流编排](workflows.md) - 复杂对话流程
- [知识库](knowledge-base.md) - 当问题更适合“查资料”时使用知识库
- [工作流](workflows.md) - 当工具调用需要流程控制和分支逻辑时接入工作流
- [助手概念](../concepts/assistants.md) - 理解工具在助手能力层中的位置

View File

@@ -1,25 +1,25 @@
# 语音生成
# TTS 参数
语音生成(TTS)负责将助手回复文本转换为可播放音频。
TTS 参数决定助手语音输出的节奏、音量和听感。本页只讨论参数层面的调优建议。
## 配置项
## 常用参数
| 配置项 | 说明 |
|---|---|
| TTS 引擎 | 选择语音合成服务提供商 |
| 声音/音色 | 选择目标音色或发音人 |
| 模型 | 语音合成模型名称 |
| 语速 | 播放速度,通常 0.5-2.0 |
| 音量/增益 | 输出音量控制 |
| 音调 | 声线高低调整 |
| 参数 | 说明 | 常见范围 |
|------|------|----------|
| **语速** | 说话速度 | `0.5 - 2.0` |
| **音量 / 增益** | 输出音量强弱 | 供应商自定义 |
| **音调** | 声线高低 | 供应商自定义 |
| **模型** | 合成模型名称 | 依供应商而定 |
| **声音 ID** | 发音人或音色标识 | 依供应商而定 |
## 建议
## 调优建议
- 对话助手建议保持语速在 `0.9-1.2`
- 生产环境建议固定主音色,降低体验波动
- 若需要打断能力,优先使用低延迟流式 TTS
- 对话助手通常建议把语速控制在 `0.9 - 1.2`
- 需要打断能力的场景,优先选择低延迟流式 TTS,并避免过长的单次回复
- 如果业务强调可信度或专业感,先保证清晰度和稳定性,再追求个性化音色
- 不要只试听一句问候语,至少用三类文案对比:短答复、长答复、数字或专有名词较多的答复
## 相关文档
- [语音配置总览](voices.md)
- [声音资源](voices.md) - 先选择适合的供应商、模型和音色
- [语音识别](asr.md) - 结合输入侧延迟一起评估整条语音链路

View File

@@ -1,58 +1,43 @@
# 语音合成
# 声音资源
语音合成(TTS)模块提供自然流畅的语音输出能力。
本页是资源库中 TTS 声音与发音人资源的正式说明页,聚焦“选择哪种声音给助手输出”。
## 概述
## 这页负责什么
![语音合成](../images/voices.png)
当你已经决定启用语音输出后,需要在这里完成:
## 支持的引擎
- 选择供应商、模型和声音资源
- 为不同业务或语言准备不同音色
- 通过预览和测试确定默认发音人
| 供应商 | 特点 | 适用场景 |
|--------|------|---------|
| **阿里云** | 多音色、高自然度 | 通用场景 |
| **火山引擎** | 低延迟、实时性好 | 实时对话 |
| **Minimax** | 高性价比 | 批量合成 |
更细的速度、音量、音调等参数建议见 [TTS 参数](tts.md)。
## 配置方法
## 选择声音时要考虑什么
### 添加语音配置
1. 进入 **语音库** 页面
2. 点击 **添加语音**
3. 选择供应商
4. 填写 API 凭证
5. 保存配置
### 测试语音
- 在线预览发音效果
- 调整语速和音量
- 切换不同音色
## 音色选择
### 中文音色
| 音色 | 风格 |
| 维度 | 说明 |
|------|------|
| 晓晓 | 标准女声 |
| 晓北 | 知性女声 |
| 逍遥 | 青年男声 |
| 丫丫 | 活泼童声 |
| **语言与口音** | 是否覆盖目标用户语言与地区口音 |
| **风格** | 专业、亲切、活泼、沉稳等输出气质 |
| **延迟** | 是否适合实时对话,而不仅是离线合成 |
| **稳定性** | 长文本、多轮会话中的音色一致性 |
| **成本** | 单次调用成本和高并发可用性 |
### 英文音色
## 推荐做法
| 音色 | 风格 |
|------|------|
| Joanna | 专业女声 |
| Matthew | 沉稳男声 |
| Amy | 亲切女声 |
1. 先为每类业务角色确定一条主音色
2. 再按语言或渠道补充少量备选音色
3. 通过固定测试文案试听,统一比较自然度、节奏和可懂度
4. 上线后尽量保持默认音色稳定,避免频繁切换影响用户体验
## 参数调优
## 常见资源组织方式
| 参数 | 范围 | 说明 |
|------|------|------|
| 语速 | 0.5-2.0 | 1.0 为正常速度 |
| 音量 | 0-100 | 输出音量百分比 |
| 音调 | 0.5-2.0 | 语音音调高低 |
| 组织方式 | 适用场景 |
|----------|----------|
| **按语言区分** | 中英文或多语种助手 |
| **按业务角色区分** | 客服、销售、培训、提醒类助手 |
| **按环境区分** | 开发、预发、生产使用不同供应商或凭证 |
## 下一步
- [TTS 参数](tts.md) - 调整语速、增益、音调等输出参数
- [快速开始](../quickstart/index.md) - 把声音资源绑定到第一个助手

View File

@@ -1,53 +1,106 @@
# 工作流管理
# 工作流
工作流提供可视化的对话流程编排能力,支持复杂的业务场景。
工作流用于把复杂业务拆成明确的步骤、分支和回退策略,是 RAS 中承载流程逻辑的正式能力页。
## 概述
## 什么时候需要工作流
![工作流](../images/workflows.png)
当一个助手同时满足以下任一情况时,通常应考虑工作流,而不是继续堆叠单一提示词:
## 节点类型
- 需要多轮收集信息,例如订单号、手机号、预约时间等
- 需要按意图或条件走不同分支
- 需要串联多个工具或业务系统
- 需要在异常或信息不足时统一回退到澄清、兜底或人工节点
| 节点 | 图标 | 功能说明 |
|------|------|---------|
| **对话节点** | 💬 | AI 自动回复,可设置回复策略 |
| **工具节点** | 🔧 | 调用外部 API 或自定义工具 |
| **人工节点** | 👤 | 转接人工客服 |
| **结束节点** | 🏁 | 结束对话流程 |
## 工作流与助手的关系
## 创建工作流
助手负责对外表现、全局策略和渠道接入;工作流负责把某个业务流程拆成可维护的节点。
### 步骤
```mermaid
flowchart LR
Assistant[助手] --> Workflow[工作流]
Workflow --> Nodes[节点与分支]
Nodes --> Tools[工具 / 知识库 / 人工]
```
1. 进入 **工作流** 页面
2. 点击 **新建工作流**
3. 从左侧拖拽节点到画布
4. 连接节点建立流程
5. 配置各节点参数
6. 保存并发布
这意味着:
### 节点配置
- 助手定义角色、提示词基线、模型和输出方式
- 工作流定义“这类问题该按什么顺序被处理”
- 工具和知识库作为节点可调用的能力,被有选择地暴露给流程
#### 对话节点配置
## 关键组成
- 回复模板
- 条件分支
- 知识库检索
| 组成 | 作用 | 设计建议 |
|------|------|----------|
| **工作流名称** | 区分业务流程 | 用业务语义命名,避免过于技术化 |
| **入口节点** | 用户进入后的第一步 | 保持单入口,便于理解和测试 |
| **全局提示词** | 对所有节点生效的共性约束 | 保持简短,避免与节点提示词冲突 |
| **节点提示词** | 当前节点的任务说明 | 单一职责,明确输入 / 输出 |
| **节点工具白名单** | 控制当前节点可调用的工具集合 | 遵循最小权限原则 |
| **超时与回退** | 异常、超时、缺信息时的处理方式 | 优先回到澄清、兜底或人工节点 |
| **上下文透传** | 在节点之间共享状态 | 只传递后续节点真正需要的信息 |
#### 工具节点配置
## 常见节点类型
- 选择工具类型
- 配置输入参数
- 设置输出处理
| 节点类型 | 适合做什么 |
|----------|------------|
| **路由节点** | 判断用户意图并进入不同分支 |
| **信息收集节点** | 收集订单号、联系方式、时间等关键信息 |
| **处理节点** | 调用工具、执行查询、计算或写入系统 |
| **回复节点** | 组织最终答复并控制输出风格 |
| **人工节点** | 转接人工、排队或发起通知 |
| **结束节点** | 输出结束语并关闭流程 |
#### 人工节点配置
## 推荐编排步骤
- 转接规则
- 排队策略
- 通知设置
1. 先写清楚流程目标:这条工作流要解决哪一类业务问题
2. 画出最小节点图:入口、关键分支、结束和兜底
3. 为每个节点定义唯一职责和输入 / 输出
4. 再绑定知识库、工具和回退策略
5. 在测试面板或流程调试工具中验证每条主路径和异常路径
## 流程测试
## 配置示例
- 支持单步调试
- 可查看执行日志
- 实时验证流程逻辑
```yaml
workflow:
name: "订单咨询流程"
entry: "intent_router"
global_prompt: "优先给出可执行步骤,必要时先澄清信息。"
nodes:
- id: "intent_router"
type: "router"
prompt: "识别用户意图:查订单、退款、投诉"
next:
- when: "intent == query_order"
to: "collect_order_id"
- when: "intent == refund"
to: "refund_policy"
- id: "collect_order_id"
type: "collect"
prompt: "请用户提供订单号"
tools: ["query_order"]
fallback: "human_handoff"
- id: "human_handoff"
type: "end"
prompt: "转人工处理"
```
## 设计建议
- **让每个节点只做一件事**:避免单节点同时负责路由、收集信息和最终回复
- **工具按节点授权**:不要把所有工具暴露给整条流程中的每个节点
- **把失败路径设计出来**:超时、无结果、参数缺失都应该有明确回退
- **优先传状态,不传长文本**:节点之间共享必要结构化信息,比传递大段自然语言更稳
- **为流程保留可观测性**:每条主路径都应能在调试时解释“为什么走到这里”
## 当前边界
- 文档不会完整覆盖所有表达式或节点字段的最终 Schema
- 不同执行引擎下,可用节点字段和运行行为可能存在差异
- 可视化编排与底层字段映射可能不会一一对应
## 相关文档
- [助手概念](../concepts/assistants.md) - 工作流在助手体系中的位置
- [工具](tools.md) - 设计可被流程安全调用的工具
- [知识库](knowledge-base.md) - 让流程中的节点使用 RAG 能力

View File

@@ -1,4 +1,4 @@
# 配置说明
# 配置说明
本页面介绍 Realtime Agent Studio 各组件的配置方法。
@@ -274,5 +274,6 @@ python -c "from config import settings; print(settings)"
## 下一步
- [安装部署](index.md) - 开始安装服务
- [环境与部署](index.md) - 开始安装服务
- [Docker 部署](../deployment/docker.md) - 容器化部署

View File

@@ -1,12 +1,12 @@
# 安装部署
# 环境与部署
本章节介绍如何安装和配置 Realtime Agent Studio (RAS) 开发环境。
本页属于“快速开始”中的环境与部署路径,只负责把服务跑起来、说明配置入口和部署方式。首次创建助手请转到 [创建第一个助手](../quickstart/index.md)。
---
## 系统组件
## 先理解部署对象
RAS 由三个核心服务组成:
Realtime Agent Studio(RAS)通常由三个核心服务组成:
```mermaid
flowchart LR
@@ -26,47 +26,32 @@ flowchart LR
Engine <--> API
```
| 组件 | 端口 | 说明 |
|------|------|------|
| **Web 前端** | 3000 | React + TypeScript 管理控制台 |
| **API 服务** | 8080 | Python FastAPI 后端 |
| **Engine 服务** | 8000 | 实时对话引擎(WebSocket) |
| 组件 | 默认端口 | 负责什么 |
|------|----------|----------|
| **Web 前端** | 3000 | 管理控制台与调试界面 |
| **API 服务** | 8080 | 资源管理、配置持久化、历史数据 |
| **Engine 服务** | 8000 | 实时会话、事件流和音频流 |
---
## 选择你的安装方式
## 快速安装
### 方式一:Docker Compose
### 方式一:Docker Compose(推荐)
最快捷的启动方式,适合快速体验和生产部署。
适合希望尽快跑通一套完整环境的团队。
```bash
# 1. 克隆项目
# 仓库目录示例沿用当前代码仓库 slug
# 你本地实际目录名可以不同
git clone https://github.com/your-org/AI-VideoAssistant.git
cd AI-VideoAssistant
# 2. 启动服务
docker-compose up -d
# 3. 访问控制台
open http://localhost:3000
```
!!! tip "首次启动"
首次启动需要构建镜像,可能需要几分钟时间。
### 方式二:本地开发
适合需要修改代码的开发者。
适合需要分别调试前端、API 和 Engine 的开发者。
#### 1. 克隆项目
```bash
git clone https://github.com/your-org/AI-VideoAssistant.git
cd AI-VideoAssistant
```
#### 2. 启动 API 服务
#### 启动 API 服务
```bash
cd api
@@ -76,7 +61,7 @@ pip install -r requirements.txt
uvicorn main:app --host 0.0.0.0 --port 8080 --reload
```
#### 3. 启动 Engine 服务
#### 启动 Engine 服务
```bash
cd engine
@@ -86,7 +71,7 @@ pip install -r requirements.txt
python main.py
```
#### 4. 启动 Web 前端
#### 启动 Web 前端
```bash
cd web
@@ -94,97 +79,37 @@ npm install
npm run dev
```
访问 `http://localhost:3000`
## 基础验证
---
完成安装后,至少确认以下入口可访问:
## 验证安装
| 服务 | 地址 | 用途 |
|------|------|------|
| Web | `http://localhost:3000` | 打开控制台 |
| API | `http://localhost:8080/docs` | 查看管理接口 |
| Engine | `http://localhost:8000/health` | 检查实时引擎健康状态 |
### 检查服务状态
如果你需要更完整的环境变量、配置文件和部署说明,请继续阅读本章节其他页面:
| 服务 | URL | 预期结果 |
|------|-----|---------|
| Web | http://localhost:3000 | 看到登录/控制台页面 |
| API | http://localhost:8080/docs | 看到 Swagger 文档 |
| Engine | http://localhost:8000/health | 返回 `{"status": "ok"}` |
- [环境要求](requirements.md)
- [配置说明](configuration.md)
- [部署概览](../deployment/index.md)
- [Docker 部署](../deployment/docker.md)
### 测试 WebSocket 连接
## 目录结构(阅读导向)
```javascript
const ws = new WebSocket('ws://localhost:8000/ws?assistant_id=test');
ws.onopen = () => console.log('Connected!');
ws.onerror = (e) => console.error('Error:', e);
```text
repo/
├── web/ # 管理控制台
├── api/ # 控制面与管理接口
├── engine/ # 实时交互引擎
├── docker/ # 部署编排与镜像配置
└── docs/ # 当前文档站点
```
---
## 遇到问题时去哪里
## 目录结构
- 需要“快速判断往哪看”:先看 [常见问题](../resources/faq.md)
- 需要“按步骤排查”:直接看 [故障排查](../resources/troubleshooting.md)
- 已经跑通环境,准备创建助手:回到 [快速开始](../quickstart/index.md)
```
AI-VideoAssistant/
├── web/ # React 前端
│ ├── src/
│ │ ├── components/ # UI 组件
│ │ ├── pages/ # 页面
│ │ ├── stores/ # Zustand 状态
│ │ └── api/ # API 客户端
│ └── package.json
├── api/ # FastAPI 后端
│ ├── app/
│ │ ├── routers/ # API 路由
│ │ ├── models/ # 数据模型
│ │ └── services/ # 业务逻辑
│ └── requirements.txt
├── engine/ # 实时交互引擎
│ ├── app/
│ │ ├── pipeline/ # 管线引擎
│ │ └── multimodal/ # 多模态引擎
│ └── requirements.txt
├── docker/ # Docker 配置
│ └── docker-compose.yml
└── docs/ # 文档
```
---
## 常见问题
### 端口被占用
```bash
# 查看端口占用
# Linux/Mac
lsof -i :3000
# Windows
netstat -ano | findstr :3000
```
修改对应服务的端口配置后重启。
### Docker 构建失败
```bash
# 清理 Docker 缓存
docker system prune -a
# 重新构建
docker-compose build --no-cache
```
### Python 依赖安装失败
确保使用 Python 3.10+
```bash
python --version # 需要 3.10+
```
---
## 下一步
- [环境要求](requirements.md) - 详细的软件版本要求
- [配置说明](configuration.md) - 环境变量配置指南
- [快速开始](../quickstart/index.md) - 创建第一个助手
- [Docker 部署](../deployment/docker.md) - 镜像构建与编排

View File

@@ -1,4 +1,4 @@
# 环境要求
# 环境要求
本页面列出运行 Realtime Agent Studio 所需的软件和硬件要求。
@@ -145,5 +145,6 @@ wsl --install -d Ubuntu
## 下一步
- [配置说明](configuration.md) - 环境变量配置
- [安装部署](index.md) - 开始安装
- [环境与部署](index.md) - 开始安装
- [Docker 部署](../deployment/docker.md) - 容器化部署

View File

@@ -1,9 +1,9 @@
<p align="center">
<p align="center">
<img src="images/logo.png" alt="Realtime Agent Studio" width="400">
</p>
<p align="center">
<strong>构建实时交互音视频智能体的开源工作平台</strong>
<strong>通过管理控制台与 API 构建、部署和运营实时多模态助手</strong>
</p>
<p align="center">
@@ -14,66 +14,65 @@
</p>
<p align="center">
<a href="overview/index.md">产品概览</a> ·
<a href="quickstart/index.md">快速开始</a> ·
<a href="api-reference/index.md">API 文档</a> ·
<a href="getting-started/index.md">安装部署</a> ·
<a href="roadmap.md">路线图</a>
<a href="concepts/assistants.md">构建助手</a> ·
<a href="concepts/index.md">核心概念</a> ·
<a href="api-reference/index.md">API 参考</a>
</p>
---
## 什么是 Realtime Agent Studio
Realtime Agent Studio (RAS) 是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台。
Realtime Agent Studio (RAS) 是一款以大语言模型为核心,构建实时交互音视频智能体的工作平台。支持管线式的全双工交互引擎和原生多模态模型两种架构,覆盖实时交互智能体的配置、测试、发布、监控全流程。
## 适合谁
可以将 RAS 看作 [Vapi](https://vapi.ai)、[Retell](https://retellai.com)、[ElevenLabs Agents](https://elevenlabs.io) 的**开源替代方案**。
- 需要把实时语音或视频助手接入产品、设备或内部系统的开发团队
- 需要通过控制台快速配置提示词、模型、知识库、工具和工作流的运营团队
- 需要私有化部署、模型可替换、链路可观测的企业场景
---
## 核心特性
## 核心能力
<div class="grid cards" markdown>
- :zap: **低延迟实时引擎**
- :material-robot-outline: **助手构建**
---
管线式全双工架构,VAD/ASR/TD/LLM/TTS 流水线处理,支持智能打断,端到端延迟 < 500ms
用统一的助手对象管理提示词、模型、知识库、工具、开场白和会话策略。
- :brain: **多模态模型支持**
- :material-pulse: **双引擎运行时**
---
支持 GPT-4o Realtime、Gemini Live、Step Audio 等原生多模态模型直连
同时支持 Pipeline 引擎与 Realtime 引擎,可按延迟、成本和可控性选择运行方式。
- :wrench: **可视化配置**
- :material-source-branch: **能力扩展**
---
无代码配置助手、提示词、工具调用、知识库关联,所见即所得
通过资源库、知识库、工具与工作流扩展助手能力,而不是把全部逻辑塞进单一提示词。
- :electric_plug: **开放 API**
- :material-api: **开放集成**
---
标准 WebSocket 协议,RESTful 管理接口,支持 Webhook 回调
使用 REST API 管理资源,使用 WebSocket API 接入实时对话,面向 Web、移动端和第三方系统。
- :shield: **私有化部署**
- :material-shield-lock-outline: **私有化部署**
---
Docker 一键部署,数据完全自主可控,支持本地模型
支持 Docker 部署、自有模型服务和企业内网运行,便于满足合规与成本要求。
- :chart_with_upwards_trend: **全链路监控**
- :material-chart-line: **可观测与评估**
---
完整会话回放,实时仪表盘,自动化测试效果评估
提供会话历史、实时指标、自动化测试效果评估,帮助持续改进助手质量。
</div>
---
## 系统架构
平台架构层级:
@@ -81,243 +80,107 @@ Realtime Agent Studio (RAS) 是一款以大语言模型为核心,构建实时
```mermaid
flowchart TB
%% ================= ACCESS =================
subgraph Access["Access Layer"]
direction TB
API[API]
SDK[SDK]
Browser[Browser UI]
Embed[Web Embed]
end
subgraph Access["Access Layer"]
API["API"]
SDK["SDK"]
Browser["Browser UI"]
Embed["Web Embed"]
end
subgraph Runtime["Realtime Interaction Engine"]
direction LR
%% ================= REALTIME ENGINE =================
subgraph Runtime["Realtime Interaction Engine"]
subgraph Duplex["Duplex Interaction Engine"]
direction LR
direction LR
subgraph Pipeline["Pipeline Engine"]
direction LR
VAD["VAD"]
ASR["ASR"]
TD["Turn Detection"]
LLM["LLM"]
TTS["TTS"]
end
%% -------- Duplex Engine --------
subgraph Duplex["Duplex Interaction Engine"]
direction LR
subgraph Multi["Realtime Engine"]
MM["Realtime Model"]
end
end
subgraph Pipeline["Pipeline Engine"]
direction LR
VAD[VAD]
ASR[ASR]
TD[Turn Detection]
LLM[LLM]
TTS[TTS]
end
subgraph Capability["Agent Capabilities"]
subgraph Tools["Tool System"]
Webhook["Webhook"]
ClientTool["Client Tools"]
Builtin["Builtin Tools"]
end
subgraph Multi["Realtime Engine"]
MM[Realtime Model]
end
subgraph KB["Knowledge System"]
Docs["Documents"]
Vector[("Vector Index")]
Retrieval["Retrieval"]
end
end
end
end
subgraph Platform["Platform Services"]
direction TB
Backend["Backend Service"]
Frontend["Frontend Console"]
DB[("Database")]
end
%% -------- Capabilities --------
subgraph Capability["Agent Capabilities"]
subgraph Tools["Tool System"]
Webhook[Webhook]
ClientTool[Client Tools]
Builtin[Builtin Tools]
end
subgraph KB["Knowledge System"]
Docs[Documents]
Vector[(Vector Index)]
Retrieval[Retrieval]
end
end
end
%% ================= PLATFORM =================
subgraph Platform["Platform Services"]
direction TB
Backend[Backend Service]
Frontend[Frontend Console]
DB[(Database)]
end
%% ================= CONNECTIONS =================
Access --> Runtime
Runtime <--> Backend
Backend <--> DB
Backend <--> Frontend
LLM --> Tools
MM --> Tools
LLM <--> KB
MM <--> KB
Access --> Runtime
Runtime <--> Backend
Backend <--> DB
Backend <--> Frontend
LLM --> Tools
MM --> Tools
LLM <--> KB
MM <--> KB
```
管线式交互引擎对话流程图:
```mermaid
flowchart LR
User((User Speech))
Audio[Audio Stream]
VAD[VAD\nVoice Activity Detection]
ASR[ASR\nSpeech Recognition]
TD[Turn Detection]
LLM[LLM\nReasoning]
Tools[Tools / APIs]
TTS[TTS\nSpeech Synthesis]
AudioOut[Audio Stream Out]
User --> Audio
Audio --> VAD
VAD --> ASR
ASR --> TD
TD --> LLM
LLM --> Tools
Tools --> LLM
LLM --> TTS
TTS --> AudioOut
AudioOut --> User
```
基于实时交互模型的对话流程图:
```mermaid
flowchart LR
User((User))
Input[Audio / Video / Text]
MM[Multimodal Model]
Tools[Tools / APIs]
KB[Knowledge Base]
Output[Audio / Video / Text]
User --> Input
Input --> MM
MM --> Tools
Tools --> MM
MM --> KB
KB --> MM
MM --> Output
Output --> User
```
---
## 技术栈
| 层级 | 技术 |
|------|------|
| **前端** | React 18, TypeScript, Tailwind CSS, Zustand |
| **后端** | FastAPI (Python 3.10+) |
| **引擎** | Python, WebSocket, asyncio |
| **数据库** | SQLite |
| **知识库** | chroma |
| **部署** | Docker |
---
## 快速导航
## 从这里开始
<div class="grid cards" markdown>
- :rocket: **[快速开始](quickstart/index.md)**
- :material-compass-outline: **[了解产品](overview/index.md)**
---
5 分钟创建你的第一个 AI 助手
先看产品定位、核心模块、适用场景,以及 RAS 与其他方案的差异。
- :book: **[核心概念](concepts/index.md)**
- :material-cog-outline: **[环境与部署](getting-started/index.md)**
---
了解助手、管线、多模态等核心概念
先把服务跑起来,了解环境要求、配置入口和部署方式。
- :wrench: **[安装部署](getting-started/index.md)**
- :material-rocket-launch-outline: **[创建第一个助手](quickstart/index.md)**
---
环境准备、本地开发与 Docker/生产部署
按最短路径准备资源、创建助手、测试效果并拿到接入所需信息。
- :robot: **[助手管理](assistants/index.md)**
- :material-tune: **[构建助手](concepts/assistants.md)**
---
创建和配置智能对话助手
按完整链路配置助手、提示词、模型、知识库、工具与工作流。
- :gear: **[功能定制](customization/knowledge-base.md)**
- :material-connection: **[接入应用](api-reference/index.md)**
---
知识库、工具、语音、工作流
查看 REST 与 WebSocket 接口,把助手嵌入到你的 Web、移动端或服务端系统。
- :bar_chart: **[数据分析](analysis/dashboard.md)**
- :material-lifebuoy: **[排查问题](resources/troubleshooting.md)**
---
仪表盘、历史记录、测试评估
- :electric_plug: **[API 参考](api-reference/index.md)**
---
WebSocket 协议与 REST 接口文档
当连接、对话质量或部署链路出现问题时,从这里进入可执行的排查步骤。
</div>
---
## 快速体验
### 使用 Docker 启动
```bash
git clone https://github.com/your-org/AI-VideoAssistant.git
cd docker
docker-compose up -d
# for development
# docker compose --profile dev up -d
```
访问 `http://localhost:3000` 即可使用控制台。
### WebSocket 连接示例
```javascript
const ws = new WebSocket('ws://localhost:8000/ws?assistant_id=YOUR_ID');
ws.onopen = () => {
ws.send(JSON.stringify({
type: 'session.start',
audio: { encoding: 'pcm_s16le', sample_rate_hz: 16000, channels: 1 }
}));
};
```
---
## 许可证
本项目基于 [MIT 许可证](https://github.com/your-org/AI-VideoAssistant/blob/main/LICENSE) 开源。

View File

@@ -1,6 +1,6 @@
# 系统架构
# 系统架构
本文档详细介绍 Realtime Agent Studio (RAS) 的系统架构设计
本文档只解释 Realtime Agent Studio (RAS) 的服务边界、数据流、部署形态和关键技术选型,不重复产品定位或上手流程
---
@@ -61,12 +61,12 @@ flowchart TB
### 1. Web 前端 (React)
管理控制台,提供可视化的配置和监控界面。
管理控制台,提供可视化的配置、测试和监控界面。
| 功能模块 | 说明 |
|---------|------|
| 助手管理 | 创建、配置、测试智能助手 |
| 资源库 | LLM/ASR/TTS/VAD 等模型管理 |
| 资源库 | LLM / ASR / TTS 等模型管理 |
| 知识库 | RAG 文档上传与管理 |
| 历史记录 | 会话日志查询与回放 |
| 仪表盘 | 实时数据统计 |
@@ -74,7 +74,7 @@ flowchart TB
### 2. API 服务 (FastAPI)
RESTful API 后端,处理所有管理操作
REST API 后端,处理资源管理、持久化配置和历史数据等控制面能力
```mermaid
flowchart LR
@@ -100,7 +100,7 @@ flowchart LR
### 3. 实时交互引擎 (Engine)
核心组件,处理实时音视频对话。
处理实时音视频对话、事件流转、模型调用与工具执行
```mermaid
flowchart TB
@@ -116,7 +116,7 @@ flowchart TB
TTS[语音合成 TTS]
end
subgraph Realtime["实时交互引擎连接"]
subgraph Realtime["实时引擎连接"]
RTOpenAI[OpenAI Realtime]
RTGemini[Gemini Live]
RTDoubao[Doubao 实时交互]
@@ -144,9 +144,9 @@ flowchart TB
| 类别 | 说明 | 可选项 |
|------|------|--------|
| **外部服务** | 管线式引擎各环节依赖的云/本地服务 | OpenAI、SiliconFlow、DashScope、本地模型 |
| **实时交互引擎** | 实时交互引擎可连接的后端 | OpenAI Realtime、Gemini Live、Doubao 实时交互引擎 |
| **工具** | 管线式 LLM 与实时交互引擎均可调用 | Webhook、客户端工具、内建工具 |
| **外部模型服务** | Pipeline 引擎各环节依赖的云端或本地服务 | OpenAI、SiliconFlow、DashScope、本地模型 |
| **实时模型连接** | Realtime 引擎可直接连接的后端 | OpenAI Realtime、Gemini Live、Doubao 实时交互 |
| **工具系统** | 由助手或引擎调用的外部执行能力 | Webhook、客户端工具、内建工具 |
---
@@ -154,7 +154,7 @@ flowchart TB
### 管线式全双工引擎
管线式引擎包含:**声音活动检测VAD**、**语音识别ASR**、**回合检测TD**、**大语言模型LLM**、**语音合成TTS**。外部服务可选用 **OpenAI**、**SiliconFlow**、**DashScope**、**本地模型**。LLM 可连接**工具**Webhook、客户端工具、内建工具
管线式引擎由 **VAD → ASR → TD → LLM → TTS** 组成。每个环节可替换,适合需要精细控制、工具扩展和较高可解释性的场景。
```mermaid
sequenceDiagram
@@ -170,33 +170,28 @@ sequenceDiagram
C->>E: 音频流 (PCM)
E->>VAD: 检测语音活动
VAD-->>E: 有效语音段
E->>ASR: 语音转文字
E->>ASR: 语音转文本
ASR-->>E: 转写文本
E->>TD: 回合边界
TD-->>E: 可送 LLM 的输入
E->>TD: 判断回合边界
TD-->>E: 可送 LLM 的输入
E->>LLM: 生成回复
LLM->>Tools: 可选:调用工具
Tools-->>LLM: 工具结果
LLM-->>E: 回复文本 (流式)
E->>TTS: 文转语音
E->>TTS: 文本转语音
TTS-->>E: 音频流
E->>C: 播放音频
```
**特点:**
- 灵活选择各环节供应商OpenAI、SiliconFlow、DashScope、本地模型
- 可独立优化 VAD、ASR、TD、LLM、TTS 每个环节
- LLM 与工具联动Webhook、客户端工具、内建工具
- 延迟约 500-1500ms
- 各环节可单独替换和优化
- 便于接入知识库、工具、工作流等能力
- 延迟通常高于端到端实时模型,但可控性更强
### 实时交互引擎
### Realtime 引擎
实时交互引擎可连接**实时交互引擎**,包括 **OpenAI Realtime**、**Gemini Live**、**Doubao 实时交互引擎**等,同样可连接**工具**Webhook、客户端工具、内建工具
### 原生多模态引擎
使用端到端多模态模型(如 GPT-4o Realtime
Realtime 引擎直接连接端到端实时模型,适合追求更低延迟和更自然多模态交互的场景
```mermaid
sequenceDiagram
@@ -204,17 +199,17 @@ sequenceDiagram
participant E as 引擎
participant RT as Realtime Model
C->>E: 音频
E->>RT: 音频输入
RT-->>E: 音频输出 (流式)
E->>C: 播放音频
C->>E: 音频/视频/文本输入
E->>RT: 实时流输入
RT-->>E: 流式文本/音频输出
E->>C: 播放或渲染结果
```
**特点:**
- 更低延迟 (< 300ms)
-自然的语音交互
- 依赖特定模型供应商
- 交互链路更短,延迟更低
- 依赖具体模型供应商的能力边界
- 适合强调自然对话和多模态体验的入口
---
@@ -234,11 +229,11 @@ sequenceDiagram
API->>DB: 查询助手
DB-->>API: 助手数据
API-->>E: 配置信息
C->>E: session.start
E-->>C: session.started
E-->>C: config.resolved
loop 对话循环
C->>E: 音频帧 (binary)
E-->>C: input.speech_started
@@ -249,7 +244,7 @@ sequenceDiagram
E-->>C: 音频帧 (binary)
E-->>C: output.audio.end
end
C->>E: session.stop
E->>API: 保存会话记录
API->>DB: 存储
@@ -266,19 +261,19 @@ sequenceDiagram
Note over E: 正在播放 TTS 音频
E->>C: 音频帧...
C->>E: 用户说话 (VAD 检测)
E->>E: 触发打断
E->>TTS: 停止合成
E-->>C: output.audio.interrupted
Note over E: 处理新的用户输入
E-->>C: input.speech_started
```
---
## 部署架构
## 部署形态
### 开发环境
@@ -299,56 +294,19 @@ flowchart LR
## 技术选型
| 组件 | 技术 | 选型理由 |
|------|------|---------|
| **前端框架** | React 18 | 成熟生态,组件化开发 |
| **状态管理** | Zustand | 轻量级TypeScript 友好 |
| **UI 组件** | Tailwind CSS | 原子化 CSS快速开发 |
| **后端框架** | FastAPI | 高性能,自动 API 文档 |
| **WebSocket** | websockets | Python 异步 WebSocket |
| **ORM** | SQLAlchemy | 功能完善,支持多数据库 |
| **数据库** | SQLite/PostgreSQL | 开发简单/生产可靠 |
---
## 扩展性设计
### 模型适配器模式
```mermaid
classDiagram
class ModelAdapter {
<<interface>>
+generate(prompt) string
+stream(prompt) AsyncIterator
}
class OpenAIAdapter {
+generate(prompt) string
+stream(prompt) AsyncIterator
}
class AzureAdapter {
+generate(prompt) string
+stream(prompt) AsyncIterator
}
class LocalAdapter {
+generate(prompt) string
+stream(prompt) AsyncIterator
}
ModelAdapter <|-- OpenAIAdapter
ModelAdapter <|-- AzureAdapter
ModelAdapter <|-- LocalAdapter
```
通过适配器模式,可以轻松接入新的模型供应商。
| 组件 | 技术 | 说明 |
|------|------|------|
| **前端框架** | React 18 | 管理控制台与调试界面 |
| **状态管理** | Zustand | 前端轻量状态管理 |
| **UI 样式** | Tailwind CSS | 快速构建控制台界面 |
| **后端框架** | FastAPI | 管理接口与配置持久化 |
| **WebSocket** | websockets | 实时事件与音频流通信 |
| **数据库** | SQLite / PostgreSQL | 配置与历史数据存储 |
---
## 相关文档
- [WebSocket 协议](../api-reference/websocket.md) - 详细的协议规范
- [部署概览](../deployment/index.md) - Docker 部署
- [核心概念](../concepts/index.md) - 助手、管线等概念说明
- [产品概览](index.md) - 产品定位、核心模块与适用场景
- [引擎架构](../concepts/engines.md) - Pipeline 与 Realtime 的选择指南
- [WebSocket 协议](../api-reference/websocket.md) - 实时对话事件和消息格式

View File

@@ -1,148 +1,84 @@
# 产品概览
# 产品概览
了解 Realtime Agent Studio 的核心功能和设计理念
Realtime Agent Studio (RAS) 是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台
---
## 什么是 RAS
## 产品定位
Realtime Agent Studio (RAS) 是一个**开源的实时交互智能体工作平台**,让开发者能够快速构建和部署具备语音对话能力的 AI 助手。
RAS 面向需要构建实时语音或视频助手的团队,目标不是替代你的业务系统,而是提供一套可组合的助手基础设施:
### 核心价值
- **控制台**:让团队快速配置助手、资源库、知识库、工具、工作流与评估策略
- **API 与实时运行时**:让应用、设备和第三方系统稳定接入实时对话能力
- **运维与分析能力**:让团队能观察会话效果、排查问题并持续迭代助手质量
| 价值主张 | 说明 |
|---------|------|
| **低代码配置** | 可视化界面配置助手,无需编写复杂代码 |
| **实时交互** | 毫秒级响应,支持语音打断,自然对话体验 |
| **开放灵活** | 支持多种模型供应商,自由选择最适合的方案 |
| **私有部署** | 完全自主可控,数据不出域 |
如果你把实时助手看作一条完整的产品链路,RAS 负责其中的“构建、接入、运行、观测”四个阶段。
---
## 核心模块
## 功能模块
| 模块 | 负责什么 | 适合谁使用 |
|------|----------|------------|
| **助手** | 定义角色、行为、模型、知识、工具和会话策略 | 产品、运营、算法、开发 |
| **引擎** | 承载实时语音/多模态对话,输出事件流和音频流 | 开发、基础设施 |
| **资源库** | 管理 LLM、ASR、TTS 等外部能力接入 | 平台管理员、开发 |
| **知识库 / 工具 / 工作流** | 让助手获得领域知识、外部执行能力和复杂流程控制 | 业务设计者、开发 |
| **分析与评估** | 记录会话、监控指标、做自动化回归和效果评估 | 运营、QA、开发 |
```mermaid
mindmap
root((RAS))
助手管理
创建配置
提示词编辑
模型选择
工具调用
资源库
LLM 模型
ASR 模型
TTS 声音
知识库
文档上传
向量检索
RAG 问答
监控分析
会话回放
数据统计
自动测试
部署集成
WebSocket API
REST API
SDK
```
## 为什么是“控制台 + API”
### 助手管理
RAS 采用“控制台配置 + API 接入”的组合方式,而不是把所有内容都固化在代码里:
创建和配置智能对话助手:
- **控制台负责提效**:让非后端角色也能参与提示词、工具、知识、流程的配置与调优
- **API 负责集成**:让产品团队继续用自己的前端、服务端或设备侧应用承载最终体验
- **同一套助手配置可复用**:控制台保存的助手定义可以被不同渠道重复接入和评估
- **系统提示词** - 定义助手角色和行为
- **模型配置** - 选择 LLM、ASR、TTS 模型
- **工具调用** - 配置 Webhook 和客户端工具
- **开场白** - 设置首轮对话模式
### 资源库
集中管理各类模型资源:
- **语音识别 (ASR)** - 多供应商 ASR 模型管理
- **大语言模型 (LLM)** - OpenAI、Azure、本地模型
- **语音合成 (TTS)** - 多音色声音资源
### 知识库
为助手提供专业知识:
- **文档上传** - 支持 PDF、Word、Markdown 等格式
- **向量化索引** - 自动分块和向量化
- **RAG 检索** - 基于语义的知识检索
### 监控分析
全面的数据分析能力:
- **会话回放** - 完整链路日志和音频回放
- **实时仪表盘** - 并发数、延迟、错误率统计
- **自动化测试** - 批量测试和效果评估
---
## 对比其他方案
| 特性 | RAS | Vapi | Retell | ElevenLabs |
|------|-----|------|--------|------------|
| **开源** | :white_check_mark: | :x: | :x: | :x: |
| **私有部署** | :white_check_mark: | :x: | :x: | :x: |
| **管线式引擎** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: |
| **多模态模型** | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: |
| **自定义 ASR/TTS** | :white_check_mark: | 有限 | 有限 | :x: |
| **知识库** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: |
| **工作流编辑** | 开发中 | :white_check_mark: | :x: | :x: |
| **定价** | 免费 | 按量付费 | 按量付费 | 按量付费 |
---
## 适用场景
## 典型使用方式
<div class="grid cards" markdown>
- :telephone_receiver: **智能客服**
- :material-headset: **客户服务与运营自动化**
---
7x24 小时自动接听,处理常见咨询,复杂问题转人工
在客服、外呼、预约、售后等场景中接入实时语音助手,并保留人工接管与工具调用能力。
- :hospital: **医疗问诊**
- :material-school-outline: **培训、陪练与问答**
---
预问诊信息收集,健康咨询,用药提醒
用知识库、提示词和流程编排构建可持续优化的教学、培训或辅导助手。
- :school: **教育培训**
- :material-domain: **企业内部助手**
---
口语练习,知识问答,个性化辅导
通过私有部署、内部知识库和业务系统工具,把助手接入内部流程或设备终端。
- :handshake: **销售助手**
- :material-devices: **多端集成**
---
产品介绍,需求挖掘,预约安排
- :headphones: **语音助手**
---
智能家居控制,日程管理,信息查询
- :robot: **虚拟人**
---
数字人直播,虚拟主播,交互式展示
通过 WebSocket API 将同一个助手接入 Web、移动端、坐席工作台或自有硬件设备。
</div>
---
## 与其他方案的差异
## 下一步
本页是站内唯一保留“产品对比”视角的地方,用于帮助你快速判断 RAS 的定位边界。
- [快速开始](../quickstart/index.md) - 5 分钟创建第一个助手
- [系统架构](architecture.md) - 深入了解技术实现
- [核心概念](../concepts/index.md) - 学习关键概念
| 特性 | RAS | Vapi | Retell | ElevenLabs Agents |
|------|-----|------|--------|-------------------|
| **开源** | :white_check_mark: | :x: | :x: | :x: |
| **私有部署** | :white_check_mark: | :x: | :x: | :x: |
| **Pipeline 引擎** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: |
| **Realtime / 多模态引擎** | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: |
| **自定义 ASR / TTS** | :white_check_mark: | 有限 | 有限 | :x: |
| **知识库与工具扩展** | :white_check_mark: | :white_check_mark: | :white_check_mark: | 有限 |
| **工作流编排** | 开发中 | :white_check_mark: | :x: | :x: |
| **数据与链路可观测** | :white_check_mark: | 有限 | 有限 | 有限 |
## 继续阅读
- [系统架构](architecture.md) - 从服务边界、数据流和部署形态理解系统如何组成
- [核心概念](../concepts/index.md) - 先建立助手、引擎与工作流的心智模型
- [快速开始](../quickstart/index.md) - 以最短路径创建第一个助手

View File

@@ -1,233 +1,44 @@
# 资源库配置详解
# 资源准备清单
本页面详细介绍资源库中各类资源的配置方法和最佳实践
本页保留原“资源库配置详解”链接,但在本轮文档收敛后,它只承担快速开始阶段的资源核对职责
## 语音识别 (ASR) 配置
## 你至少要准备什么
### 支持的接口类型
在创建第一个助手前,至少确认以下三类资源都已经可用:
| 接口类型 | 说明 |
|---------|------|
| OpenAI Compatible | 兼容 OpenAI 语音识别 API 格式的服务 |
| 资源 | 为什么需要 | 正式说明 |
|------|------------|------------|
| **LLM 模型** | 负责理解与生成回复 | [LLM 模型](../customization/models.md) |
| **ASR 资源** | 负责把语音输入转写为文本 | [语音识别](../customization/asr.md) |
| **TTS 声音资源** | 负责把文本回复合成为语音 | [声音资源](../customization/voices.md) |
### 配置字段说明
## 上手前自检
| 字段 | 必填 | 说明 |
|-----|-----|------|
| 模型名称 | 是 | 自定义显示名称,便于识别 |
| 接口类型 | 是 | 当前支持 OpenAI Compatible |
| 语言 | 是 | 识别语言:中文/英文/多语言 |
| Model Name | 否 | API 请求中的 model 参数 |
| Base URL | 是 | API 服务地址 |
| API Key | 是 | 服务认证密钥 |
| 热词 | 否 | 逗号分隔的专有名词列表 |
| 标点增强 | 否 | 是否自动添加标点 |
| 文本归一化 | 否 | 规范化数字、日期等格式 |
| 启用 | 否 | 是否在选择列表中显示 |
### LLM
### 推荐配置示例
- 已配置供应商、模型名称、Base URL 和凭证
- 已明确该模型用于文本生成、嵌入还是重排
- 已准备保守的默认参数,而不是先追求极端效果
**硅基流动 SenseVoice**
### ASR
```
模型名称SenseVoice 中文
Model NameFunAudioLLM/SenseVoiceSmall
Base URLhttps://api.siliconflow.cn/v1
语言:中文
```
- 已确认目标语言与模型匹配
- 已准备必要热词或专有名词词表
- 已能用固定样本测试识别准确率和延迟
### 测试识别效果
### TTS
1. 在 ASR 列表中找到目标模型
2. 点击 **试听识别** 按钮
3. 选择以下测试方式之一:
- **上传文件**:拖拽或选择音频文件
- **麦克风录音**:点击录音按钮开始录制
4. 点击 **开始识别** 查看结果
5. 检查识别文本、延迟和置信度
- 已选择主音色,并完成至少一次试听
- 已确认该声音适合实时对话,而不是仅适合离线播报
- 已为默认语速、音量等参数设定初始值
---
## 不在本页展开的内容
## 大语言模型 (LLM) 配置
字段说明、供应商差异、参数建议和最佳实践已经分别收敛到正式能力页:
### 支持的模型类型
- [LLM 模型](../customization/models.md)
- [语音识别](../customization/asr.md)
- [声音资源](../customization/voices.md)
- [TTS 参数](../customization/tts.md)
| 类型 | 用途 |
|-----|------|
| 文本 (text) | 对话生成,用于助手核心交互 |
| 嵌入 (embedding) | 向量化,用于知识库检索 |
| 重排 (rerank) | 结果重排序,优化检索结果 |
### 配置字段说明
| 字段 | 必填 | 说明 |
|-----|-----|------|
| 厂商 | 是 | 当前支持 OpenAI Compatible |
| 模型类型 | 是 | 文本/嵌入/重排 |
| 模型名称 | 是 | 自定义显示名称 |
| 模型标识 | 否 | API 请求中的 model 参数 |
| Base URL | 是 | API 服务地址 |
| API Key | 是 | 服务认证密钥 |
| 温度 | 否 | 输出随机性 (0-2),仅文本模型 |
| 上下文长度 | 否 | 最大 token 数 |
| 启用 | 否 | 是否在选择列表中显示 |
### 推荐配置示例
**OpenAI GPT-4o Mini**
```
模型名称GPT-4o Mini
模型类型:文本
模型标识gpt-4o-mini
Base URLhttps://api.openai.com/v1
温度0.7
上下文长度8192
```
**硅基流动 Qwen**
```
模型名称Qwen2.5-7B
模型类型:文本
模型标识Qwen/Qwen2.5-7B-Instruct
Base URLhttps://api.siliconflow.cn/v1
温度0.7
```
### 测试模型效果
1. 在 LLM 列表中找到目标模型
2. 点击 **预览** 按钮
3. 配置测试参数:
- **System Prompt**:系统提示词
- **User Message**:测试消息
- **Temperature**:温度参数
- **Max Tokens**:最大输出长度
4. 点击 **开始预览** 查看模型回复
5. 检查回复内容、延迟和 token 用量
---
## 声音资源 (TTS) 配置
### 支持的接口类型
| 接口类型 | 说明 |
|---------|------|
| OpenAI Compatible | 兼容 OpenAI TTS API 格式的服务 |
| DashScope | 阿里云 DashScope 语音合成服务 |
### 配置字段说明
| 字段 | 必填 | 说明 |
|-----|-----|------|
| 厂商 | 是 | OpenAI Compatible 或 DashScope |
| 声音名称 | 是 | 自定义显示名称 |
| 模型 | 是 | TTS 模型标识 |
| 声音 ID | 是 | 音色标识符 |
| Base URL | 否 | API 服务地址 |
| API Key | 是 | 服务认证密钥 |
| 语速 | 否 | 说话速度 (0.5-2.0),默认 1.0 |
| 增益 | 否 | 音量调节 (-10 to 10 dB) |
| 音调 | 否 | 声音高低 (-12 to 12) |
| 性别 | 否 | 声音性别标签 |
| 语言 | 否 | 声音语言标签 |
| 备注 | 否 | 声音特点描述 |
### 推荐配置示例
**硅基流动 CosyVoice**
```
厂商OpenAI Compatible
声音名称Anna 中文女声
模型FunAudioLLM/CosyVoice2-0.5B
声音 IDFunAudioLLM/CosyVoice2-0.5B:anna
Base URLhttps://api.siliconflow.cn/v1
语速1.0
性别:女
语言:中文
```
**DashScope TTS**
```
厂商DashScope
声音名称Cherry
模型qwen3-tts-flash-realtime
声音 IDCherry
Base URLwss://dashscope.aliyuncs.com/api-ws/v1/realtime
语速1.0
```
### CosyVoice 可用音色
| 音色 ID | 性别 | 风格 |
|--------|-----|------|
| alex | 男 | 成熟稳重 |
| anna | 女 | 温柔亲切 |
| bella | 女 | 活泼甜美 |
| benjamin | 男 | 年轻活力 |
| charles | 男 | 专业商务 |
| claire | 女 | 清新自然 |
| david | 男 | 沉稳大气 |
| diana | 女 | 优雅知性 |
### 试听声音效果
1. 在声音列表中找到目标声音
2. 点击 **播放** 按钮
3. 系统会自动合成一段试听语音
4. 检查声音效果是否符合预期
### 克隆声音
如需使用自定义声音:
1. 点击 **克隆声音** 按钮
2. 上传参考音频文件WAV/MP3
3. 填写声音名称和描述
4. 点击 **开始克隆**
!!! note "声音克隆说明"
声音克隆功能需要 TTS 服务支持。上传的参考音频建议为 10-30 秒的清晰人声录音。
---
## 配置最佳实践
### 资源命名规范
建议使用清晰的命名规范,便于后续管理:
```
[厂商/模型]-[用途/语言]-[特点]
```
示例:
- `SF-SenseVoice-中文`
- `OpenAI-GPT4o-对话`
- `SF-CosyVoice-Anna女声`
### 多环境管理
如果有测试和生产环境,建议:
1. 为不同环境创建独立的资源配置
2. 在名称中标注环境,如 `GPT4o-Prod``GPT4o-Test`
3. 通过"启用"开关控制可见性
### 成本优化
| 场景 | 推荐配置 |
|-----|---------|
| 开发测试 | 使用低成本模型,如 GPT-4o-mini |
| 生产环境 | 根据质量要求选择合适模型 |
| 高并发 | 考虑使用本地部署的开源模型 |
---
## 下一步
资源配置完成后,请返回 [快速开始](index.md) 继续创建助手。
准备完成后,请回到 [快速开始](index.md) 继续创建助手。

View File

@@ -1,221 +1,69 @@
# 快速开始
# 快速开始
5 分钟创建你的第一个 AI 助手
本页负责“创建第一个助手”的最短路径。环境要求、配置文件和部署方式统一放在 [环境与部署](../getting-started/index.md)
## 概述
## 目标
本指南将帮助你通过控制台快速创建一个能够进行语音对话的智能助手。在创建助手之前需要先在资源库Library中配置所需的模型资源。
完成本页后,你应该已经:
1. 准备好 1 个 LLM、1 个 ASR、1 个 TTS 资源
2. 创建并保存 1 个助手
3. 完成至少 1 轮测试对话
4. 拿到接入应用所需的 `assistant_id` 和 WebSocket 地址
## 前提条件
- 已部署 Realtime Agent Studio (RAS) 服务
- 拥有 LLM / ASR / TTS 服务的 API Key
- 已部署 Realtime Agent Studio(RAS)服务
- 已准备可用的 LLM / ASR / TTS 凭证
- 已能访问控制台与 WebSocket 服务
## 配置流程
## 第一步:准备资源
创建助手前,需要先准备好三种核心资源:
创建助手前,先准备三类资源:
```
┌─────────────────────────────────────────────────────────┐
│ 资源库配置 │
├─────────────────────────────────────────────────────────┤
│ 1. 语音识别 (ASR) ─→ 将用户语音转为文字 │
│ 2. 模型接入 (LLM) ─→ 理解用户意图并生成回复 │
│ 3. 声音资源 (TTS) ─→ 将文字回复转为语音输出 │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ 创建助手 │
├─────────────────────────────────────────────────────────┤
│ 配置提示词 → 选择模型 → 配置语音 → 测试 → 发布 │
└─────────────────────────────────────────────────────────┘
```
- **LLM 模型**:决定助手如何理解和生成回复。详见 [LLM 模型](../customization/models.md)
- **ASR 资源**:决定语音输入如何转写。详见 [语音识别](../customization/asr.md)
- **TTS 声音资源**:决定回复如何被合成为语音。详见 [声音资源](../customization/voices.md)
---
## 第一步:配置资源库
在创建助手之前,需要先在资源库中添加 ASR、LLM、TTS 三种资源。
### 1.1 添加语音识别模型 (ASR)
语音识别模型负责将用户的语音输入转换为文字。
1. 在左侧导航栏点击 **语音识别**
2. 点击 **添加模型** 按钮
3. 填写配置信息:
| 配置项 | 说明 | 示例值 |
|-------|------|--------|
| 模型名称 | 自定义显示名称 | SenseVoice CN |
| 接口类型 | 选择 OpenAI Compatible | OpenAI Compatible |
| 语言 | 识别语言 | 中文 (Chinese) |
| Model Name | 模型标识符 | FunAudioLLM/SenseVoiceSmall |
| Base URL | API 服务地址 | https://api.siliconflow.cn/v1 |
| API Key | 服务密钥 | sk-xxxxxxxx |
4. 可选配置:
- **热词**:添加专有名词提高识别准确率
- **标点增强**:自动添加标点符号
- **文本归一化**:规范化数字、日期等格式
5. 点击 **确认添加**
!!! tip "试听识别功能"
添加完成后,可以点击列表中的试听按钮,上传或录制音频测试识别效果。
### 1.2 添加大语言模型 (LLM)
大语言模型是助手的"大脑",负责理解用户意图并生成回复。
1. 在左侧导航栏点击 **模型接入**
2. 点击 **添加模型** 按钮
3. 填写配置信息:
| 配置项 | 说明 | 示例值 |
|-------|------|--------|
| 厂商 | 接口类型 | OpenAI Compatible |
| 模型类型 | 文本/嵌入/重排 | 文本 |
| 模型名称 | 自定义显示名称 | GPT-4o Mini |
| 模型标识 | API 中的 model 参数 | gpt-4o-mini |
| Base URL | API 服务地址 | https://api.openai.com/v1 |
| API Key | 服务密钥 | sk-xxxxxxxx |
| 温度 | 输出随机性 (0-2) | 0.7 |
| 上下文长度 | 最大 token 数 | 8192 |
4. 点击 **确认添加**
!!! tip "预览功能"
添加完成后,可以点击预览按钮测试模型是否配置正确。
### 1.3 添加声音资源 (TTS)
声音资源用于将助手的文字回复转换为语音输出。
1. 在左侧导航栏点击 **声音资源**
2. 点击 **添加声音** 按钮
3. 填写配置信息:
| 配置项 | 说明 | 示例值 |
|-------|------|--------|
| 厂商 | 接口类型 | OpenAI Compatible 或 DashScope |
| 声音名称 | 自定义显示名称 | 客服小美 |
| 模型 | TTS 模型标识 | FunAudioLLM/CosyVoice2-0.5B |
| 声音 ID | 音色标识 | FunAudioLLM/CosyVoice2-0.5B:anna |
| Base URL | API 服务地址 | https://api.siliconflow.cn/v1 |
| API Key | 服务密钥 | sk-xxxxxxxx |
| 语速 | 说话速度 (0.5-2.0) | 1.0 |
| 增益 | 音量调节 (-10 to 10 dB) | 0 |
| 音调 | 声音高低 (-12 to 12) | 0 |
| 性别 | 声音性别 | 女 |
| 语言 | 声音语言 | 中文 |
4. 点击 **确认添加**
!!! tip "试听功能"
添加完成后,可以在列表中点击播放按钮试听声音效果。
---
如果你想先检查“资源是否准备齐”,可以看 [资源准备清单](dashboard.md)。
## 第二步:创建助手
资源配置完成后,可以开始创建助手。
1. 进入控制台中的 **助手** 页面
2. 新建一个助手,并填写最小必要信息:
- **助手名称**:让团队知道它服务于什么场景
- **系统提示词**:先定义角色、任务和限制
- **首轮模式**:决定由助手先说还是等待用户开口
3. 绑定默认模型:
- 文本生成使用一个 LLM
- 语音输入使用一个 ASR
- 语音输出使用一个 TTS 声音资源
### 2.1 新建助手
如果你想把助手设计得更稳,继续阅读:
1. 在左侧导航栏点击 **助手管理**
2. 点击 **新建助手** 按钮
3. 系统会自动创建一个名为 "New Assistant" 的助手
- [助手概念](../concepts/assistants.md)
- [配置选项](../concepts/assistants/configuration.md)
- [提示词指南](../concepts/assistants/prompts.md)
### 2.2 配置全局设置
## 第三步:补充能力
在助手详情页的 **全局** 标签页中配置
最小助手可以只依赖提示词和模型;更复杂的场景通常还需要以下能力
#### 基本信息
- **知识库**:让助手回答私有领域问题。见 [知识库](../customization/knowledge-base.md)
- **工具**:让助手执行查单、预约、查询等外部操作。见 [工具](../customization/tools.md)
- **工作流**:让助手处理多步骤、多分支流程。见 [工作流](../customization/workflows.md)
- **助手名称**:修改为有意义的名称,如 "客服助手"
- **语言**:选择助手的对话语言
## 第四步:测试并发布
#### 系统提示词
1. 打开助手测试面板,先验证文本对话,再验证语音输入输出
2. 观察事件流、转写、工具调用和最终回复是否符合预期
3. 保存当前配置,并确认该助手已可用于外部接入
配置系统提示词,定义助手的角色和行为:
更系统的验证方式见 [测试调试](../concepts/assistants/testing.md)。
```
你是一个友好的客服助手。你的任务是帮助用户解答问题。
## 第五步:接入应用
要求
- 保持友好和专业的语气
- 回答要简洁明了,每次回复控制在 2-3 句话
- 如果不确定答案,请如实告知
```
#### 开场白配置
设置对话开始时助手的问候语:
- **首回合模式**:选择 "助手先说" 让助手主动开场
- **开场白内容**:如 "你好,我是智能客服助手,请问有什么可以帮您?"
### 2.3 配置模型
**模型** 标签页中选择之前添加的资源:
| 配置项 | 说明 |
|-------|------|
| LLM 模型 | 选择在模型接入中添加的大语言模型 |
| ASR 模型 | 选择在语音识别中添加的 ASR 模型 |
### 2.4 配置语音
**语音** 标签页中配置:
| 配置项 | 说明 |
|-------|------|
| 启用语音输出 | 开启后助手会用语音回复 |
| 选择声音 | 选择在声音资源中添加的音色 |
| 语速 | 可微调当前助手的说话速度 |
### 2.5 保存配置
完成配置后,点击页面顶部的 **保存** 按钮。
---
## 第三步:测试助手
### 3.1 打开测试面板
点击助手卡片右上角的 **测试** 按钮,打开实时调试面板。
### 3.2 进行对话测试
| 测试场景 | 示例问题 | 预期结果 |
|---------|---------|---------|
| 基础问候 | "你好" | 助手友好回应 |
| 功能询问 | "你能做什么?" | 介绍自身能力 |
| 业务问题 | 根据你的场景设计 | 正确回答 |
| 边界测试 | 无关问题 | 婉拒或引导 |
### 3.3 检查各环节
在调试面板中可以看到:
- **ASR 输出**:用户语音识别结果
- **LLM 输入/输出**:模型的输入和生成内容
- **TTS 状态**:语音合成状态
---
## 第四步:发布助手
测试通过后:
1. 点击 **发布** 按钮
2. 复制生成的连接信息:
- `assistant_id`:用于 API 调用
- WebSocket 地址:用于实时对话
### 嵌入到应用
最小接入方式是使用 WebSocket API 建立实时会话
```javascript
const ws = new WebSocket('ws://your-server/ws?assistant_id=YOUR_ASSISTANT_ID');
@@ -223,54 +71,28 @@ const ws = new WebSocket('ws://your-server/ws?assistant_id=YOUR_ASSISTANT_ID');
ws.onopen = () => {
ws.send(JSON.stringify({
type: 'session.start',
audio: {
encoding: 'pcm_s16le',
sample_rate_hz: 16000,
channels: 1
}
audio: { encoding: 'pcm_s16le', sample_rate_hz: 16000, channels: 1 }
}));
};
ws.onmessage = (event) => {
console.log('收到消息:', event.data);
};
```
---
你通常只需要两项信息:
## 常见问题
- `assistant_id`:指定接入哪个助手
- WebSocket 地址:由引擎服务提供实时对话入口
### 资源库中添加模型失败?
完整协议见 [WebSocket 协议](../api-reference/websocket.md)。
1. 检查 API Key 是否正确
2. 确认 Base URL 格式正确(通常以 `/v1` 结尾)
3. 验证网络能否访问对应的 API 服务
## 常见卡点
### 助手不回复?
1. 检查是否已选择 LLM 模型
2. 确认 LLM 模型配置正确(可在模型接入页面预览测试)
3. 查看浏览器控制台是否有错误
### 语音识别不准确?
1. 检查是否选择了正确的语言
2. 尝试添加热词提高专有名词识别率
3. 确保录音设备工作正常
### 语音无法播放?
1. 检查浏览器是否允许自动播放音频
2. 确认已选择声音并正确配置
3. 在声音资源页面点击试听确认配置正确
---
- 资源配置不生效:回到 [资源准备清单](dashboard.md) 检查三类资源是否都已准备好
- 助手不回复:先看 [测试调试](../concepts/assistants/testing.md),再进入 [故障排查](../resources/troubleshooting.md)
- 回复质量不稳定:优先检查 [提示词指南](../concepts/assistants/prompts.md) 与 [知识库](../customization/knowledge-base.md)
## 下一步
恭喜!你已成功创建了第一个 AI 助手。接下来可以:
- [环境与部署](../getting-started/index.md) - 补全环境、配置和部署细节
- [构建助手](../concepts/assistants.md) - 深入配置助手、模型、知识库、工具与工作流
- [API 参考](../api-reference/index.md) - 查看管理接口与实时协议
- [配置知识库](../customization/knowledge-base.md) - 让助手回答专业问题
- [添加工具](../customization/tools.md) - 扩展助手能力
- [查看 API 文档](../api-reference/websocket.md) - 深入了解协议细节
- [Docker 部署](../deployment/index.md) - 使用容器运行

View File

@@ -1,110 +1,59 @@
# 常见问题
# 常见问题
## API Key 配置
本页只提供简短回答和跳转建议;如果你需要逐步排查,请直接进入 [故障排查](troubleshooting.md)。
### Q: 如何配置 API Key
## Q: 我应该先看哪一部分文档
进入 **LLM 库****语音库** 页面,点击对应模型的配置按钮填写 API Key。
- 想了解产品是什么:看 [产品概览](../overview/index.md)
- 想先把服务跑起来:看 [环境与部署](../getting-started/index.md)
- 想最快创建第一个助手:看 [快速开始](../quickstart/index.md)
- 想系统完成助手配置:从 [助手概览](../concepts/assistants.md) 开始
**步骤:**
## Q: 如何配置模型或 API Key
1. 在左侧导航栏选择 **模型配置**
2. 选择 **LLM 库****语音库**
3. 点击已添加模型的 **编辑** 按钮
4. 在 API Key 字段填写你的密钥
5. 点击 **保存**
进入对应资源页完成配置:
## 助手问题
- LLM:见 [LLM 模型](../customization/models.md)
- ASR:见 [语音识别](../customization/asr.md)
- TTS:见 [声音资源](../customization/voices.md)
### Q: 助手无法回复?
## Q: 助手为什么不回复?
可能的原因和解决方案
通常先检查三件事
1. **检查模型配置是否正确**
- 确认 API Key 已正确填写
- 测试模型连接是否正常
- 助手是否已绑定可用的模型资源
- 提示词、知识库或工具是否配置完整
- WebSocket 会话是否已经正常建立
2. **确认知识库已正确关联**
- 进入助手配置的 **知识** 标签页
- 检查是否已选择知识库
下一步:
3. **查看系统日志排查错误**
- 打开浏览器开发者工具F12
- 检查 Console 和 Network 标签页
- 助手行为验证:看 [测试调试](../concepts/assistants/testing.md)
- 逐步排查:看 [故障排查](troubleshooting.md)
### Q: 助手回复内容不相关
## Q: 回复为什么不准确或不稳定
- 检查系统提示词是否清晰明确
- 调整 Temperature 参数(降低可提高准确性)
- 确认知识库内容与问题相关
- 增加知识库相似度阈值
优先检查:
## 语音识别
- 提示词是否明确了角色、任务和限制
- 是否应该补充知识库,而不是继续堆叠提示词
- 是否需要把复杂业务改成工作流,而不是单轮问答
### Q: 语音识别不准确?
相关文档:
1. **确认 ASR 模型选择正确**
- 中文场景推荐使用 SenseVoice
- 英文场景推荐使用 Whisper
- [提示词指南](../concepts/assistants/prompts.md)
- [知识库](../customization/knowledge-base.md)
- [工作流](../customization/workflows.md)
2. **检查音频采样率**
- 推荐采样率16kHz
- 推荐格式PCM 16-bit
## Q: 语音识别或语音播放效果不好怎么办?
3. **确认语言设置匹配**
- 在 ASR 配置中选择正确的语言
- 输入侧问题先看 [语音识别](../customization/asr.md)
- 输出侧问题先看 [声音资源](../customization/voices.md) 和 [TTS 参数](../customization/tts.md)
- 需要逐步定位链路问题时,再看 [故障排查](troubleshooting.md)
### Q: 语音延迟较高
## Q: 页面空白、接口报错或连接不上怎么办
- 检查网络连接稳定性
- 尝试切换 ASR 服务提供商
- 降低音频质量以减少传输数据量
这是典型的环境或链路问题:
## 语音合成
- 先确认 [环境与部署](../getting-started/index.md) 中的三个服务都已启动
- 再进入 [故障排查](troubleshooting.md) 按连接、API、页面加载或性能问题分类处理
### Q: TTS 声音不自然?
- 尝试不同的音色选项
- 调整语速参数(推荐 0.8-1.2
- 选择与内容风格匹配的声音
### Q: TTS 无法播放?
1. 检查浏览器是否允许自动播放音频
2. 确认 TTS API Key 配置正确
3. 检查网络连接
## 知识库
### Q: 知识库检索无结果?
- 确认文档已成功上传
- 降低相似度阈值(默认 0.7
- 增加返回结果数量
- 检查文档内容是否与查询相关
### Q: 文档上传失败?
- 检查文件大小是否超过 10MB
- 确认文件格式支持MD/PDF/TXT
- 尝试减小文档内容
## 部署问题
### Q: 页面空白或加载失败?
1. 检查浏览器控制台错误信息
2. 确认后端服务已启动
3. 检查 VITE_API_URL 环境变量配置
### Q: API 请求失败?
- 确认 VITE_API_URL 配置正确
- 检查后端服务是否运行
- 查看网络请求响应状态码
### Q: 静态资源 404
- 检查 Nginx `try_files` 配置
- 确认构建产物路径正确
- 检查文件权限设置

View File

@@ -1,4 +1,4 @@
# 开发路线图
# 开发路线图
本页面展示 Realtime Agent Studio 的开发计划和进度。
@@ -8,50 +8,47 @@
### 实时交互引擎
- [x] **管线式全双工引擎** - ASR/LLM/TTS 流水线架构
- [x] **管线式全双工引擎** - ASR / LLM / TTS 流水线架构
- [x] **智能打断处理** - VAD + EOU 检测
- [x] **OpenAI 兼容接口** - ASR/TTS 标准接口适配
- [x] **OpenAI 兼容接口** - ASR / TTS 标准接口适配
- [x] **DashScope TTS** - 阿里云语音合成适配
### 智能体配置管理
### 助手配置管理
- [x] **系统提示词编辑** - Prompt 配置,动态变量注入
- [x] **模型选择** - LLM/ASR/TTS 模型管理界面
- [x] **模型选择** - LLM / ASR / TTS 模型管理界面
- [x] **工具调用配置** - Webhook 工具 + 客户端工具
### 交互测试工具
### 调试与观察
- [x] **实时调试控制台** - WebSocket 调试连接示例
- [x] **完整会话回放** - 音频 + 转写 + LLM 响应
- [x] **会话检索筛选** - 按时间 / 助手 / 状态筛选
### 开放接口
- [x] **WebSocket 协议** - `/ws` 端点完整实现
- [x] **RESTful 接口** - 完整的 CRUD API
### 交互历史监控
- [x] **完整会话回放** - 音频 + 转写 + LLM 响应
- [x] **会话检索筛选** - 按时间/助手/状态筛选
---
## 开发中 :construction:
### 智能体配置管理
### 助手与能力编排
- [ ] **私有化 ASR/TTS 适配** - 本地模型接入
- [ ] **私有化 ASR / TTS 适配** - 本地模型接入
- [ ] **工作流编辑** - 可视化流程编排
- [ ] **知识库关联** - RAG 文档管理
### 实时交互引擎
- [ ] **原生多模态模型** - Step Audio 接入GPT-4o Realtime/Gemini Live 国内环境受限)
- [ ] **原生多模态模型** - Step Audio 接入(GPT-4o Realtime / Gemini Live 国内环境受限)
- [ ] **WebRTC 协议** - `/webrtc` 端点
### 开放接口
- [ ] **SDK 支持** - JavaScript/Python SDK
- [ ] **电话接入** - 电话呼入自动接听/自动呼出接口和批量呼出
- [ ] **WebRTC 协议** - `/webrtc` 端点
- [ ] **SDK 支持** - JavaScript / Python SDK
- [ ] **电话接入** - 电话呼入自动接听 / 自动呼出接口和批量呼出
### 效果评估
@@ -65,13 +62,14 @@
- [ ] **Webhook 回调** - 会话事件通知机制
### 效果评估
### 数据与评估
- [ ] **实时仪表盘增强** - 完善统计看板功能
- [ ] **评估闭环** - 测试、评分、回归与变更追踪
### 企业特性
### 企业能力
- [ ] **多租户支持** - 团队/组织管理
- [ ] **多租户支持** - 团队 / 组织管理
- [ ] **权限管理** - RBAC 角色权限控制
- [ ] **审计日志** - 操作记录追踪
@@ -79,7 +77,7 @@
- [ ] **更多模型供应商** - 讯飞、百度、腾讯等
- [ ] **CRM 集成** - Salesforce、HubSpot 等
- [ ] **呼叫中心集成** - SIP/PSTN 网关
- [ ] **呼叫中心集成** - SIP / PSTN 网关
---
@@ -94,20 +92,19 @@
---
## 参考项目
## 生态参考
### 开源项目
* [Livekit Agent](https://github.com/livekit/agents)
* [Pipecat](https://github.com/pipecat-ai/pipecat)
* [vison-agent](https://github.com/GetStream/Vision-Agents)
* [active-call](https://github.com/miuda-ai/active-call)
* [TEN](https://github.com/TEN-framework/ten-framework)
* [airi](https://github.com/moeru-ai/airi)
* [Vocode Core](https://github.com/vocodedev/vocode-core)
* [awesome-voice-agents](https://github.com/yzfly/awesome-voice-agents)
### 商业项目
* [Vapi](https://vapi.ai)
* [Retell](https://www.retellai.com)
* [Sierra](https://sierra.ai/product/voice)
* [Bolna](https://platform.bolna.ai)
- [LiveKit Agents](https://github.com/livekit/agents)
- [Pipecat](https://github.com/pipecat-ai/pipecat)
- [Vision Agents](https://github.com/GetStream/Vision-Agents)
- [active-call](https://github.com/miuda-ai/active-call)
- [TEN](https://github.com/TEN-framework/ten-framework)
- [airi](https://github.com/moeru-ai/airi)
- [Vocode Core](https://github.com/vocodedev/vocode-core)
- [awesome-voice-agents](https://github.com/yzfly/awesome-voice-agents)
### 文档与研究参考
- [Voice AI & Voice Agents](https://voiceaiandvoiceagents.com/)

View File

@@ -1,5 +1,5 @@
site_name: "Realtime Agent Studio"
site_description: "构建实时交互音视频智能体的开源工作平台"
site_name: "Realtime Agent Studio"
site_description: "Realtime Agent Studio(RAS)是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台"
site_url: "https://your-org.github.io/AI-VideoAssistant"
copyright: "Copyright &copy; 2025 RAS Team"
site_author: "RAS Team"
@@ -9,51 +9,41 @@ site_dir: "site"
nav:
- 首页: index.md
- 产品概览:
- 概述: overview/index.md
- 系统架构: overview/architecture.md
- 快速开始:
- 5 分钟入门: quickstart/index.md
- 资源库配置: quickstart/dashboard.md
- 环境与部署: getting-started/index.md
- 创建第一个助手: quickstart/index.md
- 构建助手:
- 助手概览: concepts/assistants.md
- 基础配置: concepts/assistants/configuration.md
- 提示词: concepts/assistants/prompts.md
- LLM 模型: customization/models.md
- 语音识别: customization/asr.md
- 声音资源: customization/voices.md
- TTS 参数: customization/tts.md
- 知识库: customization/knowledge-base.md
- 工具: customization/tools.md
- 工作流: customization/workflows.md
- 测试与调试: concepts/assistants/testing.md
- 核心概念:
- 概述: concepts/index.md
- 助手详解: concepts/assistants.md
- 产品概览: overview/index.md
- 概念总览: concepts/index.md
- 引擎架构: concepts/engines.md
- 安装部署:
- 概述: getting-started/index.md
- 环境要求: getting-started/requirements.md
- 配置说明: getting-started/configuration.md
- 部署概览: deployment/index.md
- Docker 部署: deployment/docker.md
- 助手管理:
- 创建助手:
- 小助手:
- 配置选项: assistants/configuration.md
- 提示词指南: assistants/prompts.md
- 测试调试: assistants/testing.md
- 工作流:
- 配置选项: assistants/workflow-configuration.md
- 组件库:
- 模型接入: customization/models.md
- 语音识别: customization/asr.md
- 语音生成: customization/tts.md
- 知识库: customization/knowledge-base.md
- 工具与插件: customization/tools.md
- 数据分析:
- 仪表盘: analysis/dashboard.md
- 历史记录: analysis/history.md
- 效果评估: analysis/evaluation.md
- 自动化测试: analysis/autotest.md
- API 参考:
- 概述: api-reference/index.md
- Pipeline 引擎: concepts/pipeline-engine.md
- Realtime 引擎: concepts/realtime-engine.md
- 系统架构: overview/architecture.md
- 集成:
- API 参考: api-reference/index.md
- WebSocket 协议: api-reference/websocket.md
- 错误码: api-reference/errors.md
- 资源:
- 运维:
- 仪表盘: analysis/dashboard.md
- 历史记录: analysis/history.md
- 效果评估: analysis/evaluation.md
- 自动化测试: analysis/autotest.md
- 常见问题: resources/faq.md
- 故障排查: resources/troubleshooting.md
- 更新日志: changelog.md
- 路线图: roadmap.md
theme:
name: material
language: zh
@@ -148,7 +138,6 @@ plugins:
minify_html: true
extra:
# version.provider: mike — only enable when deploying with mike (versions.json is generated on deploy)
social:
- icon: fontawesome/brands/github
link: https://github.com/your-org/AI-VideoAssistant
@@ -164,3 +153,5 @@ extra_css:
extra_javascript:
- javascripts/mermaid.mjs
- javascripts/extra.js

View File

@@ -26,34 +26,27 @@ HISTORY_FINALIZE_DRAIN_TIMEOUT_SEC=1.5
SAMPLE_RATE=16000
# 20ms is recommended for VAD stability and latency.
# 100ms works but usually worsens start-of-speech accuracy.
# WS binary audio frame size validation is derived from SAMPLE_RATE + CHUNK_SIZE_MS.
# Client frame payloads must be a multiple of: SAMPLE_RATE * 2 * (CHUNK_SIZE_MS / 1000).
CHUNK_SIZE_MS=20
# Public default output codec exposed in config.resolved (overridable by runtime metadata).
DEFAULT_CODEC=pcm
MAX_AUDIO_BUFFER_SECONDS=30
# Agent profile selection (optional fallback when CLI args are not used)
# Prefer CLI:
# python -m app.main --agent-config config/agents/default.yaml
# python -m app.main --agent-profile default
# AGENT_CONFIG_PATH=config/agents/default.yaml
# AGENT_PROFILE=default
AGENT_CONFIG_DIR=config/agents
# Optional: provider credentials referenced from YAML, e.g. ${LLM_API_KEY}
# LLM_API_KEY=your_llm_api_key_here
# LLM_API_URL=https://api.openai.com/v1
# TTS_API_KEY=your_tts_api_key_here
# TTS_API_URL=https://api.example.com/v1/audio/speech
# ASR_API_KEY=your_asr_api_key_here
# ASR_API_URL=https://api.example.com/v1/audio/transcriptions
# Local assistant/agent YAML directory. In local mode the runtime resolves:
# ASSISTANT_LOCAL_CONFIG_DIR/<assistant_id>.yaml
ASSISTANT_LOCAL_CONFIG_DIR=config/agents
# Logging
LOG_LEVEL=INFO
# json is better for production/observability; text is easier locally.
# Controls both console and file log serialization/format.
LOG_FORMAT=json
# WebSocket behavior
INACTIVITY_TIMEOUT_SEC=60
HEARTBEAT_INTERVAL_SEC=50
# Public protocol label emitted in session.started/config.resolved payloads.
WS_PROTOCOL_VERSION=v1
# CORS / ICE (JSON strings)

View File

@@ -2,6 +2,11 @@ FROM python:3.12-slim
WORKDIR /app
# Build this image from the project parent directory so both
# engine-v3/engine and fastgpt-python-sdk are available in the context.
# Example:
# docker build -f engine-v3/engine/Dockerfile -t engine-v3 .
# Install system dependencies for audio processing
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
@@ -12,11 +17,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY engine-v3/engine/requirements.txt /tmp/requirements.txt
COPY fastgpt-python-sdk /deps/fastgpt-python-sdk
RUN pip install --no-cache-dir -r /tmp/requirements.txt \
&& pip install --no-cache-dir /deps/fastgpt-python-sdk
# Copy application code
COPY . .
COPY engine-v3/engine /app
# Create necessary directories
RUN mkdir -p /app/logs /app/data/vad

View File

@@ -1,6 +1,6 @@
# py-active-call-cc
# Realtime Agent Studio Engine
Python Active-Call: real-time audio streaming with WebSocket and WebRTC.
This repo contains a Python 3.11+ codebase for building low-latency realtime human-agent interaction pipelines (capture, stream, and process audio) using WebSockets or WebRTC.
This repo contains a Python 3.11+ codebase for building low-latency voice
pipelines (capture, stream, and process audio) using WebRTC and WebSockets.
@@ -14,35 +14,11 @@ It is currently in an early, experimental stage.
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```
使用 agent profile推荐
```
python -m app.main --agent-profile default
```
使用指定 YAML
```
python -m app.main --agent-config config/agents/default.yaml
```
Agent 配置路径优先级
1. `--agent-config`
2. `--agent-profile`(映射到 `config/agents/<profile>.yaml`
3. `AGENT_CONFIG_PATH`
4. `AGENT_PROFILE`
5. `config/agents/default.yaml`(若存在)
说明
- Agent 相关配置是严格模式YAML 缺少必须项会直接报错,不会回退到 `.env` 或代码默认值
- 如果要引用环境变量,请在 YAML 显式写 `${ENV_VAR}`
- `siliconflow` 独立 section 已移除;请在 `agent.llm / agent.tts / agent.asr` 内通过 `provider``api_key``api_url``model` 配置
- `agent.tts.provider` 现支持 `dashscope`Realtime 协议,非 OpenAI-compatible默认 URL 为 `wss://dashscope.aliyuncs.com/api-ws/v1/realtime`,默认模型为 `qwen3-tts-flash-realtime`
- `agent.tts.dashscope_mode`(兼容旧写法 `agent.tts.mode`)支持 `commit | server_commit`,且仅在 `provider=dashscope` 时生效:
- `commit`Engine 先按句切分,再逐句提交给 DashScope。
- `server_commit`Engine 不再逐句切分,由 DashScope 对整段文本自行切分。
- 现在支持在 Agent YAML 中配置 `agent.tools`(列表),用于声明运行时可调用工具。
- 工具配置示例见 `config/agents/tools.yaml`
- 启动阶段不再通过参数加载 Agent YAML
- 会话阶段统一按 `assistant_id` 拉取运行时配置:
- 已配置 `BACKEND_URL`:从 backend API 获取
- 未配置 `BACKEND_URL`(或 `BACKEND_MODE=disabled`):从 `ASSISTANT_LOCAL_CONFIG_DIR/<assistant_id>.yaml` 获取
## Backend Integration
@@ -50,6 +26,7 @@ Engine runtime now supports adapter-based backend integration:
- `BACKEND_MODE=auto|http|disabled`
- `BACKEND_URL` + `BACKEND_TIMEOUT_SEC`
- `ASSISTANT_LOCAL_CONFIG_DIR` (default `engine/config/agents`)
- `HISTORY_ENABLED=true|false`
Behavior:
@@ -58,6 +35,16 @@ Behavior:
- `http`: force HTTP backend; falls back to engine-only mode when URL is missing.
- `disabled`: force engine-only mode (no backend calls).
Assistant config source behavior:
- If `BACKEND_URL` is configured and backend mode is enabled, assistant config is loaded from backend API.
- If `BACKEND_URL` is empty (or backend mode is disabled), assistant config is loaded from local YAML.
Local assistant YAML example:
- File path: `engine/config/agents/<assistant_id>.yaml`
- Runtime still requires WebSocket query param `assistant_id`; it must match the local file name.
History write path is now asynchronous and buffered per session:
- `HISTORY_QUEUE_MAX_SIZE`
@@ -84,3 +71,6 @@ python mic_client.py
`/ws` uses a strict `v1` JSON control protocol with binary PCM audio frames.
See `docs/ws_v1_schema.md`.
# Reference
* [active-call](https://github.com/restsend/active-call)

View File

@@ -0,0 +1 @@
"""Adapters package."""

View File

@@ -0,0 +1 @@
"""Control-plane adapters package."""

View File

@@ -0,0 +1,683 @@
"""Backend adapter implementations for engine integration ports."""
from __future__ import annotations
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import aiohttp
from loguru import logger
from app.config import settings
try:
import yaml
except ImportError: # pragma: no cover - validated when local YAML source is enabled
yaml = None
_ASSISTANT_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$")
def _assistant_error(code: str, assistant_id: str) -> Dict[str, Any]:
return {"__error_code": code, "assistantId": str(assistant_id or "")}
class NullBackendAdapter:
    """Backend adapter that performs no work.

    Every port immediately returns its "nothing happened" value
    (``None``, ``False`` or an empty list) without touching the arguments.
    """

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Return ``None``; this adapter has no config source."""
        del assistant_id  # intentionally unused
        return None

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Return ``None``; no call record is created."""
        del user_id, assistant_id, source  # intentionally unused
        return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Return ``False``; the transcript segment is discarded."""
        del call_id, turn_index, speaker, content  # intentionally unused
        del start_ms, end_ms, confidence, duration_ms
        return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Return ``False``; there is no record to finalize."""
        del call_id, status, duration_seconds  # intentionally unused
        return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Return an empty result list; no search is performed."""
        del kb_id, query, n_results  # intentionally unused
        return []

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Return ``None``; no tool resource is available."""
        del tool_id  # intentionally unused
        return None
class HistoryDisabledBackendAdapter:
    """Wrap another adapter, dropping history writes but forwarding reads.

    ``create_call_record``, ``add_transcript`` and ``finalize_call_record``
    never reach the delegate; the remaining ports are forwarded to it.
    """

    def __init__(self, delegate: HttpBackendAdapter | NullBackendAdapter):
        self._delegate = delegate

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Forward the config lookup to the wrapped adapter."""
        config = await self._delegate.fetch_assistant_config(assistant_id)
        return config

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Return ``None`` without creating a call record."""
        del user_id, assistant_id, source  # history writes are disabled
        return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Return ``False`` without storing the transcript segment."""
        del call_id, turn_index, speaker, content  # history writes are disabled
        del start_ms, end_ms, confidence, duration_ms
        return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Return ``False`` without finalizing any record."""
        del call_id, status, duration_seconds  # history writes are disabled
        return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Forward the knowledge search to the wrapped adapter."""
        matches = await self._delegate.search_knowledge_context(
            kb_id=kb_id,
            query=query,
            n_results=n_results,
        )
        return matches

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Forward the tool resource lookup to the wrapped adapter."""
        resource = await self._delegate.fetch_tool_resource(tool_id)
        return resource
class LocalYamlAssistantConfigAdapter(NullBackendAdapter):
    """Serve assistant runtime config from ``<assistant_id>.yaml|.yml`` files.

    Only ``fetch_assistant_config`` is overridden; every other port keeps the
    no-op behaviour inherited from ``NullBackendAdapter``.
    """

    # (yaml key, runtime key) pairs copied as stripped, non-empty strings.
    # The order is the insertion order of the resulting runtime dict.
    _LLM_TEXT_FIELDS = (
        ("provider", "provider"),
        ("model", "model"),
        ("api_key", "apiKey"),
        ("api_url", "baseUrl"),
        ("app_id", "appId"),
    )
    _TTS_TEXT_FIELDS = (
        ("provider", "provider"),
        ("model", "model"),
        ("api_key", "apiKey"),
        ("api_url", "baseUrl"),
        ("voice", "voice"),
        ("app_id", "appId"),
        ("resource_id", "resourceId"),
        ("cluster", "cluster"),
        ("uid", "uid"),
    )
    _ASR_TEXT_FIELDS = (
        ("provider", "provider"),
        ("model", "model"),
        ("api_key", "apiKey"),
        ("api_url", "baseUrl"),
        ("app_id", "appId"),
        ("resource_id", "resourceId"),
        ("cluster", "cluster"),
        ("uid", "uid"),
    )
    # (yaml key, runtime key) pairs copied verbatim whenever the value is not None.
    _ASR_RAW_FIELDS = (
        ("enable_interim", "enableInterim"),
        ("interim_interval_ms", "interimIntervalMs"),
        ("min_audio_ms", "minAudioMs"),
    )

    def __init__(self, config_dir: str):
        self._config_dir = self._resolve_base_dir(config_dir)

    @staticmethod
    def _resolve_base_dir(config_dir: str) -> Path:
        """Resolve the config directory to an absolute path.

        Absolute inputs are returned resolved. Relative inputs are tried
        against the current working directory, then against the directory two
        levels above this file, then against that directory with a leading
        ``engine`` path component removed. The first existing candidate wins;
        if none exists the cwd-based path is returned.
        """
        configured = str(config_dir or "").strip()
        relative = Path(configured or "engine/config/agents")
        if relative.is_absolute():
            return relative.resolve()

        from_cwd = (Path.cwd() / relative).resolve()
        if from_cwd.exists():
            return from_cwd

        # NOTE(review): presumably two levels up from this file is the engine root - confirm.
        engine_root = Path(__file__).resolve().parent.parent
        fallbacks = [engine_root / relative]
        segments = relative.parts
        if segments and segments[0] == "engine" and len(segments) > 1:
            fallbacks.append(engine_root / Path(*segments[1:]))

        for fallback in fallbacks:
            resolved = fallback.resolve()
            if resolved.exists():
                return resolved
        return from_cwd

    def _resolve_config_file(self, assistant_id: str) -> Optional[Path]:
        """Return the existing ``.yaml`` (preferred) or ``.yml`` file for the id.

        Returns ``None`` when the stripped id does not match
        ``_ASSISTANT_ID_PATTERN`` or when neither file exists.
        """
        candidate_id = str(assistant_id or "").strip()
        if _ASSISTANT_ID_PATTERN.match(candidate_id) is None:
            return None
        for suffix in (".yaml", ".yml"):
            config_path = self._config_dir / f"{candidate_id}{suffix}"
            if config_path.exists():
                return config_path
        return None

    @staticmethod
    def _as_str(value: Any) -> Optional[str]:
        """Return ``str(value)`` stripped, or ``None`` for ``None``/blank input."""
        if value is None:
            return None
        stripped = str(value).strip()
        return stripped if stripped else None

    @classmethod
    def _copy_text_fields(cls, section: Dict[str, Any], fields) -> Dict[str, Any]:
        """Collect the non-blank text values of ``section`` under runtime keys.

        ``fields`` is an iterable of ``(yaml_key, runtime_key)`` pairs.
        """
        collected: Dict[str, Any] = {}
        for yaml_key, runtime_key in fields:
            text = cls._as_str(section.get(yaml_key))
            if text:
                collected[runtime_key] = text
        return collected

    @classmethod
    def _translate_agent_schema(cls, assistant_id: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Translate legacy `agent:` YAML schema into runtime assistant metadata.

        Returns ``None`` when ``payload["agent"]`` is not a dict. Otherwise
        returns a dict that always has ``assistantId`` and may have
        ``services``, ``systemPrompt``, ``greeting``, ``bargeIn`` and
        ``tools``, depending on what the YAML provides.
        """
        agent = payload.get("agent")
        if not isinstance(agent, dict):
            return None

        services: Dict[str, Any] = {}
        runtime: Dict[str, Any] = {
            "assistantId": str(assistant_id),
            "services": services,
        }

        llm = agent.get("llm")
        if isinstance(llm, dict):
            llm_runtime = cls._copy_text_fields(llm, cls._LLM_TEXT_FIELDS)
            if llm_runtime:
                services["llm"] = llm_runtime

        tts = agent.get("tts")
        if isinstance(tts, dict):
            tts_runtime = cls._copy_text_fields(tts, cls._TTS_TEXT_FIELDS)
            speed = tts.get("speed")
            if speed is not None:
                tts_runtime["speed"] = speed
            # `dashscope_mode` takes precedence over the older `mode` key.
            tts_mode = cls._as_str(tts.get("dashscope_mode")) or cls._as_str(tts.get("mode"))
            if tts_mode:
                tts_runtime["mode"] = tts_mode
            if tts_runtime:
                services["tts"] = tts_runtime

        asr = agent.get("asr")
        if isinstance(asr, dict):
            asr_runtime = cls._copy_text_fields(asr, cls._ASR_TEXT_FIELDS)
            request_params = asr.get("request_params")
            if isinstance(request_params, dict):
                asr_runtime["requestParams"] = dict(request_params)
            for yaml_key, runtime_key in cls._ASR_RAW_FIELDS:
                raw_value = asr.get(yaml_key)
                if raw_value is not None:
                    asr_runtime[runtime_key] = raw_value
            if asr_runtime:
                services["asr"] = asr_runtime

        duplex = agent.get("duplex")
        if isinstance(duplex, dict):
            system_prompt = cls._as_str(duplex.get("system_prompt"))
            if system_prompt:
                runtime["systemPrompt"] = system_prompt
            greeting = duplex.get("greeting")
            if greeting is not None:
                runtime["greeting"] = greeting

        barge_in = agent.get("barge_in")
        if isinstance(barge_in, dict):
            barge_runtime: Dict[str, Any] = {}
            min_duration = barge_in.get("min_duration_ms")
            if min_duration is not None:
                barge_runtime["minDurationMs"] = min_duration
            silence_tolerance = barge_in.get("silence_tolerance_ms")
            if silence_tolerance is not None:
                barge_runtime["silenceToleranceMs"] = silence_tolerance
            if barge_runtime:
                runtime["bargeIn"] = barge_runtime

        tools = agent.get("tools")
        if isinstance(tools, list):
            runtime["tools"] = tools

        if not services:
            del runtime["services"]
        return runtime

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Load, translate and normalize the local YAML config for an assistant.

        Returns an ``assistant.not_found`` error payload when no config file
        resolves, and an ``assistant.config_unavailable`` payload when PyYAML
        is missing, the file cannot be read or parsed, or its top level is not
        a mapping. On success the payload contains an ``assistant`` dict plus
        ``assistantId`` and ``configVersionId`` defaults.
        """
        source_file = self._resolve_config_file(assistant_id)
        if source_file is None:
            return _assistant_error("assistant.not_found", assistant_id)

        if yaml is None:
            logger.warning(
                "Local assistant config requested but PyYAML is unavailable (assistant_id={})",
                assistant_id,
            )
            return _assistant_error("assistant.config_unavailable", assistant_id)

        try:
            with source_file.open("r", encoding="utf-8") as handle:
                document = yaml.safe_load(handle) or {}
        except Exception as exc:
            logger.warning(
                "Failed to read local assistant config {} (assistant_id={}): {}",
                source_file,
                assistant_id,
                exc,
            )
            return _assistant_error("assistant.config_unavailable", assistant_id)

        if not isinstance(document, dict):
            logger.warning(
                "Local assistant config is not an object (assistant_id={}, file={})",
                assistant_id,
                source_file,
            )
            return _assistant_error("assistant.config_unavailable", assistant_id)

        legacy_runtime = self._translate_agent_schema(assistant_id, document)
        if legacy_runtime is not None:
            document = legacy_runtime

        # Accept either backend-like payload shape or a direct assistant metadata object.
        has_envelope = isinstance(document.get("assistant"), dict) or isinstance(
            document.get("sessionStartMetadata"), dict
        )
        result = dict(document) if has_envelope else {"assistant": dict(document)}

        assistant_section = result.get("assistant")
        if isinstance(assistant_section, dict):
            chosen_id = assistant_section.get("assistantId") or assistant_section.get("id") or assistant_id
            assistant_section["assistantId"] = str(chosen_id)
        else:
            result["assistant"] = {"assistantId": str(assistant_id)}

        result.setdefault("assistantId", str(assistant_id))
        result.setdefault("configVersionId", f"local:{source_file.name}")
        return result
class AssistantConfigSourceAdapter:
    """Pick the assistant-config source while forwarding every other port.

    ``fetch_assistant_config`` goes to ``delegate`` when
    ``use_backend_assistant_config`` is truthy and to ``local_delegate``
    otherwise. All remaining methods always go to ``delegate``.
    """

    def __init__(
        self,
        *,
        delegate: HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter,
        local_delegate: LocalYamlAssistantConfigAdapter,
        use_backend_assistant_config: bool,
    ):
        self._delegate = delegate
        self._local_delegate = local_delegate
        self._use_backend_assistant_config = bool(use_backend_assistant_config)

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Fetch the assistant config from whichever source is selected."""
        config_source = self._delegate if self._use_backend_assistant_config else self._local_delegate
        return await config_source.fetch_assistant_config(assistant_id)

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Forward call-record creation to the delegate."""
        record_id = await self._delegate.create_call_record(
            user_id=user_id,
            assistant_id=assistant_id,
            source=source,
        )
        return record_id

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Forward a transcript segment to the delegate."""
        stored = await self._delegate.add_transcript(
            call_id=call_id,
            turn_index=turn_index,
            speaker=speaker,
            content=content,
            start_ms=start_ms,
            end_ms=end_ms,
            confidence=confidence,
            duration_ms=duration_ms,
        )
        return stored

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Forward call-record finalization to the delegate."""
        finalized = await self._delegate.finalize_call_record(
            call_id=call_id,
            status=status,
            duration_seconds=duration_seconds,
        )
        return finalized

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Forward the knowledge search to the delegate."""
        matches = await self._delegate.search_knowledge_context(
            kb_id=kb_id,
            query=query,
            n_results=n_results,
        )
        return matches

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Forward the tool resource lookup to the delegate."""
        resource = await self._delegate.fetch_tool_resource(tool_id)
        return resource
class HttpBackendAdapter:
    """HTTP implementation of backend integration ports.

    Each method opens its own ``aiohttp.ClientSession`` with the configured
    total timeout. Failures are logged and turned into the method's failure
    value instead of being raised to the caller.
    """

    def __init__(self, backend_url: str, timeout_sec: int = 10):
        """Store the normalized base URL and timeout.

        Raises:
            ValueError: If ``backend_url`` is empty after stripping whitespace
                and trailing slashes.
        """
        base_url = str(backend_url or "").strip().rstrip("/")
        if not base_url:
            raise ValueError("backend_url is required for HttpBackendAdapter")
        self._base_url = base_url
        self._timeout_sec = timeout_sec

    def _timeout(self) -> aiohttp.ClientTimeout:
        """Build the per-request timeout object."""
        return aiohttp.ClientTimeout(total=self._timeout_sec)

    @staticmethod
    def _path_segment(value: Any) -> str:
        """Percent-encode ``value`` for use as exactly one URL path segment.

        Identifiers are interpolated into request paths, so characters such
        as ``/``, ``?`` and ``#`` must not be allowed to change which endpoint
        is called.

        Raises:
            ValueError: If the value is ``.`` or ``..``; these survive
                encoding and may be collapsed by dot-segment normalization
                (RFC 3986), which would address a different path.
        """
        from urllib.parse import quote  # local import keeps the block self-contained

        segment = quote(str(value), safe="")
        if segment in {".", ".."}:
            raise ValueError(f"invalid path segment: {segment!r}")
        return segment

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Fetch assistant config payload from backend API.

        Expected response shape:
        {
        "assistant": {...},
        "voice": {...} | null
        }

        Returns the payload dict on success. Returns a dict carrying
        ``__error_code`` ``assistant.not_found`` on HTTP 404, or
        ``assistant.config_unavailable`` for any other failure, including an
        id that cannot be used as a path segment.
        """
        try:
            # Built inside the try so an invalid id is reported like any other failure.
            url = f"{self._base_url}/api/assistants/{self._path_segment(assistant_id)}/config"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.get(url) as resp:
                    if resp.status == 404:
                        logger.warning(f"Assistant config not found: {assistant_id}")
                        return {"__error_code": "assistant.not_found", "assistantId": assistant_id}
                    resp.raise_for_status()
                    payload = await resp.json()
                    if not isinstance(payload, dict):
                        logger.warning("Assistant config payload is not a dict; ignoring")
                        return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id}
                    return payload
        except Exception as exc:
            logger.warning(f"Failed to fetch assistant config ({assistant_id}): {exc}")
            return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id}

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Create a call record via backend history API and return call_id.

        Returns ``None`` when the request fails or the response has no ``id``.
        """
        url = f"{self._base_url}/api/history"
        payload: Dict[str, Any] = {
            "user_id": user_id,
            "assistant_id": assistant_id,
            "source": source,
            "status": "connected",
        }
        try:
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    resp.raise_for_status()
                    data = await resp.json()
                    call_id = str((data or {}).get("id") or "")
                    return call_id or None
        except Exception as exc:
            logger.warning(f"Failed to create history call record: {exc}")
            return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Append a transcript segment to backend history.

        Returns ``True`` when the backend accepts the segment, ``False`` for
        an empty ``call_id`` or any failure.
        """
        if not call_id:
            return False
        payload: Dict[str, Any] = {
            "turn_index": turn_index,
            "speaker": speaker,
            "content": content,
            "confidence": confidence,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "duration_ms": duration_ms,
        }
        try:
            url = f"{self._base_url}/api/history/{self._path_segment(call_id)}/transcripts"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    resp.raise_for_status()
                    return True
        except Exception as exc:
            logger.warning(f"Failed to append history transcript (call_id={call_id}, turn={turn_index}): {exc}")
            return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Finalize a call record with status and duration.

        Returns ``True`` on success, ``False`` for an empty ``call_id`` or
        any failure.
        """
        if not call_id:
            return False
        payload: Dict[str, Any] = {
            "status": status,
            "duration_seconds": duration_seconds,
        }
        try:
            url = f"{self._base_url}/api/history/{self._path_segment(call_id)}"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.put(url, json=payload) as resp:
                    resp.raise_for_status()
                    return True
        except Exception as exc:
            logger.warning(f"Failed to finalize history call record ({call_id}): {exc}")
            return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Search backend knowledge base and return retrieval results.

        Returns only the dict entries of the response's ``results`` list, and
        an empty list for an empty ``kb_id``, a blank query, HTTP 404, an
        unexpected response shape or any failure.
        """
        if not kb_id or not query.strip():
            return []
        try:
            # Clamp to at least one result; fall back to 5 on non-numeric input.
            safe_n_results = max(1, int(n_results))
        except (TypeError, ValueError):
            safe_n_results = 5
        url = f"{self._base_url}/api/knowledge/search"
        payload: Dict[str, Any] = {
            "kb_id": kb_id,
            "query": query,
            "nResults": safe_n_results,
        }
        try:
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    if resp.status == 404:
                        logger.warning(f"Knowledge base not found for retrieval: {kb_id}")
                        return []
                    resp.raise_for_status()
                    data = await resp.json()
                    if not isinstance(data, dict):
                        return []
                    results = data.get("results", [])
                    if not isinstance(results, list):
                        return []
                    return [r for r in results if isinstance(r, dict)]
        except Exception as exc:
            logger.warning(f"Knowledge search failed (kb_id={kb_id}): {exc}")
            return []

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Fetch tool resource configuration from backend API.

        Returns the response dict, or ``None`` for an empty ``tool_id``,
        HTTP 404, a non-dict response or any failure.
        """
        if not tool_id:
            return None
        try:
            url = f"{self._base_url}/api/tools/resources/{self._path_segment(tool_id)}"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.get(url) as resp:
                    if resp.status == 404:
                        return None
                    resp.raise_for_status()
                    data = await resp.json()
                    return data if isinstance(data, dict) else None
        except Exception as exc:
            logger.warning(f"Failed to fetch tool resource ({tool_id}): {exc}")
            return None
def build_backend_adapter(
    *,
    backend_url: Optional[str],
    backend_mode: str = "auto",
    history_enabled: bool = True,
    timeout_sec: int = 10,
    assistant_local_config_dir: str = "engine/config/agents",
) -> AssistantConfigSourceAdapter:
    """Create backend adapter implementation based on runtime settings.

    An HTTP adapter is used only when the mode is not one of the "disabled"
    aliases and ``backend_url`` is non-blank; otherwise a null adapter is
    used. The chosen adapter is wrapped so history writes are dropped when
    ``history_enabled`` is false. Assistant config is read from the backend
    only when the HTTP adapter is in use, and from local YAML otherwise.
    """
    normalized_mode = str(backend_mode or "auto").strip().lower()
    url_present = bool(str(backend_url or "").strip())
    backend_disabled = normalized_mode in {"disabled", "off", "none", "null", "engine_only", "engine-only"}
    using_http_backend = url_present and not backend_disabled

    primary: HttpBackendAdapter | NullBackendAdapter
    if using_http_backend:
        primary = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec)
    else:
        # Only the explicit "http" mode warns about the missing URL.
        if normalized_mode == "http":
            logger.warning("BACKEND_MODE=http but BACKEND_URL is empty; falling back to NullBackendAdapter")
        primary = NullBackendAdapter()

    runtime_adapter: HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter
    runtime_adapter = primary if history_enabled else HistoryDisabledBackendAdapter(primary)

    return AssistantConfigSourceAdapter(
        delegate=runtime_adapter,
        local_delegate=LocalYamlAssistantConfigAdapter(assistant_local_config_dir),
        use_backend_assistant_config=using_http_backend,
    )
def build_backend_adapter_from_settings() -> AssistantConfigSourceAdapter:
    """Create backend adapter using current app settings.

    Reads the backend URL, mode, history flag, timeout and local assistant
    config directory from ``settings`` and passes them to
    ``build_backend_adapter``.
    """
    adapter_options = {
        "backend_url": settings.backend_url,
        "backend_mode": settings.backend_mode,
        "history_enabled": settings.history_enabled,
        "timeout_sec": settings.backend_timeout_sec,
        "assistant_local_config_dir": settings.assistant_local_config_dir,
    }
    return build_backend_adapter(**adapter_options)

View File

@@ -1,357 +0,0 @@
"""Backend adapter implementations for engine integration ports."""
from __future__ import annotations
from typing import Any, Dict, List, Optional
import aiohttp
from loguru import logger
from app.config import settings
class NullBackendAdapter:
    """Backend adapter that performs no work.

    Every port immediately returns its "nothing happened" value
    (``None``, ``False`` or an empty list) without touching the arguments.
    """

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Return ``None``; this adapter has no config source."""
        del assistant_id  # intentionally unused
        return None

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Return ``None``; no call record is created."""
        del user_id, assistant_id, source  # intentionally unused
        return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Return ``False``; the transcript segment is discarded."""
        del call_id, turn_index, speaker, content  # intentionally unused
        del start_ms, end_ms, confidence, duration_ms
        return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Return ``False``; there is no record to finalize."""
        del call_id, status, duration_seconds  # intentionally unused
        return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Return an empty result list; no search is performed."""
        del kb_id, query, n_results  # intentionally unused
        return []

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Return ``None``; no tool resource is available."""
        del tool_id  # intentionally unused
        return None
class HistoryDisabledBackendAdapter:
    """Wrap another adapter, dropping history writes but forwarding reads.

    ``create_call_record``, ``add_transcript`` and ``finalize_call_record``
    never reach the delegate; the remaining ports are forwarded to it.
    """

    def __init__(self, delegate: HttpBackendAdapter | NullBackendAdapter):
        self._delegate = delegate

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Forward the config lookup to the wrapped adapter."""
        config = await self._delegate.fetch_assistant_config(assistant_id)
        return config

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Return ``None`` without creating a call record."""
        del user_id, assistant_id, source  # history writes are disabled
        return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Return ``False`` without storing the transcript segment."""
        del call_id, turn_index, speaker, content  # history writes are disabled
        del start_ms, end_ms, confidence, duration_ms
        return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Return ``False`` without finalizing any record."""
        del call_id, status, duration_seconds  # history writes are disabled
        return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Forward the knowledge search to the wrapped adapter."""
        matches = await self._delegate.search_knowledge_context(
            kb_id=kb_id,
            query=query,
            n_results=n_results,
        )
        return matches

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Forward the tool resource lookup to the wrapped adapter."""
        resource = await self._delegate.fetch_tool_resource(tool_id)
        return resource
class HttpBackendAdapter:
    """HTTP implementation of backend integration ports.

    Each method opens its own ``aiohttp.ClientSession`` with the configured
    total timeout. Failures are logged and turned into the method's failure
    value instead of being raised to the caller.
    """

    def __init__(self, backend_url: str, timeout_sec: int = 10):
        """Store the normalized base URL and timeout.

        Raises:
            ValueError: If ``backend_url`` is empty after stripping whitespace
                and trailing slashes.
        """
        base_url = str(backend_url or "").strip().rstrip("/")
        if not base_url:
            raise ValueError("backend_url is required for HttpBackendAdapter")
        self._base_url = base_url
        self._timeout_sec = timeout_sec

    def _timeout(self) -> aiohttp.ClientTimeout:
        """Build the per-request timeout object."""
        return aiohttp.ClientTimeout(total=self._timeout_sec)

    @staticmethod
    def _path_segment(value: Any) -> str:
        """Percent-encode ``value`` for use as exactly one URL path segment.

        Identifiers are interpolated into request paths, so characters such
        as ``/``, ``?`` and ``#`` must not be allowed to change which endpoint
        is called.

        Raises:
            ValueError: If the value is ``.`` or ``..``; these survive
                encoding and may be collapsed by dot-segment normalization
                (RFC 3986), which would address a different path.
        """
        from urllib.parse import quote  # local import keeps the block self-contained

        segment = quote(str(value), safe="")
        if segment in {".", ".."}:
            raise ValueError(f"invalid path segment: {segment!r}")
        return segment

    async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
        """Fetch assistant config payload from backend API.

        Expected response shape:
        {
        "assistant": {...},
        "voice": {...} | null
        }

        Returns the payload dict on success. Returns a dict carrying
        ``__error_code`` ``assistant.not_found`` on HTTP 404, or
        ``assistant.config_unavailable`` for any other failure, including an
        id that cannot be used as a path segment.
        """
        try:
            # Built inside the try so an invalid id is reported like any other failure.
            url = f"{self._base_url}/api/assistants/{self._path_segment(assistant_id)}/config"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.get(url) as resp:
                    if resp.status == 404:
                        logger.warning(f"Assistant config not found: {assistant_id}")
                        return {"__error_code": "assistant.not_found", "assistantId": assistant_id}
                    resp.raise_for_status()
                    payload = await resp.json()
                    if not isinstance(payload, dict):
                        logger.warning("Assistant config payload is not a dict; ignoring")
                        return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id}
                    return payload
        except Exception as exc:
            logger.warning(f"Failed to fetch assistant config ({assistant_id}): {exc}")
            return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id}

    async def create_call_record(
        self,
        *,
        user_id: int,
        assistant_id: Optional[str],
        source: str = "debug",
    ) -> Optional[str]:
        """Create a call record via backend history API and return call_id.

        Returns ``None`` when the request fails or the response has no ``id``.
        """
        url = f"{self._base_url}/api/history"
        payload: Dict[str, Any] = {
            "user_id": user_id,
            "assistant_id": assistant_id,
            "source": source,
            "status": "connected",
        }
        try:
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    resp.raise_for_status()
                    data = await resp.json()
                    call_id = str((data or {}).get("id") or "")
                    return call_id or None
        except Exception as exc:
            logger.warning(f"Failed to create history call record: {exc}")
            return None

    async def add_transcript(
        self,
        *,
        call_id: str,
        turn_index: int,
        speaker: str,
        content: str,
        start_ms: int,
        end_ms: int,
        confidence: Optional[float] = None,
        duration_ms: Optional[int] = None,
    ) -> bool:
        """Append a transcript segment to backend history.

        Returns ``True`` when the backend accepts the segment, ``False`` for
        an empty ``call_id`` or any failure.
        """
        if not call_id:
            return False
        payload: Dict[str, Any] = {
            "turn_index": turn_index,
            "speaker": speaker,
            "content": content,
            "confidence": confidence,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "duration_ms": duration_ms,
        }
        try:
            url = f"{self._base_url}/api/history/{self._path_segment(call_id)}/transcripts"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    resp.raise_for_status()
                    return True
        except Exception as exc:
            logger.warning(f"Failed to append history transcript (call_id={call_id}, turn={turn_index}): {exc}")
            return False

    async def finalize_call_record(
        self,
        *,
        call_id: str,
        status: str,
        duration_seconds: int,
    ) -> bool:
        """Finalize a call record with status and duration.

        Returns ``True`` on success, ``False`` for an empty ``call_id`` or
        any failure.
        """
        if not call_id:
            return False
        payload: Dict[str, Any] = {
            "status": status,
            "duration_seconds": duration_seconds,
        }
        try:
            url = f"{self._base_url}/api/history/{self._path_segment(call_id)}"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.put(url, json=payload) as resp:
                    resp.raise_for_status()
                    return True
        except Exception as exc:
            logger.warning(f"Failed to finalize history call record ({call_id}): {exc}")
            return False

    async def search_knowledge_context(
        self,
        *,
        kb_id: str,
        query: str,
        n_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """Search backend knowledge base and return retrieval results.

        Returns only the dict entries of the response's ``results`` list, and
        an empty list for an empty ``kb_id``, a blank query, HTTP 404, an
        unexpected response shape or any failure.
        """
        if not kb_id or not query.strip():
            return []
        try:
            # Clamp to at least one result; fall back to 5 on non-numeric input.
            safe_n_results = max(1, int(n_results))
        except (TypeError, ValueError):
            safe_n_results = 5
        url = f"{self._base_url}/api/knowledge/search"
        payload: Dict[str, Any] = {
            "kb_id": kb_id,
            "query": query,
            "nResults": safe_n_results,
        }
        try:
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.post(url, json=payload) as resp:
                    if resp.status == 404:
                        logger.warning(f"Knowledge base not found for retrieval: {kb_id}")
                        return []
                    resp.raise_for_status()
                    data = await resp.json()
                    if not isinstance(data, dict):
                        return []
                    results = data.get("results", [])
                    if not isinstance(results, list):
                        return []
                    return [r for r in results if isinstance(r, dict)]
        except Exception as exc:
            logger.warning(f"Knowledge search failed (kb_id={kb_id}): {exc}")
            return []

    async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
        """Fetch tool resource configuration from backend API.

        Returns the response dict, or ``None`` for an empty ``tool_id``,
        HTTP 404, a non-dict response or any failure.
        """
        if not tool_id:
            return None
        try:
            url = f"{self._base_url}/api/tools/resources/{self._path_segment(tool_id)}"
            async with aiohttp.ClientSession(timeout=self._timeout()) as session:
                async with session.get(url) as resp:
                    if resp.status == 404:
                        return None
                    resp.raise_for_status()
                    data = await resp.json()
                    return data if isinstance(data, dict) else None
        except Exception as exc:
            logger.warning(f"Failed to fetch tool resource ({tool_id}): {exc}")
            return None
def build_backend_adapter(
    *,
    backend_url: Optional[str],
    backend_mode: str = "auto",
    history_enabled: bool = True,
    timeout_sec: int = 10,
) -> HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter:
    """Create backend adapter implementation based on runtime settings.

    An HTTP adapter is used only when the mode is not one of the "disabled"
    aliases and ``backend_url`` is non-blank; otherwise a null adapter is
    used. When ``history_enabled`` is false the chosen adapter is wrapped so
    that history writes are dropped.
    """
    normalized_mode = str(backend_mode or "auto").strip().lower()
    url_present = bool(str(backend_url or "").strip())
    backend_disabled = normalized_mode in {"disabled", "off", "none", "null", "engine_only", "engine-only"}

    primary: HttpBackendAdapter | NullBackendAdapter
    if url_present and not backend_disabled:
        primary = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec)
    else:
        # Only the explicit "http" mode warns about the missing URL.
        if normalized_mode == "http":
            logger.warning("BACKEND_MODE=http but BACKEND_URL is empty; falling back to NullBackendAdapter")
        primary = NullBackendAdapter()

    if history_enabled:
        return primary
    return HistoryDisabledBackendAdapter(primary)
def build_backend_adapter_from_settings() -> HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter:
    """Create backend adapter using current app settings.

    Reads the backend URL, mode, history flag and timeout from ``settings``
    and passes them to ``build_backend_adapter``.
    """
    adapter_options = {
        "backend_url": settings.backend_url,
        "backend_mode": settings.backend_mode,
        "history_enabled": settings.history_enabled,
        "timeout_sec": settings.backend_timeout_sec,
    }
    return build_backend_adapter(**adapter_options)

View File

@@ -1,87 +0,0 @@
"""Compatibility wrappers around backend adapter implementations."""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from app.backend_adapters import build_backend_adapter_from_settings
def _adapter():
    """Build a backend adapter from the current app settings on every call."""
    adapter = build_backend_adapter_from_settings()
    return adapter
async def fetch_assistant_config(assistant_id: str) -> Optional[Dict[str, Any]]:
    """Fetch assistant config payload from backend adapter."""
    backend = _adapter()
    return await backend.fetch_assistant_config(assistant_id)
async def create_history_call_record(
    *,
    user_id: int,
    assistant_id: Optional[str],
    source: str = "debug",
) -> Optional[str]:
    """Create a call record via backend history API and return call_id."""
    backend = _adapter()
    call_id = await backend.create_call_record(
        user_id=user_id,
        assistant_id=assistant_id,
        source=source,
    )
    return call_id
async def add_history_transcript(
    *,
    call_id: str,
    turn_index: int,
    speaker: str,
    content: str,
    start_ms: int,
    end_ms: int,
    confidence: Optional[float] = None,
    duration_ms: Optional[int] = None,
) -> bool:
    """Append a transcript segment to backend history."""
    backend = _adapter()
    stored = await backend.add_transcript(
        call_id=call_id,
        turn_index=turn_index,
        speaker=speaker,
        content=content,
        start_ms=start_ms,
        end_ms=end_ms,
        confidence=confidence,
        duration_ms=duration_ms,
    )
    return stored
async def finalize_history_call_record(
    *,
    call_id: str,
    status: str,
    duration_seconds: int,
) -> bool:
    """Finalize a call record with status and duration."""
    backend = _adapter()
    finalized = await backend.finalize_call_record(
        call_id=call_id,
        status=status,
        duration_seconds=duration_seconds,
    )
    return finalized
async def search_knowledge_context(
    *,
    kb_id: str,
    query: str,
    n_results: int = 5,
) -> List[Dict[str, Any]]:
    """Search backend knowledge base and return retrieval results."""
    backend = _adapter()
    matches = await backend.search_knowledge_context(
        kb_id=kb_id,
        query=query,
        n_results=n_results,
    )
    return matches
async def fetch_tool_resource(tool_id: str) -> Optional[Dict[str, Any]]:
    """Fetch tool resource configuration from backend API."""
    backend = _adapter()
    return await backend.fetch_tool_resource(tool_id)

View File

@@ -1,371 +1,31 @@
"""Configuration management using Pydantic settings and agent YAML profiles."""
"""Configuration management using Pydantic settings."""
import json
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, List, Optional
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
try:
import yaml
except ImportError: # pragma: no cover - validated when agent YAML is used
yaml = None
from dotenv import load_dotenv
except ImportError: # pragma: no cover - optional dependency in some runtimes
load_dotenv = None
def _prime_process_env_from_dotenv() -> None:
    """Populate the process environment from ``.env`` files, if python-dotenv is present.

    Two locations are considered, in order: ``.env`` in the current working
    directory, then ``.env`` two levels above this module's resolved path.
    The second is skipped when both paths compare equal. Files are loaded
    with ``override=False``. Does nothing when ``load_dotenv`` is ``None``.
    """
    if load_dotenv is None:
        return

    working_dir_env = Path.cwd() / ".env"
    package_env = Path(__file__).resolve().parent.parent / ".env"

    candidates = [working_dir_env]
    if package_env != working_dir_env:
        candidates.append(package_env)

    for env_file in candidates:
        load_dotenv(dotenv_path=env_file, override=False)
# Matches ``${NAME}`` and ``${NAME:default}`` placeholders. Group 1 is the
# variable name; group 2 is the optional text following the colon.
_ENV_REF_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")
# Directory and file name used to locate the fallback agent config.
_DEFAULT_AGENT_CONFIG_DIR = "config/agents"
_DEFAULT_AGENT_CONFIG_FILE = "default.yaml"
# For each nested YAML section, maps the section-local key to the flat
# setting name it is normalized into.
_AGENT_SECTION_KEY_MAP: Dict[str, Dict[str, str]] = {
    "vad": {
        "type": "vad_type",
        "model_path": "vad_model_path",
        "threshold": "vad_threshold",
        "min_speech_duration_ms": "vad_min_speech_duration_ms",
        "eou_threshold_ms": "vad_eou_threshold_ms",
    },
    "llm": {
        "provider": "llm_provider",
        "model": "llm_model",
        "temperature": "llm_temperature",
        "api_key": "llm_api_key",
        "api_url": "llm_api_url",
    },
    "tts": {
        "provider": "tts_provider",
        "api_key": "tts_api_key",
        "api_url": "tts_api_url",
        "model": "tts_model",
        "voice": "tts_voice",
        # Both spellings below resolve to the same flat setting.
        "dashscope_mode": "tts_mode",
        "mode": "tts_mode",
        "speed": "tts_speed",
    },
    "asr": {
        "provider": "asr_provider",
        "api_key": "asr_api_key",
        "api_url": "asr_api_url",
        "model": "asr_model",
        "interim_interval_ms": "asr_interim_interval_ms",
        "min_audio_ms": "asr_min_audio_ms",
        "start_min_speech_ms": "asr_start_min_speech_ms",
        "pre_speech_ms": "asr_pre_speech_ms",
        "final_tail_ms": "asr_final_tail_ms",
    },
    "duplex": {
        "enabled": "duplex_enabled",
        "greeting": "duplex_greeting",
        "system_prompt": "duplex_system_prompt",
        "opener_audio_file": "duplex_opener_audio_file",
    },
    "barge_in": {
        "min_duration_ms": "barge_in_min_duration_ms",
        "silence_tolerance_ms": "barge_in_silence_tolerance_ms",
    },
}
# Every flat setting name an agent YAML may produce after normalization;
# anything outside this set is rejected as an unknown key.
_AGENT_SETTING_KEYS = {
    "vad_type",
    "vad_model_path",
    "vad_threshold",
    "vad_min_speech_duration_ms",
    "vad_eou_threshold_ms",
    "llm_provider",
    "llm_api_key",
    "llm_api_url",
    "llm_model",
    "llm_temperature",
    "tts_provider",
    "tts_api_key",
    "tts_api_url",
    "tts_model",
    "tts_voice",
    "tts_mode",
    "tts_speed",
    "asr_provider",
    "asr_api_key",
    "asr_api_url",
    "asr_model",
    "asr_interim_interval_ms",
    "asr_min_audio_ms",
    "asr_start_min_speech_ms",
    "asr_pre_speech_ms",
    "asr_final_tail_ms",
    "duplex_enabled",
    "duplex_greeting",
    "duplex_system_prompt",
    "duplex_opener_audio_file",
    "barge_in_min_duration_ms",
    "barge_in_silence_tolerance_ms",
    "tools",
}
# Settings that must be present regardless of which providers are chosen;
# provider-specific requirements are checked separately.
_BASE_REQUIRED_AGENT_SETTING_KEYS = {
    "vad_type",
    "vad_model_path",
    "vad_threshold",
    "vad_min_speech_duration_ms",
    "vad_eou_threshold_ms",
    "llm_provider",
    "llm_model",
    "llm_temperature",
    "tts_provider",
    "tts_voice",
    "tts_speed",
    "asr_provider",
    "asr_interim_interval_ms",
    "asr_min_audio_ms",
    "asr_start_min_speech_ms",
    "asr_pre_speech_ms",
    "asr_final_tail_ms",
    "duplex_enabled",
    "duplex_system_prompt",
    "barge_in_min_duration_ms",
    "barge_in_silence_tolerance_ms",
}
# Provider names (already lower-cased) grouped by the credential checks
# applied to them when validating required agent settings.
_OPENAI_COMPATIBLE_LLM_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
_OPENAI_COMPATIBLE_TTS_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
_DASHSCOPE_TTS_PROVIDERS = {"dashscope"}
_OPENAI_COMPATIBLE_ASR_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
def _normalized_provider(overrides: Dict[str, Any], key: str, default: str) -> str:
return str(overrides.get(key) or default).strip().lower()
def _is_blank(value: Any) -> bool:
return value is None or (isinstance(value, str) and not value.strip())
@dataclass(frozen=True)
class AgentConfigSelection:
    """Immutable record of the chosen agent config file and how it was chosen."""

    # Location of the selected agent YAML file (typed as Optional).
    path: Optional[Path]
    # Label describing the selection mechanism.
    source: str
def _parse_cli_agent_args(argv: List[str]) -> Tuple[Optional[str], Optional[str]]:
"""Parse only agent-related CLI flags from argv."""
config_path: Optional[str] = None
profile: Optional[str] = None
i = 0
while i < len(argv):
arg = argv[i]
if arg.startswith("--agent-config="):
config_path = arg.split("=", 1)[1].strip() or None
elif arg == "--agent-config" and i + 1 < len(argv):
config_path = argv[i + 1].strip() or None
i += 1
elif arg.startswith("--agent-profile="):
profile = arg.split("=", 1)[1].strip() or None
elif arg == "--agent-profile" and i + 1 < len(argv):
profile = argv[i + 1].strip() or None
i += 1
i += 1
return config_path, profile
def _agent_config_dir() -> Path:
    """Return the resolved directory that holds agent YAML profiles.

    Reads ``AGENT_CONFIG_DIR`` from the environment, falling back to
    ``_DEFAULT_AGENT_CONFIG_DIR``. A relative value is anchored at the
    current working directory before being resolved.
    """
    configured = Path(os.getenv("AGENT_CONFIG_DIR", _DEFAULT_AGENT_CONFIG_DIR))
    anchored = configured if configured.is_absolute() else Path.cwd() / configured
    return anchored.resolve()
def _resolve_agent_selection(
    agent_config_path: Optional[str] = None,
    agent_profile: Optional[str] = None,
    argv: Optional[List[str]] = None,
) -> AgentConfigSelection:
    """Decide which agent YAML file to load and record how it was chosen.

    Precedence: an explicit path (argument, then CLI flag, then
    ``AGENT_CONFIG_PATH``), then a profile name (argument, then CLI flag,
    then ``AGENT_PROFILE``) looked up in the agent config directory, then
    the default file in that directory if it exists. ``argv`` defaults to
    ``sys.argv[1:]``.

    Raises:
        ValueError: when nothing could be selected, or the selected path
            does not exist or is not a regular file.
    """
    raw_args = sys.argv[1:] if argv is None else argv
    cli_path, cli_profile = _parse_cli_agent_args(list(raw_args))

    explicit_path = agent_config_path or cli_path
    explicit_profile = agent_profile or cli_profile
    chosen_path = explicit_path or os.getenv("AGENT_CONFIG_PATH")
    chosen_profile = explicit_profile or os.getenv("AGENT_PROFILE")

    origin = "none"
    target: Optional[Path] = None
    if chosen_path:
        origin = "cli_path" if explicit_path else "env_path"
        target = Path(chosen_path)
    elif chosen_profile:
        origin = "cli_profile" if explicit_profile else "env_profile"
        target = _agent_config_dir() / f"{chosen_profile}.yaml"
    else:
        default_file = _agent_config_dir() / _DEFAULT_AGENT_CONFIG_FILE
        if default_file.exists():
            origin, target = "default", default_file

    if target is None:
        raise ValueError(
            "Agent YAML config is required. Provide --agent-config/--agent-profile "
            "or create config/agents/default.yaml."
        )

    # Relative candidates are interpreted against the current working directory.
    if target.is_absolute():
        target = target.resolve()
    else:
        target = (Path.cwd() / target).resolve()

    if not target.exists():
        raise ValueError(f"Agent config file not found ({origin}): {target}")
    if not target.is_file():
        raise ValueError(f"Agent config path is not a file: {target}")
    return AgentConfigSelection(path=target, source=origin)
def _resolve_env_refs(value: Any) -> Any:
    """Expand ``${ENV_VAR}`` / ``${ENV_VAR:default}`` placeholders in ``value``.

    Dicts and lists are walked recursively and rebuilt; strings containing
    ``${`` have each placeholder replaced by the environment value, or by
    the inline default when the variable is unset. Anything else is
    returned unchanged.

    Raises:
        ValueError: when a referenced variable is unset and has no default.
    """
    if isinstance(value, dict):
        expanded = {}
        for name, item in value.items():
            expanded[name] = _resolve_env_refs(item)
        return expanded
    if isinstance(value, list):
        return list(map(_resolve_env_refs, value))
    if not (isinstance(value, str) and "${" in value):
        return value

    def _substitute(ref: re.Match[str]) -> str:
        variable, fallback = ref.group(1), ref.group(2)
        from_env = os.getenv(variable)
        if from_env is not None:
            return from_env
        if fallback is not None:
            return fallback
        raise ValueError(f"Missing environment variable referenced in agent YAML: {variable}")

    return _ENV_REF_PATTERN.sub(_substitute, value)
def _normalize_agent_overrides(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten the nested agent YAML mapping into flat setting names.

    Known sections are translated key-by-key through
    ``_AGENT_SECTION_KEY_MAP``; ``tools`` must be a list and is copied
    through; any other top-level key is kept under its own name and then
    validated against ``_AGENT_SETTING_KEYS``.

    Raises:
        ValueError: for the retired ``siliconflow`` section, a non-list
            ``tools`` value, a non-mapping section, an unknown key inside a
            known section, or unknown flat keys after normalization.
    """
    flat: Dict[str, Any] = {}
    for section, payload in raw.items():
        if section == "siliconflow":
            raise ValueError(
                "Section 'siliconflow' is no longer supported. "
                "Move provider-specific fields into agent.llm / agent.asr / agent.tts."
            )
        if section == "tools":
            if not isinstance(payload, list):
                raise ValueError("Agent config key 'tools' must be a list")
            flat["tools"] = payload
        elif section not in _AGENT_SECTION_KEY_MAP:
            # Already-flat key: keep it and let the final check vet it.
            flat[section] = payload
        else:
            if not isinstance(payload, dict):
                raise ValueError(f"Agent config section '{section}' must be a mapping")
            field_names = _AGENT_SECTION_KEY_MAP[section]
            for option, option_value in payload.items():
                target = field_names.get(option)
                if target is None:
                    raise ValueError(f"Unknown key in '{section}' section: '{option}'")
                flat[target] = option_value

    unexpected = sorted(set(flat) - _AGENT_SETTING_KEYS)
    if unexpected:
        raise ValueError(
            "Unknown agent config keys in YAML: "
            + ", ".join(unexpected)
        )
    return flat
def _missing_required_keys(overrides: Dict[str, Any]) -> List[str]:
    """Return the sorted names of required settings that are absent or blank.

    Checks the provider-independent required set first, treats blank
    strings as missing for the text-valued settings, then adds the
    credential/endpoint/model settings each selected provider needs.
    """
    absent = {name for name in _BASE_REQUIRED_AGENT_SETTING_KEYS if name not in overrides}

    text_fields = (
        "vad_type",
        "vad_model_path",
        "llm_provider",
        "llm_model",
        "tts_provider",
        "tts_voice",
        "asr_provider",
        "duplex_system_prompt",
    )
    absent.update(
        name for name in text_fields if name in overrides and _is_blank(overrides.get(name))
    )

    def _require(*names: str) -> None:
        # A key that is not present reads back as None, which counts as blank.
        absent.update(name for name in names if _is_blank(overrides.get(name)))

    llm = _normalized_provider(overrides, "llm_provider", "openai")
    if llm == "openai" or llm in _OPENAI_COMPATIBLE_LLM_PROVIDERS:
        _require("llm_api_key")

    tts = _normalized_provider(overrides, "tts_provider", "openai_compatible")
    if tts in _OPENAI_COMPATIBLE_TTS_PROVIDERS:
        _require("tts_api_key", "tts_api_url", "tts_model")
    elif tts in _DASHSCOPE_TTS_PROVIDERS:
        _require("tts_api_key")

    asr = _normalized_provider(overrides, "asr_provider", "openai_compatible")
    if asr in _OPENAI_COMPATIBLE_ASR_PROVIDERS:
        _require("asr_api_key", "asr_api_url", "asr_model")

    return sorted(absent)
def _load_agent_overrides(selection: AgentConfigSelection) -> Dict[str, Any]:
    """Read the selected agent YAML and turn it into flat setting overrides.

    The document (or its ``agent`` sub-mapping, when present) has its
    environment placeholders expanded, is normalized to flat setting names
    and checked for required settings. The resolved file path and the
    selection source are added under ``agent_config_path`` and
    ``agent_config_source``.

    Raises:
        RuntimeError: when PyYAML is not installed.
        ValueError: when the document or its ``agent`` entry is not a
            mapping, or required settings are missing.
    """
    if yaml is None:
        raise RuntimeError(
            "PyYAML is required for agent YAML configuration. Install with: pip install pyyaml"
        )

    config_file = selection.path
    with config_file.open("r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle) or {}
    if not isinstance(document, dict):
        raise ValueError(f"Agent config must be a YAML mapping: {selection.path}")

    if "agent" in document:
        document = document["agent"]
        if not isinstance(document, dict):
            raise ValueError("The 'agent' key in YAML must be a mapping")

    overrides = _normalize_agent_overrides(_resolve_env_refs(document))
    gaps = _missing_required_keys(overrides)
    if gaps:
        raise ValueError(
            f"Missing required agent settings in YAML ({selection.path}): "
            + ", ".join(gaps)
        )

    overrides.update(
        agent_config_path=str(selection.path),
        agent_config_source=selection.source,
    )
    return overrides
def load_settings(
    agent_config_path: Optional[str] = None,
    agent_profile: Optional[str] = None,
    argv: Optional[List[str]] = None,
) -> "Settings":
    """Build ``Settings`` with the selected agent YAML applied as overrides.

    The agent config is located via ``_resolve_agent_selection`` (which
    raises when no usable file is found), loaded via
    ``_load_agent_overrides``, and passed to ``Settings`` as keyword
    arguments.
    """
    chosen = _resolve_agent_selection(
        agent_config_path=agent_config_path,
        agent_profile=agent_profile,
        argv=argv,
    )
    return Settings(**_load_agent_overrides(chosen))
_prime_process_env_from_dotenv()
class Settings(BaseSettings):
@@ -402,9 +62,8 @@ class Settings(BaseSettings):
# LLM Configuration
llm_provider: str = Field(
default="openai",
description="LLM provider (openai, openai_compatible, siliconflow)"
description="LLM provider (openai, openai_compatible, siliconflow, fastgpt)"
)
llm_api_key: Optional[str] = Field(default=None, description="LLM provider API key")
llm_api_url: Optional[str] = Field(default=None, description="LLM provider API base URL")
llm_model: str = Field(default="gpt-4o-mini", description="LLM model name")
llm_temperature: float = Field(default=0.7, description="LLM temperature for response generation")
@@ -412,12 +71,15 @@ class Settings(BaseSettings):
# TTS Configuration
tts_provider: str = Field(
default="openai_compatible",
description="TTS provider (edge, openai_compatible, siliconflow, dashscope)"
description="TTS provider (openai_compatible, siliconflow, dashscope, volcengine)"
)
tts_api_key: Optional[str] = Field(default=None, description="TTS provider API key")
tts_api_url: Optional[str] = Field(default=None, description="TTS provider API URL")
tts_model: Optional[str] = Field(default=None, description="TTS model name")
tts_voice: str = Field(default="anna", description="TTS voice name")
tts_app_id: Optional[str] = Field(default=None, description="Provider-specific TTS app ID")
tts_resource_id: Optional[str] = Field(default=None, description="Provider-specific TTS resource ID")
tts_cluster: Optional[str] = Field(default=None, description="Provider-specific TTS cluster")
tts_uid: Optional[str] = Field(default=None, description="Provider-specific TTS user ID")
tts_mode: str = Field(
default="commit",
description="DashScope-only TTS mode (commit, server_commit). Ignored for non-dashscope providers."
@@ -427,11 +89,19 @@ class Settings(BaseSettings):
# ASR Configuration
asr_provider: str = Field(
default="openai_compatible",
description="ASR provider (openai_compatible, buffered, siliconflow)"
description="ASR provider (openai_compatible, buffered, siliconflow, dashscope, volcengine)"
)
asr_api_key: Optional[str] = Field(default=None, description="ASR provider API key")
asr_api_url: Optional[str] = Field(default=None, description="ASR provider API URL")
asr_model: Optional[str] = Field(default=None, description="ASR model name")
asr_app_id: Optional[str] = Field(default=None, description="Provider-specific ASR app ID")
asr_resource_id: Optional[str] = Field(default=None, description="Provider-specific ASR resource ID")
asr_cluster: Optional[str] = Field(default=None, description="Provider-specific ASR cluster")
asr_uid: Optional[str] = Field(default=None, description="Provider-specific ASR user ID")
asr_request_params_json: Optional[str] = Field(
default=None,
description="Provider-specific ASR request params as JSON string"
)
asr_enable_interim: bool = Field(default=False, description="Enable interim transcripts for offline ASR")
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
asr_start_min_speech_ms: int = Field(
@@ -505,6 +175,10 @@ class Settings(BaseSettings):
)
backend_url: Optional[str] = Field(default=None, description="Backend API base URL (e.g. http://localhost:8787)")
backend_timeout_sec: int = Field(default=10, description="Backend API request timeout in seconds")
assistant_local_config_dir: str = Field(
default="engine/config/agents",
description="Directory containing local assistant runtime YAML files"
)
history_enabled: bool = Field(default=True, description="Enable history write bridge")
history_default_user_id: int = Field(default=1, description="Fallback user_id for history records")
history_queue_max_size: int = Field(default=256, description="Max buffered transcript writes per session")
@@ -515,10 +189,6 @@ class Settings(BaseSettings):
description="Max wait before finalizing history when queue is still draining"
)
# Agent YAML metadata
agent_config_path: Optional[str] = Field(default=None, description="Resolved agent YAML path")
agent_config_source: str = Field(default="none", description="How the agent YAML was selected")
@property
def chunk_size_bytes(self) -> int:
"""Calculate chunk size in bytes based on sample rate and duration."""
@@ -543,7 +213,7 @@ class Settings(BaseSettings):
# Global settings instance
settings = load_settings()
settings = Settings()
def get_settings() -> Settings:

View File

@@ -20,16 +20,28 @@ except ImportError:
logger.warning("aiortc not available - WebRTC endpoint will be disabled")
from app.config import settings
from app.backend_adapters import build_backend_adapter_from_settings
from core.transports import SocketTransport, WebRtcTransport, BaseTransport
from core.session import Session
from adapters.control_plane.backend import build_backend_adapter_from_settings
from runtime.transports import SocketTransport, WebRtcTransport, BaseTransport
from runtime.session.manager import Session
from processors.tracks import Resampled16kTrack
from core.events import get_event_bus, reset_event_bus
from runtime.events import get_event_bus, reset_event_bus
# Check interval for heartbeat/timeout (seconds)
_HEARTBEAT_CHECK_INTERVAL_SEC = 5
def _inactivity_deadline(
*,
last_received_at: float,
inactivity_timeout_sec: int,
pending_client_tool_deadline: Optional[float] = None,
) -> float:
deadline = float(last_received_at) + float(inactivity_timeout_sec)
if pending_client_tool_deadline is not None:
deadline = max(deadline, float(pending_client_tool_deadline))
return deadline
async def heartbeat_and_timeout_task(
transport: BaseTransport,
session: Session,
@@ -48,8 +60,22 @@ async def heartbeat_and_timeout_task(
if transport.is_closed:
break
now = time.monotonic()
if now - last_received_at[0] > inactivity_timeout_sec:
logger.info(f"Session {session_id}: {inactivity_timeout_sec}s no message, closing")
pending_client_tool_deadline = session.pipeline.pending_client_tool_deadline()
idle_deadline = _inactivity_deadline(
last_received_at=last_received_at[0],
inactivity_timeout_sec=inactivity_timeout_sec,
pending_client_tool_deadline=pending_client_tool_deadline,
)
if now > idle_deadline:
if pending_client_tool_deadline is not None and pending_client_tool_deadline >= (
last_received_at[0] + inactivity_timeout_sec
):
logger.info(
"Session {}: no message before pending client tool deadline, closing",
session_id,
)
else:
logger.info(f"Session {session_id}: {inactivity_timeout_sec}s no message, closing")
await session.cleanup()
break
if now - last_heartbeat_at[0] >= heartbeat_interval_sec:
@@ -76,22 +102,39 @@ app.add_middleware(
# Active sessions storage
active_sessions: Dict[str, Session] = {}
backend_gateway = build_backend_adapter_from_settings()
control_plane_gateway = build_backend_adapter_from_settings()
# Configure logging
logger.remove()
logger.add(
"./logs/active_call_{time}.log",
rotation="1 day",
retention="7 days",
level=settings.log_level,
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"
)
logger.add(
lambda msg: print(msg, end=""),
level=settings.log_level,
format="{time:HH:mm:ss} | {level: <8} | {message}"
)
_log_format = str(settings.log_format or "text").strip().lower()
if _log_format == "json":
logger.add(
"./logs/active_call_{time}.log",
rotation="1 day",
retention="7 days",
level=settings.log_level,
serialize=True,
format="{message}",
)
logger.add(
lambda msg: print(msg, end=""),
level=settings.log_level,
serialize=True,
format="{message}",
)
else:
logger.add(
"./logs/active_call_{time}.log",
rotation="1 day",
retention="7 days",
level=settings.log_level,
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
)
logger.add(
lambda msg: print(msg, end=""),
level=settings.log_level,
format="{time:HH:mm:ss} | {level: <8} | {message}",
)
@app.get("/health")
@@ -170,7 +213,7 @@ async def websocket_endpoint(websocket: WebSocket):
session = Session(
session_id,
transport,
backend_gateway=backend_gateway,
control_plane_gateway=control_plane_gateway,
assistant_id=assistant_id,
)
active_sessions[session_id] = session
@@ -255,7 +298,7 @@ async def webrtc_endpoint(websocket: WebSocket):
session = Session(
session_id,
transport,
backend_gateway=backend_gateway,
control_plane_gateway=control_plane_gateway,
assistant_id=assistant_id,
)
active_sessions[session_id] = session
@@ -371,12 +414,10 @@ async def startup_event():
logger.info(f"Server: {settings.host}:{settings.port}")
logger.info(f"Sample rate: {settings.sample_rate} Hz")
logger.info(f"VAD model: {settings.vad_model_path}")
if settings.agent_config_path:
logger.info(
f"Agent config loaded ({settings.agent_config_source}): {settings.agent_config_path}"
)
else:
logger.info("Agent config: none (using .env/default agent values)")
logger.info(
"Assistant runtime config source: backend when BACKEND_URL is set, "
"otherwise local YAML by assistant_id from ASSISTANT_LOCAL_CONFIG_DIR"
)
@app.on_event("shutdown")

View File

@@ -0,0 +1,47 @@
# Agent behavior configuration for DashScope realtime ASR/TTS.
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
vad:
type: silero
model_path: data/vad/silero_vad.onnx
threshold: 0.5
min_speech_duration_ms: 100
eou_threshold_ms: 800
llm:
# provider: openai | openai_compatible | siliconflow
provider: openai_compatible
model: deepseek-v3
temperature: 0.7
api_key: your_llm_api_key
api_url: https://api.qnaigc.com/v1
tts:
provider: dashscope
api_key: your_tts_api_key
api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
model: qwen3-tts-flash-realtime
voice: Cherry
dashscope_mode: commit
speed: 1.0
asr:
provider: dashscope
api_key: your_asr_api_key
api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
model: qwen3-asr-flash-realtime
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160
pre_speech_ms: 240
final_tail_ms: 120
duplex:
enabled: true
system_prompt: 你是一个人工智能助手你用简答语句回答避免使用标点符号和emoji。
barge_in:
min_duration_ms: 200
silence_tolerance_ms: 60

View File

@@ -0,0 +1,47 @@
# Agent behavior configuration for DashScope realtime ASR/TTS.
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
vad:
type: silero
model_path: data/vad/silero_vad.onnx
threshold: 0.5
min_speech_duration_ms: 100
eou_threshold_ms: 800
llm:
# provider: openai | openai_compatible | siliconflow
provider: openai_compatible
model: deepseek-v3
temperature: 0.7
api_key: ${LLM_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
api_url: https://api.qnaigc.com/v1
tts:
provider: dashscope
api_key: ${DASHSCOPE_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
model: qwen3-tts-flash-realtime
voice: Cherry
dashscope_mode: commit
speed: 1.0
asr:
provider: dashscope
api_key: ${DASHSCOPE_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
model: qwen3-asr-flash-realtime
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160
pre_speech_ms: 240
final_tail_ms: 120
duplex:
enabled: true
system_prompt: 你是一个人工智能助手你用简答语句回答避免使用标点符号和emoji。
barge_in:
min_duration_ms: 200
silence_tolerance_ms: 60

View File

@@ -11,7 +11,7 @@ agent:
eou_threshold_ms: 800
llm:
# provider: openai | openai_compatible | siliconflow
# provider: openai | openai_compatible | siliconflow | fastgpt
provider: openai_compatible
model: deepseek-v3
temperature: 0.7
@@ -21,12 +21,17 @@ agent:
api_url: https://api.qnaigc.com/v1
tts:
# provider: edge | openai_compatible | siliconflow | dashscope
# provider: openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-tts-flash-realtime
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
# volcengine defaults (if omitted):
# api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional
# resource_id: seed-tts-2.0
# app_id: your volcengine app key
# api_key: your volcengine access key
provider: openai_compatible
api_key: your_tts_api_key
api_url: https://api.siliconflow.cn/v1/audio/speech
@@ -35,11 +40,26 @@ agent:
speed: 1.0
asr:
# provider: buffered | openai_compatible | siliconflow
# provider: buffered | openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-asr-flash-realtime
# note: dashscope uses streaming ASR mode (chunk-by-chunk).
# volcengine defaults (if omitted):
# api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
# model: bigmodel
# resource_id: volc.bigasr.sauc.duration
# app_id: your volcengine app key
# api_key: your volcengine access key
# request_params:
# end_window_size: 800
# force_to_speech_time: 1000
# note: volcengine uses streaming ASR mode (chunk-by-chunk).
provider: openai_compatible
api_key: your_asr_api_key
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
model: FunAudioLLM/SenseVoiceSmall
enable_interim: false
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160
@@ -53,3 +73,4 @@ agent:
barge_in:
min_duration_ms: 200
silence_tolerance_ms: 60

View File

@@ -18,12 +18,17 @@ agent:
api_url: https://api.qnaigc.com/v1
tts:
# provider: edge | openai_compatible | siliconflow | dashscope
# provider: openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-tts-flash-realtime
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
# volcengine defaults (if omitted):
# api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional
# resource_id: seed-tts-2.0
# app_id: your volcengine app key
# api_key: your volcengine access key
provider: openai_compatible
api_key: your_tts_api_key
api_url: https://api.siliconflow.cn/v1/audio/speech
@@ -32,11 +37,26 @@ agent:
speed: 1.0
asr:
# provider: buffered | openai_compatible | siliconflow
# provider: buffered | openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-asr-flash-realtime
# note: dashscope uses streaming ASR mode (chunk-by-chunk).
# volcengine defaults (if omitted):
# api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
# model: bigmodel
# resource_id: volc.bigasr.sauc.duration
# app_id: your volcengine app key
# api_key: your volcengine access key
# request_params:
# end_window_size: 800
# force_to_speech_time: 1000
# note: volcengine uses streaming ASR mode (chunk-by-chunk).
provider: openai_compatible
api_key: your_asr_api_key
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
model: FunAudioLLM/SenseVoiceSmall
enable_interim: false
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160

View File

@@ -0,0 +1,68 @@
# Agent behavior configuration (safe to edit per profile)
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
vad:
type: silero
model_path: data/vad/silero_vad.onnx
threshold: 0.5
min_speech_duration_ms: 100
eou_threshold_ms: 800
llm:
# provider: openai | openai_compatible | siliconflow
provider: openai_compatible
model: deepseek-v3
temperature: 0.7
# Required: no fallback. You can still reference env explicitly.
api_key: your_llm_api_key
# Optional for OpenAI-compatible endpoints:
api_url: https://api.qnaigc.com/v1
tts:
# provider: openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-tts-flash-realtime
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
# volcengine settings (set explicitly below):
provider: volcengine
api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional
resource_id: seed-tts-2.0
app_id: your_tts_app_id
api_key: your_tts_api_key
speed: 1.1
voice: zh_female_vv_uranus_bigtts
asr:
provider: volcengine
api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
app_id: your_asr_app_id
api_key: your_asr_api_key
resource_id: volc.bigasr.sauc.duration
uid: caller-1
model: bigmodel
request_params:
end_window_size: 800
force_to_speech_time: 1000
enable_punc: true
enable_itn: false
enable_ddc: false
show_utterance: true
result_type: single
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160
pre_speech_ms: 240
final_tail_ms: 120
duplex:
enabled: true
system_prompt: 你是一个人工智能助手你用简答语句回答避免使用标点符号和emoji。
barge_in:
min_duration_ms: 200
silence_tolerance_ms: 60

View File

@@ -0,0 +1,67 @@
# Agent behavior configuration (safe to edit per profile)
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
vad:
type: silero
model_path: data/vad/silero_vad.onnx
threshold: 0.5
min_speech_duration_ms: 100
eou_threshold_ms: 800
llm:
# provider: openai | openai_compatible | siliconflow
provider: openai_compatible
model: deepseek-v3
temperature: 0.7
# Required: no fallback. You can still reference env explicitly.
api_key: ${LLM_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
# Optional for OpenAI-compatible endpoints:
api_url: https://api.qnaigc.com/v1
tts:
# provider: openai_compatible | siliconflow | dashscope | volcengine
# dashscope defaults (if omitted):
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
# model: qwen3-tts-flash-realtime
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
# volcengine settings (set explicitly below):
provider: volcengine
api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional
resource_id: seed-tts-2.0
app_id: 2931820332
api_key: ${VOLCENGINE_TTS_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
speed: 1.1
voice: zh_female_vv_uranus_bigtts
asr:
provider: volcengine
api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
app_id: 8607675070
api_key: ${VOLCENGINE_ASR_API_KEY}  # supply via environment; never commit real keys (rotate the exposed one)
resource_id: volc.bigasr.sauc.duration
uid: caller-1
model: bigmodel
request_params:
end_window_size: 800
force_to_speech_time: 1000
enable_punc: true
enable_itn: false
enable_ddc: false
show_utterance: true
result_type: single
interim_interval_ms: 500
min_audio_ms: 300
start_min_speech_ms: 160
pre_speech_ms: 240
final_tail_ms: 120
duplex:
enabled: true
system_prompt: 你是一个人工智能助手你用简答语句回答避免使用标点符号和emoji。
barge_in:
min_duration_ms: 200
silence_tolerance_ms: 60

View File

@@ -1,20 +0,0 @@
"""Core Components Package"""
from core.events import EventBus, get_event_bus
from core.transports import BaseTransport, SocketTransport, WebRtcTransport
from core.session import Session
from core.conversation import ConversationManager, ConversationState, ConversationTurn
from core.duplex_pipeline import DuplexPipeline
__all__ = [
"EventBus",
"get_event_bus",
"BaseTransport",
"SocketTransport",
"WebRtcTransport",
"Session",
"ConversationManager",
"ConversationState",
"ConversationTurn",
"DuplexPipeline",
]

View File

@@ -1,17 +0,0 @@
"""Port interfaces for engine-side integration boundaries."""
from core.ports.backend import (
AssistantConfigProvider,
BackendGateway,
HistoryWriter,
KnowledgeSearcher,
ToolResourceResolver,
)
__all__ = [
"AssistantConfigProvider",
"BackendGateway",
"HistoryWriter",
"KnowledgeSearcher",
"ToolResourceResolver",
]

View File

@@ -10,6 +10,7 @@ Configure with environment variables:
- `BACKEND_MODE=auto|http|disabled`
- `BACKEND_URL`
- `BACKEND_TIMEOUT_SEC`
- `ASSISTANT_LOCAL_CONFIG_DIR` (default: `engine/config/agents`)
- `HISTORY_ENABLED=true|false`
Mode behavior:
@@ -18,18 +19,23 @@ Mode behavior:
- `http`: force HTTP backend adapter (falls back to null adapter when URL is missing).
- `disabled`: force null adapter and run engine-only.
Assistant config source behavior:
- If `BACKEND_URL` exists and backend mode is enabled, fetch assistant config from backend.
- If `BACKEND_URL` is missing (or backend mode is disabled), load assistant config from local YAML.
- The `assistant_id` query parameter is still required. When the local YAML source is active it maps to `<ASSISTANT_LOCAL_CONFIG_DIR>/<assistant_id>.yaml` (by default `engine/config/agents/<assistant_id>.yaml`).
## Architecture
- Ports: `core/ports/backend.py`
- Adapters: `app/backend_adapters.py`
- Compatibility wrappers: `app/backend_client.py`
- Ports: `runtime/ports/control_plane.py`
- Adapters: `adapters/control_plane/backend.py`
`Session` and `DuplexPipeline` receive backend capabilities via injected adapter
methods instead of hard-coding backend client imports.
## Async History Writes
Session history persistence is handled by `core/history_bridge.py`.
Session history persistence is handled by `runtime/history/bridge.py`.
Design:

View File

@@ -0,0 +1,47 @@
# Engine Extension Ports (Draft)
This document defines the draft port set used to keep core runtime extensible.
## Port Modules
- `runtime/ports/control_plane.py`
- `AssistantRuntimeConfigProvider`
- `ConversationHistoryStore`
- `KnowledgeRetriever`
- `ToolCatalog`
- `ControlPlaneGateway`
- `runtime/ports/llm.py`
- `LLMServiceSpec`
- `LLMPort`
- optional extensions: `LLMCancellable`, `LLMRuntimeConfigurable`
- `runtime/ports/tts.py`
- `TTSServiceSpec`
- `TTSPort`
- `runtime/ports/asr.py`
- `ASRServiceSpec`
- `ASRPort`
- explicit mode ports: `OfflineASRPort`, `StreamingASRPort`
- `runtime/ports/service_factory.py`
- `RealtimeServiceFactory`
## Adapter Layer
- `providers/factory/default.py` provides `DefaultRealtimeServiceFactory`.
- It maps resolved provider specs to concrete adapters.
- Runtime orchestration (`runtime/pipeline/duplex.py`) depends on the factory port/specs, not concrete provider classes.
## Provider Behavior (Current)
- LLM:
- supported providers: `openai`, `openai_compatible`, `openai-compatible`, `siliconflow`
- fallback: `MockLLMService`
- TTS:
- supported providers: `dashscope`, `volcengine`, `openai_compatible`, `openai-compatible`, `siliconflow`
- fallback: `MockTTSService`
- ASR:
- supported providers: `openai_compatible`, `openai-compatible`, `siliconflow`, `dashscope`, `volcengine`
- fallback: `BufferedASRService`
## Notes
- This is a draft contract set; follow-up work can add explicit capability negotiation and contract-version fields.

View File

@@ -0,0 +1,129 @@
# Engine High-Level Architecture
This document describes the runtime architecture of `engine` for realtime voice/text assistant interactions.
## Goals
- Low-latency duplex interaction (the user can keep speaking while the assistant is responding)
- Clear separation between transport, orchestration, and model/service integrations
- Backend-optional runtime (works with or without external backend)
- Protocol-first interoperability through strict WS v1 control messages
## Top-Level Components
```mermaid
flowchart LR
C[Client\nWeb / Mobile / Device] <-- WS v1 + PCM --> A[FastAPI App\napp/main.py]
A --> S[Session\nruntime/session/manager.py]
S --> D[Duplex Pipeline\nruntime/pipeline/duplex.py]
D --> P[Processors\nVAD / EOU / Tracks]
D --> R[Workflow Runner\nworkflow/runner.py]
D --> E[Event Bus + Models\nruntime/events.py + protocol/ws_v1/*]
R --> SV[Service Layer\nproviders/asr/*\nproviders/llm/*\nproviders/tts/*]
R --> TE[Tool Executor\ntools/executor.py]
S --> HB[History Bridge\nruntime/history/bridge.py]
S --> BA[Control Plane Port\nruntime/ports/control_plane.py]
BA --> AD[Adapters\nadapters/control_plane/backend.py]
AD --> B[(External Backend API\noptional)]
SV --> M[(ASR/LLM/TTS Providers)]
```
## Request Lifecycle (Simplified)
1. Client connects to `/ws?assistant_id=<id>` and sends `session.start`.
2. App creates a `Session` with resolved assistant config (backend or local YAML).
3. Binary PCM frames enter the duplex pipeline.
4. `VAD`/`EOU` processors detect speech segments and trigger ASR finalization.
5. ASR text is routed into workflow + LLM generation.
6. Optional tool calls are executed (either server-side, or client-side with the result returned by the client).
7. LLM output streams as text deltas; TTS produces audio chunks for playback.
8. Session emits structured events (`transcript.*`, `assistant.*`, `output.audio.*`, `error`).
9. History bridge persists conversation data asynchronously.
10. On `session.stop` (or disconnect), session finalizes and drains pending writes.
## Layering and Responsibilities
### 1) Transport / API Layer
- Entry point: `app/main.py`
- Responsibilities:
- WebSocket lifecycle management
- WS v1 message validation and order guarantees
- Session creation and teardown
- Converting raw WS frames into internal events
### 2) Session + Orchestration Layer
- Core: `runtime/session/manager.py`, `runtime/pipeline/duplex.py`, `runtime/conversation.py`
- Responsibilities:
- Per-session state machine
- Turn boundaries and interruption/cancel handling
- Event sequencing (`seq`) and envelope consistency
- Bridging input/output tracks (`audio_in`, `audio_out`, `control`)
### 3) Processing Layer
- Modules: `processors/vad.py`, `processors/eou.py`, `processors/tracks.py`
- Responsibilities:
- Speech activity detection
- End-of-utterance decision making
- Track-oriented routing and timing-sensitive pre/post processing
### 4) Workflow + Tooling Layer
- Modules: `workflow/runner.py`, `tools/executor.py`
- Responsibilities:
- Assistant workflow execution
- Tool call planning/execution and timeout handling
- Tool result normalization into protocol events
### 5) Service Integration Layer
- Modules: `providers/*`
- Responsibilities:
- Abstracting ASR/LLM/TTS provider differences
- Streaming token/audio adaptation
- Provider-specific adapters (OpenAI-compatible, DashScope, SiliconFlow, etc.)
### 6) Backend Integration Layer (Optional)
- Port: `runtime/ports/control_plane.py`
- Adapters: `adapters/control_plane/backend.py`
- Responsibilities:
- Fetching assistant runtime config
- Persisting call/session metadata and history
- Supporting `BACKEND_MODE=auto|http|disabled`
### 7) Persistence / Reliability Layer
- Module: `runtime/history/bridge.py`
- Responsibilities:
- Non-blocking queue-based history writes
- Retry with backoff on backend failures
- Best-effort drain on session finalize
## Key Design Principles
- Dependency inversion for backend: session/pipeline depend on port interfaces, not concrete clients.
- Streaming-first: text/audio are emitted incrementally to minimize perceived latency.
- Fail-soft behavior: backend/history failures should not block realtime interaction paths.
- Protocol strictness: WS v1 rejects malformed/out-of-order control traffic early.
- Explicit event model: all client-observable state changes are represented as typed events.
## Configuration Boundaries
- Runtime environment settings live in `app/config.py`.
- Assistant-specific behavior is loaded by `assistant_id`:
- backend mode: from backend API
- engine-only mode: local `engine/config/agents/<assistant_id>.yaml`
- Client-provided `metadata.overrides` and `dynamicVariables` can alter runtime behavior within protocol constraints.
## Related Docs
- WS protocol: `engine/docs/ws_v1_schema.md`
- Backend integration details: `engine/docs/backend_integration.md`
- Duplex interaction diagram: `engine/docs/duplex_interaction.svg`

View File

@@ -0,0 +1,21 @@
# Canonical Module Layout
This MVP uses a single canonical module layout without legacy import shims.
## Runtime and protocol
- `protocol.ws_v1.schema`
- `runtime.session.manager`
- `runtime.pipeline.duplex`
- `runtime.history.bridge`
- `runtime.events`
- `runtime.transports`
- `runtime.conversation`
- `runtime.ports.*`
## Integrations and orchestration
- `providers.*`
- `adapters.control_plane.backend`
- `workflow.runner`
- `tools.executor`

View File

@@ -7,9 +7,9 @@
- 握手顺序、状态机、错误语义与实现细节。
实现对照来源:
- `models/ws_v1.py`
- `core/session.py`
- `core/duplex_pipeline.py`
- `protocol/ws_v1/schema.py`
- `runtime/session/manager.py`
- `runtime/pipeline/duplex.py`
- `app/main.py`
---

View File

@@ -3,13 +3,15 @@
WAV file client for testing duplex voice conversation.
This client reads audio from a WAV file, sends it to the server,
and saves the AI's voice response to an output WAV file.
and saves a stereo WAV file with the input audio on the left channel
and the AI's voice response on the right channel.
Usage:
python examples/wav_client.py --input input.wav --output response.wav
python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
Requirements:
pip install soundfile websockets numpy
"""
@@ -45,20 +47,20 @@ except ImportError:
class WavFileClient:
"""
WAV file client for voice conversation testing.
Features:
- Read audio from WAV file
- Send audio to WebSocket server
- Receive and save response audio
- Receive and save stereo conversation audio
- Event logging
"""
def __init__(
self,
url: str,
input_file: str,
output_file: str,
assistant_id: str = "assistant_demo",
assistant_id: str = "default",
channel: str = "wav_client",
sample_rate: int = 16000,
chunk_duration_ms: int = 20,
@@ -69,7 +71,7 @@ class WavFileClient:
):
"""
Initialize WAV file client.
Args:
url: WebSocket server URL
input_file: Input WAV file path
@@ -92,48 +94,51 @@ class WavFileClient:
self.track_debug = track_debug
self.tail_silence_ms = max(0, int(tail_silence_ms))
self.frame_bytes = 640 # 16k mono pcm_s16le, 20ms
# WebSocket connection
self.ws = None
self.running = False
# Audio buffers
self.input_audio = np.array([], dtype=np.int16)
self.received_audio = bytearray()
self.output_segments: list[dict[str, object]] = []
self.current_output_segment: bytearray | None = None
# Statistics
self.bytes_sent = 0
self.bytes_received = 0
# TTFB tracking (per response)
self.send_start_time = None
self.response_start_time = None # set on each trackStart
self.response_start_time = None # set on each output.audio.start
self.waiting_for_first_audio = False
self.ttfb_ms = None # last TTFB for summary
self.ttfb_list = [] # TTFB for each response
# State tracking
self.track_started = False
self.track_ended = False
self.send_completed = False
self.session_ready = False
# Events log
self.events_log = []
def log_event(self, direction: str, message: str):
def log_event(self, direction: str, message: str) -> None:
"""Log an event with timestamp."""
timestamp = time.time()
self.events_log.append({
"timestamp": timestamp,
"direction": direction,
"message": message
})
# Handle encoding errors on Windows
self.events_log.append(
{
"timestamp": timestamp,
"direction": direction,
"message": message,
}
)
try:
print(f"{direction} {message}")
except UnicodeEncodeError:
# Replace problematic characters for console output
safe_message = message.encode('ascii', errors='replace').decode('ascii')
safe_message = message.encode("ascii", errors="replace").decode("ascii")
print(f"{direction} {safe_message}")
@staticmethod
@@ -152,119 +157,160 @@ class WavFileClient:
query = dict(parse_qsl(parts.query, keep_blank_values=True))
query["assistant_id"] = self.assistant_id
return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment))
def _current_timeline_sample(self) -> int:
"""Return current sample position relative to input send start."""
if self.send_start_time is None:
return 0
elapsed_seconds = max(0.0, time.time() - self.send_start_time)
return int(round(elapsed_seconds * self.sample_rate))
def _start_output_segment(self) -> None:
"""Create a new assistant-audio segment if one is not active."""
if self.current_output_segment is not None:
return
self.current_output_segment = bytearray()
self.output_segments.append(
{
"start_sample": self._current_timeline_sample(),
"audio": self.current_output_segment,
}
)
def _close_output_segment(self) -> None:
"""Close the active assistant-audio segment, if any."""
self.current_output_segment = None
def _build_input_track(self) -> np.ndarray:
"""Build the saved left channel using the streamed input audio."""
input_track = self.input_audio.astype(np.int16, copy=True)
tail_samples = int(round(self.sample_rate * self.tail_silence_ms / 1000.0))
if tail_samples <= 0:
return input_track
if input_track.size == 0:
return np.zeros(tail_samples, dtype=np.int16)
return np.concatenate((input_track, np.zeros(tail_samples, dtype=np.int16)))
def _build_output_track(self) -> np.ndarray:
"""Build the saved right channel using received assistant audio."""
if not self.output_segments:
return np.zeros(0, dtype=np.int16)
total_samples = max(
int(segment["start_sample"]) + (len(segment["audio"]) // 2)
for segment in self.output_segments
)
mixed_track = np.zeros(total_samples, dtype=np.int32)
for segment in self.output_segments:
start_sample = int(segment["start_sample"])
segment_audio = np.frombuffer(bytes(segment["audio"]), dtype=np.int16).astype(np.int32)
if segment_audio.size == 0:
continue
end_sample = start_sample + segment_audio.size
mixed_track[start_sample:end_sample] += segment_audio
np.clip(mixed_track, -32768, 32767, out=mixed_track)
return mixed_track.astype(np.int16)
async def connect(self) -> None:
"""Connect to WebSocket server."""
session_url = self._session_url()
self.log_event("", f"Connecting to {session_url}...")
self.log_event("->", f"Connecting to {session_url}...")
self.ws = await websockets.connect(session_url)
self.running = True
self.log_event("", "Connected!")
self.log_event("->", "Connected!")
await self.send_command(
{
"type": "session.start",
"audio": {
"encoding": "pcm_s16le",
"sample_rate_hz": self.sample_rate,
"channels": 1,
},
"metadata": {
"channel": self.channel,
"source": "wav_client",
},
}
)
await self.send_command({
"type": "session.start",
"audio": {
"encoding": "pcm_s16le",
"sample_rate_hz": self.sample_rate,
"channels": 1
},
"metadata": {
"channel": self.channel,
"source": "wav_client",
},
})
async def send_command(self, cmd: dict) -> None:
"""Send JSON command to server."""
if self.ws:
await self.ws.send(json.dumps(cmd))
self.log_event("", f"Command: {cmd.get('type', 'unknown')}")
self.log_event("->", f"Command: {cmd.get('type', 'unknown')}")
async def send_hangup(self, reason: str = "Session complete") -> None:
"""Send hangup command."""
await self.send_command({
"type": "session.stop",
"reason": reason
})
await self.send_command({"type": "session.stop", "reason": reason})
def load_wav_file(self) -> tuple[np.ndarray, int]:
"""
Load and prepare WAV file for sending.
Returns:
Tuple of (audio_data as int16 numpy array, original sample rate)
"""
if not self.input_file.exists():
raise FileNotFoundError(f"Input file not found: {self.input_file}")
# Load audio file
audio_data, file_sample_rate = sf.read(self.input_file)
self.log_event("", f"Loaded: {self.input_file}")
self.log_event("", f" Original sample rate: {file_sample_rate} Hz")
self.log_event("", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")
# Convert stereo to mono if needed
self.log_event("->", f"Loaded: {self.input_file}")
self.log_event("->", f" Original sample rate: {file_sample_rate} Hz")
self.log_event("->", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")
if len(audio_data.shape) > 1:
audio_data = audio_data.mean(axis=1)
self.log_event("", " Converted stereo to mono")
# Resample if needed
self.log_event("->", " Converted stereo to mono")
if file_sample_rate != self.sample_rate:
# Simple resampling using numpy
duration = len(audio_data) / file_sample_rate
num_samples = int(duration * self.sample_rate)
indices = np.linspace(0, len(audio_data) - 1, num_samples)
audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
self.log_event("", f" Resampled to {self.sample_rate} Hz")
# Convert to int16
self.log_event("->", f" Resampled to {self.sample_rate} Hz")
if audio_data.dtype != np.int16:
# Normalize to [-1, 1] if needed
max_val = np.max(np.abs(audio_data))
if max_val > 1.0:
audio_data = audio_data / max_val
audio_data = (audio_data * 32767).astype(np.int16)
self.log_event("", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
self.log_event("->", f" Prepared: {len(audio_data)} samples ({len(audio_data) / self.sample_rate:.2f}s)")
self.input_audio = audio_data.copy()
return audio_data, file_sample_rate
async def audio_sender(self, audio_data: np.ndarray) -> None:
"""Send audio data to server in chunks."""
total_samples = len(audio_data)
chunk_size = self.chunk_samples
sent_samples = 0
self.send_start_time = time.time()
self.log_event("", f"Starting audio transmission ({total_samples} samples)...")
self.log_event("->", f"Starting audio transmission ({total_samples} samples)...")
while sent_samples < total_samples and self.running:
# Get next chunk
end_sample = min(sent_samples + chunk_size, total_samples)
chunk = audio_data[sent_samples:end_sample]
chunk_bytes = chunk.tobytes()
if len(chunk_bytes) % self.frame_bytes != 0:
# v1 audio framing requires 640-byte (20ms) PCM units.
pad = self.frame_bytes - (len(chunk_bytes) % self.frame_bytes)
chunk_bytes += b"\x00" * pad
# Send to server
if self.ws:
await self.ws.send(chunk_bytes)
self.bytes_sent += len(chunk_bytes)
sent_samples = end_sample
# Progress logging (every 500ms worth of audio)
if self.verbose and sent_samples % (self.sample_rate // 2) == 0:
progress = (sent_samples / total_samples) * 100
print(f" Sending: {progress:.0f}%", end="\r")
# Delay to simulate real-time streaming
# Server expects audio at real-time pace for VAD/ASR to work properly
await asyncio.sleep(self.chunk_duration_ms / 1000)
# Add a short silence tail to help VAD/EOU close the final utterance.
if self.tail_silence_ms > 0 and self.ws:
tail_frames = max(1, self.tail_silence_ms // 20)
silence = b"\x00" * self.frame_bytes
@@ -272,56 +318,53 @@ class WavFileClient:
await self.ws.send(silence)
self.bytes_sent += len(silence)
await asyncio.sleep(0.02)
self.log_event("", f"Sent trailing silence: {self.tail_silence_ms}ms")
self.log_event("->", f"Sent trailing silence: {self.tail_silence_ms}ms")
self.send_completed = True
elapsed = time.time() - self.send_start_time
self.log_event("", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")
self.log_event("->", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent / 1024:.1f} KB)")
async def receiver(self) -> None:
"""Receive messages from server."""
try:
while self.running:
try:
message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
if isinstance(message, bytes):
# Audio data received
self.bytes_received += len(message)
self.received_audio.extend(message)
# Calculate TTFB on first audio of each response
self._start_output_segment()
self.current_output_segment.extend(message)
if self.waiting_for_first_audio and self.response_start_time is not None:
ttfb_ms = (time.time() - self.response_start_time) * 1000
self.ttfb_ms = ttfb_ms
self.ttfb_list.append(ttfb_ms)
self.waiting_for_first_audio = False
self.log_event("", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")
# Log progress
self.log_event("<-", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")
duration_ms = len(message) / (self.sample_rate * 2) * 1000
total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000
if self.verbose:
print(f" Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
print(f"<- Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
else:
# JSON event
event = json.loads(message)
await self._handle_event(event)
except asyncio.TimeoutError:
continue
except websockets.ConnectionClosed:
self.log_event("", "Connection closed")
self.log_event("<-", "Connection closed")
self.running = False
break
except asyncio.CancelledError:
pass
except Exception as e:
self.log_event("!", f"Receiver error: {e}")
except Exception as exc:
self.log_event("!", f"Receiver error: {exc}")
self.running = False
async def _handle_event(self, event: dict) -> None:
"""Handle incoming event."""
event_type = event.get("type", "unknown")
@@ -331,14 +374,14 @@ class WavFileClient:
if event_type == "session.started":
self.session_ready = True
self.log_event("", f"Session ready!{ids}")
self.log_event("<-", f"Session ready!{ids}")
elif event_type == "config.resolved":
config = event.get("config", {})
self.log_event("", f"Config resolved (output={config.get('output', {})}){ids}")
self.log_event("<-", f"Config resolved (output={config.get('output', {})}){ids}")
elif event_type == "input.speech_started":
self.log_event("", f"Speech detected{ids}")
self.log_event("<-", f"Speech detected{ids}")
elif event_type == "input.speech_stopped":
self.log_event("", f"Silence detected{ids}")
self.log_event("<-", f"Silence detected{ids}")
elif event_type == "transcript.delta":
text = event.get("text", "")
display_text = text[:60] + "..." if len(text) > 60 else text
@@ -346,125 +389,128 @@ class WavFileClient:
elif event_type == "transcript.final":
text = event.get("text", "")
print(" " * 80, end="\r")
self.log_event("", f"You: {text}{ids}")
self.log_event("<-", f"You: {text}{ids}")
elif event_type == "metrics.ttfb":
latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
self.log_event("<-", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "assistant.response.delta":
text = event.get("text", "")
if self.verbose and text:
self.log_event("", f"LLM: {text}{ids}")
self.log_event("<-", f"LLM: {text}{ids}")
elif event_type == "assistant.response.final":
text = event.get("text", "")
if text:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}{ids}")
summary = text[:100] + ("..." if len(text) > 100 else "")
self.log_event("<-", f"LLM Response (final): {summary}{ids}")
elif event_type == "output.audio.start":
self.track_started = True
self.response_start_time = time.time()
self.waiting_for_first_audio = True
self.log_event("", f"Bot started speaking{ids}")
self._close_output_segment()
self.log_event("<-", f"Bot started speaking{ids}")
elif event_type == "output.audio.end":
self.track_ended = True
self.log_event("", f"Bot finished speaking{ids}")
self._close_output_segment()
self.log_event("<-", f"Bot finished speaking{ids}")
elif event_type == "response.interrupted":
self.log_event("", f"Bot interrupted!{ids}")
self._close_output_segment()
self.log_event("<-", f"Bot interrupted!{ids}")
elif event_type == "error":
self.log_event("!", f"Error: {event.get('message')}{ids}")
elif event_type == "session.stopped":
self.log_event("", f"Session stopped: {event.get('reason')}{ids}")
self.log_event("<-", f"Session stopped: {event.get('reason')}{ids}")
self.running = False
else:
self.log_event("", f"Event: {event_type}{ids}")
self.log_event("<-", f"Event: {event_type}{ids}")
def save_output_wav(self) -> None:
"""Save received audio to output WAV file."""
if not self.received_audio:
self.log_event("!", "No audio received to save")
"""Save the conversation to a stereo WAV file."""
input_track = self._build_input_track()
output_track = self._build_output_track()
if input_track.size == 0 and output_track.size == 0:
self.log_event("!", "No audio available to save")
return
# Convert bytes to numpy array
audio_data = np.frombuffer(bytes(self.received_audio), dtype=np.int16)
# Ensure output directory exists
if not self.received_audio:
self.log_event("!", "No assistant audio received; saving silent right channel")
total_samples = max(input_track.size, output_track.size)
if input_track.size < total_samples:
input_track = np.pad(input_track, (0, total_samples - input_track.size))
if output_track.size < total_samples:
output_track = np.pad(output_track, (0, total_samples - output_track.size))
stereo_audio = np.column_stack((input_track, output_track)).astype(np.int16, copy=False)
self.output_file.parent.mkdir(parents=True, exist_ok=True)
# Save using wave module for compatibility
with wave.open(str(self.output_file), 'wb') as wav_file:
wav_file.setnchannels(1)
with wave.open(str(self.output_file), "wb") as wav_file:
wav_file.setnchannels(2)
wav_file.setsampwidth(2) # 16-bit
wav_file.setframerate(self.sample_rate)
wav_file.writeframes(audio_data.tobytes())
duration = len(audio_data) / self.sample_rate
self.log_event("", f"Saved output: {self.output_file}")
self.log_event("", f" Duration: {duration:.2f}s ({len(audio_data)} samples)")
self.log_event("", f" Size: {len(self.received_audio)/1024:.1f} KB")
wav_file.writeframes(stereo_audio.tobytes())
duration = total_samples / self.sample_rate
self.log_event("->", f"Saved stereo output: {self.output_file}")
self.log_event("->", f" Duration: {duration:.2f}s ({total_samples} samples/channel)")
self.log_event("->", " Channels: left=input, right=assistant")
self.log_event("->", f" Size: {stereo_audio.nbytes / 1024:.1f} KB")
async def run(self) -> None:
"""Run the WAV file test."""
try:
# Load input WAV file
audio_data, _ = self.load_wav_file()
# Connect to server
await self.connect()
# Start receiver task
receiver_task = asyncio.create_task(self.receiver())
# Wait for session.started before streaming audio
ready_start = time.time()
while self.running and not self.session_ready:
if time.time() - ready_start > 8.0:
raise TimeoutError("Timeout waiting for session.started")
await asyncio.sleep(0.05)
# Send audio
await self.audio_sender(audio_data)
# Wait for response
self.log_event("", f"Waiting {self.wait_time}s for response...")
self.log_event("->", f"Waiting {self.wait_time}s for response...")
wait_start = time.time()
while self.running and (time.time() - wait_start) < self.wait_time:
# Check if track has ended (response complete)
if self.track_ended and self.send_completed:
# Give a little extra time for any remaining audio
await asyncio.sleep(1.0)
break
await asyncio.sleep(0.1)
# Cleanup
self.running = False
receiver_task.cancel()
try:
await receiver_task
except asyncio.CancelledError:
pass
# Save output
self.save_output_wav()
# Print summary
self._print_summary()
except FileNotFoundError as e:
print(f"Error: {e}")
except FileNotFoundError as exc:
print(f"Error: {exc}")
sys.exit(1)
except ConnectionRefusedError:
print(f"Error: Could not connect to {self.url}")
print("Make sure the server is running.")
sys.exit(1)
except Exception as e:
print(f"Error: {e}")
except Exception as exc:
print(f"Error: {exc}")
import traceback
traceback.print_exc()
sys.exit(1)
finally:
await self.close()
def _print_summary(self):
def _print_summary(self) -> None:
"""Print session summary."""
print("\n" + "=" * 50)
print("Session Summary")
@@ -477,19 +523,20 @@ class WavFileClient:
if len(self.ttfb_list) == 1:
print(f" TTFB: {self.ttfb_list[0]:.0f} ms")
else:
print(f" TTFB (per response): {', '.join(f'{t:.0f}ms' for t in self.ttfb_list)}")
values = ", ".join(f"{ttfb:.0f}ms" for ttfb in self.ttfb_list)
print(f" TTFB (per response): {values}")
if self.received_audio:
duration = len(self.received_audio) / (self.sample_rate * 2)
print(f" Response duration: {duration:.2f}s")
print("=" * 50)
async def close(self) -> None:
"""Close the connection."""
self.running = False
if self.ws:
try:
await self.ws.close()
except:
except Exception:
pass
@@ -498,67 +545,71 @@ async def main():
description="WAV file client for testing duplex voice conversation"
)
parser.add_argument(
"--input", "-i",
"--input",
"-i",
required=True,
help="Input WAV file path"
help="Input WAV file path",
)
parser.add_argument(
"--output", "-o",
"--output",
"-o",
required=True,
help="Output WAV file path for response"
help="Output WAV file path for stereo conversation audio",
)
parser.add_argument(
"--url",
default="ws://localhost:8000/ws",
help="WebSocket server URL (default: ws://localhost:8000/ws)"
help="WebSocket server URL (default: ws://localhost:8000/ws)",
)
parser.add_argument(
"--sample-rate",
type=int,
default=16000,
help="Target sample rate for audio (default: 16000)"
help="Target sample rate for audio (default: 16000)",
)
parser.add_argument(
"--assistant-id",
default="assistant_demo",
help="Assistant identifier used in websocket query parameter"
default="default",
help="Assistant identifier used in websocket query parameter",
)
parser.add_argument(
"--channel",
default="wav_client",
help="Client channel name"
help="Client channel name",
)
parser.add_argument(
"--chunk-duration",
type=int,
default=20,
help="Chunk duration in ms for sending (default: 20)"
help="Chunk duration in ms for sending (default: 20)",
)
parser.add_argument(
"--wait-time", "-w",
"--wait-time",
"-w",
type=float,
default=15.0,
help="Time to wait for response after sending (default: 15.0)"
help="Time to wait for response after sending (default: 15.0)",
)
parser.add_argument(
"--verbose", "-v",
"--verbose",
"-v",
action="store_true",
help="Enable verbose output"
help="Enable verbose output",
)
parser.add_argument(
"--track-debug",
action="store_true",
help="Print event trackId for protocol debugging"
help="Print event trackId for protocol debugging",
)
parser.add_argument(
"--tail-silence-ms",
type=int,
default=800,
help="Trailing silence to send after WAV playback for EOU detection (default: 800)"
help="Trailing silence to send after WAV playback for EOU detection (default: 800)",
)
args = parser.parse_args()
client = WavFileClient(
url=args.url,
input_file=args.input,
@@ -572,7 +623,7 @@ async def main():
track_debug=args.track_debug,
tail_silence_ms=args.tail_silence_ms,
)
await client.run()
@@ -580,4 +631,4 @@ if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
print("\nInterrupted by user")
print("\nInterrupted by user")

View File

@@ -1 +0,0 @@
"""Data Models Package"""

View File

@@ -1,143 +0,0 @@
"""Protocol command models matching the original active-call API."""
from typing import Optional, Dict, Any
from pydantic import BaseModel, Field
class InviteCommand(BaseModel):
    """Invite command to initiate a call."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("invite", description="Command type")
    # Optional free-form dict of call options.
    option: Optional[Dict[str, Any]] = Field(None, description="Call configuration options")
class AcceptCommand(BaseModel):
    """Accept command to accept an incoming call."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("accept", description="Command type")
    # Optional free-form dict of call options.
    option: Optional[Dict[str, Any]] = Field(None, description="Call configuration options")
class RejectCommand(BaseModel):
    """Reject command to reject an incoming call."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("reject", description="Command type")
    # Rejection reason; an empty string when the sender gives none.
    reason: str = Field("", description="Reason for rejection")
    # Optional numeric response code accompanying the rejection.
    code: Optional[int] = Field(None, description="SIP response code")
class RingingCommand(BaseModel):
    """Ringing command to send ringing response."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("ringing", description="Command type")
    # Optional free-form dict describing call recording.
    recorder: Optional[Dict[str, Any]] = Field(None, description="Call recording configuration")
    # Off unless the sender opts in.
    early_media: bool = Field(False, description="Enable early media")
    ringtone: Optional[str] = Field(None, description="Custom ringtone URL")
class TTSCommand(BaseModel):
    """TTS command to convert text to speech."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("tts", description="Command type")
    # Required: the only field of this model without a default.
    text: str = Field(..., description="Text to synthesize")
    speaker: Optional[str] = Field(None, description="Speaker voice name")
    play_id: Optional[str] = Field(None, description="Unique identifier for this TTS session")
    auto_hangup: bool = Field(False, description="Auto hangup after TTS completion")
    # Streaming-input flags; both are off by default.
    streaming: bool = Field(False, description="Streaming text input")
    end_of_stream: bool = Field(False, description="End of streaming input")
    wait_input_timeout: Optional[int] = Field(None, description="Max time to wait for input (seconds)")
    # Optional free-form dict of provider-specific options.
    option: Optional[Dict[str, Any]] = Field(None, description="TTS provider specific options")
class PlayCommand(BaseModel):
    """Play command to play audio from URL."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("play", description="Command type")
    # Required: there is no default audio source.
    url: str = Field(..., description="URL of audio file to play")
    auto_hangup: bool = Field(False, description="Auto hangup after playback")
    wait_input_timeout: Optional[int] = Field(None, description="Max time to wait for input (seconds)")
class InterruptCommand(BaseModel):
    """Interrupt command to interrupt current playback."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("interrupt", description="Command type")
    # Off by default.
    graceful: bool = Field(False, description="Wait for current TTS to complete")
class PauseCommand(BaseModel):
    """Pause command to pause current playback."""

    # The command tag is the only field; the model carries no payload.
    command: str = Field("pause", description="Command type")
class ResumeCommand(BaseModel):
    """Resume command to resume paused playback."""

    # The command tag is the only field; the model carries no payload.
    command: str = Field("resume", description="Command type")
class HangupCommand(BaseModel):
    """Hangup command to end the call."""

    # Command tag; pre-filled with this model's own type name.
    command: str = Field("hangup", description="Command type")
    # Both details are optional and default to None.
    reason: Optional[str] = Field(None, description="Reason for hangup")
    initiator: Optional[str] = Field(None, description="Who initiated the hangup")
class HistoryCommand(BaseModel):
    """Command that appends one entry to the conversation history.

    ``speaker`` and ``text`` are both required.
    """

    command: str = Field(default="history", description="Command type")
    speaker: str = Field(..., description="Speaker identifier")
    text: str = Field(..., description="Conversation text")
class ChatCommand(BaseModel):
    """Command carrying a text message for text-based conversation.

    ``text`` is required.
    """

    command: str = Field(default="chat", description="Command type")
    text: str = Field(..., description="Chat text message")
# Command type mapping
# Registry used by parse_command: wire value of the "command" field -> model class.
COMMAND_TYPES = {
    # Call signalling
    "invite": InviteCommand,
    "accept": AcceptCommand,
    "reject": RejectCommand,
    "ringing": RingingCommand,
    # Playback control
    "tts": TTSCommand,
    "play": PlayCommand,
    "interrupt": InterruptCommand,
    "pause": PauseCommand,
    "resume": ResumeCommand,
    "hangup": HangupCommand,
    # Conversation
    "history": HistoryCommand,
    "chat": ChatCommand,
}
def parse_command(data: Dict[str, Any]) -> BaseModel:
    """Build the command model selected by ``data["command"]``.

    Args:
        data: Decoded JSON object. Its ``command`` entry picks the model class
            from ``COMMAND_TYPES``; the whole mapping is then passed to that
            class as keyword arguments.

    Returns:
        An instance of the registered command model.

    Raises:
        ValueError: If ``command`` is missing/falsy or is not a registered type.
    """
    name = data.get("command")
    if not name:
        raise ValueError("Missing 'command' field")

    model_cls = COMMAND_TYPES.get(name)
    if not model_cls:
        raise ValueError(f"Unknown command type: {name}")

    return model_cls(**data)

View File

@@ -1,126 +0,0 @@
"""Configuration models for call options."""
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, Field
class VADOption(BaseModel):
    """Settings for voice activity detection.

    Every field has a default, so an empty ``VADOption()`` is valid.
    """

    # Detector selection and audio format
    type: str = Field(default="silero", description="VAD algorithm type (silero, webrtc)")
    samplerate: int = Field(default=16000, description="Audio sample rate for VAD")
    # Detection tuning
    speech_padding: int = Field(default=250, description="Speech padding in milliseconds")
    silence_padding: int = Field(default=100, description="Silence padding in milliseconds")
    ratio: float = Field(default=0.5, description="Voice detection ratio threshold")
    voice_threshold: float = Field(default=0.5, description="Voice energy threshold")
    max_buffer_duration_secs: int = Field(default=50, description="Maximum buffer duration in seconds")
    silence_timeout: Optional[int] = Field(default=None, description="Silence timeout in milliseconds")
    # Optional remote VAD service
    endpoint: Optional[str] = Field(default=None, description="Custom VAD service endpoint")
    secret_key: Optional[str] = Field(default=None, description="VAD service secret key")
    secret_id: Optional[str] = Field(default=None, description="VAD service secret ID")
class ASROption(BaseModel):
    """Settings for automatic speech recognition.

    ``provider`` is required; all other fields are optional.
    """

    provider: str = Field(..., description="ASR provider (tencent, aliyun, openai, etc.)")
    language: Optional[str] = Field(default=None, description="Language code (zh-CN, en-US)")
    # Credentials
    app_id: Optional[str] = Field(default=None, description="Application ID")
    secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication")
    secret_key: Optional[str] = Field(default=None, description="Secret key for authentication")
    # Model and audio parameters
    model_type: Optional[str] = Field(default=None, description="ASR model type (16k_zh, 8k_en)")
    buffer_size: Optional[int] = Field(default=None, description="Audio buffer size in bytes")
    samplerate: Optional[int] = Field(default=None, description="Audio sample rate")
    endpoint: Optional[str] = Field(default=None, description="Custom ASR service endpoint")
    extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters")
    start_when_answer: bool = Field(default=False, description="Start ASR when call is answered")
class TTSOption(BaseModel):
    """Settings for text-to-speech synthesis.

    Every field has a default; ``provider`` defaults to "msedge" and
    ``speed`` to 1.0.
    """

    samplerate: Optional[int] = Field(default=None, description="TTS output sample rate")
    provider: str = Field(default="msedge", description="TTS provider (tencent, aliyun, deepgram, msedge)")
    speed: float = Field(default=1.0, description="Speech speed multiplier")
    # Credentials
    app_id: Optional[str] = Field(default=None, description="Application ID")
    secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication")
    secret_key: Optional[str] = Field(default=None, description="Secret key for authentication")
    # Voice rendering
    volume: Optional[int] = Field(default=None, description="Speech volume level (1-10)")
    speaker: Optional[str] = Field(default=None, description="Voice speaker name")
    codec: Optional[str] = Field(default=None, description="Audio codec")
    subtitle: bool = Field(default=False, description="Enable subtitle generation")
    emotion: Optional[str] = Field(default=None, description="Speech emotion")
    # Service plumbing
    endpoint: Optional[str] = Field(default=None, description="Custom TTS service endpoint")
    extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters")
    max_concurrent_tasks: Optional[int] = Field(default=None, description="Max concurrent tasks")
class RecorderOption(BaseModel):
    """Settings for call recording.

    ``recorder_file`` is required; sample rate and packet time have defaults.
    """

    recorder_file: str = Field(..., description="Path to recording file")
    samplerate: int = Field(default=16000, description="Recording sample rate")
    ptime: int = Field(default=200, description="Packet time in milliseconds")
class MediaPassOption(BaseModel):
    """Settings for passing media through to an external audio processor.

    ``url`` is required; the sample rates default to 16000 and the packet
    size to 2560 bytes.
    """

    url: str = Field(..., description="WebSocket URL for media streaming")
    input_sample_rate: int = Field(default=16000, description="Sample rate of audio received from WebSocket")
    output_sample_rate: int = Field(default=16000, description="Sample rate of audio sent to WebSocket")
    packet_size: int = Field(default=2560, description="Packet size in bytes")
    ptime: Optional[int] = Field(default=None, description="Buffered playback period in milliseconds")
class SipOption(BaseModel):
    """Settings for the SIP protocol; every field is optional."""

    username: Optional[str] = Field(default=None, description="SIP username")
    password: Optional[str] = Field(default=None, description="SIP password")
    realm: Optional[str] = Field(default=None, description="SIP realm/domain")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Additional SIP headers")
class HandlerRule(BaseModel):
    """One handler routing rule; every field is optional.

    ``caller``/``callee`` hold patterns and ``playbook``/``webhook`` hold the
    routing targets, as described on each field.
    """

    caller: Optional[str] = Field(default=None, description="Caller pattern (regex)")
    callee: Optional[str] = Field(default=None, description="Callee pattern (regex)")
    playbook: Optional[str] = Field(default=None, description="Playbook file path")
    webhook: Optional[str] = Field(default=None, description="Webhook URL")
class CallOption(BaseModel):
    """Top-level call configuration aggregating all per-component options.

    Every field has a default, so ``CallOption()`` is valid. Component
    sections (recorder, asr, vad, tts, media_pass, sip) are ``None`` unless
    supplied.
    """

    # --- Basic options ---
    denoise: bool = Field(default=False, description="Enable noise reduction")
    offer: Optional[str] = Field(default=None, description="SDP offer string")
    callee: Optional[str] = Field(default=None, description="Callee SIP URI or phone number")
    caller: Optional[str] = Field(default=None, description="Caller SIP URI or phone number")

    # --- Audio codec ---
    codec: str = Field(default="pcm", description="Audio codec (pcm, pcma, pcmu, g722)")

    # --- Component configurations ---
    recorder: Optional[RecorderOption] = Field(default=None, description="Call recording config")
    asr: Optional[ASROption] = Field(default=None, description="ASR configuration")
    vad: Optional[VADOption] = Field(default=None, description="VAD configuration")
    tts: Optional[TTSOption] = Field(default=None, description="TTS configuration")
    media_pass: Optional[MediaPassOption] = Field(default=None, description="Media pass-through config")
    sip: Optional[SipOption] = Field(default=None, description="SIP configuration")

    # --- Timeouts and networking ---
    handshake_timeout: Optional[int] = Field(default=None, description="Handshake timeout in seconds")
    enable_ipv6: bool = Field(default=False, description="Enable IPv6 support")
    inactivity_timeout: Optional[int] = Field(default=None, description="Inactivity timeout in seconds")

    # --- End-of-utterance configuration ---
    eou: Optional[Dict[str, Any]] = Field(default=None, description="End of utterance detection config")

    # --- Extra parameters ---
    extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional custom parameters")

    class Config:
        # Allow population by field name as well as by alias.
        populate_by_name = True

View File

@@ -1,231 +0,0 @@
"""Protocol event models matching the original active-call API."""
from typing import Optional, Dict, Any
from pydantic import BaseModel, Field
from datetime import datetime
def current_timestamp_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    epoch_seconds = datetime.now().timestamp()
    return int(epoch_seconds * 1000)
# Base Event Model
class BaseEvent(BaseModel):
    """Common shape shared by track-scoped events.

    ``event`` and ``track_id`` are required here (subclasses give ``event`` a
    default); ``timestamp`` is filled with the current time in milliseconds
    when omitted.
    """

    event: str = Field(..., description="Event type")
    track_id: str = Field(..., description="Unique track identifier")
    timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp in milliseconds")
# Lifecycle Events
class IncomingEvent(BaseEvent):
    """Event announcing an incoming call (SIP only).

    Optionally carries the caller/callee SIP URIs and the caller's SDP offer.
    """

    event: str = Field(default="incoming", description="Event type")
    caller: Optional[str] = Field(default=None, description="Caller's SIP URI")
    callee: Optional[str] = Field(default=None, description="Callee's SIP URI")
    sdp: Optional[str] = Field(default=None, description="SDP offer from caller")
class AnswerEvent(BaseEvent):
    """Event emitted when the call is answered; may carry the server's SDP answer."""

    event: str = Field(default="answer", description="Event type")
    sdp: Optional[str] = Field(default=None, description="SDP answer from server")
class RejectEvent(BaseEvent):
    """Event emitted when the call is rejected, with optional reason and SIP code."""

    event: str = Field(default="reject", description="Event type")
    reason: Optional[str] = Field(default=None, description="Rejection reason")
    code: Optional[int] = Field(default=None, description="SIP response code")
class RingingEvent(BaseEvent):
    """Event emitted while the call is ringing; flags whether early media is available."""

    event: str = Field(default="ringing", description="Event type")
    early_media: bool = Field(default=False, description="Early media available")
class HangupEvent(BaseModel):
    """Event emitted when the call ends.

    Derives from ``BaseModel`` directly, so it has no ``track_id`` field.
    The caller info is stored on ``from_`` and aliased to ``from`` because
    ``from`` is a Python keyword.
    """

    event: str = Field(default="hangup", description="Event type")
    timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
    reason: Optional[str] = Field(default=None, description="Hangup reason")
    initiator: Optional[str] = Field(default=None, description="Who initiated hangup")
    # Call timeline
    start_time: Optional[str] = Field(default=None, description="Call start time (ISO 8601)")
    hangup_time: Optional[str] = Field(default=None, description="Hangup time (ISO 8601)")
    answer_time: Optional[str] = Field(default=None, description="Answer time (ISO 8601)")
    ringing_time: Optional[str] = Field(default=None, description="Ringing time (ISO 8601)")
    # Parties and metadata
    from_: Optional[Dict[str, Any]] = Field(default=None, alias="from", description="Caller info")
    to: Optional[Dict[str, Any]] = Field(default=None, description="Callee info")
    extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata")

    class Config:
        # Accept both the field name ``from_`` and its alias ``from`` as input.
        populate_by_name = True
# VAD Events
class SpeakingEvent(BaseEvent):
    """Event emitted when speech is detected.

    ``start_time`` defaults to the current time in milliseconds.
    """

    event: str = Field(default="speaking", description="Event type")
    start_time: int = Field(default_factory=current_timestamp_ms, description="Speech start time")
class SilenceEvent(BaseEvent):
    """Event emitted when silence is detected.

    ``start_time`` defaults to the current time in milliseconds and
    ``duration`` to 0.
    """

    event: str = Field(default="silence", description="Event type")
    start_time: int = Field(default_factory=current_timestamp_ms, description="Silence start time")
    duration: int = Field(default=0, description="Silence duration in milliseconds")
# AI/ASR Events
class AsrFinalEvent(BaseEvent):
    """Event carrying a final ASR transcription.

    ``index`` and ``text`` are required; the speech start/end times are optional.
    """

    event: str = Field(default="asrFinal", description="Event type")
    index: int = Field(..., description="ASR result sequence number")
    start_time: Optional[int] = Field(default=None, description="Speech start time")
    end_time: Optional[int] = Field(default=None, description="Speech end time")
    text: str = Field(..., description="Transcribed text")
class AsrDeltaEvent(BaseEvent):
    """Event carrying a partial (streaming) ASR transcription.

    Same fields as ``AsrFinalEvent``; ``text`` holds the partial transcript.
    """

    event: str = Field(default="asrDelta", description="Event type")
    index: int = Field(..., description="ASR result sequence number")
    start_time: Optional[int] = Field(default=None, description="Speech start time")
    end_time: Optional[int] = Field(default=None, description="Speech end time")
    text: str = Field(..., description="Partial transcribed text")
class EouEvent(BaseEvent):
    """Event signalling end-of-utterance detection; ``completed`` defaults to True."""

    event: str = Field(default="eou", description="Event type")
    completed: bool = Field(default=True, description="Whether utterance was completed")
# Audio Track Events
class TrackStartEvent(BaseEvent):
    """Event emitted when an audio track starts; may reference the originating play ID."""

    event: str = Field(default="trackStart", description="Event type")
    play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command")
class TrackEndEvent(BaseEvent):
    """Event emitted when an audio track ends.

    ``duration`` and ``ssrc`` are required; ``play_id`` is optional.
    """

    event: str = Field(default="trackEnd", description="Event type")
    duration: int = Field(..., description="Track duration in milliseconds")
    ssrc: int = Field(..., description="RTP SSRC identifier")
    play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command")
class InterruptionEvent(BaseEvent):
    """Event emitted when playback is interrupted.

    All payload fields are optional and describe what was playing and how far
    playback had progressed.
    """

    event: str = Field(default="interruption", description="Event type")
    play_id: Optional[str] = Field(default=None, description="Play ID that was interrupted")
    subtitle: Optional[str] = Field(default=None, description="TTS text being played")
    position: Optional[int] = Field(default=None, description="Word index position")
    total_duration: Optional[int] = Field(default=None, description="Total TTS duration")
    current: Optional[int] = Field(default=None, description="Elapsed time when interrupted")
# System Events
class ErrorEvent(BaseEvent):
    """Event reporting an error.

    ``sender`` and ``error`` are required; ``code`` is optional.
    """

    event: str = Field(default="error", description="Event type")
    sender: str = Field(..., description="Component that generated the error")
    error: str = Field(..., description="Error message")
    code: Optional[int] = Field(default=None, description="Error code")
class MetricsEvent(BaseModel):
    """Event reporting a performance metric.

    Derives from ``BaseModel`` directly, so it has no ``track_id`` field.
    ``key`` and ``duration`` are required.
    """

    event: str = Field(default="metrics", description="Event type")
    timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
    key: str = Field(..., description="Metric key")
    duration: int = Field(..., description="Duration in milliseconds")
    data: Optional[Dict[str, Any]] = Field(default=None, description="Additional metric data")
class AddHistoryEvent(BaseModel):
    """Event emitted when an entry is added to the conversation history.

    Derives from ``BaseModel`` directly, so it has no ``track_id`` field.
    ``speaker`` and ``text`` are required.
    """

    event: str = Field(default="addHistory", description="Event type")
    timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
    sender: Optional[str] = Field(default=None, description="Component that added history")
    speaker: str = Field(..., description="Speaker identifier")
    text: str = Field(..., description="Conversation text")
class DTMFEvent(BaseEvent):
    """Event emitted when a DTMF tone is detected; ``digit`` is required."""

    event: str = Field(default="dtmf", description="Event type")
    digit: str = Field(..., description="DTMF digit (0-9, *, #, A-D)")
class HeartBeatEvent(BaseModel):
    """Heartbeat sent from server to client to keep the connection alive.

    Derives from ``BaseModel`` directly, so it has no ``track_id`` field.
    """

    event: str = Field(default="heartBeat", description="Event type")
    timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp in milliseconds")
# Event type mapping
# Registry used by create_event: wire value of the "event" field -> model class.
EVENT_TYPES = {
    # Lifecycle
    "incoming": IncomingEvent,
    "answer": AnswerEvent,
    "reject": RejectEvent,
    "ringing": RingingEvent,
    "hangup": HangupEvent,
    # VAD
    "speaking": SpeakingEvent,
    "silence": SilenceEvent,
    # AI/ASR
    "asrFinal": AsrFinalEvent,
    "asrDelta": AsrDeltaEvent,
    "eou": EouEvent,
    # Audio track
    "trackStart": TrackStartEvent,
    "trackEnd": TrackEndEvent,
    "interruption": InterruptionEvent,
    # System
    "error": ErrorEvent,
    "metrics": MetricsEvent,
    "addHistory": AddHistoryEvent,
    "dtmf": DTMFEvent,
    "heartBeat": HeartBeatEvent,
}
def create_event(event_type: str, **kwargs) -> BaseModel:
    """Instantiate the event model registered under ``event_type``.

    Args:
        event_type: Key into ``EVENT_TYPES``; also passed to the model as its
            ``event`` field.
        **kwargs: Remaining event fields, forwarded to the model unchanged.

    Returns:
        An instance of the registered event model.

    Raises:
        ValueError: If ``event_type`` is not a registered event type.
    """
    model_cls = EVENT_TYPES.get(event_type)
    if not model_cls:
        raise ValueError(f"Unknown event type: {event_type}")

    return model_cls(event=event_type, **kwargs)

View File

@@ -0,0 +1 @@
"""Protocol package."""

View File

@@ -0,0 +1 @@
"""WS v1 protocol package."""

View File

@@ -0,0 +1 @@
"""Providers package."""

View File

@@ -0,0 +1,15 @@
"""ASR providers."""
from providers.asr.buffered import BufferedASRService, MockASRService
from providers.asr.dashscope import DashScopeRealtimeASRService
from providers.asr.openai_compatible import OpenAICompatibleASRService, SiliconFlowASRService
from providers.asr.volcengine import VolcengineRealtimeASRService
# Public names re-exported by the ASR providers package.
__all__ = [
    "BufferedASRService",
    "MockASRService",
    "DashScopeRealtimeASRService",
    "OpenAICompatibleASRService",
    "SiliconFlowASRService",
    "VolcengineRealtimeASRService",
]

View File

@@ -9,7 +9,7 @@ import json
from typing import AsyncIterator, Optional
from loguru import logger
from services.base import BaseASRService, ASRResult, ServiceState
from providers.common.base import BaseASRService, ASRResult, ServiceState
# Try to import websockets for streaming ASR
try:
@@ -34,6 +34,7 @@ class BufferedASRService(BaseASRService):
language: str = "en"
):
super().__init__(sample_rate=sample_rate, language=language)
self.mode = "offline"
self._audio_buffer: bytes = b""
self._current_text: str = ""
@@ -86,6 +87,23 @@ class BufferedASRService(BaseASRService):
self._current_text = ""
self._audio_buffer = b""
return text
async def get_final_transcription(self) -> str:
    """Return the result of ``get_and_clear_text()``.

    Offline compatibility method used by DuplexPipeline.
    """
    transcript = self.get_and_clear_text()
    return transcript
def clear_buffer(self) -> None:
    """Reset the accumulated audio bytes and the current text to empty.

    Offline compatibility method used by DuplexPipeline.
    """
    self._audio_buffer, self._current_text = b"", ""
async def start_interim_transcription(self) -> None:
    """Do nothing: plain buffered ASR produces no interim transcriptions."""
    return
async def stop_interim_transcription(self) -> None:
    """Do nothing: plain buffered ASR has no interim transcription to stop."""
    return
def get_audio_buffer(self) -> bytes:
"""Get accumulated audio buffer."""
@@ -103,6 +121,7 @@ class MockASRService(BaseASRService):
def __init__(self, sample_rate: int = 16000, language: str = "en"):
super().__init__(sample_rate=sample_rate, language=language)
self.mode = "offline"
self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
self._mock_texts = [
"Hello, how are you?",
@@ -145,3 +164,18 @@ class MockASRService(BaseASRService):
continue
except asyncio.CancelledError:
break
def clear_buffer(self) -> None:
    """No-op: returns ``None`` without modifying any state."""
    return
async def get_final_transcription(self) -> str:
    """Always return an empty string."""
    no_text = ""
    return no_text
def get_and_clear_text(self) -> str:
    """Always return an empty string; no state is read or cleared."""
    no_text = ""
    return no_text
async def start_interim_transcription(self) -> None:
    """No-op: returns ``None`` without starting anything."""
    return
async def stop_interim_transcription(self) -> None:
    """No-op: returns ``None`` without stopping anything."""
    return

View File

@@ -0,0 +1,388 @@
"""DashScope realtime streaming ASR service.
Uses Qwen-ASR-Realtime via DashScope Python SDK.
"""
from __future__ import annotations
import asyncio
import base64
import json
import os
import sys
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Optional
from loguru import logger
from providers.common.base import ASRResult, BaseASRService, ServiceState
# Optional dependency guard: import the DashScope SDK if possible, otherwise
# define placeholders so this module still imports and connect() can report
# the recorded import error.
try:
    import dashscope
    from dashscope.audio.qwen_omni import (
        MultiModality,
        OmniRealtimeCallback,
        OmniRealtimeConversation,
    )

    # Some SDK builds keep TranscriptionParams under qwen_omni.omni_realtime.
    try:
        from dashscope.audio.qwen_omni import TranscriptionParams
    except ImportError:
        from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams
except Exception as exc:
    DASHSCOPE_SDK_AVAILABLE = False
    DASHSCOPE_IMPORT_ERROR = f"{type(exc).__name__}: {exc}"
    dashscope = None  # type: ignore[assignment]
    MultiModality = None  # type: ignore[assignment]
    OmniRealtimeConversation = None  # type: ignore[assignment]
    TranscriptionParams = None  # type: ignore[assignment]

    class OmniRealtimeCallback:  # type: ignore[no-redef]
        """Fallback callback base when DashScope SDK is unavailable."""
else:
    DASHSCOPE_SDK_AVAILABLE = True
    DASHSCOPE_IMPORT_ERROR = ""
class _DashScopeASRCallback(OmniRealtimeCallback):
    """Forward DashScope SDK callbacks to the owning service on its asyncio loop.

    Each SDK hook hands the matching ``_on_ws_*`` handler of the owner to
    ``loop.call_soon_threadsafe`` instead of calling it directly.
    """

    def __init__(self, owner: "DashScopeRealtimeASRService", loop: asyncio.AbstractEventLoop):
        super().__init__()
        self._owner = owner
        self._loop = loop

    def _schedule(self, fn: Callable[..., None], *args: Any) -> None:
        """Queue ``fn(*args)`` on the loop; a RuntimeError from the loop is ignored."""
        try:
            self._loop.call_soon_threadsafe(fn, *args)
        except RuntimeError:
            pass

    def on_open(self) -> None:
        self._schedule(self._owner._on_ws_open)

    def on_close(self, code: int, msg: str) -> None:
        self._schedule(self._owner._on_ws_close, code, msg)

    def on_event(self, message: Any) -> None:
        self._schedule(self._owner._on_ws_event, message)

    def on_error(self, message: Any) -> None:
        self._schedule(self._owner._on_ws_error, message)
class DashScopeRealtimeASRService(BaseASRService):
    """Realtime streaming ASR implementation for DashScope Qwen-ASR-Realtime.

    Blocking SDK calls (connect, append_audio, commit, update_session, close)
    run in worker threads via ``asyncio.to_thread``. SDK callbacks are
    delivered through ``_DashScopeASRCallback``. Every non-empty transcript is
    queued for ``receive_transcripts``; final transcripts are additionally
    queued for ``wait_for_final_transcription``.
    """

    DEFAULT_WS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    DEFAULT_MODEL = "qwen3-asr-flash-realtime"
    DEFAULT_FINAL_TIMEOUT_MS = 800

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        model: Optional[str] = None,
        sample_rate: int = 16000,
        language: str = "auto",
        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None,
    ) -> None:
        """Store configuration; no network activity happens here.

        Args:
            api_key: API key; falls back to the ``DASHSCOPE_API_KEY`` then
                ``ASR_API_KEY`` environment variables.
            api_url: Websocket URL; falls back to ``DASHSCOPE_ASR_API_URL``
                then ``DEFAULT_WS_URL``.
            model: Model name; falls back to ``DASHSCOPE_ASR_MODEL`` then
                ``DEFAULT_MODEL``.
            sample_rate: Audio sample rate forwarded to the base class and to
                the transcription parameters.
            language: Language code; ``"auto"`` is sent to the SDK as ``"zh"``.
            on_transcript: Optional coroutine callback awaited with
                ``(text, is_final)`` for every published transcript.
        """
        super().__init__(sample_rate=sample_rate, language=language)
        self.mode = "streaming"
        self.api_key = (
            api_key
            or os.getenv("DASHSCOPE_API_KEY")
            or os.getenv("ASR_API_KEY")
        )
        self.api_url = api_url or os.getenv("DASHSCOPE_ASR_API_URL") or self.DEFAULT_WS_URL
        self.model = model or os.getenv("DASHSCOPE_ASR_MODEL") or self.DEFAULT_MODEL
        self.on_transcript = on_transcript

        self._client: Optional[Any] = None
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._callback: Optional[_DashScopeASRCallback] = None
        self._running = False
        self._session_ready = asyncio.Event()
        self._transcript_queue: "asyncio.Queue[ASRResult]" = asyncio.Queue()
        self._final_queue: "asyncio.Queue[str]" = asyncio.Queue()
        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._last_interim_text = ""
        self._last_error: Optional[str] = None

    async def connect(self) -> None:
        """Open the SDK connection and configure the session.

        Raises:
            RuntimeError: If the DashScope SDK could not be imported, or the
                session could not be configured.
            ValueError: If no API key is available.

        If connecting or configuring fails, the partially opened client is
        closed and discarded before the error is re-raised.
        """
        if not DASHSCOPE_SDK_AVAILABLE:
            py_exec = sys.executable
            hint = f"`{py_exec} -m pip install dashscope>=1.25.6`"
            detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else ""
            raise RuntimeError(
                f"dashscope SDK unavailable in interpreter {py_exec}; install with {hint}{detail}"
            )
        if not self.api_key:
            raise ValueError("DashScope ASR API key not provided. Configure agent.asr.api_key in YAML.")

        self._loop = asyncio.get_running_loop()
        self._callback = _DashScopeASRCallback(owner=self, loop=self._loop)
        if dashscope is not None:
            dashscope.api_key = self.api_key

        # A flag left set by an earlier session would make _configure_session
        # return before this session has actually been acknowledged.
        self._session_ready.clear()
        self._client = OmniRealtimeConversation(  # type: ignore[misc]
            model=self.model,
            url=self.api_url,
            callback=self._callback,
        )
        try:
            await asyncio.to_thread(self._client.connect)
            await self._configure_session()
        except Exception:
            # Do not leave a half-open client behind on failure.
            await self._abort_connect()
            raise

        self._running = True
        self.state = ServiceState.CONNECTED
        logger.info(
            "DashScope realtime ASR connected: model={}, sample_rate={}, language={}",
            self.model,
            self.sample_rate,
            self.language,
        )

    async def _abort_connect(self) -> None:
        """Best-effort teardown of a client whose connect/configure step failed."""
        client, self._client = self._client, None
        self._session_ready.clear()
        close_fn = getattr(client, "close", None)
        if not callable(close_fn):
            return
        try:
            await asyncio.to_thread(close_fn)
        except Exception as exc:
            # The original connect error is the one worth surfacing.
            logger.debug("DashScope ASR cleanup after failed connect raised: {}", exc)

    async def disconnect(self) -> None:
        """Stop the service, drop queued results and close the SDK client.

        The client reference and the service state are reset even when the
        SDK's ``close`` raises; that exception still propagates to the caller.
        """
        self._running = False
        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._drain_queue(self._final_queue)
        self._drain_queue(self._transcript_queue)
        self._session_ready.clear()

        client, self._client = self._client, None
        try:
            if client is not None:
                close_fn = getattr(client, "close", None)
                if callable(close_fn):
                    await asyncio.to_thread(close_fn)
        finally:
            self.state = ServiceState.DISCONNECTED
        logger.info("DashScope realtime ASR disconnected")

    async def begin_utterance(self) -> None:
        """Reset per-utterance state and mark a new utterance as active."""
        self.clear_utterance()
        self._utterance_active = True

    async def send_audio(self, audio: bytes) -> None:
        """Base64-encode ``audio`` and append it to the SDK session.

        Empty input is ignored.

        Raises:
            RuntimeError: If the service is not connected or the SDK client
                lacks an ``append_audio`` method.
        """
        if not self._client:
            raise RuntimeError("DashScope ASR service not connected")
        if not audio:
            return

        if not self._utterance_active:
            # Allow graceful fallback if caller sends before begin_utterance.
            self._utterance_active = True

        audio_b64 = base64.b64encode(audio).decode("ascii")
        append_fn = getattr(self._client, "append_audio", None)
        if not callable(append_fn):
            raise RuntimeError("DashScope ASR SDK missing append_audio method")
        await asyncio.to_thread(append_fn, audio_b64)
        self._audio_sent_in_utterance = True

    async def end_utterance(self) -> None:
        """Commit the audio sent so far, if an utterance with audio is active.

        Does nothing when disconnected, when no utterance is active, or when
        no audio was sent in it.

        Raises:
            RuntimeError: If the SDK client lacks a ``commit`` method.
        """
        if not self._client:
            return
        if not self._utterance_active or not self._audio_sent_in_utterance:
            return

        commit_fn = getattr(self._client, "commit", None)
        if not callable(commit_fn):
            raise RuntimeError("DashScope ASR SDK missing commit method")
        await asyncio.to_thread(commit_fn)
        self._utterance_active = False

    async def wait_for_final_transcription(self, timeout_ms: int = DEFAULT_FINAL_TIMEOUT_MS) -> str:
        """Wait for the next final transcript of the current utterance.

        Args:
            timeout_ms: Maximum wait in milliseconds (at least 50 ms is used).

        Returns:
            The stripped final text; the last interim text if the wait times
            out; or ``""`` if no audio was sent in this utterance.
        """
        if not self._audio_sent_in_utterance:
            return ""
        timeout_sec = max(0.05, float(timeout_ms) / 1000.0)
        try:
            text = await asyncio.wait_for(self._final_queue.get(), timeout=timeout_sec)
            return str(text or "").strip()
        except asyncio.TimeoutError:
            logger.debug("DashScope ASR final timeout ({}ms), fallback to last interim", timeout_ms)
            return str(self._last_interim_text or "").strip()

    def clear_utterance(self) -> None:
        """Reset all per-utterance state and discard pending final transcripts."""
        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._last_interim_text = ""
        self._last_error = None
        self._drain_queue(self._final_queue)

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield queued transcripts while the service is running.

        Polls the queue with a 0.1 s timeout so the loop notices when
        ``_running`` turns false; stops quietly on cancellation.
        """
        while self._running:
            try:
                result = await asyncio.wait_for(self._transcript_queue.get(), timeout=0.1)
                yield result
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

    async def _configure_session(self) -> None:
        """Send ``update_session`` and wait briefly for the session to be ready.

        Progressively smaller keyword sets are tried so that SDK builds which
        reject some arguments can still be configured.

        Raises:
            RuntimeError: If the client is missing, lacks ``update_session``,
                or every attempt fails.
        """
        if not self._client:
            raise RuntimeError("DashScope ASR client is not initialized")

        text_modality: Any = "text"
        if MultiModality is not None and hasattr(MultiModality, "TEXT"):
            text_modality = MultiModality.TEXT

        transcription_params: Optional[Any] = None
        if TranscriptionParams is not None:
            try:
                # "auto" is not forwarded; "zh" is sent in its place.
                lang = "zh" if self.language == "auto" else self.language
                transcription_params = TranscriptionParams(
                    language=lang,
                    sample_rate=self.sample_rate,
                    input_audio_format="pcm",
                )
            except Exception as exc:
                logger.debug("DashScope ASR TranscriptionParams init failed: {}", exc)
                transcription_params = None

        update_attempts: list = []
        if transcription_params is not None:
            # Richest form first; skipped entirely when params are unavailable
            # so the same call is not replayed twice.
            update_attempts.append(
                {
                    "output_modalities": [text_modality],
                    "enable_turn_detection": False,
                    "enable_input_audio_transcription": True,
                    "transcription_params": transcription_params,
                }
            )
        update_attempts.append(
            {
                "output_modalities": [text_modality],
                "enable_turn_detection": False,
                "enable_input_audio_transcription": True,
            }
        )
        update_attempts.append({"output_modalities": [text_modality]})

        update_fn = getattr(self._client, "update_session", None)
        if not callable(update_fn):
            raise RuntimeError("DashScope ASR SDK missing update_session method")

        last_error: Optional[Exception] = None
        for params in update_attempts:
            try:
                await asyncio.to_thread(update_fn, **params)
                break
            except Exception as exc:
                # Includes TypeError from unsupported keywords; try the next form.
                last_error = exc
        else:
            raise RuntimeError(f"DashScope ASR session.update failed: {last_error}") from last_error

        try:
            await asyncio.wait_for(self._session_ready.wait(), timeout=6.0)
        except asyncio.TimeoutError:
            logger.debug("DashScope ASR session ready wait timeout; continuing")

    def _on_ws_open(self) -> None:
        """Handle websocket open; nothing to do."""
        return None

    def _on_ws_close(self, code: int, msg: str) -> None:
        """Record and log the websocket close code and message."""
        self._last_error = f"DashScope ASR websocket closed: {code} {msg}"
        logger.debug(self._last_error)

    def _on_ws_error(self, message: Any) -> None:
        """Record and log an error reported by the SDK."""
        self._last_error = str(message)
        logger.error("DashScope ASR error: {}", self._last_error)

    def _on_ws_event(self, message: Any) -> None:
        """Dispatch one server event by its ``type`` field.

        Session events mark the session ready, error events are recorded and
        logged, and transcription events are emitted as interim or final
        transcripts. Other event types are ignored.
        """
        payload = self._coerce_event(message)
        event_type = str(payload.get("type") or "").strip()
        if not event_type:
            return

        if event_type in {"session.created", "session.updated"}:
            self._session_ready.set()
            return
        if event_type == "error" or event_type.endswith(".failed"):
            err_text = self._extract_text(payload, keys=("message", "error", "details"))
            self._last_error = err_text or event_type
            logger.error("DashScope ASR server event error: {}", self._last_error)
            return

        if event_type == "conversation.item.input_audio_transcription.text":
            stash_text = self._extract_text(payload, keys=("stash", "text", "transcript"))
            self._emit_transcript(stash_text, is_final=False)
            return

        if event_type == "conversation.item.input_audio_transcription.completed":
            final_text = self._extract_text(payload, keys=("transcript", "text", "stash"))
            self._emit_transcript(final_text, is_final=True)
            return

    def _emit_transcript(self, text: str, *, is_final: bool) -> None:
        """Schedule publication of a transcript on the service's loop.

        Empty text and interim text identical to the previous interim are
        dropped. Interim text is remembered as the timeout fallback.
        """
        normalized = str(text or "").strip()
        if not normalized:
            return
        if not is_final and normalized == self._last_interim_text:
            return
        if not is_final:
            self._last_interim_text = normalized

        if self._loop is None:
            return
        try:
            asyncio.run_coroutine_threadsafe(
                self._publish_transcript(normalized, is_final=is_final),
                self._loop,
            )
        except RuntimeError:
            return

    async def _publish_transcript(self, text: str, *, is_final: bool) -> None:
        """Queue the transcript and invoke ``on_transcript`` if configured.

        Exceptions raised by the callback are logged and not propagated.
        """
        await self._transcript_queue.put(ASRResult(text=text, is_final=is_final))
        if is_final:
            await self._final_queue.put(text)
        if self.on_transcript:
            try:
                await self.on_transcript(text, is_final)
            except Exception as exc:
                logger.warning("DashScope ASR transcript callback failed: {}", exc)

    @staticmethod
    def _coerce_event(message: Any) -> Dict[str, Any]:
        """Normalise an SDK message to a dict.

        Dicts pass through; strings that decode to a JSON object are parsed;
        anything else becomes ``{"type": "raw", "text": <str(message)>}``.
        """
        if isinstance(message, dict):
            return message
        if isinstance(message, str):
            try:
                parsed = json.loads(message)
                if isinstance(parsed, dict):
                    return parsed
            except json.JSONDecodeError:
                return {"type": "raw", "text": message}
        return {"type": "raw", "text": str(message)}

    def _extract_text(self, payload: Dict[str, Any], *, keys: tuple[str, ...]) -> str:
        """Return the first non-blank string found under any of ``keys``.

        ``keys`` are checked in order at this level (recursing into dict
        values); if nothing matches, every nested dict value is searched.
        Returns ``""`` when no text is found.
        """
        for key in keys:
            value = payload.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
            if isinstance(value, dict):
                nested = self._extract_text(value, keys=keys)
                if nested:
                    return nested

        for value in payload.values():
            if isinstance(value, dict):
                nested = self._extract_text(value, keys=keys)
                if nested:
                    return nested
        return ""

    @staticmethod
    def _drain_queue(queue: "asyncio.Queue[Any]") -> None:
        """Remove every item currently in ``queue`` without blocking."""
        while True:
            try:
                queue.get_nowait()
            except asyncio.QueueEmpty:
                break

View File

@@ -19,7 +19,7 @@ except ImportError:
AIOHTTP_AVAILABLE = False
logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
from services.base import BaseASRService, ASRResult, ServiceState
from providers.common.base import BaseASRService, ASRResult, ServiceState
class OpenAICompatibleASRService(BaseASRService):
@@ -53,6 +53,7 @@ class OpenAICompatibleASRService(BaseASRService):
model: str = "FunAudioLLM/SenseVoiceSmall",
sample_rate: int = 16000,
language: str = "auto",
enable_interim: bool = False,
interim_interval_ms: int = 500, # How often to send interim results
min_audio_for_interim_ms: int = 300, # Min audio before first interim
on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
@@ -66,19 +67,22 @@ class OpenAICompatibleASRService(BaseASRService):
model: ASR model name or alias
sample_rate: Audio sample rate (16000 recommended)
language: Language code (auto for automatic detection)
enable_interim: Whether to generate interim transcriptions in offline mode
interim_interval_ms: How often to generate interim transcriptions
min_audio_for_interim_ms: Minimum audio duration before first interim
on_transcript: Callback for transcription results (text, is_final)
"""
super().__init__(sample_rate=sample_rate, language=language)
self.mode = "offline"
if not AIOHTTP_AVAILABLE:
raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
self.api_key = api_key or os.getenv("ASR_API_KEY") or os.getenv("SILICONFLOW_API_KEY")
self.api_key = api_key
raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
self.model = self.MODELS.get(model.lower(), model)
self.enable_interim = bool(enable_interim)
self.interim_interval_ms = interim_interval_ms
self.min_audio_for_interim_ms = min_audio_for_interim_ms
self.on_transcript = on_transcript
@@ -180,6 +184,9 @@ class OpenAICompatibleASRService(BaseASRService):
if not self._session:
logger.warning("ASR session not connected")
return None
if not is_final and not self.enable_interim:
return None
# Check minimum audio duration
audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
@@ -309,6 +316,9 @@ class OpenAICompatibleASRService(BaseASRService):
This periodically transcribes buffered audio for
real-time feedback to the user.
"""
if not self.enable_interim:
return
if self._interim_task and not self._interim_task.done():
return

View File

@@ -1,6 +1,6 @@
"""Backward-compatible imports for legacy siliconflow_asr module."""
from services.openai_compatible_asr import OpenAICompatibleASRService
from providers.asr.openai_compatible import OpenAICompatibleASRService
# Backward-compatible alias
SiliconFlowASRService = OpenAICompatibleASRService

View File

@@ -0,0 +1,666 @@
"""Volcengine realtime ASR service.
Supports both:
- Volcengine Edge Gateway realtime transcription websocket, and
- Volcengine BigASR Seed websocket at openspeech.bytedance.com/api/v3/sauc/bigmodel.
"""
from __future__ import annotations
import asyncio
import base64
import gzip
import json
import os
import uuid
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Literal, Optional
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
import aiohttp
from loguru import logger
from providers.common.base import ASRResult, BaseASRService, ServiceState
VolcengineASRProtocol = Literal["gateway", "seed"]
class VolcengineRealtimeASRService(BaseASRService):
"""Realtime streaming ASR backed by Volcengine websocket APIs."""
DEFAULT_WS_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
DEFAULT_GATEWAY_WS_URL = "wss://ai-gateway.vei.volces.com/v1/realtime"
DEFAULT_MODEL = "bigmodel"
DEFAULT_FINAL_TIMEOUT_MS = 1200
DEFAULT_SEED_RESOURCE_ID = "volc.bigasr.sauc.duration"
_SEED_FRAME_MS = 100
_SEED_PROTOCOL_VERSION = 0b0001
_SEED_FULL_CLIENT_REQUEST = 0b0001
_SEED_AUDIO_ONLY_REQUEST = 0b0010
_SEED_FULL_SERVER_RESPONSE = 0b1001
_SEED_SERVER_ACK = 0b1011
_SEED_SERVER_ERROR_RESPONSE = 0b1111
_SEED_NO_SEQUENCE = 0b0000
_SEED_POS_SEQUENCE = 0b0001
_SEED_NEG_WITH_SEQUENCE = 0b0011
_SEED_NO_SERIALIZATION = 0b0000
_SEED_JSON = 0b0001
_SEED_NO_COMPRESSION = 0b0000
_SEED_GZIP = 0b0001
def __init__(
    self,
    api_key: Optional[str] = None,
    api_url: Optional[str] = None,
    model: Optional[str] = None,
    sample_rate: int = 16000,
    language: str = "auto",
    app_id: Optional[str] = None,
    resource_id: Optional[str] = None,
    uid: Optional[str] = None,
    request_params: Optional[Dict[str, Any]] = None,
    on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None,
) -> None:
    """Store configuration and initialise per-connection state.

    Explicit arguments take precedence; unset ones fall back to environment
    variables and then to the class defaults. No network I/O happens here.

    Args:
        api_key: Credential; falls back to ``VOLCENGINE_ASR_API_KEY`` / ``ASR_API_KEY``.
        api_url: Websocket endpoint; falls back to ``VOLCENGINE_ASR_API_URL``
            and then ``DEFAULT_WS_URL``. The URL also selects the protocol.
        model: Model name; falls back to ``VOLCENGINE_ASR_MODEL`` / ``DEFAULT_MODEL``.
        sample_rate: Audio sample rate forwarded to the base class.
        language: Language code forwarded to the base class.
        app_id: Application id; falls back to ``VOLCENGINE_ASR_APP_ID`` / ``ASR_APP_ID``.
        resource_id: Resource id; falls back to ``VOLCENGINE_ASR_RESOURCE_ID`` and,
            for the seed protocol only, ``DEFAULT_SEED_RESOURCE_ID``.
        uid: User id; falls back to ``VOLCENGINE_ASR_UID``.
        request_params: Extra request options; see ``_load_request_params``.
        on_transcript: Optional async callback receiving ``(text, is_final)``.
    """
    super().__init__(sample_rate=sample_rate, language=language)
    self.mode = "streaming"
    self.api_key = api_key or os.getenv("VOLCENGINE_ASR_API_KEY") or os.getenv("ASR_API_KEY")
    self.model = str(model or os.getenv("VOLCENGINE_ASR_MODEL") or self.DEFAULT_MODEL).strip()
    raw_api_url = api_url or os.getenv("VOLCENGINE_ASR_API_URL") or self.DEFAULT_WS_URL
    # The protocol is derived from the raw URL before the URL is normalised.
    self.protocol = self._detect_protocol(raw_api_url)
    self.api_url = self._resolve_api_url(raw_api_url, self.model, self.protocol)
    self.app_id = app_id or os.getenv("VOLCENGINE_ASR_APP_ID") or os.getenv("ASR_APP_ID")
    self.resource_id = (
        resource_id
        or os.getenv("VOLCENGINE_ASR_RESOURCE_ID")
        or (self.DEFAULT_SEED_RESOURCE_ID if self.protocol == "seed" else None)
    )
    self.uid = uid or os.getenv("VOLCENGINE_ASR_UID")
    self.request_params = self._load_request_params(request_params)
    self.on_transcript = on_transcript
    # Connection state; populated by connect() / _open_seed_stream().
    self._session: Optional[aiohttp.ClientSession] = None
    self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
    self._reader_task: Optional[asyncio.Task[None]] = None
    self._running = False
    self._session_ready = asyncio.Event()
    # All transcripts go to _transcript_queue; final texts are mirrored to _final_queue.
    self._transcript_queue: "asyncio.Queue[ASRResult]" = asyncio.Queue()
    self._final_queue: "asyncio.Queue[str]" = asyncio.Queue()
    # Per-utterance bookkeeping, reset by clear_utterance().
    self._utterance_active = False
    self._audio_sent_in_utterance = False
    self._last_interim_text = ""
    self._last_error: Optional[str] = None
    self._seed_audio_buffer = bytearray()
    self._seed_sequence = 1
    self._seed_request_id: Optional[str] = None
    # Bytes per seed frame: _SEED_FRAME_MS of audio at 2 bytes per sample, never below 2.
    self._seed_frame_bytes = max(2, int((self.sample_rate * self._SEED_FRAME_MS / 1000) * 2))
@classmethod
def _detect_protocol(cls, api_url: str) -> VolcengineASRProtocol:
parsed = urlparse(str(api_url or "").strip())
host = parsed.netloc.lower()
path = parsed.path.lower()
if "openspeech.bytedance.com" in host and "/api/v3/sauc/bigmodel" in path:
return "seed"
return "gateway"
@classmethod
def _resolve_api_url(cls, api_url: str, model: str, protocol: VolcengineASRProtocol) -> str:
raw = str(api_url or "").strip()
if not raw:
raw = cls.DEFAULT_WS_URL if protocol == "seed" else cls.DEFAULT_GATEWAY_WS_URL
if protocol != "gateway":
return raw
parsed = urlparse(raw)
query = dict(parse_qsl(parsed.query, keep_blank_values=True))
query.setdefault("model", model or cls.DEFAULT_MODEL)
return urlunparse(parsed._replace(query=urlencode(query)))
@staticmethod
def _load_request_params(request_params: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if isinstance(request_params, dict):
return dict(request_params)
raw = os.getenv("VOLCENGINE_ASR_REQUEST_PARAMS_JSON", "").strip()
if not raw:
return {}
try:
parsed = json.loads(raw)
except json.JSONDecodeError:
logger.warning("Ignoring invalid VOLCENGINE_ASR_REQUEST_PARAMS_JSON")
return {}
if isinstance(parsed, dict):
return parsed
return {}
async def connect(self) -> None:
if not self.api_key:
raise ValueError("Volcengine ASR API key not provided. Configure agent.asr.api_key in YAML.")
timeout = aiohttp.ClientTimeout(total=None, sock_read=None, sock_connect=15)
self._session = aiohttp.ClientSession(timeout=timeout)
self._running = True
if self.protocol == "gateway":
await self._connect_gateway()
logger.info(
"Volcengine gateway ASR connected: model={}, sample_rate={}, url={}",
self.model,
self.sample_rate,
self.api_url,
)
else:
if not self.app_id:
raise ValueError("Volcengine ASR app_id not provided. Configure agent.asr.app_id in YAML.")
logger.info(
"Volcengine BigASR Seed ready: model={}, sample_rate={}, resource_id={}",
self.model,
self.sample_rate,
self.resource_id,
)
self.state = ServiceState.CONNECTED
async def disconnect(self) -> None:
    """Stop streaming, discard pending state and release network resources."""
    # Clearing the flag lets receive_transcripts() leave its polling loop.
    self._running = False
    self._utterance_active = False
    self._audio_sent_in_utterance = False
    self._session_ready.clear()
    self._seed_audio_buffer = bytearray()
    # Drop queued transcripts that will no longer be consumed.
    self._drain_queue(self._final_queue)
    self._drain_queue(self._transcript_queue)
    await self._close_ws()
    if self._session is not None:
        await self._session.close()
        self._session = None
    self.state = ServiceState.DISCONNECTED
    logger.info("Volcengine ASR disconnected")
async def begin_utterance(self) -> None:
    """Reset per-utterance state and mark a new utterance as active.

    For the seed protocol a fresh websocket stream is opened per utterance.
    """
    self.clear_utterance()
    if self.protocol == "seed":
        await self._open_seed_stream()
    self._utterance_active = True
async def send_audio(self, audio: bytes) -> None:
    """Forward an audio chunk to the server.

    Empty chunks are ignored. Seed traffic is delegated to
    ``_send_seed_audio``; gateway traffic is sent as a base64
    ``input_audio_buffer.append`` event.

    Raises:
        RuntimeError: If the gateway websocket is not connected.
    """
    if not audio:
        return
    if self.protocol == "seed":
        await self._send_seed_audio(audio)
        return
    if not self._ws:
        raise RuntimeError("Volcengine ASR websocket is not connected")
    # Sending audio implicitly starts an utterance on the gateway protocol.
    if not self._utterance_active:
        self._utterance_active = True
    await self._ws.send_json(
        {
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(audio).decode("ascii"),
        }
    )
    self._audio_sent_in_utterance = True
async def end_utterance(self) -> None:
    """Signal the end of the current utterance to the server.

    Does nothing when no utterance is active. Seed traffic is delegated to
    ``_end_seed_utterance``; on the gateway a commit event is sent only if
    audio was actually sent.
    """
    if not self._utterance_active:
        return
    if self.protocol == "seed":
        await self._end_seed_utterance()
        return
    # NOTE(review): this early return leaves _utterance_active set; it is
    # only reset later by clear_utterance() — confirm that is intended.
    if not self._ws or not self._audio_sent_in_utterance:
        return
    await self._ws.send_json({"type": "input_audio_buffer.commit"})
    self._utterance_active = False
async def wait_for_final_transcription(self, timeout_ms: int = DEFAULT_FINAL_TIMEOUT_MS) -> str:
    """Wait for the final transcript of the current utterance.

    Args:
        timeout_ms: Maximum wait in milliseconds (clamped to at least 50 ms).

    Returns:
        The stripped final text; the last interim text if the wait times
        out; or ``""`` when no audio was sent in this utterance.
    """
    if not self._audio_sent_in_utterance:
        return ""
    timeout_sec = max(0.05, float(timeout_ms) / 1000.0)
    try:
        return str(await asyncio.wait_for(self._final_queue.get(), timeout=timeout_sec) or "").strip()
    except asyncio.TimeoutError:
        logger.debug("Volcengine ASR final timeout ({}ms), fallback to last interim", timeout_ms)
        return str(self._last_interim_text or "").strip()
    finally:
        # Seed streams are per-utterance, so the websocket is closed here.
        if self.protocol == "seed":
            await self._close_ws()
def clear_utterance(self) -> None:
    """Reset all per-utterance state and discard queued final transcripts."""
    self._utterance_active = False
    self._audio_sent_in_utterance = False
    self._last_interim_text = ""
    self._last_error = None
    self._seed_audio_buffer = bytearray()
    # Sequence 1 is used by the seed start request; audio frames count up from it.
    self._seed_sequence = 1
    self._seed_request_id = None
    self._drain_queue(self._final_queue)
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
    """Yield queued transcripts until the service stops running or is cancelled."""
    while self._running:
        try:
            # Short timeout so the loop re-checks _running regularly.
            yield await asyncio.wait_for(self._transcript_queue.get(), timeout=0.1)
        except asyncio.TimeoutError:
            continue
        except asyncio.CancelledError:
            break
async def _connect_gateway(self) -> None:
    """Open the gateway websocket, start the reader task and configure the session."""
    assert self._session is not None
    headers = {"Authorization": f"Bearer {self.api_key}"}
    if self.resource_id:
        headers["X-Api-Resource-Id"] = self.resource_id
    self._ws = await self._session.ws_connect(self.api_url, headers=headers, heartbeat=20)
    # The reader must be running before configuration: it sets _session_ready
    # when the server acknowledges the session.
    self._reader_task = asyncio.create_task(self._reader_loop())
    await self._configure_gateway_session()
async def _configure_gateway_session(self) -> None:
    """Send the transcription session settings and wait for acknowledgement.

    Raises:
        RuntimeError: If the websocket is missing, or the server does not
            acknowledge the session within 8 seconds.
    """
    if not self._ws:
        raise RuntimeError("Volcengine ASR websocket is not initialized")
    session_payload: Dict[str, Any] = {
        "input_audio_format": "pcm",
        "input_audio_codec": "raw",
        "input_audio_sample_rate": self.sample_rate,
        "input_audio_bits": 16,
        "input_audio_channel": 1,
        "result_type": 0,
        "input_audio_transcription": {
            "model": self.model,
        },
    }
    await self._ws.send_json(
        {
            "type": "transcription_session.update",
            "session": session_payload,
        }
    )
    try:
        # _session_ready is set by _handle_gateway_event on created/updated events.
        await asyncio.wait_for(self._session_ready.wait(), timeout=8.0)
    except asyncio.TimeoutError as exc:
        raise RuntimeError("Volcengine ASR session update timeout") from exc
async def _open_seed_stream(self) -> None:
    """Open a fresh seed websocket for one utterance and send the start request.

    Raises:
        RuntimeError: If ``connect()`` has not created the HTTP session.
    """
    if not self._session:
        raise RuntimeError("Volcengine ASR session is not initialized")
    # Tear down any stream left over from a previous utterance.
    await self._close_ws()
    self._seed_request_id = uuid.uuid4().hex
    headers = self._build_seed_headers(self._seed_request_id)
    self._ws = await self._session.ws_connect(
        self.api_url,
        headers=headers,
        heartbeat=20,
        max_msg_size=1_000_000_000,
    )
    self._reader_task = asyncio.create_task(self._reader_loop())
    await self._ws.send_bytes(self._build_seed_start_request())
async def _send_seed_audio(self, audio: bytes) -> None:
    """Buffer audio and send it to the seed stream in fixed-size frames.

    Starts an utterance automatically if none is active. Audio shorter than
    one frame stays buffered until more arrives or the utterance ends.

    Raises:
        RuntimeError: If the seed websocket is not connected.
    """
    if not self._utterance_active:
        await self.begin_utterance()
    if not self._ws:
        raise RuntimeError("Volcengine BigASR websocket is not connected")
    self._seed_audio_buffer.extend(audio)
    while len(self._seed_audio_buffer) >= self._seed_frame_bytes:
        chunk = bytes(self._seed_audio_buffer[: self._seed_frame_bytes])
        del self._seed_audio_buffer[: self._seed_frame_bytes]
        # Each audio frame carries the next positive sequence number.
        self._seed_sequence += 1
        await self._ws.send_bytes(self._build_seed_audio_request(chunk, sequence=self._seed_sequence))
        self._audio_sent_in_utterance = True
async def _end_seed_utterance(self) -> None:
    """Flush buffered audio as the last seed frame and close the utterance.

    Does nothing without a websocket. If no audio was sent or buffered, the
    utterance is simply marked inactive.
    """
    if not self._ws:
        return
    if not self._audio_sent_in_utterance and not self._seed_audio_buffer:
        self._utterance_active = False
        return
    final_chunk = bytes(self._seed_audio_buffer)
    self._seed_audio_buffer = bytearray()
    self._seed_sequence += 1
    # The last frame is sent with a negated sequence number and is_last=True.
    await self._ws.send_bytes(
        self._build_seed_audio_request(final_chunk, sequence=-self._seed_sequence, is_last=True)
    )
    self._audio_sent_in_utterance = True
    self._utterance_active = False
async def _close_ws(self) -> None:
    """Cancel the reader task and close the current websocket, if any."""
    # Detach both references first so the instance no longer points at them
    # while the awaits below are in progress.
    reader_task = self._reader_task
    ws = self._ws
    self._reader_task = None
    self._ws = None
    if reader_task:
        reader_task.cancel()
        try:
            await reader_task
        except asyncio.CancelledError:
            pass
    if ws is not None:
        await ws.close()
async def _reader_loop(self) -> None:
    """Read websocket messages and dispatch them to the protocol handlers.

    Ends on a close or error frame, on cancellation, or on any exception
    from a handler (which is logged and recorded in ``_last_error``).
    """
    # Capture the socket so a replacement stream opened later is left alone.
    ws = self._ws
    if ws is None:
        return
    try:
        async for msg in ws:
            if msg.type == aiohttp.WSMsgType.TEXT:
                if self.protocol == "gateway":
                    self._handle_gateway_event(msg.data)
                else:
                    self._handle_seed_text(msg.data)
                continue
            if msg.type == aiohttp.WSMsgType.BINARY:
                # Binary frames are only handled for the seed protocol.
                if self.protocol == "seed":
                    self._handle_seed_binary(msg.data)
                continue
            if msg.type == aiohttp.WSMsgType.ERROR:
                self._last_error = str(ws.exception())
                logger.error("Volcengine ASR websocket error: {}", self._last_error)
                break
            if msg.type in {aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE}:
                break
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        self._last_error = str(exc)
        logger.error("Volcengine ASR reader loop failed: {}", exc)
    finally:
        # Only clear the reference if it still points at this loop's socket.
        if self._ws is ws:
            self._ws = None
def _handle_gateway_event(self, message: str) -> None:
    """Process one gateway text event, dispatching on its ``type`` field.

    Session events set ``_session_ready``; error and ``*.failed`` events are
    logged and recorded in ``_last_error``; transcription events are emitted
    as interim (``result``/``delta``) or final (``completed``) transcripts.
    Events with no type or an unrecognised type are ignored.
    """
    payload = self._coerce_event(message)
    event_type = str(payload.get("type") or "").strip()
    if not event_type:
        return
    if event_type in {"transcription_session.created", "transcription_session.updated"}:
        self._session_ready.set()
        return
    if event_type == "error":
        self._last_error = self._extract_text(payload, ("message", "error"))
        logger.error("Volcengine ASR server error: {}", self._last_error or "unknown")
        return
    if event_type.endswith(".failed"):
        self._last_error = self._extract_text(payload, ("message", "error", "transcript"))
        logger.error("Volcengine ASR failed event: {}", self._last_error or event_type)
        return
    if event_type == "conversation.item.input_audio_transcription.result":
        transcript = self._extract_text(payload, ("transcript", "result"))
        self._emit_transcript_sync(transcript, is_final=False)
        return
    if event_type == "conversation.item.input_audio_transcription.delta":
        transcript = self._extract_text(payload, ("delta",))
        self._emit_transcript_sync(transcript, is_final=False)
        return
    if event_type == "conversation.item.input_audio_transcription.completed":
        transcript = self._extract_text(payload, ("transcript", "result"))
        self._emit_transcript_sync(transcript, is_final=True)
def _handle_seed_text(self, message: str) -> None:
    """Record and log an error carried in a seed text frame; ignore other text frames."""
    payload = self._coerce_event(message)
    if payload.get("type") == "error":
        self._last_error = self._extract_text(payload, ("message", "error"))
        logger.error("Volcengine BigASR error: {}", self._last_error or "unknown")
def _handle_seed_binary(self, message: bytes) -> None:
    """Decode one seed binary frame and emit its transcript, if any.

    Frames with a non-zero error code are logged and recorded in
    ``_last_error``. A transcript is emitted only when the decoded body has
    a non-empty ``result.text`` and a non-empty ``result.utterances`` list.
    """
    payload = self._parse_seed_response(message)
    if payload.get("code"):
        self._last_error = self._extract_text(payload, ("payload_msg",))
        logger.error("Volcengine BigASR server error: {}", self._last_error or payload["code"])
        return
    body = payload.get("payload_msg")
    if not isinstance(body, dict):
        return
    result = body.get("result")
    if not isinstance(result, dict):
        return
    text = str(result.get("text") or "").strip()
    if not text:
        return
    utterances = result.get("utterances")
    if not isinstance(utterances, list) or not utterances:
        return
    # Finality is taken from the first utterance's "definite" flag.
    first_utterance = utterances[0] if isinstance(utterances[0], dict) else {}
    is_final = self._coerce_bool(first_utterance.get("definite")) is True
    self._emit_transcript_sync(text, is_final=is_final)
def _emit_transcript_sync(self, text: str, *, is_final: bool) -> None:
    """Publish a transcript to the queues and the optional callback.

    Blank text is ignored. Interim text is remembered as the fallback for
    ``wait_for_final_transcription``; a final transcript clears it and is
    also pushed onto the final queue.

    Args:
        text: Transcript text; surrounding whitespace is stripped.
        is_final: Whether this is the final transcript of the utterance.
    """
    cleaned = str(text or "").strip()
    if not cleaned:
        return
    if not is_final:
        self._last_interim_text = cleaned
    else:
        self._last_interim_text = ""
    result = ASRResult(text=cleaned, is_final=is_final)
    try:
        self._transcript_queue.put_nowait(result)
    except asyncio.QueueFull:
        logger.debug("Volcengine ASR transcript queue full; dropping transcript")
    if is_final:
        try:
            self._final_queue.put_nowait(cleaned)
        except asyncio.QueueFull:
            logger.debug("Volcengine ASR final queue full; dropping transcript")
    if self.on_transcript:
        callback_task = asyncio.create_task(self.on_transcript(cleaned, is_final))
        # The event loop keeps only weak references to tasks, so an
        # unreferenced task may be garbage-collected before it finishes.
        # Hold a strong reference until the callback completes.
        pending_callbacks = self.__dict__.setdefault("_pending_callback_tasks", set())
        pending_callbacks.add(callback_task)

        def _finish(done_task: "asyncio.Task[Any]") -> None:
            pending_callbacks.discard(done_task)
            if done_task.cancelled():
                return
            # Retrieve the exception so a failing callback is reported here
            # rather than as an "exception was never retrieved" warning.
            error = done_task.exception()
            if error is not None:
                logger.error("Volcengine ASR on_transcript callback failed: {}", error)

        callback_task.add_done_callback(_finish)
def _build_seed_headers(self, request_id: str) -> Dict[str, str]:
if not self.app_id:
raise ValueError("Volcengine ASR app_id not provided. Configure agent.asr.app_id in YAML.")
if not self.api_key:
raise ValueError("Volcengine ASR api_key not provided. Configure agent.asr.api_key in YAML.")
return {
"X-Api-App-Key": str(self.app_id),
"X-Api-Access-Key": str(self.api_key),
"X-Api-Resource-Id": str(self.resource_id or self.DEFAULT_SEED_RESOURCE_ID),
"X-Api-Request-Id": str(request_id),
}
def _build_seed_start_payload(self) -> Dict[str, Any]:
user_payload: Dict[str, Any] = {"uid": str(self.uid or self._seed_request_id or self.app_id or uuid.uuid4().hex)}
audio_payload: Dict[str, Any] = {
"format": "pcm",
"rate": self.sample_rate,
"bits": 16,
"channels": 1,
"codec": "raw",
}
if self.language and self.language != "auto":
audio_payload["language"] = self.language
request_payload: Dict[str, Any] = {
"model_name": self.model or self.DEFAULT_MODEL,
"enable_itn": False,
"enable_punc": True,
"enable_ddc": False,
"show_utterance": True,
"result_type": "single",
"vad_segment_duration": 3000,
"end_window_size": 500,
"force_to_speech_time": 1000,
}
extra = dict(self.request_params)
user_payload.update(self._as_dict(extra.pop("user", None)))
audio_payload.update(self._as_dict(extra.pop("audio", None)))
request_payload.update(self._as_dict(extra.pop("request", None)))
request_payload.update(extra)
return {
"user": user_payload,
"audio": audio_payload,
"request": request_payload,
}
def _build_seed_start_request(self) -> bytes:
payload = gzip.compress(json.dumps(self._build_seed_start_payload()).encode("utf-8"))
frame = bytearray(
self._build_seed_header(
message_type=self._SEED_FULL_CLIENT_REQUEST,
message_type_specific_flags=self._SEED_POS_SEQUENCE,
)
)
frame.extend((1).to_bytes(4, "big", signed=True))
frame.extend(len(payload).to_bytes(4, "big"))
frame.extend(payload)
return bytes(frame)
def _build_seed_audio_request(self, chunk: bytes, *, sequence: int, is_last: bool = False) -> bytes:
payload = gzip.compress(chunk)
frame = bytearray(
self._build_seed_header(
message_type=self._SEED_AUDIO_ONLY_REQUEST,
message_type_specific_flags=self._SEED_NEG_WITH_SEQUENCE if is_last else self._SEED_POS_SEQUENCE,
)
)
frame.extend(int(sequence).to_bytes(4, "big", signed=True))
frame.extend(len(payload).to_bytes(4, "big"))
frame.extend(payload)
return bytes(frame)
@classmethod
def _build_seed_header(
cls,
*,
message_type: int,
message_type_specific_flags: int,
serial_method: int = _SEED_JSON,
compression_type: int = _SEED_GZIP,
reserved_data: int = 0x00,
) -> bytes:
header = bytearray()
header.append((cls._SEED_PROTOCOL_VERSION << 4) | 0b0001)
header.append((message_type << 4) | message_type_specific_flags)
header.append((serial_method << 4) | compression_type)
header.append(reserved_data)
return bytes(header)
@classmethod
def _parse_seed_response(cls, response: bytes) -> Dict[str, Any]:
header_size = response[0] & 0x0F
message_type = response[1] >> 4
message_type_specific_flags = response[1] & 0x0F
serialization_method = response[2] >> 4
compression_type = response[2] & 0x0F
payload = response[header_size * 4 :]
result: Dict[str, Any] = {"is_last_package": False}
payload_message: Any = None
if message_type_specific_flags & 0x01:
result["payload_sequence"] = int.from_bytes(payload[:4], "big", signed=True)
payload = payload[4:]
if message_type_specific_flags & 0x02:
result["is_last_package"] = True
if message_type == cls._SEED_FULL_SERVER_RESPONSE:
result["payload_size"] = int.from_bytes(payload[:4], "big", signed=True)
payload_message = payload[4:]
elif message_type == cls._SEED_SERVER_ACK:
result["seq"] = int.from_bytes(payload[:4], "big", signed=True)
if len(payload) >= 8:
result["payload_size"] = int.from_bytes(payload[4:8], "big", signed=False)
payload_message = payload[8:]
elif message_type == cls._SEED_SERVER_ERROR_RESPONSE:
result["code"] = int.from_bytes(payload[:4], "big", signed=False)
result["payload_size"] = int.from_bytes(payload[4:8], "big", signed=False)
payload_message = payload[8:]
if payload_message is None:
return result
if compression_type == cls._SEED_GZIP:
payload_message = gzip.decompress(payload_message)
if serialization_method == cls._SEED_JSON:
payload_message = json.loads(payload_message.decode("utf-8"))
elif serialization_method != cls._SEED_NO_SERIALIZATION:
payload_message = payload_message.decode("utf-8")
result["payload_msg"] = payload_message
return result
@staticmethod
def _coerce_event(message: Any) -> Dict[str, Any]:
if isinstance(message, dict):
return message
if isinstance(message, str):
try:
loaded = json.loads(message)
if isinstance(loaded, dict):
return loaded
except json.JSONDecodeError:
return {"type": "raw", "message": message}
return {"type": "raw", "message": str(message)}
@staticmethod
def _extract_text(payload: Dict[str, Any], keys: tuple[str, ...]) -> str:
for key in keys:
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
if isinstance(value, dict):
for nested_key in ("message", "text", "transcript", "result", "delta"):
nested = value.get(nested_key)
if isinstance(nested, str) and nested.strip():
return nested.strip()
return ""
@staticmethod
def _coerce_bool(value: Any) -> Optional[bool]:
if isinstance(value, bool):
return value
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"1", "true", "yes", "on"}:
return True
if normalized in {"0", "false", "no", "off"}:
return False
return None
@staticmethod
def _as_dict(value: Any) -> Dict[str, Any]:
if isinstance(value, dict):
return dict(value)
return {}
@staticmethod
def _drain_queue(queue: "asyncio.Queue[Any]") -> None:
while True:
try:
queue.get_nowait()
except asyncio.QueueEmpty:
break

View File

@@ -0,0 +1 @@
"""Common provider types."""

View File

@@ -0,0 +1 @@
"""Provider factories."""

View File

@@ -0,0 +1,172 @@
"""Default runtime service factory implementing core extension ports."""
from __future__ import annotations
from typing import Any
from loguru import logger
from runtime.ports import (
ASRPort,
ASRServiceSpec,
LLMPort,
LLMServiceSpec,
RealtimeServiceFactory,
TTSPort,
TTSServiceSpec,
)
from providers.asr.buffered import BufferedASRService
from providers.asr.dashscope import DashScopeRealtimeASRService
from providers.asr.volcengine import VolcengineRealtimeASRService
from providers.tts.dashscope import DashScopeTTSService
from providers.llm.openai import MockLLMService, OpenAILLMService
from providers.asr.openai_compatible import OpenAICompatibleASRService
from providers.tts.openai_compatible import OpenAICompatibleTTSService
from providers.tts.mock import MockTTSService
from providers.tts.volcengine import VolcengineTTSService
_OPENAI_COMPATIBLE_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
_DASHSCOPE_PROVIDERS = {"dashscope"}
_VOLCENGINE_PROVIDERS = {"volcengine"}
_SUPPORTED_LLM_PROVIDERS = {"openai", "fastgpt", *_OPENAI_COMPATIBLE_PROVIDERS}
class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
"""Build concrete runtime services from normalized specs."""
_DEFAULT_DASHSCOPE_TTS_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
_DEFAULT_DASHSCOPE_TTS_MODEL = "qwen3-tts-flash-realtime"
_DEFAULT_DASHSCOPE_ASR_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
_DEFAULT_DASHSCOPE_ASR_MODEL = "qwen3-asr-flash-realtime"
_DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
_DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
_DEFAULT_VOLCENGINE_TTS_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
_DEFAULT_VOLCENGINE_TTS_RESOURCE_ID = "seed-tts-2.0"
_DEFAULT_VOLCENGINE_ASR_REALTIME_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
_DEFAULT_VOLCENGINE_ASR_MODEL = "bigmodel"
@staticmethod
def _normalize_provider(provider: Any) -> str:
return str(provider or "").strip().lower()
@staticmethod
def _resolve_dashscope_mode(raw_mode: Any) -> str:
mode = str(raw_mode or "commit").strip().lower()
if mode in {"commit", "server_commit"}:
return mode
return "commit"
def create_llm_service(self, spec: LLMServiceSpec) -> LLMPort:
    """Build the LLM service described by ``spec``.

    Returns a FastGPT service for provider ``fastgpt`` (needs both API key
    and base URL), an OpenAI-style service for the other supported
    providers (needs an API key), and a mock LLM otherwise.
    """
    provider = self._normalize_provider(spec.provider)
    if provider == "fastgpt" and spec.api_key and spec.base_url:
        # Imported here rather than at module level, so the FastGPT provider
        # module is only loaded when it is actually selected.
        from providers.llm.fastgpt import FastGPTLLMService
        return FastGPTLLMService(
            api_key=spec.api_key,
            base_url=spec.base_url,
            app_id=spec.app_id,
            model=spec.model,
            system_prompt=spec.system_prompt,
        )
    if provider in _SUPPORTED_LLM_PROVIDERS and provider != "fastgpt" and spec.api_key:
        return OpenAILLMService(
            api_key=spec.api_key,
            base_url=spec.base_url,
            model=spec.model,
            system_prompt=spec.system_prompt,
            knowledge_config=spec.knowledge_config,
            knowledge_searcher=spec.knowledge_searcher,
        )
    # Unsupported provider or incomplete credentials: fall back to the mock.
    logger.warning(
        "LLM provider unsupported or API key missing (provider={}); using mock LLM",
        provider or "-",
    )
    return MockLLMService()
def create_tts_service(self, spec: TTSServiceSpec) -> TTSPort:
    """Build the TTS service described by ``spec``.

    DashScope, Volcengine and OpenAI-compatible providers each require an
    API key; class-level defaults fill in missing URLs, models and resource
    ids. Any other case falls back to a mock TTS.
    """
    provider = self._normalize_provider(spec.provider)
    if provider == "dashscope" and spec.api_key:
        return DashScopeTTSService(
            api_key=spec.api_key,
            api_url=spec.api_url or self._DEFAULT_DASHSCOPE_TTS_REALTIME_URL,
            voice=spec.voice,
            model=spec.model or self._DEFAULT_DASHSCOPE_TTS_MODEL,
            mode=self._resolve_dashscope_mode(spec.mode),
            sample_rate=spec.sample_rate,
            speed=spec.speed,
        )
    if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
        return VolcengineTTSService(
            api_key=spec.api_key,
            api_url=spec.api_url or self._DEFAULT_VOLCENGINE_TTS_URL,
            voice=spec.voice,
            model=spec.model,
            app_id=spec.app_id,
            resource_id=spec.resource_id or self._DEFAULT_VOLCENGINE_TTS_RESOURCE_ID,
            uid=spec.uid,
            sample_rate=spec.sample_rate,
            speed=spec.speed,
        )
    if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
        return OpenAICompatibleTTSService(
            api_key=spec.api_key,
            api_url=spec.api_url,
            voice=spec.voice,
            model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL,
            sample_rate=spec.sample_rate,
            speed=spec.speed,
        )
    # Unsupported provider or missing API key: fall back to the mock.
    logger.warning(
        "TTS provider unsupported or API key missing (provider={}); using mock TTS",
        provider or "-",
    )
    return MockTTSService(sample_rate=spec.sample_rate)
def create_asr_service(self, spec: ASRServiceSpec) -> ASRPort:
    """Build the ASR service described by ``spec``.

    DashScope, Volcengine and OpenAI-compatible providers each require an
    API key; class-level defaults fill in missing URLs and models. Any
    other case falls back to the buffered ASR service.
    """
    provider = self._normalize_provider(spec.provider)
    if provider in _DASHSCOPE_PROVIDERS and spec.api_key:
        return DashScopeRealtimeASRService(
            api_key=spec.api_key,
            api_url=spec.api_url or self._DEFAULT_DASHSCOPE_ASR_REALTIME_URL,
            model=spec.model or self._DEFAULT_DASHSCOPE_ASR_MODEL,
            sample_rate=spec.sample_rate,
            language=spec.language,
            on_transcript=spec.on_transcript,
        )
    if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
        return VolcengineRealtimeASRService(
            api_key=spec.api_key,
            api_url=spec.api_url or self._DEFAULT_VOLCENGINE_ASR_REALTIME_URL,
            model=spec.model or self._DEFAULT_VOLCENGINE_ASR_MODEL,
            sample_rate=spec.sample_rate,
            language=spec.language,
            app_id=spec.app_id,
            resource_id=spec.resource_id,
            uid=spec.uid,
            request_params=spec.request_params,
            on_transcript=spec.on_transcript,
        )
    if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
        return OpenAICompatibleASRService(
            api_key=spec.api_key,
            api_url=spec.api_url,
            model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL,
            sample_rate=spec.sample_rate,
            language=spec.language,
            enable_interim=spec.enable_interim,
            interim_interval_ms=spec.interim_interval_ms,
            min_audio_for_interim_ms=spec.min_audio_for_interim_ms,
            on_transcript=spec.on_transcript,
        )
    # Unsupported provider or missing API key: fall back to buffered ASR.
    logger.info("Using buffered ASR service (provider={})", provider or "-")
    return BufferedASRService(sample_rate=spec.sample_rate, language=spec.language)

View File

@@ -0,0 +1,14 @@
"""LLM providers."""
from providers.llm.openai import MockLLMService, OpenAILLMService
try: # pragma: no cover - import depends on optional sibling SDK
from providers.llm.fastgpt import FastGPTLLMService
except Exception: # pragma: no cover - provider remains lazily available via factory
FastGPTLLMService = None # type: ignore[assignment]
__all__ = [
"FastGPTLLMService",
"MockLLMService",
"OpenAILLMService",
]

View File

@@ -0,0 +1,553 @@
"""FastGPT-backed LLM provider."""
from __future__ import annotations
import asyncio
import json
import uuid
from typing import Any, AsyncIterator, Dict, List, Optional
from loguru import logger
from providers.common.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState
from providers.llm.fastgpt_types import (
FastGPTConversationState,
FastGPTField,
FastGPTInteractivePrompt,
FastGPTOption,
FastGPTPendingInteraction,
)
try:
from fastgpt_client import AsyncChatClient, aiter_stream_events
except Exception as exc: # pragma: no cover - exercised indirectly via connect()
AsyncChatClient = None # type: ignore[assignment]
aiter_stream_events = None # type: ignore[assignment]
_FASTGPT_IMPORT_ERROR: Optional[Exception] = exc
else: # pragma: no cover - import success depends on local environment
_FASTGPT_IMPORT_ERROR = None
class FastGPTLLMService(BaseLLMService):
"""LLM provider that delegates orchestration to FastGPT."""
INTERACTIVE_TOOL_NAME = "fastgpt.interactive"
INTERACTIVE_TIMEOUT_MS = 300000
def __init__(
    self,
    *,
    api_key: str,
    base_url: str,
    app_id: Optional[str] = None,
    model: str = "fastgpt",
    system_prompt: Optional[str] = None,
):
    """Store configuration; the client itself is created in ``connect()``.

    Args:
        api_key: FastGPT API key.
        base_url: FastGPT base URL; trailing slashes are removed.
        app_id: Optional application id, stored stripped (``""`` when unset).
        model: Model label passed to the base class (``"fastgpt"`` when empty).
        system_prompt: Optional system prompt, stored as ``""`` when unset.
    """
    super().__init__(model=model or "fastgpt")
    self.api_key = api_key
    self.base_url = str(base_url or "").rstrip("/")
    self.app_id = str(app_id or "").strip()
    self.system_prompt = system_prompt or ""
    self.client: Any = None
    # Set by cancel() and checked between stream events.
    self._cancel_event = asyncio.Event()
    self._state = FastGPTConversationState()
    self._knowledge_config: Dict[str, Any] = {}
    self._tool_schemas: List[Dict[str, Any]] = []
async def connect(self) -> None:
    """Create the FastGPT chat client and mark the service connected.

    Raises:
        RuntimeError: If the ``fastgpt_client`` package could not be imported.
        ValueError: If the API key or base URL is missing.
    """
    if AsyncChatClient is None or aiter_stream_events is None:
        # Chain the original import failure so the root cause stays visible.
        raise RuntimeError(
            "fastgpt_client package is not available. "
            "Install the sibling fastgpt-python-sdk package first."
        ) from _FASTGPT_IMPORT_ERROR
    if not self.api_key:
        raise ValueError("FastGPT API key not provided")
    if not self.base_url:
        raise ValueError("FastGPT base URL not provided")
    self.client = AsyncChatClient(api_key=self.api_key, base_url=self.base_url)
    self.state = ServiceState.CONNECTED
    logger.info("FastGPT LLM service connected: base_url={}", self.base_url)
async def disconnect(self) -> None:
    """Close the client if it has a ``close`` method and reset connection state."""
    if self.client and hasattr(self.client, "close"):
        await self.client.close()
    self.client = None
    # Any interaction still awaiting a client reply is abandoned.
    self._state.pending_interaction = None
    self.state = ServiceState.DISCONNECTED
    logger.info("FastGPT LLM service disconnected")
def cancel(self) -> None:
    """Ask any running stream to stop and drop the pending interaction."""
    # The streaming loops check this event between events.
    self._cancel_event.set()
    self._state.pending_interaction = None
def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None:
# FastGPT owns KB orchestration in this provider mode.
self._knowledge_config = dict(config or {})
def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None:
# FastGPT owns workflow and tool orchestration in this provider mode.
self._tool_schemas = list(schemas or [])
def handles_client_tool(self, tool_name: str) -> bool:
return str(tool_name or "").strip() == self.INTERACTIVE_TOOL_NAME
async def get_initial_greeting(self) -> Optional[str]:
    """Fetch the chat-init payload and extract the greeting from it.

    Returns:
        ``None`` when the client is not connected or no ``app_id`` is set;
        otherwise whatever ``_extract_initial_greeting`` returns.

    Raises:
        RuntimeError: If the response has no ``raise_for_status`` method and
            its status code is 400 or above.
    """
    if not self.client or not self.app_id:
        return None
    response = await self.client.get_chat_init(
        appId=self.app_id,
        chatId=self._ensure_chat_id(),
    )
    # Prefer the response's own status check; otherwise inspect status_code.
    raise_for_status = getattr(response, "raise_for_status", None)
    if callable(raise_for_status):
        raise_for_status()
    elif int(getattr(response, "status_code", 200) or 200) >= 400:
        raise RuntimeError(f"FastGPT chat init failed: HTTP {getattr(response, 'status_code', 'unknown')}")
    payload = response.json() if hasattr(response, "json") else {}
    return self._extract_initial_greeting(payload)
async def generate(
    self,
    messages: List[LLMMessage],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
) -> str:
    """Run a streamed completion and return the concatenated text.

    Text deltas are collected until the stream ends or a ``tool_call``
    event arrives, whichever comes first.
    """
    parts: List[str] = []
    async for event in self.generate_stream(messages, temperature=temperature, max_tokens=max_tokens):
        if event.type == "text_delta" and event.text:
            parts.append(event.text)
        if event.type == "tool_call":
            break
    return "".join(parts)
async def generate_stream(
    self,
    messages: List[LLMMessage],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
) -> AsyncIterator[LLMStreamEvent]:
    """Stream a FastGPT chat completion as ``LLMStreamEvent`` objects.

    ``temperature`` and ``max_tokens`` are accepted but unused. The stream
    stops after the first upstream event that yields a ``tool_call``, or
    when ``cancel()`` is called.

    Raises:
        RuntimeError: If the service is not connected.
    """
    del temperature, max_tokens
    if not self.client:
        raise RuntimeError("LLM service not connected")
    self._cancel_event.clear()
    request_messages = self._build_request_messages(messages)
    response = await self.client.create_chat_completion(
        messages=request_messages,
        chatId=self._ensure_chat_id(),
        detail=True,
        stream=True,
    )
    try:
        async for event in aiter_stream_events(response):
            if self._cancel_event.is_set():
                logger.info("FastGPT stream cancelled")
                break
            stop_after_event = False
            # Every event mapped from this upstream event is yielded before stopping.
            for mapped in self._map_stream_event(event):
                if mapped.type == "tool_call":
                    stop_after_event = True
                yield mapped
            if stop_after_event:
                break
    finally:
        # Always release the streaming response, including on early exit.
        await self._close_stream_response(response)
async def resume_after_client_tool_result(
    self,
    tool_call_id: str,
    result: Dict[str, Any],
) -> AsyncIterator[LLMStreamEvent]:
    """Continue the FastGPT chat after the client answered an interaction.

    The pending interaction is validated and cleared, the client result is
    turned into a follow-up user message, and the resulting completion is
    streamed. When the follow-up text is empty, a single ``done`` event is
    emitted and no request is made.

    Args:
        tool_call_id: Id of the tool call the client is responding to.
        result: Client tool result payload.

    Raises:
        RuntimeError: If no client is connected.
        ValueError: Propagated from ``_require_pending_interaction`` or
            ``_build_resume_text``.
    """
    if not self.client:
        raise RuntimeError("LLM service not connected")

    interaction = self._require_pending_interaction(tool_call_id)
    reply_text = self._build_resume_text(interaction, result)
    self._state.pending_interaction = None

    if not reply_text:
        yield LLMStreamEvent(type="done")
        return

    self._cancel_event.clear()
    completion = await self.client.create_chat_completion(
        messages=[{"role": "user", "content": reply_text}],
        chatId=interaction.chat_id,
        detail=True,
        stream=True,
    )
    try:
        async for raw_event in aiter_stream_events(completion):
            if self._cancel_event.is_set():
                logger.info("FastGPT resume stream cancelled")
                break
            hit_tool_call = False
            for out_event in self._map_stream_event(raw_event):
                hit_tool_call = hit_tool_call or out_event.type == "tool_call"
                yield out_event
            # Stop reading once another tool call has been surfaced.
            if hit_tool_call:
                break
    finally:
        await self._close_stream_response(completion)
async def _close_stream_response(self, response: Any) -> None:
if response is None:
return
# httpx async streaming responses must use `aclose()`.
aclose = getattr(response, "aclose", None)
if callable(aclose):
await aclose()
return
close = getattr(response, "close", None)
if callable(close):
maybe_awaitable = close()
if hasattr(maybe_awaitable, "__await__"):
await maybe_awaitable
def _ensure_chat_id(self) -> str:
chat_id = str(self._state.chat_id or "").strip()
if not chat_id:
chat_id = f"fastgpt_{uuid.uuid4().hex}"
self._state.chat_id = chat_id
return chat_id
def _build_request_messages(self, messages: List[LLMMessage]) -> List[Dict[str, Any]]:
"""Reduce the conversation to the message(s) placed in the FastGPT request.

Messages with blank content are ignored. The request carries an optional
trailing system message followed by the most recent user message; when
there is no usable user message, the last non-blank message is sent alone.

Returns:
    A list of message dicts; a single empty user message when every input
    message is blank.
"""
# Ignore messages whose content is empty or whitespace-only.
non_empty = [msg for msg in messages if str(msg.content or "").strip()]
if not non_empty:
return [{"role": "user", "content": ""}]
# Most recent user message, searching from the end of the history.
latest_user = next((msg for msg in reversed(non_empty) if msg.role == "user"), None)
# Only a system message that is the very last entry is forwarded.
trailing_system = non_empty[-1] if non_empty and non_empty[-1].role == "system" else None
request: List[Dict[str, Any]] = []
if trailing_system and trailing_system is not latest_user:
request.append({"role": "system", "content": trailing_system.content.strip()})
if latest_user and str(latest_user.content or "").strip():
request.append({"role": "user", "content": latest_user.content.strip()})
# NOTE(review): this return presumably belongs to the `if latest_user` branch
# above; otherwise the fallback below would be unreachable — confirm.
return request
# Fallback: no user message available, so send the last non-blank message.
last_message = non_empty[-1]
payload = last_message.to_dict()
payload["content"] = str(payload.get("content") or "").strip()
return [payload]
def _extract_initial_greeting(self, payload: Any) -> Optional[str]:
if not isinstance(payload, dict):
return None
candidates: List[Any] = [
payload.get("app"),
payload.get("data"),
]
for container in candidates:
if not isinstance(container, dict):
continue
nested_app = container.get("app") if isinstance(container.get("app"), dict) else None
if nested_app:
text = self._welcome_text_from_app(nested_app)
if text:
return text
text = self._welcome_text_from_app(container)
if text:
return text
return None
@staticmethod
def _welcome_text_from_app(app_payload: Dict[str, Any]) -> Optional[str]:
chat_config = app_payload.get("chatConfig") if isinstance(app_payload.get("chatConfig"), dict) else {}
text = str(
chat_config.get("welcomeText")
or app_payload.get("welcomeText")
or ""
).strip()
return text or None
def _map_stream_event(self, event: Any) -> List[LLMStreamEvent]:
kind = str(getattr(event, "kind", "") or "")
data = getattr(event, "data", {})
if not isinstance(data, dict):
data = {}
if kind in {"data", "answer", "fastAnswer"}:
chunks = self._extract_text_chunks(kind, data)
return [LLMStreamEvent(type="text_delta", text=chunk) for chunk in chunks if chunk]
if kind == "interactive":
return [self._build_interactive_tool_event(data)]
if kind == "error":
message = str(data.get("message") or data.get("error") or "FastGPT streaming error")
raise RuntimeError(message)
if kind == "done":
return [LLMStreamEvent(type="done")]
return []
@staticmethod
def _normalize_interactive_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
normalized = payload
wrapped = normalized.get("interactive")
if isinstance(wrapped, dict):
normalized = wrapped
interaction_type = str(normalized.get("type") or "").strip()
if interaction_type == "toolChildrenInteractive":
params = normalized.get("params") if isinstance(normalized.get("params"), dict) else {}
children_response = params.get("childrenResponse")
if isinstance(children_response, dict):
normalized = children_response
return normalized
def _extract_text_chunks(self, kind: str, data: Dict[str, Any]) -> List[str]:
if kind in {"answer", "fastAnswer"}:
text = str(data.get("text") or "")
if text:
return [text]
choices = data.get("choices") if isinstance(data.get("choices"), list) else []
if not choices:
text = str(data.get("text") or "")
return [text] if text else []
first = choices[0] if isinstance(choices[0], dict) else {}
delta = first.get("delta") if isinstance(first.get("delta"), dict) else {}
if isinstance(delta.get("content"), str) and delta.get("content"):
return [str(delta.get("content"))]
message = first.get("message") if isinstance(first.get("message"), dict) else {}
if isinstance(message.get("content"), str) and message.get("content"):
return [str(message.get("content"))]
return []
def _build_interactive_tool_event(self, payload: Dict[str, Any]) -> LLMStreamEvent:
    """Turn a FastGPT interactive payload into a client ``tool_call`` event.

    The payload is normalized and parsed into a prompt, a fresh
    ``fgi_``-prefixed call id is generated, and the interaction is stored
    on the conversation state as pending so a later client result can be
    matched to it.

    Returns:
        A ``tool_call`` event whose function arguments are the JSON-encoded
        prompt.
    """
    normalized = self._normalize_interactive_payload(payload)
    prompt = self._parse_interactive_prompt(normalized)
    call_id = f"fgi_{uuid.uuid4().hex[:12]}"

    pending = FastGPTPendingInteraction(
        tool_call_id=call_id,
        chat_id=self._ensure_chat_id(),
        prompt=prompt,
        timeout_ms=self.INTERACTIVE_TIMEOUT_MS,
        fastgpt_event=dict(normalized),
    )
    self._state.pending_interaction = pending

    encoded_arguments = json.dumps(
        prompt.to_ws_arguments(chat_id=pending.chat_id),
        ensure_ascii=False,
    )
    # First non-empty of title / description / prompt, else a fixed label.
    display_name = prompt.title or prompt.description or prompt.prompt or "FastGPT Interactive"
    return LLMStreamEvent(
        type="tool_call",
        tool_call={
            "id": call_id,
            "type": "function",
            "executor": "client",
            "wait_for_response": True,
            "timeout_ms": pending.timeout_ms,
            "display_name": display_name,
            "function": {
                "name": self.INTERACTIVE_TOOL_NAME,
                "arguments": encoded_arguments,
            },
        },
    )
@staticmethod
def _parse_option_list(raw_options: Any, id_prefix: str) -> List[FastGPTOption]:
    """Convert a raw option list (strings or dicts) into ``FastGPTOption`` objects.

    Args:
        raw_options: Candidate list; anything that is not a list yields ``[]``.
        id_prefix: Prefix for positional fallback ids (``<prefix><index>``).
    """
    parsed: List[FastGPTOption] = []
    if not isinstance(raw_options, list):
        return parsed
    for index, raw_option in enumerate(raw_options):
        fallback_id = f"{id_prefix}{index}"
        if isinstance(raw_option, str):
            # Bare strings act as both label and value; blanks are dropped.
            value = raw_option.strip()
            if value:
                parsed.append(FastGPTOption(id=fallback_id, label=value, value=value))
            continue
        if not isinstance(raw_option, dict):
            continue
        label = str(raw_option.get("label") or raw_option.get("value") or raw_option.get("id") or "").strip()
        value = str(raw_option.get("value") or raw_option.get("label") or raw_option.get("id") or "").strip()
        if not label and not value:
            continue
        option_id = str(raw_option.get("id") or value or fallback_id).strip()
        parsed.append(
            FastGPTOption(
                id=option_id or fallback_id,
                label=label or value,
                value=value or label,
                description=str(
                    raw_option.get("description")
                    or raw_option.get("desc")
                    or raw_option.get("intro")
                    or raw_option.get("summary")
                    or ""
                ).strip(),
            )
        )
    return parsed

def _parse_form_fields(self, raw_form: Any) -> List[FastGPTField]:
    """Convert a raw ``inputForm`` list into ``FastGPTField`` objects, skipping non-dict entries."""
    fields: List[FastGPTField] = []
    if not isinstance(raw_form, list):
        return fields
    for index, raw_field in enumerate(raw_form):
        if not isinstance(raw_field, dict):
            continue
        fallback_name = f"field_{index}"
        name = str(raw_field.get("key") or raw_field.get("name") or raw_field.get("label") or fallback_name).strip()
        label = str(raw_field.get("label") or raw_field.get("name") or name).strip()
        fields.append(
            FastGPTField(
                name=name or fallback_name,
                label=label or name or fallback_name,
                input_type=str(raw_field.get("type") or raw_field.get("inputType") or "text").strip() or "text",
                required=self._coerce_bool(raw_field.get("required"), default=False),
                placeholder=str(
                    raw_field.get("placeholder")
                    or raw_field.get("description")
                    or raw_field.get("desc")
                    or ""
                ).strip(),
                default=raw_field.get("defaultValue", raw_field.get("default")),
                options=self._parse_option_list(raw_field.get("options"), f"field_{index}_opt_"),
            )
        )
    return fields

def _parse_interactive_prompt(self, payload: Dict[str, Any]) -> FastGPTInteractivePrompt:
    """Parse a normalized FastGPT interactive payload into a prompt object.

    Text fields are looked up under several alternative keys on both the
    payload and its ``params`` dict; the first non-empty value wins. Select
    options come from ``params.userSelectOptions`` and form fields from
    ``params.inputForm``.

    Args:
        payload: Normalized interactive payload.

    Returns:
        The parsed prompt. Its ``kind`` is ``"userInput"`` only when the
        payload type is exactly that; every other type maps to
        ``"userSelect"``.
    """
    params = payload.get("params") if isinstance(payload.get("params"), dict) else {}
    kind = str(payload.get("type") or "userSelect").strip() or "userSelect"
    title = str(
        payload.get("title")
        or params.get("title")
        or payload.get("nodeName")
        or payload.get("label")
        or ""
    ).strip()
    description = str(
        payload.get("description")
        or payload.get("desc")
        or params.get("description")
        or params.get("desc")
        or ""
    ).strip()
    # The spoken/displayed prompt falls back to the title, then the description.
    prompt_text = str(
        payload.get("opener")
        or params.get("opener")
        or payload.get("intro")
        or params.get("intro")
        or payload.get("prompt")
        or params.get("prompt")
        or payload.get("text")
        or params.get("text")
        or title
        or description
    ).strip()
    return FastGPTInteractivePrompt(
        kind="userInput" if kind == "userInput" else "userSelect",
        title=title,
        description=description,
        prompt=prompt_text,
        required=self._coerce_bool(payload.get("required"), default=True),
        multiple=self._coerce_bool(params.get("multiple") or payload.get("multiple"), default=False),
        submit_label=str(params.get("submitText") or payload.get("submitText") or "Continue").strip() or "Continue",
        cancel_label=str(params.get("cancelText") or payload.get("cancelText") or "Cancel").strip() or "Cancel",
        options=self._parse_option_list(params.get("userSelectOptions"), "option_"),
        form=self._parse_form_fields(params.get("inputForm")),
        raw=dict(payload),
    )
def _require_pending_interaction(self, tool_call_id: str) -> FastGPTPendingInteraction:
pending = self._state.pending_interaction
if pending is None or pending.tool_call_id != tool_call_id:
raise ValueError(f"FastGPT interaction not pending for tool call: {tool_call_id}")
return pending
def _build_resume_text(self, pending: FastGPTPendingInteraction, result: Dict[str, Any]) -> str:
status = result.get("status") if isinstance(result.get("status"), dict) else {}
status_code = self._safe_int(status.get("code"), default=0)
output = result.get("output") if isinstance(result.get("output"), dict) else {}
action = str(output.get("action") or "").strip().lower()
if action == "cancel" or status_code == 499:
return ""
if status_code == 422:
raise ValueError("Invalid FastGPT interactive payload from client")
if status_code and not 200 <= status_code < 300:
raise ValueError(f"FastGPT interactive result rejected with status {status_code}")
if action and action != "submit":
raise ValueError(f"Unsupported FastGPT interactive action: {action}")
payload = output.get("result") if isinstance(output.get("result"), dict) else output
if not isinstance(payload, dict):
raise ValueError("FastGPT interactive client result must be an object")
if pending.prompt.kind == "userSelect":
selected = str(payload.get("selected") or "").strip()
if selected:
return selected
selected_values = payload.get("selected_values") if isinstance(payload.get("selected_values"), list) else []
values = [str(item).strip() for item in selected_values if str(item).strip()]
if values:
return ", ".join(values)
text_value = str(payload.get("text") or "").strip()
return text_value
text_value = str(payload.get("text") or "").strip()
if text_value:
return text_value
fields = payload.get("fields") if isinstance(payload.get("fields"), dict) else {}
compact_fields = {str(key): value for key, value in fields.items()}
if compact_fields:
return json.dumps(compact_fields, ensure_ascii=False)
return ""
@staticmethod
def _coerce_bool(value: Any, *, default: bool) -> bool:
if isinstance(value, bool):
return value
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"true", "1", "yes", "on"}:
return True
if normalized in {"false", "0", "no", "off"}:
return False
return default
@staticmethod
def _safe_int(value: Any, *, default: int) -> int:
try:
return int(value)
except (TypeError, ValueError):
return default

View File

@@ -0,0 +1,95 @@
"""FastGPT-specific provider types."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional
# The two interaction kinds a FastGPT interactive prompt can take.
InteractiveKind = Literal["userSelect", "userInput"]
@dataclass(frozen=True)
class FastGPTOption:
    """One selectable choice offered by an interactive prompt or form field."""

    # Identifier of the option.
    id: str
    # Text shown for the option.
    label: str
    # Value associated with the option.
    value: str
    # Optional longer explanation; empty when absent.
    description: str = ""
@dataclass(frozen=True)
class FastGPTField:
    """One input field of an interactive form prompt."""

    # Field key.
    name: str
    # Text shown for the field.
    label: str
    # Input widget type; plain text unless stated otherwise.
    input_type: str = "text"
    # Whether the field is marked as required.
    required: bool = False
    # Hint text for the field; empty when absent.
    placeholder: str = ""
    # Initial value; any type, ``None`` when absent.
    default: Any = None
    # Choices attached to the field; a fresh list per instance.
    options: List[FastGPTOption] = field(default_factory=list)
@dataclass(frozen=True)
class FastGPTInteractivePrompt:
    """A parsed FastGPT interactive request (selection or form input)."""

    # Interaction kind: "userSelect" or "userInput".
    kind: InteractiveKind
    title: str = ""
    description: str = ""
    # Prompt text for the interaction.
    prompt: str = ""
    required: bool = True
    # Whether multiple selections are flagged as allowed.
    multiple: bool = False
    submit_label: str = "Continue"
    cancel_label: str = "Cancel"
    # Choices for selection prompts.
    options: List[FastGPTOption] = field(default_factory=list)
    # Fields for form prompts.
    form: List[FastGPTField] = field(default_factory=list)
    # Copy of the payload this prompt was parsed from.
    raw: Dict[str, Any] = field(default_factory=dict)

    def to_ws_arguments(
        self,
        *,
        turn_id: Optional[str] = None,
        response_id: Optional[str] = None,
        chat_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Build the tool-call arguments payload describing this prompt.

        Args:
            turn_id: Added to ``context`` when non-empty.
            response_id: Added to ``context`` when non-empty.
            chat_id: Added to ``context`` when non-empty.

        Returns:
            A dict with ``provider``, ``version``, ``interaction`` and
            ``context`` keys. Option and field entries are fresh dicts, so
            mutating the result never alters this frozen prompt's options
            or fields.
        """
        context: Dict[str, Any] = {}
        if turn_id:
            context["turn_id"] = turn_id
        if response_id:
            context["response_id"] = response_id
        if chat_id:
            context["chat_id"] = chat_id
        return {
            "provider": "fastgpt",
            "version": "fastgpt_interactive_v1",
            "interaction": {
                "type": self.kind,
                "title": self.title,
                "description": self.description,
                "prompt": self.prompt,
                "required": self.required,
                "multiple": self.multiple,
                "submit_label": self.submit_label,
                "cancel_label": self.cancel_label,
                # ``vars()`` returns the instance's live ``__dict__``; copy
                # it so callers cannot mutate frozen instances through it.
                "options": [dict(vars(item)) for item in self.options],
                "form": [
                    {
                        **vars(item),
                        "options": [dict(vars(option)) for option in item.options],
                    }
                    for item in self.form
                ],
            },
            "context": context,
        }
@dataclass
class FastGPTPendingInteraction:
    """An interactive prompt that is awaiting the client's answer."""

    # Id of the tool call issued for this interaction.
    tool_call_id: str
    # FastGPT chat the interaction belongs to.
    chat_id: str
    # Parsed prompt that was handed to the client.
    prompt: FastGPTInteractivePrompt
    # Timeout value recorded for the interaction, in milliseconds.
    timeout_ms: int
    # FastGPT payload that triggered the interaction.
    fastgpt_event: Dict[str, Any] = field(default_factory=dict)
@dataclass
class FastGPTConversationState:
    """Mutable per-conversation state kept for a FastGPT chat."""

    # FastGPT chat id; ``None`` until one is assigned.
    chat_id: Optional[str] = None
    # Interaction currently awaiting a client result, if any.
    pending_interaction: Optional[FastGPTPendingInteraction] = None

View File

@@ -10,8 +10,8 @@ import uuid
from typing import AsyncIterator, Optional, List, Dict, Any, Callable, Awaitable
from loguru import logger
from app.backend_adapters import build_backend_adapter_from_settings
from services.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState
from adapters.control_plane.backend import build_backend_adapter_from_settings
from providers.common.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState
# Try to import openai
try:
@@ -44,13 +44,13 @@ class OpenAILLMService(BaseLLMService):
Args:
model: Model name (e.g., "gpt-4o-mini", "gpt-4o")
api_key: Provider API key (defaults to LLM_API_KEY/OPENAI_API_KEY env vars)
api_key: Provider API key
base_url: Custom API base URL (for Azure or compatible APIs)
system_prompt: Default system prompt for conversations
"""
super().__init__(model=model)
self.api_key = api_key or os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")
self.api_key = api_key
self.base_url = base_url or os.getenv("LLM_API_URL") or os.getenv("OPENAI_API_URL")
self.system_prompt = system_prompt or (
"You are a helpful, friendly voice assistant. "

View File

@@ -0,0 +1 @@
"""Realtime providers."""

View File

@@ -13,7 +13,6 @@ The Realtime API provides:
- Barge-in/interruption handling
"""
import os
import asyncio
import json
import base64
@@ -98,7 +97,6 @@ class RealtimeService:
config: Realtime configuration (uses defaults if not provided)
"""
self.config = config or RealtimeConfig()
self.config.api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
self.state = RealtimeState.DISCONNECTED
self._ws = None

View File

@@ -0,0 +1,5 @@
"""TTS providers."""
from providers.tts.volcengine import VolcengineTTSService
__all__ = ["VolcengineTTSService"]

View File

@@ -12,7 +12,7 @@ from typing import Any, AsyncIterator, Dict, Optional, Tuple
from loguru import logger
from services.base import BaseTTSService, ServiceState, TTSChunk
from providers.common.base import BaseTTSService, ServiceState, TTSChunk
try:
import dashscope
@@ -89,7 +89,7 @@ class DashScopeTTSService(BaseTTSService):
speed: float = 1.0,
):
super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
self.api_key = api_key or os.getenv("DASHSCOPE_API_KEY") or os.getenv("TTS_API_KEY")
self.api_key = api_key
self.api_url = (
api_url
or os.getenv("DASHSCOPE_TTS_API_URL")

View File

@@ -0,0 +1,49 @@
"""TTS service implementations used by the engine runtime."""
import asyncio
from typing import AsyncIterator
from loguru import logger
from providers.common.base import BaseTTSService, TTSChunk, ServiceState
class MockTTSService(BaseTTSService):
    """Mock TTS service for tests and no-provider fallback.

    Produces zero-filled audio whose length scales with the word count of
    the input text.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0,
    ):
        """Forward the voice, sample rate, and speed to the base service."""
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        """Mark the service as connected; nothing is actually opened."""
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        """Mark the service as disconnected."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Generate silence based on text length.

        Each whitespace-separated word contributes 100 ms; two zero bytes
        are produced per sample.
        """
        words = text.split()
        duration_ms = len(words) * 100
        sample_count = int(self.sample_rate * duration_ms / 1000)
        return bytes(sample_count * 2)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Generate silence chunks to emulate streaming synthesis.

        The audio is cut into chunks of ``sample_rate * 2 // 10`` bytes,
        with a 50 ms pause after each chunk. The last chunk is flagged
        ``is_final``. Text that produces no audio yields no chunks.
        """
        audio = await self.synthesize(text)
        total = len(audio)
        step = self.sample_rate * 2 // 10
        for start in range(0, total, step):
            end = start + step
            yield TTSChunk(
                audio=audio[start:end],
                sample_rate=self.sample_rate,
                is_final=(end >= total),
            )
            await asyncio.sleep(0.05)

View File

@@ -13,8 +13,8 @@ from typing import AsyncIterator, Optional
from urllib.parse import urlparse, urlunparse
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export
from providers.common.base import BaseTTSService, TTSChunk, ServiceState
from providers.tts.streaming_adapter import StreamingTTSAdapter # backward-compatible re-export
class OpenAICompatibleTTSService(BaseTTSService):
@@ -49,7 +49,7 @@ class OpenAICompatibleTTSService(BaseTTSService):
Initialize OpenAI-compatible TTS service.
Args:
api_key: Provider API key (defaults to TTS_API_KEY/SILICONFLOW_API_KEY env vars)
api_key: Provider API key
api_url: Provider API URL (defaults to SiliconFlow endpoint)
voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
model: Model name
@@ -73,7 +73,7 @@ class OpenAICompatibleTTSService(BaseTTSService):
super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
self.api_key = api_key or os.getenv("TTS_API_KEY") or os.getenv("SILICONFLOW_API_KEY")
self.api_key = api_key
self.model = model
raw_api_url = api_url or os.getenv("TTS_API_URL") or "https://api.siliconflow.cn/v1/audio/speech"
self.api_url = self._resolve_speech_endpoint(raw_api_url)

View File

@@ -1,6 +1,6 @@
"""Backward-compatible imports for legacy siliconflow_tts module."""
from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter
from providers.tts.openai_compatible import OpenAICompatibleTTSService, StreamingTTSAdapter
# Backward-compatible alias
SiliconFlowTTSService = OpenAICompatibleTTSService

View File

@@ -4,8 +4,8 @@ import asyncio
from loguru import logger
from services.base import BaseTTSService
from services.streaming_text import extract_tts_sentence, has_spoken_content
from providers.common.base import BaseTTSService
from providers.common.streaming_text import extract_tts_sentence, has_spoken_content
class StreamingTTSAdapter:

Some files were not shown because too many files have changed in this diff Show More