diff --git a/.gitignore b/.gitignore index a9bcc58..cee9c76 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ # OS artifacts .DS_Store -Thumbs.db - -# Workspace runtime data -data/ +Thumbs.db \ No newline at end of file diff --git a/api/app/models.py b/api/app/models.py index 29579f2..aaad83c 100644 --- a/api/app/models.py +++ b/api/app/models.py @@ -127,11 +127,13 @@ class Assistant(Base): speed: Mapped[float] = mapped_column(Float, default=1.0) hotwords: Mapped[dict] = mapped_column(JSON, default=list) tools: Mapped[dict] = mapped_column(JSON, default=list) + asr_interim_enabled: Mapped[bool] = mapped_column(default=False) bot_cannot_be_interrupted: Mapped[bool] = mapped_column(default=False) interruption_sensitivity: Mapped[int] = mapped_column(Integer, default=500) config_mode: Mapped[str] = mapped_column(String(32), default="platform") api_url: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) api_key: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + app_id: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) # 模型关联 llm_model_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) asr_model_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) diff --git a/api/app/routers/asr.py b/api/app/routers/asr.py index b167802..07596a6 100644 --- a/api/app/routers/asr.py +++ b/api/app/routers/asr.py @@ -1,6 +1,14 @@ +import asyncio +import base64 +import io +import json import os +import sys +import threading import time -from typing import List, Optional +import wave +from array import array +from typing import Any, Dict, List, Optional, Tuple import httpx from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile @@ -17,6 +25,32 @@ from ..schemas import ( router = APIRouter(prefix="/asr", tags=["ASR Models"]) OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall" +DASHSCOPE_DEFAULT_ASR_MODEL = "qwen3-asr-flash-realtime" +DASHSCOPE_DEFAULT_BASE_URL = 
"wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + +try: + import dashscope + from dashscope.audio.qwen_omni import MultiModality, OmniRealtimeCallback, OmniRealtimeConversation + + try: + from dashscope.audio.qwen_omni import TranscriptionParams + except ImportError: + from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams + + DASHSCOPE_SDK_AVAILABLE = True + DASHSCOPE_IMPORT_ERROR = "" +except Exception as exc: + dashscope = None # type: ignore[assignment] + MultiModality = None # type: ignore[assignment] + OmniRealtimeConversation = None # type: ignore[assignment] + TranscriptionParams = None # type: ignore[assignment] + DASHSCOPE_SDK_AVAILABLE = False + DASHSCOPE_IMPORT_ERROR = f"{type(exc).__name__}: {exc}" + + class OmniRealtimeCallback: # type: ignore[no-redef] + """Fallback callback base when DashScope SDK is unavailable.""" + + pass def _is_openai_compatible_vendor(vendor: str) -> bool: @@ -29,12 +63,377 @@ def _is_openai_compatible_vendor(vendor: str) -> bool: } +def _is_dashscope_vendor(vendor: str) -> bool: + return (vendor or "").strip().lower() == "dashscope" + + def _default_asr_model(vendor: str) -> str: if _is_openai_compatible_vendor(vendor): return OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL + if _is_dashscope_vendor(vendor): + return DASHSCOPE_DEFAULT_ASR_MODEL return "whisper-1" +def _dashscope_language(language: Optional[str]) -> Optional[str]: + normalized = (language or "").strip().lower() + if not normalized or normalized in {"multi-lingual", "multilingual", "multi_lingual", "auto"}: + return None + if normalized.startswith("zh"): + return "zh" + if normalized.startswith("en"): + return "en" + return normalized + + +class _DashScopePreviewCallback(OmniRealtimeCallback): + """Collect DashScope ASR websocket events for preview/test flows.""" + + def __init__(self) -> None: + super().__init__() + self._open_event = threading.Event() + self._session_ready_event = threading.Event() + self._done_event = threading.Event() + self._lock = 
threading.Lock() + self._final_text = "" + self._last_interim_text = "" + self._error_message: Optional[str] = None + + def on_open(self) -> None: + self._open_event.set() + + def on_close(self, code: int, reason: str) -> None: + if self._done_event.is_set(): + return + self._error_message = f"DashScope websocket closed unexpectedly: {code} {reason}" + self._done_event.set() + self._session_ready_event.set() + + def on_error(self, message: Any) -> None: + self._error_message = str(message) + self._done_event.set() + self._session_ready_event.set() + + def on_event(self, response: Any) -> None: + payload = _coerce_dashscope_event(response) + event_type = str(payload.get("type") or "").strip() + if not event_type: + return + + if event_type in {"session.created", "session.updated"}: + self._session_ready_event.set() + return + + if event_type == "error" or event_type.endswith(".failed"): + self._error_message = _format_dashscope_error_event(payload) + self._done_event.set() + self._session_ready_event.set() + return + + if event_type == "conversation.item.input_audio_transcription.text": + interim_text = _extract_dashscope_text(payload, keys=("stash", "text", "transcript")) + if interim_text: + with self._lock: + self._last_interim_text = interim_text + return + + if event_type == "conversation.item.input_audio_transcription.completed": + final_text = _extract_dashscope_text(payload, keys=("transcript", "text", "stash")) + with self._lock: + if final_text: + self._final_text = final_text + self._done_event.set() + return + + if event_type in {"response.done", "session.finished"}: + self._done_event.set() + + def wait_for_open(self, timeout: float = 10.0) -> None: + if not self._open_event.wait(timeout): + raise TimeoutError("DashScope websocket open timeout") + + def wait_for_session_ready(self, timeout: float = 6.0) -> bool: + return self._session_ready_event.wait(timeout) + + def wait_for_done(self, timeout: float = 20.0) -> None: + if not 
self._done_event.wait(timeout): + raise TimeoutError("DashScope transcription timeout") + + def raise_if_error(self) -> None: + if self._error_message: + raise RuntimeError(self._error_message) + + def read_text(self) -> str: + with self._lock: + return self._final_text or self._last_interim_text + + +def _coerce_dashscope_event(response: Any) -> Dict[str, Any]: + if isinstance(response, dict): + return response + if isinstance(response, str): + try: + parsed = json.loads(response) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + pass + return {"type": "raw", "message": str(response)} + + +def _format_dashscope_error_event(payload: Dict[str, Any]) -> str: + error = payload.get("error") + if isinstance(error, dict): + code = str(error.get("code") or "").strip() + message = str(error.get("message") or "").strip() + if code and message: + return f"{code}: {message}" + return message or str(error) + return str(error or "DashScope realtime ASR error") + + +def _extract_dashscope_text(payload: Dict[str, Any], *, keys: Tuple[str, ...]) -> str: + for key in keys: + value = payload.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + if isinstance(value, dict): + nested = _extract_dashscope_text(value, keys=keys) + if nested: + return nested + + for value in payload.values(): + if isinstance(value, dict): + nested = _extract_dashscope_text(value, keys=keys) + if nested: + return nested + return "" + + +def _create_dashscope_realtime_client( + *, + model: str, + callback: _DashScopePreviewCallback, + url: str, + api_key: str, +) -> Any: + if OmniRealtimeConversation is None: + raise RuntimeError("DashScope SDK unavailable") + + init_kwargs = { + "model": model, + "callback": callback, + "url": url, + } + try: + return OmniRealtimeConversation(api_key=api_key, **init_kwargs) # type: ignore[misc] + except TypeError as exc: + if "api_key" not in str(exc): + raise + return OmniRealtimeConversation(**init_kwargs) # type: 
ignore[misc] + + +def _close_dashscope_client(client: Any) -> None: + finish_fn = getattr(client, "finish", None) + if callable(finish_fn): + try: + finish_fn() + except Exception: + pass + + close_fn = getattr(client, "close", None) + if callable(close_fn): + try: + close_fn() + except Exception: + pass + + +def _configure_dashscope_session( + *, + client: Any, + callback: _DashScopePreviewCallback, + sample_rate: int, + language: Optional[str], +) -> None: + update_fn = getattr(client, "update_session", None) + if not callable(update_fn): + raise RuntimeError("DashScope ASR SDK missing update_session method") + + text_modality: Any = "text" + if MultiModality is not None and hasattr(MultiModality, "TEXT"): + text_modality = MultiModality.TEXT + + transcription_params: Optional[Any] = None + language_hint = _dashscope_language(language) + if TranscriptionParams is not None: + try: + params_kwargs: Dict[str, Any] = { + "sample_rate": sample_rate, + "input_audio_format": "pcm", + } + if language_hint: + params_kwargs["language"] = language_hint + transcription_params = TranscriptionParams(**params_kwargs) + except Exception: + transcription_params = None + + update_attempts = [ + { + "output_modalities": [text_modality], + "enable_turn_detection": False, + "enable_input_audio_transcription": True, + "transcription_params": transcription_params, + }, + { + "output_modalities": [text_modality], + "enable_turn_detection": False, + "enable_input_audio_transcription": True, + }, + { + "output_modalities": [text_modality], + }, + ] + + last_error: Optional[Exception] = None + for params in update_attempts: + if params.get("transcription_params") is None: + params = {key: value for key, value in params.items() if key != "transcription_params"} + try: + update_fn(**params) + callback.wait_for_session_ready() + callback.raise_if_error() + return + except TypeError as exc: + last_error = exc + continue + except Exception as exc: + last_error = exc + continue + + raise 
RuntimeError(f"DashScope ASR session.update failed: {last_error}") + + +def _load_wav_pcm16_mono(audio_bytes: bytes) -> Tuple[bytes, int]: + try: + with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file: + channel_count = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + sample_rate = wav_file.getframerate() + compression = wav_file.getcomptype() + pcm_frames = wav_file.readframes(wav_file.getnframes()) + except wave.Error as exc: + raise RuntimeError("DashScope preview currently supports WAV audio. Record in browser or upload a .wav file.") from exc + + if compression != "NONE": + raise RuntimeError("DashScope preview requires uncompressed PCM WAV audio.") + if sample_width != 2: + raise RuntimeError("DashScope preview requires 16-bit PCM WAV audio.") + if not pcm_frames: + raise RuntimeError("Uploaded WAV file is empty") + if channel_count <= 1: + return pcm_frames, sample_rate + + samples = array("h") + samples.frombytes(pcm_frames) + if sys.byteorder == "big": + samples.byteswap() + + mono_samples = array( + "h", + ( + int(sum(samples[index:index + channel_count]) / channel_count) + for index in range(0, len(samples), channel_count) + ), + ) + if sys.byteorder == "big": + mono_samples.byteswap() + return mono_samples.tobytes(), sample_rate + + +def _probe_dashscope_asr_connection(*, api_key: str, base_url: str, model: str, language: Optional[str]) -> None: + if not DASHSCOPE_SDK_AVAILABLE: + hint = f"`{sys.executable} -m pip install dashscope>=1.25.11`" + detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else "" + raise RuntimeError(f"dashscope package not installed; install with {hint}{detail}") + + callback = _DashScopePreviewCallback() + if dashscope is not None: + dashscope.api_key = api_key + client = _create_dashscope_realtime_client( + model=model, + callback=callback, + url=base_url, + api_key=api_key, + ) + + try: + client.connect() + callback.wait_for_open() + _configure_dashscope_session( + 
client=client, + callback=callback, + sample_rate=16000, + language=language, + ) + finally: + _close_dashscope_client(client) + + +def _transcribe_dashscope_preview( + *, + audio_bytes: bytes, + api_key: str, + base_url: str, + model: str, + language: Optional[str], +) -> Dict[str, Any]: + if not DASHSCOPE_SDK_AVAILABLE: + hint = f"`{sys.executable} -m pip install dashscope>=1.25.11`" + detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else "" + raise RuntimeError(f"dashscope package not installed; install with {hint}{detail}") + + pcm_audio, sample_rate = _load_wav_pcm16_mono(audio_bytes) + callback = _DashScopePreviewCallback() + if dashscope is not None: + dashscope.api_key = api_key + client = _create_dashscope_realtime_client( + model=model, + callback=callback, + url=base_url, + api_key=api_key, + ) + + try: + client.connect() + callback.wait_for_open() + _configure_dashscope_session( + client=client, + callback=callback, + sample_rate=sample_rate, + language=language, + ) + + append_fn = getattr(client, "append_audio", None) + if not callable(append_fn): + raise RuntimeError("DashScope ASR SDK missing append_audio method") + commit_fn = getattr(client, "commit", None) + if not callable(commit_fn): + raise RuntimeError("DashScope ASR SDK missing commit method") + + append_fn(base64.b64encode(pcm_audio).decode("ascii")) + commit_fn() + callback.wait_for_done() + callback.raise_if_error() + return { + "transcript": callback.read_text(), + "language": _dashscope_language(language) or "Multi-lingual", + "confidence": None, + } + finally: + _close_dashscope_client(client) + + # ============ ASR Models CRUD ============ @router.get("") def list_asr_models( @@ -132,6 +531,27 @@ def test_asr_model( start_time = time.time() try: + if _is_dashscope_vendor(model.vendor): + effective_api_key = (model.api_key or "").strip() or os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("ASR_API_KEY", "").strip() + if not effective_api_key: + 
return ASRTestResponse(success=False, error=f"API key is required for ASR model: {model.name}") + + base_url = (model.base_url or "").strip() or DASHSCOPE_DEFAULT_BASE_URL + selected_model = (model.model_name or "").strip() or _default_asr_model(model.vendor) + _probe_dashscope_asr_connection( + api_key=effective_api_key, + base_url=base_url, + model=selected_model, + language=model.language, + ) + latency_ms = int((time.time() - start_time) * 1000) + return ASRTestResponse( + success=True, + language=model.language, + latency_ms=latency_ms, + message="DashScope realtime ASR connected", + ) + # 连接性测试优先,避免依赖真实音频输入 headers = {"Authorization": f"Bearer {model.api_key}"} with httpx.Client(timeout=60.0) as client: @@ -246,7 +666,7 @@ async def preview_asr_model( api_key: Optional[str] = Form(None), db: Session = Depends(get_db), ): - """预览 ASR:上传音频并调用 OpenAI-compatible /audio/transcriptions。""" + """预览 ASR:根据供应商调用 OpenAI-compatible 或 DashScope 实时识别。""" model = db.query(ASRModel).filter(ASRModel.id == id).first() if not model: raise HTTPException(status_code=404, detail="ASR Model not found") @@ -264,18 +684,50 @@ async def preview_asr_model( raise HTTPException(status_code=400, detail="Uploaded audio file is empty") effective_api_key = (api_key or "").strip() or (model.api_key or "").strip() - if not effective_api_key and _is_openai_compatible_vendor(model.vendor): - effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() + if not effective_api_key: + if _is_openai_compatible_vendor(model.vendor): + effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() + elif _is_dashscope_vendor(model.vendor): + effective_api_key = os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("ASR_API_KEY", "").strip() if not effective_api_key: raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}") base_url = (model.base_url or "").strip().rstrip("/") + if _is_dashscope_vendor(model.vendor) and not base_url: + base_url = 
DASHSCOPE_DEFAULT_BASE_URL if not base_url: raise HTTPException(status_code=400, detail=f"Base URL is required for ASR model: {model.name}") selected_model = (model.model_name or "").strip() or _default_asr_model(model.vendor) - data = {"model": selected_model} effective_language = (language or "").strip() or None + + start_time = time.time() + if _is_dashscope_vendor(model.vendor): + try: + payload = await asyncio.to_thread( + _transcribe_dashscope_preview, + audio_bytes=audio_bytes, + api_key=effective_api_key, + base_url=base_url, + model=selected_model, + language=effective_language or model.language, + ) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"DashScope ASR request failed: {exc}") from exc + + transcript = str(payload.get("transcript") or "") + response_language = str(payload.get("language") or effective_language or model.language) + latency_ms = int((time.time() - start_time) * 1000) + return ASRTestResponse( + success=bool(transcript), + transcript=transcript, + language=response_language, + confidence=None, + latency_ms=latency_ms, + message=None if transcript else "No transcript in response", + ) + + data = {"model": selected_model} if effective_language: data["language"] = effective_language if model.hotwords: @@ -284,7 +736,6 @@ async def preview_asr_model( headers = {"Authorization": f"Bearer {effective_api_key}"} files = {"file": (filename, audio_bytes, content_type)} - start_time = time.time() try: with httpx.Client(timeout=90.0) as client: response = client.post( diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py index bf43303..c398cc0 100644 --- a/api/app/routers/assistants.py +++ b/api/app/routers/assistants.py @@ -126,6 +126,12 @@ def _ensure_assistant_schema(db: Session) -> None: if "manual_opener_tool_calls" not in columns: db.execute(text("ALTER TABLE assistants ADD COLUMN manual_opener_tool_calls JSON")) altered = True + if "asr_interim_enabled" not in columns: + db.execute(text("ALTER 
TABLE assistants ADD COLUMN asr_interim_enabled BOOLEAN DEFAULT 0")) + altered = True + if "app_id" not in columns: + db.execute(text("ALTER TABLE assistants ADD COLUMN app_id VARCHAR(255)")) + altered = True if altered: db.commit() @@ -294,7 +300,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s config_mode = str(assistant.config_mode or "platform").strip().lower() - if config_mode in {"dify", "fastgpt"}: + if config_mode == "dify": metadata["services"]["llm"] = { "provider": "openai", "model": "", @@ -305,6 +311,19 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s warnings.append(f"External LLM API URL is empty for mode: {assistant.config_mode}") if not (assistant.api_key or "").strip(): warnings.append(f"External LLM API key is empty for mode: {assistant.config_mode}") + elif config_mode == "fastgpt": + metadata["services"]["llm"] = { + "provider": "fastgpt", + "model": "fastgpt", + "apiKey": assistant.api_key, + "baseUrl": assistant.api_url, + } + if (assistant.app_id or "").strip(): + metadata["services"]["llm"]["appId"] = assistant.app_id + if not (assistant.api_url or "").strip(): + warnings.append(f"FastGPT API URL is empty for mode: {assistant.config_mode}") + if not (assistant.api_key or "").strip(): + warnings.append(f"FastGPT API key is empty for mode: {assistant.config_mode}") elif assistant.llm_model_id: llm = db.query(LLMModel).filter(LLMModel.id == assistant.llm_model_id).first() if llm: @@ -317,18 +336,27 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s else: warnings.append(f"LLM model not found: {assistant.llm_model_id}") + asr_runtime: Dict[str, Any] = { + "enableInterim": bool(assistant.asr_interim_enabled), + } if assistant.asr_model_id: asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first() if asr: - asr_provider = "openai_compatible" if _is_openai_compatible_vendor(asr.vendor) else "buffered" - 
metadata["services"]["asr"] = { + if _is_dashscope_vendor(asr.vendor): + asr_provider = "dashscope" + elif _is_openai_compatible_vendor(asr.vendor): + asr_provider = "openai_compatible" + else: + asr_provider = "buffered" + asr_runtime.update({ "provider": asr_provider, "model": asr.model_name or asr.name, - "apiKey": asr.api_key if asr_provider == "openai_compatible" else None, - "baseUrl": asr.base_url if asr_provider == "openai_compatible" else None, - } + "apiKey": asr.api_key if asr_provider in {"openai_compatible", "dashscope"} else None, + "baseUrl": asr.base_url if asr_provider in {"openai_compatible", "dashscope"} else None, + }) else: warnings.append(f"ASR model not found: {assistant.asr_model_id}") + metadata["services"]["asr"] = asr_runtime if not assistant.voice_output_enabled: metadata["services"]["tts"] = {"enabled": False} @@ -432,11 +460,13 @@ def assistant_to_dict(assistant: Assistant) -> dict: "speed": assistant.speed, "hotwords": assistant.hotwords or [], "tools": _normalize_assistant_tool_ids(assistant.tools), + "asrInterimEnabled": bool(assistant.asr_interim_enabled), "botCannotBeInterrupted": bool(assistant.bot_cannot_be_interrupted), "interruptionSensitivity": assistant.interruption_sensitivity, "configMode": assistant.config_mode, "apiUrl": assistant.api_url, "apiKey": assistant.api_key, + "appId": assistant.app_id, "llmModelId": assistant.llm_model_id, "asrModelId": assistant.asr_model_id, "embeddingModelId": assistant.embedding_model_id, @@ -452,12 +482,14 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None: "firstTurnMode": "first_turn_mode", "manualOpenerToolCalls": "manual_opener_tool_calls", "interruptionSensitivity": "interruption_sensitivity", + "asrInterimEnabled": "asr_interim_enabled", "botCannotBeInterrupted": "bot_cannot_be_interrupted", "configMode": "config_mode", "voiceOutputEnabled": "voice_output_enabled", "generatedOpenerEnabled": "generated_opener_enabled", "apiUrl": "api_url", "apiKey": 
"api_key", + "appId": "app_id", "llmModelId": "llm_model_id", "asrModelId": "asr_model_id", "embeddingModelId": "embedding_model_id", @@ -646,11 +678,13 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)): speed=data.speed, hotwords=data.hotwords, tools=_normalize_assistant_tool_ids(data.tools), + asr_interim_enabled=data.asrInterimEnabled, bot_cannot_be_interrupted=data.botCannotBeInterrupted, interruption_sensitivity=data.interruptionSensitivity, config_mode=data.configMode, api_url=data.apiUrl, api_key=data.apiKey, + app_id=data.appId, llm_model_id=data.llmModelId, asr_model_id=data.asrModelId, embedding_model_id=data.embeddingModelId, diff --git a/api/app/schemas.py b/api/app/schemas.py index 9bf2274..5778982 100644 --- a/api/app/schemas.py +++ b/api/app/schemas.py @@ -191,6 +191,7 @@ class ASRModelCreate(ASRModelBase): class ASRModelUpdate(BaseModel): name: Optional[str] = None + vendor: Optional[str] = None language: Optional[str] = None base_url: Optional[str] = None api_key: Optional[str] = None @@ -291,11 +292,13 @@ class AssistantBase(BaseModel): speed: float = 1.0 hotwords: List[str] = [] tools: List[str] = [] + asrInterimEnabled: bool = False botCannotBeInterrupted: bool = False interruptionSensitivity: int = 500 configMode: str = "platform" apiUrl: Optional[str] = None apiKey: Optional[str] = None + appId: Optional[str] = None # 模型关联 llmModelId: Optional[str] = None asrModelId: Optional[str] = None @@ -322,11 +325,13 @@ class AssistantUpdate(BaseModel): speed: Optional[float] = None hotwords: Optional[List[str]] = None tools: Optional[List[str]] = None + asrInterimEnabled: Optional[bool] = None botCannotBeInterrupted: Optional[bool] = None interruptionSensitivity: Optional[int] = None configMode: Optional[str] = None apiUrl: Optional[str] = None apiKey: Optional[str] = None + appId: Optional[str] = None llmModelId: Optional[str] = None asrModelId: Optional[str] = None embeddingModelId: Optional[str] = None diff --git 
a/api/init_db.py b/api/init_db.py index e3373f6..162eb99 100644 --- a/api/init_db.py +++ b/api/init_db.py @@ -34,6 +34,7 @@ SEED_LLM_IDS = { SEED_ASR_IDS = { "sensevoice_small": short_id("asr"), "telespeech_asr": short_id("asr"), + "dashscope_realtime": short_id("asr"), } SEED_ASSISTANT_IDS = { @@ -408,6 +409,20 @@ def init_default_asr_models(): enable_normalization=True, enabled=True, ), + ASRModel( + id=SEED_ASR_IDS["dashscope_realtime"], + user_id=1, + name="DashScope Realtime ASR", + vendor="DashScope", + language="Multi-lingual", + base_url=DASHSCOPE_REALTIME_URL, + api_key="YOUR_API_KEY", + model_name="qwen3-asr-flash-realtime", + hotwords=[], + enable_punctuation=True, + enable_normalization=True, + enabled=True, + ), ] seed_if_empty(db, ASRModel, asr_models, "✅ 默认ASR模型已初始化") diff --git a/api/tests/test_asr.py b/api/tests/test_asr.py index 209116c..1cd3c01 100644 --- a/api/tests/test_asr.py +++ b/api/tests/test_asr.py @@ -1,8 +1,21 @@ """Tests for ASR Model API endpoints""" +import io +import wave + import pytest from unittest.mock import patch, MagicMock +def _make_wav_bytes(sample_rate: int = 16000) -> bytes: + with io.BytesIO() as buffer: + with wave.open(buffer, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(b"\x00\x00" * sample_rate) + return buffer.getvalue() + + class TestASRModelAPI: """Test cases for ASR Model endpoints""" @@ -75,6 +88,24 @@ class TestASRModelAPI: assert data["language"] == "en" assert data["enable_punctuation"] == False + def test_update_asr_model_vendor(self, client, sample_asr_model_data): + """Test updating ASR vendor metadata.""" + create_response = client.post("/api/asr", json=sample_asr_model_data) + model_id = create_response.json()["id"] + + response = client.put( + f"/api/asr/{model_id}", + json={ + "vendor": "DashScope", + "model_name": "qwen3-asr-flash-realtime", + "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime", + }, + ) 
+ assert response.status_code == 200 + data = response.json() + assert data["vendor"] == "DashScope" + assert data["model_name"] == "qwen3-asr-flash-realtime" + def test_delete_asr_model(self, client, sample_asr_model_data): """Test deleting an ASR model""" # Create first @@ -234,6 +265,28 @@ class TestASRModelAPI: response = client.post(f"/api/asr/{model_id}/test") assert response.status_code == 200 + def test_test_asr_model_dashscope(self, client, sample_asr_model_data, monkeypatch): + """Test DashScope ASR connectivity probe.""" + from app.routers import asr as asr_router + + sample_asr_model_data["vendor"] = "DashScope" + sample_asr_model_data["base_url"] = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + sample_asr_model_data["model_name"] = "qwen3-asr-flash-realtime" + create_response = client.post("/api/asr", json=sample_asr_model_data) + model_id = create_response.json()["id"] + + def fake_probe(**kwargs): + assert kwargs["api_key"] == sample_asr_model_data["api_key"] + assert kwargs["model"] == "qwen3-asr-flash-realtime" + + monkeypatch.setattr(asr_router, "_probe_dashscope_asr_connection", fake_probe) + + response = client.post(f"/api/asr/{model_id}/test") + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert data["message"] == "DashScope realtime ASR connected" + @patch('httpx.Client') def test_test_asr_model_failure(self, mock_client_class, client, sample_asr_model_data): """Test testing an ASR model with failed connection""" @@ -274,7 +327,7 @@ class TestASRModelAPI: def test_different_asr_vendors(self, client): """Test creating ASR models with different vendors""" - vendors = ["SiliconFlow", "OpenAI", "Azure"] + vendors = ["SiliconFlow", "OpenAI", "Azure", "DashScope"] for vendor in vendors: data = { "id": f"asr-vendor-{vendor.lower()}", @@ -345,3 +398,33 @@ class TestASRModelAPI: ) assert response.status_code == 400 assert "Only audio files are supported" in response.text + + def 
test_preview_asr_model_dashscope(self, client, sample_asr_model_data, monkeypatch): + """Test ASR preview endpoint with DashScope realtime helper.""" + from app.routers import asr as asr_router + + sample_asr_model_data["vendor"] = "DashScope" + sample_asr_model_data["base_url"] = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + sample_asr_model_data["model_name"] = "qwen3-asr-flash-realtime" + create_response = client.post("/api/asr", json=sample_asr_model_data) + model_id = create_response.json()["id"] + + def fake_preview(**kwargs): + assert kwargs["base_url"] == sample_asr_model_data["base_url"] + assert kwargs["model"] == sample_asr_model_data["model_name"] + return { + "transcript": "你好,这是实时识别", + "language": "zh", + "confidence": None, + } + + monkeypatch.setattr(asr_router, "_transcribe_dashscope_preview", fake_preview) + + response = client.post( + f"/api/asr/{model_id}/preview", + files={"file": ("sample.wav", _make_wav_bytes(), "audio/wav")}, + ) + assert response.status_code == 200 + payload = response.json() + assert payload["success"] is True + assert payload["transcript"] == "你好,这是实时识别" diff --git a/api/tests/test_assistants.py b/api/tests/test_assistants.py index 0d880ef..eaab5b5 100644 --- a/api/tests/test_assistants.py +++ b/api/tests/test_assistants.py @@ -27,7 +27,9 @@ class TestAssistantAPI: assert data["voiceOutputEnabled"] is True assert data["firstTurnMode"] == "bot_first" assert data["generatedOpenerEnabled"] is False + assert data["asrInterimEnabled"] is False assert data["botCannotBeInterrupted"] is False + assert data["appId"] is None assert "id" in data assert data["callCount"] == 0 @@ -37,6 +39,7 @@ class TestAssistantAPI: response = client.post("/api/assistants", json=data) assert response.status_code == 200 assert response.json()["name"] == "Minimal Assistant" + assert response.json()["asrInterimEnabled"] is False def test_get_assistant_by_id(self, client, sample_assistant_data): """Test getting a specific assistant by ID""" @@ 
-68,6 +71,7 @@ class TestAssistantAPI: "prompt": "You are an updated assistant.", "speed": 1.5, "voiceOutputEnabled": False, + "asrInterimEnabled": True, "manualOpenerToolCalls": [ {"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}} ], @@ -79,6 +83,7 @@ class TestAssistantAPI: assert data["prompt"] == "You are an updated assistant." assert data["speed"] == 1.5 assert data["voiceOutputEnabled"] is False + assert data["asrInterimEnabled"] is True assert data["manualOpenerToolCalls"] == [ {"toolName": "text_msg_prompt", "arguments": {"msg": "请选择服务类型"}} ] @@ -213,6 +218,7 @@ class TestAssistantAPI: "prompt": "runtime prompt", "opener": "runtime opener", "manualOpenerToolCalls": [{"toolName": "text_msg_prompt", "arguments": {"msg": "欢迎"}}], + "asrInterimEnabled": True, "speed": 1.1, }) assistant_resp = client.post("/api/assistants", json=sample_assistant_data) @@ -232,6 +238,7 @@ class TestAssistantAPI: assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"] assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"] assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"] + assert metadata["services"]["asr"]["enableInterim"] is True expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}" assert metadata["services"]["tts"]["voice"] == expected_tts_voice assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"] @@ -309,6 +316,7 @@ class TestAssistantAPI: assert runtime_resp.status_code == 200 metadata = runtime_resp.json()["sessionStartMetadata"] assert metadata["output"]["mode"] == "text" + assert metadata["services"]["asr"]["enableInterim"] is False assert metadata["services"]["tts"]["enabled"] is False def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data): @@ -343,6 +351,48 @@ class TestAssistantAPI: assert tts["apiKey"] == "dashscope-key" assert tts["baseUrl"] == 
"wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + def test_runtime_config_dashscope_asr_provider(self, client, sample_assistant_data): + """DashScope ASR models should map to dashscope asr provider in runtime metadata.""" + asr_resp = client.post("/api/asr", json={ + "name": "DashScope Realtime ASR", + "vendor": "DashScope", + "language": "zh", + "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime", + "api_key": "dashscope-asr-key", + "model_name": "qwen3-asr-flash-realtime", + "hotwords": [], + "enable_punctuation": True, + "enable_normalization": True, + "enabled": True, + }) + assert asr_resp.status_code == 200 + asr_payload = asr_resp.json() + + sample_assistant_data.update({ + "asrModelId": asr_payload["id"], + }) + assistant_resp = client.post("/api/assistants", json=sample_assistant_data) + assert assistant_resp.status_code == 200 + assistant_id = assistant_resp.json()["id"] + + runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config") + assert runtime_resp.status_code == 200 + metadata = runtime_resp.json()["sessionStartMetadata"] + asr = metadata["services"]["asr"] + assert asr["provider"] == "dashscope" + assert asr["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + assert asr["enableInterim"] is False + + def test_runtime_config_defaults_asr_interim_disabled_without_asr_model(self, client, sample_assistant_data): + assistant_resp = client.post("/api/assistants", json=sample_assistant_data) + assert assistant_resp.status_code == 200 + assistant_id = assistant_resp.json()["id"] + + runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config") + assert runtime_resp.status_code == 200 + metadata = runtime_resp.json()["sessionStartMetadata"] + assert metadata["services"]["asr"]["enableInterim"] is False + def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data): sample_assistant_data.update({ "firstTurnMode": "user_first", @@ -370,3 +420,21 @@ class TestAssistantAPI: 
assert metadata["greeting"] == "" assert metadata["bargeIn"]["enabled"] is False assert metadata["bargeIn"]["minDurationMs"] == 900 + + def test_fastgpt_app_id_persists_and_flows_to_runtime(self, client, sample_assistant_data): + sample_assistant_data.update({ + "configMode": "fastgpt", + "apiUrl": "https://cloud.fastgpt.cn/api", + "apiKey": "fastgpt-key", + "appId": "app-fastgpt-123", + }) + assistant_resp = client.post("/api/assistants", json=sample_assistant_data) + assert assistant_resp.status_code == 200 + assistant_id = assistant_resp.json()["id"] + assert assistant_resp.json()["appId"] == "app-fastgpt-123" + + runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config") + assert runtime_resp.status_code == 200 + metadata = runtime_resp.json()["sessionStartMetadata"] + assert metadata["services"]["llm"]["provider"] == "fastgpt" + assert metadata["services"]["llm"]["appId"] == "app-fastgpt-123" diff --git a/docs/content/analysis/evaluation.md b/docs/content/analysis/evaluation.md index afe6816..91ec136 100644 --- a/docs/content/analysis/evaluation.md +++ b/docs/content/analysis/evaluation.md @@ -163,4 +163,4 @@ - [自动化测试](autotest.md) - 批量测试助手 - [历史记录](history.md) - 查看对话详情 -- [提示词指南](../assistants/prompts.md) - 优化提示词 +- [提示词指南](../concepts/assistants/prompts.md) - 优化提示词 diff --git a/docs/content/api-reference/index.md b/docs/content/api-reference/index.md index 1f22bd2..2ff42df 100644 --- a/docs/content/api-reference/index.md +++ b/docs/content/api-reference/index.md @@ -1,4 +1,4 @@ -# API 参考 +# API 参考 本节提供 Realtime Agent Studio (RAS) 的完整 API 文档。 @@ -163,6 +163,8 @@ WebSocket API 使用双向消息通信: ## SDK +> 下面的 SDK 包名和类名沿用当前包标识;产品名称在文档中统一使用 Realtime Agent Studio(RAS)。 + ### JavaScript SDK ```bash @@ -230,3 +232,4 @@ async with client.connect(assistant.id) as conv: - [WebSocket 协议](websocket.md) - 实时对话协议详解 - [错误码](errors.md) - 错误处理参考 - [快速开始](../quickstart/index.md) - 快速创建助手 + diff --git a/docs/content/assistants/configuration.md 
b/docs/content/assistants/configuration.md index 962c63f..f2405df 100644 --- a/docs/content/assistants/configuration.md +++ b/docs/content/assistants/configuration.md @@ -1,218 +1,8 @@ -# 配置选项 +# 配置选项(旧入口) -助手配置界面包含多个标签页,每个标签页负责不同方面的配置。 +本页保留旧链接,用于承接历史导航或外部引用。助手配置的正式文档已经迁移到: -## 全局设置 +- [配置选项](../concepts/assistants/configuration.md) - 助手配置界面与运行时配置层说明 +- [助手概念](../concepts/assistants.md) - 先理解助手对象、会话与动态变量 -全局设置定义助手的核心对话能力。 - -| 配置项 | 说明 | 建议值 | -|-------|------|--------| -| 助手名称 | 用于标识和管理 | 简洁明确 | -| 系统提示词 | 定义角色、任务和约束 | 详见[提示词指南](prompts.md) | -| 开场白 | 对话开始时的问候语 | 简短友好 | -| 温度参数 | 控制回复随机性 | 0.7(通用)/ 0.3(严谨) | -| 上下文长度 | 保留的历史消息数 | 10-20 | - -### 高级选项 - -- **首轮模式** - 设置首次对话的触发方式 -- **打断检测** - 用户打断时的处理策略 -- **超时设置** - 无响应时的处理 - -## 语音配置 - -配置语音识别和语音合成参数。 - -### TTS 语音合成 - -| 配置 | 说明 | -|------|------| -| TTS 引擎 | 选择语音合成服务(阿里/火山/Minimax) | -| 音色 | 选择语音风格和性别 | -| 语速 | 语音播放速度(0.5-2.0) | -| 音量 | 语音输出音量(0-100) | -| 音调 | 语音音调高低(0.5-2.0) | - -### ASR 语音识别 - -| 配置 | 说明 | -|------|------| -| ASR 引擎 | 选择语音识别服务 | -| 语言 | 识别语言(中文/英文/多语言) | -| 热词 | 提高特定词汇识别准确率 | - -## 工具绑定 - -配置助手可调用的外部工具。 - -### 可用工具类型 - -| 工具 | 说明 | -|------|------| -| 搜索工具 | 网络搜索获取信息 | -| 天气查询 | 查询天气预报 | -| 计算器 | 数学计算 | -| 知识库检索 | RAG 知识检索 | -| 自定义工具 | HTTP 回调外部 API | - -### 配置步骤 - -1. 在工具列表中勾选需要的工具 -2. 配置工具参数(如有) -3. 测试工具调用是否正常 - -## 知识关联 - -关联 RAG 知识库,让助手能够回答专业领域问题。 - -### 配置参数 - -| 参数 | 说明 | 建议值 | -|------|------|--------| -| 知识库 | 选择要关联的知识库 | - | -| 相似度阈值 | 低于此分数不返回 | 0.7 | -| 返回数量 | 单次检索返回条数 | 3 | -| 检索策略 | 混合/向量/关键词 | 混合 | - -### 多知识库 - -支持关联多个知识库,系统会自动合并检索结果。 - -## 外部链接 - -配置第三方服务集成和 Webhook 回调。 - -### Webhook 配置 - -| 字段 | 说明 | -|------|------| -| 回调 URL | 接收事件的 HTTP 端点 | -| 事件类型 | 订阅的事件(对话开始/结束/工具调用等) | -| 认证方式 | API Key / Bearer Token / 无 | - -### 支持的事件 - -- `conversation.started` - 对话开始 -- `conversation.ended` - 对话结束 -- `tool.called` - 工具被调用 -- `human.transfer` - 转人工 - -## 配置持久化与运行时覆盖 - -助手配置分为两层: - -1. **数据库持久化配置(基线配置)**:通过助手管理 API 保存,后续会话默认读取这一层。 -2. 
**会话级覆盖配置(runtime overrides)**:仅对当前 WebSocket 会话生效,不会写回数据库。 - -### 哪些配置会存到数据库 - -以下字段会持久化在 `assistants` / `assistant_opener_audio` 等表中(通过创建/更新助手写入): - -| 类别 | 典型字段 | -|------|---------| -| 对话行为 | `name`、`prompt`、`opener`、`firstTurnMode`、`generatedOpenerEnabled` | -| 输出与打断 | `voiceOutputEnabled`、`voice`、`speed`、`botCannotBeInterrupted`、`interruptionSensitivity` | -| 工具与知识库 | `tools`、`knowledgeBaseId` | -| 模型与外部模式 | `configMode`、`apiUrl`、`apiKey`、`llmModelId`、`asrModelId`、`embeddingModelId`、`rerankModelId` | -| 开场音频 | `openerAudioEnabled` 及音频文件状态(`ready`、`durationMs` 等) | - -> 引擎在连接时通过 `assistant_id` 从后端读取该助手的 `sessionStartMetadata` 作为默认运行配置。 - -### 哪些配置可以在会话中覆盖 - -客户端可在 `session.start.metadata.overrides` 中覆盖以下白名单字段(仅当前会话有效): - -- `systemPrompt` -- `greeting` -- `firstTurnMode` -- `generatedOpenerEnabled` -- `output` -- `bargeIn` -- `knowledgeBaseId` -- `knowledge` -- `tools` -- `openerAudio` - -以下字段不能由客户端覆盖: - -- `services`(模型 provider / apiKey / baseUrl 等) -- `assistantId` / `appId` / `configVersionId`(及下划线变体) -- 包含密钥语义的字段(如 `apiKey`、`token`、`secret`、`password`、`authorization`) - -### 覆盖示例(代码) - -下面示例展示「数据库基线配置 + 会话 overrides」的最终效果。 - -```json -// 1) 数据库存储的基线配置(示意) -// GET /api/v1/assistants/asst_demo/config -> sessionStartMetadata -{ - "systemPrompt": "你是电商客服助手,回答要简洁。", - "greeting": "你好,我是你的客服助手。", - "firstTurnMode": "bot_first", - "output": { "mode": "audio" }, - "knowledgeBaseId": "kb_orders", - "tools": [ - { "type": "function", "function": { "name": "query_order" } } - ] -} -``` - -```json -// 2) 客户端发起会话时的覆盖 -{ - "type": "session.start", - "metadata": { - "channel": "web", - "history": { "userId": 1001 }, - "overrides": { - "greeting": "你好,我来帮你查订单进度。", - "output": { "mode": "text" }, - "knowledgeBaseId": "kb_vip_orders", - "tools": [ - { "type": "function", "function": { "name": "query_vip_order" } } - ] - } - } -} -``` - -```json -// 3) 引擎合并后的有效配置(示意) -{ - "assistantId": "asst_demo", - "systemPrompt": "你是电商客服助手,回答要简洁。", - "greeting": "你好,我来帮你查订单进度。", - 
"firstTurnMode": "bot_first", - "output": { "mode": "text" }, - "knowledgeBaseId": "kb_vip_orders", - "tools": [ - { "type": "function", "function": { "name": "query_vip_order" } } - ], - "channel": "web", - "history": { "userId": 1001 } -} -``` - -合并规则可简化为: - -```python -effective = {**db_session_start_metadata, **metadata.overrides} -``` - -当 `WS_EMIT_CONFIG_RESOLVED=true` 时,服务端会返回 `config.resolved`(公开、安全裁剪后的快照)用于前端调试当前生效配置。 - -## 配置导入导出 - -### 导出配置 - -1. 在助手详情页点击 **更多** -2. 选择 **导出配置** -3. 下载 JSON 格式的配置文件 - -### 导入配置 - -1. 点击 **新建助手** -2. 选择 **从配置导入** -3. 上传配置文件 +如果你是从创建路径进入,也可以直接回到 [快速开始](../quickstart/index.md)。 diff --git a/docs/content/assistants/index.md b/docs/content/assistants/index.md index 110a8dd..ea0611d 100644 --- a/docs/content/assistants/index.md +++ b/docs/content/assistants/index.md @@ -1,57 +1,10 @@ -# 助手管理 +# 助手管理(旧入口) -助手是 Realtime Agent Studio (RAS) 的核心模块,用于创建和配置智能对话机器人。每个助手都可以独立配置提示词、语音、知识库和工具。 +本页保留旧链接,用于承接历史导航或外部引用。助手相关内容已经拆分到更明确的文档中: -## 概述 +- [助手概念](../concepts/assistants.md) - 了解助手是什么、由哪些部分组成,以及会话如何运行 +- [配置选项](../concepts/assistants/configuration.md) - 查看控制台和运行时配置项的分工 +- [提示词指南](../concepts/assistants/prompts.md) - 编写高质量系统提示词 +- [测试调试](../concepts/assistants/testing.md) - 验证助手行为并排查问题 -![助手管理](../images/assistants.png) - -## 助手能力 - -| 能力 | 说明 | -|------|------| -| **智能对话** | 基于 LLM 的自然语言理解和生成 | -| **语音交互** | 支持语音识别和语音合成 | -| **知识检索** | 关联知识库回答专业问题 | -| **工具调用** | 调用外部 API 执行操作 | -| **工作流** | 支持复杂的多轮对话流程 | - -## 创建助手 - -### 步骤 - -1. 进入 **助手管理** 页面 -2. 点击 **新建助手** 按钮 -3. 填写基本信息 -4. 配置各项参数 -5. 保存并发布 - -### 基本信息 - -| 配置项 | 说明 | -|-------|------| -| 助手名称 | 唯一标识,用于区分不同助手 | -| 提示词 | 定义助手的角色和行为 | -| 温度参数 | 控制回复的随机性(0-1) | - -## 调试助手 - -在助手详情页可进行实时调试: - -- **文本对话测试** - 快速验证回复质量 -- **语音输入测试** - 测试 ASR 识别效果 -- **工具调用验证** - 确认工具正常执行 - -## 发布助手 - -配置完成后: - -1. 点击 **保存** - 保存当前配置 -2. 点击 **发布** - 发布到生产环境 -3. 
获取 API 调用地址 - 用于集成 - -## 下一步 - -- [配置选项](configuration.md) - 详细的配置标签页说明 -- [提示词指南](prompts.md) - 如何编写高质量的系统提示词 -- [测试调试](testing.md) - 助手测试与问题排查 +如果你是第一次上手,建议直接从 [快速开始](../quickstart/index.md) 进入。 diff --git a/docs/content/assistants/prompts.md b/docs/content/assistants/prompts.md index d359111..466339d 100644 --- a/docs/content/assistants/prompts.md +++ b/docs/content/assistants/prompts.md @@ -1,184 +1,8 @@ -# 提示词指南 +# 提示词指南(旧入口) -系统提示词(System Prompt)是定义助手行为的核心配置。本指南介绍如何编写高质量的提示词。 +本页保留旧链接,用于承接历史导航或外部引用。提示词的正式文档已经迁移到: -## 提示词结构 +- [提示词指南](../concepts/assistants/prompts.md) - 设计角色、任务、限制与风格 +- [助手概念](../concepts/assistants.md) - 理解提示词在助手体系中的位置 -一个完整的系统提示词通常包含以下部分: - -``` -[角色定义] -[任务描述] -[行为约束] -[输出格式] -[示例(可选)] -``` - -## 编写原则 - -### 1. 明确角色 - -告诉助手它是谁: - -``` -你是一个专业的技术支持工程师,专门负责解答产品使用问题。 -``` - -### 2. 定义任务 - -明确助手需要完成什么: - -``` -你的主要任务是: -1. 解答用户关于产品功能的问题 -2. 提供使用指导和最佳实践 -3. 帮助用户排查常见故障 -``` - -### 3. 设置约束 - -限制不希望出现的行为: - -``` -请注意: -- 不要讨论与产品无关的话题 -- 不要编造不存在的功能 -- 如果不确定答案,请建议用户联系人工客服 -``` - -### 4. 指定风格 - -定义回复的语气和风格: - -``` -回复风格要求: -- 使用友好、专业的语气 -- 回答简洁明了,避免冗长 -- 适当使用列表和步骤说明 -``` - -## 提示词模板 - -### 客服助手 - -``` -你是 [公司名称] 的智能客服助手。 - -## 你的职责 -- 解答用户关于产品和服务的问题 -- 处理常见的投诉和建议 -- 引导用户完成操作流程 - -## 回复要求 -- 保持友好和耐心 -- 回答简洁,一般不超过 3 句话 -- 如果问题复杂,建议转接人工客服 - -## 禁止行为 -- 不要讨论竞争对手 -- 不要承诺无法兑现的事项 -- 不要透露内部信息 -``` - -### 技术支持 - -``` -你是一个技术支持工程师,专门帮助用户解决技术问题。 - -## 工作流程 -1. 首先了解用户遇到的具体问题 -2. 询问必要的环境信息(系统版本、错误信息等) -3. 提供分步骤的解决方案 -4. 
确认问题是否解决 - -## 回复格式 -- 使用编号列表说明操作步骤 -- 提供代码示例时使用代码块 -- 复杂问题可以分多次回复 -``` - -### 销售顾问 - -``` -你是一个产品销售顾问,帮助用户了解产品并做出购买决策。 - -## 沟通策略 -- 先了解用户需求,再推荐合适的产品 -- 突出产品优势,但不贬低竞品 -- 提供真实的价格和优惠信息 - -## 目标 -- 帮助用户找到最适合的方案 -- 解答购买相关的疑问 -- 促进成交但不过度推销 -``` - -## 动态变量 - -提示词支持动态变量,使用 `{{变量名}}` 语法: - -``` -你好 {{customer_name}},欢迎来到 {{company_name}}。 -你当前的会员等级是 {{membership_tier}}。 -``` - -在 `session.start` 时通过 `dynamicVariables` 传入: - -```json -{ - "type": "session.start", - "metadata": { - "dynamicVariables": { - "customer_name": "张三", - "company_name": "AI 公司", - "membership_tier": "黄金会员" - } - } -} -``` - -## 常见问题 - -### 回复太长 - -在提示词中明确限制: - -``` -回复长度要求: -- 一般问题:1-2 句话 -- 复杂问题:不超过 5 句话 -- 避免重复和冗余内容 -``` - -### 答非所问 - -增加任务边界说明: - -``` -重要提示: -- 只回答与 [产品/服务] 相关的问题 -- 对于无关问题,礼貌地拒绝并引导回正题 -``` - -### 编造信息 - -强调诚实原则: - -``` -信息准确性要求: -- 只提供你确定的信息 -- 不确定时说"我不太确定,建议您..." -- 绝对不要编造数据或功能 -``` - -## 最佳实践 - -1. **迭代优化** - 根据实际对话效果持续调整 -2. **测试覆盖** - 用各种场景测试提示词效果 -3. **版本管理** - 保存历史版本,便于回退 -4. **定期复盘** - 分析对话记录,发现改进点 - -## 下一步 - -- [测试调试](testing.md) - 验证提示词效果 -- [知识库配置](../customization/knowledge-base.md) - 补充专业知识 +如果你想先完成最小可用配置,请从 [快速开始](../quickstart/index.md) 继续。 diff --git a/docs/content/assistants/testing.md b/docs/content/assistants/testing.md index ca4bd06..5b1a039 100644 --- a/docs/content/assistants/testing.md +++ b/docs/content/assistants/testing.md @@ -1,162 +1,8 @@ -# 测试调试 +# 测试调试(旧入口) -本指南介绍如何测试和调试 AI 助手,确保其行为符合预期。 +本页保留旧链接,用于承接历史导航或外部引用。测试与调试的正式文档已经迁移到: -## 测试面板 +- [测试调试](../concepts/assistants/testing.md) - 验证助手行为、事件流和常见问题定位 +- [故障排查](../resources/troubleshooting.md) - 进入更细的链路排查步骤 -在助手详情页,点击 **测试** 按钮打开测试面板。 - -### 功能介绍 - -| 功能 | 说明 | -|------|------| -| 文本对话 | 直接输入文字进行测试 | -| 语音测试 | 使用麦克风进行语音对话 | -| 查看日志 | 实时查看系统日志 | -| 事件追踪 | 查看 WebSocket 事件流 | - -## 测试用例设计 - -### 基础功能测试 - -| 测试项 | 输入 | 预期结果 | -|--------|------|---------| -| 问候响应 | "你好" | 友好的问候回复 | -| 功能介绍 | "你能做什么?" 
| 准确描述能力范围 | -| 开场白 | 连接后自动 | 播放配置的开场白 | - -### 业务场景测试 - -根据助手定位设计测试用例: - -``` -场景:产品咨询助手 - -测试用例 1:常见问题 -- 输入:"产品有哪些功能?" -- 预期:准确列出主要功能 - -测试用例 2:价格询问 -- 输入:"多少钱?" -- 预期:提供价格信息或引导方式 - -测试用例 3:超出范围 -- 输入:"帮我写一首诗" -- 预期:礼貌拒绝并引导回业务话题 -``` - -### 边界测试 - -| 测试项 | 输入 | 预期结果 | -|--------|------|---------| -| 空输入 | "" | 提示用户输入内容 | -| 超长输入 | 1000+ 字符 | 正常处理或提示过长 | -| 特殊字符 | "" | 安全处理,不执行 | -| 敏感内容 | 不当言论 | 拒绝回复并提示 | - -## 日志分析 - -### 查看日志 - -在测试面板的 **日志** 标签页,可以看到: - -- ASR 识别结果 -- LLM 推理过程 -- TTS 合成状态 -- 工具调用记录 - -### 常见日志 - -``` -[ASR] transcript.final: "你好,请问有什么可以帮你" -[LLM] request: messages=[...] -[LLM] response: "您好!我是..." -[TTS] synthesizing: "您好!我是..." -[TTS] audio.start -[TTS] audio.end -``` - -## 事件追踪 - -在 **事件** 标签页查看完整的 WebSocket 事件流: - -```json -{"type": "session.started", "timestamp": 1704067200000} -{"type": "input.speech_started", "timestamp": 1704067201000} -{"type": "transcript.delta", "data": {"text": "你"}} -{"type": "transcript.delta", "data": {"text": "好"}} -{"type": "transcript.final", "data": {"text": "你好"}} -{"type": "assistant.response.delta", "data": {"text": "您"}} -{"type": "assistant.response.final", "data": {"text": "您好!..."}} -{"type": "output.audio.start"} -{"type": "output.audio.end"} -``` - -## 性能指标 - -关注以下性能指标: - -| 指标 | 说明 | 建议值 | -|------|------|--------| -| TTFB | 首字节时间 | < 500ms | -| 识别延迟 | ASR 处理时间 | < 1s | -| 回复延迟 | LLM 推理时间 | < 2s | -| 合成延迟 | TTS 处理时间 | < 500ms | - -## 常见问题排查 - -### 助手不响应 - -1. **检查连接状态** - - 确认 WebSocket 连接成功 - - 查看是否收到 `session.started` 事件 - -2. **检查模型配置** - - 确认 LLM 模型 API Key 有效 - - 测试模型连接是否正常 - -3. **查看错误日志** - - 打开浏览器开发者工具 - - 检查 Console 和 Network 标签 - -### 回复质量差 - -1. **优化提示词** - - 增加更明确的指令 - - 添加示例和约束 - -2. **调整温度参数** - - 降低 temperature 提高一致性 - - 适当值通常在 0.3-0.7 - -3. **补充知识库** - - 上传相关文档 - - 提高检索相关性 - -### 语音问题 - -1. **ASR 识别不准** - - 检查麦克风权限 - - 尝试更换 ASR 引擎 - - 添加热词提高识别率 - -2. **TTS 不播放** - - 检查浏览器自动播放限制 - - 确认 TTS 配置正确 - -## 自动化测试 - -使用自动化测试功能进行批量测试: - -1. 进入 **自动化测试** 页面 -2. 创建测试任务 -3. 配置测试用例 -4. 
运行测试并查看报告 - -详见 [自动化测试](../analysis/autotest.md)。 - -## 下一步 - -- [自动化测试](../analysis/autotest.md) - 批量测试 -- [历史记录](../analysis/history.md) - 查看对话记录 -- [效果评估](../analysis/evaluation.md) - 评估对话质量 +如果你还没创建助手,请先完成 [快速开始](../quickstart/index.md)。 diff --git a/docs/content/assistants/workflow-configuration.md b/docs/content/assistants/workflow-configuration.md index f2f4861..facf111 100644 --- a/docs/content/assistants/workflow-configuration.md +++ b/docs/content/assistants/workflow-configuration.md @@ -1,68 +1,7 @@ -# 工作流配置选项(TODO 版本) +# 工作流配置(旧入口) -本文档是工作流配置页的第一版草稿,后续会根据实际能力继续细化。 +本页保留旧链接,用于承接早期草稿和历史引用。工作流的正式文档已收敛到: -## 配置目标 - -- 将多步骤对话拆分为可编排节点 -- 为不同分支定义独立提示词和工具权限 -- 在会话中按条件切换节点并透传上下文 - -## 基础配置项(建议) - -| 配置项 | 说明 | 建议值 | -|---|---|---| -| 工作流名称 | 用于区分业务流程 | 简洁、业务语义明确 | -| 入口节点 | 用户进入后的首个节点 | 固定单入口 | -| 全局提示词 | 对所有节点生效的共性约束 | 保持简短,避免与节点提示词冲突 | -| 节点提示词 | 当前节点的任务说明 | 单一职责,明确输入/输出 | -| 节点工具白名单 | 当前节点可调用工具集合 | 最小权限原则 | -| 节点超时 | 节点等待超时处理 | 3-10 秒 | -| 失败回退节点 | 异常时兜底节点 | 建议统一到人工或澄清节点 | - -## 节点建议类型 - -- 意图识别节点:判断用户诉求并路由 -- 信息收集节点:收集订单号、手机号等关键信息 -- 处理节点:执行查询、计算、调用工具 -- 回复节点:组织最终答复 -- 结束节点:输出结束语并关闭会话 - -## 配置示例 - -```yaml -workflow: - name: "订单咨询流程" - entry: "intent_router" - global_prompt: "优先给出可执行步骤,必要时先澄清信息。" - nodes: - - id: "intent_router" - type: "router" - prompt: "识别用户意图:查订单、退款、投诉" - next: - - when: "intent == query_order" - to: "collect_order_id" - - when: "intent == refund" - to: "refund_policy" - - id: "collect_order_id" - type: "collect" - prompt: "请用户提供订单号" - tools: ["query_order"] - fallback: "human_handoff" - - id: "human_handoff" - type: "end" - prompt: "转人工处理" -``` - -## 已知限制(当前) - -- 不支持在文档中完整定义所有表达式语法 -- 不同执行引擎的节点字段可能存在差异 -- 可视化编排与 YAML 字段暂未完全一一对应 - -## 后续计划 - -- 补充节点字段的完整 Schema -- 补充路由条件表达式规范 -- 增加“调试与回放”章节 +- [工作流](../customization/workflows.md) - 了解工作流的定位、节点结构、设计建议和当前边界 +如果你正在配置助手中的流程能力,请优先阅读上述页面,再结合 [工具](../customization/tools.md) 与 [助手概念](../concepts/assistants.md) 一起使用。 diff --git a/docs/content/changelog.md b/docs/content/changelog.md index 
b99bb81..149d557 100644 --- a/docs/content/changelog.md +++ b/docs/content/changelog.md @@ -1,4 +1,4 @@ -# 更新日志 +# 更新日志 本文档记录 Realtime Agent Studio 的所有重要变更。 @@ -29,7 +29,7 @@ - **OpenAI 兼容接口** - 支持 OpenAI Compatible 的 ASR/TTS 服务 - **DashScope TTS** - 阿里云语音合成服务适配 -#### 智能体配置 +#### 助手配置 - **系统提示词** - 支持角色定义和动态变量 `{{variable}}` - **模型管理** - LLM/ASR/TTS 模型统一管理界面 diff --git a/docs/content/concepts/assistants.md b/docs/content/concepts/assistants.md index f22c4c8..8e17ab5 100644 --- a/docs/content/concepts/assistants.md +++ b/docs/content/concepts/assistants.md @@ -1,253 +1,147 @@ # 助手概念详解 -深入了解助手(Assistant)的设计理念和配置细节。 +助手(Assistant)是 Realtime Agent Studio(RAS)中最核心的配置单元,也是控制台和 API 对外暴露能力的基本对象。 --- -## 什么是助手? +## 什么是助手 -**助手**是 RAS 中的核心实体,代表一个具有特定角色、能力和行为的 AI 对话智能体。每个助手都是独立配置的,可以服务于不同的业务场景。 +一个助手代表一个可接入、可测试、可发布的实时 AI 入口。它回答三个问题: -### 助手的组成 +- **它是谁**:角色、语气、目标、限制、开场方式、静默时的行为(例如静默询问 Ask-on-Idle) +- **它能做什么**:语言模型能力、语音模型能力(ASR、TTS、用户打断灵敏度(Barge-in)、语句端点设置(End-of-Utterance))、知识库、记忆、工具(Webhook、客户端工具、系统工具、MCP)、输出模式 +- **它在一次会话中如何运行**:通过 `assistant_id` 载入配置,并在运行时接收动态变量和对话过程中的上下文更新 -```mermaid -flowchart TB - subgraph Assistant["助手"] - Identity[身份定义] - Models[模型配置] - Capabilities[能力扩展] - Behavior[行为控制] - end +如果把引擎理解为“运行时”,那么助手就是“运行时要执行的那份定义”。 - subgraph Identity - Name[名称] - Prompt[系统提示词] - Language[语言] - end +## 助手由哪些部分组成 - subgraph Models - LLM[LLM 模型] - ASR[ASR 模型] - TTS[TTS 声音] - end +| 层次 | 负责什么 | 典型内容 | +|------|----------|----------| +| **身份层** | 定义助手角色和交互风格 | 系统提示词、限制、开场白、静默处理 | +| **模型层** | 决定理解与生成能力 | LLM、ASR、TTS、引擎类型、用户打断、语句端点 | +| **能力层** | 扩展知识和执行能力 | 知识库、工具、记忆 | +| **会话层** | 决定运行时上下文如何注入 | `assistant_id`、动态变量 | - subgraph Capabilities - Tools[工具调用] - KB[知识库] - end +## 身份层 - subgraph Behavior - Greeting[开场白] - Interruption[打断设置] - Output[输出模式] - end -``` - ---- - -## 身份定义 +助手首先是一个“被约束的角色”,而不是一段孤立的模型调用。 ### 系统提示词 -系统提示词是助手最重要的配置,它定义了: +系统提示词定义助手的角色、任务、边界和风格,是所有能力组合的基础。 -| 要素 | 说明 | 示例 | +| 要素 | 作用 | 示例 | |------|------|------| -| **角色** | 助手扮演什么身份 | 
"你是一名专业的医疗咨询顾问" | -| **能力** | 助手能做什么 | "你可以回答健康问题,但不能开具处方" | -| **限制** | 助手不能做什么 | "不要讨论政治话题" | -| **风格** | 回复的语气和格式 | "保持友好专业,回答简洁" | +| **角色** | 告诉模型“自己是谁” | 客服助手、销售顾问、培训教练 | +| **任务** | 指定要完成的结果 | 解答咨询、收集信息、调用工具处理业务 | +| **限制** | 明确哪些事不能做 | 不承诺超权限优惠、不输出未经验证的结论 | +| **风格** | 约束回答节奏和措辞 | 简洁、口语化、每次 2-3 句 | -### 提示词模板 +### 开场白 -```markdown -## 角色 -你是{{company}}的智能客服助手"小智"。 +一个助手还要定义会话应该如何开始,以及用户静默时如何处理,包括: -## 任务 -- 回答用户关于产品和服务的问题 -- 协助处理订单查询和售后问题 -- 收集用户反馈 +- **首轮模式**:助手先说、用户先说或者机器先说 +- **开场白**:使用固定开场白或者 AI 生成开场白 -## 限制 -- 不讨论竞争对手产品 -- 不承诺超出权限的优惠 -- 遇到复杂问题引导用户联系人工客服 +### 静默处理 -## 风格 -- 语气友好亲切 -- 回答简洁明了,每次 2-3 句话 -- 适当使用语气词使对话更自然 -``` +用户静默时,是否询问用户是否仍然在线。 ---- +## 模型层 -## 模型配置 +模型决定助手的基础理解、推理和表达能力,但不是助手定义的全部。 -### LLM 模型 +- **LLM** 决定对话推理与文本生成能力 +- **ASR** 决定语音输入如何被实时转写 +- **TTS** 决定文本回复如何转成可播放语音 +- **引擎类型** 决定运行链路是分段可控还是端到端低延迟 +- **VAD** 声音活动模型,判断用户是否在说话 +- **EOU** 语句端点模型,判断用户是否已说完一段语句并在等待回复 +- **Barge In** 由于用户声音活动或者手动请求,是否打断助手当前的回复 -大语言模型是助手的"大脑",负责理解用户意图和生成回复。 +## 能力层 -| 参数 | 说明 | 建议值 | -|------|------|--------| -| **温度** | 回复随机性,越高越发散 | 0.7 (对话) / 0.3 (问答) | -| **最大 Token** | 单次回复长度上限 | 256-512 | -| **上下文长度** | 记忆的对话轮数 | 10-20 轮 | +### 知识库 -### ASR 模型 - -语音识别模型将用户语音转为文字。 - -| 配置 | 说明 | -|------|------| -| **语言** | 识别语言,如中文、英文 | -| **热词** | 提高特定词汇识别率 | -| **标点** | 是否自动添加标点 | - -### TTS 声音 - -语音合成将助手回复转为语音输出。 - -| 配置 | 说明 | -|------|------| -| **音色** | 选择声音角色 | -| **语速** | 说话速度,0.5-2.0 | -| **音调** | 声音高低 | - ---- - -## 能力扩展 - -### 工具调用 - -通过工具让助手能够执行外部操作: +知识库用于补充私有领域知识,让助手回答超出基础模型常识的问题。 ```mermaid flowchart LR - User[用户] -->|"查询订单"| Assistant[助手] - Assistant -->|调用工具| API[订单 API] - API -->|返回数据| Assistant - Assistant -->|回复| User -``` - -**工具定义示例:** - -```json -{ - "name": "get_order_status", - "description": "查询用户订单状态", - "parameters": { - "type": "object", - "properties": { - "order_id": { - "type": "string", - "description": "订单编号" - } - }, - "required": ["order_id"] - } -} -``` - -### 知识库关联 - -让助手基于私有文档回答问题: - -```mermaid -flowchart LR - Question[用户问题] --> 
Search[知识检索] - Search --> KB[(知识库)] - KB --> Context[相关内容] + Question[用户问题] --> Retrieval[检索] + Retrieval --> KB[(知识库)] + KB --> Context[相关片段] Context --> LLM[LLM] LLM --> Answer[回答] ``` ---- +知识库适合承载政策、产品资料、流程说明、FAQ 和内部文档,而不是把所有业务知识堆进系统提示词。 -## 行为控制 +### 工具 -### 开场白设置 - -| 模式 | 说明 | -|------|------| -| **助手先说** | 连接后助手主动问候 | -| **用户先说** | 等待用户开口 | -| **静默** | 不自动开场 | - -### 打断设置 - -| 选项 | 说明 | -|------|------| -| **允许打断** | 用户可随时插话 | -| **禁止打断** | 助手说完才能输入 | -| **灵敏度** | 打断触发的敏感程度 | - -### 输出模式 - -| 模式 | 说明 | -|------|------| -| **语音** | TTS 语音输出 | -| **文本** | 纯文本输出 | -| **混合** | 同时输出语音和文本 | - ---- - -## 助手版本管理 - -### 草稿与发布 +工具让助手从“会说”变成“能做事”。 ```mermaid -gitGraph - commit id: "创建助手" - commit id: "配置提示词" - commit id: "添加工具" - branch published - checkout published - commit id: "发布 v1" - checkout main - commit id: "修改提示词" - commit id: "调整参数" - checkout published - merge main id: "发布 v2" +flowchart LR + User[用户] --> Assistant[助手] + Assistant --> Tool[工具 / 外部系统] + Tool --> Assistant + Assistant --> User ``` -- **草稿**: 可随时修改,仅供测试 -- **发布**: 正式上线,用于生产环境 +适合用工具处理的任务包括:订单查询、预约、外部搜索、写入业务系统、调用客户端能力等。 -### 配置导入导出 +## 会话层 -支持以 JSON 格式导入导出助手配置,便于: +### `assistant_id` 的作用 -- 备份和恢复 -- 跨环境迁移 -- 团队共享模板 +在接入层面,客户端通过 `assistant_id` 指定要加载哪一个助手。引擎据此读取默认配置,并把同一份助手定义应用到当前会话。 ---- +### 会话生命周期 -## 最佳实践 +```mermaid +stateDiagram-v2 + [*] --> Connecting: WebSocket 连接 + Connecting --> Started: session.started + Started --> Active: config.resolved / 开始对话 + Active --> Active: 多轮交互 + Active --> Stopped: session.stop 或连接关闭 + Stopped --> [*] +``` -### 1. 提示词工程 +一次会话通常会沉淀以下信息: -- **明确角色**: 清晰定义助手身份 -- **设定边界**: 明确能做什么、不能做什么 -- **控制长度**: 语音场景下回复要简短 +- 用户与助手消息时间线 +- 音频流、转写结果和模型输出 +- 工具调用记录与中间事件 +- 自定义 metadata、渠道和业务上下文 -### 2. 模型选择 -- **平衡成本与效果**: 不一定需要最强模型 -- **测试不同供应商**: 找到最适合场景的组合 -- **考虑延迟**: 语音交互对延迟敏感 +### 动态变量与会话级覆盖 -### 3. 
工具设计 +助手的默认配置不需要为每个用户都重新复制一份。RAS 提供两种常见的运行时注入方式: -- **单一职责**: 每个工具做一件事 -- **清晰描述**: 让 LLM 正确理解何时调用 -- **错误处理**: 工具失败时优雅降级 +- **动态变量**:在提示词中使用 `{{variable}}` 占位,并在会话开始时传入具体值 +- **会话级覆盖**:仅对当前会话覆盖部分运行时参数,不回写助手基线配置 ---- +```json +{ + "type": "session.start", + "metadata": { + "dynamicVariables": { + "company_name": "ABC 公司", + "customer_name": "张三", + "tier": "VIP" + } + } +} +``` + +这种设计让你既能复用标准助手,又能在每次接入时注入渠道、用户、订单或上下文信息。 ## 相关文档 -- [助手配置](../assistants/configuration.md) - 配置界面详解 -- [提示词指南](../assistants/prompts.md) - 编写高质量提示词 -- [工具集成](../customization/tools.md) - 工具配置详情 +- [配置选项](assistants/configuration.md) - 查看助手在控制台和运行时有哪些配置层 +- [提示词指南](assistants/prompts.md) - 设计角色、任务、限制和语气 +- [测试调试](assistants/testing.md) - 验证助手质量并定位问题 diff --git a/docs/content/concepts/assistants/configuration.md b/docs/content/concepts/assistants/configuration.md new file mode 100644 index 0000000..962c63f --- /dev/null +++ b/docs/content/concepts/assistants/configuration.md @@ -0,0 +1,218 @@ +# 配置选项 + +助手配置界面包含多个标签页,每个标签页负责不同方面的配置。 + +## 全局设置 + +全局设置定义助手的核心对话能力。 + +| 配置项 | 说明 | 建议值 | +|-------|------|--------| +| 助手名称 | 用于标识和管理 | 简洁明确 | +| 系统提示词 | 定义角色、任务和约束 | 详见[提示词指南](prompts.md) | +| 开场白 | 对话开始时的问候语 | 简短友好 | +| 温度参数 | 控制回复随机性 | 0.7(通用)/ 0.3(严谨) | +| 上下文长度 | 保留的历史消息数 | 10-20 | + +### 高级选项 + +- **首轮模式** - 设置首次对话的触发方式 +- **打断检测** - 用户打断时的处理策略 +- **超时设置** - 无响应时的处理 + +## 语音配置 + +配置语音识别和语音合成参数。 + +### TTS 语音合成 + +| 配置 | 说明 | +|------|------| +| TTS 引擎 | 选择语音合成服务(阿里/火山/Minimax) | +| 音色 | 选择语音风格和性别 | +| 语速 | 语音播放速度(0.5-2.0) | +| 音量 | 语音输出音量(0-100) | +| 音调 | 语音音调高低(0.5-2.0) | + +### ASR 语音识别 + +| 配置 | 说明 | +|------|------| +| ASR 引擎 | 选择语音识别服务 | +| 语言 | 识别语言(中文/英文/多语言) | +| 热词 | 提高特定词汇识别准确率 | + +## 工具绑定 + +配置助手可调用的外部工具。 + +### 可用工具类型 + +| 工具 | 说明 | +|------|------| +| 搜索工具 | 网络搜索获取信息 | +| 天气查询 | 查询天气预报 | +| 计算器 | 数学计算 | +| 知识库检索 | RAG 知识检索 | +| 自定义工具 | HTTP 回调外部 API | + +### 配置步骤 + +1. 在工具列表中勾选需要的工具 +2. 配置工具参数(如有) +3. 
测试工具调用是否正常 + +## 知识关联 + +关联 RAG 知识库,让助手能够回答专业领域问题。 + +### 配置参数 + +| 参数 | 说明 | 建议值 | +|------|------|--------| +| 知识库 | 选择要关联的知识库 | - | +| 相似度阈值 | 低于此分数不返回 | 0.7 | +| 返回数量 | 单次检索返回条数 | 3 | +| 检索策略 | 混合/向量/关键词 | 混合 | + +### 多知识库 + +支持关联多个知识库,系统会自动合并检索结果。 + +## 外部链接 + +配置第三方服务集成和 Webhook 回调。 + +### Webhook 配置 + +| 字段 | 说明 | +|------|------| +| 回调 URL | 接收事件的 HTTP 端点 | +| 事件类型 | 订阅的事件(对话开始/结束/工具调用等) | +| 认证方式 | API Key / Bearer Token / 无 | + +### 支持的事件 + +- `conversation.started` - 对话开始 +- `conversation.ended` - 对话结束 +- `tool.called` - 工具被调用 +- `human.transfer` - 转人工 + +## 配置持久化与运行时覆盖 + +助手配置分为两层: + +1. **数据库持久化配置(基线配置)**:通过助手管理 API 保存,后续会话默认读取这一层。 +2. **会话级覆盖配置(runtime overrides)**:仅对当前 WebSocket 会话生效,不会写回数据库。 + +### 哪些配置会存到数据库 + +以下字段会持久化在 `assistants` / `assistant_opener_audio` 等表中(通过创建/更新助手写入): + +| 类别 | 典型字段 | +|------|---------| +| 对话行为 | `name`、`prompt`、`opener`、`firstTurnMode`、`generatedOpenerEnabled` | +| 输出与打断 | `voiceOutputEnabled`、`voice`、`speed`、`botCannotBeInterrupted`、`interruptionSensitivity` | +| 工具与知识库 | `tools`、`knowledgeBaseId` | +| 模型与外部模式 | `configMode`、`apiUrl`、`apiKey`、`llmModelId`、`asrModelId`、`embeddingModelId`、`rerankModelId` | +| 开场音频 | `openerAudioEnabled` 及音频文件状态(`ready`、`durationMs` 等) | + +> 引擎在连接时通过 `assistant_id` 从后端读取该助手的 `sessionStartMetadata` 作为默认运行配置。 + +### 哪些配置可以在会话中覆盖 + +客户端可在 `session.start.metadata.overrides` 中覆盖以下白名单字段(仅当前会话有效): + +- `systemPrompt` +- `greeting` +- `firstTurnMode` +- `generatedOpenerEnabled` +- `output` +- `bargeIn` +- `knowledgeBaseId` +- `knowledge` +- `tools` +- `openerAudio` + +以下字段不能由客户端覆盖: + +- `services`(模型 provider / apiKey / baseUrl 等) +- `assistantId` / `appId` / `configVersionId`(及下划线变体) +- 包含密钥语义的字段(如 `apiKey`、`token`、`secret`、`password`、`authorization`) + +### 覆盖示例(代码) + +下面示例展示「数据库基线配置 + 会话 overrides」的最终效果。 + +```json +// 1) 数据库存储的基线配置(示意) +// GET /api/v1/assistants/asst_demo/config -> sessionStartMetadata +{ + "systemPrompt": "你是电商客服助手,回答要简洁。", + "greeting": "你好,我是你的客服助手。", + "firstTurnMode": 
"bot_first", + "output": { "mode": "audio" }, + "knowledgeBaseId": "kb_orders", + "tools": [ + { "type": "function", "function": { "name": "query_order" } } + ] +} +``` + +```json +// 2) 客户端发起会话时的覆盖 +{ + "type": "session.start", + "metadata": { + "channel": "web", + "history": { "userId": 1001 }, + "overrides": { + "greeting": "你好,我来帮你查订单进度。", + "output": { "mode": "text" }, + "knowledgeBaseId": "kb_vip_orders", + "tools": [ + { "type": "function", "function": { "name": "query_vip_order" } } + ] + } + } +} +``` + +```json +// 3) 引擎合并后的有效配置(示意) +{ + "assistantId": "asst_demo", + "systemPrompt": "你是电商客服助手,回答要简洁。", + "greeting": "你好,我来帮你查订单进度。", + "firstTurnMode": "bot_first", + "output": { "mode": "text" }, + "knowledgeBaseId": "kb_vip_orders", + "tools": [ + { "type": "function", "function": { "name": "query_vip_order" } } + ], + "channel": "web", + "history": { "userId": 1001 } +} +``` + +合并规则可简化为: + +```python +effective = {**db_session_start_metadata, **metadata.overrides} +``` + +当 `WS_EMIT_CONFIG_RESOLVED=true` 时,服务端会返回 `config.resolved`(公开、安全裁剪后的快照)用于前端调试当前生效配置。 + +## 配置导入导出 + +### 导出配置 + +1. 在助手详情页点击 **更多** +2. 选择 **导出配置** +3. 下载 JSON 格式的配置文件 + +### 导入配置 + +1. 点击 **新建助手** +2. 选择 **从配置导入** +3. 上传配置文件 diff --git a/docs/content/concepts/assistants/prompts.md b/docs/content/concepts/assistants/prompts.md new file mode 100644 index 0000000..c6ea015 --- /dev/null +++ b/docs/content/concepts/assistants/prompts.md @@ -0,0 +1,184 @@ +# 提示词指南 + +系统提示词(System Prompt)是定义助手行为的核心配置。本指南介绍如何编写高质量的提示词。 + +## 提示词结构 + +一个完整的系统提示词通常包含以下部分: + +``` +[角色定义] +[任务描述] +[行为约束] +[输出格式] +[示例(可选)] +``` + +## 编写原则 + +### 1. 明确角色 + +告诉助手它是谁: + +``` +你是一个专业的技术支持工程师,专门负责解答产品使用问题。 +``` + +### 2. 定义任务 + +明确助手需要完成什么: + +``` +你的主要任务是: +1. 解答用户关于产品功能的问题 +2. 提供使用指导和最佳实践 +3. 帮助用户排查常见故障 +``` + +### 3. 设置约束 + +限制不希望出现的行为: + +``` +请注意: +- 不要讨论与产品无关的话题 +- 不要编造不存在的功能 +- 如果不确定答案,请建议用户联系人工客服 +``` + +### 4. 
指定风格 + +定义回复的语气和风格: + +``` +回复风格要求: +- 使用友好、专业的语气 +- 回答简洁明了,避免冗长 +- 适当使用列表和步骤说明 +``` + +## 提示词模板 + +### 客服助手 + +``` +你是 [公司名称] 的智能客服助手。 + +## 你的职责 +- 解答用户关于产品和服务的问题 +- 处理常见的投诉和建议 +- 引导用户完成操作流程 + +## 回复要求 +- 保持友好和耐心 +- 回答简洁,一般不超过 3 句话 +- 如果问题复杂,建议转接人工客服 + +## 禁止行为 +- 不要讨论竞争对手 +- 不要承诺无法兑现的事项 +- 不要透露内部信息 +``` + +### 技术支持 + +``` +你是一个技术支持工程师,专门帮助用户解决技术问题。 + +## 工作流程 +1. 首先了解用户遇到的具体问题 +2. 询问必要的环境信息(系统版本、错误信息等) +3. 提供分步骤的解决方案 +4. 确认问题是否解决 + +## 回复格式 +- 使用编号列表说明操作步骤 +- 提供代码示例时使用代码块 +- 复杂问题可以分多次回复 +``` + +### 销售顾问 + +``` +你是一个产品销售顾问,帮助用户了解产品并做出购买决策。 + +## 沟通策略 +- 先了解用户需求,再推荐合适的产品 +- 突出产品优势,但不贬低竞品 +- 提供真实的价格和优惠信息 + +## 目标 +- 帮助用户找到最适合的方案 +- 解答购买相关的疑问 +- 促进成交但不过度推销 +``` + +## 动态变量 + +提示词支持动态变量,使用 `{{变量名}}` 语法: + +``` +你好 {{customer_name}},欢迎来到 {{company_name}}。 +你当前的会员等级是 {{membership_tier}}。 +``` + +在 `session.start` 时通过 `dynamicVariables` 传入: + +```json +{ + "type": "session.start", + "metadata": { + "dynamicVariables": { + "customer_name": "张三", + "company_name": "AI 公司", + "membership_tier": "黄金会员" + } + } +} +``` + +## 常见问题 + +### 回复太长 + +在提示词中明确限制: + +``` +回复长度要求: +- 一般问题:1-2 句话 +- 复杂问题:不超过 5 句话 +- 避免重复和冗余内容 +``` + +### 答非所问 + +增加任务边界说明: + +``` +重要提示: +- 只回答与 [产品/服务] 相关的问题 +- 对于无关问题,礼貌地拒绝并引导回正题 +``` + +### 编造信息 + +强调诚实原则: + +``` +信息准确性要求: +- 只提供你确定的信息 +- 不确定时说"我不太确定,建议您..." +- 绝对不要编造数据或功能 +``` + +## 最佳实践 + +1. **迭代优化** - 根据实际对话效果持续调整 +2. **测试覆盖** - 用各种场景测试提示词效果 +3. **版本管理** - 保存历史版本,便于回退 +4. 
**定期复盘** - 分析对话记录,发现改进点 + +## 下一步 + +- [测试调试](testing.md) - 验证提示词效果 +- [知识库配置](../../customization/knowledge-base.md) - 补充专业知识 diff --git a/docs/content/concepts/assistants/testing.md b/docs/content/concepts/assistants/testing.md new file mode 100644 index 0000000..21839ac --- /dev/null +++ b/docs/content/concepts/assistants/testing.md @@ -0,0 +1,162 @@ +# 测试调试 + +本指南介绍如何测试和调试 AI 助手,确保其行为符合预期。 + +## 测试面板 + +在助手详情页,点击 **测试** 按钮打开测试面板。 + +### 功能介绍 + +| 功能 | 说明 | +|------|------| +| 文本对话 | 直接输入文字进行测试 | +| 语音测试 | 使用麦克风进行语音对话 | +| 查看日志 | 实时查看系统日志 | +| 事件追踪 | 查看 WebSocket 事件流 | + +## 测试用例设计 + +### 基础功能测试 + +| 测试项 | 输入 | 预期结果 | +|--------|------|---------| +| 问候响应 | "你好" | 友好的问候回复 | +| 功能介绍 | "你能做什么?" | 准确描述能力范围 | +| 开场白 | 连接后自动 | 播放配置的开场白 | + +### 业务场景测试 + +根据助手定位设计测试用例: + +``` +场景:产品咨询助手 + +测试用例 1:常见问题 +- 输入:"产品有哪些功能?" +- 预期:准确列出主要功能 + +测试用例 2:价格询问 +- 输入:"多少钱?" +- 预期:提供价格信息或引导方式 + +测试用例 3:超出范围 +- 输入:"帮我写一首诗" +- 预期:礼貌拒绝并引导回业务话题 +``` + +### 边界测试 + +| 测试项 | 输入 | 预期结果 | +|--------|------|---------| +| 空输入 | "" | 提示用户输入内容 | +| 超长输入 | 1000+ 字符 | 正常处理或提示过长 | +| 特殊字符 | "" | 安全处理,不执行 | +| 敏感内容 | 不当言论 | 拒绝回复并提示 | + +## 日志分析 + +### 查看日志 + +在测试面板的 **日志** 标签页,可以看到: + +- ASR 识别结果 +- LLM 推理过程 +- TTS 合成状态 +- 工具调用记录 + +### 常见日志 + +``` +[ASR] transcript.final: "你好,请问有什么可以帮你" +[LLM] request: messages=[...] +[LLM] response: "您好!我是..." +[TTS] synthesizing: "您好!我是..." 
+[TTS] audio.start +[TTS] audio.end +``` + +## 事件追踪 + +在 **事件** 标签页查看完整的 WebSocket 事件流: + +```json +{"type": "session.started", "timestamp": 1704067200000} +{"type": "input.speech_started", "timestamp": 1704067201000} +{"type": "transcript.delta", "data": {"text": "你"}} +{"type": "transcript.delta", "data": {"text": "好"}} +{"type": "transcript.final", "data": {"text": "你好"}} +{"type": "assistant.response.delta", "data": {"text": "您"}} +{"type": "assistant.response.final", "data": {"text": "您好!..."}} +{"type": "output.audio.start"} +{"type": "output.audio.end"} +``` + +## 性能指标 + +关注以下性能指标: + +| 指标 | 说明 | 建议值 | +|------|------|--------| +| TTFB | 首字节时间 | < 500ms | +| 识别延迟 | ASR 处理时间 | < 1s | +| 回复延迟 | LLM 推理时间 | < 2s | +| 合成延迟 | TTS 处理时间 | < 500ms | + +## 常见问题排查 + +### 助手不响应 + +1. **检查连接状态** + - 确认 WebSocket 连接成功 + - 查看是否收到 `session.started` 事件 + +2. **检查模型配置** + - 确认 LLM 模型 API Key 有效 + - 测试模型连接是否正常 + +3. **查看错误日志** + - 打开浏览器开发者工具 + - 检查 Console 和 Network 标签 + +### 回复质量差 + +1. **优化提示词** + - 增加更明确的指令 + - 添加示例和约束 + +2. **调整温度参数** + - 降低 temperature 提高一致性 + - 适当值通常在 0.3-0.7 + +3. **补充知识库** + - 上传相关文档 + - 提高检索相关性 + +### 语音问题 + +1. **ASR 识别不准** + - 检查麦克风权限 + - 尝试更换 ASR 引擎 + - 添加热词提高识别率 + +2. **TTS 不播放** + - 检查浏览器自动播放限制 + - 确认 TTS 配置正确 + +## 自动化测试 + +使用自动化测试功能进行批量测试: + +1. 进入 **自动化测试** 页面 +2. 创建测试任务 +3. 配置测试用例 +4. 
运行测试并查看报告 + +详见 [自动化测试](../../analysis/autotest.md)。 + +## 下一步 + +- [自动化测试](../../analysis/autotest.md) - 批量测试 +- [历史记录](../../analysis/history.md) - 查看对话记录 +- [效果评估](../../analysis/evaluation.md) - 评估对话质量 diff --git a/docs/content/concepts/engines.md b/docs/content/concepts/engines.md index 16d06af..400d0cd 100644 --- a/docs/content/concepts/engines.md +++ b/docs/content/concepts/engines.md @@ -1,349 +1,107 @@ -# 引擎架构详解 +# 引擎架构 -深入了解 RAS 的两种引擎架构:管线式引擎和多模态引擎。 +RAS 提供两类实时运行时:**Pipeline 引擎** 和 **Realtime 引擎**。本页只回答一个问题:你的助手应该跑在哪种引擎上。 --- -## 引擎概述 +## 先记住这条判断标准 -引擎是 RAS 的核心,负责处理实时语音交互。根据不同需求,可以选择两种架构: +- 如果你优先考虑 **可控性、可替换性、成本管理、工具 / 知识 / 流程编排**,优先选 **Pipeline 引擎** +- 如果你优先考虑 **超低延迟、更自然的端到端语音体验**,优先选 **Realtime 引擎** -| 架构 | 特点 | 适用场景 | -|------|------|---------| -| **管线式** | 灵活、可定制、成本可控 | 大多数场景 | -| **多模态** | 低延迟、自然、简单 | 高端体验场景 | +## 两类引擎的区别 ---- +| 维度 | Pipeline 引擎 | Realtime 引擎 | +|------|---------------|---------------| +| **交互路径** | VAD → ASR → TD → LLM → TTS | 端到端实时模型 | +| **可控性** | 高,每个环节可替换 | 中,更多依赖模型供应商 | +| **延迟** | 中等,通常由多环节累加 | 低,链路更短 | +| **能力编排** | 更适合接入工具、知识库、工作流 | 也可接工具,但流程可控性较弱 | +| **成本结构** | 可按环节优化 | 往往更依赖单一供应商定价 | +| **适合场景** | 企业客服、流程型助手、电话场景、知识问答 | 高拟真语音助手、多模态入口、高自然度体验 | -## 管线式引擎 (Pipeline) +## Pipeline 引擎是什么 -### 架构设计 - -管线式引擎包含 **声音活动检测(VAD)**、**语音识别(ASR)**、**回合检测(TD)**、**大语言模型(LLM)**、**语音合成(TTS)**,各环节可对接**外部服务**(OpenAI、SiliconFlow、DashScope、本地模型)。LLM 可连接**工具**(Webhook、客户端工具、内建工具)。 +Pipeline 引擎把实时语音拆成多个明确环节: ```mermaid flowchart LR - subgraph Input["输入处理"] - Audio[用户音频] --> VAD[声音活动检测 VAD] - VAD --> ASR[语音识别 ASR] - ASR --> Text[转写文本] - Text --> TD[回合检测 TD] - end - - subgraph Process["语义处理"] - TD --> LLM[大语言模型 LLM] - LLM --> Response[回复文本] - LLM --> Tools[工具] - end - - subgraph Output["输出生成"] - Response --> TTS[语音合成 TTS] - TTS --> OutputAudio[助手音频] - end + VAD[VAD] --> ASR[ASR] + ASR --> TD[回合检测] + TD --> LLM[LLM] + LLM --> TTS[TTS] ``` -### 数据流详解 +这样做的好处是: -```mermaid -sequenceDiagram - participant U as 用户 - participant E as 引擎 - participant ASR 
as ASR 服务 - participant LLM as LLM 服务 - participant TTS as TTS 服务 +- 你可以分别选择 ASR、LLM、TTS 的供应商 +- 你可以单独优化某一个环节,而不是整体替换 +- 工具、知识库和工作流更容易插入到链路中 - U->>E: 音频帧 (PCM 16kHz) - - Note over E: VAD 检测语音活动 - E->>E: 累积音频缓冲 - - Note over E: 回合检测 (TD) 确定可送 LLM 的输入 - E->>ASR: 发送音频 - ASR-->>E: 转写文本 (流式) - E-->>U: transcript.delta - E-->>U: transcript.final - - E->>LLM: 发送对话历史 + 用户输入 - LLM-->>E: 回复文本 (流式) - E-->>U: assistant.response.delta - - loop 流式合成 - E->>TTS: 文本片段 - TTS-->>E: 音频片段 - E-->>U: 音频帧 - end - - E-->>U: assistant.response.final -``` +代价是: -### 延迟分析 +- 延迟会累加 +- 系统集成更复杂 +- 你需要同时管理多类外部依赖 -管线式引擎的延迟由各环节累加: +## Realtime 引擎是什么 -| 环节 | 典型延迟 | 优化方向 | -|------|---------|---------| -| VAD/EOU | 200-500ms | 调整灵敏度 | -| ASR | 100-300ms | 选择快速模型 | -| LLM TTFT | 200-500ms | 选择低延迟模型 | -| TTS | 100-200ms | 流式合成 | -| **总计** | **600-1500ms** | - | - -### 流式优化 - -为降低感知延迟,采用流式处理: - -```mermaid -gantt - title 非流式 vs 流式处理 - dateFormat X - axisFormat %s - - section 非流式 - ASR完成 :a1, 0, 300ms - LLM完成 :a2, after a1, 800ms - TTS完成 :a3, after a2, 500ms - 播放 :a4, after a3, 500ms - - section 流式 - ASR :b1, 0, 300ms - LLM开始 :b2, after b1, 200ms - TTS开始 :b3, after b2, 100ms - 边生成边播放 :b4, after b3, 600ms -``` - ---- - -## 实时交互引擎与多模态 - -### 实时交互引擎连接 - -实时交互引擎可连接**实时交互引擎**后端,包括: - -| 后端 | 说明 | -|------|------| -| **OpenAI Realtime** | OpenAI 实时语音模型 | -| **Gemini Live** | Google 实时多模态 | -| **Doubao 实时交互引擎** | 豆包实时交互 | - -实时交互引擎与管线式引擎中的 LLM 一样,均可连接**工具**:Webhook、客户端工具、内建工具。 - -### 多模态引擎架构 - -多模态引擎使用端到端模型,直接处理音频输入输出: +Realtime 引擎直接连接端到端实时模型,让模型同时处理输入、理解、生成与打断。 ```mermaid flowchart LR - subgraph Client["客户端"] - Mic[麦克风] --> AudioIn[音频输入] - AudioOut[音频输出] --> Speaker[扬声器] - end - - subgraph Engine["引擎"] - AudioIn --> RT[Realtime Model] - RT --> AudioOut - RT --> Tools[工具] - end - - subgraph Model["实时交互引擎"] - RT --> GPT4o[OpenAI Realtime] - RT --> Gemini[Gemini Live] - RT --> Doubao[Doubao 实时] - end + Input[音频 / 视频 / 文本输入] --> RT[Realtime Model] + RT --> Output[音频 / 文本输出] + RT --> Tools[工具] ``` -### 数据流详解 
+这样做的好处是: -```mermaid -sequenceDiagram - participant U as 用户 - participant E as 引擎 - participant RT as Realtime Model +- 链路更短,延迟更低 +- 全双工与打断通常更自然 +- 接入路径更简单,适合强调体验的入口 - U->>E: 音频帧 - E->>RT: 转发音频 - - Note over RT: 端到端处理 - - RT-->>E: 音频响应 (流式) - E-->>U: 播放音频 - - Note over U,RT: 支持全双工
用户可随时打断 -``` +代价是: -### 外部服务(管线式) +- 更依赖特定模型供应商 +- 对 ASR / TTS / 回合检测的独立控制更弱 +- 成本和能力边界受实时模型限制更大 -管线式引擎各环节可选用以下**外部服务**: +## 怎么选 -| 服务 | 说明 | -|------|------| -| **OpenAI** | LLM / ASR / TTS 等 | -| **SiliconFlow** | 国内 API 服务 | -| **DashScope** | 阿里云灵积 | -| **本地模型** | 私有化部署模型 | +### 适合选择 Pipeline 的情况 -### 支持的实时交互模型 +- 你要接入特定 ASR 或 TTS 供应商 +- 你需要知识库、工具、工作流形成稳定业务流程 +- 你更在意可解释性、观测和分段优化 +- 你需要把成本按环节精细控制 -| 模型 | 供应商 | 特点 | -|------|--------|------| -| **OpenAI Realtime** | OpenAI | 最自然的语音,延迟极低 | -| **Gemini Live** | Google | 多模态能力强 | -| **Doubao 实时交互** | 字节跳动 | 国内可用,中文优化 | +### 适合选择 Realtime 的情况 -### 延迟对比 +- 你把“自然对话感”放在首位 +- 你需要更低的首响和更顺滑的打断体验 +- 你可以接受对某个模型供应商的依赖 +- 你的场景更接近语音助手、陪练、虚拟角色或多模态入口 -```mermaid -xychart-beta - title "端到端延迟对比" - x-axis ["管线式 (普通)", "管线式 (优化)", "多模态"] - y-axis "延迟 (ms)" 0 --> 1500 - bar [1200, 700, 300] -``` +## 简化决策表 ---- +| 场景 | 推荐引擎 | 原因 | +|------|----------|------| +| 企业客服 / 电话机器人 | Pipeline | 可控、可审计、易接工具与业务系统 | +| 知识问答 / 业务流程助手 | Pipeline | 更适合接知识库与工作流 | +| 高拟真语音助手 | Realtime | 更自然、更低延迟 | +| 多模态入口 | Realtime | 端到端处理音频 / 视频 / 文本 | +| 预算敏感场景 | Pipeline | 更容易逐环节优化成本 | -## 智能打断机制 +## 智能打断的差异 -两种引擎都支持智能打断,但实现方式不同。 +两类引擎都支持打断,但边界不同: -### 管线式引擎打断 +- **Pipeline**:由 VAD / 回合检测与 TTS 停止逻辑协同实现,行为更可控 +- **Realtime**:更多由实时模型内部完成,体验更自然,但可解释性更低 -```mermaid -sequenceDiagram - participant U as 用户 - participant E as 引擎 - participant TTS as TTS +## 继续阅读 - Note over E,TTS: TTS 正在合成播放 - E->>U: 音频帧... - - U->>E: 用户说话 (检测到 VAD) - E->>E: 判断是否有效打断 - - alt 有效打断 - E->>TTS: 停止合成 - E->>E: 清空音频缓冲 - E-->>U: output.audio.interrupted - Note over E: 处理新输入 - else 噪音/误触发 - Note over E: 继续播放 - end -``` - -### 多模态引擎打断 - -多模态模型原生支持全双工,打断由模型内部处理: - -```mermaid -sequenceDiagram - participant U as 用户 - participant E as 引擎 - participant RT as Realtime Model - - Note over RT: 模型正在输出 - RT-->>E: 音频流... - E-->>U: 播放 - - U->>E: 用户说话 - E->>RT: 转发用户音频 - - Note over RT: 模型检测到打断
自动停止输出 - - RT-->>E: 新的响应 - E-->>U: 播放新响应 -``` - ---- - -## 引擎选择指南 - -### 决策流程 - -```mermaid -flowchart TD - Start[选择引擎] --> Q1{延迟要求?} - - Q1 -->|< 500ms| Q2{预算充足?} - Q1 -->|> 500ms 可接受| Pipeline[管线式引擎] - - Q2 -->|是| Q3{模型可用?} - Q2 -->|否| Pipeline - - Q3 -->|GPT-4o/Gemini 可用| Multimodal[多模态引擎] - Q3 -->|国内环境受限| Q4{Step Audio?} - - Q4 -->|可用| Multimodal - Q4 -->|不可用| Pipeline -``` - -### 场景推荐 - -| 场景 | 推荐引擎 | 理由 | -|------|---------|------| -| **企业客服** | 管线式 | 成本可控,可定制 ASR | -| **高端虚拟人** | 多模态 | 最自然的交互体验 | -| **电话机器人** | 管线式 | 可对接电信 ASR | -| **语音助手** | 多模态 | 低延迟,自然对话 | -| **口语练习** | 管线式 | 需要精确的 ASR 评分 | - -### 混合方案 - -也可以根据用户等级使用不同引擎: - -```mermaid -flowchart LR - User[用户请求] --> Router{路由判断} - - Router -->|VIP 用户| Multimodal[多模态引擎] - Router -->|普通用户| Pipeline[管线式引擎] - - Multimodal --> Response[响应] - Pipeline --> Response -``` - ---- - -## 配置示例 - -### 管线式引擎配置 - -```json -{ - "engine": "pipeline", - "asr": { - "provider": "openai-compatible", - "model": "FunAudioLLM/SenseVoiceSmall", - "language": "zh" - }, - "llm": { - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0.7 - }, - "tts": { - "provider": "openai-compatible", - "model": "FunAudioLLM/CosyVoice2-0.5B", - "voice": "anna" - } -} -``` - -### 多模态引擎配置 - -```json -{ - "engine": "multimodal", - "model": { - "provider": "openai", - "model": "gpt-4o-realtime-preview", - "voice": "alloy" - } -} -``` - ---- - -## 相关文档 - -- [系统架构](../overview/architecture.md) - 整体架构设计 -- [WebSocket 协议](../api-reference/websocket.md) - 协议详情 -- [部署指南](../deployment/index.md) - 引擎部署配置 +- [Pipeline 引擎](pipeline-engine.md) - 查看分段链路、延迟构成与配置示例 +- [Realtime 引擎](realtime-engine.md) - 查看端到端实时模型的交互路径 +- [系统架构](../overview/architecture.md) - 从服务边界理解引擎在整体系统中的位置 diff --git a/docs/content/concepts/index.md b/docs/content/concepts/index.md index aab0fda..2051b80 100644 --- a/docs/content/concepts/index.md +++ b/docs/content/concepts/index.md @@ -1,286 +1,49 @@ -# 核心概念 +# 核心概念 -本章节介绍 Realtime Agent Studio 中的核心概念,帮助你更好地理解和使用平台。 +本章节只解释 
Realtime Agent Studio 的关键心智模型,不重复环境部署或助手构建的操作细节。 --- -## 概念总览 +## 先建立这三个概念 -```mermaid -flowchart TB - subgraph Platform["RAS 平台"] - Assistant[助手 Assistant] - - subgraph Resources["资源库"] - LLM[LLM 模型] - ASR[ASR 模型] - TTS[TTS 声音] - KB[知识库] - end - - subgraph Engine["交互引擎"] - Pipeline[管线式引擎] - Multimodal[多模态引擎] - end - - Session[会话 Session] - end +### 1. 助手是“对外提供能力的配置单元” - Assistant --> LLM - Assistant --> ASR - Assistant --> TTS - Assistant --> KB - Assistant --> Engine - Engine --> Session -``` +助手决定了一个实时 AI 入口对外表现成什么角色:它使用什么提示词、哪些模型、能访问哪些知识和工具、会话如何开始以及运行时如何被覆盖。 + +- [助手概念](assistants.md) — 统一理解助手、会话、动态变量与能力边界 +- [配置选项](assistants/configuration.md) — 了解界面层和运行时配置项如何分工 +- [提示词指南](assistants/prompts.md) — 学会定义助手的角色、任务、风格与约束 +- [测试调试](assistants/testing.md) — 理解如何验证助手行为和定位问题 + +### 2. 引擎是“承载实时交互的运行时” + +RAS 同时提供 Pipeline 引擎与 Realtime 引擎。它们都能驱动实时助手,但在延迟、可控性、成本和可替换性上各有取舍。 + +- [引擎概览](engines.md) — 两类引擎的能力边界与选择建议 +- [Pipeline 引擎](pipeline-engine.md) — VAD/ASR/TD/LLM/TTS 串联的可组合链路 +- [Realtime 引擎](realtime-engine.md) — 面向端到端实时模型的低延迟交互路径 + +### 3. 工作流是“把复杂业务拆成步骤和分支的方法” + +当单一提示词不足以稳定处理多步骤、多条件、多工具的业务流程时,应使用工作流来显式编排节点、路由和回退策略。 + +- [工作流](../customization/workflows.md) — 了解何时需要工作流、它由哪些部分组成、如何设计可维护的流程 --- -## 助手 (Assistant) +## 本章节不负责什么 -**助手**是 RAS 的核心实体,代表一个可对话的 AI 智能体。 +以下内容属于“如何搭建和使用”,不在本章节展开说明: -### 助手配置 +- 助手搭建、模型/知识库/工具/工作流配置:从 [助手概览](assistants.md) 进入构建链路 +- 部署与环境变量:见 [环境与部署](../getting-started/index.md) +- 第一个助手的最短操作路径:见 [快速开始](../quickstart/index.md) +- 事件格式与接入协议:见 [API 参考](../api-reference/index.md) -每个助手包含以下配置: +## 建议阅读顺序 -| 配置项 | 说明 | -|-------|------| -| **名称** | 助手的显示名称 | -| **系统提示词** | 定义助手角色、行为、限制 | -| **LLM 模型** | 选择用于生成回复的大语言模型 | -| **ASR 模型** | 选择用于语音识别的模型 | -| **TTS 声音** | 选择用于语音合成的音色 | -| **工具** | 配置助手可调用的外部工具 | -| **知识库** | 关联的知识库(用于 RAG) | +1. 先读 [助手概念](assistants.md),明确你要配置的对象到底是什么 +2. 再读 [引擎概览](engines.md),决定应该选择 Pipeline 还是 Realtime +3. 如果场景涉及多步骤流程,再读 [工作流](../customization/workflows.md) +4. 
最后回到 [快速开始](../quickstart/index.md) 或 [助手概览](assistants.md) 开始具体配置 -### 助手生命周期 - -```mermaid -stateDiagram-v2 - [*] --> Draft: 创建 - Draft --> Draft: 编辑配置 - Draft --> Published: 发布 - Published --> Draft: 取消发布 - Published --> Published: 更新配置 - Published --> [*]: 删除 -``` - ---- - -## 会话 (Session) - -**会话**代表一次完整的对话交互,从用户连接到断开。 - -### 会话状态 - -```mermaid -stateDiagram-v2 - [*] --> Connecting: WebSocket 连接 - Connecting --> Started: session.started - Started --> Active: 对话中 - Active --> Active: 多轮对话 - Active --> Stopped: session.stop - Stopped --> [*]: 连接关闭 -``` - -### 会话数据 - -每个会话记录包含: - -- **基本信息** - ID、时长、时间戳 -- **音频数据** - 用户和助手的音频记录 -- **转写文本** - ASR 识别结果 -- **LLM 交互** - 输入输出和工具调用 -- **元数据** - 渠道、来源、自定义变量 - ---- - -## 管线式引擎 vs 多模态引擎 - -RAS 支持两种引擎架构,适用于不同场景。 - -### 管线式引擎 (Pipeline) - -将语音交互拆分为多个环节,包含 **VAD(声音活动检测)**、**ASR(语音识别)**、**TD(回合检测)**、**LLM(大语言模型)**、**TTS(语音合成)**。外部服务可选 **OpenAI**、**SiliconFlow**、**DashScope**、**本地模型**。LLM 与实时交互引擎均可连接**工具**(Webhook、客户端工具、内建工具)。 - -``` -用户语音 → [VAD] → [ASR] → [TD] → 文本 → [LLM] → 回复 → [TTS] → 助手语音 -``` - -**优点:** - -- 灵活选择各环节供应商(OpenAI、SiliconFlow、DashScope、本地模型) -- 可独立优化 VAD、ASR、TD、LLM、TTS 每个环节 -- 成本可控 - -**缺点:** - -- 延迟较高(累加延迟) -- 需要协调多个服务 - -### 实时交互引擎与多模态 (Realtime / Multimodal) - -实时交互引擎可连接 **OpenAI Realtime**、**Gemini Live**、**Doubao 实时交互引擎** 等,同样可连接工具。使用端到端模型直接处理: - -``` -用户语音 → [Realtime Model] → 助手语音 -``` - -**优点:** - -- 更低延迟 -- 更自然的语音 -- 架构简单 - -**缺点:** - -- 依赖特定供应商 -- 成本较高 -- 可定制性有限 - -### 选择建议 - -| 场景 | 推荐引擎 | -|------|---------| -| 成本敏感 | 管线式 | -| 延迟敏感 | 多模态 | -| 需要特定 ASR/TTS | 管线式 | -| 追求最自然体验 | 多模态 | - ---- - -## 智能打断 (Barge-in) - -**智能打断**是指用户在助手说话时可以随时插话,系统能够: - -1. 检测用户开始说话 -2. 立即停止 TTS 播放 -3. 
处理用户新的输入 - -### 打断检测方式 - -| 方式 | 说明 | -|------|------| -| **VAD** | Voice Activity Detection,检测到声音活动即打断 | -| **语义** | 基于语音内容判断是否有意义的打断 | -| **混合** | VAD + 语义结合,减少误触发 | - -### 打断流程 - -```mermaid -sequenceDiagram - participant User as 用户 - participant Engine as 引擎 - participant TTS as TTS - - Note over Engine,TTS: 助手正在播放回复 - Engine->>User: 音频流... - User->>Engine: 开始说话 (VAD 触发) - Engine->>Engine: 打断判断 - Engine->>TTS: 停止合成 - Engine->>User: output.audio.interrupted - Note over Engine: 处理新输入 -``` - ---- - -## 工具调用 (Tool Calling) - -助手可以通过**工具**扩展能力,访问外部系统或执行特定操作。 - -### 工具类型 - -管线式引擎中的 LLM 与实时交互引擎均可连接**工具**,包括: - -| 类型 | 说明 | 示例 | -|------|------|------| -| **Webhook** | 调用外部 HTTP API | 查询订单、预约日程 | -| **客户端工具** | 由客户端执行的操作 | 打开页面、显示表单 | -| **内建工具** | 平台提供的工具 | 代码执行、计算器 | - -### 工具调用流程 - -```mermaid -sequenceDiagram - participant User as 用户 - participant LLM as LLM - participant Tool as 工具 - - User->>LLM: "帮我查一下订单状态" - LLM->>LLM: 决定调用工具 - LLM->>Tool: get_order_status(order_id) - Tool-->>LLM: {status: "已发货"} - LLM->>User: "您的订单已发货" -``` - ---- - -## 知识库 (Knowledge Base) - -**知识库**让助手能够基于私有文档回答问题,实现 RAG(检索增强生成)。 - -### 工作原理 - -```mermaid -flowchart LR - subgraph Indexing["索引阶段"] - Doc[文档] --> Chunk[分块] - Chunk --> Embed[向量化] - Embed --> Store[(向量数据库)] - end - - subgraph Query["查询阶段"] - Q[用户问题] --> QEmbed[问题向量化] - QEmbed --> Search[相似度搜索] - Store --> Search - Search --> Context[相关上下文] - Context --> LLM[LLM 生成回答] - end -``` - -### 支持的文档格式 - -- PDF -- Word (.docx) -- Markdown -- 纯文本 -- HTML - ---- - -## 动态变量 - -**动态变量**允许在运行时向助手注入上下文信息。 - -### 使用方式 - -在系统提示词中使用 `{{variable}}` 占位符: - -``` -你是{{company_name}}的客服助手。 -当前用户是{{customer_name}},会员等级为{{tier}}。 -``` - -连接时通过 `dynamicVariables` 传入: - -```json -{ - "type": "session.start", - "metadata": { - "dynamicVariables": { - "company_name": "ABC 公司", - "customer_name": "张三", - "tier": "VIP" - } - } -} -``` - ---- - -## 下一步 - -- [快速开始](../quickstart/index.md) - 创建第一个助手 -- [助手配置](../assistants/configuration.md) - 详细配置说明 -- [WebSocket 
协议](../api-reference/websocket.md) - API 接口详情 diff --git a/docs/content/concepts/pipeline-engine.md b/docs/content/concepts/pipeline-engine.md new file mode 100644 index 0000000..1f4d5e5 --- /dev/null +++ b/docs/content/concepts/pipeline-engine.md @@ -0,0 +1,137 @@ +# Pipeline 引擎 + +Pipeline 引擎把实时对话拆成多个清晰环节,适合需要高可控性、可替换外部能力和复杂业务编排的场景。 + +--- + +## 运行链路 + +```mermaid +flowchart LR + subgraph Input["输入处理"] + Audio[用户音频] --> VAD[声音活动检测 VAD] + VAD --> ASR[语音识别 ASR] + ASR --> TD[回合检测 TD] + end + + subgraph Reasoning["语义处理"] + TD --> LLM[大语言模型 LLM] + LLM --> Tools[工具] + LLM --> Text[回复文本] + end + + subgraph Output["输出生成"] + Text --> TTS[语音合成 TTS] + TTS --> AudioOut[助手音频] + end +``` + +Pipeline 的关键价值不在于“环节多”,而在于每个环节都可以被单独选择、单独优化、单独观测。 + +## 它适合什么场景 + +- 需要接特定 ASR / TTS 供应商 +- 需要稳定接入知识库、工具和工作流 +- 需要把问题定位到具体环节,而不是只看到整体失败 +- 需要按延迟、成本、质量对不同环节分别优化 + +## 数据流 + +```mermaid +sequenceDiagram + participant U as 用户 + participant E as 引擎 + participant ASR as ASR 服务 + participant LLM as LLM 服务 + participant TTS as TTS 服务 + + U->>E: 音频帧 (PCM) + E->>E: VAD / 回合检测 + E->>ASR: 发送可识别音频 + ASR-->>E: transcript.delta / transcript.final + E->>LLM: 发送对话历史与当前输入 + LLM-->>E: assistant.response.delta + E->>TTS: 文本片段 + TTS-->>E: 音频片段 + E-->>U: 音频流与事件 +``` + +## 延迟来自哪里 + +| 环节 | 典型影响 | 常见优化点 | +|------|----------|------------| +| **VAD / EoU** | 用户说完后多久触发回复 | 调整静音阈值和最短语音门限 | +| **ASR** | 语音转写速度和准确率 | 选择合适模型、热词和语言设置 | +| **LLM** | 首个 token 返回速度 | 选择低延迟模型、优化上下文 | +| **TTS** | 文字到音频的生成速度 | 选择流式 TTS,缩短单次回复 | + +Pipeline 的总延迟通常不是单点问题,而是链路总和。因此更适合做“逐环节调优”。 + +## EoU(用户说完)为什么重要 + +Pipeline 必须决定“什么时候把当前轮输入正式交给 LLM”。这个判断通常由 **EoU**(End of Utterance,即判定用户是否已经说完的检测)完成。 + +- 阈值小:响应更快,但更容易把用户停顿误判为说完 +- 阈值大:更稳,但首次响应会更慢 + +你可以把它理解为 Pipeline 中最直接影响“对话节奏感”的参数之一。 + +## 工具、知识库和工作流如何插入 + +Pipeline 特别适合把业务能力插入到对话中: + +- **知识库**:在 LLM 生成前补充领域事实 +- **工具**:在需要外部信息或动作时调用系统能力 +- **工作流**:在多步骤、多分支流程中决定接下来走哪个节点 + +这也是它在企业客服、流程助手和知识问答场景中更常见的原因。 + +## 智能打断 + +在 Pipeline 中,打断通常由 VAD 检测和 TTS 停止逻辑协同完成: + +```mermaid +sequenceDiagram + participant U as 用户 + 
participant E as 引擎 + participant TTS as TTS + + Note over E,TTS: 正在播放回复 + E->>U: 音频流... + U->>E: 用户开始说话 + E->>E: 判定是否触发打断 + E->>TTS: 停止合成 / 播放 + E-->>U: output.audio.interrupted +``` + +相比端到端实时模型,这种方式更容易解释“为什么打断”以及“在哪个环节发生了问题”。 + +## 配置示例 + +```json +{ + "engine": "pipeline", + "asr": { + "provider": "openai-compatible", + "model": "FunAudioLLM/SenseVoiceSmall", + "language": "zh" + }, + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.7 + }, + "tts": { + "provider": "openai-compatible", + "model": "FunAudioLLM/CosyVoice2-0.5B", + "voice": "anna" + } +} +``` + +## 相关文档 + +- [引擎架构](engines.md) - 回到选择指南 +- [Realtime 引擎](realtime-engine.md) - 对比端到端实时模型路径 +- [工具](../customization/tools.md) - 设计可被 LLM 安全调用的工具 +- [知识库](../customization/knowledge-base.md) - 在对话中补充领域知识 diff --git a/docs/content/concepts/realtime-engine.md b/docs/content/concepts/realtime-engine.md new file mode 100644 index 0000000..757bd7b --- /dev/null +++ b/docs/content/concepts/realtime-engine.md @@ -0,0 +1,97 @@ +# Realtime 引擎 + +Realtime 引擎直接连接端到端实时模型,适合把低延迟和自然语音体验放在第一位的场景。 + +--- + +## 运行链路 + +```mermaid +flowchart LR + Input[音频 / 视频 / 文本输入] --> RT[Realtime Model] + RT --> Output[音频 / 文本输出] + RT --> Tools[工具] +``` + +与 Pipeline 不同,Realtime 引擎不会把 ASR、回合检测、LLM、TTS 作为独立阶段暴露出来,而是更多依赖实时模型整体处理。 + +## 常见后端 + +| 后端 | 特点 | +|------|------| +| **OpenAI Realtime** | 语音交互自然,延迟低 | +| **Gemini Live** | 多模态能力强 | +| **Doubao 实时交互** | 更适合国内环境与中文场景 | + +## 它适合什么场景 + +- 语音助手、陪练、虚拟角色等高自然度体验场景 +- 对首响和连续打断体验要求高的入口 +- 希望减少链路拼装复杂度,直接接入端到端模型的团队 + +## 数据流 + +```mermaid +sequenceDiagram + participant U as 用户 + participant E as 引擎 + participant RT as Realtime Model + + U->>E: 音频 / 视频 / 文本输入 + E->>RT: 转发实时流 + RT-->>E: 流式文本 / 音频输出 + E-->>U: 播放或渲染结果 +``` + +## Realtime 的优势 + +- **延迟更低**:链路更短,用户感知更自然 +- **全双工更顺滑**:用户插话时,模型更容易在内部处理打断 +- **多模态更直接**:适合音频、视频、文本混合输入输出场景 + +## Realtime 的取舍 + +- 更依赖实时模型供应商的能力边界 +- 不容易对 ASR / TTS / 回合检测做独立替换 +- 成本和可观测性往往不如 Pipeline 那样可逐环节拆分 + +## 智能打断 + +Realtime 
模型通常原生支持全双工和打断: + +```mermaid +sequenceDiagram + participant U as 用户 + participant E as 引擎 + participant RT as Realtime Model + + Note over RT: 模型正在输出 + RT-->>E: 音频流... + E-->>U: 播放 + U->>E: 用户开始说话 + E->>RT: 转发新输入 + Note over RT: 模型内部处理中断并切换回复 + RT-->>E: 新的响应 + E-->>U: 播放新响应 +``` + +这种方式更自然,但你通常只能看到模型的整体行为,而不是每个中间阶段的细节。 + +## 配置示例 + +```json +{ + "engine": "multimodal", + "model": { + "provider": "openai", + "model": "gpt-4o-realtime-preview", + "voice": "alloy" + } +} +``` + +## 相关文档 + +- [引擎架构](engines.md) - 回到两类引擎的选择指南 +- [Pipeline 引擎](pipeline-engine.md) - 查看分段可控的运行路径 +- [WebSocket 协议](../api-reference/websocket.md) - 了解客户端如何与引擎建立会话 diff --git a/docs/content/customization/asr.md b/docs/content/customization/asr.md index 2251804..74e1097 100644 --- a/docs/content/customization/asr.md +++ b/docs/content/customization/asr.md @@ -1,6 +1,21 @@ -# 语音识别 +# 语音识别 -语音识别(ASR)负责将用户音频实时转写为文本,供对话引擎理解。 +语音识别(ASR)负责把用户音频实时转写成文本,供引擎继续理解和处理。 + +## 关键配置项 + +| 配置项 | 说明 | +|--------|------| +| **ASR 引擎** | 选择语音识别服务提供商或自建服务 | +| **模型** | 实际使用的识别模型名称 | +| **语言** | 中文、英文或多语言 | +| **热词** | 提高业务词汇、品牌词、专有名词识别率 | +| **标点与规范化** | 自动补全标点、规范数字和日期等 | + +## 模式 + +- `offline`:引擎本地缓冲音频后触发识别(适用于 OpenAI-compatible / SiliconFlow)。 +- `streaming`:音频分片实时发送到服务端,服务端持续返回转写事件(适用于 DashScope Realtime ASR、Volcengine BigASR)。 ## 配置项 @@ -8,17 +23,31 @@ |---|---| | ASR 引擎 | 选择语音识别服务提供商 | | 模型 | 识别模型名称 | +| `enable_interim` | 是否开启离线 ASR 中间结果(默认 `false`,仅离线模式生效) | +| `app_id` / `resource_id` | Volcengine 等厂商的应用标识与资源标识 | +| `request_params` | 厂商原生请求参数透传,例如 `end_window_size`、`force_to_speech_time`、`context` | | 语言 | 中文/英文/多语言 | | 热词 | 提升特定词汇识别准确率 | | 标点与规范化 | 是否自动补全标点、文本规范化 | -## 建议 +## 选择建议 -- 客服场景建议开启热词并维护业务词表 -- 多语言场景建议按会话入口显式指定语言 -- 对延迟敏感场景优先选择流式识别模型 +- 客服、外呼等业务场景建议维护热词表,并按业务线持续更新 +- 多语言入口建议显式指定语言,避免模型自动判断带来的波动 +- 对延迟敏感的场景优先选择流式识别模型 +- 对准确率敏感的场景,先评估专有名词、数字、地址等样本的识别表现 + +## 运行建议 + +- 使用与接入端一致的采样率和编码方式,减少额外转换 +- 在测试阶段准备固定样本,便于对比不同模型或参数的变化 +- 把“识别准确率”和“识别延迟”一起看,不要只看其中一项 ## 相关文档 -- [语音配置总览](voices.md) - +- 
[声音资源](voices.md) - 完整语音输入输出链路中的 TTS 侧配置 +- [快速开始](../quickstart/index.md) - 以任务路径接入第一个 ASR 资源 +- 客服场景建议开启热词并维护业务词表 +- 多语言场景建议按会话入口显式指定语言 +- 对延迟敏感场景优先选择流式识别模型 +- 当前支持提供商:`openai_compatible`、`siliconflow`、`dashscope`、`volcengine`、`buffered`(回退) diff --git a/docs/content/customization/knowledge-base.md b/docs/content/customization/knowledge-base.md index 8678f6a..1b742b2 100644 --- a/docs/content/customization/knowledge-base.md +++ b/docs/content/customization/knowledge-base.md @@ -1,53 +1,86 @@ -# 知识库 +# 知识库 -知识库基于 RAG(检索增强生成)技术,让 AI 能够回答私有领域问题。 +知识库负责承载助手需要引用的私有事实、业务资料和长文档内容,是 RAG(检索增强生成)能力的正式说明页。 -## 概述 +## 什么时候应该用知识库 -![知识库](../images/knowledge.png) +当问题答案主要来自“稳定文档”而不是实时外部动作时,优先使用知识库: -## 创建知识库 +- 产品说明、政策条款、操作流程、培训材料 +- 内部手册、FAQ、规范文档 +- 需要被多位助手复用的领域知识 -### 步骤 +如果任务本质上是“查状态、写数据、执行动作”,那通常更适合 [工具](tools.md),而不是知识库。 -1. 进入 **知识库** 页面 -2. 点击 **新建知识库** -3. 填写知识库名称 -4. 上传文档 +## 工作原理 -### 支持格式 +```mermaid +flowchart LR + subgraph Indexing["索引阶段"] + Doc[文档] --> Chunk[分块] + Chunk --> Embed[向量化] + Embed --> Store[(向量数据库)] + end -| 格式 | 说明 | -|------|------| -| Markdown | 最佳选择,格式清晰 | -| PDF | 自动提取文本 | -| TXT | 纯文本支持 | -| Word | 需转换为其他格式 | + subgraph Query["查询阶段"] + Q[用户问题] --> Search[相似度检索] + Store --> Search + Search --> Context[相关片段] + Context --> LLM[LLM 生成回答] + end +``` -### 文档上传 +核心原则很简单:把长文档转成可检索的片段,在用户提问时只把最相关的内容送给模型。 -- 拖拽上传或点击选择 -- 单文件大小限制 10MB -- 建议单文档不超过 50000 字 +## 适合放进知识库的内容 -## 配置检索参数 +| 适合 | 不适合 | +|------|--------| +| 稳定规则、标准答案、产品文档 | 高频变化的实时状态 | +| 领域术语、说明手册、培训材料 | 需要外部系统写入或变更的动作 | +| 需要跨助手复用的内容 | 只在单次会话里临时生成的数据 | -| 参数 | 说明 | 默认值 | -|------|------|--------| -| 相似度阈值 | 低于此分数的结果不返回 | 0.7 | -| 返回数量 | 单次检索返回的结果数 | 3 | -| 分块大小 | 文档分块的最大长度 | 500 | +## 内容准备建议 -## 管理知识库 +- 优先上传结构清晰、主题明确的文档 +- 对超长文档按主题拆分,减少一次索引的噪声 +- 标题、章节名和表格说明对召回质量很重要,不要全部删掉格式信息 +- 与其堆很多相近文档,不如先清理重复、过期和相互冲突的内容 -- **查看文档** - 浏览已上传的文件 -- **删除文档** - 移除不需要的内容 -- **更新文档** - 重新上传覆盖 -- **测试检索** - 验证知识库效果 +## 常见配置项 -## 关联助手 +| 配置项 | 作用 | 常见做法 | +|--------|------|----------| +| **相似度阈值** | 过滤弱相关结果 | 
从保守值起步,再按误召回调 | +| **返回数量** | 控制一次送给模型的候选片段数 | 先少后多,避免上下文污染 | +| **分块大小** | 决定每个文档片段的长度 | 按文档类型和问题粒度调整 | -在助手配置的 **知识** 标签页中: -1. 选择要关联的知识库 -2. 设置检索策略 -3. 保存配置 +## 创建与维护 + +### 最小流程 + +1. 新建知识库 +2. 上传文档 +3. 完成索引 +4. 用典型问题测试召回结果 +5. 绑定到目标助手 + +### 日常维护 + +- 删除过期或互相矛盾的文档 +- 当业务口径变化时,优先更新知识库而不是只改提示词 +- 为关键问题准备固定测试问句,观察召回是否稳定 + +## 与助手的关系 + +知识库不是独立产品入口,而是助手的能力层: + +- 助手决定是否、何时、以什么风格使用知识 +- 知识库决定能够提供哪些事实片段 +- 工作流和工具可以与知识库并用,但承担不同职责 + +## 相关文档 + +- [助手概念](../concepts/assistants.md) - 知识库在助手能力层中的位置 +- [LLM 模型](models.md) - 为知识库准备嵌入或重排模型 +- [工具](tools.md) - 当任务需要执行动作时,优先考虑工具而不是知识库 diff --git a/docs/content/customization/models.md b/docs/content/customization/models.md index f149c81..8343d11 100644 --- a/docs/content/customization/models.md +++ b/docs/content/customization/models.md @@ -1,44 +1,53 @@ -# 模型配置 +# LLM 模型 -## LLM 模型库 +本页是资源库中 LLM 模型的正式说明页,聚焦文本生成、嵌入和重排模型的接入与选择。 -![LLM模型库](../images/llms.png) +## 这页负责什么 -### 支持的模型 +当你需要为助手配置“理解与生成能力”时,请从这里开始决定: -| 供应商 | 模型 | 特点 | +- 使用哪个供应商或模型家族 +- 该模型负责文本生成、嵌入还是重排 +- 接口地址、认证信息和默认参数如何设置 + +语音识别和语音合成分别由 [语音识别](asr.md) 与 [声音资源](voices.md) 说明,不在本页重复。 + +## 模型类型 + +| 类型 | 用途 | 常见场景 | +|------|------|----------| +| **文本模型** | 生成回复、总结、分类、规划 | 助手主对话、工具调用决策 | +| **嵌入模型** | 向量化文档或查询 | 知识库检索 | +| **重排模型** | 对检索结果再次排序 | 提升知识召回质量 | + +## 配置清单 + +| 配置项 | 说明 | 建议 | |--------|------|------| -| **OpenAI** | GPT-4 / GPT-3.5 | 通用能力强 | -| **DeepSeek** | DeepSeek Chat | 高性价比 | -| **SiliconFlow** | 多种开源模型 | 本地部署友好 | -| **Google** | Gemini Pro | 多模态支持 | +| **供应商** | OpenAI 兼容、托管平台或自建服务 | 用统一命名规范区分环境 | +| **模型名称** | 控制台中的显示名称 | 体现厂商、用途和环境 | +| **模型标识** | 请求中实际使用的 model 名称 | 保持与供应商文档一致 | +| **Base URL** | 接口地址 | 为不同环境分别配置 | +| **API Key / Token** | 鉴权凭证 | 与显示名称配套管理 | +| **默认参数** | Temperature、Max Tokens、上下文长度等 | 按业务场景收敛默认值 | -### 配置步骤 +## 选择建议 -1. 进入 **LLM 库** 页面 -2. 点击 **添加模型** -3. 选择供应商 -4. 填写 API Key 和 Endpoint -5. 
设置默认参数 +- **先按用途选模型,再按成本和延迟筛选供应商** +- **文本模型不要承担知识库检索职责**:检索应交给嵌入与重排模型 +- **为不同环境建立清晰命名**:如 `prod-gpt4o-mini`、`staging-qwen-text` +- **默认参数要保守**:让助手默认稳定,再在单个场景内按需调优 -### 参数说明 +## 常见组合 -| 参数 | 说明 | 建议值 | -|------|------|--------| -| Temperature | 随机性 | 0.7 | -| Max Tokens | 最大输出长度 | 2048 | -| Top P | 核采样 | 0.9 | +| 目标 | 推荐组合 | +|------|----------| +| **通用对话助手** | 1 个文本模型 | +| **知识问答助手** | 文本模型 + 嵌入模型 | +| **高质量知识召回** | 文本模型 + 嵌入模型 + 重排模型 | -## ASR 语音识别 +## 下一步 -### 支持引擎 - -- **Whisper** - OpenAI 通用语音识别 -- **SenseVoice** - 高精度中文语音识别 - -### 配置方法 - -1. 进入 **ASR 库** 页面 -2. 选择识别引擎 -3. 配置音频参数(采样率、编码) -4. 测试识别效果 +- [语音识别](asr.md) - 为语音输入选择 ASR +- [声音资源](voices.md) - 为语音输出准备 TTS 资源 +- [知识库](knowledge-base.md) - 把嵌入 / 重排模型接入 RAG 链路 diff --git a/docs/content/customization/tools.md b/docs/content/customization/tools.md index 2c5a5b0..993846d 100644 --- a/docs/content/customization/tools.md +++ b/docs/content/customization/tools.md @@ -1,38 +1,60 @@ -# 工具集成 +# 工具 -工具(Tools)让助手能够执行外部操作,如查询天气、搜索信息、调用 API 等。 +工具让助手从“会回答”扩展成“能执行动作”。本页是工具能力的正式说明页。 -## 概述 +## 什么时候应该用工具 -工具是助手能力的扩展。当用户的请求需要外部数据或操作时,助手会调用相应的工具。 +当用户请求需要依赖外部系统、实时数据或执行某个动作时,应该使用工具,而不是只靠提示词或知识库。 -## 内置工具 +典型场景包括: -| 工具 | 说明 | 参数 | -|------|------|------| -| `search` | 网络搜索 | query: 搜索关键词 | -| `weather` | 天气查询 | city: 城市名称 | -| `calculator` | 数学计算 | expression: 计算表达式 | -| `knowledge` | 知识库检索 | query: 查询内容 | +- 查询订单、库存、物流、天气等实时信息 +- 创建预约、提交表单、写入业务系统 +- 获取客户端环境能力,如定位、相机、权限确认 -### 启用内置工具 +如果问题本质上是“查阅稳定资料”,优先用 [知识库](knowledge-base.md);如果问题是“执行动作或读写实时状态”,优先用工具。 -在助手配置的 **工具** 标签页: +## 工具类型 -1. 勾选需要启用的工具 -2. 配置工具参数(如有) -3. 
保存配置 +| 类型 | 说明 | 常见场景 | +|------|------|----------| +| **Webhook 工具** | 调用外部 HTTP API | 订单查询、CRM 写入、预约服务 | +| **客户端工具** | 由接入端在本地执行 | 获取定位、打开相机、请求用户授权 | +| **内建工具** | 平台或运行时直接提供 | 搜索、计算、知识检索等 | -## 自定义工具 +## 工具调用的基本过程 -支持通过 HTTP 回调实现自定义工具。 +```mermaid +sequenceDiagram + participant User as 用户 + participant Assistant as 助手 / 模型 + participant Tool as 工具 -### 定义工具 + User->>Assistant: 发起请求 + Assistant->>Assistant: 判断是否需要工具 + Assistant->>Tool: 发起工具调用 + Tool-->>Assistant: 返回结构化结果 + Assistant->>User: 组织最终回复 +``` + +关键点不是“模型会不会调用工具”,而是“工具的定义是否足够清晰,能让模型在正确时机调用”。 + +## 如何定义一个好工具 + +| 要素 | 为什么重要 | +|------|------------| +| **清晰名称** | 让模型知道它是做什么的,而不是猜用途 | +| **明确描述** | 告诉模型何时调用、何时不要调用 | +| **完整参数定义** | 降低缺参、错参和歧义调用 | +| **稳定返回结构** | 让模型更容易根据结果组织回复 | +| **明确错误语义** | 让失败时也能安全退回用户对话 | + +## Webhook 工具示例 ```json { "name": "query_order", - "description": "查询用户订单信息", + "description": "根据订单号查询当前订单状态,仅用于用户已提供订单号的场景。", "parameters": { "type": "object", "properties": { @@ -42,188 +64,45 @@ } }, "required": ["order_id"] - }, - "endpoint": { - "url": "https://api.example.com/orders", - "method": "GET", - "headers": { - "Authorization": "Bearer {{api_key}}" - } } } ``` -### 工具字段说明 +## 客户端工具的作用 -| 字段 | 说明 | -|------|------| -| name | 工具名称(英文标识符) | -| description | 工具描述(LLM 用于理解工具用途) | -| parameters | 参数定义(JSON Schema 格式) | -| endpoint | HTTP 调用配置 | +某些动作必须在接入端执行,例如: -### 参数映射 +- 获取当前位置 +- 请求麦克风或相机权限 +- 打开特定页面或原生能力 -工具参数自动映射到 HTTP 请求: +这类工具通常通过事件流和客户端配合完成,而不是由后端直接执行。 -- **GET 请求**:参数作为 query string -- **POST 请求**:参数作为 JSON body +## 工具设计建议 -## 客户端工具 +- **一工具一职责**:不要把多个业务动作塞进同一个工具 +- **名称与描述写给模型看**:必须明确何时用、何时不用 +- **先设计错误返回**:失败时模型应该知道如何解释给用户 +- **减少高权限工具暴露面**:不是每个助手、每个工作流节点都需要全部工具 +- **把业务规则放回系统**:工具负责执行,提示词负责决策边界 -某些工具需要在客户端执行(如获取地理位置)。 +## 与知识库、工作流的分工 -### 工作流程 +- **知识库**:提供稳定事实 +- **工具**:执行动作或读取实时状态 +- **工作流**:决定何时进入某个步骤、调用哪个工具、失败如何回退 -1. 助手返回 `assistant.tool_call` 事件 -2. 客户端执行工具并获取结果 -3. 客户端发送 `tool_call.results` 消息 -4. 
助手继续生成回复 +当一个助手开始涉及多步骤、多系统调用时,工具通常应与 [工作流](workflows.md) 一起设计,而不是孤立配置。 -### 服务端事件 +## 安全与治理 -```json -{ - "type": "assistant.tool_call", - "data": { - "tool_call_id": "call_abc123", - "tool_name": "get_location", - "arguments": {} - } -} -``` +- 校验输入,不直接信任模型生成的参数 +- 为工具设置最小权限和清晰的可见范围 +- 记录调用日志,便于审计和回放 +- 对外部接口增加超时、重试和速率限制策略 -### 客户端响应 +## 相关文档 -```json -{ - "type": "tool_call.results", - "results": [ - { - "tool_call_id": "call_abc123", - "name": "get_location", - "output": { - "latitude": 39.9042, - "longitude": 116.4074, - "city": "北京" - }, - "status": { - "code": 200, - "message": "ok" - } - } - ] -} -``` - -## 工具调用示例 - -### 天气查询 - -用户:"北京今天天气怎么样?" - -助手调用工具: -```json -{ - "tool_name": "weather", - "arguments": { - "city": "北京" - } -} -``` - -工具返回: -```json -{ - "temperature": 25, - "condition": "晴", - "humidity": 40 -} -``` - -助手回复:"北京今天天气晴朗,气温 25 度,湿度 40%。" - -### 订单查询 - -用户:"帮我查一下订单 12345" - -助手调用工具: -```json -{ - "tool_name": "query_order", - "arguments": { - "order_id": "12345" - } -} -``` - -工具返回: -```json -{ - "order_id": "12345", - "status": "已发货", - "tracking": "SF1234567890" -} -``` - -助手回复:"您的订单 12345 已发货,快递单号是 SF1234567890。" - -## 工具配置最佳实践 - -### 1. 清晰的描述 - -工具描述应该让 LLM 准确理解何时使用: - -``` -好的描述: -"查询指定城市的实时天气信息,包括温度、天气状况和湿度" - -不好的描述: -"天气工具" -``` - -### 2. 完整的参数定义 - -```json -{ - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": "城市名称,如 '北京'、'上海'" - }, - "date": { - "type": "string", - "description": "日期,格式 YYYY-MM-DD,可选,默认今天" - } - }, - "required": ["city"] - } -} -``` - -### 3. 错误处理 - -工具应返回清晰的错误信息: - -```json -{ - "status": { - "code": 404, - "message": "未找到该城市的天气数据" - } -} -``` - -## 安全注意事项 - -1. **验证输入** - 不要直接信任用户输入 -2. **限制权限** - 工具只应有必要的权限 -3. **审计日志** - 记录所有工具调用 -4. 
**速率限制** - 防止滥用 - -## 下一步 - -- [知识库配置](knowledge-base.md) - 让助手具备专业知识 -- [工作流编排](workflows.md) - 复杂对话流程 +- [知识库](knowledge-base.md) - 当问题更适合“查资料”时使用知识库 +- [工作流](workflows.md) - 当工具调用需要流程控制和分支逻辑时接入工作流 +- [助手概念](../concepts/assistants.md) - 理解工具在助手能力层中的位置 diff --git a/docs/content/customization/tts.md b/docs/content/customization/tts.md index 3915f29..2b311dc 100644 --- a/docs/content/customization/tts.md +++ b/docs/content/customization/tts.md @@ -1,25 +1,25 @@ -# 语音生成 +# TTS 参数 -语音生成(TTS)负责将助手回复文本转换为可播放音频。 +TTS 参数决定助手语音输出的节奏、音量和听感。本页只讨论参数层面的调优建议。 -## 配置项 +## 常用参数 -| 配置项 | 说明 | -|---|---| -| TTS 引擎 | 选择语音合成服务提供商 | -| 声音/音色 | 选择目标音色或发音人 | -| 模型 | 语音合成模型名称 | -| 语速 | 播放速度,通常 0.5-2.0 | -| 音量/增益 | 输出音量控制 | -| 音调 | 声线高低调整 | +| 参数 | 说明 | 常见范围 | +|------|------|----------| +| **语速** | 说话速度 | `0.5 - 2.0` | +| **音量 / 增益** | 输出音量强弱 | 供应商自定义 | +| **音调** | 声线高低 | 供应商自定义 | +| **模型** | 合成模型名称 | 依供应商而定 | +| **声音 ID** | 发音人或音色标识 | 依供应商而定 | -## 建议 +## 调优建议 -- 对话助手建议保持语速在 `0.9-1.2` -- 生产环境建议固定主音色,降低体验波动 -- 若需要打断能力,优先使用低延迟流式 TTS +- 对话助手通常建议把语速控制在 `0.9 - 1.2` +- 需要打断能力的场景,优先选择低延迟流式 TTS,并避免过长的单次回复 +- 如果业务强调可信度或专业感,先保证清晰度和稳定性,再追求个性化音色 +- 不要只试听一句问候语,至少用三类文案对比:短答复、长答复、数字或专有名词较多的答复 ## 相关文档 -- [语音配置总览](voices.md) - +- [声音资源](voices.md) - 先选择适合的供应商、模型和音色 +- [语音识别](asr.md) - 结合输入侧延迟一起评估整条语音链路 diff --git a/docs/content/customization/voices.md b/docs/content/customization/voices.md index 866b682..a756b49 100644 --- a/docs/content/customization/voices.md +++ b/docs/content/customization/voices.md @@ -1,58 +1,43 @@ -# 语音合成 +# 声音资源 -语音合成(TTS)模块提供自然流畅的语音输出能力。 +本页是资源库中 TTS 声音与发音人资源的正式说明页,聚焦“选择哪种声音给助手输出”。 -## 概述 +## 这页负责什么 -![语音合成](../images/voices.png) +当你已经决定启用语音输出后,需要在这里完成: -## 支持的引擎 +- 选择供应商、模型和声音资源 +- 为不同业务或语言准备不同音色 +- 通过预览和测试确定默认发音人 -| 供应商 | 特点 | 适用场景 | -|--------|------|---------| -| **阿里云** | 多音色、高自然度 | 通用场景 | -| **火山引擎** | 低延迟、实时性好 | 实时对话 | -| **Minimax** | 高性价比 | 批量合成 | +更细的速度、音量、音调等参数建议见 [TTS 参数](tts.md)。 -## 配置方法 +## 选择声音时要考虑什么 -### 添加语音配置 - -1. 进入 **语音库** 页面 -2. 点击 **添加语音** -3. 选择供应商 -4. 
填写 API 凭证 -5. 保存配置 - -### 测试语音 - -- 在线预览发音效果 -- 调整语速和音量 -- 切换不同音色 - -## 音色选择 - -### 中文音色 - -| 音色 | 风格 | +| 维度 | 说明 | |------|------| -| 晓晓 | 标准女声 | -| 晓北 | 知性女声 | -| 逍遥 | 青年男声 | -| 丫丫 | 活泼童声 | +| **语言与口音** | 是否覆盖目标用户语言与地区口音 | +| **风格** | 专业、亲切、活泼、沉稳等输出气质 | +| **延迟** | 是否适合实时对话,而不仅是离线合成 | +| **稳定性** | 长文本、多轮会话中的音色一致性 | +| **成本** | 单次调用成本和高并发可用性 | -### 英文音色 +## 推荐做法 -| 音色 | 风格 | -|------|------| -| Joanna | 专业女声 | -| Matthew | 沉稳男声 | -| Amy | 亲切女声 | +1. 先为每类业务角色确定一条主音色 +2. 再按语言或渠道补充少量备选音色 +3. 通过固定测试文案试听,统一比较自然度、节奏和可懂度 +4. 上线后尽量保持默认音色稳定,避免频繁切换影响用户体验 -## 参数调优 +## 常见资源组织方式 -| 参数 | 范围 | 说明 | -|------|------|------| -| 语速 | 0.5-2.0 | 1.0 为正常速度 | -| 音量 | 0-100 | 输出音量百分比 | -| 音调 | 0.5-2.0 | 语音音调高低 | +| 组织方式 | 适用场景 | +|----------|----------| +| **按语言区分** | 中英文或多语种助手 | +| **按业务角色区分** | 客服、销售、培训、提醒类助手 | +| **按环境区分** | 开发、预发、生产使用不同供应商或凭证 | + +## 下一步 + +- [TTS 参数](tts.md) - 调整语速、增益、音调等输出参数 +- [快速开始](../quickstart/index.md) - 把声音资源绑定到第一个助手 diff --git a/docs/content/customization/workflows.md b/docs/content/customization/workflows.md index d0803a7..5702cbe 100644 --- a/docs/content/customization/workflows.md +++ b/docs/content/customization/workflows.md @@ -1,53 +1,106 @@ -# 工作流管理 +# 工作流 -工作流提供可视化的对话流程编排能力,支持复杂的业务场景。 +工作流用于把复杂业务拆成明确的步骤、分支和回退策略,是 RAS 中承载流程逻辑的正式能力页。 -## 概述 +## 什么时候需要工作流 -![工作流](../images/workflows.png) +当一个助手同时满足以下任一情况时,通常应考虑工作流,而不是继续堆叠单一提示词: -## 节点类型 +- 需要多轮收集信息,例如订单号、手机号、预约时间等 +- 需要按意图或条件走不同分支 +- 需要串联多个工具或业务系统 +- 需要在异常或信息不足时统一回退到澄清、兜底或人工节点 -| 节点 | 图标 | 功能说明 | -|------|------|---------| -| **对话节点** | 💬 | AI 自动回复,可设置回复策略 | -| **工具节点** | 🔧 | 调用外部 API 或自定义工具 | -| **人工节点** | 👤 | 转接人工客服 | -| **结束节点** | 🏁 | 结束对话流程 | +## 工作流与助手的关系 -## 创建工作流 +助手负责对外表现、全局策略和渠道接入;工作流负责把某个业务流程拆成可维护的节点。 -### 步骤 +```mermaid +flowchart LR + Assistant[助手] --> Workflow[工作流] + Workflow --> Nodes[节点与分支] + Nodes --> Tools[工具 / 知识库 / 人工] +``` -1. 进入 **工作流** 页面 -2. 点击 **新建工作流** -3. 从左侧拖拽节点到画布 -4. 连接节点建立流程 -5. 配置各节点参数 -6. 
保存并发布 +这意味着: -### 节点配置 +- 助手定义角色、提示词基线、模型和输出方式 +- 工作流定义“这类问题该按什么顺序被处理” +- 工具和知识库作为节点可调用的能力,被有选择地暴露给流程 -#### 对话节点配置 +## 关键组成 -- 回复模板 -- 条件分支 -- 知识库检索 +| 组成 | 作用 | 设计建议 | +|------|------|----------| +| **工作流名称** | 区分业务流程 | 用业务语义命名,避免过于技术化 | +| **入口节点** | 用户进入后的第一步 | 保持单入口,便于理解和测试 | +| **全局提示词** | 对所有节点生效的共性约束 | 保持简短,避免与节点提示词冲突 | +| **节点提示词** | 当前节点的任务说明 | 单一职责,明确输入 / 输出 | +| **节点工具白名单** | 控制当前节点可调用的工具集合 | 遵循最小权限原则 | +| **超时与回退** | 异常、超时、缺信息时的处理方式 | 优先回到澄清、兜底或人工节点 | +| **上下文透传** | 在节点之间共享状态 | 只传递后续节点真正需要的信息 | -#### 工具节点配置 +## 常见节点类型 -- 选择工具类型 -- 配置输入参数 -- 设置输出处理 +| 节点类型 | 适合做什么 | +|----------|------------| +| **路由节点** | 判断用户意图并进入不同分支 | +| **信息收集节点** | 收集订单号、联系方式、时间等关键信息 | +| **处理节点** | 调用工具、执行查询、计算或写入系统 | +| **回复节点** | 组织最终答复并控制输出风格 | +| **人工节点** | 转接人工、排队或发起通知 | +| **结束节点** | 输出结束语并关闭流程 | -#### 人工节点配置 +## 推荐编排步骤 -- 转接规则 -- 排队策略 -- 通知设置 +1. 先写清楚流程目标:这条工作流要解决哪一类业务问题 +2. 画出最小节点图:入口、关键分支、结束和兜底 +3. 为每个节点定义唯一职责和输入 / 输出 +4. 再绑定知识库、工具和回退策略 +5. 在测试面板或流程调试工具中验证每条主路径和异常路径 -## 流程测试 +## 配置示例 -- 支持单步调试 -- 可查看执行日志 -- 实时验证流程逻辑 +```yaml +workflow: + name: "订单咨询流程" + entry: "intent_router" + global_prompt: "优先给出可执行步骤,必要时先澄清信息。" + nodes: + - id: "intent_router" + type: "router" + prompt: "识别用户意图:查订单、退款、投诉" + next: + - when: "intent == query_order" + to: "collect_order_id" + - when: "intent == refund" + to: "refund_policy" + - id: "collect_order_id" + type: "collect" + prompt: "请用户提供订单号" + tools: ["query_order"] + fallback: "human_handoff" + - id: "human_handoff" + type: "end" + prompt: "转人工处理" +``` + +## 设计建议 + +- **让每个节点只做一件事**:避免单节点同时负责路由、收集信息和最终回复 +- **工具按节点授权**:不要把所有工具暴露给整条流程中的每个节点 +- **把失败路径设计出来**:超时、无结果、参数缺失都应该有明确回退 +- **优先传状态,不传长文本**:节点之间共享必要结构化信息,比传递大段自然语言更稳 +- **为流程保留可观测性**:每条主路径都应能在调试时解释“为什么走到这里” + +## 当前边界 + +- 文档不会完整覆盖所有表达式或节点字段的最终 Schema +- 不同执行引擎下,可用节点字段和运行行为可能存在差异 +- 可视化编排与底层字段映射可能不会一一对应 + +## 相关文档 + +- [助手概念](../concepts/assistants.md) - 工作流在助手体系中的位置 +- [工具](tools.md) - 设计可被流程安全调用的工具 +- [知识库](knowledge-base.md) - 让流程中的节点使用 RAG 能力 diff --git 
a/docs/content/getting-started/configuration.md b/docs/content/getting-started/configuration.md index 14c2e84..beeac47 100644 --- a/docs/content/getting-started/configuration.md +++ b/docs/content/getting-started/configuration.md @@ -1,4 +1,4 @@ -# 配置说明 +# 配置说明 本页面介绍 Realtime Agent Studio 各组件的配置方法。 @@ -274,5 +274,6 @@ python -c "from config import settings; print(settings)" ## 下一步 -- [安装部署](index.md) - 开始安装服务 +- [环境与部署](index.md) - 开始安装服务 - [Docker 部署](../deployment/docker.md) - 容器化部署 + diff --git a/docs/content/getting-started/index.md b/docs/content/getting-started/index.md index 6363418..1b1064b 100644 --- a/docs/content/getting-started/index.md +++ b/docs/content/getting-started/index.md @@ -1,12 +1,12 @@ -# 安装部署 +# 环境与部署 -本章节介绍如何安装和配置 Realtime Agent Studio (RAS) 开发环境。 +本页属于“快速开始”中的环境与部署路径,只负责把服务跑起来、说明配置入口和部署方式。首次创建助手请转到 [创建第一个助手](../quickstart/index.md)。 --- -## 系统组件 +## 先理解部署对象 -RAS 由三个核心服务组成: +Realtime Agent Studio(RAS)通常由三个核心服务组成: ```mermaid flowchart LR @@ -26,47 +26,32 @@ flowchart LR Engine <--> API ``` -| 组件 | 端口 | 说明 | -|------|------|------| -| **Web 前端** | 3000 | React + TypeScript 管理控制台 | -| **API 服务** | 8080 | Python FastAPI 后端 | -| **Engine 服务** | 8000 | 实时对话引擎(WebSocket) | +| 组件 | 默认端口 | 负责什么 | +|------|----------|----------| +| **Web 前端** | 3000 | 管理控制台与调试界面 | +| **API 服务** | 8080 | 资源管理、配置持久化、历史数据 | +| **Engine 服务** | 8000 | 实时会话、事件流和音频流 | ---- +## 选择你的安装方式 -## 快速安装 +### 方式一:Docker Compose -### 方式一:Docker Compose(推荐) - -最快捷的启动方式,适合快速体验和生产部署。 +适合希望尽快跑通一套完整环境的团队。 ```bash -# 1. 克隆项目 +# 仓库目录示例沿用当前代码仓库 slug +# 你本地实际目录名可以不同 git clone https://github.com/your-org/AI-VideoAssistant.git cd AI-VideoAssistant -# 2. 启动服务 docker-compose up -d - -# 3. 访问控制台 -open http://localhost:3000 ``` -!!! tip "首次启动" - 首次启动需要构建镜像,可能需要几分钟时间。 - ### 方式二:本地开发 -适合需要修改代码的开发者。 +适合需要分别调试前端、API 和 Engine 的开发者。 -#### 1. 克隆项目 - -```bash -git clone https://github.com/your-org/AI-VideoAssistant.git -cd AI-VideoAssistant -``` - -#### 2. 
启动 API 服务 +#### 启动 API 服务 ```bash cd api @@ -76,7 +61,7 @@ pip install -r requirements.txt uvicorn main:app --host 0.0.0.0 --port 8080 --reload ``` -#### 3. 启动 Engine 服务 +#### 启动 Engine 服务 ```bash cd engine @@ -86,7 +71,7 @@ pip install -r requirements.txt python main.py ``` -#### 4. 启动 Web 前端 +#### 启动 Web 前端 ```bash cd web @@ -94,97 +79,37 @@ npm install npm run dev ``` -访问 `http://localhost:3000` +## 基础验证 ---- +完成安装后,至少确认以下入口可访问: -## 验证安装 +| 服务 | 地址 | 用途 | +|------|------|------| +| Web | `http://localhost:3000` | 打开控制台 | +| API | `http://localhost:8080/docs` | 查看管理接口 | +| Engine | `http://localhost:8000/health` | 检查实时引擎健康状态 | -### 检查服务状态 +如果你需要更完整的环境变量、配置文件和部署说明,请继续阅读本章节其他页面: -| 服务 | URL | 预期结果 | -|------|-----|---------| -| Web | http://localhost:3000 | 看到登录/控制台页面 | -| API | http://localhost:8080/docs | 看到 Swagger 文档 | -| Engine | http://localhost:8000/health | 返回 `{"status": "ok"}` | +- [环境要求](requirements.md) +- [配置说明](configuration.md) +- [部署概览](../deployment/index.md) +- [Docker 部署](../deployment/docker.md) -### 测试 WebSocket 连接 +## 目录结构(阅读导向) -```javascript -const ws = new WebSocket('ws://localhost:8000/ws?assistant_id=test'); -ws.onopen = () => console.log('Connected!'); -ws.onerror = (e) => console.error('Error:', e); +```text +repo/ +├── web/ # 管理控制台 +├── api/ # 控制面与管理接口 +├── engine/ # 实时交互引擎 +├── docker/ # 部署编排与镜像配置 +└── docs/ # 当前文档站点 ``` ---- +## 遇到问题时去哪里 -## 目录结构 +- 需要“快速判断往哪看”:先看 [常见问题](../resources/faq.md) +- 需要“按步骤排查”:直接看 [故障排查](../resources/troubleshooting.md) +- 已经跑通环境,准备创建助手:回到 [快速开始](../quickstart/index.md) -``` -AI-VideoAssistant/ -├── web/ # React 前端 -│ ├── src/ -│ │ ├── components/ # UI 组件 -│ │ ├── pages/ # 页面 -│ │ ├── stores/ # Zustand 状态 -│ │ └── api/ # API 客户端 -│ └── package.json -├── api/ # FastAPI 后端 -│ ├── app/ -│ │ ├── routers/ # API 路由 -│ │ ├── models/ # 数据模型 -│ │ └── services/ # 业务逻辑 -│ └── requirements.txt -├── engine/ # 实时交互引擎 -│ ├── app/ -│ │ ├── pipeline/ # 管线引擎 -│ │ └── multimodal/ # 多模态引擎 -│ └── requirements.txt -├── docker/ 
# Docker 配置 -│ └── docker-compose.yml -└── docs/ # 文档 -``` - ---- - -## 常见问题 - -### 端口被占用 - -```bash -# 查看端口占用 -# Linux/Mac -lsof -i :3000 - -# Windows -netstat -ano | findstr :3000 -``` - -修改对应服务的端口配置后重启。 - -### Docker 构建失败 - -```bash -# 清理 Docker 缓存 -docker system prune -a - -# 重新构建 -docker-compose build --no-cache -``` - -### Python 依赖安装失败 - -确保使用 Python 3.10+: - -```bash -python --version # 需要 3.10+ -``` - ---- - -## 下一步 - -- [环境要求](requirements.md) - 详细的软件版本要求 -- [配置说明](configuration.md) - 环境变量配置指南 -- [快速开始](../quickstart/index.md) - 创建第一个助手 -- [Docker 部署](../deployment/docker.md) - 镜像构建与编排 diff --git a/docs/content/getting-started/requirements.md b/docs/content/getting-started/requirements.md index e925de6..36d3b75 100644 --- a/docs/content/getting-started/requirements.md +++ b/docs/content/getting-started/requirements.md @@ -1,4 +1,4 @@ -# 环境要求 +# 环境要求 本页面列出运行 Realtime Agent Studio 所需的软件和硬件要求。 @@ -145,5 +145,6 @@ wsl --install -d Ubuntu ## 下一步 - [配置说明](configuration.md) - 环境变量配置 -- [安装部署](index.md) - 开始安装 +- [环境与部署](index.md) - 开始安装 - [Docker 部署](../deployment/docker.md) - 容器化部署 + diff --git a/docs/content/index.md b/docs/content/index.md index b5f6ccc..d1f033a 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -1,9 +1,9 @@ -

+

Realtime Agent Studio

- 构建实时交互音视频智能体的开源工作平台 + 通过管理控制台与 API 构建、部署和运营实时多模态助手

@@ -14,66 +14,65 @@

+ 产品概览 · 快速开始 · - API 文档 · - 安装部署 · - 路线图 + 构建助手 · + 核心概念 · + API 参考

--- -## 什么是 Realtime Agent Studio? +Realtime Agent Studio (RAS) 是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台。 -Realtime Agent Studio (RAS) 是一款以大语言模型为核心,构建实时交互音视频智能体的工作平台。支持管线式的全双工交互引擎和原生多模态模型两种架构,覆盖实时交互智能体的配置、测试、发布、监控全流程。 +## 适合谁 -可以将 RAS 看作 [Vapi](https://vapi.ai)、[Retell](https://retellai.com)、[ElevenLabs Agents](https://elevenlabs.io) 的**开源替代方案**。 +- 需要把实时语音或视频助手接入产品、设备或内部系统的开发团队 +- 需要通过控制台快速配置提示词、模型、知识库、工具和工作流的运营团队 +- 需要私有化部署、模型可替换、链路可观测的企业用户 ---- - -## 核心特性 +## 核心能力
-- :zap: **低延迟实时引擎** +- :material-robot-outline: **助手构建** --- - 管线式全双工架构,VAD/ASR/TD/LLM/TTS 流水线处理,支持智能打断,端到端延迟 < 500ms + 用统一的助手对象管理提示词、模型、知识库、工具、开场白和会话策略。 -- :brain: **多模态模型支持** +- :material-pulse: **双引擎运行时** --- - 支持 GPT-4o Realtime、Gemini Live、Step Audio 等原生多模态模型直连 + 同时支持 Pipeline 引擎与 Realtime 引擎,可按延迟、成本和可控性选择运行方式。 -- :wrench: **可视化配置** +- :material-source-branch: **能力扩展** --- - 无代码配置助手、提示词、工具调用、知识库关联,所见即所得 + 通过资源库、知识库、工具与工作流扩展助手能力,而不是把全部逻辑塞进单一提示词。 -- :electric_plug: **开放 API** +- :material-api: **开放集成** --- - 标准 WebSocket 协议,RESTful 管理接口,支持 Webhook 回调 + 使用 REST API 管理资源,使用 WebSocket API 接入实时对话,面向 Web、移动端和第三方系统。 -- :shield: **私有化部署** +- :material-shield-lock-outline: **私有化部署** --- - Docker 一键部署,数据完全自主可控,支持本地模型 + 支持 Docker 部署、自有模型服务和企业内网运行,便于满足合规与成本要求。 -- :chart_with_upwards_trend: **全链路监控** +- :material-chart-line: **可观测与评估** --- - 完整会话回放,实时仪表盘,自动化测试与效果评估 + 提供会话历史、实时指标、自动化测试和效果评估,帮助持续改进助手质量。
---- - ## 系统架构 平台架构层级: @@ -81,243 +80,107 @@ Realtime Agent Studio (RAS) 是一款以大语言模型为核心,构建实时 ```mermaid flowchart TB -%% ================= ACCESS ================= -subgraph Access["Access Layer"] -direction TB -API[API] -SDK[SDK] -Browser[Browser UI] -Embed[Web Embed] -end + subgraph Access["Access Layer"] + API["API"] + SDK["SDK"] + Browser["Browser UI"] + Embed["Web Embed"] + end + subgraph Runtime["Realtime Interaction Engine"] + direction LR -%% ================= REALTIME ENGINE ================= -subgraph Runtime["Realtime Interaction Engine"] + subgraph Duplex["Duplex Interaction Engine"] + direction LR -direction LR + subgraph Pipeline["Pipeline Engine"] + direction LR + VAD["VAD"] + ASR["ASR"] + TD["Turn Detection"] + LLM["LLM"] + TTS["TTS"] + end -%% -------- Duplex Engine -------- -subgraph Duplex["Duplex Interaction Engine"] -direction LR + subgraph Multi["Realtime Engine"] + MM["Realtime Model"] + end + end -subgraph Pipeline["Pipeline Engine"] -direction LR -VAD[VAD] -ASR[ASR] -TD[Turn Detection] -LLM[LLM] -TTS[TTS] -end + subgraph Capability["Agent Capabilities"] + subgraph Tools["Tool System"] + Webhook["Webhook"] + ClientTool["Client Tools"] + Builtin["Builtin Tools"] + end -subgraph Multi["Realtime Engine"] -MM[Realtime Model] -end + subgraph KB["Knowledge System"] + Docs["Documents"] + Vector[("Vector Index")] + Retrieval["Retrieval"] + end + end + end -end + subgraph Platform["Platform Services"] + direction TB + Backend["Backend Service"] + Frontend["Frontend Console"] + DB[("Database")] + end - -%% -------- Capabilities -------- -subgraph Capability["Agent Capabilities"] - -subgraph Tools["Tool System"] -Webhook[Webhook] -ClientTool[Client Tools] -Builtin[Builtin Tools] -end - -subgraph KB["Knowledge System"] -Docs[Documents] -Vector[(Vector Index)] -Retrieval[Retrieval] -end - -end - -end - - -%% ================= PLATFORM ================= -subgraph Platform["Platform Services"] -direction TB -Backend[Backend Service] -Frontend[Frontend 
Console] -DB[(Database)] -end - - -%% ================= CONNECTIONS ================= - -Access --> Runtime - -Runtime <--> Backend -Backend <--> DB -Backend <--> Frontend - -LLM --> Tools -MM --> Tools - -LLM <--> KB -MM <--> KB + Access --> Runtime + Runtime <--> Backend + Backend <--> DB + Backend <--> Frontend + LLM --> Tools + MM --> Tools + LLM <--> KB + MM <--> KB ``` -管线式引擎交互引擎对话流程图: - -```mermaid -flowchart LR - -User((User Speech)) -Audio[Audio Stream] - -VAD[VAD\nVoice Activity Detection] -ASR[ASR\nSpeech Recognition] - -TD[Turn Detection] - -LLM[LLM\nReasoning] - -Tools[Tools / APIs] - -TTS[TTS\nSpeech Synthesis] - -AudioOut[Audio Stream Out] - -User --> Audio -Audio --> VAD -VAD --> ASR -ASR --> TD -TD --> LLM - -LLM --> Tools -Tools --> LLM - -LLM --> TTS -TTS --> AudioOut -AudioOut --> User -``` - -基于实时交互模型的对话流程图: - -```mermaid -flowchart LR - -User((User)) - -Input[Audio / Video / Text] - -MM[Multimodal Model] - -Tools[Tools / APIs] -KB[Knowledge Base] - -Output[Audio / Video / Text] - -User --> Input -Input --> MM - -MM --> Tools -Tools --> MM - -MM --> KB -KB --> MM - -MM --> Output -Output --> User -``` - ---- - -## 技术栈 - -| 层级 | 技术 | -|------|------| -| **前端** | React 18, TypeScript, Tailwind CSS, Zustand | -| **后端** | FastAPI (Python 3.10+) | -| **引擎** | Python, WebSocket, asyncio | -| **数据库** | SQLite | -| **知识库** | chroma | -| **部署** | Docker | - ---- - -## 快速导航 +## 从这里开始
-- :rocket: **[快速开始](quickstart/index.md)** +- :material-compass-outline: **[了解产品](overview/index.md)** --- - 5 分钟创建你的第一个 AI 助手 + 先看产品定位、核心模块、适用场景,以及 RAS 与其他方案的差异。 -- :book: **[核心概念](concepts/index.md)** +- :material-cog-outline: **[环境与部署](getting-started/index.md)** --- - 了解助手、管线、多模态等核心概念 + 先把服务跑起来,了解环境要求、配置入口和部署方式。 -- :wrench: **[安装部署](getting-started/index.md)** +- :material-rocket-launch-outline: **[创建第一个助手](quickstart/index.md)** --- - 环境准备、本地开发与 Docker/生产部署 + 按最短路径准备资源、创建助手、测试效果并拿到接入所需信息。 -- :robot: **[助手管理](assistants/index.md)** +- :material-tune: **[构建助手](concepts/assistants.md)** --- - 创建和配置智能对话助手 + 按完整链路配置助手、提示词、模型、知识库、工具与工作流。 -- :gear: **[功能定制](customization/knowledge-base.md)** +- :material-connection: **[接入应用](api-reference/index.md)** --- - 知识库、工具、语音、工作流 + 查看 REST 与 WebSocket 接口,把助手嵌入到你的 Web、移动端或服务端系统。 -- :bar_chart: **[数据分析](analysis/dashboard.md)** +- :material-lifebuoy: **[排查问题](resources/troubleshooting.md)** --- - 仪表盘、历史记录、测试评估 - -- :electric_plug: **[API 参考](api-reference/index.md)** - - --- - - WebSocket 协议与 REST 接口文档 + 当连接、对话质量或部署链路出现问题时,从这里进入可执行的排查步骤。
---- -## 快速体验 -### 使用 Docker 启动 -```bash -git clone https://github.com/your-org/AI-VideoAssistant.git -cd docker -docker-compose up -d -# for development -# docker compose --profile dev up -d -``` - -访问 `http://localhost:3000` 即可使用控制台。 - -### WebSocket 连接示例 - -```javascript -const ws = new WebSocket('ws://localhost:8000/ws?assistant_id=YOUR_ID'); - -ws.onopen = () => { - ws.send(JSON.stringify({ - type: 'session.start', - audio: { encoding: 'pcm_s16le', sample_rate_hz: 16000, channels: 1 } - })); -}; -``` - ---- - -## 许可证 - -本项目基于 [MIT 许可证](https://github.com/your-org/AI-VideoAssistant/blob/main/LICENSE) 开源。 diff --git a/docs/content/overview/architecture.md b/docs/content/overview/architecture.md index bfa830b..8262644 100644 --- a/docs/content/overview/architecture.md +++ b/docs/content/overview/architecture.md @@ -1,6 +1,6 @@ -# 系统架构 +# 系统架构 -本文档详细介绍 Realtime Agent Studio (RAS) 的系统架构设计。 +本文档只解释 Realtime Agent Studio (RAS) 的服务边界、数据流、部署形态和关键技术选型,不重复产品定位或上手流程。 --- @@ -61,12 +61,12 @@ flowchart TB ### 1. Web 前端 (React) -管理控制台,提供可视化的配置和监控界面。 +管理控制台,提供可视化的配置、测试和监控界面。 | 功能模块 | 说明 | |---------|------| | 助手管理 | 创建、配置、测试智能助手 | -| 资源库 | LLM/ASR/TTS/VAD 等模型管理 | +| 资源库 | LLM / ASR / TTS 等模型管理 | | 知识库 | RAG 文档上传与管理 | | 历史记录 | 会话日志查询与回放 | | 仪表盘 | 实时数据统计 | @@ -74,7 +74,7 @@ flowchart TB ### 2. API 服务 (FastAPI) -RESTful API 后端,处理所有管理操作。 +REST API 后端,处理资源管理、持久化配置和历史数据等控制面能力。 ```mermaid flowchart LR @@ -100,7 +100,7 @@ flowchart LR ### 3. 
实时交互引擎 (Engine) -核心组件,处理实时音视频对话。 +处理实时音视频对话、事件流转、模型调用与工具执行。 ```mermaid flowchart TB @@ -116,7 +116,7 @@ flowchart TB TTS[语音合成 TTS] end - subgraph Realtime["实时交互引擎连接"] + subgraph Realtime["实时引擎连接"] RTOpenAI[OpenAI Realtime] RTGemini[Gemini Live] RTDoubao[Doubao 实时交互] @@ -144,9 +144,9 @@ flowchart TB | 类别 | 说明 | 可选项 | |------|------|--------| -| **外部服务** | 管线式引擎各环节所依赖的云/本地服务 | OpenAI、SiliconFlow、DashScope、本地模型 | -| **实时交互引擎** | 实时交互引擎可连接的后端 | OpenAI Realtime、Gemini Live、Doubao 实时交互引擎 | -| **工具** | 管线式 LLM 与实时交互引擎均可调用 | Webhook、客户端工具、内建工具 | +| **外部模型服务** | Pipeline 引擎各环节依赖的云端或本地服务 | OpenAI、SiliconFlow、DashScope、本地模型 | +| **实时模型连接** | Realtime 引擎可直接连接的后端 | OpenAI Realtime、Gemini Live、Doubao 实时交互 | +| **工具系统** | 由助手或引擎调用的外部执行能力 | Webhook、客户端工具、内建工具 | --- @@ -154,7 +154,7 @@ flowchart TB ### 管线式全双工引擎 -管线式引擎包含:**声音活动检测(VAD)**、**语音识别(ASR)**、**回合检测(TD)**、**大语言模型(LLM)**、**语音合成(TTS)**。外部服务可选用 **OpenAI**、**SiliconFlow**、**DashScope**、**本地模型**。LLM 可连接**工具**(Webhook、客户端工具、内建工具)。 +管线式引擎由 **VAD → ASR → TD → LLM → TTS** 组成。每个环节可替换,适合需要精细控制、工具扩展和较高可解释性的场景。 ```mermaid sequenceDiagram @@ -170,33 +170,28 @@ sequenceDiagram C->>E: 音频流 (PCM) E->>VAD: 检测语音活动 VAD-->>E: 有效语音段 - E->>ASR: 语音转文字 + E->>ASR: 语音转写 ASR-->>E: 转写文本 - E->>TD: 回合边界 - TD-->>E: 可送 LLM 的输入 + E->>TD: 判断回合边界 + TD-->>E: 可送入 LLM 的输入 E->>LLM: 生成回复 LLM->>Tools: 可选:调用工具 Tools-->>LLM: 工具结果 LLM-->>E: 回复文本 (流式) - E->>TTS: 文字转语音 + E->>TTS: 文本转语音 TTS-->>E: 音频流 E->>C: 播放音频 ``` **特点:** -- 灵活选择各环节供应商(OpenAI、SiliconFlow、DashScope、本地模型) -- 可独立优化 VAD、ASR、TD、LLM、TTS 每个环节 -- LLM 与工具联动(Webhook、客户端工具、内建工具) -- 延迟约 500-1500ms +- 各环节可单独替换和优化 +- 便于接入知识库、工具、工作流等能力 +- 延迟通常高于端到端实时模型,但可控性更强 -### 实时交互引擎 +### Realtime 引擎 -实时交互引擎可连接**实时交互引擎**,包括 **OpenAI Realtime**、**Gemini Live**、**Doubao 实时交互引擎**等,同样可连接**工具**(Webhook、客户端工具、内建工具)。 - -### 原生多模态引擎 - -使用端到端多模态模型(如 GPT-4o Realtime): +Realtime 引擎直接连接端到端实时模型,适合追求更低延迟和更自然多模态交互的场景。 ```mermaid sequenceDiagram @@ -204,17 +199,17 @@ sequenceDiagram participant E as 引擎 participant RT as Realtime Model - C->>E: 音频流 
- E->>RT: 音频输入 - RT-->>E: 音频输出 (流式) - E->>C: 播放音频 + C->>E: 音频/视频/文本输入 + E->>RT: 实时流输入 + RT-->>E: 流式文本/音频输出 + E->>C: 播放或渲染结果 ``` **特点:** -- 更低延迟 (< 300ms) -- 更自然的语音交互 -- 依赖特定模型供应商 +- 交互链路更短,延迟更低 +- 更依赖具体模型供应商的能力边界 +- 适合强调自然对话和多模态体验的入口 --- @@ -234,11 +229,11 @@ sequenceDiagram API->>DB: 查询助手 DB-->>API: 助手数据 API-->>E: 配置信息 - + C->>E: session.start E-->>C: session.started E-->>C: config.resolved - + loop 对话循环 C->>E: 音频帧 (binary) E-->>C: input.speech_started @@ -249,7 +244,7 @@ sequenceDiagram E-->>C: 音频帧 (binary) E-->>C: output.audio.end end - + C->>E: session.stop E->>API: 保存会话记录 API->>DB: 存储 @@ -266,19 +261,19 @@ sequenceDiagram Note over E: 正在播放 TTS 音频 E->>C: 音频帧... - + C->>E: 用户说话 (VAD 检测) E->>E: 触发打断 E->>TTS: 停止合成 E-->>C: output.audio.interrupted - + Note over E: 处理新的用户输入 E-->>C: input.speech_started ``` --- -## 部署架构 +## 部署形态 ### 开发环境 @@ -299,56 +294,19 @@ flowchart LR ## 技术选型 -| 组件 | 技术 | 选型理由 | -|------|------|---------| -| **前端框架** | React 18 | 成熟生态,组件化开发 | -| **状态管理** | Zustand | 轻量级,TypeScript 友好 | -| **UI 组件** | Tailwind CSS | 原子化 CSS,快速开发 | -| **后端框架** | FastAPI | 高性能,自动 API 文档 | -| **WebSocket** | websockets | Python 异步 WebSocket | -| **ORM** | SQLAlchemy | 功能完善,支持多数据库 | -| **数据库** | SQLite/PostgreSQL | 开发简单/生产可靠 | - ---- - -## 扩展性设计 - -### 模型适配器模式 - -```mermaid -classDiagram - class ModelAdapter { - <> - +generate(prompt) string - +stream(prompt) AsyncIterator - } - - class OpenAIAdapter { - +generate(prompt) string - +stream(prompt) AsyncIterator - } - - class AzureAdapter { - +generate(prompt) string - +stream(prompt) AsyncIterator - } - - class LocalAdapter { - +generate(prompt) string - +stream(prompt) AsyncIterator - } - - ModelAdapter <|-- OpenAIAdapter - ModelAdapter <|-- AzureAdapter - ModelAdapter <|-- LocalAdapter -``` - -通过适配器模式,可以轻松接入新的模型供应商。 +| 组件 | 技术 | 说明 | +|------|------|------| +| **前端框架** | React 18 | 管理控制台与调试界面 | +| **状态管理** | Zustand | 前端轻量状态管理 | +| **UI 样式** | Tailwind CSS | 快速构建控制台界面 | +| **后端框架** | FastAPI | 管理接口与配置持久化 | +| 
**WebSocket** | websockets | 实时事件与音频流通信 | +| **数据库** | SQLite / PostgreSQL | 配置与历史数据存储 | --- ## 相关文档 -- [WebSocket 协议](../api-reference/websocket.md) - 详细的协议规范 -- [部署概览](../deployment/index.md) - Docker 部署 -- [核心概念](../concepts/index.md) - 助手、管线等概念说明 +- [产品概览](index.md) - 产品定位、核心模块与适用场景 +- [引擎架构](../concepts/engines.md) - Pipeline 与 Realtime 的选择指南 +- [WebSocket 协议](../api-reference/websocket.md) - 实时对话事件和消息格式 diff --git a/docs/content/overview/index.md b/docs/content/overview/index.md index b9d0398..3f18efd 100644 --- a/docs/content/overview/index.md +++ b/docs/content/overview/index.md @@ -1,148 +1,84 @@ -# 产品概览 +# 产品概览 -了解 Realtime Agent Studio 的核心功能和设计理念。 +Realtime Agent Studio (RAS) 是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台。 --- -## 什么是 RAS? +## 产品定位 -Realtime Agent Studio (RAS) 是一个**开源的实时交互智能体工作平台**,让开发者能够快速构建和部署具备语音对话能力的 AI 助手。 +RAS 面向需要构建实时语音或视频助手的团队,目标不是替代你的业务系统,而是提供一套可组合的助手基础设施: -### 核心价值 +- **控制台**:让团队快速配置助手、资源库、知识库、工具、工作流与评估策略 +- **API 与实时运行时**:让应用、设备和第三方系统稳定接入实时对话能力 +- **运维与分析能力**:让团队能观察会话效果、排查问题并持续迭代助手质量 -| 价值主张 | 说明 | -|---------|------| -| **低代码配置** | 可视化界面配置助手,无需编写复杂代码 | -| **实时交互** | 毫秒级响应,支持语音打断,自然对话体验 | -| **开放灵活** | 支持多种模型供应商,自由选择最适合的方案 | -| **私有部署** | 完全自主可控,数据不出域 | +如果你把实时助手看作一条完整的产品链路,RAS 负责其中的“构建、接入、运行、观测”四个阶段。 ---- +## 核心模块 -## 功能模块 +| 模块 | 负责什么 | 适合谁使用 | +|------|----------|------------| +| **助手** | 定义角色、行为、模型、知识、工具和会话策略 | 产品、运营、算法、开发 | +| **引擎** | 承载实时语音/多模态对话,输出事件流和音频流 | 开发、基础设施 | +| **资源库** | 管理 LLM、ASR、TTS 等外部能力接入 | 平台管理员、开发 | +| **知识库 / 工具 / 工作流** | 让助手获得领域知识、外部执行能力和复杂流程控制 | 业务设计者、开发 | +| **分析与评估** | 记录会话、监控指标、做自动化回归和效果评估 | 运营、QA、开发 | -```mermaid -mindmap - root((RAS)) - 助手管理 - 创建配置 - 提示词编辑 - 模型选择 - 工具调用 - 资源库 - LLM 模型 - ASR 模型 - TTS 声音 - 知识库 - 文档上传 - 向量检索 - RAG 问答 - 监控分析 - 会话回放 - 数据统计 - 自动测试 - 部署集成 - WebSocket API - REST API - SDK -``` +## 为什么是“控制台 + API” -### 助手管理 +RAS 采用“控制台配置 + API 接入”的组合方式,而不是把所有内容都固化在代码里: -创建和配置智能对话助手: +- **控制台负责提效**:让非后端角色也能参与提示词、工具、知识、流程的配置与调优 +- **API 负责集成**:让产品团队继续用自己的前端、服务端或设备侧应用承载最终体验 +- 
**同一套助手配置可复用**:控制台保存的助手定义可以被不同渠道重复接入和评估 -- **系统提示词** - 定义助手角色和行为 -- **模型配置** - 选择 LLM、ASR、TTS 模型 -- **工具调用** - 配置 Webhook 和客户端工具 -- **开场白** - 设置首轮对话模式 - -### 资源库 - -集中管理各类模型资源: - -- **语音识别 (ASR)** - 多供应商 ASR 模型管理 -- **大语言模型 (LLM)** - OpenAI、Azure、本地模型 -- **语音合成 (TTS)** - 多音色声音资源 - -### 知识库 - -为助手提供专业知识: - -- **文档上传** - 支持 PDF、Word、Markdown 等格式 -- **向量化索引** - 自动分块和向量化 -- **RAG 检索** - 基于语义的知识检索 - -### 监控分析 - -全面的数据分析能力: - -- **会话回放** - 完整链路日志和音频回放 -- **实时仪表盘** - 并发数、延迟、错误率统计 -- **自动化测试** - 批量测试和效果评估 - ---- - -## 对比其他方案 - -| 特性 | RAS | Vapi | Retell | ElevenLabs | -|------|-----|------|--------|------------| -| **开源** | :white_check_mark: | :x: | :x: | :x: | -| **私有部署** | :white_check_mark: | :x: | :x: | :x: | -| **管线式引擎** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | -| **多模态模型** | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | -| **自定义 ASR/TTS** | :white_check_mark: | 有限 | 有限 | :x: | -| **知识库** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | -| **工作流编辑** | 开发中 | :white_check_mark: | :x: | :x: | -| **定价** | 免费 | 按量付费 | 按量付费 | 按量付费 | - ---- - -## 适用场景 +## 典型使用方式
-- :telephone_receiver: **智能客服** +- :material-headset: **客户服务与运营自动化** --- - 7x24 小时自动接听,处理常见咨询,复杂问题转人工 + 在客服、外呼、预约、售后等场景中接入实时语音助手,并保留人工接管与工具调用能力。 -- :hospital: **医疗问诊** +- :material-school-outline: **培训、陪练与问答** --- - 预问诊信息收集,健康咨询,用药提醒 + 用知识库、提示词和流程编排构建可持续优化的教学、培训或辅导助手。 -- :school: **教育培训** +- :material-domain: **企业内部助手** --- - 口语练习,知识问答,个性化辅导 + 通过私有部署、内部知识库和业务系统工具,把助手接入内部流程或设备终端。 -- :handshake: **销售助手** +- :material-devices: **多端集成** --- - 产品介绍,需求挖掘,预约安排 - -- :headphones: **语音助手** - - --- - - 智能家居控制,日程管理,信息查询 - -- :robot: **虚拟人** - - --- - - 数字人直播,虚拟主播,交互式展示 + 通过 WebSocket API 将同一个助手接入 Web、移动端、坐席工作台或自有硬件设备。
---- +## 与其他方案的差异 -## 下一步 +本页是站内唯一保留“产品对比”视角的地方,用于帮助你快速判断 RAS 的定位边界。 -- [快速开始](../quickstart/index.md) - 5 分钟创建第一个助手 -- [系统架构](architecture.md) - 深入了解技术实现 -- [核心概念](../concepts/index.md) - 学习关键概念 +| 特性 | RAS | Vapi | Retell | ElevenLabs Agents | +|------|-----|------|--------|-------------------| +| **开源** | :white_check_mark: | :x: | :x: | :x: | +| **私有部署** | :white_check_mark: | :x: | :x: | :x: | +| **Pipeline 引擎** | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x: | +| **Realtime / 多模态引擎** | :white_check_mark: | :white_check_mark: | :x: | :white_check_mark: | +| **自定义 ASR / TTS** | :white_check_mark: | 有限 | 有限 | :x: | +| **知识库与工具扩展** | :white_check_mark: | :white_check_mark: | :white_check_mark: | 有限 | +| **工作流编排** | 开发中 | :white_check_mark: | :x: | :x: | +| **数据与链路可观测** | :white_check_mark: | 有限 | 有限 | 有限 | + +## 继续阅读 + +- [系统架构](architecture.md) - 从服务边界、数据流和部署形态理解系统如何组成 +- [核心概念](../concepts/index.md) - 先建立助手、引擎与工作流的心智模型 +- [快速开始](../quickstart/index.md) - 以最短路径创建第一个助手 diff --git a/docs/content/quickstart/dashboard.md b/docs/content/quickstart/dashboard.md index 57643c0..9c1e049 100644 --- a/docs/content/quickstart/dashboard.md +++ b/docs/content/quickstart/dashboard.md @@ -1,233 +1,44 @@ -# 资源库配置详解 +# 资源准备清单 -本页面详细介绍资源库中各类资源的配置方法和最佳实践。 +本页保留原“资源库配置详解”链接,但在本轮文档收敛后,它只承担快速开始阶段的资源核对职责。 -## 语音识别 (ASR) 配置 +## 你至少要准备什么 -### 支持的接口类型 +在创建第一个助手前,至少确认以下三类资源都已经可用: -| 接口类型 | 说明 | -|---------|------| -| OpenAI Compatible | 兼容 OpenAI 语音识别 API 格式的服务 | +| 资源 | 为什么需要 | 正式说明页 | +|------|------------|------------| +| **LLM 模型** | 负责理解与生成回复 | [LLM 模型](../customization/models.md) | +| **ASR 资源** | 负责把语音输入转写为文本 | [语音识别](../customization/asr.md) | +| **TTS 声音资源** | 负责把文本回复合成为语音 | [声音资源](../customization/voices.md) | -### 配置字段说明 +## 上手前自检 -| 字段 | 必填 | 说明 | -|-----|-----|------| -| 模型名称 | 是 | 自定义显示名称,便于识别 | -| 接口类型 | 是 | 当前支持 OpenAI Compatible | -| 语言 | 是 | 识别语言:中文/英文/多语言 | -| Model Name | 否 | API 请求中的 model 参数 | -| Base URL | 是 | API 服务地址 | -| API Key | 是 | 服务认证密钥 | 
-| 热词 | 否 | 逗号分隔的专有名词列表 | -| 标点增强 | 否 | 是否自动添加标点 | -| 文本归一化 | 否 | 规范化数字、日期等格式 | -| 启用 | 否 | 是否在选择列表中显示 | +### LLM -### 推荐配置示例 +- 已配置供应商、模型名称、Base URL 和凭证 +- 已明确该模型用于文本生成、嵌入还是重排 +- 已准备保守的默认参数,而不是先追求极端效果 -**硅基流动 SenseVoice** +### ASR -``` -模型名称:SenseVoice 中文 -Model Name:FunAudioLLM/SenseVoiceSmall -Base URL:https://api.siliconflow.cn/v1 -语言:中文 -``` +- 已确认目标语言与模型匹配 +- 已准备必要热词或专有名词词表 +- 已能用固定样本测试识别准确率和延迟 -### 测试识别效果 +### TTS -1. 在 ASR 列表中找到目标模型 -2. 点击 **试听识别** 按钮 -3. 选择以下测试方式之一: - - **上传文件**:拖拽或选择音频文件 - - **麦克风录音**:点击录音按钮开始录制 -4. 点击 **开始识别** 查看结果 -5. 检查识别文本、延迟和置信度 +- 已选择主音色,并完成至少一次试听 +- 已确认该声音适合实时对话,而不是仅适合离线播报 +- 已为默认语速、音量等参数设定初始值 ---- +## 不在本页展开的内容 -## 大语言模型 (LLM) 配置 +字段说明、供应商差异、参数建议和最佳实践已经分别收敛到正式能力页: -### 支持的模型类型 +- [LLM 模型](../customization/models.md) +- [语音识别](../customization/asr.md) +- [声音资源](../customization/voices.md) +- [TTS 参数](../customization/tts.md) -| 类型 | 用途 | -|-----|------| -| 文本 (text) | 对话生成,用于助手核心交互 | -| 嵌入 (embedding) | 向量化,用于知识库检索 | -| 重排 (rerank) | 结果重排序,优化检索结果 | - -### 配置字段说明 - -| 字段 | 必填 | 说明 | -|-----|-----|------| -| 厂商 | 是 | 当前支持 OpenAI Compatible | -| 模型类型 | 是 | 文本/嵌入/重排 | -| 模型名称 | 是 | 自定义显示名称 | -| 模型标识 | 否 | API 请求中的 model 参数 | -| Base URL | 是 | API 服务地址 | -| API Key | 是 | 服务认证密钥 | -| 温度 | 否 | 输出随机性 (0-2),仅文本模型 | -| 上下文长度 | 否 | 最大 token 数 | -| 启用 | 否 | 是否在选择列表中显示 | - -### 推荐配置示例 - -**OpenAI GPT-4o Mini** - -``` -模型名称:GPT-4o Mini -模型类型:文本 -模型标识:gpt-4o-mini -Base URL:https://api.openai.com/v1 -温度:0.7 -上下文长度:8192 -``` - -**硅基流动 Qwen** - -``` -模型名称:Qwen2.5-7B -模型类型:文本 -模型标识:Qwen/Qwen2.5-7B-Instruct -Base URL:https://api.siliconflow.cn/v1 -温度:0.7 -``` - -### 测试模型效果 - -1. 在 LLM 列表中找到目标模型 -2. 点击 **预览** 按钮 -3. 配置测试参数: - - **System Prompt**:系统提示词 - - **User Message**:测试消息 - - **Temperature**:温度参数 - - **Max Tokens**:最大输出长度 -4. 点击 **开始预览** 查看模型回复 -5. 
检查回复内容、延迟和 token 用量 - ---- - -## 声音资源 (TTS) 配置 - -### 支持的接口类型 - -| 接口类型 | 说明 | -|---------|------| -| OpenAI Compatible | 兼容 OpenAI TTS API 格式的服务 | -| DashScope | 阿里云 DashScope 语音合成服务 | - -### 配置字段说明 - -| 字段 | 必填 | 说明 | -|-----|-----|------| -| 厂商 | 是 | OpenAI Compatible 或 DashScope | -| 声音名称 | 是 | 自定义显示名称 | -| 模型 | 是 | TTS 模型标识 | -| 声音 ID | 是 | 音色标识符 | -| Base URL | 否 | API 服务地址 | -| API Key | 是 | 服务认证密钥 | -| 语速 | 否 | 说话速度 (0.5-2.0),默认 1.0 | -| 增益 | 否 | 音量调节 (-10 to 10 dB) | -| 音调 | 否 | 声音高低 (-12 to 12) | -| 性别 | 否 | 声音性别标签 | -| 语言 | 否 | 声音语言标签 | -| 备注 | 否 | 声音特点描述 | - -### 推荐配置示例 - -**硅基流动 CosyVoice** - -``` -厂商:OpenAI Compatible -声音名称:Anna 中文女声 -模型:FunAudioLLM/CosyVoice2-0.5B -声音 ID:FunAudioLLM/CosyVoice2-0.5B:anna -Base URL:https://api.siliconflow.cn/v1 -语速:1.0 -性别:女 -语言:中文 -``` - -**DashScope TTS** - -``` -厂商:DashScope -声音名称:Cherry -模型:qwen3-tts-flash-realtime -声音 ID:Cherry -Base URL:wss://dashscope.aliyuncs.com/api-ws/v1/realtime -语速:1.0 -``` - -### CosyVoice 可用音色 - -| 音色 ID | 性别 | 风格 | -|--------|-----|------| -| alex | 男 | 成熟稳重 | -| anna | 女 | 温柔亲切 | -| bella | 女 | 活泼甜美 | -| benjamin | 男 | 年轻活力 | -| charles | 男 | 专业商务 | -| claire | 女 | 清新自然 | -| david | 男 | 沉稳大气 | -| diana | 女 | 优雅知性 | - -### 试听声音效果 - -1. 在声音列表中找到目标声音 -2. 点击 **播放** 按钮 -3. 系统会自动合成一段试听语音 -4. 检查声音效果是否符合预期 - -### 克隆声音 - -如需使用自定义声音: - -1. 点击 **克隆声音** 按钮 -2. 上传参考音频文件(WAV/MP3) -3. 填写声音名称和描述 -4. 点击 **开始克隆** - -!!! note "声音克隆说明" - 声音克隆功能需要 TTS 服务支持。上传的参考音频建议为 10-30 秒的清晰人声录音。 - ---- - -## 配置最佳实践 - -### 资源命名规范 - -建议使用清晰的命名规范,便于后续管理: - -``` -[厂商/模型]-[用途/语言]-[特点] -``` - -示例: -- `SF-SenseVoice-中文` -- `OpenAI-GPT4o-对话` -- `SF-CosyVoice-Anna女声` - -### 多环境管理 - -如果有测试和生产环境,建议: - -1. 为不同环境创建独立的资源配置 -2. 在名称中标注环境,如 `GPT4o-Prod`、`GPT4o-Test` -3. 
通过"启用"开关控制可见性 - -### 成本优化 - -| 场景 | 推荐配置 | -|-----|---------| -| 开发测试 | 使用低成本模型,如 GPT-4o-mini | -| 生产环境 | 根据质量要求选择合适模型 | -| 高并发 | 考虑使用本地部署的开源模型 | - ---- - -## 下一步 - -资源配置完成后,请返回 [快速开始](index.md) 继续创建助手。 +准备完成后,请回到 [快速开始](index.md) 继续创建助手。 diff --git a/docs/content/quickstart/index.md b/docs/content/quickstart/index.md index 3e15783..47f3b7c 100644 --- a/docs/content/quickstart/index.md +++ b/docs/content/quickstart/index.md @@ -1,221 +1,69 @@ -# 快速开始 +# 快速开始 -5 分钟创建你的第一个 AI 助手。 +本页负责“创建第一个助手”的最短路径。环境要求、配置文件和部署方式统一放在 [环境与部署](../getting-started/index.md)。 -## 概述 +## 目标 -本指南将帮助你通过控制台快速创建一个能够进行语音对话的智能助手。在创建助手之前,需要先在资源库(Library)中配置所需的模型资源。 +完成本页后,你应该已经: + +1. 准备好 1 个 LLM、1 个 ASR、1 个 TTS 资源 +2. 创建并保存 1 个助手 +3. 完成至少 1 轮测试对话 +4. 拿到接入应用所需的 `assistant_id` 和 WebSocket 地址 ## 前提条件 -- 已部署 Realtime Agent Studio (RAS) 服务 -- 拥有 LLM / ASR / TTS 服务的 API Key +- 已部署 Realtime Agent Studio(RAS)服务 +- 已准备可用的 LLM / ASR / TTS 凭证 +- 已能访问控制台与 WebSocket 服务 -## 配置流程 +## 第一步:准备资源 -创建助手前,需要先准备好三种核心资源: +创建助手之前,先准备三类资源: -``` -┌─────────────────────────────────────────────────────────┐ -│ 资源库配置 │ -├─────────────────────────────────────────────────────────┤ -│ 1. 语音识别 (ASR) ─→ 将用户语音转为文字 │ -│ 2. 模型接入 (LLM) ─→ 理解用户意图并生成回复 │ -│ 3. 声音资源 (TTS) ─→ 将文字回复转为语音输出 │ -└─────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────┐ -│ 创建助手 │ -├─────────────────────────────────────────────────────────┤ -│ 配置提示词 → 选择模型 → 配置语音 → 测试 → 发布 │ -└─────────────────────────────────────────────────────────┘ -``` +- **LLM 模型**:决定助手如何理解和生成回复。详见 [LLM 模型](../customization/models.md) +- **ASR 资源**:决定语音输入如何转写。详见 [语音识别](../customization/asr.md) +- **TTS 声音资源**:决定回复如何被合成为语音。详见 [声音资源](../customization/voices.md) ---- - -## 第一步:配置资源库 - -在创建助手之前,需要先在资源库中添加 ASR、LLM、TTS 三种资源。 - -### 1.1 添加语音识别模型 (ASR) - -语音识别模型负责将用户的语音输入转换为文字。 - -1. 在左侧导航栏点击 **语音识别** -2. 点击 **添加模型** 按钮 -3. 
填写配置信息: - -| 配置项 | 说明 | 示例值 | -|-------|------|--------| -| 模型名称 | 自定义显示名称 | SenseVoice CN | -| 接口类型 | 选择 OpenAI Compatible | OpenAI Compatible | -| 语言 | 识别语言 | 中文 (Chinese) | -| Model Name | 模型标识符 | FunAudioLLM/SenseVoiceSmall | -| Base URL | API 服务地址 | https://api.siliconflow.cn/v1 | -| API Key | 服务密钥 | sk-xxxxxxxx | - -4. 可选配置: - - **热词**:添加专有名词提高识别准确率 - - **标点增强**:自动添加标点符号 - - **文本归一化**:规范化数字、日期等格式 - -5. 点击 **确认添加** - -!!! tip "试听识别功能" - 添加完成后,可以点击列表中的试听按钮,上传或录制音频测试识别效果。 - -### 1.2 添加大语言模型 (LLM) - -大语言模型是助手的"大脑",负责理解用户意图并生成回复。 - -1. 在左侧导航栏点击 **模型接入** -2. 点击 **添加模型** 按钮 -3. 填写配置信息: - -| 配置项 | 说明 | 示例值 | -|-------|------|--------| -| 厂商 | 接口类型 | OpenAI Compatible | -| 模型类型 | 文本/嵌入/重排 | 文本 | -| 模型名称 | 自定义显示名称 | GPT-4o Mini | -| 模型标识 | API 中的 model 参数 | gpt-4o-mini | -| Base URL | API 服务地址 | https://api.openai.com/v1 | -| API Key | 服务密钥 | sk-xxxxxxxx | -| 温度 | 输出随机性 (0-2) | 0.7 | -| 上下文长度 | 最大 token 数 | 8192 | - -4. 点击 **确认添加** - -!!! tip "预览功能" - 添加完成后,可以点击预览按钮测试模型是否配置正确。 - -### 1.3 添加声音资源 (TTS) - -声音资源用于将助手的文字回复转换为语音输出。 - -1. 在左侧导航栏点击 **声音资源** -2. 点击 **添加声音** 按钮 -3. 填写配置信息: - -| 配置项 | 说明 | 示例值 | -|-------|------|--------| -| 厂商 | 接口类型 | OpenAI Compatible 或 DashScope | -| 声音名称 | 自定义显示名称 | 客服小美 | -| 模型 | TTS 模型标识 | FunAudioLLM/CosyVoice2-0.5B | -| 声音 ID | 音色标识 | FunAudioLLM/CosyVoice2-0.5B:anna | -| Base URL | API 服务地址 | https://api.siliconflow.cn/v1 | -| API Key | 服务密钥 | sk-xxxxxxxx | -| 语速 | 说话速度 (0.5-2.0) | 1.0 | -| 增益 | 音量调节 (-10 to 10 dB) | 0 | -| 音调 | 声音高低 (-12 to 12) | 0 | -| 性别 | 声音性别 | 女 | -| 语言 | 声音语言 | 中文 | - -4. 点击 **确认添加** - -!!! tip "试听功能" - 添加完成后,可以在列表中点击播放按钮试听声音效果。 - ---- +如果你想先检查“资源是否准备齐”,可以看 [资源准备清单](dashboard.md)。 ## 第二步:创建助手 -资源配置完成后,可以开始创建助手。 +1. 进入控制台中的 **助手** 页面 +2. 新建一个助手,并填写最小必要信息: + - **助手名称**:让团队知道它服务于什么场景 + - **系统提示词**:先定义角色、任务和限制 + - **首轮模式**:决定由助手先说还是等待用户开口 +3. 绑定默认模型: + - 文本生成使用一个 LLM + - 语音输入使用一个 ASR + - 语音输出使用一个 TTS 声音资源 -### 2.1 新建助手 +如果你想把助手设计得更稳,继续阅读: -1. 在左侧导航栏点击 **助手管理** -2. 点击 **新建助手** 按钮 -3. 
系统会自动创建一个名为 "New Assistant" 的助手 +- [助手概念](../concepts/assistants.md) +- [配置选项](../concepts/assistants/configuration.md) +- [提示词指南](../concepts/assistants/prompts.md) -### 2.2 配置全局设置 +## 第三步:补充能力 -在助手详情页的 **全局** 标签页中配置: +最小助手可以只依赖提示词和模型;更复杂的场景通常还需要以下能力: -#### 基本信息 +- **知识库**:让助手回答私有领域问题。见 [知识库](../customization/knowledge-base.md) +- **工具**:让助手执行查单、预约、查询等外部操作。见 [工具](../customization/tools.md) +- **工作流**:让助手处理多步骤、多分支流程。见 [工作流](../customization/workflows.md) -- **助手名称**:修改为有意义的名称,如 "客服助手" -- **语言**:选择助手的对话语言 +## 第四步:测试并发布 -#### 系统提示词 +1. 打开助手测试面板,先验证文本对话,再验证语音输入输出 +2. 观察事件流、转写、工具调用和最终回复是否符合预期 +3. 保存当前配置,并确认该助手已可用于外部接入 -配置系统提示词,定义助手的角色和行为: +更系统的验证方式见 [测试调试](../concepts/assistants/testing.md)。 -``` -你是一个友好的客服助手。你的任务是帮助用户解答问题。 +## 第五步:接入应用 -要求: -- 保持友好和专业的语气 -- 回答要简洁明了,每次回复控制在 2-3 句话 -- 如果不确定答案,请如实告知 -``` - -#### 开场白配置 - -设置对话开始时助手的问候语: - -- **首回合模式**:选择 "助手先说" 让助手主动开场 -- **开场白内容**:如 "你好,我是智能客服助手,请问有什么可以帮您?" - -### 2.3 配置模型 - -在 **模型** 标签页中选择之前添加的资源: - -| 配置项 | 说明 | -|-------|------| -| LLM 模型 | 选择在模型接入中添加的大语言模型 | -| ASR 模型 | 选择在语音识别中添加的 ASR 模型 | - -### 2.4 配置语音 - -在 **语音** 标签页中配置: - -| 配置项 | 说明 | -|-------|------| -| 启用语音输出 | 开启后助手会用语音回复 | -| 选择声音 | 选择在声音资源中添加的音色 | -| 语速 | 可微调当前助手的说话速度 | - -### 2.5 保存配置 - -完成配置后,点击页面顶部的 **保存** 按钮。 - ---- - -## 第三步:测试助手 - -### 3.1 打开测试面板 - -点击助手卡片右上角的 **测试** 按钮,打开实时调试面板。 - -### 3.2 进行对话测试 - -| 测试场景 | 示例问题 | 预期结果 | -|---------|---------|---------| -| 基础问候 | "你好" | 助手友好回应 | -| 功能询问 | "你能做什么?" | 介绍自身能力 | -| 业务问题 | 根据你的场景设计 | 正确回答 | -| 边界测试 | 无关问题 | 婉拒或引导 | - -### 3.3 检查各环节 - -在调试面板中可以看到: - -- **ASR 输出**:用户语音识别结果 -- **LLM 输入/输出**:模型的输入和生成内容 -- **TTS 状态**:语音合成状态 - ---- - -## 第四步:发布助手 - -测试通过后: - -1. 点击 **发布** 按钮 -2. 
复制生成的连接信息: - - `assistant_id`:用于 API 调用 - - WebSocket 地址:用于实时对话 - -### 嵌入到应用 +最小接入方式是使用 WebSocket API 建立实时会话: ```javascript const ws = new WebSocket('ws://your-server/ws?assistant_id=YOUR_ASSISTANT_ID'); @@ -223,54 +71,28 @@ const ws = new WebSocket('ws://your-server/ws?assistant_id=YOUR_ASSISTANT_ID'); ws.onopen = () => { ws.send(JSON.stringify({ type: 'session.start', - audio: { - encoding: 'pcm_s16le', - sample_rate_hz: 16000, - channels: 1 - } + audio: { encoding: 'pcm_s16le', sample_rate_hz: 16000, channels: 1 } })); }; - -ws.onmessage = (event) => { - console.log('收到消息:', event.data); -}; ``` ---- +你通常只需要两项信息: -## 常见问题 +- `assistant_id`:指定接入哪个助手 +- WebSocket 地址:由引擎服务提供实时对话入口 -### 资源库中添加模型失败? +完整协议见 [WebSocket 协议](../api-reference/websocket.md)。 -1. 检查 API Key 是否正确 -2. 确认 Base URL 格式正确(通常以 `/v1` 结尾) -3. 验证网络能否访问对应的 API 服务 +## 常见卡点 -### 助手不回复? - -1. 检查是否已选择 LLM 模型 -2. 确认 LLM 模型配置正确(可在模型接入页面预览测试) -3. 查看浏览器控制台是否有错误 - -### 语音识别不准确? - -1. 检查是否选择了正确的语言 -2. 尝试添加热词提高专有名词识别率 -3. 确保录音设备工作正常 - -### 语音无法播放? - -1. 检查浏览器是否允许自动播放音频 -2. 确认已选择声音并正确配置 -3. 
在声音资源页面点击试听确认配置正确 - ---- +- 资源配置不生效:回到 [资源准备清单](dashboard.md) 检查三类资源是否都已准备好 +- 助手不回复:先看 [测试调试](../concepts/assistants/testing.md),再进入 [故障排查](../resources/troubleshooting.md) +- 回复质量不稳定:优先检查 [提示词指南](../concepts/assistants/prompts.md) 与 [知识库](../customization/knowledge-base.md) ## 下一步 -恭喜!你已成功创建了第一个 AI 助手。接下来可以: +- [环境与部署](../getting-started/index.md) - 补全环境、配置和部署细节 +- [构建助手](../concepts/assistants.md) - 深入配置助手、模型、知识库、工具与工作流 +- [API 参考](../api-reference/index.md) - 查看管理接口与实时协议 + -- [配置知识库](../customization/knowledge-base.md) - 让助手回答专业问题 -- [添加工具](../customization/tools.md) - 扩展助手能力 -- [查看 API 文档](../api-reference/websocket.md) - 深入了解协议细节 -- [Docker 部署](../deployment/index.md) - 使用容器运行 diff --git a/docs/content/resources/faq.md b/docs/content/resources/faq.md index 0b1b729..1c59e42 100644 --- a/docs/content/resources/faq.md +++ b/docs/content/resources/faq.md @@ -1,110 +1,59 @@ -# 常见问题 +# 常见问题 -## API Key 配置 +本页只提供简短回答和跳转建议;如果你需要逐步排查,请直接进入 [故障排查](troubleshooting.md)。 -### Q: 如何配置 API Key? +## Q: 我应该先看哪一部分文档? -进入 **LLM 库** 或 **语音库** 页面,点击对应模型的配置按钮填写 API Key。 +- 想了解产品是什么:看 [产品概览](../overview/index.md) +- 想先把服务跑起来:看 [环境与部署](../getting-started/index.md) +- 想最快创建第一个助手:看 [快速开始](../quickstart/index.md) +- 想系统完成助手配置:从 [助手概览](../concepts/assistants.md) 开始 -**步骤:** +## Q: 如何配置模型或 API Key? -1. 在左侧导航栏选择 **模型配置** -2. 选择 **LLM 库** 或 **语音库** -3. 点击已添加模型的 **编辑** 按钮 -4. 在 API Key 字段填写你的密钥 -5. 点击 **保存** +进入对应资源页完成配置: -## 助手问题 +- LLM:见 [LLM 模型](../customization/models.md) +- ASR:见 [语音识别](../customization/asr.md) +- TTS:见 [声音资源](../customization/voices.md) -### Q: 助手无法回复? +## Q: 助手为什么不回复? -可能的原因和解决方案: +通常先检查三件事: -1. **检查模型配置是否正确** - - 确认 API Key 已正确填写 - - 测试模型连接是否正常 +- 助手是否已绑定可用的模型资源 +- 提示词、知识库或工具是否配置完整 +- WebSocket 会话是否已经正常建立 -2. **确认知识库已正确关联** - - 进入助手配置的 **知识** 标签页 - - 检查是否已选择知识库 +下一步: -3. **查看系统日志排查错误** - - 打开浏览器开发者工具(F12) - - 检查 Console 和 Network 标签页 +- 助手行为验证:看 [测试调试](../concepts/assistants/testing.md) +- 逐步排查:看 [故障排查](troubleshooting.md) -### Q: 助手回复内容不相关? +## Q: 回复为什么不准确或不稳定? 
-- 检查系统提示词是否清晰明确 -- 调整 Temperature 参数(降低可提高准确性) -- 确认知识库内容与问题相关 -- 增加知识库相似度阈值 +优先检查: -## 语音识别 +- 提示词是否明确了角色、任务和限制 +- 是否应该补充知识库,而不是继续堆叠提示词 +- 是否需要把复杂业务改成工作流,而不是单轮问答 -### Q: 语音识别不准确? +相关文档: -1. **确认 ASR 模型选择正确** - - 中文场景推荐使用 SenseVoice - - 英文场景推荐使用 Whisper +- [提示词指南](../concepts/assistants/prompts.md) +- [知识库](../customization/knowledge-base.md) +- [工作流](../customization/workflows.md) -2. **检查音频采样率** - - 推荐采样率:16kHz - - 推荐格式:PCM 16-bit +## Q: 语音识别或语音播放效果不好怎么办? -3. **确认语言设置匹配** - - 在 ASR 配置中选择正确的语言 +- 输入侧问题先看 [语音识别](../customization/asr.md) +- 输出侧问题先看 [声音资源](../customization/voices.md) 和 [TTS 参数](../customization/tts.md) +- 需要逐步定位链路问题时,再看 [故障排查](troubleshooting.md) -### Q: 语音延迟较高? +## Q: 页面空白、接口报错或连接不上怎么办? -- 检查网络连接稳定性 -- 尝试切换 ASR 服务提供商 -- 降低音频质量以减少传输数据量 +这是典型的环境或链路问题: -## 语音合成 +- 先确认 [环境与部署](../getting-started/index.md) 中的三个服务都已启动 +- 再进入 [故障排查](troubleshooting.md) 按连接、API、页面加载或性能问题分类处理 -### Q: TTS 声音不自然? - -- 尝试不同的音色选项 -- 调整语速参数(推荐 0.8-1.2) -- 选择与内容风格匹配的声音 - -### Q: TTS 无法播放? - -1. 检查浏览器是否允许自动播放音频 -2. 确认 TTS API Key 配置正确 -3. 检查网络连接 - -## 知识库 - -### Q: 知识库检索无结果? - -- 确认文档已成功上传 -- 降低相似度阈值(默认 0.7) -- 增加返回结果数量 -- 检查文档内容是否与查询相关 - -### Q: 文档上传失败? - -- 检查文件大小是否超过 10MB -- 确认文件格式支持(MD/PDF/TXT) -- 尝试减小文档内容 - -## 部署问题 - -### Q: 页面空白或加载失败? - -1. 检查浏览器控制台错误信息 -2. 确认后端服务已启动 -3. 检查 VITE_API_URL 环境变量配置 - -### Q: API 请求失败? - -- 确认 VITE_API_URL 配置正确 -- 检查后端服务是否运行 -- 查看网络请求响应状态码 - -### Q: 静态资源 404? 
- -- 检查 Nginx `try_files` 配置 -- 确认构建产物路径正确 -- 检查文件权限设置 diff --git a/docs/content/roadmap.md b/docs/content/roadmap.md index 1cd594f..d7cc311 100644 --- a/docs/content/roadmap.md +++ b/docs/content/roadmap.md @@ -1,4 +1,4 @@ -# 开发路线图 +# 开发路线图 本页面展示 Realtime Agent Studio 的开发计划和进度。 @@ -8,50 +8,47 @@ ### 实时交互引擎 -- [x] **管线式全双工引擎** - ASR/LLM/TTS 流水线架构 +- [x] **管线式全双工引擎** - ASR / LLM / TTS 流水线架构 - [x] **智能打断处理** - VAD + EOU 检测 -- [x] **OpenAI 兼容接口** - ASR/TTS 标准接口适配 +- [x] **OpenAI 兼容接口** - ASR / TTS 标准接口适配 - [x] **DashScope TTS** - 阿里云语音合成适配 -### 智能体配置管理 +### 助手配置管理 - [x] **系统提示词编辑** - Prompt 配置,动态变量注入 -- [x] **模型选择** - LLM/ASR/TTS 模型管理界面 +- [x] **模型选择** - LLM / ASR / TTS 模型管理界面 - [x] **工具调用配置** - Webhook 工具 + 客户端工具 -### 交互测试工具 +### 调试与观察 - [x] **实时调试控制台** - WebSocket 调试连接示例 +- [x] **完整会话回放** - 音频 + 转写 + LLM 响应 +- [x] **会话检索筛选** - 按时间 / 助手 / 状态筛选 ### 开放接口 - [x] **WebSocket 协议** - `/ws` 端点完整实现 - [x] **RESTful 接口** - 完整的 CRUD API -### 交互历史监控 - -- [x] **完整会话回放** - 音频 + 转写 + LLM 响应 -- [x] **会话检索筛选** - 按时间/助手/状态筛选 - --- ## 开发中 :construction: -### 智能体配置管理 +### 助手与能力编排 -- [ ] **私有化 ASR/TTS 适配** - 本地模型接入 +- [ ] **私有化 ASR / TTS 适配** - 本地模型接入 - [ ] **工作流编辑** - 可视化流程编排 - [ ] **知识库关联** - RAG 文档管理 ### 实时交互引擎 -- [ ] **原生多模态模型** - Step Audio 接入(GPT-4o Realtime/Gemini Live 国内环境受限) +- [ ] **原生多模态模型** - Step Audio 接入(GPT-4o Realtime / Gemini Live 国内环境受限) +- [ ] **WebRTC 协议** - `/webrtc` 端点 ### 开放接口 -- [ ] **SDK 支持** - JavaScript/Python SDK -- [ ] **电话接入** - 电话呼入自动接听/自动呼出接口和批量呼出 -- [ ] **WebRTC 协议** - `/webrtc` 端点 +- [ ] **SDK 支持** - JavaScript / Python SDK +- [ ] **电话接入** - 电话呼入自动接听 / 自动呼出接口和批量呼出 ### 效果评估 @@ -65,13 +62,14 @@ - [ ] **Webhook 回调** - 会话事件通知机制 -### 效果评估 +### 数据与评估 - [ ] **实时仪表盘增强** - 完善统计看板功能 +- [ ] **评估闭环** - 测试、评分、回归与变更追踪 -### 企业特性 +### 企业能力 -- [ ] **多租户支持** - 团队/组织管理 +- [ ] **多租户支持** - 团队 / 组织管理 - [ ] **权限管理** - RBAC 角色权限控制 - [ ] **审计日志** - 操作记录追踪 @@ -79,7 +77,7 @@ - [ ] **更多模型供应商** - 讯飞、百度、腾讯等 - [ ] **CRM 集成** - Salesforce、HubSpot 等 -- [ ] **呼叫中心集成** - SIP/PSTN 网关 +- [ ] 
**呼叫中心集成** - SIP / PSTN 网关 --- @@ -94,20 +92,19 @@ --- -## 参考项目 +## 生态参考 ### 开源项目 -* [Livekit Agent](https://github.com/livekit/agents) -* [Pipecat](https://github.com/pipecat-ai/pipecat) -* [vison-agent](https://github.com/GetStream/Vision-Agents) -* [active-call](https://github.com/miuda-ai/active-call) -* [TEN](https://github.com/TEN-framework/ten-framework) -* [airi](https://github.com/moeru-ai/airi) -* [Vocode Core](https://github.com/vocodedev/vocode-core) -* [awesome-voice-agents](https://github.com/yzfly/awesome-voice-agents) -### 商业项目 -* [Vapi](https://vapi.ai) -* [Retell](https://www.retellai.com) -* [Sierra](https://sierra.ai/product/voice) -* [Bolna](https://platform.bolna.ai) \ No newline at end of file +- [Livekit Agent](https://github.com/livekit/agents) +- [Pipecat](https://github.com/pipecat-ai/pipecat) +- [Vision Agents](https://github.com/GetStream/Vision-Agents) +- [active-call](https://github.com/miuda-ai/active-call) +- [TEN](https://github.com/TEN-framework/ten-framework) +- [airi](https://github.com/moeru-ai/airi) +- [Vocode Core](https://github.com/vocodedev/vocode-core) +- [awesome-voice-agents](https://github.com/yzfly/awesome-voice-agents) + +### 文档与研究参考 + +- [Voice AI & Voice Agents](https://voiceaiandvoiceagents.com/) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c9cecff..ac06612 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: "Realtime Agent Studio" -site_description: "构建实时交互音视频智能体的开源工作平台" +site_name: "Realtime Agent Studio" +site_description: "Realtime Agent Studio(RAS)是一个通过管理控制台与 API 构建、部署和运营实时多模态助手的开源平台。" site_url: "https://your-org.github.io/AI-VideoAssistant" copyright: "Copyright © 2025 RAS Team" site_author: "RAS Team" @@ -9,51 +9,41 @@ site_dir: "site" nav: - 首页: index.md - - 产品概览: - - 概述: overview/index.md - - 系统架构: overview/architecture.md - 快速开始: - - 5 分钟入门: quickstart/index.md - - 资源库配置: quickstart/dashboard.md + - 环境与部署: getting-started/index.md + - 创建第一个助手: quickstart/index.md + - 
构建助手: + - 助手概览: concepts/assistants.md + - 基础配置: concepts/assistants/configuration.md + - 提示词: concepts/assistants/prompts.md + - LLM 模型: customization/models.md + - 语音识别: customization/asr.md + - 声音资源: customization/voices.md + - TTS 参数: customization/tts.md + - 知识库: customization/knowledge-base.md + - 工具: customization/tools.md + - 工作流: customization/workflows.md + - 测试与调试: concepts/assistants/testing.md - 核心概念: - - 概述: concepts/index.md - - 助手详解: concepts/assistants.md + - 产品概览: overview/index.md + - 概念总览: concepts/index.md - 引擎架构: concepts/engines.md - - 安装部署: - - 概述: getting-started/index.md - - 环境要求: getting-started/requirements.md - - 配置说明: getting-started/configuration.md - - 部署概览: deployment/index.md - - Docker 部署: deployment/docker.md - - 助手管理: - - 创建助手: - - 小助手: - - 配置选项: assistants/configuration.md - - 提示词指南: assistants/prompts.md - - 测试调试: assistants/testing.md - - 工作流: - - 配置选项: assistants/workflow-configuration.md - - 组件库: - - 模型接入: customization/models.md - - 语音识别: customization/asr.md - - 语音生成: customization/tts.md - - 知识库: customization/knowledge-base.md - - 工具与插件: customization/tools.md - - 数据分析: - - 仪表盘: analysis/dashboard.md - - 历史记录: analysis/history.md - - 效果评估: analysis/evaluation.md - - 自动化测试: analysis/autotest.md - - API 参考: - - 概述: api-reference/index.md + - Pipeline 引擎: concepts/pipeline-engine.md + - Realtime 引擎: concepts/realtime-engine.md + - 系统架构: overview/architecture.md + - 集成: + - API 参考: api-reference/index.md - WebSocket 协议: api-reference/websocket.md - 错误码: api-reference/errors.md - - 资源: + - 运维: + - 仪表盘: analysis/dashboard.md + - 历史记录: analysis/history.md + - 效果评估: analysis/evaluation.md + - 自动化测试: analysis/autotest.md - 常见问题: resources/faq.md - 故障排查: resources/troubleshooting.md - 更新日志: changelog.md - 路线图: roadmap.md - theme: name: material language: zh @@ -148,7 +138,6 @@ plugins: minify_html: true extra: - # version.provider: mike — only enable when deploying with mike (versions.json is generated on deploy) social: - icon: 
fontawesome/brands/github link: https://github.com/your-org/AI-VideoAssistant @@ -164,3 +153,5 @@ extra_css: extra_javascript: - javascripts/mermaid.mjs - javascripts/extra.js + + diff --git a/engine/.env.example b/engine/.env.example index 4007aa0..7f09de7 100644 --- a/engine/.env.example +++ b/engine/.env.example @@ -26,34 +26,27 @@ HISTORY_FINALIZE_DRAIN_TIMEOUT_SEC=1.5 SAMPLE_RATE=16000 # 20ms is recommended for VAD stability and latency. # 100ms works but usually worsens start-of-speech accuracy. +# WS binary audio frame size validation is derived from SAMPLE_RATE + CHUNK_SIZE_MS. +# Client frame payloads must be a multiple of: SAMPLE_RATE * 2 * (CHUNK_SIZE_MS / 1000). CHUNK_SIZE_MS=20 +# Public default output codec exposed in config.resolved (overridable by runtime metadata). DEFAULT_CODEC=pcm MAX_AUDIO_BUFFER_SECONDS=30 -# Agent profile selection (optional fallback when CLI args are not used) -# Prefer CLI: -# python -m app.main --agent-config config/agents/default.yaml -# python -m app.main --agent-profile default -# AGENT_CONFIG_PATH=config/agents/default.yaml -# AGENT_PROFILE=default -AGENT_CONFIG_DIR=config/agents - -# Optional: provider credentials referenced from YAML, e.g. ${LLM_API_KEY} -# LLM_API_KEY=your_llm_api_key_here -# LLM_API_URL=https://api.openai.com/v1 -# TTS_API_KEY=your_tts_api_key_here -# TTS_API_URL=https://api.example.com/v1/audio/speech -# ASR_API_KEY=your_asr_api_key_here -# ASR_API_URL=https://api.example.com/v1/audio/transcriptions +# Local assistant/agent YAML directory. In local mode the runtime resolves: +# ASSISTANT_LOCAL_CONFIG_DIR/.yaml +ASSISTANT_LOCAL_CONFIG_DIR=config/agents # Logging LOG_LEVEL=INFO # json is better for production/observability; text is easier locally. +# Controls both console and file log serialization/format. LOG_FORMAT=json # WebSocket behavior INACTIVITY_TIMEOUT_SEC=60 HEARTBEAT_INTERVAL_SEC=50 +# Public protocol label emitted in session.started/config.resolved payloads. 
WS_PROTOCOL_VERSION=v1 # CORS / ICE (JSON strings) diff --git a/engine/Dockerfile b/engine/Dockerfile index e6e5806..ab7b3d3 100644 --- a/engine/Dockerfile +++ b/engine/Dockerfile @@ -2,6 +2,11 @@ FROM python:3.12-slim WORKDIR /app +# Build this image from the project parent directory so both +# engine-v3/engine and fastgpt-python-sdk are available in the context. +# Example: +# docker build -f engine-v3/engine/Dockerfile -t engine-v3 . + # Install system dependencies for audio processing RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -12,11 +17,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt +COPY engine-v3/engine/requirements.txt /tmp/requirements.txt +COPY fastgpt-python-sdk /deps/fastgpt-python-sdk +RUN pip install --no-cache-dir -r /tmp/requirements.txt \ + && pip install --no-cache-dir /deps/fastgpt-python-sdk # Copy application code -COPY . . +COPY engine-v3/engine /app # Create necessary directories RUN mkdir -p /app/logs /app/data/vad diff --git a/engine/README.md b/engine/README.md index 9d0949b..5018c39 100644 --- a/engine/README.md +++ b/engine/README.md @@ -1,6 +1,6 @@ -# py-active-call-cc +# Realtime Agent Studio Engine -Python Active-Call: real-time audio streaming with WebSocket and WebRTC. +This repo contains a Python 3.11+ codebase for building low-latency realtime human-agent interaction pipelines (capture, stream, and process audio) using WebSockets or WebRTC. This repo contains a Python 3.11+ codebase for building low-latency voice pipelines (capture, stream, and process audio) using WebRTC and WebSockets. @@ -14,35 +14,11 @@ It is currently in an early, experimental stage. 
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 ``` -使用 agent profile(推荐) - -``` -python -m app.main --agent-profile default -``` - -使用指定 YAML - -``` -python -m app.main --agent-config config/agents/default.yaml -``` - -Agent 配置路径优先级 -1. `--agent-config` -2. `--agent-profile`(映射到 `config/agents/.yaml`) -3. `AGENT_CONFIG_PATH` -4. `AGENT_PROFILE` -5. `config/agents/default.yaml`(若存在) - 说明 -- Agent 相关配置是严格模式:YAML 缺少必须项会直接报错,不会回退到 `.env` 或代码默认值。 -- 如果要引用环境变量,请在 YAML 显式写 `${ENV_VAR}`。 -- `siliconflow` 独立 section 已移除;请在 `agent.llm / agent.tts / agent.asr` 内通过 `provider`、`api_key`、`api_url`、`model` 配置。 -- `agent.tts.provider` 现支持 `dashscope`(Realtime 协议,非 OpenAI-compatible);默认 URL 为 `wss://dashscope.aliyuncs.com/api-ws/v1/realtime`,默认模型为 `qwen3-tts-flash-realtime`。 -- `agent.tts.dashscope_mode`(兼容旧写法 `agent.tts.mode`)支持 `commit | server_commit`,且仅在 `provider=dashscope` 时生效: - - `commit`:Engine 先按句切分,再逐句提交给 DashScope。 - - `server_commit`:Engine 不再逐句切分,由 DashScope 对整段文本自行切分。 -- 现在支持在 Agent YAML 中配置 `agent.tools`(列表),用于声明运行时可调用工具。 -- 工具配置示例见 `config/agents/tools.yaml`。 +- 启动阶段不再通过参数加载 Agent YAML。 +- 会话阶段统一按 `assistant_id` 拉取运行时配置: + - 有 `BACKEND_URL`:从 backend API 获取。 + - 无 `BACKEND_URL`(或 `BACKEND_MODE=disabled`):从 `ASSISTANT_LOCAL_CONFIG_DIR/.yaml` 获取。 ## Backend Integration @@ -50,6 +26,7 @@ Engine runtime now supports adapter-based backend integration: - `BACKEND_MODE=auto|http|disabled` - `BACKEND_URL` + `BACKEND_TIMEOUT_SEC` +- `ASSISTANT_LOCAL_CONFIG_DIR` (default `engine/config/agents`) - `HISTORY_ENABLED=true|false` Behavior: @@ -58,6 +35,16 @@ Behavior: - `http`: force HTTP backend; falls back to engine-only mode when URL is missing. - `disabled`: force engine-only mode (no backend calls). +Assistant config source behavior: + +- If `BACKEND_URL` is configured and backend mode is enabled, assistant config is loaded from backend API. +- If `BACKEND_URL` is empty (or backend mode is disabled), assistant config is loaded from local YAML. 
+ +Local assistant YAML example: + +- File path: `engine/config/agents/.yaml` +- Runtime still requires WebSocket query param `assistant_id`; it must match the local file name. + History write path is now asynchronous and buffered per session: - `HISTORY_QUEUE_MAX_SIZE` @@ -84,3 +71,6 @@ python mic_client.py `/ws` uses a strict `v1` JSON control protocol with binary PCM audio frames. See `docs/ws_v1_schema.md`. + +# Reference +* [active-call](https://github.com/restsend/active-call) diff --git a/engine/adapters/__init__.py b/engine/adapters/__init__.py new file mode 100644 index 0000000..6a94df0 --- /dev/null +++ b/engine/adapters/__init__.py @@ -0,0 +1 @@ +"""Adapters package.""" diff --git a/engine/adapters/control_plane/__init__.py b/engine/adapters/control_plane/__init__.py new file mode 100644 index 0000000..06f5004 --- /dev/null +++ b/engine/adapters/control_plane/__init__.py @@ -0,0 +1 @@ +"""Control-plane adapters package.""" diff --git a/engine/adapters/control_plane/backend.py b/engine/adapters/control_plane/backend.py new file mode 100644 index 0000000..bc32cf1 --- /dev/null +++ b/engine/adapters/control_plane/backend.py @@ -0,0 +1,683 @@ +"""Backend adapter implementations for engine integration ports.""" + +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any, Dict, List, Optional + +import aiohttp +from loguru import logger + +from app.config import settings + +try: + import yaml +except ImportError: # pragma: no cover - validated when local YAML source is enabled + yaml = None + + +_ASSISTANT_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$") + + +def _assistant_error(code: str, assistant_id: str) -> Dict[str, Any]: + return {"__error_code": code, "assistantId": str(assistant_id or "")} + + +class NullBackendAdapter: + """No-op adapter for engine-only runtime without backend dependencies.""" + + async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: + _ = 
assistant_id + return None + + async def create_call_record( + self, + *, + user_id: int, + assistant_id: Optional[str], + source: str = "debug", + ) -> Optional[str]: + _ = (user_id, assistant_id, source) + return None + + async def add_transcript( + self, + *, + call_id: str, + turn_index: int, + speaker: str, + content: str, + start_ms: int, + end_ms: int, + confidence: Optional[float] = None, + duration_ms: Optional[int] = None, + ) -> bool: + _ = (call_id, turn_index, speaker, content, start_ms, end_ms, confidence, duration_ms) + return False + + async def finalize_call_record( + self, + *, + call_id: str, + status: str, + duration_seconds: int, + ) -> bool: + _ = (call_id, status, duration_seconds) + return False + + async def search_knowledge_context( + self, + *, + kb_id: str, + query: str, + n_results: int = 5, + ) -> List[Dict[str, Any]]: + _ = (kb_id, query, n_results) + return [] + + async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: + _ = tool_id + return None + + +class HistoryDisabledBackendAdapter: + """Adapter wrapper that disables history writes while keeping reads available.""" + + def __init__(self, delegate: HttpBackendAdapter | NullBackendAdapter): + self._delegate = delegate + + async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: + return await self._delegate.fetch_assistant_config(assistant_id) + + async def create_call_record( + self, + *, + user_id: int, + assistant_id: Optional[str], + source: str = "debug", + ) -> Optional[str]: + _ = (user_id, assistant_id, source) + return None + + async def add_transcript( + self, + *, + call_id: str, + turn_index: int, + speaker: str, + content: str, + start_ms: int, + end_ms: int, + confidence: Optional[float] = None, + duration_ms: Optional[int] = None, + ) -> bool: + _ = (call_id, turn_index, speaker, content, start_ms, end_ms, confidence, duration_ms) + return False + + async def finalize_call_record( + self, + *, + call_id: str, + 
status: str, + duration_seconds: int, + ) -> bool: + _ = (call_id, status, duration_seconds) + return False + + async def search_knowledge_context( + self, + *, + kb_id: str, + query: str, + n_results: int = 5, + ) -> List[Dict[str, Any]]: + return await self._delegate.search_knowledge_context( + kb_id=kb_id, + query=query, + n_results=n_results, + ) + + async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: + return await self._delegate.fetch_tool_resource(tool_id) + + +class LocalYamlAssistantConfigAdapter(NullBackendAdapter): + """Load assistant runtime config from local YAML files.""" + + def __init__(self, config_dir: str): + self._config_dir = self._resolve_base_dir(config_dir) + + @staticmethod + def _resolve_base_dir(config_dir: str) -> Path: + raw = Path(str(config_dir or "").strip() or "engine/config/agents") + if raw.is_absolute(): + return raw.resolve() + + cwd_candidate = (Path.cwd() / raw).resolve() + if cwd_candidate.exists(): + return cwd_candidate + + engine_dir = Path(__file__).resolve().parent.parent + engine_candidate = (engine_dir / raw).resolve() + if engine_candidate.exists(): + return engine_candidate + + parts = raw.parts + if parts and parts[0] == "engine" and len(parts) > 1: + trimmed_candidate = (engine_dir / Path(*parts[1:])).resolve() + if trimmed_candidate.exists(): + return trimmed_candidate + + return cwd_candidate + + def _resolve_config_file(self, assistant_id: str) -> Optional[Path]: + normalized = str(assistant_id or "").strip() + if not _ASSISTANT_ID_PATTERN.match(normalized): + return None + + yaml_path = self._config_dir / f"{normalized}.yaml" + yml_path = self._config_dir / f"{normalized}.yml" + if yaml_path.exists(): + return yaml_path + if yml_path.exists(): + return yml_path + return None + + @staticmethod + def _as_str(value: Any) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + return text or None + + @classmethod + def _translate_agent_schema(cls, assistant_id: 
str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Translate legacy `agent:` YAML schema into runtime assistant metadata.""" + agent = payload.get("agent") + if not isinstance(agent, dict): + return None + + runtime: Dict[str, Any] = { + "assistantId": str(assistant_id), + "services": {}, + } + + llm = agent.get("llm") + if isinstance(llm, dict): + llm_runtime: Dict[str, Any] = {} + if cls._as_str(llm.get("provider")): + llm_runtime["provider"] = cls._as_str(llm.get("provider")) + if cls._as_str(llm.get("model")): + llm_runtime["model"] = cls._as_str(llm.get("model")) + if cls._as_str(llm.get("api_key")): + llm_runtime["apiKey"] = cls._as_str(llm.get("api_key")) + if cls._as_str(llm.get("api_url")): + llm_runtime["baseUrl"] = cls._as_str(llm.get("api_url")) + if cls._as_str(llm.get("app_id")): + llm_runtime["appId"] = cls._as_str(llm.get("app_id")) + if llm_runtime: + runtime["services"]["llm"] = llm_runtime + + tts = agent.get("tts") + if isinstance(tts, dict): + tts_runtime: Dict[str, Any] = {} + if cls._as_str(tts.get("provider")): + tts_runtime["provider"] = cls._as_str(tts.get("provider")) + if cls._as_str(tts.get("model")): + tts_runtime["model"] = cls._as_str(tts.get("model")) + if cls._as_str(tts.get("api_key")): + tts_runtime["apiKey"] = cls._as_str(tts.get("api_key")) + if cls._as_str(tts.get("api_url")): + tts_runtime["baseUrl"] = cls._as_str(tts.get("api_url")) + if cls._as_str(tts.get("voice")): + tts_runtime["voice"] = cls._as_str(tts.get("voice")) + if cls._as_str(tts.get("app_id")): + tts_runtime["appId"] = cls._as_str(tts.get("app_id")) + if cls._as_str(tts.get("resource_id")): + tts_runtime["resourceId"] = cls._as_str(tts.get("resource_id")) + if cls._as_str(tts.get("cluster")): + tts_runtime["cluster"] = cls._as_str(tts.get("cluster")) + if cls._as_str(tts.get("uid")): + tts_runtime["uid"] = cls._as_str(tts.get("uid")) + if tts.get("speed") is not None: + tts_runtime["speed"] = tts.get("speed") + dashscope_mode = 
cls._as_str(tts.get("dashscope_mode")) or cls._as_str(tts.get("mode")) + if dashscope_mode: + tts_runtime["mode"] = dashscope_mode + if tts_runtime: + runtime["services"]["tts"] = tts_runtime + + asr = agent.get("asr") + if isinstance(asr, dict): + asr_runtime: Dict[str, Any] = {} + if cls._as_str(asr.get("provider")): + asr_runtime["provider"] = cls._as_str(asr.get("provider")) + if cls._as_str(asr.get("model")): + asr_runtime["model"] = cls._as_str(asr.get("model")) + if cls._as_str(asr.get("api_key")): + asr_runtime["apiKey"] = cls._as_str(asr.get("api_key")) + if cls._as_str(asr.get("api_url")): + asr_runtime["baseUrl"] = cls._as_str(asr.get("api_url")) + if cls._as_str(asr.get("app_id")): + asr_runtime["appId"] = cls._as_str(asr.get("app_id")) + if cls._as_str(asr.get("resource_id")): + asr_runtime["resourceId"] = cls._as_str(asr.get("resource_id")) + if cls._as_str(asr.get("cluster")): + asr_runtime["cluster"] = cls._as_str(asr.get("cluster")) + if cls._as_str(asr.get("uid")): + asr_runtime["uid"] = cls._as_str(asr.get("uid")) + if isinstance(asr.get("request_params"), dict): + asr_runtime["requestParams"] = dict(asr.get("request_params") or {}) + if asr.get("enable_interim") is not None: + asr_runtime["enableInterim"] = asr.get("enable_interim") + if asr.get("interim_interval_ms") is not None: + asr_runtime["interimIntervalMs"] = asr.get("interim_interval_ms") + if asr.get("min_audio_ms") is not None: + asr_runtime["minAudioMs"] = asr.get("min_audio_ms") + if asr_runtime: + runtime["services"]["asr"] = asr_runtime + + duplex = agent.get("duplex") + if isinstance(duplex, dict): + if cls._as_str(duplex.get("system_prompt")): + runtime["systemPrompt"] = cls._as_str(duplex.get("system_prompt")) + if duplex.get("greeting") is not None: + runtime["greeting"] = duplex.get("greeting") + + barge_in = agent.get("barge_in") + if isinstance(barge_in, dict): + runtime["bargeIn"] = {} + if barge_in.get("min_duration_ms") is not None: + runtime["bargeIn"]["minDurationMs"] 
= barge_in.get("min_duration_ms") + if barge_in.get("silence_tolerance_ms") is not None: + runtime["bargeIn"]["silenceToleranceMs"] = barge_in.get("silence_tolerance_ms") + if not runtime["bargeIn"]: + runtime.pop("bargeIn", None) + + if isinstance(agent.get("tools"), list): + runtime["tools"] = agent.get("tools") + + if not runtime.get("services"): + runtime.pop("services", None) + return runtime + + async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: + config_file = self._resolve_config_file(assistant_id) + if config_file is None: + return _assistant_error("assistant.not_found", assistant_id) + + if yaml is None: + logger.warning( + "Local assistant config requested but PyYAML is unavailable (assistant_id={})", + assistant_id, + ) + return _assistant_error("assistant.config_unavailable", assistant_id) + + try: + with config_file.open("r", encoding="utf-8") as handle: + payload = yaml.safe_load(handle) or {} + except Exception as exc: + logger.warning( + "Failed to read local assistant config {} (assistant_id={}): {}", + config_file, + assistant_id, + exc, + ) + return _assistant_error("assistant.config_unavailable", assistant_id) + + if not isinstance(payload, dict): + logger.warning( + "Local assistant config is not an object (assistant_id={}, file={})", + assistant_id, + config_file, + ) + return _assistant_error("assistant.config_unavailable", assistant_id) + + translated = self._translate_agent_schema(assistant_id, payload) + if translated is not None: + payload = translated + + # Accept either backend-like payload shape or a direct assistant metadata object. 
+ if isinstance(payload.get("assistant"), dict) or isinstance(payload.get("sessionStartMetadata"), dict): + normalized_payload = dict(payload) + else: + normalized_payload = {"assistant": dict(payload)} + + assistant_obj = normalized_payload.get("assistant") + if isinstance(assistant_obj, dict): + resolved_assistant_id = assistant_obj.get("assistantId") or assistant_obj.get("id") or assistant_id + assistant_obj["assistantId"] = str(resolved_assistant_id) + else: + normalized_payload["assistant"] = {"assistantId": str(assistant_id)} + + normalized_payload.setdefault("assistantId", str(assistant_id)) + normalized_payload.setdefault("configVersionId", f"local:{config_file.name}") + return normalized_payload + + +class AssistantConfigSourceAdapter: + """Route assistant config reads by backend availability without changing other APIs.""" + + def __init__( + self, + *, + delegate: HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter, + local_delegate: LocalYamlAssistantConfigAdapter, + use_backend_assistant_config: bool, + ): + self._delegate = delegate + self._local_delegate = local_delegate + self._use_backend_assistant_config = bool(use_backend_assistant_config) + + async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: + if self._use_backend_assistant_config: + return await self._delegate.fetch_assistant_config(assistant_id) + return await self._local_delegate.fetch_assistant_config(assistant_id) + + async def create_call_record( + self, + *, + user_id: int, + assistant_id: Optional[str], + source: str = "debug", + ) -> Optional[str]: + return await self._delegate.create_call_record( + user_id=user_id, + assistant_id=assistant_id, + source=source, + ) + + async def add_transcript( + self, + *, + call_id: str, + turn_index: int, + speaker: str, + content: str, + start_ms: int, + end_ms: int, + confidence: Optional[float] = None, + duration_ms: Optional[int] = None, + ) -> bool: + return await 
self._delegate.add_transcript( + call_id=call_id, + turn_index=turn_index, + speaker=speaker, + content=content, + start_ms=start_ms, + end_ms=end_ms, + confidence=confidence, + duration_ms=duration_ms, + ) + + async def finalize_call_record( + self, + *, + call_id: str, + status: str, + duration_seconds: int, + ) -> bool: + return await self._delegate.finalize_call_record( + call_id=call_id, + status=status, + duration_seconds=duration_seconds, + ) + + async def search_knowledge_context( + self, + *, + kb_id: str, + query: str, + n_results: int = 5, + ) -> List[Dict[str, Any]]: + return await self._delegate.search_knowledge_context( + kb_id=kb_id, + query=query, + n_results=n_results, + ) + + async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: + return await self._delegate.fetch_tool_resource(tool_id) + + +class HttpBackendAdapter: + """HTTP implementation of backend integration ports.""" + + def __init__(self, backend_url: str, timeout_sec: int = 10): + base_url = str(backend_url or "").strip().rstrip("/") + if not base_url: + raise ValueError("backend_url is required for HttpBackendAdapter") + self._base_url = base_url + self._timeout_sec = timeout_sec + + def _timeout(self) -> aiohttp.ClientTimeout: + return aiohttp.ClientTimeout(total=self._timeout_sec) + + async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: + """Fetch assistant config payload from backend API. 
+ + Expected response shape: + { + "assistant": {...}, + "voice": {...} | null + } + """ + url = f"{self._base_url}/api/assistants/{assistant_id}/config" + + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.get(url) as resp: + if resp.status == 404: + logger.warning(f"Assistant config not found: {assistant_id}") + return {"__error_code": "assistant.not_found", "assistantId": assistant_id} + resp.raise_for_status() + payload = await resp.json() + if not isinstance(payload, dict): + logger.warning("Assistant config payload is not a dict; ignoring") + return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id} + return payload + except Exception as exc: + logger.warning(f"Failed to fetch assistant config ({assistant_id}): {exc}") + return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id} + + async def create_call_record( + self, + *, + user_id: int, + assistant_id: Optional[str], + source: str = "debug", + ) -> Optional[str]: + """Create a call record via backend history API and return call_id.""" + url = f"{self._base_url}/api/history" + payload: Dict[str, Any] = { + "user_id": user_id, + "assistant_id": assistant_id, + "source": source, + "status": "connected", + } + + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.post(url, json=payload) as resp: + resp.raise_for_status() + data = await resp.json() + call_id = str((data or {}).get("id") or "") + return call_id or None + except Exception as exc: + logger.warning(f"Failed to create history call record: {exc}") + return None + + async def add_transcript( + self, + *, + call_id: str, + turn_index: int, + speaker: str, + content: str, + start_ms: int, + end_ms: int, + confidence: Optional[float] = None, + duration_ms: Optional[int] = None, + ) -> bool: + """Append a transcript segment to backend history.""" + if not call_id: + return False + + url = 
f"{self._base_url}/api/history/{call_id}/transcripts" + payload: Dict[str, Any] = { + "turn_index": turn_index, + "speaker": speaker, + "content": content, + "confidence": confidence, + "start_ms": start_ms, + "end_ms": end_ms, + "duration_ms": duration_ms, + } + + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.post(url, json=payload) as resp: + resp.raise_for_status() + return True + except Exception as exc: + logger.warning(f"Failed to append history transcript (call_id={call_id}, turn={turn_index}): {exc}") + return False + + async def finalize_call_record( + self, + *, + call_id: str, + status: str, + duration_seconds: int, + ) -> bool: + """Finalize a call record with status and duration.""" + if not call_id: + return False + + url = f"{self._base_url}/api/history/{call_id}" + payload: Dict[str, Any] = { + "status": status, + "duration_seconds": duration_seconds, + } + + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.put(url, json=payload) as resp: + resp.raise_for_status() + return True + except Exception as exc: + logger.warning(f"Failed to finalize history call record ({call_id}): {exc}") + return False + + async def search_knowledge_context( + self, + *, + kb_id: str, + query: str, + n_results: int = 5, + ) -> List[Dict[str, Any]]: + """Search backend knowledge base and return retrieval results.""" + if not kb_id or not query.strip(): + return [] + try: + safe_n_results = max(1, int(n_results)) + except (TypeError, ValueError): + safe_n_results = 5 + + url = f"{self._base_url}/api/knowledge/search" + payload: Dict[str, Any] = { + "kb_id": kb_id, + "query": query, + "nResults": safe_n_results, + } + + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.post(url, json=payload) as resp: + if resp.status == 404: + logger.warning(f"Knowledge base not found for retrieval: {kb_id}") + return [] + 
resp.raise_for_status() + data = await resp.json() + if not isinstance(data, dict): + return [] + results = data.get("results", []) + if not isinstance(results, list): + return [] + return [r for r in results if isinstance(r, dict)] + except Exception as exc: + logger.warning(f"Knowledge search failed (kb_id={kb_id}): {exc}") + return [] + + async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: + """Fetch tool resource configuration from backend API.""" + if not tool_id: + return None + + url = f"{self._base_url}/api/tools/resources/{tool_id}" + try: + async with aiohttp.ClientSession(timeout=self._timeout()) as session: + async with session.get(url) as resp: + if resp.status == 404: + return None + resp.raise_for_status() + data = await resp.json() + return data if isinstance(data, dict) else None + except Exception as exc: + logger.warning(f"Failed to fetch tool resource ({tool_id}): {exc}") + return None + + +def build_backend_adapter( + *, + backend_url: Optional[str], + backend_mode: str = "auto", + history_enabled: bool = True, + timeout_sec: int = 10, + assistant_local_config_dir: str = "engine/config/agents", +) -> AssistantConfigSourceAdapter: + """Create backend adapter implementation based on runtime settings.""" + mode = str(backend_mode or "auto").strip().lower() + has_url = bool(str(backend_url or "").strip()) + + base_adapter: HttpBackendAdapter | NullBackendAdapter + using_http_backend = False + if mode in {"disabled", "off", "none", "null", "engine_only", "engine-only"}: + base_adapter = NullBackendAdapter() + elif mode == "http": + if has_url: + base_adapter = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec) + using_http_backend = True + else: + logger.warning("BACKEND_MODE=http but BACKEND_URL is empty; falling back to NullBackendAdapter") + base_adapter = NullBackendAdapter() + else: + if has_url: + base_adapter = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec) + 
using_http_backend = True + else: + base_adapter = NullBackendAdapter() + + runtime_adapter: HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter + if not history_enabled: + runtime_adapter = HistoryDisabledBackendAdapter(base_adapter) + else: + runtime_adapter = base_adapter + + return AssistantConfigSourceAdapter( + delegate=runtime_adapter, + local_delegate=LocalYamlAssistantConfigAdapter(assistant_local_config_dir), + use_backend_assistant_config=using_http_backend, + ) + + +def build_backend_adapter_from_settings() -> AssistantConfigSourceAdapter: + """Create backend adapter using current app settings.""" + return build_backend_adapter( + backend_url=settings.backend_url, + backend_mode=settings.backend_mode, + history_enabled=settings.history_enabled, + timeout_sec=settings.backend_timeout_sec, + assistant_local_config_dir=settings.assistant_local_config_dir, + ) diff --git a/engine/app/backend_adapters.py b/engine/app/backend_adapters.py deleted file mode 100644 index 6ff2716..0000000 --- a/engine/app/backend_adapters.py +++ /dev/null @@ -1,357 +0,0 @@ -"""Backend adapter implementations for engine integration ports.""" - -from __future__ import annotations - -from typing import Any, Dict, List, Optional - -import aiohttp -from loguru import logger - -from app.config import settings - - -class NullBackendAdapter: - """No-op adapter for engine-only runtime without backend dependencies.""" - - async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: - _ = assistant_id - return None - - async def create_call_record( - self, - *, - user_id: int, - assistant_id: Optional[str], - source: str = "debug", - ) -> Optional[str]: - _ = (user_id, assistant_id, source) - return None - - async def add_transcript( - self, - *, - call_id: str, - turn_index: int, - speaker: str, - content: str, - start_ms: int, - end_ms: int, - confidence: Optional[float] = None, - duration_ms: Optional[int] = None, - ) -> bool: - _ = (call_id, 
turn_index, speaker, content, start_ms, end_ms, confidence, duration_ms) - return False - - async def finalize_call_record( - self, - *, - call_id: str, - status: str, - duration_seconds: int, - ) -> bool: - _ = (call_id, status, duration_seconds) - return False - - async def search_knowledge_context( - self, - *, - kb_id: str, - query: str, - n_results: int = 5, - ) -> List[Dict[str, Any]]: - _ = (kb_id, query, n_results) - return [] - - async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: - _ = tool_id - return None - - -class HistoryDisabledBackendAdapter: - """Adapter wrapper that disables history writes while keeping reads available.""" - - def __init__(self, delegate: HttpBackendAdapter | NullBackendAdapter): - self._delegate = delegate - - async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: - return await self._delegate.fetch_assistant_config(assistant_id) - - async def create_call_record( - self, - *, - user_id: int, - assistant_id: Optional[str], - source: str = "debug", - ) -> Optional[str]: - _ = (user_id, assistant_id, source) - return None - - async def add_transcript( - self, - *, - call_id: str, - turn_index: int, - speaker: str, - content: str, - start_ms: int, - end_ms: int, - confidence: Optional[float] = None, - duration_ms: Optional[int] = None, - ) -> bool: - _ = (call_id, turn_index, speaker, content, start_ms, end_ms, confidence, duration_ms) - return False - - async def finalize_call_record( - self, - *, - call_id: str, - status: str, - duration_seconds: int, - ) -> bool: - _ = (call_id, status, duration_seconds) - return False - - async def search_knowledge_context( - self, - *, - kb_id: str, - query: str, - n_results: int = 5, - ) -> List[Dict[str, Any]]: - return await self._delegate.search_knowledge_context( - kb_id=kb_id, - query=query, - n_results=n_results, - ) - - async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: - return await 
self._delegate.fetch_tool_resource(tool_id) - - -class HttpBackendAdapter: - """HTTP implementation of backend integration ports.""" - - def __init__(self, backend_url: str, timeout_sec: int = 10): - base_url = str(backend_url or "").strip().rstrip("/") - if not base_url: - raise ValueError("backend_url is required for HttpBackendAdapter") - self._base_url = base_url - self._timeout_sec = timeout_sec - - def _timeout(self) -> aiohttp.ClientTimeout: - return aiohttp.ClientTimeout(total=self._timeout_sec) - - async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: - """Fetch assistant config payload from backend API. - - Expected response shape: - { - "assistant": {...}, - "voice": {...} | null - } - """ - url = f"{self._base_url}/api/assistants/{assistant_id}/config" - - try: - async with aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.get(url) as resp: - if resp.status == 404: - logger.warning(f"Assistant config not found: {assistant_id}") - return {"__error_code": "assistant.not_found", "assistantId": assistant_id} - resp.raise_for_status() - payload = await resp.json() - if not isinstance(payload, dict): - logger.warning("Assistant config payload is not a dict; ignoring") - return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id} - return payload - except Exception as exc: - logger.warning(f"Failed to fetch assistant config ({assistant_id}): {exc}") - return {"__error_code": "assistant.config_unavailable", "assistantId": assistant_id} - - async def create_call_record( - self, - *, - user_id: int, - assistant_id: Optional[str], - source: str = "debug", - ) -> Optional[str]: - """Create a call record via backend history API and return call_id.""" - url = f"{self._base_url}/api/history" - payload: Dict[str, Any] = { - "user_id": user_id, - "assistant_id": assistant_id, - "source": source, - "status": "connected", - } - - try: - async with 
aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.post(url, json=payload) as resp: - resp.raise_for_status() - data = await resp.json() - call_id = str((data or {}).get("id") or "") - return call_id or None - except Exception as exc: - logger.warning(f"Failed to create history call record: {exc}") - return None - - async def add_transcript( - self, - *, - call_id: str, - turn_index: int, - speaker: str, - content: str, - start_ms: int, - end_ms: int, - confidence: Optional[float] = None, - duration_ms: Optional[int] = None, - ) -> bool: - """Append a transcript segment to backend history.""" - if not call_id: - return False - - url = f"{self._base_url}/api/history/{call_id}/transcripts" - payload: Dict[str, Any] = { - "turn_index": turn_index, - "speaker": speaker, - "content": content, - "confidence": confidence, - "start_ms": start_ms, - "end_ms": end_ms, - "duration_ms": duration_ms, - } - - try: - async with aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.post(url, json=payload) as resp: - resp.raise_for_status() - return True - except Exception as exc: - logger.warning(f"Failed to append history transcript (call_id={call_id}, turn={turn_index}): {exc}") - return False - - async def finalize_call_record( - self, - *, - call_id: str, - status: str, - duration_seconds: int, - ) -> bool: - """Finalize a call record with status and duration.""" - if not call_id: - return False - - url = f"{self._base_url}/api/history/{call_id}" - payload: Dict[str, Any] = { - "status": status, - "duration_seconds": duration_seconds, - } - - try: - async with aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.put(url, json=payload) as resp: - resp.raise_for_status() - return True - except Exception as exc: - logger.warning(f"Failed to finalize history call record ({call_id}): {exc}") - return False - - async def search_knowledge_context( - self, - *, - kb_id: str, - query: str, - n_results: 
int = 5, - ) -> List[Dict[str, Any]]: - """Search backend knowledge base and return retrieval results.""" - if not kb_id or not query.strip(): - return [] - try: - safe_n_results = max(1, int(n_results)) - except (TypeError, ValueError): - safe_n_results = 5 - - url = f"{self._base_url}/api/knowledge/search" - payload: Dict[str, Any] = { - "kb_id": kb_id, - "query": query, - "nResults": safe_n_results, - } - - try: - async with aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.post(url, json=payload) as resp: - if resp.status == 404: - logger.warning(f"Knowledge base not found for retrieval: {kb_id}") - return [] - resp.raise_for_status() - data = await resp.json() - if not isinstance(data, dict): - return [] - results = data.get("results", []) - if not isinstance(results, list): - return [] - return [r for r in results if isinstance(r, dict)] - except Exception as exc: - logger.warning(f"Knowledge search failed (kb_id={kb_id}): {exc}") - return [] - - async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: - """Fetch tool resource configuration from backend API.""" - if not tool_id: - return None - - url = f"{self._base_url}/api/tools/resources/{tool_id}" - try: - async with aiohttp.ClientSession(timeout=self._timeout()) as session: - async with session.get(url) as resp: - if resp.status == 404: - return None - resp.raise_for_status() - data = await resp.json() - return data if isinstance(data, dict) else None - except Exception as exc: - logger.warning(f"Failed to fetch tool resource ({tool_id}): {exc}") - return None - - -def build_backend_adapter( - *, - backend_url: Optional[str], - backend_mode: str = "auto", - history_enabled: bool = True, - timeout_sec: int = 10, -) -> HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter: - """Create backend adapter implementation based on runtime settings.""" - mode = str(backend_mode or "auto").strip().lower() - has_url = bool(str(backend_url or 
"").strip()) - - base_adapter: HttpBackendAdapter | NullBackendAdapter - if mode in {"disabled", "off", "none", "null", "engine_only", "engine-only"}: - base_adapter = NullBackendAdapter() - elif mode == "http": - if has_url: - base_adapter = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec) - else: - logger.warning("BACKEND_MODE=http but BACKEND_URL is empty; falling back to NullBackendAdapter") - base_adapter = NullBackendAdapter() - else: - if has_url: - base_adapter = HttpBackendAdapter(backend_url=str(backend_url), timeout_sec=timeout_sec) - else: - base_adapter = NullBackendAdapter() - - if not history_enabled: - return HistoryDisabledBackendAdapter(base_adapter) - return base_adapter - - -def build_backend_adapter_from_settings() -> HttpBackendAdapter | NullBackendAdapter | HistoryDisabledBackendAdapter: - """Create backend adapter using current app settings.""" - return build_backend_adapter( - backend_url=settings.backend_url, - backend_mode=settings.backend_mode, - history_enabled=settings.history_enabled, - timeout_sec=settings.backend_timeout_sec, - ) diff --git a/engine/app/backend_client.py b/engine/app/backend_client.py deleted file mode 100644 index 93ea183..0000000 --- a/engine/app/backend_client.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Compatibility wrappers around backend adapter implementations.""" - -from __future__ import annotations - -from typing import Any, Dict, List, Optional - -from app.backend_adapters import build_backend_adapter_from_settings - - -def _adapter(): - return build_backend_adapter_from_settings() - - -async def fetch_assistant_config(assistant_id: str) -> Optional[Dict[str, Any]]: - """Fetch assistant config payload from backend adapter.""" - return await _adapter().fetch_assistant_config(assistant_id) - - -async def create_history_call_record( - *, - user_id: int, - assistant_id: Optional[str], - source: str = "debug", -) -> Optional[str]: - """Create a call record via backend history API and return 
call_id.""" - return await _adapter().create_call_record( - user_id=user_id, - assistant_id=assistant_id, - source=source, - ) - - -async def add_history_transcript( - *, - call_id: str, - turn_index: int, - speaker: str, - content: str, - start_ms: int, - end_ms: int, - confidence: Optional[float] = None, - duration_ms: Optional[int] = None, -) -> bool: - """Append a transcript segment to backend history.""" - return await _adapter().add_transcript( - call_id=call_id, - turn_index=turn_index, - speaker=speaker, - content=content, - start_ms=start_ms, - end_ms=end_ms, - confidence=confidence, - duration_ms=duration_ms, - ) - - -async def finalize_history_call_record( - *, - call_id: str, - status: str, - duration_seconds: int, -) -> bool: - """Finalize a call record with status and duration.""" - return await _adapter().finalize_call_record( - call_id=call_id, - status=status, - duration_seconds=duration_seconds, - ) - - -async def search_knowledge_context( - *, - kb_id: str, - query: str, - n_results: int = 5, -) -> List[Dict[str, Any]]: - """Search backend knowledge base and return retrieval results.""" - return await _adapter().search_knowledge_context( - kb_id=kb_id, - query=query, - n_results=n_results, - ) - - -async def fetch_tool_resource(tool_id: str) -> Optional[Dict[str, Any]]: - """Fetch tool resource configuration from backend API.""" - return await _adapter().fetch_tool_resource(tool_id) diff --git a/engine/app/config.py b/engine/app/config.py index e81b852..c5f8902 100644 --- a/engine/app/config.py +++ b/engine/app/config.py @@ -1,371 +1,31 @@ -"""Configuration management using Pydantic settings and agent YAML profiles.""" +"""Configuration management using Pydantic settings.""" import json import os -import re -import sys -from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional from pydantic import Field from pydantic_settings import BaseSettings, 
SettingsConfigDict try: - import yaml -except ImportError: # pragma: no cover - validated when agent YAML is used - yaml = None + from dotenv import load_dotenv +except ImportError: # pragma: no cover - optional dependency in some runtimes + load_dotenv = None + +def _prime_process_env_from_dotenv() -> None: + """Load .env into process env early.""" + if load_dotenv is None: + return + + cwd_env = Path.cwd() / ".env" + engine_env = Path(__file__).resolve().parent.parent / ".env" + load_dotenv(dotenv_path=cwd_env, override=False) + if engine_env != cwd_env: + load_dotenv(dotenv_path=engine_env, override=False) -_ENV_REF_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}") -_DEFAULT_AGENT_CONFIG_DIR = "config/agents" -_DEFAULT_AGENT_CONFIG_FILE = "default.yaml" -_AGENT_SECTION_KEY_MAP: Dict[str, Dict[str, str]] = { - "vad": { - "type": "vad_type", - "model_path": "vad_model_path", - "threshold": "vad_threshold", - "min_speech_duration_ms": "vad_min_speech_duration_ms", - "eou_threshold_ms": "vad_eou_threshold_ms", - }, - "llm": { - "provider": "llm_provider", - "model": "llm_model", - "temperature": "llm_temperature", - "api_key": "llm_api_key", - "api_url": "llm_api_url", - }, - "tts": { - "provider": "tts_provider", - "api_key": "tts_api_key", - "api_url": "tts_api_url", - "model": "tts_model", - "voice": "tts_voice", - "dashscope_mode": "tts_mode", - "mode": "tts_mode", - "speed": "tts_speed", - }, - "asr": { - "provider": "asr_provider", - "api_key": "asr_api_key", - "api_url": "asr_api_url", - "model": "asr_model", - "interim_interval_ms": "asr_interim_interval_ms", - "min_audio_ms": "asr_min_audio_ms", - "start_min_speech_ms": "asr_start_min_speech_ms", - "pre_speech_ms": "asr_pre_speech_ms", - "final_tail_ms": "asr_final_tail_ms", - }, - "duplex": { - "enabled": "duplex_enabled", - "greeting": "duplex_greeting", - "system_prompt": "duplex_system_prompt", - "opener_audio_file": "duplex_opener_audio_file", - }, - "barge_in": { - 
"min_duration_ms": "barge_in_min_duration_ms", - "silence_tolerance_ms": "barge_in_silence_tolerance_ms", - }, -} -_AGENT_SETTING_KEYS = { - "vad_type", - "vad_model_path", - "vad_threshold", - "vad_min_speech_duration_ms", - "vad_eou_threshold_ms", - "llm_provider", - "llm_api_key", - "llm_api_url", - "llm_model", - "llm_temperature", - "tts_provider", - "tts_api_key", - "tts_api_url", - "tts_model", - "tts_voice", - "tts_mode", - "tts_speed", - "asr_provider", - "asr_api_key", - "asr_api_url", - "asr_model", - "asr_interim_interval_ms", - "asr_min_audio_ms", - "asr_start_min_speech_ms", - "asr_pre_speech_ms", - "asr_final_tail_ms", - "duplex_enabled", - "duplex_greeting", - "duplex_system_prompt", - "duplex_opener_audio_file", - "barge_in_min_duration_ms", - "barge_in_silence_tolerance_ms", - "tools", -} -_BASE_REQUIRED_AGENT_SETTING_KEYS = { - "vad_type", - "vad_model_path", - "vad_threshold", - "vad_min_speech_duration_ms", - "vad_eou_threshold_ms", - "llm_provider", - "llm_model", - "llm_temperature", - "tts_provider", - "tts_voice", - "tts_speed", - "asr_provider", - "asr_interim_interval_ms", - "asr_min_audio_ms", - "asr_start_min_speech_ms", - "asr_pre_speech_ms", - "asr_final_tail_ms", - "duplex_enabled", - "duplex_system_prompt", - "barge_in_min_duration_ms", - "barge_in_silence_tolerance_ms", -} -_OPENAI_COMPATIBLE_LLM_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"} -_OPENAI_COMPATIBLE_TTS_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"} -_DASHSCOPE_TTS_PROVIDERS = {"dashscope"} -_OPENAI_COMPATIBLE_ASR_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"} - - -def _normalized_provider(overrides: Dict[str, Any], key: str, default: str) -> str: - return str(overrides.get(key) or default).strip().lower() - - -def _is_blank(value: Any) -> bool: - return value is None or (isinstance(value, str) and not value.strip()) - - -@dataclass(frozen=True) -class AgentConfigSelection: - """Resolved agent config 
location and how it was selected.""" - - path: Optional[Path] - source: str - - -def _parse_cli_agent_args(argv: List[str]) -> Tuple[Optional[str], Optional[str]]: - """Parse only agent-related CLI flags from argv.""" - config_path: Optional[str] = None - profile: Optional[str] = None - i = 0 - while i < len(argv): - arg = argv[i] - if arg.startswith("--agent-config="): - config_path = arg.split("=", 1)[1].strip() or None - elif arg == "--agent-config" and i + 1 < len(argv): - config_path = argv[i + 1].strip() or None - i += 1 - elif arg.startswith("--agent-profile="): - profile = arg.split("=", 1)[1].strip() or None - elif arg == "--agent-profile" and i + 1 < len(argv): - profile = argv[i + 1].strip() or None - i += 1 - i += 1 - return config_path, profile - - -def _agent_config_dir() -> Path: - base_dir = Path(os.getenv("AGENT_CONFIG_DIR", _DEFAULT_AGENT_CONFIG_DIR)) - if not base_dir.is_absolute(): - base_dir = Path.cwd() / base_dir - return base_dir.resolve() - - -def _resolve_agent_selection( - agent_config_path: Optional[str] = None, - agent_profile: Optional[str] = None, - argv: Optional[List[str]] = None, -) -> AgentConfigSelection: - cli_path, cli_profile = _parse_cli_agent_args(list(argv if argv is not None else sys.argv[1:])) - path_value = agent_config_path or cli_path or os.getenv("AGENT_CONFIG_PATH") - profile_value = agent_profile or cli_profile or os.getenv("AGENT_PROFILE") - source = "none" - candidate: Optional[Path] = None - - if path_value: - source = "cli_path" if (agent_config_path or cli_path) else "env_path" - candidate = Path(path_value) - elif profile_value: - source = "cli_profile" if (agent_profile or cli_profile) else "env_profile" - candidate = _agent_config_dir() / f"{profile_value}.yaml" - else: - fallback = _agent_config_dir() / _DEFAULT_AGENT_CONFIG_FILE - if fallback.exists(): - source = "default" - candidate = fallback - - if candidate is None: - raise ValueError( - "Agent YAML config is required. 
Provide --agent-config/--agent-profile " - "or create config/agents/default.yaml." - ) - - if not candidate.is_absolute(): - candidate = (Path.cwd() / candidate).resolve() - else: - candidate = candidate.resolve() - - if not candidate.exists(): - raise ValueError(f"Agent config file not found ({source}): {candidate}") - if not candidate.is_file(): - raise ValueError(f"Agent config path is not a file: {candidate}") - return AgentConfigSelection(path=candidate, source=source) - - -def _resolve_env_refs(value: Any) -> Any: - """Resolve ${ENV_VAR} / ${ENV_VAR:default} placeholders recursively.""" - if isinstance(value, dict): - return {k: _resolve_env_refs(v) for k, v in value.items()} - if isinstance(value, list): - return [_resolve_env_refs(item) for item in value] - if not isinstance(value, str) or "${" not in value: - return value - - def _replace(match: re.Match[str]) -> str: - env_key = match.group(1) - default_value = match.group(2) - env_value = os.getenv(env_key) - if env_value is None: - if default_value is None: - raise ValueError(f"Missing environment variable referenced in agent YAML: {env_key}") - return default_value - return env_value - - return _ENV_REF_PATTERN.sub(_replace, value) - - -def _normalize_agent_overrides(raw: Dict[str, Any]) -> Dict[str, Any]: - """Normalize YAML into flat Settings fields.""" - normalized: Dict[str, Any] = {} - - for key, value in raw.items(): - if key == "siliconflow": - raise ValueError( - "Section 'siliconflow' is no longer supported. " - "Move provider-specific fields into agent.llm / agent.asr / agent.tts." 
- ) - if key == "tools": - if not isinstance(value, list): - raise ValueError("Agent config key 'tools' must be a list") - normalized["tools"] = value - continue - section_map = _AGENT_SECTION_KEY_MAP.get(key) - if section_map is None: - normalized[key] = value - continue - - if not isinstance(value, dict): - raise ValueError(f"Agent config section '{key}' must be a mapping") - - for nested_key, nested_value in value.items(): - mapped_key = section_map.get(nested_key) - if mapped_key is None: - raise ValueError(f"Unknown key in '{key}' section: '{nested_key}'") - normalized[mapped_key] = nested_value - - unknown_keys = sorted(set(normalized) - _AGENT_SETTING_KEYS) - if unknown_keys: - raise ValueError( - "Unknown agent config keys in YAML: " - + ", ".join(unknown_keys) - ) - return normalized - - -def _missing_required_keys(overrides: Dict[str, Any]) -> List[str]: - missing = set(_BASE_REQUIRED_AGENT_SETTING_KEYS - set(overrides)) - string_required = { - "vad_type", - "vad_model_path", - "llm_provider", - "llm_model", - "tts_provider", - "tts_voice", - "asr_provider", - "duplex_system_prompt", - } - for key in string_required: - if key in overrides and _is_blank(overrides.get(key)): - missing.add(key) - - llm_provider = _normalized_provider(overrides, "llm_provider", "openai") - if llm_provider in _OPENAI_COMPATIBLE_LLM_PROVIDERS or llm_provider == "openai": - if "llm_api_key" not in overrides or _is_blank(overrides.get("llm_api_key")): - missing.add("llm_api_key") - - tts_provider = _normalized_provider(overrides, "tts_provider", "openai_compatible") - if tts_provider in _OPENAI_COMPATIBLE_TTS_PROVIDERS: - if "tts_api_key" not in overrides or _is_blank(overrides.get("tts_api_key")): - missing.add("tts_api_key") - if "tts_api_url" not in overrides or _is_blank(overrides.get("tts_api_url")): - missing.add("tts_api_url") - if "tts_model" not in overrides or _is_blank(overrides.get("tts_model")): - missing.add("tts_model") - elif tts_provider in 
_DASHSCOPE_TTS_PROVIDERS: - if "tts_api_key" not in overrides or _is_blank(overrides.get("tts_api_key")): - missing.add("tts_api_key") - - asr_provider = _normalized_provider(overrides, "asr_provider", "openai_compatible") - if asr_provider in _OPENAI_COMPATIBLE_ASR_PROVIDERS: - if "asr_api_key" not in overrides or _is_blank(overrides.get("asr_api_key")): - missing.add("asr_api_key") - if "asr_api_url" not in overrides or _is_blank(overrides.get("asr_api_url")): - missing.add("asr_api_url") - if "asr_model" not in overrides or _is_blank(overrides.get("asr_model")): - missing.add("asr_model") - - return sorted(missing) - - -def _load_agent_overrides(selection: AgentConfigSelection) -> Dict[str, Any]: - if yaml is None: - raise RuntimeError( - "PyYAML is required for agent YAML configuration. Install with: pip install pyyaml" - ) - - with selection.path.open("r", encoding="utf-8") as file: - raw = yaml.safe_load(file) or {} - - if not isinstance(raw, dict): - raise ValueError(f"Agent config must be a YAML mapping: {selection.path}") - - if "agent" in raw: - agent_value = raw["agent"] - if not isinstance(agent_value, dict): - raise ValueError("The 'agent' key in YAML must be a mapping") - raw = agent_value - - resolved = _resolve_env_refs(raw) - overrides = _normalize_agent_overrides(resolved) - missing_required = _missing_required_keys(overrides) - if missing_required: - raise ValueError( - f"Missing required agent settings in YAML ({selection.path}): " - + ", ".join(missing_required) - ) - - overrides["agent_config_path"] = str(selection.path) - overrides["agent_config_source"] = selection.source - return overrides - - -def load_settings( - agent_config_path: Optional[str] = None, - agent_profile: Optional[str] = None, - argv: Optional[List[str]] = None, -) -> "Settings": - """Load settings from .env and optional agent YAML.""" - selection = _resolve_agent_selection( - agent_config_path=agent_config_path, - agent_profile=agent_profile, - argv=argv, - ) - 
agent_overrides = _load_agent_overrides(selection) - return Settings(**agent_overrides) +_prime_process_env_from_dotenv() class Settings(BaseSettings): @@ -402,9 +62,8 @@ class Settings(BaseSettings): # LLM Configuration llm_provider: str = Field( default="openai", - description="LLM provider (openai, openai_compatible, siliconflow)" + description="LLM provider (openai, openai_compatible, siliconflow, fastgpt)" ) - llm_api_key: Optional[str] = Field(default=None, description="LLM provider API key") llm_api_url: Optional[str] = Field(default=None, description="LLM provider API base URL") llm_model: str = Field(default="gpt-4o-mini", description="LLM model name") llm_temperature: float = Field(default=0.7, description="LLM temperature for response generation") @@ -412,12 +71,15 @@ class Settings(BaseSettings): # TTS Configuration tts_provider: str = Field( default="openai_compatible", - description="TTS provider (edge, openai_compatible, siliconflow, dashscope)" + description="TTS provider (openai_compatible, siliconflow, dashscope, volcengine)" ) - tts_api_key: Optional[str] = Field(default=None, description="TTS provider API key") tts_api_url: Optional[str] = Field(default=None, description="TTS provider API URL") tts_model: Optional[str] = Field(default=None, description="TTS model name") tts_voice: str = Field(default="anna", description="TTS voice name") + tts_app_id: Optional[str] = Field(default=None, description="Provider-specific TTS app ID") + tts_resource_id: Optional[str] = Field(default=None, description="Provider-specific TTS resource ID") + tts_cluster: Optional[str] = Field(default=None, description="Provider-specific TTS cluster") + tts_uid: Optional[str] = Field(default=None, description="Provider-specific TTS user ID") tts_mode: str = Field( default="commit", description="DashScope-only TTS mode (commit, server_commit). Ignored for non-dashscope providers." 
@@ -427,11 +89,19 @@ class Settings(BaseSettings): # ASR Configuration asr_provider: str = Field( default="openai_compatible", - description="ASR provider (openai_compatible, buffered, siliconflow)" + description="ASR provider (openai_compatible, buffered, siliconflow, dashscope, volcengine)" ) - asr_api_key: Optional[str] = Field(default=None, description="ASR provider API key") asr_api_url: Optional[str] = Field(default=None, description="ASR provider API URL") asr_model: Optional[str] = Field(default=None, description="ASR model name") + asr_app_id: Optional[str] = Field(default=None, description="Provider-specific ASR app ID") + asr_resource_id: Optional[str] = Field(default=None, description="Provider-specific ASR resource ID") + asr_cluster: Optional[str] = Field(default=None, description="Provider-specific ASR cluster") + asr_uid: Optional[str] = Field(default=None, description="Provider-specific ASR user ID") + asr_request_params_json: Optional[str] = Field( + default=None, + description="Provider-specific ASR request params as JSON string" + ) + asr_enable_interim: bool = Field(default=False, description="Enable interim transcripts for offline ASR") asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms") asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result") asr_start_min_speech_ms: int = Field( @@ -505,6 +175,10 @@ class Settings(BaseSettings): ) backend_url: Optional[str] = Field(default=None, description="Backend API base URL (e.g. 
http://localhost:8787)") backend_timeout_sec: int = Field(default=10, description="Backend API request timeout in seconds") + assistant_local_config_dir: str = Field( + default="engine/config/agents", + description="Directory containing local assistant runtime YAML files" + ) history_enabled: bool = Field(default=True, description="Enable history write bridge") history_default_user_id: int = Field(default=1, description="Fallback user_id for history records") history_queue_max_size: int = Field(default=256, description="Max buffered transcript writes per session") @@ -515,10 +189,6 @@ class Settings(BaseSettings): description="Max wait before finalizing history when queue is still draining" ) - # Agent YAML metadata - agent_config_path: Optional[str] = Field(default=None, description="Resolved agent YAML path") - agent_config_source: str = Field(default="none", description="How the agent YAML was selected") - @property def chunk_size_bytes(self) -> int: """Calculate chunk size in bytes based on sample rate and duration.""" @@ -543,7 +213,7 @@ class Settings(BaseSettings): # Global settings instance -settings = load_settings() +settings = Settings() def get_settings() -> Settings: diff --git a/engine/app/main.py b/engine/app/main.py index b8a39bb..d93875e 100644 --- a/engine/app/main.py +++ b/engine/app/main.py @@ -20,16 +20,28 @@ except ImportError: logger.warning("aiortc not available - WebRTC endpoint will be disabled") from app.config import settings -from app.backend_adapters import build_backend_adapter_from_settings -from core.transports import SocketTransport, WebRtcTransport, BaseTransport -from core.session import Session +from adapters.control_plane.backend import build_backend_adapter_from_settings +from runtime.transports import SocketTransport, WebRtcTransport, BaseTransport +from runtime.session.manager import Session from processors.tracks import Resampled16kTrack -from core.events import get_event_bus, reset_event_bus +from runtime.events import 
get_event_bus, reset_event_bus # Check interval for heartbeat/timeout (seconds) _HEARTBEAT_CHECK_INTERVAL_SEC = 5 +def _inactivity_deadline( + *, + last_received_at: float, + inactivity_timeout_sec: int, + pending_client_tool_deadline: Optional[float] = None, +) -> float: + deadline = float(last_received_at) + float(inactivity_timeout_sec) + if pending_client_tool_deadline is not None: + deadline = max(deadline, float(pending_client_tool_deadline)) + return deadline + + async def heartbeat_and_timeout_task( transport: BaseTransport, session: Session, @@ -48,8 +60,22 @@ async def heartbeat_and_timeout_task( if transport.is_closed: break now = time.monotonic() - if now - last_received_at[0] > inactivity_timeout_sec: - logger.info(f"Session {session_id}: {inactivity_timeout_sec}s no message, closing") + pending_client_tool_deadline = session.pipeline.pending_client_tool_deadline() + idle_deadline = _inactivity_deadline( + last_received_at=last_received_at[0], + inactivity_timeout_sec=inactivity_timeout_sec, + pending_client_tool_deadline=pending_client_tool_deadline, + ) + if now > idle_deadline: + if pending_client_tool_deadline is not None and pending_client_tool_deadline >= ( + last_received_at[0] + inactivity_timeout_sec + ): + logger.info( + "Session {}: no message before pending client tool deadline, closing", + session_id, + ) + else: + logger.info(f"Session {session_id}: {inactivity_timeout_sec}s no message, closing") await session.cleanup() break if now - last_heartbeat_at[0] >= heartbeat_interval_sec: @@ -76,22 +102,39 @@ app.add_middleware( # Active sessions storage active_sessions: Dict[str, Session] = {} -backend_gateway = build_backend_adapter_from_settings() +control_plane_gateway = build_backend_adapter_from_settings() # Configure logging logger.remove() -logger.add( - "./logs/active_call_{time}.log", - rotation="1 day", - retention="7 days", - level=settings.log_level, - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - 
{message}" -) -logger.add( - lambda msg: print(msg, end=""), - level=settings.log_level, - format="{time:HH:mm:ss} | {level: <8} | {message}" -) +_log_format = str(settings.log_format or "text").strip().lower() +if _log_format == "json": + logger.add( + "./logs/active_call_{time}.log", + rotation="1 day", + retention="7 days", + level=settings.log_level, + serialize=True, + format="{message}", + ) + logger.add( + lambda msg: print(msg, end=""), + level=settings.log_level, + serialize=True, + format="{message}", + ) +else: + logger.add( + "./logs/active_call_{time}.log", + rotation="1 day", + retention="7 days", + level=settings.log_level, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + ) + logger.add( + lambda msg: print(msg, end=""), + level=settings.log_level, + format="{time:HH:mm:ss} | {level: <8} | {message}", + ) @app.get("/health") @@ -170,7 +213,7 @@ async def websocket_endpoint(websocket: WebSocket): session = Session( session_id, transport, - backend_gateway=backend_gateway, + control_plane_gateway=control_plane_gateway, assistant_id=assistant_id, ) active_sessions[session_id] = session @@ -255,7 +298,7 @@ async def webrtc_endpoint(websocket: WebSocket): session = Session( session_id, transport, - backend_gateway=backend_gateway, + control_plane_gateway=control_plane_gateway, assistant_id=assistant_id, ) active_sessions[session_id] = session @@ -371,12 +414,10 @@ async def startup_event(): logger.info(f"Server: {settings.host}:{settings.port}") logger.info(f"Sample rate: {settings.sample_rate} Hz") logger.info(f"VAD model: {settings.vad_model_path}") - if settings.agent_config_path: - logger.info( - f"Agent config loaded ({settings.agent_config_source}): {settings.agent_config_path}" - ) - else: - logger.info("Agent config: none (using .env/default agent values)") + logger.info( + "Assistant runtime config source: backend when BACKEND_URL is set, " + "otherwise local YAML by assistant_id from 
ASSISTANT_LOCAL_CONFIG_DIR" + ) @app.on_event("shutdown") diff --git a/engine/config/agents/dashscope.yaml b/engine/config/agents/dashscope.yaml new file mode 100644 index 0000000..6cc77e9 --- /dev/null +++ b/engine/config/agents/dashscope.yaml @@ -0,0 +1,47 @@ +# Agent behavior configuration for DashScope realtime ASR/TTS. +# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers). +# Infra/server/network settings should stay in .env. + +agent: + vad: + type: silero + model_path: data/vad/silero_vad.onnx + threshold: 0.5 + min_speech_duration_ms: 100 + eou_threshold_ms: 800 + + llm: + # provider: openai | openai_compatible | siliconflow + provider: openai_compatible + model: deepseek-v3 + temperature: 0.7 + api_key: your_llm_api_key + api_url: https://api.qnaigc.com/v1 + + tts: + provider: dashscope + api_key: your_tts_api_key + api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + model: qwen3-tts-flash-realtime + voice: Cherry + dashscope_mode: commit + speed: 1.0 + + asr: + provider: dashscope + api_key: your_asr_api_key + api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + model: qwen3-asr-flash-realtime + interim_interval_ms: 500 + min_audio_ms: 300 + start_min_speech_ms: 160 + pre_speech_ms: 240 + final_tail_ms: 120 + + duplex: + enabled: true + system_prompt: 你是一个人工智能助手,你用简答语句回答,避免使用标点符号和emoji。 + + barge_in: + min_duration_ms: 200 + silence_tolerance_ms: 60 diff --git a/engine/config/agents/dashscope_ontest.yaml b/engine/config/agents/dashscope_ontest.yaml new file mode 100644 index 0000000..55db902 --- /dev/null +++ b/engine/config/agents/dashscope_ontest.yaml @@ -0,0 +1,47 @@ +# Agent behavior configuration for DashScope realtime ASR/TTS. +# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers). +# Infra/server/network settings should stay in .env. 
+ +agent: + vad: + type: silero + model_path: data/vad/silero_vad.onnx + threshold: 0.5 + min_speech_duration_ms: 100 + eou_threshold_ms: 800 + + llm: + # provider: openai | openai_compatible | siliconflow + provider: openai_compatible + model: deepseek-v3 + temperature: 0.7 + api_key: sk-fc4d59b360475f53401a864db8ce0985010acc4e696723d20a90d6569f38d80a + api_url: https://api.qnaigc.com/v1 + + tts: + provider: dashscope + api_key: sk-391f5126d18345d497c6e8717c8c9ad7 + api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + model: qwen3-tts-flash-realtime + voice: Cherry + dashscope_mode: commit + speed: 1.0 + + asr: + provider: dashscope + api_key: sk-391f5126d18345d497c6e8717c8c9ad7 + api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + model: qwen3-asr-flash-realtime + interim_interval_ms: 500 + min_audio_ms: 300 + start_min_speech_ms: 160 + pre_speech_ms: 240 + final_tail_ms: 120 + + duplex: + enabled: true + system_prompt: 你是一个人工智能助手,你用简答语句回答,避免使用标点符号和emoji。 + + barge_in: + min_duration_ms: 200 + silence_tolerance_ms: 60 diff --git a/engine/config/agents/example.yaml b/engine/config/agents/example.yaml index dd0e927..2aa750b 100644 --- a/engine/config/agents/example.yaml +++ b/engine/config/agents/example.yaml @@ -11,7 +11,7 @@ agent: eou_threshold_ms: 800 llm: - # provider: openai | openai_compatible | siliconflow + # provider: openai | openai_compatible | siliconflow | fastgpt provider: openai_compatible model: deepseek-v3 temperature: 0.7 @@ -21,12 +21,17 @@ agent: api_url: https://api.qnaigc.com/v1 tts: - # provider: edge | openai_compatible | siliconflow | dashscope + # provider: openai_compatible | siliconflow | dashscope | volcengine # dashscope defaults (if omitted): # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime # model: qwen3-tts-flash-realtime # dashscope_mode: commit (engine splits) | server_commit (dashscope splits) # note: dashscope_mode/mode is ONLY used when provider=dashscope. 
+ # volcengine defaults (if omitted): + # api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional + # resource_id: seed-tts-2.0 + # app_id: your volcengine app key + # api_key: your volcengine access key provider: openai_compatible api_key: your_tts_api_key api_url: https://api.siliconflow.cn/v1/audio/speech @@ -35,11 +40,26 @@ agent: speed: 1.0 asr: - # provider: buffered | openai_compatible | siliconflow + # provider: buffered | openai_compatible | siliconflow | dashscope | volcengine + # dashscope defaults (if omitted): + # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + # model: qwen3-asr-flash-realtime + # note: dashscope uses streaming ASR mode (chunk-by-chunk). + # volcengine defaults (if omitted): + # api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel + # model: bigmodel + # resource_id: volc.bigasr.sauc.duration + # app_id: your volcengine app key + # api_key: your volcengine access key + # request_params: + # end_window_size: 800 + # force_to_speech_time: 1000 + # note: volcengine uses streaming ASR mode (chunk-by-chunk). 
provider: openai_compatible api_key: you_asr_api_key api_url: https://api.siliconflow.cn/v1/audio/transcriptions model: FunAudioLLM/SenseVoiceSmall + enable_interim: false interim_interval_ms: 500 min_audio_ms: 300 start_min_speech_ms: 160 @@ -53,3 +73,4 @@ agent: barge_in: min_duration_ms: 200 silence_tolerance_ms: 60 + diff --git a/engine/config/agents/tools.yaml b/engine/config/agents/tools.yaml index 4d8bd72..11cd7c3 100644 --- a/engine/config/agents/tools.yaml +++ b/engine/config/agents/tools.yaml @@ -18,12 +18,17 @@ agent: api_url: https://api.qnaigc.com/v1 tts: - # provider: edge | openai_compatible | siliconflow | dashscope + # provider: openai_compatible | siliconflow | dashscope | volcengine # dashscope defaults (if omitted): # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime # model: qwen3-tts-flash-realtime # dashscope_mode: commit (engine splits) | server_commit (dashscope splits) # note: dashscope_mode/mode is ONLY used when provider=dashscope. + # volcengine defaults (if omitted): + # api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional + # resource_id: seed-tts-2.0 + # app_id: your volcengine app key + # api_key: your volcengine access key provider: openai_compatible api_key: your_tts_api_key api_url: https://api.siliconflow.cn/v1/audio/speech @@ -32,11 +37,26 @@ agent: speed: 1.0 asr: - # provider: buffered | openai_compatible | siliconflow + # provider: buffered | openai_compatible | siliconflow | dashscope | volcengine + # dashscope defaults (if omitted): + # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + # model: qwen3-asr-flash-realtime + # note: dashscope uses streaming ASR mode (chunk-by-chunk). 
+ # volcengine defaults (if omitted): + # api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel + # model: bigmodel + # resource_id: volc.bigasr.sauc.duration + # app_id: your volcengine app key + # api_key: your volcengine access key + # request_params: + # end_window_size: 800 + # force_to_speech_time: 1000 + # note: volcengine uses streaming ASR mode (chunk-by-chunk). provider: openai_compatible api_key: your_asr_api_key api_url: https://api.siliconflow.cn/v1/audio/transcriptions model: FunAudioLLM/SenseVoiceSmall + enable_interim: false interim_interval_ms: 500 min_audio_ms: 300 start_min_speech_ms: 160 diff --git a/engine/config/agents/volcengine.yaml b/engine/config/agents/volcengine.yaml new file mode 100644 index 0000000..acd66b3 --- /dev/null +++ b/engine/config/agents/volcengine.yaml @@ -0,0 +1,68 @@ +# Agent behavior configuration (safe to edit per profile) +# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers). +# Infra/server/network settings should stay in .env. + +agent: + vad: + type: silero + model_path: data/vad/silero_vad.onnx + threshold: 0.5 + min_speech_duration_ms: 100 + eou_threshold_ms: 800 + + llm: + # provider: openai | openai_compatible | siliconflow + provider: openai_compatible + model: deepseek-v3 + temperature: 0.7 + # Required: no fallback. You can still reference env explicitly. + api_key: your_llm_api_key + # Optional for OpenAI-compatible endpoints: + api_url: https://api.qnaigc.com/v1 + + tts: + # provider: edge | openai_compatible | siliconflow | dashscope + # dashscope defaults (if omitted): + # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + # model: qwen3-tts-flash-realtime + # dashscope_mode: commit (engine splits) | server_commit (dashscope splits) + # note: dashscope_mode/mode is ONLY used when provider=dashscope. 
+ # volcengine defaults (if omitted): + provider: volcengine + api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional + resource_id: seed-tts-2.0 + app_id: your_tts_app_id + api_key: your_tts_api_key + speed: 1.1 + voice: zh_female_vv_uranus_bigtts + + asr: + asr: + provider: volcengine + api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel + app_id: your_asr_app_id + api_key: your_asr_api_key + resource_id: volc.bigasr.sauc.duration + uid: caller-1 + model: bigmodel + request_params: + end_window_size: 800 + force_to_speech_time: 1000 + enable_punc: true + enable_itn: false + enable_ddc: false + show_utterance: true + result_type: single + interim_interval_ms: 500 + min_audio_ms: 300 + start_min_speech_ms: 160 + pre_speech_ms: 240 + final_tail_ms: 120 + + duplex: + enabled: true + system_prompt: 你是一个人工智能助手,你用简答语句回答,避免使用标点符号和emoji。 + + barge_in: + min_duration_ms: 200 + silence_tolerance_ms: 60 diff --git a/engine/config/agents/volcengine_ontest.yaml b/engine/config/agents/volcengine_ontest.yaml new file mode 100644 index 0000000..181fa79 --- /dev/null +++ b/engine/config/agents/volcengine_ontest.yaml @@ -0,0 +1,67 @@ +# Agent behavior configuration (safe to edit per profile) +# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers). +# Infra/server/network settings should stay in .env. + +agent: + vad: + type: silero + model_path: data/vad/silero_vad.onnx + threshold: 0.5 + min_speech_duration_ms: 100 + eou_threshold_ms: 800 + + llm: + # provider: openai | openai_compatible | siliconflow + provider: openai_compatible + model: deepseek-v3 + temperature: 0.7 + # Required: no fallback. You can still reference env explicitly. 
+ api_key: sk-fc4d59b360475f53401a864db8ce0985010acc4e696723d20a90d6569f38d80a + # Optional for OpenAI-compatible endpoints: + api_url: https://api.qnaigc.com/v1 + + tts: + # provider: edge | openai_compatible | siliconflow | dashscope + # dashscope defaults (if omitted): + # api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime + # model: qwen3-tts-flash-realtime + # dashscope_mode: commit (engine splits) | server_commit (dashscope splits) + # note: dashscope_mode/mode is ONLY used when provider=dashscope. + # volcengine defaults (if omitted): + provider: volcengine + api_url: https://openspeech.bytedance.com/api/v3/tts/unidirectional + resource_id: seed-tts-2.0 + app_id: 2931820332 + api_key: 4ustCTIpdCq8dE_msFrZvFn4nDpioIVo + speed: 1.1 + voice: zh_female_vv_uranus_bigtts + + asr: + provider: volcengine + api_url: wss://openspeech.bytedance.com/api/v3/sauc/bigmodel + app_id: 8607675070 + api_key: QiO0AptfmU0GLTSitwn7t5-zeo4gJ6K1 + resource_id: volc.bigasr.sauc.duration + uid: caller-1 + model: bigmodel + request_params: + end_window_size: 800 + force_to_speech_time: 1000 + enable_punc: true + enable_itn: false + enable_ddc: false + show_utterance: true + result_type: single + interim_interval_ms: 500 + min_audio_ms: 300 + start_min_speech_ms: 160 + pre_speech_ms: 240 + final_tail_ms: 120 + + duplex: + enabled: true + system_prompt: 你是一个人工智能助手,你用简答语句回答,避免使用标点符号和emoji。 + + barge_in: + min_duration_ms: 200 + silence_tolerance_ms: 60 diff --git a/engine/core/__init__.py b/engine/core/__init__.py deleted file mode 100644 index 0110686..0000000 --- a/engine/core/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Core Components Package""" - -from core.events import EventBus, get_event_bus -from core.transports import BaseTransport, SocketTransport, WebRtcTransport -from core.session import Session -from core.conversation import ConversationManager, ConversationState, ConversationTurn -from core.duplex_pipeline import DuplexPipeline - -__all__ = [ - "EventBus", - 
"get_event_bus", - "BaseTransport", - "SocketTransport", - "WebRtcTransport", - "Session", - "ConversationManager", - "ConversationState", - "ConversationTurn", - "DuplexPipeline", -] diff --git a/engine/core/ports/__init__.py b/engine/core/ports/__init__.py deleted file mode 100644 index 7d7c9dd..0000000 --- a/engine/core/ports/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Port interfaces for engine-side integration boundaries.""" - -from core.ports.backend import ( - AssistantConfigProvider, - BackendGateway, - HistoryWriter, - KnowledgeSearcher, - ToolResourceResolver, -) - -__all__ = [ - "AssistantConfigProvider", - "BackendGateway", - "HistoryWriter", - "KnowledgeSearcher", - "ToolResourceResolver", -] diff --git a/engine/data/audio_examples/single_utterance_16k.wav b/engine/data/audio_examples/single_utterance_16k.wav deleted file mode 100644 index 8c7bbe5..0000000 Binary files a/engine/data/audio_examples/single_utterance_16k.wav and /dev/null differ diff --git a/engine/data/audio_examples/three_utterances.wav b/engine/data/audio_examples/three_utterances_simple.wav similarity index 80% rename from engine/data/audio_examples/three_utterances.wav rename to engine/data/audio_examples/three_utterances_simple.wav index c2dca2f..40cd649 100644 Binary files a/engine/data/audio_examples/three_utterances.wav and b/engine/data/audio_examples/three_utterances_simple.wav differ diff --git a/engine/data/audio_examples/two_utterances.wav b/engine/data/audio_examples/two_utterances.wav deleted file mode 100644 index 5c66f70..0000000 Binary files a/engine/data/audio_examples/two_utterances.wav and /dev/null differ diff --git a/engine/docs/backend_integration.md b/engine/docs/backend_integration.md index 1f5d14d..22fa09e 100644 --- a/engine/docs/backend_integration.md +++ b/engine/docs/backend_integration.md @@ -10,6 +10,7 @@ Configure with environment variables: - `BACKEND_MODE=auto|http|disabled` - `BACKEND_URL` - `BACKEND_TIMEOUT_SEC` +- `ASSISTANT_LOCAL_CONFIG_DIR` 
(default: `engine/config/agents`) - `HISTORY_ENABLED=true|false` Mode behavior: @@ -18,18 +19,23 @@ Mode behavior: - `http`: force HTTP backend adapter (falls back to null adapter when URL is missing). - `disabled`: force null adapter and run engine-only. +Assistant config source behavior: + +- If `BACKEND_URL` exists and backend mode is enabled, fetch assistant config from backend. +- If `BACKEND_URL` is missing (or backend mode is disabled), load assistant config from local YAML. +- `assistant_id` query parameter is still required and maps to `engine/config/agents/.yaml` when local YAML source is active. + ## Architecture -- Ports: `core/ports/backend.py` -- Adapters: `app/backend_adapters.py` -- Compatibility wrappers: `app/backend_client.py` +- Ports: `runtime/ports/control_plane.py` +- Adapters: `adapters/control_plane/backend.py` `Session` and `DuplexPipeline` receive backend capabilities via injected adapter methods instead of hard-coding backend client imports. ## Async History Writes -Session history persistence is handled by `core/history_bridge.py`. +Session history persistence is handled by `runtime/history/bridge.py`. Design: diff --git a/engine/docs/extension_ports.md b/engine/docs/extension_ports.md new file mode 100644 index 0000000..c0f65f6 --- /dev/null +++ b/engine/docs/extension_ports.md @@ -0,0 +1,47 @@ +# Engine Extension Ports (Draft) + +This document defines the draft port set used to keep core runtime extensible. 
+ +## Port Modules + +- `runtime/ports/control_plane.py` + - `AssistantRuntimeConfigProvider` + - `ConversationHistoryStore` + - `KnowledgeRetriever` + - `ToolCatalog` + - `ControlPlaneGateway` +- `runtime/ports/llm.py` + - `LLMServiceSpec` + - `LLMPort` + - optional extensions: `LLMCancellable`, `LLMRuntimeConfigurable` +- `runtime/ports/tts.py` + - `TTSServiceSpec` + - `TTSPort` +- `runtime/ports/asr.py` + - `ASRServiceSpec` + - `ASRPort` + - explicit mode ports: `OfflineASRPort`, `StreamingASRPort` +- `runtime/ports/service_factory.py` + - `RealtimeServiceFactory` + +## Adapter Layer + +- `providers/factory/default.py` provides `DefaultRealtimeServiceFactory`. +- It maps resolved provider specs to concrete adapters. +- Runtime orchestration (`runtime/pipeline/duplex.py`) depends on the factory port/specs, not concrete provider classes. + +## Provider Behavior (Current) + +- LLM: + - supported providers: `openai`, `openai_compatible`, `openai-compatible`, `siliconflow` + - fallback: `MockLLMService` +- TTS: + - supported providers: `dashscope`, `volcengine`, `openai_compatible`, `openai-compatible`, `siliconflow` + - fallback: `MockTTSService` +- ASR: + - supported providers: `openai_compatible`, `openai-compatible`, `siliconflow`, `dashscope`, `volcengine` + - fallback: `BufferedASRService` + +## Notes + +- This is a draft contract set; follow-up work can add explicit capability negotiation and contract-version fields. diff --git a/engine/docs/high_level_architecture.md b/engine/docs/high_level_architecture.md new file mode 100644 index 0000000..bdae564 --- /dev/null +++ b/engine/docs/high_level_architecture.md @@ -0,0 +1,129 @@ +# Engine High-Level Architecture + +This document describes the runtime architecture of `engine` for realtime voice/text assistant interactions. 
+ +## Goals + +- Low-latency duplex interaction (user speaks while assistant can respond) +- Clear separation between transport, orchestration, and model/service integrations +- Backend-optional runtime (works with or without external backend) +- Protocol-first interoperability through strict WS v1 control messages + +## Top-Level Components + +```mermaid +flowchart LR + C[Client\nWeb / Mobile / Device] <-- WS v1 + PCM --> A[FastAPI App\napp/main.py] + A --> S[Session\nruntime/session/manager.py] + S --> D[Duplex Pipeline\nruntime/pipeline/duplex.py] + + D --> P[Processors\nVAD / EOU / Tracks] + D --> R[Workflow Runner\nworkflow/runner.py] + D --> E[Event Bus + Models\nruntime/events.py + protocol/ws_v1/*] + + R --> SV[Service Layer\nproviders/asr/*\nproviders/llm/*\nproviders/tts/*] + R --> TE[Tool Executor\ntools/executor.py] + + S --> HB[History Bridge\nruntime/history/bridge.py] + S --> BA[Control Plane Port\nruntime/ports/control_plane.py] + BA --> AD[Adapters\nadapters/control_plane/backend.py] + + AD --> B[(External Backend API\noptional)] + SV --> M[(ASR/LLM/TTS Providers)] +``` + +## Request Lifecycle (Simplified) + +1. Client connects to `/ws?assistant_id=` and sends `session.start`. +2. App creates a `Session` with resolved assistant config (backend or local YAML). +3. Binary PCM frames enter the duplex pipeline. +4. `VAD`/`EOU` processors detect speech segments and trigger ASR finalization. +5. ASR text is routed into workflow + LLM generation. +6. Optional tool calls are executed (server-side or client-side result return). +7. LLM output streams as text deltas; TTS produces audio chunks for playback. +8. Session emits structured events (`transcript.*`, `assistant.*`, `output.audio.*`, `error`). +9. History bridge persists conversation data asynchronously. +10. On `session.stop` (or disconnect), session finalizes and drains pending writes. 
+ +## Layering and Responsibilities + +### 1) Transport / API Layer + +- Entry point: `app/main.py` +- Responsibilities: + - WebSocket lifecycle management + - WS v1 message validation and order guarantees + - Session creation and teardown + - Converting raw WS frames into internal events + +### 2) Session + Orchestration Layer + +- Core: `runtime/session/manager.py`, `runtime/pipeline/duplex.py`, `runtime/conversation.py` +- Responsibilities: + - Per-session state machine + - Turn boundaries and interruption/cancel handling + - Event sequencing (`seq`) and envelope consistency + - Bridging input/output tracks (`audio_in`, `audio_out`, `control`) + +### 3) Processing Layer + +- Modules: `processors/vad.py`, `processors/eou.py`, `processors/tracks.py` +- Responsibilities: + - Speech activity detection + - End-of-utterance decisioning + - Track-oriented routing and timing-sensitive pre/post processing + +### 4) Workflow + Tooling Layer + +- Modules: `workflow/runner.py`, `tools/executor.py` +- Responsibilities: + - Assistant workflow execution + - Tool call planning/execution and timeout handling + - Tool result normalization into protocol events + +### 5) Service Integration Layer + +- Modules: `providers/*` +- Responsibilities: + - Abstracting ASR/LLM/TTS provider differences + - Streaming token/audio adaptation + - Provider-specific adapters (OpenAI-compatible, DashScope, SiliconFlow, etc.) 
+ +### 6) Backend Integration Layer (Optional) + +- Port: `runtime/ports/control_plane.py` +- Adapters: `adapters/control_plane/backend.py` +- Responsibilities: + - Fetching assistant runtime config + - Persisting call/session metadata and history + - Supporting `BACKEND_MODE=auto|http|disabled` + +### 7) Persistence / Reliability Layer + +- Module: `runtime/history/bridge.py` +- Responsibilities: + - Non-blocking queue-based history writes + - Retry with backoff on backend failures + - Best-effort drain on session finalize + +## Key Design Principles + +- Dependency inversion for backend: session/pipeline depend on port interfaces, not concrete clients. +- Streaming-first: text/audio are emitted incrementally to minimize perceived latency. +- Fail-soft behavior: backend/history failures should not block realtime interaction paths. +- Protocol strictness: WS v1 rejects malformed/out-of-order control traffic early. +- Explicit event model: all client-observable state changes are represented as typed events. + +## Configuration Boundaries + +- Runtime environment settings live in `app/config.py`. +- Assistant-specific behavior is loaded by `assistant_id`: + - backend mode: from backend API + - engine-only mode: local `engine/config/agents/.yaml` +- Client-provided `metadata.overrides` and `dynamicVariables` can alter runtime behavior within protocol constraints. + +## Related Docs + +- WS protocol: `engine/docs/ws_v1_schema.md` +- Backend integration details: `engine/docs/backend_integration.md` +- Duplex interaction diagram: `engine/docs/duplex_interaction.svg` diff --git a/engine/docs/import_migration.md b/engine/docs/import_migration.md new file mode 100644 index 0000000..eaeba1c --- /dev/null +++ b/engine/docs/import_migration.md @@ -0,0 +1,21 @@ +# Canonical Module Layout + +This MVP uses a single canonical module layout without legacy import shims. 
+ +## Runtime and protocol + +- `protocol.ws_v1.schema` +- `runtime.session.manager` +- `runtime.pipeline.duplex` +- `runtime.history.bridge` +- `runtime.events` +- `runtime.transports` +- `runtime.conversation` +- `runtime.ports.*` + +## Integrations and orchestration + +- `providers.*` +- `adapters.control_plane.backend` +- `workflow.runner` +- `tools.executor` diff --git a/engine/docs/ws_v1_schema_zh.md b/engine/docs/ws_v1_schema_zh.md index ce5f175..1681c23 100644 --- a/engine/docs/ws_v1_schema_zh.md +++ b/engine/docs/ws_v1_schema_zh.md @@ -7,9 +7,9 @@ - 握手顺序、状态机、错误语义与实现细节。 实现对照来源: -- `models/ws_v1.py` -- `core/session.py` -- `core/duplex_pipeline.py` +- `protocol/ws_v1/schema.py` +- `runtime/session/manager.py` +- `runtime/pipeline/duplex.py` - `app/main.py` --- diff --git a/engine/examples/wav_client.py b/engine/examples/wav_client.py index 1e4a50d..14b2587 100644 --- a/engine/examples/wav_client.py +++ b/engine/examples/wav_client.py @@ -3,13 +3,15 @@ WAV file client for testing duplex voice conversation. This client reads audio from a WAV file, sends it to the server, -and saves the AI's voice response to an output WAV file. +and saves a stereo WAV file with the input audio on the left channel +and the AI's voice response on the right channel. Usage: python examples/wav_client.py --input input.wav --output response.wav python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws python examples/wav_client.py --input input.wav --output response.wav --wait-time 10 python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav + Requirements: pip install soundfile websockets numpy """ @@ -45,20 +47,20 @@ except ImportError: class WavFileClient: """ WAV file client for voice conversation testing. 
- + Features: - Read audio from WAV file - Send audio to WebSocket server - - Receive and save response audio + - Receive and save stereo conversation audio - Event logging """ - + def __init__( self, url: str, input_file: str, output_file: str, - assistant_id: str = "assistant_demo", + assistant_id: str = "default", channel: str = "wav_client", sample_rate: int = 16000, chunk_duration_ms: int = 20, @@ -69,7 +71,7 @@ class WavFileClient: ): """ Initialize WAV file client. - + Args: url: WebSocket server URL input_file: Input WAV file path @@ -92,48 +94,51 @@ class WavFileClient: self.track_debug = track_debug self.tail_silence_ms = max(0, int(tail_silence_ms)) self.frame_bytes = 640 # 16k mono pcm_s16le, 20ms - + # WebSocket connection self.ws = None self.running = False - + # Audio buffers + self.input_audio = np.array([], dtype=np.int16) self.received_audio = bytearray() - + self.output_segments: list[dict[str, object]] = [] + self.current_output_segment: bytearray | None = None + # Statistics self.bytes_sent = 0 self.bytes_received = 0 - + # TTFB tracking (per response) self.send_start_time = None - self.response_start_time = None # set on each trackStart + self.response_start_time = None # set on each output.audio.start self.waiting_for_first_audio = False self.ttfb_ms = None # last TTFB for summary self.ttfb_list = [] # TTFB for each response - + # State tracking self.track_started = False self.track_ended = False self.send_completed = False self.session_ready = False - + # Events log self.events_log = [] - - def log_event(self, direction: str, message: str): + + def log_event(self, direction: str, message: str) -> None: """Log an event with timestamp.""" timestamp = time.time() - self.events_log.append({ - "timestamp": timestamp, - "direction": direction, - "message": message - }) - # Handle encoding errors on Windows + self.events_log.append( + { + "timestamp": timestamp, + "direction": direction, + "message": message, + } + ) try: print(f"{direction} 
{message}") except UnicodeEncodeError: - # Replace problematic characters for console output - safe_message = message.encode('ascii', errors='replace').decode('ascii') + safe_message = message.encode("ascii", errors="replace").decode("ascii") print(f"{direction} {safe_message}") @staticmethod @@ -152,119 +157,160 @@ class WavFileClient: query = dict(parse_qsl(parts.query, keep_blank_values=True)) query["assistant_id"] = self.assistant_id return urlunsplit((parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment)) - + + def _current_timeline_sample(self) -> int: + """Return current sample position relative to input send start.""" + if self.send_start_time is None: + return 0 + elapsed_seconds = max(0.0, time.time() - self.send_start_time) + return int(round(elapsed_seconds * self.sample_rate)) + + def _start_output_segment(self) -> None: + """Create a new assistant-audio segment if one is not active.""" + if self.current_output_segment is not None: + return + self.current_output_segment = bytearray() + self.output_segments.append( + { + "start_sample": self._current_timeline_sample(), + "audio": self.current_output_segment, + } + ) + + def _close_output_segment(self) -> None: + """Close the active assistant-audio segment, if any.""" + self.current_output_segment = None + + def _build_input_track(self) -> np.ndarray: + """Build the saved left channel using the streamed input audio.""" + input_track = self.input_audio.astype(np.int16, copy=True) + tail_samples = int(round(self.sample_rate * self.tail_silence_ms / 1000.0)) + if tail_samples <= 0: + return input_track + if input_track.size == 0: + return np.zeros(tail_samples, dtype=np.int16) + return np.concatenate((input_track, np.zeros(tail_samples, dtype=np.int16))) + + def _build_output_track(self) -> np.ndarray: + """Build the saved right channel using received assistant audio.""" + if not self.output_segments: + return np.zeros(0, dtype=np.int16) + + total_samples = max( + 
int(segment["start_sample"]) + (len(segment["audio"]) // 2) + for segment in self.output_segments + ) + mixed_track = np.zeros(total_samples, dtype=np.int32) + + for segment in self.output_segments: + start_sample = int(segment["start_sample"]) + segment_audio = np.frombuffer(bytes(segment["audio"]), dtype=np.int16).astype(np.int32) + if segment_audio.size == 0: + continue + end_sample = start_sample + segment_audio.size + mixed_track[start_sample:end_sample] += segment_audio + + np.clip(mixed_track, -32768, 32767, out=mixed_track) + return mixed_track.astype(np.int16) + async def connect(self) -> None: """Connect to WebSocket server.""" session_url = self._session_url() - self.log_event("→", f"Connecting to {session_url}...") + self.log_event("->", f"Connecting to {session_url}...") self.ws = await websockets.connect(session_url) self.running = True - self.log_event("←", "Connected!") + self.log_event("->", "Connected!") + + await self.send_command( + { + "type": "session.start", + "audio": { + "encoding": "pcm_s16le", + "sample_rate_hz": self.sample_rate, + "channels": 1, + }, + "metadata": { + "channel": self.channel, + "source": "wav_client", + }, + } + ) - await self.send_command({ - "type": "session.start", - "audio": { - "encoding": "pcm_s16le", - "sample_rate_hz": self.sample_rate, - "channels": 1 - }, - "metadata": { - "channel": self.channel, - "source": "wav_client", - }, - }) - async def send_command(self, cmd: dict) -> None: """Send JSON command to server.""" if self.ws: await self.ws.send(json.dumps(cmd)) - self.log_event("→", f"Command: {cmd.get('type', 'unknown')}") - + self.log_event("->", f"Command: {cmd.get('type', 'unknown')}") + async def send_hangup(self, reason: str = "Session complete") -> None: """Send hangup command.""" - await self.send_command({ - "type": "session.stop", - "reason": reason - }) - + await self.send_command({"type": "session.stop", "reason": reason}) + def load_wav_file(self) -> tuple[np.ndarray, int]: """ Load and prepare 
WAV file for sending. - + Returns: Tuple of (audio_data as int16 numpy array, original sample rate) """ if not self.input_file.exists(): raise FileNotFoundError(f"Input file not found: {self.input_file}") - - # Load audio file + audio_data, file_sample_rate = sf.read(self.input_file) - self.log_event("→", f"Loaded: {self.input_file}") - self.log_event("→", f" Original sample rate: {file_sample_rate} Hz") - self.log_event("→", f" Duration: {len(audio_data) / file_sample_rate:.2f}s") - - # Convert stereo to mono if needed + self.log_event("->", f"Loaded: {self.input_file}") + self.log_event("->", f" Original sample rate: {file_sample_rate} Hz") + self.log_event("->", f" Duration: {len(audio_data) / file_sample_rate:.2f}s") + if len(audio_data.shape) > 1: audio_data = audio_data.mean(axis=1) - self.log_event("→", " Converted stereo to mono") - - # Resample if needed + self.log_event("->", " Converted stereo to mono") + if file_sample_rate != self.sample_rate: - # Simple resampling using numpy duration = len(audio_data) / file_sample_rate num_samples = int(duration * self.sample_rate) indices = np.linspace(0, len(audio_data) - 1, num_samples) audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data) - self.log_event("→", f" Resampled to {self.sample_rate} Hz") - - # Convert to int16 + self.log_event("->", f" Resampled to {self.sample_rate} Hz") + if audio_data.dtype != np.int16: - # Normalize to [-1, 1] if needed max_val = np.max(np.abs(audio_data)) if max_val > 1.0: audio_data = audio_data / max_val audio_data = (audio_data * 32767).astype(np.int16) - - self.log_event("→", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)") - + + self.log_event("->", f" Prepared: {len(audio_data)} samples ({len(audio_data) / self.sample_rate:.2f}s)") + self.input_audio = audio_data.copy() return audio_data, file_sample_rate - + async def audio_sender(self, audio_data: np.ndarray) -> None: """Send audio data to server in chunks.""" 
total_samples = len(audio_data) chunk_size = self.chunk_samples sent_samples = 0 - + self.send_start_time = time.time() - self.log_event("→", f"Starting audio transmission ({total_samples} samples)...") - + self.log_event("->", f"Starting audio transmission ({total_samples} samples)...") + while sent_samples < total_samples and self.running: - # Get next chunk end_sample = min(sent_samples + chunk_size, total_samples) chunk = audio_data[sent_samples:end_sample] chunk_bytes = chunk.tobytes() if len(chunk_bytes) % self.frame_bytes != 0: - # v1 audio framing requires 640-byte (20ms) PCM units. pad = self.frame_bytes - (len(chunk_bytes) % self.frame_bytes) chunk_bytes += b"\x00" * pad - - # Send to server + if self.ws: await self.ws.send(chunk_bytes) self.bytes_sent += len(chunk_bytes) - + sent_samples = end_sample - - # Progress logging (every 500ms worth of audio) + if self.verbose and sent_samples % (self.sample_rate // 2) == 0: progress = (sent_samples / total_samples) * 100 print(f" Sending: {progress:.0f}%", end="\r") - - # Delay to simulate real-time streaming - # Server expects audio at real-time pace for VAD/ASR to work properly + await asyncio.sleep(self.chunk_duration_ms / 1000) - # Add a short silence tail to help VAD/EOU close the final utterance. 
if self.tail_silence_ms > 0 and self.ws: tail_frames = max(1, self.tail_silence_ms // 20) silence = b"\x00" * self.frame_bytes @@ -272,56 +318,53 @@ class WavFileClient: await self.ws.send(silence) self.bytes_sent += len(silence) await asyncio.sleep(0.02) - self.log_event("→", f"Sent trailing silence: {self.tail_silence_ms}ms") - + self.log_event("->", f"Sent trailing silence: {self.tail_silence_ms}ms") + self.send_completed = True elapsed = time.time() - self.send_start_time - self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)") - + self.log_event("->", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent / 1024:.1f} KB)") + async def receiver(self) -> None: """Receive messages from server.""" try: while self.running: try: message = await asyncio.wait_for(self.ws.recv(), timeout=0.1) - + if isinstance(message, bytes): - # Audio data received self.bytes_received += len(message) self.received_audio.extend(message) - - # Calculate TTFB on first audio of each response + self._start_output_segment() + self.current_output_segment.extend(message) + if self.waiting_for_first_audio and self.response_start_time is not None: ttfb_ms = (time.time() - self.response_start_time) * 1000 self.ttfb_ms = ttfb_ms self.ttfb_list.append(ttfb_ms) self.waiting_for_first_audio = False - self.log_event("←", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms") - - # Log progress + self.log_event("<-", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms") + duration_ms = len(message) / (self.sample_rate * 2) * 1000 total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000 if self.verbose: - print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r") - + print(f"<- Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r") else: - # JSON event event = json.loads(message) await self._handle_event(event) - + except asyncio.TimeoutError: continue except websockets.ConnectionClosed: - self.log_event("←", "Connection 
closed") + self.log_event("<-", "Connection closed") self.running = False break - + except asyncio.CancelledError: pass - except Exception as e: - self.log_event("!", f"Receiver error: {e}") + except Exception as exc: + self.log_event("!", f"Receiver error: {exc}") self.running = False - + async def _handle_event(self, event: dict) -> None: """Handle incoming event.""" event_type = event.get("type", "unknown") @@ -331,14 +374,14 @@ class WavFileClient: if event_type == "session.started": self.session_ready = True - self.log_event("←", f"Session ready!{ids}") + self.log_event("<-", f"Session ready!{ids}") elif event_type == "config.resolved": config = event.get("config", {}) - self.log_event("←", f"Config resolved (output={config.get('output', {})}){ids}") + self.log_event("<-", f"Config resolved (output={config.get('output', {})}){ids}") elif event_type == "input.speech_started": - self.log_event("←", f"Speech detected{ids}") + self.log_event("<-", f"Speech detected{ids}") elif event_type == "input.speech_stopped": - self.log_event("←", f"Silence detected{ids}") + self.log_event("<-", f"Silence detected{ids}") elif event_type == "transcript.delta": text = event.get("text", "") display_text = text[:60] + "..." 
if len(text) > 60 else text @@ -346,125 +389,128 @@ class WavFileClient: elif event_type == "transcript.final": text = event.get("text", "") print(" " * 80, end="\r") - self.log_event("←", f"→ You: {text}{ids}") + self.log_event("<-", f"You: {text}{ids}") elif event_type == "metrics.ttfb": latency_ms = event.get("latencyMs", 0) - self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms") + self.log_event("<-", f"[TTFB] Server latency: {latency_ms}ms") elif event_type == "assistant.response.delta": text = event.get("text", "") if self.verbose and text: - self.log_event("←", f"LLM: {text}{ids}") + self.log_event("<-", f"LLM: {text}{ids}") elif event_type == "assistant.response.final": text = event.get("text", "") if text: - self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}{ids}") + summary = text[:100] + ("..." if len(text) > 100 else "") + self.log_event("<-", f"LLM Response (final): {summary}{ids}") elif event_type == "output.audio.start": self.track_started = True self.response_start_time = time.time() self.waiting_for_first_audio = True - self.log_event("←", f"Bot started speaking{ids}") + self._close_output_segment() + self.log_event("<-", f"Bot started speaking{ids}") elif event_type == "output.audio.end": self.track_ended = True - self.log_event("←", f"Bot finished speaking{ids}") + self._close_output_segment() + self.log_event("<-", f"Bot finished speaking{ids}") elif event_type == "response.interrupted": - self.log_event("←", f"Bot interrupted!{ids}") + self._close_output_segment() + self.log_event("<-", f"Bot interrupted!{ids}") elif event_type == "error": self.log_event("!", f"Error: {event.get('message')}{ids}") elif event_type == "session.stopped": - self.log_event("←", f"Session stopped: {event.get('reason')}{ids}") + self.log_event("<-", f"Session stopped: {event.get('reason')}{ids}") self.running = False else: - self.log_event("←", f"Event: {event_type}{ids}") - + self.log_event("<-", f"Event: 
{event_type}{ids}") + def save_output_wav(self) -> None: - """Save received audio to output WAV file.""" - if not self.received_audio: - self.log_event("!", "No audio received to save") + """Save the conversation to a stereo WAV file.""" + input_track = self._build_input_track() + output_track = self._build_output_track() + + if input_track.size == 0 and output_track.size == 0: + self.log_event("!", "No audio available to save") return - - # Convert bytes to numpy array - audio_data = np.frombuffer(bytes(self.received_audio), dtype=np.int16) - - # Ensure output directory exists + + if not self.received_audio: + self.log_event("!", "No assistant audio received; saving silent right channel") + + total_samples = max(input_track.size, output_track.size) + if input_track.size < total_samples: + input_track = np.pad(input_track, (0, total_samples - input_track.size)) + if output_track.size < total_samples: + output_track = np.pad(output_track, (0, total_samples - output_track.size)) + + stereo_audio = np.column_stack((input_track, output_track)).astype(np.int16, copy=False) + self.output_file.parent.mkdir(parents=True, exist_ok=True) - - # Save using wave module for compatibility - with wave.open(str(self.output_file), 'wb') as wav_file: - wav_file.setnchannels(1) + + with wave.open(str(self.output_file), "wb") as wav_file: + wav_file.setnchannels(2) wav_file.setsampwidth(2) # 16-bit wav_file.setframerate(self.sample_rate) - wav_file.writeframes(audio_data.tobytes()) - - duration = len(audio_data) / self.sample_rate - self.log_event("→", f"Saved output: {self.output_file}") - self.log_event("→", f" Duration: {duration:.2f}s ({len(audio_data)} samples)") - self.log_event("→", f" Size: {len(self.received_audio)/1024:.1f} KB") - + wav_file.writeframes(stereo_audio.tobytes()) + + duration = total_samples / self.sample_rate + self.log_event("->", f"Saved stereo output: {self.output_file}") + self.log_event("->", f" Duration: {duration:.2f}s ({total_samples} samples/channel)") 
+ self.log_event("->", " Channels: left=input, right=assistant") + self.log_event("->", f" Size: {stereo_audio.nbytes / 1024:.1f} KB") + async def run(self) -> None: """Run the WAV file test.""" try: - # Load input WAV file audio_data, _ = self.load_wav_file() - - # Connect to server + await self.connect() - - # Start receiver task + receiver_task = asyncio.create_task(self.receiver()) - # Wait for session.started before streaming audio ready_start = time.time() while self.running and not self.session_ready: if time.time() - ready_start > 8.0: raise TimeoutError("Timeout waiting for session.started") await asyncio.sleep(0.05) - - # Send audio + await self.audio_sender(audio_data) - - # Wait for response - self.log_event("→", f"Waiting {self.wait_time}s for response...") - + + self.log_event("->", f"Waiting {self.wait_time}s for response...") + wait_start = time.time() while self.running and (time.time() - wait_start) < self.wait_time: - # Check if track has ended (response complete) if self.track_ended and self.send_completed: - # Give a little extra time for any remaining audio await asyncio.sleep(1.0) break await asyncio.sleep(0.1) - - # Cleanup + self.running = False receiver_task.cancel() - + try: await receiver_task except asyncio.CancelledError: pass - - # Save output + self.save_output_wav() - - # Print summary self._print_summary() - - except FileNotFoundError as e: - print(f"Error: {e}") + + except FileNotFoundError as exc: + print(f"Error: {exc}") sys.exit(1) except ConnectionRefusedError: print(f"Error: Could not connect to {self.url}") print("Make sure the server is running.") sys.exit(1) - except Exception as e: - print(f"Error: {e}") + except Exception as exc: + print(f"Error: {exc}") import traceback + traceback.print_exc() sys.exit(1) finally: await self.close() - - def _print_summary(self): + + def _print_summary(self) -> None: """Print session summary.""" print("\n" + "=" * 50) print("Session Summary") @@ -477,19 +523,20 @@ class WavFileClient: if 
len(self.ttfb_list) == 1: print(f" TTFB: {self.ttfb_list[0]:.0f} ms") else: - print(f" TTFB (per response): {', '.join(f'{t:.0f}ms' for t in self.ttfb_list)}") + values = ", ".join(f"{ttfb:.0f}ms" for ttfb in self.ttfb_list) + print(f" TTFB (per response): {values}") if self.received_audio: duration = len(self.received_audio) / (self.sample_rate * 2) print(f" Response duration: {duration:.2f}s") print("=" * 50) - + async def close(self) -> None: """Close the connection.""" self.running = False if self.ws: try: await self.ws.close() - except: + except Exception: pass @@ -498,67 +545,71 @@ async def main(): description="WAV file client for testing duplex voice conversation" ) parser.add_argument( - "--input", "-i", + "--input", + "-i", required=True, - help="Input WAV file path" + help="Input WAV file path", ) parser.add_argument( - "--output", "-o", + "--output", + "-o", required=True, - help="Output WAV file path for response" + help="Output WAV file path for stereo conversation audio", ) parser.add_argument( "--url", default="ws://localhost:8000/ws", - help="WebSocket server URL (default: ws://localhost:8000/ws)" + help="WebSocket server URL (default: ws://localhost:8000/ws)", ) parser.add_argument( "--sample-rate", type=int, default=16000, - help="Target sample rate for audio (default: 16000)" + help="Target sample rate for audio (default: 16000)", ) parser.add_argument( "--assistant-id", - default="assistant_demo", - help="Assistant identifier used in websocket query parameter" + default="default", + help="Assistant identifier used in websocket query parameter", ) parser.add_argument( "--channel", default="wav_client", - help="Client channel name" + help="Client channel name", ) parser.add_argument( "--chunk-duration", type=int, default=20, - help="Chunk duration in ms for sending (default: 20)" + help="Chunk duration in ms for sending (default: 20)", ) parser.add_argument( - "--wait-time", "-w", + "--wait-time", + "-w", type=float, default=15.0, - help="Time to 
wait for response after sending (default: 15.0)" + help="Time to wait for response after sending (default: 15.0)", ) parser.add_argument( - "--verbose", "-v", + "--verbose", + "-v", action="store_true", - help="Enable verbose output" + help="Enable verbose output", ) parser.add_argument( "--track-debug", action="store_true", - help="Print event trackId for protocol debugging" + help="Print event trackId for protocol debugging", ) parser.add_argument( "--tail-silence-ms", type=int, default=800, - help="Trailing silence to send after WAV playback for EOU detection (default: 800)" + help="Trailing silence to send after WAV playback for EOU detection (default: 800)", ) - + args = parser.parse_args() - + client = WavFileClient( url=args.url, input_file=args.input, @@ -572,7 +623,7 @@ async def main(): track_debug=args.track_debug, tail_silence_ms=args.tail_silence_ms, ) - + await client.run() @@ -580,4 +631,4 @@ if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: - print("\nInterrupted by user") + print("\nInterrupted by user") \ No newline at end of file diff --git a/engine/models/__init__.py b/engine/models/__init__.py deleted file mode 100644 index 924d5fd..0000000 --- a/engine/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Data Models Package""" diff --git a/engine/models/commands.py b/engine/models/commands.py deleted file mode 100644 index 5bcf47e..0000000 --- a/engine/models/commands.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Protocol command models matching the original active-call API.""" - -from typing import Optional, Dict, Any -from pydantic import BaseModel, Field - - -class InviteCommand(BaseModel): - """Invite command to initiate a call.""" - - command: str = Field(default="invite", description="Command type") - option: Optional[Dict[str, Any]] = Field(default=None, description="Call configuration options") - - -class AcceptCommand(BaseModel): - """Accept command to accept an incoming call.""" - - command: str = 
Field(default="accept", description="Command type") - option: Optional[Dict[str, Any]] = Field(default=None, description="Call configuration options") - - -class RejectCommand(BaseModel): - """Reject command to reject an incoming call.""" - - command: str = Field(default="reject", description="Command type") - reason: str = Field(default="", description="Reason for rejection") - code: Optional[int] = Field(default=None, description="SIP response code") - - -class RingingCommand(BaseModel): - """Ringing command to send ringing response.""" - - command: str = Field(default="ringing", description="Command type") - recorder: Optional[Dict[str, Any]] = Field(default=None, description="Call recording configuration") - early_media: bool = Field(default=False, description="Enable early media") - ringtone: Optional[str] = Field(default=None, description="Custom ringtone URL") - - -class TTSCommand(BaseModel): - """TTS command to convert text to speech.""" - - command: str = Field(default="tts", description="Command type") - text: str = Field(..., description="Text to synthesize") - speaker: Optional[str] = Field(default=None, description="Speaker voice name") - play_id: Optional[str] = Field(default=None, description="Unique identifier for this TTS session") - auto_hangup: bool = Field(default=False, description="Auto hangup after TTS completion") - streaming: bool = Field(default=False, description="Streaming text input") - end_of_stream: bool = Field(default=False, description="End of streaming input") - wait_input_timeout: Optional[int] = Field(default=None, description="Max time to wait for input (seconds)") - option: Optional[Dict[str, Any]] = Field(default=None, description="TTS provider specific options") - - -class PlayCommand(BaseModel): - """Play command to play audio from URL.""" - - command: str = Field(default="play", description="Command type") - url: str = Field(..., description="URL of audio file to play") - auto_hangup: bool = Field(default=False, 
description="Auto hangup after playback") - wait_input_timeout: Optional[int] = Field(default=None, description="Max time to wait for input (seconds)") - - -class InterruptCommand(BaseModel): - """Interrupt command to interrupt current playback.""" - - command: str = Field(default="interrupt", description="Command type") - graceful: bool = Field(default=False, description="Wait for current TTS to complete") - - -class PauseCommand(BaseModel): - """Pause command to pause current playback.""" - - command: str = Field(default="pause", description="Command type") - - -class ResumeCommand(BaseModel): - """Resume command to resume paused playback.""" - - command: str = Field(default="resume", description="Command type") - - -class HangupCommand(BaseModel): - """Hangup command to end the call.""" - - command: str = Field(default="hangup", description="Command type") - reason: Optional[str] = Field(default=None, description="Reason for hangup") - initiator: Optional[str] = Field(default=None, description="Who initiated the hangup") - - -class HistoryCommand(BaseModel): - """History command to add conversation history.""" - - command: str = Field(default="history", description="Command type") - speaker: str = Field(..., description="Speaker identifier") - text: str = Field(..., description="Conversation text") - - -class ChatCommand(BaseModel): - """Chat command for text-based conversation.""" - - command: str = Field(default="chat", description="Command type") - text: str = Field(..., description="Chat text message") - - -# Command type mapping -COMMAND_TYPES = { - "invite": InviteCommand, - "accept": AcceptCommand, - "reject": RejectCommand, - "ringing": RingingCommand, - "tts": TTSCommand, - "play": PlayCommand, - "interrupt": InterruptCommand, - "pause": PauseCommand, - "resume": ResumeCommand, - "hangup": HangupCommand, - "history": HistoryCommand, - "chat": ChatCommand, -} - - -def parse_command(data: Dict[str, Any]) -> BaseModel: - """ - Parse a command from JSON 
data. - - Args: - data: JSON data as dictionary - - Returns: - Parsed command model - - Raises: - ValueError: If command type is unknown - """ - command_type = data.get("command") - - if not command_type: - raise ValueError("Missing 'command' field") - - command_class = COMMAND_TYPES.get(command_type) - - if not command_class: - raise ValueError(f"Unknown command type: {command_type}") - - return command_class(**data) diff --git a/engine/models/config.py b/engine/models/config.py deleted file mode 100644 index 009411e..0000000 --- a/engine/models/config.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Configuration models for call options.""" - -from typing import Optional, Dict, Any, List -from pydantic import BaseModel, Field - - -class VADOption(BaseModel): - """Voice Activity Detection configuration.""" - - type: str = Field(default="silero", description="VAD algorithm type (silero, webrtc)") - samplerate: int = Field(default=16000, description="Audio sample rate for VAD") - speech_padding: int = Field(default=250, description="Speech padding in milliseconds") - silence_padding: int = Field(default=100, description="Silence padding in milliseconds") - ratio: float = Field(default=0.5, description="Voice detection ratio threshold") - voice_threshold: float = Field(default=0.5, description="Voice energy threshold") - max_buffer_duration_secs: int = Field(default=50, description="Maximum buffer duration in seconds") - silence_timeout: Optional[int] = Field(default=None, description="Silence timeout in milliseconds") - endpoint: Optional[str] = Field(default=None, description="Custom VAD service endpoint") - secret_key: Optional[str] = Field(default=None, description="VAD service secret key") - secret_id: Optional[str] = Field(default=None, description="VAD service secret ID") - - -class ASROption(BaseModel): - """Automatic Speech Recognition configuration.""" - - provider: str = Field(..., description="ASR provider (tencent, aliyun, openai, etc.)") - language: Optional[str] 
= Field(default=None, description="Language code (zh-CN, en-US)") - app_id: Optional[str] = Field(default=None, description="Application ID") - secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication") - secret_key: Optional[str] = Field(default=None, description="Secret key for authentication") - model_type: Optional[str] = Field(default=None, description="ASR model type (16k_zh, 8k_en)") - buffer_size: Optional[int] = Field(default=None, description="Audio buffer size in bytes") - samplerate: Optional[int] = Field(default=None, description="Audio sample rate") - endpoint: Optional[str] = Field(default=None, description="Custom ASR service endpoint") - extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") - start_when_answer: bool = Field(default=False, description="Start ASR when call is answered") - - -class TTSOption(BaseModel): - """Text-to-Speech configuration.""" - - samplerate: Optional[int] = Field(default=None, description="TTS output sample rate") - provider: str = Field(default="msedge", description="TTS provider (tencent, aliyun, deepgram, msedge)") - speed: float = Field(default=1.0, description="Speech speed multiplier") - app_id: Optional[str] = Field(default=None, description="Application ID") - secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication") - secret_key: Optional[str] = Field(default=None, description="Secret key for authentication") - volume: Optional[int] = Field(default=None, description="Speech volume level (1-10)") - speaker: Optional[str] = Field(default=None, description="Voice speaker name") - codec: Optional[str] = Field(default=None, description="Audio codec") - subtitle: bool = Field(default=False, description="Enable subtitle generation") - emotion: Optional[str] = Field(default=None, description="Speech emotion") - endpoint: Optional[str] = Field(default=None, description="Custom TTS service endpoint") - extra: 
Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") - max_concurrent_tasks: Optional[int] = Field(default=None, description="Max concurrent tasks") - - -class RecorderOption(BaseModel): - """Call recording configuration.""" - - recorder_file: str = Field(..., description="Path to recording file") - samplerate: int = Field(default=16000, description="Recording sample rate") - ptime: int = Field(default=200, description="Packet time in milliseconds") - - -class MediaPassOption(BaseModel): - """Media pass-through configuration for external audio processing.""" - - url: str = Field(..., description="WebSocket URL for media streaming") - input_sample_rate: int = Field(default=16000, description="Sample rate of audio received from WebSocket") - output_sample_rate: int = Field(default=16000, description="Sample rate of audio sent to WebSocket") - packet_size: int = Field(default=2560, description="Packet size in bytes") - ptime: Optional[int] = Field(default=None, description="Buffered playback period in milliseconds") - - -class SipOption(BaseModel): - """SIP protocol configuration.""" - - username: Optional[str] = Field(default=None, description="SIP username") - password: Optional[str] = Field(default=None, description="SIP password") - realm: Optional[str] = Field(default=None, description="SIP realm/domain") - headers: Optional[Dict[str, str]] = Field(default=None, description="Additional SIP headers") - - -class HandlerRule(BaseModel): - """Handler routing rule.""" - - caller: Optional[str] = Field(default=None, description="Caller pattern (regex)") - callee: Optional[str] = Field(default=None, description="Callee pattern (regex)") - playbook: Optional[str] = Field(default=None, description="Playbook file path") - webhook: Optional[str] = Field(default=None, description="Webhook URL") - - -class CallOption(BaseModel): - """Comprehensive call configuration options.""" - - # Basic options - denoise: bool = Field(default=False, 
description="Enable noise reduction") - offer: Optional[str] = Field(default=None, description="SDP offer string") - callee: Optional[str] = Field(default=None, description="Callee SIP URI or phone number") - caller: Optional[str] = Field(default=None, description="Caller SIP URI or phone number") - - # Audio codec - codec: str = Field(default="pcm", description="Audio codec (pcm, pcma, pcmu, g722)") - - # Component configurations - recorder: Optional[RecorderOption] = Field(default=None, description="Call recording config") - asr: Optional[ASROption] = Field(default=None, description="ASR configuration") - vad: Optional[VADOption] = Field(default=None, description="VAD configuration") - tts: Optional[TTSOption] = Field(default=None, description="TTS configuration") - media_pass: Optional[MediaPassOption] = Field(default=None, description="Media pass-through config") - sip: Optional[SipOption] = Field(default=None, description="SIP configuration") - - # Timeouts and networking - handshake_timeout: Optional[int] = Field(default=None, description="Handshake timeout in seconds") - enable_ipv6: bool = Field(default=False, description="Enable IPv6 support") - inactivity_timeout: Optional[int] = Field(default=None, description="Inactivity timeout in seconds") - - # EOU configuration - eou: Optional[Dict[str, Any]] = Field(default=None, description="End of utterance detection config") - - # Extra parameters - extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional custom parameters") - - class Config: - populate_by_name = True diff --git a/engine/models/events.py b/engine/models/events.py deleted file mode 100644 index 031b8be..0000000 --- a/engine/models/events.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Protocol event models matching the original active-call API.""" - -from typing import Optional, Dict, Any -from pydantic import BaseModel, Field -from datetime import datetime - - -def current_timestamp_ms() -> int: - """Get current timestamp in 
milliseconds.""" - return int(datetime.now().timestamp() * 1000) - - -# Base Event Model -class BaseEvent(BaseModel): - """Base event model.""" - - event: str = Field(..., description="Event type") - track_id: str = Field(..., description="Unique track identifier") - timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp in milliseconds") - - -# Lifecycle Events -class IncomingEvent(BaseEvent): - """Incoming call event (SIP only).""" - - event: str = Field(default="incoming", description="Event type") - caller: Optional[str] = Field(default=None, description="Caller's SIP URI") - callee: Optional[str] = Field(default=None, description="Callee's SIP URI") - sdp: Optional[str] = Field(default=None, description="SDP offer from caller") - - -class AnswerEvent(BaseEvent): - """Call answered event.""" - - event: str = Field(default="answer", description="Event type") - sdp: Optional[str] = Field(default=None, description="SDP answer from server") - - -class RejectEvent(BaseEvent): - """Call rejected event.""" - - event: str = Field(default="reject", description="Event type") - reason: Optional[str] = Field(default=None, description="Rejection reason") - code: Optional[int] = Field(default=None, description="SIP response code") - - -class RingingEvent(BaseEvent): - """Call ringing event.""" - - event: str = Field(default="ringing", description="Event type") - early_media: bool = Field(default=False, description="Early media available") - - -class HangupEvent(BaseModel): - """Call hangup event.""" - - event: str = Field(default="hangup", description="Event type") - timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp") - reason: Optional[str] = Field(default=None, description="Hangup reason") - initiator: Optional[str] = Field(default=None, description="Who initiated hangup") - start_time: Optional[str] = Field(default=None, description="Call start time (ISO 8601)") - hangup_time: Optional[str] = 
Field(default=None, description="Hangup time (ISO 8601)") - answer_time: Optional[str] = Field(default=None, description="Answer time (ISO 8601)") - ringing_time: Optional[str] = Field(default=None, description="Ringing time (ISO 8601)") - from_: Optional[Dict[str, Any]] = Field(default=None, alias="from", description="Caller info") - to: Optional[Dict[str, Any]] = Field(default=None, description="Callee info") - extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata") - - class Config: - populate_by_name = True - - -# VAD Events -class SpeakingEvent(BaseEvent): - """Speech detected event.""" - - event: str = Field(default="speaking", description="Event type") - start_time: int = Field(default_factory=current_timestamp_ms, description="Speech start time") - - -class SilenceEvent(BaseEvent): - """Silence detected event.""" - - event: str = Field(default="silence", description="Event type") - start_time: int = Field(default_factory=current_timestamp_ms, description="Silence start time") - duration: int = Field(default=0, description="Silence duration in milliseconds") - - -# AI/ASR Events -class AsrFinalEvent(BaseEvent): - """ASR final transcription event.""" - - event: str = Field(default="asrFinal", description="Event type") - index: int = Field(..., description="ASR result sequence number") - start_time: Optional[int] = Field(default=None, description="Speech start time") - end_time: Optional[int] = Field(default=None, description="Speech end time") - text: str = Field(..., description="Transcribed text") - - -class AsrDeltaEvent(BaseEvent): - """ASR partial transcription event (streaming).""" - - event: str = Field(default="asrDelta", description="Event type") - index: int = Field(..., description="ASR result sequence number") - start_time: Optional[int] = Field(default=None, description="Speech start time") - end_time: Optional[int] = Field(default=None, description="Speech end time") - text: str = Field(..., description="Partial 
transcribed text") - - -class EouEvent(BaseEvent): - """End of utterance detection event.""" - - event: str = Field(default="eou", description="Event type") - completed: bool = Field(default=True, description="Whether utterance was completed") - - -# Audio Track Events -class TrackStartEvent(BaseEvent): - """Audio track start event.""" - - event: str = Field(default="trackStart", description="Event type") - play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command") - - -class TrackEndEvent(BaseEvent): - """Audio track end event.""" - - event: str = Field(default="trackEnd", description="Event type") - duration: int = Field(..., description="Track duration in milliseconds") - ssrc: int = Field(..., description="RTP SSRC identifier") - play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command") - - -class InterruptionEvent(BaseEvent): - """Playback interruption event.""" - - event: str = Field(default="interruption", description="Event type") - play_id: Optional[str] = Field(default=None, description="Play ID that was interrupted") - subtitle: Optional[str] = Field(default=None, description="TTS text being played") - position: Optional[int] = Field(default=None, description="Word index position") - total_duration: Optional[int] = Field(default=None, description="Total TTS duration") - current: Optional[int] = Field(default=None, description="Elapsed time when interrupted") - - -# System Events -class ErrorEvent(BaseEvent): - """Error event.""" - - event: str = Field(default="error", description="Event type") - sender: str = Field(..., description="Component that generated the error") - error: str = Field(..., description="Error message") - code: Optional[int] = Field(default=None, description="Error code") - - -class MetricsEvent(BaseModel): - """Performance metrics event.""" - - event: str = Field(default="metrics", description="Event type") - timestamp: int = Field(default_factory=current_timestamp_ms, 
description="Event timestamp") - key: str = Field(..., description="Metric key") - duration: int = Field(..., description="Duration in milliseconds") - data: Optional[Dict[str, Any]] = Field(default=None, description="Additional metric data") - - -class AddHistoryEvent(BaseModel): - """Conversation history entry added event.""" - - event: str = Field(default="addHistory", description="Event type") - timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp") - sender: Optional[str] = Field(default=None, description="Component that added history") - speaker: str = Field(..., description="Speaker identifier") - text: str = Field(..., description="Conversation text") - - -class DTMFEvent(BaseEvent): - """DTMF tone detected event.""" - - event: str = Field(default="dtmf", description="Event type") - digit: str = Field(..., description="DTMF digit (0-9, *, #, A-D)") - - -class HeartBeatEvent(BaseModel): - """Server-to-client heartbeat to keep connection alive.""" - - event: str = Field(default="heartBeat", description="Event type") - timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp in milliseconds") - - -# Event type mapping -EVENT_TYPES = { - "incoming": IncomingEvent, - "answer": AnswerEvent, - "reject": RejectEvent, - "ringing": RingingEvent, - "hangup": HangupEvent, - "speaking": SpeakingEvent, - "silence": SilenceEvent, - "asrFinal": AsrFinalEvent, - "asrDelta": AsrDeltaEvent, - "eou": EouEvent, - "trackStart": TrackStartEvent, - "trackEnd": TrackEndEvent, - "interruption": InterruptionEvent, - "error": ErrorEvent, - "metrics": MetricsEvent, - "addHistory": AddHistoryEvent, - "dtmf": DTMFEvent, - "heartBeat": HeartBeatEvent, -} - - -def create_event(event_type: str, **kwargs) -> BaseModel: - """ - Create an event model. 
- - Args: - event_type: Type of event to create - **kwargs: Event fields - - Returns: - Event model instance - - Raises: - ValueError: If event type is unknown - """ - event_class = EVENT_TYPES.get(event_type) - - if not event_class: - raise ValueError(f"Unknown event type: {event_type}") - - return event_class(event=event_type, **kwargs) diff --git a/engine/protocol/__init__.py b/engine/protocol/__init__.py new file mode 100644 index 0000000..311e510 --- /dev/null +++ b/engine/protocol/__init__.py @@ -0,0 +1 @@ +"""Protocol package.""" diff --git a/engine/protocol/ws_v1/__init__.py b/engine/protocol/ws_v1/__init__.py new file mode 100644 index 0000000..6b76589 --- /dev/null +++ b/engine/protocol/ws_v1/__init__.py @@ -0,0 +1 @@ +"""WS v1 protocol package.""" diff --git a/engine/models/ws_v1.py b/engine/protocol/ws_v1/schema.py similarity index 100% rename from engine/models/ws_v1.py rename to engine/protocol/ws_v1/schema.py diff --git a/engine/providers/__init__.py b/engine/providers/__init__.py new file mode 100644 index 0000000..2209974 --- /dev/null +++ b/engine/providers/__init__.py @@ -0,0 +1 @@ +"""Providers package.""" diff --git a/engine/providers/asr/__init__.py b/engine/providers/asr/__init__.py new file mode 100644 index 0000000..5e5dc29 --- /dev/null +++ b/engine/providers/asr/__init__.py @@ -0,0 +1,15 @@ +"""ASR providers.""" + +from providers.asr.buffered import BufferedASRService, MockASRService +from providers.asr.dashscope import DashScopeRealtimeASRService +from providers.asr.openai_compatible import OpenAICompatibleASRService, SiliconFlowASRService +from providers.asr.volcengine import VolcengineRealtimeASRService + +__all__ = [ + "BufferedASRService", + "MockASRService", + "DashScopeRealtimeASRService", + "OpenAICompatibleASRService", + "SiliconFlowASRService", + "VolcengineRealtimeASRService", +] diff --git a/engine/services/asr.py b/engine/providers/asr/buffered.py similarity index 81% rename from engine/services/asr.py rename to 
engine/providers/asr/buffered.py index 51ab584..624963c 100644 --- a/engine/services/asr.py +++ b/engine/providers/asr/buffered.py @@ -9,7 +9,7 @@ import json from typing import AsyncIterator, Optional from loguru import logger -from services.base import BaseASRService, ASRResult, ServiceState +from providers.common.base import BaseASRService, ASRResult, ServiceState # Try to import websockets for streaming ASR try: @@ -34,6 +34,7 @@ class BufferedASRService(BaseASRService): language: str = "en" ): super().__init__(sample_rate=sample_rate, language=language) + self.mode = "offline" self._audio_buffer: bytes = b"" self._current_text: str = "" @@ -86,6 +87,23 @@ class BufferedASRService(BaseASRService): self._current_text = "" self._audio_buffer = b"" return text + + async def get_final_transcription(self) -> str: + """Offline compatibility method used by DuplexPipeline.""" + return self.get_and_clear_text() + + def clear_buffer(self) -> None: + """Offline compatibility method used by DuplexPipeline.""" + self._audio_buffer = b"" + self._current_text = "" + + async def start_interim_transcription(self) -> None: + """No-op for plain buffered ASR.""" + return None + + async def stop_interim_transcription(self) -> None: + """No-op for plain buffered ASR.""" + return None def get_audio_buffer(self) -> bytes: """Get accumulated audio buffer.""" @@ -103,6 +121,7 @@ class MockASRService(BaseASRService): def __init__(self, sample_rate: int = 16000, language: str = "en"): super().__init__(sample_rate=sample_rate, language=language) + self.mode = "offline" self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue() self._mock_texts = [ "Hello, how are you?", @@ -145,3 +164,18 @@ class MockASRService(BaseASRService): continue except asyncio.CancelledError: break + + def clear_buffer(self) -> None: + return None + + async def get_final_transcription(self) -> str: + return "" + + def get_and_clear_text(self) -> str: + return "" + + async def 
start_interim_transcription(self) -> None: + return None + + async def stop_interim_transcription(self) -> None: + return None diff --git a/engine/providers/asr/dashscope.py b/engine/providers/asr/dashscope.py new file mode 100644 index 0000000..bed4ede --- /dev/null +++ b/engine/providers/asr/dashscope.py @@ -0,0 +1,388 @@ +"""DashScope realtime streaming ASR service. + +Uses Qwen-ASR-Realtime via DashScope Python SDK. +""" + +from __future__ import annotations + +import asyncio +import base64 +import json +import os +import sys +from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Optional + +from loguru import logger + +from providers.common.base import ASRResult, BaseASRService, ServiceState + +try: + import dashscope + from dashscope.audio.qwen_omni import MultiModality, OmniRealtimeCallback, OmniRealtimeConversation + + # Some SDK builds keep TranscriptionParams under qwen_omni.omni_realtime. + try: + from dashscope.audio.qwen_omni import TranscriptionParams + except ImportError: + from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams + + DASHSCOPE_SDK_AVAILABLE = True + DASHSCOPE_IMPORT_ERROR = "" +except Exception as exc: + DASHSCOPE_IMPORT_ERROR = f"{type(exc).__name__}: {exc}" + dashscope = None # type: ignore[assignment] + MultiModality = None # type: ignore[assignment] + OmniRealtimeConversation = None # type: ignore[assignment] + TranscriptionParams = None # type: ignore[assignment] + DASHSCOPE_SDK_AVAILABLE = False + + class OmniRealtimeCallback: # type: ignore[no-redef] + """Fallback callback base when DashScope SDK is unavailable.""" + + pass + + +class _DashScopeASRCallback(OmniRealtimeCallback): + """Bridge DashScope SDK callbacks into asyncio loop-safe handlers.""" + + def __init__(self, owner: "DashScopeRealtimeASRService", loop: asyncio.AbstractEventLoop): + super().__init__() + self._owner = owner + self._loop = loop + + def _schedule(self, fn: Callable[[], None]) -> None: + try: + 
self._loop.call_soon_threadsafe(fn) + except RuntimeError: + return + + def on_open(self) -> None: + self._schedule(self._owner._on_ws_open) + + def on_close(self, code: int, msg: str) -> None: + self._schedule(lambda: self._owner._on_ws_close(code, msg)) + + def on_event(self, message: Any) -> None: + self._schedule(lambda: self._owner._on_ws_event(message)) + + def on_error(self, message: Any) -> None: + self._schedule(lambda: self._owner._on_ws_error(message)) + + +class DashScopeRealtimeASRService(BaseASRService): + """Realtime streaming ASR implementation for DashScope Qwen-ASR-Realtime.""" + + DEFAULT_WS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + DEFAULT_MODEL = "qwen3-asr-flash-realtime" + DEFAULT_FINAL_TIMEOUT_MS = 800 + + def __init__( + self, + api_key: Optional[str] = None, + api_url: Optional[str] = None, + model: Optional[str] = None, + sample_rate: int = 16000, + language: str = "auto", + on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None, + ) -> None: + super().__init__(sample_rate=sample_rate, language=language) + self.mode = "streaming" + self.api_key = ( + api_key + or os.getenv("DASHSCOPE_API_KEY") + or os.getenv("ASR_API_KEY") + ) + self.api_url = api_url or os.getenv("DASHSCOPE_ASR_API_URL") or self.DEFAULT_WS_URL + self.model = model or os.getenv("DASHSCOPE_ASR_MODEL") or self.DEFAULT_MODEL + self.on_transcript = on_transcript + + self._client: Optional[Any] = None + self._loop: Optional[asyncio.AbstractEventLoop] = None + self._callback: Optional[_DashScopeASRCallback] = None + + self._running = False + self._session_ready = asyncio.Event() + self._transcript_queue: "asyncio.Queue[ASRResult]" = asyncio.Queue() + self._final_queue: "asyncio.Queue[str]" = asyncio.Queue() + + self._utterance_active = False + self._audio_sent_in_utterance = False + self._last_interim_text = "" + self._last_error: Optional[str] = None + + async def connect(self) -> None: + if not DASHSCOPE_SDK_AVAILABLE: + py_exec = 
sys.executable + hint = f"`{py_exec} -m pip install dashscope>=1.25.6`" + detail = f"; import error: {DASHSCOPE_IMPORT_ERROR}" if DASHSCOPE_IMPORT_ERROR else "" + raise RuntimeError( + f"dashscope SDK unavailable in interpreter {py_exec}; install with {hint}{detail}" + ) + if not self.api_key: + raise ValueError("DashScope ASR API key not provided. Configure agent.asr.api_key in YAML.") + + self._loop = asyncio.get_running_loop() + self._callback = _DashScopeASRCallback(owner=self, loop=self._loop) + + if dashscope is not None: + dashscope.api_key = self.api_key + + self._client = OmniRealtimeConversation( # type: ignore[misc] + model=self.model, + url=self.api_url, + callback=self._callback, + ) + await asyncio.to_thread(self._client.connect) + await self._configure_session() + + self._running = True + self.state = ServiceState.CONNECTED + logger.info( + "DashScope realtime ASR connected: model={}, sample_rate={}, language={}", + self.model, + self.sample_rate, + self.language, + ) + + async def disconnect(self) -> None: + self._running = False + self._utterance_active = False + self._audio_sent_in_utterance = False + self._drain_queue(self._final_queue) + self._drain_queue(self._transcript_queue) + self._session_ready.clear() + + if self._client is not None: + close_fn = getattr(self._client, "close", None) + if callable(close_fn): + await asyncio.to_thread(close_fn) + self._client = None + self.state = ServiceState.DISCONNECTED + logger.info("DashScope realtime ASR disconnected") + + async def begin_utterance(self) -> None: + self.clear_utterance() + self._utterance_active = True + + async def send_audio(self, audio: bytes) -> None: + if not self._client: + raise RuntimeError("DashScope ASR service not connected") + if not audio: + return + + if not self._utterance_active: + # Allow graceful fallback if caller sends before begin_utterance. 
+ self._utterance_active = True + + audio_b64 = base64.b64encode(audio).decode("ascii") + append_fn = getattr(self._client, "append_audio", None) + if not callable(append_fn): + raise RuntimeError("DashScope ASR SDK missing append_audio method") + await asyncio.to_thread(append_fn, audio_b64) + self._audio_sent_in_utterance = True + + async def end_utterance(self) -> None: + if not self._client: + return + if not self._utterance_active or not self._audio_sent_in_utterance: + return + + commit_fn = getattr(self._client, "commit", None) + if not callable(commit_fn): + raise RuntimeError("DashScope ASR SDK missing commit method") + await asyncio.to_thread(commit_fn) + self._utterance_active = False + + async def wait_for_final_transcription(self, timeout_ms: int = DEFAULT_FINAL_TIMEOUT_MS) -> str: + if not self._audio_sent_in_utterance: + return "" + timeout_sec = max(0.05, float(timeout_ms) / 1000.0) + try: + text = await asyncio.wait_for(self._final_queue.get(), timeout=timeout_sec) + return str(text or "").strip() + except asyncio.TimeoutError: + logger.debug("DashScope ASR final timeout ({}ms), fallback to last interim", timeout_ms) + return str(self._last_interim_text or "").strip() + + def clear_utterance(self) -> None: + self._utterance_active = False + self._audio_sent_in_utterance = False + self._last_interim_text = "" + self._last_error = None + self._drain_queue(self._final_queue) + + async def receive_transcripts(self) -> AsyncIterator[ASRResult]: + while self._running: + try: + result = await asyncio.wait_for(self._transcript_queue.get(), timeout=0.1) + yield result + except asyncio.TimeoutError: + continue + except asyncio.CancelledError: + break + + async def _configure_session(self) -> None: + if not self._client: + raise RuntimeError("DashScope ASR client is not initialized") + + text_modality: Any = "text" + if MultiModality is not None and hasattr(MultiModality, "TEXT"): + text_modality = MultiModality.TEXT + + transcription_params: Optional[Any] = 
None + if TranscriptionParams is not None: + try: + lang = "zh" if self.language == "auto" else self.language + transcription_params = TranscriptionParams( + language=lang, + sample_rate=self.sample_rate, + input_audio_format="pcm", + ) + except Exception as exc: + logger.debug("DashScope ASR TranscriptionParams init failed: {}", exc) + transcription_params = None + + update_attempts = [ + { + "output_modalities": [text_modality], + "enable_turn_detection": False, + "enable_input_audio_transcription": True, + "transcription_params": transcription_params, + }, + { + "output_modalities": [text_modality], + "enable_turn_detection": False, + "enable_input_audio_transcription": True, + }, + { + "output_modalities": [text_modality], + }, + ] + + update_fn = getattr(self._client, "update_session", None) + if not callable(update_fn): + raise RuntimeError("DashScope ASR SDK missing update_session method") + + last_error: Optional[Exception] = None + for params in update_attempts: + if params.get("transcription_params") is None: + params = {k: v for k, v in params.items() if k != "transcription_params"} + try: + await asyncio.to_thread(update_fn, **params) + break + except TypeError as exc: + last_error = exc + continue + except Exception as exc: + last_error = exc + continue + else: + raise RuntimeError(f"DashScope ASR session.update failed: {last_error}") + + try: + await asyncio.wait_for(self._session_ready.wait(), timeout=6.0) + except asyncio.TimeoutError: + logger.debug("DashScope ASR session ready wait timeout; continuing") + + def _on_ws_open(self) -> None: + return None + + def _on_ws_close(self, code: int, msg: str) -> None: + self._last_error = f"DashScope ASR websocket closed: {code} {msg}" + logger.debug(self._last_error) + + def _on_ws_error(self, message: Any) -> None: + self._last_error = str(message) + logger.error("DashScope ASR error: {}", self._last_error) + + def _on_ws_event(self, message: Any) -> None: + payload = self._coerce_event(message) + 
event_type = str(payload.get("type") or "").strip() + if not event_type: + return + + if event_type in {"session.created", "session.updated"}: + self._session_ready.set() + return + if event_type == "error" or event_type.endswith(".failed"): + err_text = self._extract_text(payload, keys=("message", "error", "details")) + self._last_error = err_text or event_type + logger.error("DashScope ASR server event error: {}", self._last_error) + return + + if event_type == "conversation.item.input_audio_transcription.text": + stash_text = self._extract_text(payload, keys=("stash", "text", "transcript")) + self._emit_transcript(stash_text, is_final=False) + return + + if event_type == "conversation.item.input_audio_transcription.completed": + final_text = self._extract_text(payload, keys=("transcript", "text", "stash")) + self._emit_transcript(final_text, is_final=True) + return + + def _emit_transcript(self, text: str, *, is_final: bool) -> None: + normalized = str(text or "").strip() + if not normalized: + return + if not is_final and normalized == self._last_interim_text: + return + if not is_final: + self._last_interim_text = normalized + + if self._loop is None: + return + try: + asyncio.run_coroutine_threadsafe( + self._publish_transcript(normalized, is_final=is_final), + self._loop, + ) + except RuntimeError: + return + + async def _publish_transcript(self, text: str, *, is_final: bool) -> None: + await self._transcript_queue.put(ASRResult(text=text, is_final=is_final)) + if is_final: + await self._final_queue.put(text) + if self.on_transcript: + try: + await self.on_transcript(text, is_final) + except Exception as exc: + logger.warning("DashScope ASR transcript callback failed: {}", exc) + + @staticmethod + def _coerce_event(message: Any) -> Dict[str, Any]: + if isinstance(message, dict): + return message + if isinstance(message, str): + try: + parsed = json.loads(message) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + return {"type": 
"raw", "text": message} + return {"type": "raw", "text": str(message)} + + def _extract_text(self, payload: Dict[str, Any], *, keys: tuple[str, ...]) -> str: + for key in keys: + value = payload.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + if isinstance(value, dict): + nested = self._extract_text(value, keys=keys) + if nested: + return nested + + for value in payload.values(): + if isinstance(value, dict): + nested = self._extract_text(value, keys=keys) + if nested: + return nested + return "" + + @staticmethod + def _drain_queue(queue: "asyncio.Queue[Any]") -> None: + while True: + try: + queue.get_nowait() + except asyncio.QueueEmpty: + break diff --git a/engine/services/openai_compatible_asr.py b/engine/providers/asr/openai_compatible.py similarity index 96% rename from engine/services/openai_compatible_asr.py rename to engine/providers/asr/openai_compatible.py index 7972189..6d90e95 100644 --- a/engine/services/openai_compatible_asr.py +++ b/engine/providers/asr/openai_compatible.py @@ -19,7 +19,7 @@ except ImportError: AIOHTTP_AVAILABLE = False logger.warning("aiohttp not available - OpenAICompatibleASRService will not work") -from services.base import BaseASRService, ASRResult, ServiceState +from providers.common.base import BaseASRService, ASRResult, ServiceState class OpenAICompatibleASRService(BaseASRService): @@ -53,6 +53,7 @@ class OpenAICompatibleASRService(BaseASRService): model: str = "FunAudioLLM/SenseVoiceSmall", sample_rate: int = 16000, language: str = "auto", + enable_interim: bool = False, interim_interval_ms: int = 500, # How often to send interim results min_audio_for_interim_ms: int = 300, # Min audio before first interim on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None @@ -66,19 +67,22 @@ class OpenAICompatibleASRService(BaseASRService): model: ASR model name or alias sample_rate: Audio sample rate (16000 recommended) language: Language code (auto for automatic detection) + 
enable_interim: Whether to generate interim transcriptions in offline mode interim_interval_ms: How often to generate interim transcriptions min_audio_for_interim_ms: Minimum audio duration before first interim on_transcript: Callback for transcription results (text, is_final) """ super().__init__(sample_rate=sample_rate, language=language) + self.mode = "offline" if not AIOHTTP_AVAILABLE: raise RuntimeError("aiohttp is required for OpenAICompatibleASRService") - self.api_key = api_key or os.getenv("ASR_API_KEY") or os.getenv("SILICONFLOW_API_KEY") + self.api_key = api_key raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL self.api_url = self._resolve_transcriptions_endpoint(raw_api_url) self.model = self.MODELS.get(model.lower(), model) + self.enable_interim = bool(enable_interim) self.interim_interval_ms = interim_interval_ms self.min_audio_for_interim_ms = min_audio_for_interim_ms self.on_transcript = on_transcript @@ -180,6 +184,9 @@ class OpenAICompatibleASRService(BaseASRService): if not self._session: logger.warning("ASR session not connected") return None + + if not is_final and not self.enable_interim: + return None # Check minimum audio duration audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000 @@ -309,6 +316,9 @@ class OpenAICompatibleASRService(BaseASRService): This periodically transcribes buffered audio for real-time feedback to the user. 
""" + if not self.enable_interim: + return + if self._interim_task and not self._interim_task.done(): return diff --git a/engine/services/siliconflow_asr.py b/engine/providers/asr/siliconflow.py similarity index 75% rename from engine/services/siliconflow_asr.py rename to engine/providers/asr/siliconflow.py index 2cb95dc..d0aeb50 100644 --- a/engine/services/siliconflow_asr.py +++ b/engine/providers/asr/siliconflow.py @@ -1,6 +1,6 @@ """Backward-compatible imports for legacy siliconflow_asr module.""" -from services.openai_compatible_asr import OpenAICompatibleASRService +from providers.asr.openai_compatible import OpenAICompatibleASRService # Backward-compatible alias SiliconFlowASRService = OpenAICompatibleASRService diff --git a/engine/providers/asr/volcengine.py b/engine/providers/asr/volcengine.py new file mode 100644 index 0000000..1f7c18e --- /dev/null +++ b/engine/providers/asr/volcengine.py @@ -0,0 +1,666 @@ +"""Volcengine realtime ASR service. + +Supports both: +- Volcengine Edge Gateway realtime transcription websocket, and +- Volcengine BigASR Seed websocket at openspeech.bytedance.com/api/v3/sauc/bigmodel. 
VolcengineASRProtocol = Literal["gateway", "seed"]


class VolcengineRealtimeASRService(BaseASRService):
    """Realtime streaming ASR backed by Volcengine websocket APIs.

    The wire protocol is picked from the endpoint URL (see
    ``_detect_protocol``):

    * ``"gateway"`` - JSON events over one websocket that is opened in
      ``connect()`` and reused for every utterance.
    * ``"seed"`` - binary, gzip-compressed frames; a fresh websocket is
      opened by ``begin_utterance()`` and closed again once the final
      transcript has been awaited.

    Transcripts are delivered three ways: through ``receive_transcripts()``,
    through ``wait_for_final_transcription()`` and, if supplied, through the
    ``on_transcript`` coroutine callback.
    """

    DEFAULT_WS_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
    DEFAULT_GATEWAY_WS_URL = "wss://ai-gateway.vei.volces.com/v1/realtime"
    DEFAULT_MODEL = "bigmodel"
    DEFAULT_FINAL_TIMEOUT_MS = 1200
    DEFAULT_SEED_RESOURCE_ID = "volc.bigasr.sauc.duration"
    # Audio is re-chunked into frames of this duration before being sent.
    _SEED_FRAME_MS = 100
    # 4-bit fields packed into the 4-byte binary frame header.
    _SEED_PROTOCOL_VERSION = 0b0001
    _SEED_FULL_CLIENT_REQUEST = 0b0001
    _SEED_AUDIO_ONLY_REQUEST = 0b0010
    _SEED_FULL_SERVER_RESPONSE = 0b1001
    _SEED_SERVER_ACK = 0b1011
    _SEED_SERVER_ERROR_RESPONSE = 0b1111
    _SEED_NO_SEQUENCE = 0b0000
    _SEED_POS_SEQUENCE = 0b0001
    _SEED_NEG_WITH_SEQUENCE = 0b0011
    _SEED_NO_SERIALIZATION = 0b0000
    _SEED_JSON = 0b0001
    _SEED_NO_COMPRESSION = 0b0000
    _SEED_GZIP = 0b0001

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        model: Optional[str] = None,
        sample_rate: int = 16000,
        language: str = "auto",
        app_id: Optional[str] = None,
        resource_id: Optional[str] = None,
        uid: Optional[str] = None,
        request_params: Optional[Dict[str, Any]] = None,
        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None,
    ) -> None:
        """Resolve configuration; no network I/O happens here.

        Every argument left as ``None`` falls back to an environment
        variable (``VOLCENGINE_ASR_*`` first, then the generic ``ASR_*``
        name where one exists) and finally to a class default.

        Args:
            api_key: Access key / bearer token.
            api_url: Websocket endpoint; also decides the protocol.
            model: Model name (gateway query parameter / seed ``model_name``).
            sample_rate: PCM sample rate in Hz, forwarded to the base class.
            language: Language hint; ``"auto"`` omits it from seed requests.
            app_id: Application id, required by the seed protocol.
            resource_id: Value of the ``X-Api-Resource-Id`` header.
            uid: User id placed in the seed start payload.
            request_params: Extra seed request options merged into the
                start payload.
            on_transcript: Coroutine function called with
                ``(text, is_final)`` for every transcript.
        """
        super().__init__(sample_rate=sample_rate, language=language)
        self.mode = "streaming"
        self.api_key = api_key or os.getenv("VOLCENGINE_ASR_API_KEY") or os.getenv("ASR_API_KEY")
        self.model = str(model or os.getenv("VOLCENGINE_ASR_MODEL") or self.DEFAULT_MODEL).strip()
        raw_api_url = api_url or os.getenv("VOLCENGINE_ASR_API_URL") or self.DEFAULT_WS_URL
        self.protocol = self._detect_protocol(raw_api_url)
        self.api_url = self._resolve_api_url(raw_api_url, self.model, self.protocol)
        self.app_id = app_id or os.getenv("VOLCENGINE_ASR_APP_ID") or os.getenv("ASR_APP_ID")
        self.resource_id = (
            resource_id
            or os.getenv("VOLCENGINE_ASR_RESOURCE_ID")
            or (self.DEFAULT_SEED_RESOURCE_ID if self.protocol == "seed" else None)
        )
        self.uid = uid or os.getenv("VOLCENGINE_ASR_UID")
        self.request_params = self._load_request_params(request_params)
        self.on_transcript = on_transcript

        self._session: Optional[aiohttp.ClientSession] = None
        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
        self._reader_task: Optional[asyncio.Task[None]] = None

        self._running = False
        self._session_ready = asyncio.Event()
        self._transcript_queue: "asyncio.Queue[ASRResult]" = asyncio.Queue()
        self._final_queue: "asyncio.Queue[str]" = asyncio.Queue()
        # Strong references to in-flight ``on_transcript`` tasks; the event
        # loop itself only keeps weak references to tasks.
        self._callback_tasks: "set[asyncio.Task[Any]]" = set()

        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._last_interim_text = ""
        self._last_error: Optional[str] = None

        self._seed_audio_buffer = bytearray()
        self._seed_sequence = 1
        self._seed_request_id: Optional[str] = None
        # Whole 16-bit samples per frame: truncate the sample count first and
        # only then multiply by 2 so a frame never ends in half a sample.
        self._seed_frame_bytes = max(2, int(self.sample_rate * self._SEED_FRAME_MS / 1000) * 2)

    @classmethod
    def _detect_protocol(cls, api_url: str) -> VolcengineASRProtocol:
        """Return ``"seed"`` for the openspeech BigASR endpoint, else ``"gateway"``."""
        parsed = urlparse(str(api_url or "").strip())
        host = parsed.netloc.lower()
        path = parsed.path.lower()
        if "openspeech.bytedance.com" in host and "/api/v3/sauc/bigmodel" in path:
            return "seed"
        return "gateway"

    @classmethod
    def _resolve_api_url(cls, api_url: str, model: str, protocol: VolcengineASRProtocol) -> str:
        """Return the URL to connect to.

        An empty URL falls back to the protocol's default endpoint. Gateway
        URLs additionally get a ``model`` query parameter unless one is
        already present; seed URLs are returned untouched.
        """
        raw = str(api_url or "").strip()
        if not raw:
            raw = cls.DEFAULT_WS_URL if protocol == "seed" else cls.DEFAULT_GATEWAY_WS_URL
        if protocol != "gateway":
            return raw

        parsed = urlparse(raw)
        query = dict(parse_qsl(parsed.query, keep_blank_values=True))
        query.setdefault("model", model or cls.DEFAULT_MODEL)
        return urlunparse(parsed._replace(query=urlencode(query)))

    @staticmethod
    def _load_request_params(request_params: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Return a copy of *request_params*, or parse them from the environment.

        When no dict is passed, ``VOLCENGINE_ASR_REQUEST_PARAMS_JSON`` is
        parsed; invalid JSON or a non-object value yields ``{}``.
        """
        if isinstance(request_params, dict):
            return dict(request_params)
        raw = os.getenv("VOLCENGINE_ASR_REQUEST_PARAMS_JSON", "").strip()
        if not raw:
            return {}
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("Ignoring invalid VOLCENGINE_ASR_REQUEST_PARAMS_JSON")
            return {}
        if isinstance(parsed, dict):
            return parsed
        return {}

    async def connect(self) -> None:
        """Create the HTTP session and, for the gateway protocol, the websocket.

        Raises:
            ValueError: If the API key (or, for seed, the app id) is missing.
            RuntimeError: If the gateway never confirms the session update.
        """
        # Validate everything *before* allocating the session so a
        # configuration error cannot leak an open ClientSession.
        if not self.api_key:
            raise ValueError("Volcengine ASR API key not provided. Configure agent.asr.api_key in YAML.")
        if self.protocol != "gateway" and not self.app_id:
            raise ValueError("Volcengine ASR app_id not provided. Configure agent.asr.app_id in YAML.")

        timeout = aiohttp.ClientTimeout(total=None, sock_read=None, sock_connect=15)
        self._session = aiohttp.ClientSession(timeout=timeout)
        self._running = True

        if self.protocol == "gateway":
            try:
                await self._connect_gateway()
            except BaseException:
                # Handshake failed, timed out or was cancelled: release the
                # websocket, reader task and session, then propagate.
                self._running = False
                self._session_ready.clear()
                await self._close_ws()
                session, self._session = self._session, None
                if session is not None:
                    await session.close()
                raise
            logger.info(
                "Volcengine gateway ASR connected: model={}, sample_rate={}, url={}",
                self.model,
                self.sample_rate,
                self.api_url,
            )
        else:
            # Seed opens its websocket lazily, once per utterance.
            logger.info(
                "Volcengine BigASR Seed ready: model={}, sample_rate={}, resource_id={}",
                self.model,
                self.sample_rate,
                self.resource_id,
            )

        self.state = ServiceState.CONNECTED

    async def disconnect(self) -> None:
        """Stop streaming, drop buffered data and close websocket and session."""
        self._running = False
        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._session_ready.clear()
        self._seed_audio_buffer = bytearray()
        self._drain_queue(self._final_queue)
        self._drain_queue(self._transcript_queue)

        await self._close_ws()

        if self._session is not None:
            await self._session.close()
            self._session = None

        self.state = ServiceState.DISCONNECTED
        logger.info("Volcengine ASR disconnected")

    async def begin_utterance(self) -> None:
        """Reset per-utterance state and (seed only) open a new stream."""
        self.clear_utterance()
        if self.protocol == "seed":
            await self._open_seed_stream()
        self._utterance_active = True

    async def send_audio(self, audio: bytes) -> None:
        """Forward a chunk of PCM audio; empty chunks are ignored.

        Raises:
            RuntimeError: If the required websocket is not connected.
        """
        if not audio:
            return

        if self.protocol == "seed":
            await self._send_seed_audio(audio)
            return

        if not self._ws:
            raise RuntimeError("Volcengine ASR websocket is not connected")
        if not self._utterance_active:
            self._utterance_active = True

        await self._ws.send_json(
            {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio).decode("ascii"),
            }
        )
        self._audio_sent_in_utterance = True

    async def end_utterance(self) -> None:
        """Tell the server the current utterance is complete."""
        if not self._utterance_active:
            return

        if self.protocol == "seed":
            await self._end_seed_utterance()
            return

        if not self._ws or not self._audio_sent_in_utterance:
            # Nothing to commit, but the utterance is over all the same.
            self._utterance_active = False
            return
        await self._ws.send_json({"type": "input_audio_buffer.commit"})
        self._utterance_active = False

    async def wait_for_final_transcription(self, timeout_ms: int = DEFAULT_FINAL_TIMEOUT_MS) -> str:
        """Wait for the final transcript of the current utterance.

        Returns ``""`` immediately when no audio was sent. On timeout the
        most recent interim text is returned instead. For the seed protocol
        the per-utterance websocket is closed afterwards.
        """
        if not self._audio_sent_in_utterance:
            return ""

        timeout_sec = max(0.05, float(timeout_ms) / 1000.0)
        try:
            return str(await asyncio.wait_for(self._final_queue.get(), timeout=timeout_sec) or "").strip()
        except asyncio.TimeoutError:
            logger.debug("Volcengine ASR final timeout ({}ms), fallback to last interim", timeout_ms)
            return str(self._last_interim_text or "").strip()
        finally:
            if self.protocol == "seed":
                await self._close_ws()

    def clear_utterance(self) -> None:
        """Reset all per-utterance flags, buffers and pending final results."""
        self._utterance_active = False
        self._audio_sent_in_utterance = False
        self._last_interim_text = ""
        self._last_error = None
        self._seed_audio_buffer = bytearray()
        self._seed_sequence = 1
        self._seed_request_id = None
        self._drain_queue(self._final_queue)

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield transcripts as they arrive until the service stops running."""
        while self._running:
            try:
                # Short timeout so the loop re-checks ``_running`` regularly.
                yield await asyncio.wait_for(self._transcript_queue.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

    async def _connect_gateway(self) -> None:
        """Open the gateway websocket, start the reader and configure the session."""
        assert self._session is not None
        headers = {"Authorization": f"Bearer {self.api_key}"}
        if self.resource_id:
            headers["X-Api-Resource-Id"] = self.resource_id

        self._ws = await self._session.ws_connect(self.api_url, headers=headers, heartbeat=20)
        self._reader_task = asyncio.create_task(self._reader_loop())
        await self._configure_gateway_session()

    async def _configure_gateway_session(self) -> None:
        """Send the session update and wait (up to 8 s) for its confirmation."""
        if not self._ws:
            raise RuntimeError("Volcengine ASR websocket is not initialized")

        session_payload: Dict[str, Any] = {
            "input_audio_format": "pcm",
            "input_audio_codec": "raw",
            "input_audio_sample_rate": self.sample_rate,
            "input_audio_bits": 16,
            "input_audio_channel": 1,
            "result_type": 0,
            "input_audio_transcription": {
                "model": self.model,
            },
        }

        await self._ws.send_json(
            {
                "type": "transcription_session.update",
                "session": session_payload,
            }
        )

        try:
            # ``_session_ready`` is set by the reader loop.
            await asyncio.wait_for(self._session_ready.wait(), timeout=8.0)
        except asyncio.TimeoutError as exc:
            raise RuntimeError("Volcengine ASR session update timeout") from exc

    async def _open_seed_stream(self) -> None:
        """Open a fresh seed websocket and send the start request."""
        if not self._session:
            raise RuntimeError("Volcengine ASR session is not initialized")

        await self._close_ws()
        self._seed_request_id = uuid.uuid4().hex
        headers = self._build_seed_headers(self._seed_request_id)
        self._ws = await self._session.ws_connect(
            self.api_url,
            headers=headers,
            heartbeat=20,
            max_msg_size=1_000_000_000,
        )
        self._reader_task = asyncio.create_task(self._reader_loop())
        await self._ws.send_bytes(self._build_seed_start_request())

    async def _send_seed_audio(self, audio: bytes) -> None:
        """Buffer *audio* and send every complete frame that is available."""
        if not self._utterance_active:
            await self.begin_utterance()
        if not self._ws:
            raise RuntimeError("Volcengine BigASR websocket is not connected")

        self._seed_audio_buffer.extend(audio)
        while len(self._seed_audio_buffer) >= self._seed_frame_bytes:
            chunk = bytes(self._seed_audio_buffer[: self._seed_frame_bytes])
            del self._seed_audio_buffer[: self._seed_frame_bytes]
            # Sequence 1 is used by the start request; audio starts at 2.
            self._seed_sequence += 1
            await self._ws.send_bytes(self._build_seed_audio_request(chunk, sequence=self._seed_sequence))
            self._audio_sent_in_utterance = True

    async def _end_seed_utterance(self) -> None:
        """Flush the remaining buffered audio as the last (negative-sequence) frame."""
        if not self._ws:
            # The stream is already gone; there is nothing left to finish.
            self._utterance_active = False
            return
        if not self._audio_sent_in_utterance and not self._seed_audio_buffer:
            self._utterance_active = False
            return

        final_chunk = bytes(self._seed_audio_buffer)
        self._seed_audio_buffer = bytearray()
        self._seed_sequence += 1
        await self._ws.send_bytes(
            self._build_seed_audio_request(final_chunk, sequence=-self._seed_sequence, is_last=True)
        )
        self._audio_sent_in_utterance = True
        self._utterance_active = False

    async def _close_ws(self) -> None:
        """Cancel the reader task and close the websocket, if any."""
        reader_task = self._reader_task
        ws = self._ws
        # Detach first so concurrent callers see a closed service.
        self._reader_task = None
        self._ws = None

        if reader_task:
            reader_task.cancel()
            try:
                await reader_task
            except asyncio.CancelledError:
                pass

        if ws is not None:
            await ws.close()

    async def _reader_loop(self) -> None:
        """Dispatch incoming websocket messages until the socket closes."""
        ws = self._ws
        if ws is None:
            return

        try:
            async for msg in ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    if self.protocol == "gateway":
                        self._handle_gateway_event(msg.data)
                    else:
                        self._handle_seed_text(msg.data)
                    continue
                if msg.type == aiohttp.WSMsgType.BINARY:
                    if self.protocol == "seed":
                        self._handle_seed_binary(msg.data)
                    continue
                if msg.type == aiohttp.WSMsgType.ERROR:
                    self._last_error = str(ws.exception())
                    logger.error("Volcengine ASR websocket error: {}", self._last_error)
                    break
                if msg.type in {aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE}:
                    break
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            self._last_error = str(exc)
            logger.error("Volcengine ASR reader loop failed: {}", exc)
        finally:
            # Only forget the socket if nobody replaced it in the meantime.
            if self._ws is ws:
                self._ws = None

    def _handle_gateway_event(self, message: str) -> None:
        """Interpret one JSON event received on the gateway websocket."""
        payload = self._coerce_event(message)
        event_type = str(payload.get("type") or "").strip()
        if not event_type:
            return

        if event_type in {"transcription_session.created", "transcription_session.updated"}:
            self._session_ready.set()
            return

        if event_type == "error":
            self._last_error = self._extract_text(payload, ("message", "error"))
            logger.error("Volcengine ASR server error: {}", self._last_error or "unknown")
            return

        if event_type.endswith(".failed"):
            self._last_error = self._extract_text(payload, ("message", "error", "transcript"))
            logger.error("Volcengine ASR failed event: {}", self._last_error or event_type)
            return

        if event_type == "conversation.item.input_audio_transcription.result":
            transcript = self._extract_text(payload, ("transcript", "result"))
            self._emit_transcript_sync(transcript, is_final=False)
            return

        if event_type == "conversation.item.input_audio_transcription.delta":
            transcript = self._extract_text(payload, ("delta",))
            self._emit_transcript_sync(transcript, is_final=False)
            return

        if event_type == "conversation.item.input_audio_transcription.completed":
            transcript = self._extract_text(payload, ("transcript", "result"))
            self._emit_transcript_sync(transcript, is_final=True)

    def _handle_seed_text(self, message: str) -> None:
        """Record an error reported as a text frame on the seed websocket."""
        payload = self._coerce_event(message)
        if payload.get("type") == "error":
            self._last_error = self._extract_text(payload, ("message", "error"))
            logger.error("Volcengine BigASR error: {}", self._last_error or "unknown")

    def _handle_seed_binary(self, message: bytes) -> None:
        """Decode one binary seed frame and emit its transcript, if any."""
        try:
            payload = self._parse_seed_response(message)
        except Exception as exc:
            # One truncated or corrupt frame must not take down the reader
            # loop (and with it the rest of the utterance); drop the frame.
            logger.warning("Volcengine BigASR: dropping undecodable frame: {}", exc)
            return

        if payload.get("code"):
            self._last_error = self._extract_text(payload, ("payload_msg",))
            logger.error("Volcengine BigASR server error: {}", self._last_error or payload["code"])
            return

        body = payload.get("payload_msg")
        if not isinstance(body, dict):
            return
        result = body.get("result")
        if not isinstance(result, dict):
            return

        text = str(result.get("text") or "").strip()
        if not text:
            return

        utterances = result.get("utterances")
        if not isinstance(utterances, list) or not utterances:
            return
        # Finality is taken from the first utterance's ``definite`` flag.
        first_utterance = utterances[0] if isinstance(utterances[0], dict) else {}
        is_final = self._coerce_bool(first_utterance.get("definite")) is True
        self._emit_transcript_sync(text, is_final=is_final)

    def _emit_transcript_sync(self, text: str, *, is_final: bool) -> None:
        """Publish a transcript to the queues and schedule the callback.

        Blank text is ignored. Must run inside the event loop because the
        ``on_transcript`` coroutine is scheduled as a task.
        """
        cleaned = str(text or "").strip()
        if not cleaned:
            return

        if not is_final:
            self._last_interim_text = cleaned
        else:
            self._last_interim_text = ""

        result = ASRResult(text=cleaned, is_final=is_final)
        try:
            self._transcript_queue.put_nowait(result)
        except asyncio.QueueFull:
            logger.debug("Volcengine ASR transcript queue full; dropping transcript")

        if is_final:
            try:
                self._final_queue.put_nowait(cleaned)
            except asyncio.QueueFull:
                logger.debug("Volcengine ASR final queue full; dropping transcript")

        if self.on_transcript:
            task = asyncio.create_task(self.on_transcript(cleaned, is_final))
            # Keep a strong reference until the task is done; otherwise it
            # may be garbage-collected before it finishes.
            self._callback_tasks.add(task)
            task.add_done_callback(self._on_callback_task_done)

    def _on_callback_task_done(self, task: "asyncio.Task[Any]") -> None:
        """Release a finished callback task and surface its exception."""
        self._callback_tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error("Volcengine ASR transcript callback failed: {}", exc)

    def _build_seed_headers(self, request_id: str) -> Dict[str, str]:
        """Return the HTTP headers for the seed websocket handshake.

        Raises:
            ValueError: If the app id or API key is missing.
        """
        if not self.app_id:
            raise ValueError("Volcengine ASR app_id not provided. Configure agent.asr.app_id in YAML.")
        if not self.api_key:
            raise ValueError("Volcengine ASR api_key not provided. Configure agent.asr.api_key in YAML.")

        return {
            "X-Api-App-Key": str(self.app_id),
            "X-Api-Access-Key": str(self.api_key),
            "X-Api-Resource-Id": str(self.resource_id or self.DEFAULT_SEED_RESOURCE_ID),
            "X-Api-Request-Id": str(request_id),
        }

    def _build_seed_start_payload(self) -> Dict[str, Any]:
        """Build the JSON body of the seed start request.

        ``request_params`` may contain ``user``, ``audio`` and ``request``
        sub-dicts that override the matching section; every other key is
        merged into the ``request`` section.
        """
        user_payload: Dict[str, Any] = {"uid": str(self.uid or self._seed_request_id or self.app_id or uuid.uuid4().hex)}
        audio_payload: Dict[str, Any] = {
            "format": "pcm",
            "rate": self.sample_rate,
            "bits": 16,
            "channels": 1,
            "codec": "raw",
        }
        if self.language and self.language != "auto":
            audio_payload["language"] = self.language

        request_payload: Dict[str, Any] = {
            "model_name": self.model or self.DEFAULT_MODEL,
            "enable_itn": False,
            "enable_punc": True,
            "enable_ddc": False,
            "show_utterance": True,
            "result_type": "single",
            "vad_segment_duration": 3000,
            "end_window_size": 500,
            "force_to_speech_time": 1000,
        }

        extra = dict(self.request_params)
        user_payload.update(self._as_dict(extra.pop("user", None)))
        audio_payload.update(self._as_dict(extra.pop("audio", None)))
        request_payload.update(self._as_dict(extra.pop("request", None)))
        request_payload.update(extra)

        return {
            "user": user_payload,
            "audio": audio_payload,
            "request": request_payload,
        }

    def _build_seed_start_request(self) -> bytes:
        """Return the framed start request: header, sequence 1, size, gzip JSON."""
        payload = gzip.compress(json.dumps(self._build_seed_start_payload()).encode("utf-8"))
        frame = bytearray(
            self._build_seed_header(
                message_type=self._SEED_FULL_CLIENT_REQUEST,
                message_type_specific_flags=self._SEED_POS_SEQUENCE,
            )
        )
        frame.extend((1).to_bytes(4, "big", signed=True))
        frame.extend(len(payload).to_bytes(4, "big"))
        frame.extend(payload)
        return bytes(frame)

    def _build_seed_audio_request(self, chunk: bytes, *, sequence: int, is_last: bool = False) -> bytes:
        """Return a framed audio request: header, signed sequence, size, gzip audio.

        The caller passes a negative *sequence* together with ``is_last``
        for the closing frame.
        """
        payload = gzip.compress(chunk)
        frame = bytearray(
            self._build_seed_header(
                message_type=self._SEED_AUDIO_ONLY_REQUEST,
                message_type_specific_flags=self._SEED_NEG_WITH_SEQUENCE if is_last else self._SEED_POS_SEQUENCE,
            )
        )
        frame.extend(int(sequence).to_bytes(4, "big", signed=True))
        frame.extend(len(payload).to_bytes(4, "big"))
        frame.extend(payload)
        return bytes(frame)

    @classmethod
    def _build_seed_header(
        cls,
        *,
        message_type: int,
        message_type_specific_flags: int,
        serial_method: int = _SEED_JSON,
        compression_type: int = _SEED_GZIP,
        reserved_data: int = 0x00,
    ) -> bytes:
        """Pack the 4-byte frame header; each byte holds two 4-bit fields."""
        header = bytearray()
        # Low nibble is the header size in 4-byte words (always 1 here).
        header.append((cls._SEED_PROTOCOL_VERSION << 4) | 0b0001)
        header.append((message_type << 4) | message_type_specific_flags)
        header.append((serial_method << 4) | compression_type)
        header.append(reserved_data)
        return bytes(header)

    @classmethod
    def _parse_seed_response(cls, response: bytes) -> Dict[str, Any]:
        """Decode one binary server frame into a dict.

        The result always contains ``is_last_package`` and, depending on
        the frame, ``payload_sequence``, ``seq``, ``code``, ``payload_size``
        and ``payload_msg`` (decoded according to the header's compression
        and serialization fields). An empty payload yields no
        ``payload_msg``.

        Raises:
            ValueError: If the frame is shorter than its header, or the
                payload is not valid UTF-8 / JSON.
        """
        if len(response) < 4:
            raise ValueError("Volcengine BigASR frame is shorter than the 4-byte header")

        header_size = response[0] & 0x0F
        message_type = response[1] >> 4
        message_type_specific_flags = response[1] & 0x0F
        serialization_method = response[2] >> 4
        compression_type = response[2] & 0x0F
        payload = response[header_size * 4 :]

        result: Dict[str, Any] = {"is_last_package": False}
        payload_message: Any = None

        if message_type_specific_flags & 0x01:
            result["payload_sequence"] = int.from_bytes(payload[:4], "big", signed=True)
            payload = payload[4:]

        if message_type_specific_flags & 0x02:
            result["is_last_package"] = True

        if message_type == cls._SEED_FULL_SERVER_RESPONSE:
            result["payload_size"] = int.from_bytes(payload[:4], "big", signed=True)
            payload_message = payload[4:]
        elif message_type == cls._SEED_SERVER_ACK:
            result["seq"] = int.from_bytes(payload[:4], "big", signed=True)
            if len(payload) >= 8:
                result["payload_size"] = int.from_bytes(payload[4:8], "big", signed=False)
                payload_message = payload[8:]
        elif message_type == cls._SEED_SERVER_ERROR_RESPONSE:
            result["code"] = int.from_bytes(payload[:4], "big", signed=False)
            result["payload_size"] = int.from_bytes(payload[4:8], "big", signed=False)
            payload_message = payload[8:]

        # Nothing to decode; decompressing / JSON-parsing empty bytes would
        # only raise.
        if not payload_message:
            return result
        if compression_type == cls._SEED_GZIP:
            payload_message = gzip.decompress(payload_message)
        if serialization_method == cls._SEED_JSON:
            payload_message = json.loads(payload_message.decode("utf-8"))
        elif serialization_method != cls._SEED_NO_SERIALIZATION:
            payload_message = payload_message.decode("utf-8")

        result["payload_msg"] = payload_message
        return result

    @staticmethod
    def _coerce_event(message: Any) -> Dict[str, Any]:
        """Return *message* as an event dict.

        Dicts pass through, JSON object strings are parsed, and anything
        else is wrapped as ``{"type": "raw", "message": ...}``.
        """
        if isinstance(message, dict):
            return message
        if isinstance(message, str):
            try:
                loaded = json.loads(message)
                if isinstance(loaded, dict):
                    return loaded
            except json.JSONDecodeError:
                return {"type": "raw", "message": message}
        return {"type": "raw", "message": str(message)}

    @staticmethod
    def _extract_text(payload: Dict[str, Any], keys: tuple[str, ...]) -> str:
        """Return the first non-blank string found under *keys*.

        A dict value is searched one level deep under a fixed list of
        well-known text keys. Returns ``""`` when nothing matches.
        """
        for key in keys:
            value = payload.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
            if isinstance(value, dict):
                for nested_key in ("message", "text", "transcript", "result", "delta"):
                    nested = value.get(nested_key)
                    if isinstance(nested, str) and nested.strip():
                        return nested.strip()
        return ""

    @staticmethod
    def _coerce_bool(value: Any) -> Optional[bool]:
        """Interpret bools, numbers and common boolean words; else ``None``."""
        if isinstance(value, bool):
            return value
        if isinstance(value, (int, float)):
            return bool(value)
        if isinstance(value, str):
            normalized = value.strip().lower()
            if normalized in {"1", "true", "yes", "on"}:
                return True
            if normalized in {"0", "false", "no", "off"}:
                return False
        return None

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return a shallow copy of *value* if it is a dict, else ``{}``."""
        if isinstance(value, dict):
            return dict(value)
        return {}

    @staticmethod
    def _drain_queue(queue: "asyncio.Queue[Any]") -> None:
        """Discard every item currently waiting in *queue*."""
        while True:
            try:
                queue.get_nowait()
            except asyncio.QueueEmpty:
                break
_OPENAI_COMPATIBLE_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
_DASHSCOPE_PROVIDERS = {"dashscope"}
_VOLCENGINE_PROVIDERS = {"volcengine"}
_SUPPORTED_LLM_PROVIDERS = {"openai", "fastgpt", *_OPENAI_COMPATIBLE_PROVIDERS}


class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
    """Turn normalized service specs into concrete LLM / TTS / ASR services.

    Each ``create_*`` method matches the spec's provider name (trimmed and
    lower-cased) against the known provider groups and falls back to a mock
    or buffered implementation when the provider is unknown or no API key
    was supplied.
    """

    _DEFAULT_DASHSCOPE_TTS_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    _DEFAULT_DASHSCOPE_TTS_MODEL = "qwen3-tts-flash-realtime"
    _DEFAULT_DASHSCOPE_ASR_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    _DEFAULT_DASHSCOPE_ASR_MODEL = "qwen3-asr-flash-realtime"
    _DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
    _DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
    _DEFAULT_VOLCENGINE_TTS_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
    _DEFAULT_VOLCENGINE_TTS_RESOURCE_ID = "seed-tts-2.0"
    _DEFAULT_VOLCENGINE_ASR_REALTIME_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
    _DEFAULT_VOLCENGINE_ASR_MODEL = "bigmodel"

    @staticmethod
    def _normalize_provider(provider: Any) -> str:
        """Return *provider* as a trimmed, lower-case string (``""`` if falsy)."""
        text = str(provider or "")
        return text.strip().lower()

    @staticmethod
    def _resolve_dashscope_mode(raw_mode: Any) -> str:
        """Return ``"commit"`` or ``"server_commit"``; anything else maps to ``"commit"``."""
        candidate = str(raw_mode or "commit").strip().lower()
        return candidate if candidate in {"commit", "server_commit"} else "commit"

    def create_llm_service(self, spec: LLMServiceSpec) -> LLMPort:
        """Build the LLM service for *spec*, or a mock when it cannot be served."""
        vendor = self._normalize_provider(spec.provider)

        if vendor == "fastgpt":
            # FastGPT needs both credentials; otherwise fall through to the mock.
            if spec.api_key and spec.base_url:
                # Imported lazily: the provider depends on an optional SDK.
                from providers.llm.fastgpt import FastGPTLLMService

                return FastGPTLLMService(
                    api_key=spec.api_key,
                    base_url=spec.base_url,
                    app_id=spec.app_id,
                    model=spec.model,
                    system_prompt=spec.system_prompt,
                )
        elif vendor in _SUPPORTED_LLM_PROVIDERS and spec.api_key:
            return OpenAILLMService(
                api_key=spec.api_key,
                base_url=spec.base_url,
                model=spec.model,
                system_prompt=spec.system_prompt,
                knowledge_config=spec.knowledge_config,
                knowledge_searcher=spec.knowledge_searcher,
            )

        logger.warning(
            "LLM provider unsupported or API key missing (provider={}); using mock LLM",
            vendor or "-",
        )
        return MockLLMService()

    def create_tts_service(self, spec: TTSServiceSpec) -> TTSPort:
        """Build the TTS service for *spec*, or a mock when it cannot be served."""
        vendor = self._normalize_provider(spec.provider)

        # Every real backend requires an API key, so test it once up front.
        if spec.api_key:
            if vendor == "dashscope":
                endpoint = spec.api_url or self._DEFAULT_DASHSCOPE_TTS_REALTIME_URL
                model_name = spec.model or self._DEFAULT_DASHSCOPE_TTS_MODEL
                return DashScopeTTSService(
                    api_key=spec.api_key,
                    api_url=endpoint,
                    voice=spec.voice,
                    model=model_name,
                    mode=self._resolve_dashscope_mode(spec.mode),
                    sample_rate=spec.sample_rate,
                    speed=spec.speed,
                )

            if vendor in _VOLCENGINE_PROVIDERS:
                endpoint = spec.api_url or self._DEFAULT_VOLCENGINE_TTS_URL
                resource = spec.resource_id or self._DEFAULT_VOLCENGINE_TTS_RESOURCE_ID
                return VolcengineTTSService(
                    api_key=spec.api_key,
                    api_url=endpoint,
                    voice=spec.voice,
                    model=spec.model,
                    app_id=spec.app_id,
                    resource_id=resource,
                    uid=spec.uid,
                    sample_rate=spec.sample_rate,
                    speed=spec.speed,
                )

            if vendor in _OPENAI_COMPATIBLE_PROVIDERS:
                model_name = spec.model or self._DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL
                return OpenAICompatibleTTSService(
                    api_key=spec.api_key,
                    api_url=spec.api_url,
                    voice=spec.voice,
                    model=model_name,
                    sample_rate=spec.sample_rate,
                    speed=spec.speed,
                )

        logger.warning(
            "TTS provider unsupported or API key missing (provider={}); using mock TTS",
            vendor or "-",
        )
        return MockTTSService(sample_rate=spec.sample_rate)

    def create_asr_service(self, spec: ASRServiceSpec) -> ASRPort:
        """Build the ASR service for *spec*, or the buffered fallback."""
        vendor = self._normalize_provider(spec.provider)

        # Every streaming backend requires an API key, so test it once up front.
        if spec.api_key:
            if vendor in _DASHSCOPE_PROVIDERS:
                endpoint = spec.api_url or self._DEFAULT_DASHSCOPE_ASR_REALTIME_URL
                model_name = spec.model or self._DEFAULT_DASHSCOPE_ASR_MODEL
                return DashScopeRealtimeASRService(
                    api_key=spec.api_key,
                    api_url=endpoint,
                    model=model_name,
                    sample_rate=spec.sample_rate,
                    language=spec.language,
                    on_transcript=spec.on_transcript,
                )

            if vendor in _VOLCENGINE_PROVIDERS:
                endpoint = spec.api_url or self._DEFAULT_VOLCENGINE_ASR_REALTIME_URL
                model_name = spec.model or self._DEFAULT_VOLCENGINE_ASR_MODEL
                return VolcengineRealtimeASRService(
                    api_key=spec.api_key,
                    api_url=endpoint,
                    model=model_name,
                    sample_rate=spec.sample_rate,
                    language=spec.language,
                    app_id=spec.app_id,
                    resource_id=spec.resource_id,
                    uid=spec.uid,
                    request_params=spec.request_params,
                    on_transcript=spec.on_transcript,
                )

            if vendor in _OPENAI_COMPATIBLE_PROVIDERS:
                model_name = spec.model or self._DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL
                return OpenAICompatibleASRService(
                    api_key=spec.api_key,
                    api_url=spec.api_url,
                    model=model_name,
                    sample_rate=spec.sample_rate,
                    language=spec.language,
                    enable_interim=spec.enable_interim,
                    interim_interval_ms=spec.interim_interval_ms,
                    min_audio_for_interim_ms=spec.min_audio_for_interim_ms,
                    on_transcript=spec.on_transcript,
                )

        logger.info("Using buffered ASR service (provider={})", vendor or "-")
        return BufferedASRService(sample_rate=spec.sample_rate, language=spec.language)
+"""FastGPT-backed LLM provider.""" + +from __future__ import annotations + +import asyncio +import json +import uuid +from typing import Any, AsyncIterator, Dict, List, Optional + +from loguru import logger + +from providers.common.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState +from providers.llm.fastgpt_types import ( + FastGPTConversationState, + FastGPTField, + FastGPTInteractivePrompt, + FastGPTOption, + FastGPTPendingInteraction, +) + +try: + from fastgpt_client import AsyncChatClient, aiter_stream_events +except Exception as exc: # pragma: no cover - exercised indirectly via connect() + AsyncChatClient = None # type: ignore[assignment] + aiter_stream_events = None # type: ignore[assignment] + _FASTGPT_IMPORT_ERROR: Optional[Exception] = exc +else: # pragma: no cover - import success depends on local environment + _FASTGPT_IMPORT_ERROR = None + + +class FastGPTLLMService(BaseLLMService): + """LLM provider that delegates orchestration to FastGPT.""" + + INTERACTIVE_TOOL_NAME = "fastgpt.interactive" + INTERACTIVE_TIMEOUT_MS = 300000 + + def __init__( + self, + *, + api_key: str, + base_url: str, + app_id: Optional[str] = None, + model: str = "fastgpt", + system_prompt: Optional[str] = None, + ): + super().__init__(model=model or "fastgpt") + self.api_key = api_key + self.base_url = str(base_url or "").rstrip("/") + self.app_id = str(app_id or "").strip() + self.system_prompt = system_prompt or "" + self.client: Any = None + self._cancel_event = asyncio.Event() + self._state = FastGPTConversationState() + self._knowledge_config: Dict[str, Any] = {} + self._tool_schemas: List[Dict[str, Any]] = [] + + async def connect(self) -> None: + if AsyncChatClient is None or aiter_stream_events is None: + raise RuntimeError( + "fastgpt_client package is not available. " + "Install the sibling fastgpt-python-sdk package first." 
+ ) from _FASTGPT_IMPORT_ERROR + if not self.api_key: + raise ValueError("FastGPT API key not provided") + if not self.base_url: + raise ValueError("FastGPT base URL not provided") + self.client = AsyncChatClient(api_key=self.api_key, base_url=self.base_url) + self.state = ServiceState.CONNECTED + logger.info("FastGPT LLM service connected: base_url={}", self.base_url) + + async def disconnect(self) -> None: + if self.client and hasattr(self.client, "close"): + await self.client.close() + self.client = None + self._state.pending_interaction = None + self.state = ServiceState.DISCONNECTED + logger.info("FastGPT LLM service disconnected") + + def cancel(self) -> None: + self._cancel_event.set() + self._state.pending_interaction = None + + def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None: + # FastGPT owns KB orchestration in this provider mode. + self._knowledge_config = dict(config or {}) + + def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None: + # FastGPT owns workflow and tool orchestration in this provider mode. 
+ self._tool_schemas = list(schemas or []) + + def handles_client_tool(self, tool_name: str) -> bool: + return str(tool_name or "").strip() == self.INTERACTIVE_TOOL_NAME + + async def get_initial_greeting(self) -> Optional[str]: + if not self.client or not self.app_id: + return None + + response = await self.client.get_chat_init( + appId=self.app_id, + chatId=self._ensure_chat_id(), + ) + raise_for_status = getattr(response, "raise_for_status", None) + if callable(raise_for_status): + raise_for_status() + elif int(getattr(response, "status_code", 200) or 200) >= 400: + raise RuntimeError(f"FastGPT chat init failed: HTTP {getattr(response, 'status_code', 'unknown')}") + + payload = response.json() if hasattr(response, "json") else {} + return self._extract_initial_greeting(payload) + + async def generate( + self, + messages: List[LLMMessage], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + ) -> str: + parts: List[str] = [] + async for event in self.generate_stream(messages, temperature=temperature, max_tokens=max_tokens): + if event.type == "text_delta" and event.text: + parts.append(event.text) + if event.type == "tool_call": + break + return "".join(parts) + + async def generate_stream( + self, + messages: List[LLMMessage], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + ) -> AsyncIterator[LLMStreamEvent]: + del temperature, max_tokens + if not self.client: + raise RuntimeError("LLM service not connected") + + self._cancel_event.clear() + request_messages = self._build_request_messages(messages) + response = await self.client.create_chat_completion( + messages=request_messages, + chatId=self._ensure_chat_id(), + detail=True, + stream=True, + ) + try: + async for event in aiter_stream_events(response): + if self._cancel_event.is_set(): + logger.info("FastGPT stream cancelled") + break + + stop_after_event = False + for mapped in self._map_stream_event(event): + if mapped.type == "tool_call": + stop_after_event = True + yield 
mapped + if stop_after_event: + break + finally: + await self._close_stream_response(response) + + async def resume_after_client_tool_result( + self, + tool_call_id: str, + result: Dict[str, Any], + ) -> AsyncIterator[LLMStreamEvent]: + if not self.client: + raise RuntimeError("LLM service not connected") + + pending = self._require_pending_interaction(tool_call_id) + follow_up_text = self._build_resume_text(pending, result) + self._state.pending_interaction = None + + if not follow_up_text: + yield LLMStreamEvent(type="done") + return + + self._cancel_event.clear() + response = await self.client.create_chat_completion( + messages=[{"role": "user", "content": follow_up_text}], + chatId=pending.chat_id, + detail=True, + stream=True, + ) + try: + async for event in aiter_stream_events(response): + if self._cancel_event.is_set(): + logger.info("FastGPT resume stream cancelled") + break + + stop_after_event = False + for mapped in self._map_stream_event(event): + if mapped.type == "tool_call": + stop_after_event = True + yield mapped + if stop_after_event: + break + finally: + await self._close_stream_response(response) + + async def _close_stream_response(self, response: Any) -> None: + if response is None: + return + + # httpx async streaming responses must use `aclose()`. 
+ aclose = getattr(response, "aclose", None) + if callable(aclose): + await aclose() + return + + close = getattr(response, "close", None) + if callable(close): + maybe_awaitable = close() + if hasattr(maybe_awaitable, "__await__"): + await maybe_awaitable + + def _ensure_chat_id(self) -> str: + chat_id = str(self._state.chat_id or "").strip() + if not chat_id: + chat_id = f"fastgpt_{uuid.uuid4().hex}" + self._state.chat_id = chat_id + return chat_id + + def _build_request_messages(self, messages: List[LLMMessage]) -> List[Dict[str, Any]]: + non_empty = [msg for msg in messages if str(msg.content or "").strip()] + if not non_empty: + return [{"role": "user", "content": ""}] + + latest_user = next((msg for msg in reversed(non_empty) if msg.role == "user"), None) + trailing_system = non_empty[-1] if non_empty and non_empty[-1].role == "system" else None + + request: List[Dict[str, Any]] = [] + if trailing_system and trailing_system is not latest_user: + request.append({"role": "system", "content": trailing_system.content.strip()}) + if latest_user and str(latest_user.content or "").strip(): + request.append({"role": "user", "content": latest_user.content.strip()}) + return request + + last_message = non_empty[-1] + payload = last_message.to_dict() + payload["content"] = str(payload.get("content") or "").strip() + return [payload] + + def _extract_initial_greeting(self, payload: Any) -> Optional[str]: + if not isinstance(payload, dict): + return None + + candidates: List[Any] = [ + payload.get("app"), + payload.get("data"), + ] + for container in candidates: + if not isinstance(container, dict): + continue + nested_app = container.get("app") if isinstance(container.get("app"), dict) else None + if nested_app: + text = self._welcome_text_from_app(nested_app) + if text: + return text + text = self._welcome_text_from_app(container) + if text: + return text + + return None + + @staticmethod + def _welcome_text_from_app(app_payload: Dict[str, Any]) -> Optional[str]: + 
chat_config = app_payload.get("chatConfig") if isinstance(app_payload.get("chatConfig"), dict) else {} + text = str( + chat_config.get("welcomeText") + or app_payload.get("welcomeText") + or "" + ).strip() + return text or None + + def _map_stream_event(self, event: Any) -> List[LLMStreamEvent]: + kind = str(getattr(event, "kind", "") or "") + data = getattr(event, "data", {}) + if not isinstance(data, dict): + data = {} + + if kind in {"data", "answer", "fastAnswer"}: + chunks = self._extract_text_chunks(kind, data) + return [LLMStreamEvent(type="text_delta", text=chunk) for chunk in chunks if chunk] + + if kind == "interactive": + return [self._build_interactive_tool_event(data)] + + if kind == "error": + message = str(data.get("message") or data.get("error") or "FastGPT streaming error") + raise RuntimeError(message) + + if kind == "done": + return [LLMStreamEvent(type="done")] + + return [] + + @staticmethod + def _normalize_interactive_payload(payload: Dict[str, Any]) -> Dict[str, Any]: + normalized = payload + wrapped = normalized.get("interactive") + if isinstance(wrapped, dict): + normalized = wrapped + + interaction_type = str(normalized.get("type") or "").strip() + if interaction_type == "toolChildrenInteractive": + params = normalized.get("params") if isinstance(normalized.get("params"), dict) else {} + children_response = params.get("childrenResponse") + if isinstance(children_response, dict): + normalized = children_response + + return normalized + + def _extract_text_chunks(self, kind: str, data: Dict[str, Any]) -> List[str]: + if kind in {"answer", "fastAnswer"}: + text = str(data.get("text") or "") + if text: + return [text] + + choices = data.get("choices") if isinstance(data.get("choices"), list) else [] + if not choices: + text = str(data.get("text") or "") + return [text] if text else [] + + first = choices[0] if isinstance(choices[0], dict) else {} + delta = first.get("delta") if isinstance(first.get("delta"), dict) else {} + if 
isinstance(delta.get("content"), str) and delta.get("content"): + return [str(delta.get("content"))] + message = first.get("message") if isinstance(first.get("message"), dict) else {} + if isinstance(message.get("content"), str) and message.get("content"): + return [str(message.get("content"))] + return [] + + def _build_interactive_tool_event(self, payload: Dict[str, Any]) -> LLMStreamEvent: + normalized_payload = self._normalize_interactive_payload(payload) + prompt = self._parse_interactive_prompt(normalized_payload) + call_id = f"fgi_{uuid.uuid4().hex[:12]}" + pending = FastGPTPendingInteraction( + tool_call_id=call_id, + chat_id=self._ensure_chat_id(), + prompt=prompt, + timeout_ms=self.INTERACTIVE_TIMEOUT_MS, + fastgpt_event=dict(normalized_payload), + ) + self._state.pending_interaction = pending + arguments = prompt.to_ws_arguments(chat_id=pending.chat_id) + tool_call = { + "id": call_id, + "type": "function", + "executor": "client", + "wait_for_response": True, + "timeout_ms": pending.timeout_ms, + "display_name": prompt.title or prompt.description or prompt.prompt or "FastGPT Interactive", + "function": { + "name": self.INTERACTIVE_TOOL_NAME, + "arguments": json.dumps(arguments, ensure_ascii=False), + }, + } + return LLMStreamEvent(type="tool_call", tool_call=tool_call) + + def _parse_interactive_prompt(self, payload: Dict[str, Any]) -> FastGPTInteractivePrompt: + params = payload.get("params") if isinstance(payload.get("params"), dict) else {} + kind = str(payload.get("type") or "userSelect").strip() or "userSelect" + title = str( + payload.get("title") + or params.get("title") + or payload.get("nodeName") + or payload.get("label") + or "" + ).strip() + description = str( + payload.get("description") + or payload.get("desc") + or params.get("description") + or params.get("desc") + or "" + ).strip() + prompt_text = str( + payload.get("opener") + or params.get("opener") + or payload.get("intro") + or params.get("intro") + or payload.get("prompt") + or 
params.get("prompt") + or payload.get("text") + or params.get("text") + or title + or description + ).strip() + required = self._coerce_bool(payload.get("required"), default=True) + multiple = self._coerce_bool(params.get("multiple") or payload.get("multiple"), default=False) + submit_label = str(params.get("submitText") or payload.get("submitText") or "Continue").strip() or "Continue" + cancel_label = str(params.get("cancelText") or payload.get("cancelText") or "Cancel").strip() or "Cancel" + + options: List[FastGPTOption] = [] + raw_options = params.get("userSelectOptions") if isinstance(params.get("userSelectOptions"), list) else [] + for index, raw_option in enumerate(raw_options): + if isinstance(raw_option, str): + value = raw_option.strip() + if not value: + continue + options.append(FastGPTOption(id=f"option_{index}", label=value, value=value)) + continue + if not isinstance(raw_option, dict): + continue + label = str(raw_option.get("label") or raw_option.get("value") or raw_option.get("id") or "").strip() + value = str(raw_option.get("value") or raw_option.get("label") or raw_option.get("id") or "").strip() + option_id = str(raw_option.get("id") or value or f"option_{index}").strip() + if not label and not value: + continue + options.append( + FastGPTOption( + id=option_id or f"option_{index}", + label=label or value, + value=value or label, + description=str( + raw_option.get("description") + or raw_option.get("desc") + or raw_option.get("intro") + or raw_option.get("summary") + or "" + ).strip(), + ) + ) + + form: List[FastGPTField] = [] + raw_form = params.get("inputForm") if isinstance(params.get("inputForm"), list) else [] + for index, raw_field in enumerate(raw_form): + if not isinstance(raw_field, dict): + continue + field_options: List[FastGPTOption] = [] + nested_options = raw_field.get("options") if isinstance(raw_field.get("options"), list) else [] + for opt_index, option in enumerate(nested_options): + if isinstance(option, str): + value = 
option.strip() + if not value: + continue + field_options.append(FastGPTOption(id=f"field_{index}_opt_{opt_index}", label=value, value=value)) + continue + if not isinstance(option, dict): + continue + label = str(option.get("label") or option.get("value") or option.get("id") or "").strip() + value = str(option.get("value") or option.get("label") or option.get("id") or "").strip() + option_id = str(option.get("id") or value or f"field_{index}_opt_{opt_index}").strip() + if not label and not value: + continue + field_options.append( + FastGPTOption( + id=option_id or f"field_{index}_opt_{opt_index}", + label=label or value, + value=value or label, + description=str( + option.get("description") + or option.get("desc") + or option.get("intro") + or option.get("summary") + or "" + ).strip(), + ) + ) + name = str(raw_field.get("key") or raw_field.get("name") or raw_field.get("label") or f"field_{index}").strip() + label = str(raw_field.get("label") or raw_field.get("name") or name).strip() + form.append( + FastGPTField( + name=name or f"field_{index}", + label=label or name or f"field_{index}", + input_type=str(raw_field.get("type") or raw_field.get("inputType") or "text").strip() or "text", + required=self._coerce_bool(raw_field.get("required"), default=False), + placeholder=str( + raw_field.get("placeholder") + or raw_field.get("description") + or raw_field.get("desc") + or "" + ).strip(), + default=raw_field.get("defaultValue", raw_field.get("default")), + options=field_options, + ) + ) + + return FastGPTInteractivePrompt( + kind="userInput" if kind == "userInput" else "userSelect", + title=title, + description=description, + prompt=prompt_text, + required=required, + multiple=multiple, + submit_label=submit_label, + cancel_label=cancel_label, + options=options, + form=form, + raw=dict(payload), + ) + + def _require_pending_interaction(self, tool_call_id: str) -> FastGPTPendingInteraction: + pending = self._state.pending_interaction + if pending is None or 
pending.tool_call_id != tool_call_id: + raise ValueError(f"FastGPT interaction not pending for tool call: {tool_call_id}") + return pending + + def _build_resume_text(self, pending: FastGPTPendingInteraction, result: Dict[str, Any]) -> str: + status = result.get("status") if isinstance(result.get("status"), dict) else {} + status_code = self._safe_int(status.get("code"), default=0) + output = result.get("output") if isinstance(result.get("output"), dict) else {} + action = str(output.get("action") or "").strip().lower() + + if action == "cancel" or status_code == 499: + return "" + if status_code == 422: + raise ValueError("Invalid FastGPT interactive payload from client") + if status_code and not 200 <= status_code < 300: + raise ValueError(f"FastGPT interactive result rejected with status {status_code}") + if action and action != "submit": + raise ValueError(f"Unsupported FastGPT interactive action: {action}") + + payload = output.get("result") if isinstance(output.get("result"), dict) else output + if not isinstance(payload, dict): + raise ValueError("FastGPT interactive client result must be an object") + + if pending.prompt.kind == "userSelect": + selected = str(payload.get("selected") or "").strip() + if selected: + return selected + selected_values = payload.get("selected_values") if isinstance(payload.get("selected_values"), list) else [] + values = [str(item).strip() for item in selected_values if str(item).strip()] + if values: + return ", ".join(values) + text_value = str(payload.get("text") or "").strip() + return text_value + + text_value = str(payload.get("text") or "").strip() + if text_value: + return text_value + fields = payload.get("fields") if isinstance(payload.get("fields"), dict) else {} + compact_fields = {str(key): value for key, value in fields.items()} + if compact_fields: + return json.dumps(compact_fields, ensure_ascii=False) + return "" + + @staticmethod + def _coerce_bool(value: Any, *, default: bool) -> bool: + if isinstance(value, 
bool): + return value + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"true", "1", "yes", "on"}: + return True + if normalized in {"false", "0", "no", "off"}: + return False + return default + + @staticmethod + def _safe_int(value: Any, *, default: int) -> int: + try: + return int(value) + except (TypeError, ValueError): + return default diff --git a/engine/providers/llm/fastgpt_types.py b/engine/providers/llm/fastgpt_types.py new file mode 100644 index 0000000..71766e3 --- /dev/null +++ b/engine/providers/llm/fastgpt_types.py @@ -0,0 +1,95 @@ +"""FastGPT-specific provider types.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Literal, Optional + +InteractiveKind = Literal["userSelect", "userInput"] + + +@dataclass(frozen=True) +class FastGPTOption: + id: str + label: str + value: str + description: str = "" + + +@dataclass(frozen=True) +class FastGPTField: + name: str + label: str + input_type: str = "text" + required: bool = False + placeholder: str = "" + default: Any = None + options: List[FastGPTOption] = field(default_factory=list) + + +@dataclass(frozen=True) +class FastGPTInteractivePrompt: + kind: InteractiveKind + title: str = "" + description: str = "" + prompt: str = "" + required: bool = True + multiple: bool = False + submit_label: str = "Continue" + cancel_label: str = "Cancel" + options: List[FastGPTOption] = field(default_factory=list) + form: List[FastGPTField] = field(default_factory=list) + raw: Dict[str, Any] = field(default_factory=dict) + + def to_ws_arguments( + self, + *, + turn_id: Optional[str] = None, + response_id: Optional[str] = None, + chat_id: Optional[str] = None, + ) -> Dict[str, Any]: + context: Dict[str, Any] = {} + if turn_id: + context["turn_id"] = turn_id + if response_id: + context["response_id"] = response_id + if chat_id: + context["chat_id"] = chat_id + return { + "provider": "fastgpt", + "version": 
"fastgpt_interactive_v1", + "interaction": { + "type": self.kind, + "title": self.title, + "description": self.description, + "prompt": self.prompt, + "required": self.required, + "multiple": self.multiple, + "submit_label": self.submit_label, + "cancel_label": self.cancel_label, + "options": [vars(item) for item in self.options], + "form": [ + { + **vars(item), + "options": [vars(option) for option in item.options], + } + for item in self.form + ], + }, + "context": context, + } + + +@dataclass +class FastGPTPendingInteraction: + tool_call_id: str + chat_id: str + prompt: FastGPTInteractivePrompt + timeout_ms: int + fastgpt_event: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class FastGPTConversationState: + chat_id: Optional[str] = None + pending_interaction: Optional[FastGPTPendingInteraction] = None diff --git a/engine/services/llm.py b/engine/providers/llm/openai.py similarity index 98% rename from engine/services/llm.py rename to engine/providers/llm/openai.py index eb7f89c..02735fe 100644 --- a/engine/services/llm.py +++ b/engine/providers/llm/openai.py @@ -10,8 +10,8 @@ import uuid from typing import AsyncIterator, Optional, List, Dict, Any, Callable, Awaitable from loguru import logger -from app.backend_adapters import build_backend_adapter_from_settings -from services.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState +from adapters.control_plane.backend import build_backend_adapter_from_settings +from providers.common.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState # Try to import openai try: @@ -44,13 +44,13 @@ class OpenAILLMService(BaseLLMService): Args: model: Model name (e.g., "gpt-4o-mini", "gpt-4o") - api_key: Provider API key (defaults to LLM_API_KEY/OPENAI_API_KEY env vars) + api_key: Provider API key base_url: Custom API base URL (for Azure or compatible APIs) system_prompt: Default system prompt for conversations """ super().__init__(model=model) - self.api_key = api_key or 
os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + self.api_key = api_key self.base_url = base_url or os.getenv("LLM_API_URL") or os.getenv("OPENAI_API_URL") self.system_prompt = system_prompt or ( "You are a helpful, friendly voice assistant. " diff --git a/engine/providers/realtime/__init__.py b/engine/providers/realtime/__init__.py new file mode 100644 index 0000000..0d4cb46 --- /dev/null +++ b/engine/providers/realtime/__init__.py @@ -0,0 +1 @@ +"""Realtime providers.""" diff --git a/engine/services/realtime.py b/engine/providers/realtime/service.py similarity index 99% rename from engine/services/realtime.py rename to engine/providers/realtime/service.py index 3fd95c1..142f018 100644 --- a/engine/services/realtime.py +++ b/engine/providers/realtime/service.py @@ -13,7 +13,6 @@ The Realtime API provides: - Barge-in/interruption handling """ -import os import asyncio import json import base64 @@ -98,7 +97,6 @@ class RealtimeService: config: Realtime configuration (uses defaults if not provided) """ self.config = config or RealtimeConfig() - self.config.api_key = self.config.api_key or os.getenv("OPENAI_API_KEY") self.state = RealtimeState.DISCONNECTED self._ws = None diff --git a/engine/providers/tts/__init__.py b/engine/providers/tts/__init__.py new file mode 100644 index 0000000..b2b237a --- /dev/null +++ b/engine/providers/tts/__init__.py @@ -0,0 +1,5 @@ +"""TTS providers.""" + +from providers.tts.volcengine import VolcengineTTSService + +__all__ = ["VolcengineTTSService"] diff --git a/engine/services/dashscope_tts.py b/engine/providers/tts/dashscope.py similarity index 98% rename from engine/services/dashscope_tts.py rename to engine/providers/tts/dashscope.py index 6d89221..c0b3fdb 100644 --- a/engine/services/dashscope_tts.py +++ b/engine/providers/tts/dashscope.py @@ -12,7 +12,7 @@ from typing import Any, AsyncIterator, Dict, Optional, Tuple from loguru import logger -from services.base import BaseTTSService, ServiceState, TTSChunk +from 
providers.common.base import BaseTTSService, ServiceState, TTSChunk try: import dashscope @@ -89,7 +89,7 @@ class DashScopeTTSService(BaseTTSService): speed: float = 1.0, ): super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) - self.api_key = api_key or os.getenv("DASHSCOPE_API_KEY") or os.getenv("TTS_API_KEY") + self.api_key = api_key self.api_url = ( api_url or os.getenv("DASHSCOPE_TTS_API_URL") diff --git a/engine/providers/tts/mock.py b/engine/providers/tts/mock.py new file mode 100644 index 0000000..1d1e143 --- /dev/null +++ b/engine/providers/tts/mock.py @@ -0,0 +1,49 @@ +"""TTS service implementations used by the engine runtime.""" + +import asyncio +from typing import AsyncIterator + +from loguru import logger + +from providers.common.base import BaseTTSService, TTSChunk, ServiceState + + +class MockTTSService(BaseTTSService): + """Mock TTS service for tests and no-provider fallback.""" + + def __init__( + self, + voice: str = "mock", + sample_rate: int = 16000, + speed: float = 1.0, + ): + super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) + + async def connect(self) -> None: + self.state = ServiceState.CONNECTED + logger.info("Mock TTS service connected") + + async def disconnect(self) -> None: + self.state = ServiceState.DISCONNECTED + logger.info("Mock TTS service disconnected") + + async def synthesize(self, text: str) -> bytes: + """Generate silence based on text length.""" + word_count = len(text.split()) + duration_ms = word_count * 100 + samples = int(self.sample_rate * duration_ms / 1000) + return bytes(samples * 2) + + async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: + """Generate silence chunks to emulate streaming synthesis.""" + audio = await self.synthesize(text) + + chunk_size = self.sample_rate * 2 // 10 + for i in range(0, len(audio), chunk_size): + chunk_data = audio[i : i + chunk_size] + yield TTSChunk( + audio=chunk_data, + sample_rate=self.sample_rate, + is_final=(i + chunk_size 
>= len(audio)), + ) + await asyncio.sleep(0.05) diff --git a/engine/services/openai_compatible_tts.py b/engine/providers/tts/openai_compatible.py similarity index 97% rename from engine/services/openai_compatible_tts.py rename to engine/providers/tts/openai_compatible.py index b2dc30d..767ad12 100644 --- a/engine/services/openai_compatible_tts.py +++ b/engine/providers/tts/openai_compatible.py @@ -13,8 +13,8 @@ from typing import AsyncIterator, Optional from urllib.parse import urlparse, urlunparse from loguru import logger -from services.base import BaseTTSService, TTSChunk, ServiceState -from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export +from providers.common.base import BaseTTSService, TTSChunk, ServiceState +from providers.tts.streaming_adapter import StreamingTTSAdapter # backward-compatible re-export class OpenAICompatibleTTSService(BaseTTSService): @@ -49,7 +49,7 @@ class OpenAICompatibleTTSService(BaseTTSService): Initialize OpenAI-compatible TTS service. 
Args: - api_key: Provider API key (defaults to TTS_API_KEY/SILICONFLOW_API_KEY env vars) + api_key: Provider API key api_url: Provider API URL (defaults to SiliconFlow endpoint) voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana) model: Model name @@ -73,7 +73,7 @@ class OpenAICompatibleTTSService(BaseTTSService): super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed) - self.api_key = api_key or os.getenv("TTS_API_KEY") or os.getenv("SILICONFLOW_API_KEY") + self.api_key = api_key self.model = model raw_api_url = api_url or os.getenv("TTS_API_URL") or "https://api.siliconflow.cn/v1/audio/speech" self.api_url = self._resolve_speech_endpoint(raw_api_url) diff --git a/engine/services/siliconflow_tts.py b/engine/providers/tts/siliconflow.py similarity index 72% rename from engine/services/siliconflow_tts.py rename to engine/providers/tts/siliconflow.py index 3cdf32a..3b894d9 100644 --- a/engine/services/siliconflow_tts.py +++ b/engine/providers/tts/siliconflow.py @@ -1,6 +1,6 @@ """Backward-compatible imports for legacy siliconflow_tts module.""" -from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter +from providers.tts.openai_compatible import OpenAICompatibleTTSService, StreamingTTSAdapter # Backward-compatible alias SiliconFlowTTSService = OpenAICompatibleTTSService diff --git a/engine/services/streaming_tts_adapter.py b/engine/providers/tts/streaming_adapter.py similarity index 95% rename from engine/services/streaming_tts_adapter.py rename to engine/providers/tts/streaming_adapter.py index d4cb745..853e7ab 100644 --- a/engine/services/streaming_tts_adapter.py +++ b/engine/providers/tts/streaming_adapter.py @@ -4,8 +4,8 @@ import asyncio from loguru import logger -from services.base import BaseTTSService -from services.streaming_text import extract_tts_sentence, has_spoken_content +from providers.common.base import BaseTTSService +from providers.common.streaming_text import 
extract_tts_sentence, has_spoken_content class StreamingTTSAdapter: diff --git a/engine/providers/tts/volcengine.py b/engine/providers/tts/volcengine.py new file mode 100644 index 0000000..d7502a1 --- /dev/null +++ b/engine/providers/tts/volcengine.py @@ -0,0 +1,219 @@ +"""Volcengine TTS service. + +Uses Volcengine's unidirectional HTTP streaming TTS API and adapts streamed +base64 audio chunks into engine-native ``TTSChunk`` events. +""" + +from __future__ import annotations + +import asyncio +import base64 +import codecs +import json +import os +import uuid +from typing import Any, AsyncIterator, Optional + +import aiohttp +from loguru import logger + +from providers.common.base import BaseTTSService, ServiceState, TTSChunk + + +class VolcengineTTSService(BaseTTSService): + """Streaming TTS adapter for Volcengine's HTTP v3 API.""" + + DEFAULT_API_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional" + DEFAULT_RESOURCE_ID = "seed-tts-2.0" + + def __init__( + self, + api_key: Optional[str] = None, + api_url: Optional[str] = None, + voice: str = "zh_female_shuangkuaisisi_moon_bigtts", + model: Optional[str] = None, + app_id: Optional[str] = None, + resource_id: Optional[str] = None, + uid: Optional[str] = None, + sample_rate: int = 16000, + speed: float = 1.0, + ) -> None: + super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) + self.api_key = api_key or os.getenv("VOLCENGINE_TTS_API_KEY") or os.getenv("TTS_API_KEY") + self.api_url = api_url or os.getenv("VOLCENGINE_TTS_API_URL") or self.DEFAULT_API_URL + self.model = str(model or os.getenv("VOLCENGINE_TTS_MODEL") or "").strip() or None + self.app_id = app_id or os.getenv("VOLCENGINE_TTS_APP_ID") or os.getenv("TTS_APP_ID") + self.resource_id = resource_id or os.getenv("VOLCENGINE_TTS_RESOURCE_ID") or self.DEFAULT_RESOURCE_ID + self.uid = uid or os.getenv("VOLCENGINE_TTS_UID") + + self._session: Optional[aiohttp.ClientSession] = None + self._cancel_event = asyncio.Event() + 
self._synthesis_lock = asyncio.Lock() + self._pending_audio: list[bytes] = [] + + async def connect(self) -> None: + if not self.api_key: + raise ValueError("Volcengine TTS API key not provided. Configure agent.tts.api_key in YAML.") + if not self.app_id: + raise ValueError("Volcengine TTS app_id not provided. Configure agent.tts.app_id in YAML.") + + timeout = aiohttp.ClientTimeout(total=None, sock_read=None, sock_connect=15) + self._session = aiohttp.ClientSession(timeout=timeout) + self.state = ServiceState.CONNECTED + logger.info( + "Volcengine TTS service ready: speaker={}, sample_rate={}, resource_id={}", + self.voice, + self.sample_rate, + self.resource_id, + ) + + async def disconnect(self) -> None: + self._cancel_event.set() + if self._session is not None: + await self._session.close() + self._session = None + self.state = ServiceState.DISCONNECTED + logger.info("Volcengine TTS service disconnected") + + async def synthesize(self, text: str) -> bytes: + audio = b"" + async for chunk in self.synthesize_stream(text): + audio += chunk.audio + return audio + + async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: + if not self._session: + raise RuntimeError("Volcengine TTS service not connected") + if not text.strip(): + return + + async with self._synthesis_lock: + self._cancel_event.clear() + + headers = { + "Content-Type": "application/json", + "X-Api-App-Key": str(self.app_id), + "X-Api-Access-Key": str(self.api_key), + "X-Api-Resource-Id": str(self.resource_id), + "X-Api-Request-Id": str(uuid.uuid4()), + } + payload = { + "user": { + "uid": str(self.uid or self.app_id), + }, + "req_params": { + "text": text, + "speaker": self.voice, + "audio_params": { + "format": "pcm", + "sample_rate": self.sample_rate, + "speech_rate": self._speech_rate_percent(self.speed), + }, + }, + } + if self.model: + payload["req_params"]["model"] = self.model + + chunk_size = max(1, self.sample_rate * 2 // 10) + audio_buffer = b"" + pending_chunk: 
Optional[bytes] = None + + try: + async with self._session.post(self.api_url, headers=headers, json=payload) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError(f"Volcengine TTS error {response.status}: {error_text}") + + async for audio_bytes in self._iter_audio_bytes(response): + if self._cancel_event.is_set(): + logger.info("Volcengine TTS synthesis cancelled") + return + + audio_buffer += audio_bytes + while len(audio_buffer) >= chunk_size: + emitted = audio_buffer[:chunk_size] + audio_buffer = audio_buffer[chunk_size:] + if pending_chunk is not None: + yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=False) + pending_chunk = emitted + + if self._cancel_event.is_set(): + return + + if pending_chunk is not None: + if audio_buffer: + yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=False) + pending_chunk = None + else: + yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=True) + pending_chunk = None + + if audio_buffer: + yield TTSChunk(audio=audio_buffer, sample_rate=self.sample_rate, is_final=True) + + except asyncio.CancelledError: + logger.info("Volcengine TTS synthesis cancelled via asyncio") + raise + except Exception as exc: + logger.error("Volcengine TTS synthesis error: {}", exc) + raise + + async def cancel(self) -> None: + self._cancel_event.set() + + async def _iter_audio_bytes(self, response: aiohttp.ClientResponse) -> AsyncIterator[bytes]: + decoder = json.JSONDecoder() + utf8_decoder = codecs.getincrementaldecoder("utf-8")() + text_buffer = "" + self._pending_audio.clear() + + async for raw_chunk in response.content.iter_any(): + text_buffer += utf8_decoder.decode(raw_chunk) + text_buffer = self._yield_audio_payloads(decoder, text_buffer) + while self._pending_audio: + yield self._pending_audio.pop(0) + + text_buffer += utf8_decoder.decode(b"", final=True) + text_buffer = self._yield_audio_payloads(decoder, text_buffer) + 
while self._pending_audio: + yield self._pending_audio.pop(0) + + def _yield_audio_payloads(self, decoder: json.JSONDecoder, text_buffer: str) -> str: + while True: + stripped = text_buffer.lstrip() + if not stripped: + return "" + if len(stripped) != len(text_buffer): + text_buffer = stripped + + try: + payload, idx = decoder.raw_decode(text_buffer) + except json.JSONDecodeError: + return text_buffer + + text_buffer = text_buffer[idx:] + audio = self._extract_audio_bytes(payload) + if audio: + self._pending_audio.append(audio) + + def _extract_audio_bytes(self, payload: Any) -> bytes: + if not isinstance(payload, dict): + return b"" + + code = payload.get("code") + if code not in (None, 0, 20000000): + message = str(payload.get("message") or "unknown error") + raise RuntimeError(f"Volcengine TTS stream error {code}: {message}") + + encoded = payload.get("data") + if isinstance(encoded, str) and encoded.strip(): + try: + return base64.b64decode(encoded) + except Exception as exc: + logger.warning("Failed to decode Volcengine TTS audio chunk: {}", exc) + return b"" + + @staticmethod + def _speech_rate_percent(speed: float) -> int: + clamped = max(0.5, min(2.0, float(speed or 1.0))) + return int(round((clamped - 1.0) * 100)) diff --git a/engine/pyproject.toml b/engine/pyproject.toml index 8786905..c0031f0 100644 --- a/engine/pyproject.toml +++ b/engine/pyproject.toml @@ -31,7 +31,17 @@ Issues = "https://github.com/yourusername/py-active-call-cc/issues" [tool.setuptools.packages.find] where = ["."] -include = ["app*"] +include = [ + "app*", + "adapters*", + "protocol*", + "providers*", + "processors*", + "runtime*", + "tools*", + "utils*", + "workflow*", +] exclude = ["tests*", "scripts*", "reference*"] [tool.black] diff --git a/engine/requirements.txt b/engine/requirements.txt index a32b7d2..0d8b90d 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -29,11 +29,10 @@ aiohttp>=3.9.1 openai>=1.0.0 dashscope>=1.25.11 -# AI Services - TTS 
-edge-tts>=6.1.0 -pydub>=0.25.0 # For audio format conversion - # Microphone client dependencies sounddevice>=0.4.6 soundfile>=0.12.1 pyaudio>=0.2.13 # More reliable audio on Windows + +# FastGPT runtime support is installed from the sibling fastgpt-python-sdk package. +# Local dev: pip install -e ..\\fastgpt-python-sdk diff --git a/engine/runtime/__init__.py b/engine/runtime/__init__.py new file mode 100644 index 0000000..1364082 --- /dev/null +++ b/engine/runtime/__init__.py @@ -0,0 +1 @@ +"""Runtime package.""" diff --git a/engine/core/conversation.py b/engine/runtime/conversation.py similarity index 99% rename from engine/core/conversation.py rename to engine/runtime/conversation.py index 08b23c6..fe21c01 100644 --- a/engine/core/conversation.py +++ b/engine/runtime/conversation.py @@ -10,7 +10,7 @@ from dataclasses import dataclass, field from enum import Enum from loguru import logger -from services.base import LLMMessage +from providers.common.base import LLMMessage class ConversationState(Enum): diff --git a/engine/core/events.py b/engine/runtime/events.py similarity index 100% rename from engine/core/events.py rename to engine/runtime/events.py diff --git a/engine/runtime/history/__init__.py b/engine/runtime/history/__init__.py new file mode 100644 index 0000000..44329ff --- /dev/null +++ b/engine/runtime/history/__init__.py @@ -0,0 +1 @@ +"""Runtime history package.""" diff --git a/engine/core/history_bridge.py b/engine/runtime/history/bridge.py similarity index 98% rename from engine/core/history_bridge.py rename to engine/runtime/history/bridge.py index ead9a3b..bacd682 100644 --- a/engine/core/history_bridge.py +++ b/engine/runtime/history/bridge.py @@ -5,10 +5,12 @@ from __future__ import annotations import asyncio import time from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional from loguru import logger +from runtime.ports import ConversationHistoryStore + @dataclass class _HistoryTranscriptJob: @@ -29,7 
+31,7 @@ class SessionHistoryBridge: def __init__( self, *, - history_writer: Any, + history_writer: ConversationHistoryStore | None, enabled: bool, queue_max_size: int, retry_max_attempts: int, diff --git a/engine/runtime/pipeline/__init__.py b/engine/runtime/pipeline/__init__.py new file mode 100644 index 0000000..8861a22 --- /dev/null +++ b/engine/runtime/pipeline/__init__.py @@ -0,0 +1 @@ +"""Runtime pipeline package.""" diff --git a/engine/runtime/pipeline/asr_flow.py b/engine/runtime/pipeline/asr_flow.py new file mode 100644 index 0000000..1b539f5 --- /dev/null +++ b/engine/runtime/pipeline/asr_flow.py @@ -0,0 +1,13 @@ +"""ASR flow helpers extracted from the duplex pipeline. + +This module is intentionally lightweight for phase-wise migration. +""" + +from __future__ import annotations + +from providers.common.base import ASRResult + + +def is_final_result(result: ASRResult) -> bool: + """Return whether an ASR result is final.""" + return bool(result.is_final) diff --git a/engine/runtime/pipeline/constants.py b/engine/runtime/pipeline/constants.py new file mode 100644 index 0000000..0109925 --- /dev/null +++ b/engine/runtime/pipeline/constants.py @@ -0,0 +1,6 @@ +"""Shared constants for the runtime duplex pipeline.""" + +TRACK_AUDIO_IN = "audio_in" +TRACK_AUDIO_OUT = "audio_out" +TRACK_CONTROL = "control" +PCM_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms diff --git a/engine/core/duplex_pipeline.py b/engine/runtime/pipeline/duplex.py similarity index 87% rename from engine/core/duplex_pipeline.py rename to engine/runtime/pipeline/duplex.py index 13f1852..17c7dd8 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/runtime/pipeline/duplex.py @@ -26,21 +26,28 @@ import aiohttp from loguru import logger from app.config import settings -from core.conversation import ConversationManager, ConversationState -from core.events import get_event_bus -from core.tool_executor import execute_server_tool -from core.transports import BaseTransport -from models.ws_v1 
import ev +from providers.factory.default import DefaultRealtimeServiceFactory +from runtime.conversation import ConversationManager, ConversationState +from runtime.events import get_event_bus +from runtime.ports import ( + ASRMode, + ASRPort, + ASRServiceSpec, + LLMPort, + LLMServiceSpec, + OfflineASRPort, + RealtimeServiceFactory, + StreamingASRPort, + TTSPort, + TTSServiceSpec, +) +from tools.executor import execute_server_tool +from runtime.transports import BaseTransport +from protocol.ws_v1.schema import ev from processors.eou import EouDetector from processors.vad import SileroVAD, VADProcessor -from services.asr import BufferedASRService -from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent -from services.dashscope_tts import DashScopeTTSService -from services.llm import MockLLMService, OpenAILLMService -from services.openai_compatible_asr import OpenAICompatibleASRService -from services.openai_compatible_tts import OpenAICompatibleTTSService -from services.streaming_text import extract_tts_sentence, has_spoken_content -from services.tts import EdgeTTSService, MockTTSService +from providers.common.base import LLMMessage, LLMStreamEvent +from providers.common.streaming_text import extract_tts_sentence, has_spoken_content class DuplexPipeline: @@ -66,6 +73,8 @@ class DuplexPipeline: _MIN_SPLIT_SPOKEN_CHARS = 6 _TOOL_WAIT_TIMEOUT_SECONDS = 60.0 _SERVER_TOOL_TIMEOUT_SECONDS = 15.0 + _MAX_LLM_ROUNDS = 3 + _MAX_PROVIDER_MANAGED_ROUNDS = 24 TRACK_AUDIO_IN = "audio_in" TRACK_AUDIO_OUT = "audio_out" TRACK_CONTROL = "control" @@ -73,6 +82,7 @@ class DuplexPipeline: _ASR_DELTA_THROTTLE_MS = 500 _LLM_DELTA_THROTTLE_MS = 80 _ASR_CAPTURE_MAX_MS = 15000 + _ASR_STREAM_FINAL_TIMEOUT_MS = 800 _OPENER_PRE_ROLL_MS = 180 _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = { "current_time": { @@ -258,9 +268,9 @@ class DuplexPipeline: self, transport: BaseTransport, session_id: str, - llm_service: Optional[BaseLLMService] = None, - 
tts_service: Optional[BaseTTSService] = None, - asr_service: Optional[BaseASRService] = None, + llm_service: Optional[LLMPort] = None, + tts_service: Optional[TTSPort] = None, + asr_service: Optional[ASRPort] = None, system_prompt: Optional[str] = None, greeting: Optional[str] = None, knowledge_searcher: Optional[ @@ -272,6 +282,7 @@ class DuplexPipeline: server_tool_executor: Optional[ Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]] ] = None, + service_factory: Optional[RealtimeServiceFactory] = None, ): """ Initialize duplex pipeline. @@ -279,8 +290,8 @@ class DuplexPipeline: Args: transport: Transport for sending audio/events session_id: Session identifier - llm_service: LLM service (defaults to OpenAI) - tts_service: TTS service (defaults to EdgeTTS) + llm_service: Optional injected LLM port implementation + tts_service: Optional injected TTS port implementation asr_service: ASR service (optional) system_prompt: System prompt for LLM greeting: Optional greeting to speak on start @@ -312,12 +323,18 @@ class DuplexPipeline: self.llm_service = llm_service self.tts_service = tts_service self.asr_service = asr_service # Will be initialized in start() + self._asr_mode: ASRMode = self._resolve_asr_mode( + settings.asr_provider, + getattr(asr_service, "mode", None), + ) + self._service_factory = service_factory or DefaultRealtimeServiceFactory() self._knowledge_searcher = knowledge_searcher self._tool_resource_resolver = tool_resource_resolver self._server_tool_executor = server_tool_executor # Track last sent transcript to avoid duplicates self._last_sent_transcript = "" + self._latest_asr_interim_text = "" self._pending_transcript_delta: str = "" self._last_transcript_delta_emit_ms: float = 0.0 @@ -393,6 +410,7 @@ class DuplexPipeline: self._runtime_tool_display_names: Dict[str, str] = {} self._runtime_tool_wait_for_response: Dict[str, bool] = {} self._pending_tool_waiters: Dict[str, asyncio.Future] = {} + self._pending_tool_deadlines: Dict[str, float] = {} 
self._early_tool_results: Dict[str, Dict[str, Any]] = {} self._completed_tool_call_ids: set[str] = set() self._pending_client_tool_call_ids: set[str] = set() @@ -579,10 +597,13 @@ class DuplexPipeline: "provider": llm_provider, "model": str(self._runtime_llm.get("model") or settings.llm_model), "baseUrl": llm_base_url, + "appId": str(self._runtime_llm.get("appId") or ""), }, "asr": { "provider": asr_provider, + "mode": self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")), "model": str(self._runtime_asr.get("model") or settings.asr_model or ""), + "enableInterim": self._asr_interim_enabled(), "interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms), "minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms), }, @@ -777,9 +798,21 @@ class DuplexPipeline: return None @staticmethod - def _is_openai_compatible_provider(provider: Any) -> bool: - normalized = str(provider or "").strip().lower() - return normalized in {"openai_compatible", "openai-compatible", "siliconflow"} + def _coerce_json_object(value: Any) -> Optional[Dict[str, Any]]: + if isinstance(value, dict): + return dict(value) + if isinstance(value, str): + raw = value.strip() + if not raw: + return None + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + logger.warning("Ignoring invalid JSON object config: {}", raw[:120]) + return None + if isinstance(parsed, dict): + return parsed + return None @staticmethod def _is_dashscope_tts_provider(provider: Any) -> bool: @@ -787,9 +820,20 @@ class DuplexPipeline: return normalized == "dashscope" @staticmethod - def _is_llm_provider_supported(provider: Any) -> bool: - normalized = str(provider or "").strip().lower() - return normalized in {"openai", "openai_compatible", "openai-compatible", "siliconflow"} + def _resolve_asr_mode(provider: Any, raw_mode: Any = None) -> ASRMode: + normalized_mode = str(raw_mode or "").strip().lower() + if normalized_mode in 
{"offline", "streaming"}: + return normalized_mode # type: ignore[return-value] + normalized_provider = str(provider or "").strip().lower() + if normalized_provider in {"dashscope", "volcengine"}: + return "streaming" + return "offline" + + def _offline_asr(self) -> OfflineASRPort: + return self.asr_service # type: ignore[return-value] + + def _streaming_asr(self) -> StreamingASRPort: + return self.asr_service # type: ignore[return-value] @staticmethod def _default_llm_base_url(provider: Any) -> Optional[str]: @@ -798,10 +842,6 @@ class DuplexPipeline: return "https://api.siliconflow.cn/v1" return None - @staticmethod - def _default_dashscope_tts_realtime_url() -> str: - return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" - @staticmethod def _default_dashscope_tts_model() -> str: return "qwen3-tts-flash-realtime" @@ -847,6 +887,20 @@ class DuplexPipeline: return self._runtime_barge_in_min_duration_ms return self._barge_in_min_duration_ms + def _asr_interim_enabled(self) -> bool: + current_mode = self._asr_mode + if not self.asr_service: + current_mode = self._resolve_asr_mode( + self._runtime_asr.get("provider") or settings.asr_provider, + self._runtime_asr.get("mode"), + ) + if current_mode != "offline": + return True + enabled = self._coerce_bool(self._runtime_asr.get("enableInterim")) + if enabled is not None: + return enabled + return bool(settings.asr_enable_interim) + def _barge_in_silence_tolerance_frames(self) -> int: """Convert silence tolerance from ms to frame count using current chunk size.""" chunk_ms = max(1, settings.chunk_size_ms) @@ -887,31 +941,45 @@ class DuplexPipeline: return None return text.strip().strip('"').strip("'") + async def _resolve_provider_initial_greeting(self) -> Optional[str]: + if not self.llm_service or not hasattr(self.llm_service, "get_initial_greeting"): + return None + + try: + greeting = await self.llm_service.get_initial_greeting() + except Exception as exc: + logger.warning("Failed to load provider initial greeting: 
{}", exc) + return None + + text = str(greeting or "").strip() + return text or None + async def start(self) -> None: """Start the pipeline and connect services.""" try: # Connect LLM service if not self.llm_service: llm_provider = (self._runtime_llm.get("provider") or settings.llm_provider).lower() - llm_api_key = self._runtime_llm.get("apiKey") or settings.llm_api_key + llm_api_key = self._runtime_llm.get("apiKey") llm_base_url = ( self._runtime_llm.get("baseUrl") or settings.llm_api_url or self._default_llm_base_url(llm_provider) ) llm_model = self._runtime_llm.get("model") or settings.llm_model - - if self._is_llm_provider_supported(llm_provider) and llm_api_key: - self.llm_service = OpenAILLMService( - api_key=llm_api_key, - base_url=llm_base_url, - model=llm_model, + self.llm_service = self._service_factory.create_llm_service( + LLMServiceSpec( + provider=llm_provider, + model=str(llm_model), + api_key=str(llm_api_key).strip() if llm_api_key else None, + base_url=str(llm_base_url).strip() if llm_base_url else None, + app_id=str(self._runtime_llm.get("appId")).strip() if self._runtime_llm.get("appId") else None, + system_prompt=self.conversation.system_prompt, + temperature=settings.llm_temperature, knowledge_config=self._resolved_knowledge_config(), knowledge_searcher=self._knowledge_searcher, ) - else: - logger.warning("LLM provider unsupported or API key missing - using mock LLM") - self.llm_service = MockLLMService() + ) if hasattr(self.llm_service, "set_knowledge_config"): self.llm_service.set_knowledge_config(self._resolved_knowledge_config()) @@ -926,10 +994,14 @@ class DuplexPipeline: if tts_output_enabled: if not self.tts_service: tts_provider = (self._runtime_tts.get("provider") or settings.tts_provider).lower() - tts_api_key = self._runtime_tts.get("apiKey") or settings.tts_api_key + tts_api_key = self._runtime_tts.get("apiKey") tts_api_url = self._runtime_tts.get("baseUrl") or settings.tts_api_url tts_voice = self._runtime_tts.get("voice") or 
settings.tts_voice tts_model = self._runtime_tts.get("model") or settings.tts_model + tts_app_id = self._runtime_tts.get("appId") or settings.tts_app_id + tts_resource_id = self._runtime_tts.get("resourceId") or settings.tts_resource_id + tts_cluster = self._runtime_tts.get("cluster") or settings.tts_cluster + tts_uid = self._runtime_tts.get("uid") or settings.tts_uid tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed) tts_mode = self._resolved_dashscope_tts_mode() runtime_mode = str(self._runtime_tts.get("mode") or "").strip() @@ -938,41 +1010,33 @@ class DuplexPipeline: "services.tts.mode is DashScope-only and will be ignored " f"for provider={tts_provider}" ) - - if self._is_dashscope_tts_provider(tts_provider) and tts_api_key: - self.tts_service = DashScopeTTSService( - api_key=tts_api_key, - api_url=tts_api_url or self._default_dashscope_tts_realtime_url(), - voice=tts_voice, - model=tts_model or self._default_dashscope_tts_model(), + self.tts_service = self._service_factory.create_tts_service( + TTSServiceSpec( + provider=tts_provider, + api_key=str(tts_api_key).strip() if tts_api_key else None, + api_url=str(tts_api_url).strip() if tts_api_url else None, + voice=str(tts_voice), + model=str(tts_model).strip() if tts_model else None, + app_id=str(tts_app_id).strip() if tts_app_id else None, + resource_id=str(tts_resource_id).strip() if tts_resource_id else None, + cluster=str(tts_cluster).strip() if tts_cluster else None, + uid=str(tts_uid).strip() if tts_uid else None, + sample_rate=settings.sample_rate, + speed=tts_speed, mode=str(tts_mode), - sample_rate=settings.sample_rate, - speed=tts_speed ) - logger.info("Using DashScope realtime TTS service") - elif self._is_openai_compatible_provider(tts_provider) and tts_api_key: - self.tts_service = OpenAICompatibleTTSService( - api_key=tts_api_key, - api_url=tts_api_url, - voice=tts_voice, - model=tts_model or "FunAudioLLM/CosyVoice2-0.5B", - sample_rate=settings.sample_rate, - speed=tts_speed 
- ) - logger.info(f"Using OpenAI-compatible TTS service (provider={tts_provider})") - else: - self.tts_service = EdgeTTSService( - voice=tts_voice, - sample_rate=settings.sample_rate - ) - logger.info("Using Edge TTS service") + ) try: await self.tts_service.connect() except Exception as e: - logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS") - self.tts_service = MockTTSService( - sample_rate=settings.sample_rate + logger.warning(f"TTS backend unavailable ({e}); falling back to default TTS adapter") + self.tts_service = self._service_factory.create_tts_service( + TTSServiceSpec( + provider="mock", + voice="mock", + sample_rate=settings.sample_rate, + ) ) await self.tts_service.connect() else: @@ -982,32 +1046,51 @@ class DuplexPipeline: # Connect ASR service if not self.asr_service: asr_provider = (self._runtime_asr.get("provider") or settings.asr_provider).lower() - asr_api_key = self._runtime_asr.get("apiKey") or settings.asr_api_key + asr_api_key = self._runtime_asr.get("apiKey") asr_api_url = self._runtime_asr.get("baseUrl") or settings.asr_api_url asr_model = self._runtime_asr.get("model") or settings.asr_model + asr_app_id = self._runtime_asr.get("appId") or settings.asr_app_id + asr_resource_id = self._runtime_asr.get("resourceId") or settings.asr_resource_id + asr_cluster = self._runtime_asr.get("cluster") or settings.asr_cluster + asr_uid = self._runtime_asr.get("uid") or settings.asr_uid + asr_request_params = self._coerce_json_object(self._runtime_asr.get("requestParams")) + if asr_request_params is None: + asr_request_params = self._coerce_json_object(settings.asr_request_params_json) + asr_enable_interim = self._coerce_bool(self._runtime_asr.get("enableInterim")) + if asr_enable_interim is None: + asr_enable_interim = bool(settings.asr_enable_interim) asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms) asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or 
settings.asr_min_audio_ms) + asr_mode = self._resolve_asr_mode(asr_provider, self._runtime_asr.get("mode")) - if self._is_openai_compatible_provider(asr_provider) and asr_api_key: - self.asr_service = OpenAICompatibleASRService( - api_key=asr_api_key, - api_url=asr_api_url, - model=asr_model or "FunAudioLLM/SenseVoiceSmall", + self.asr_service = self._service_factory.create_asr_service( + ASRServiceSpec( + provider=asr_provider, sample_rate=settings.sample_rate, + mode=asr_mode, + language="auto", + api_key=str(asr_api_key).strip() if asr_api_key else None, + api_url=str(asr_api_url).strip() if asr_api_url else None, + model=str(asr_model).strip() if asr_model else None, + app_id=str(asr_app_id).strip() if asr_app_id else None, + resource_id=str(asr_resource_id).strip() if asr_resource_id else None, + cluster=str(asr_cluster).strip() if asr_cluster else None, + uid=str(asr_uid).strip() if asr_uid else None, + request_params=asr_request_params, + enable_interim=asr_enable_interim, interim_interval_ms=asr_interim_interval, min_audio_for_interim_ms=asr_min_audio_ms, - on_transcript=self._on_transcript_callback + on_transcript=self._on_transcript_callback, ) - logger.info(f"Using OpenAI-compatible ASR service (provider={asr_provider})") - else: - self.asr_service = BufferedASRService( - sample_rate=settings.sample_rate - ) - logger.info("Using Buffered ASR service (no real transcription)") + ) + self._asr_mode = self._resolve_asr_mode( + self._runtime_asr.get("provider") or settings.asr_provider, + getattr(self.asr_service, "mode", self._runtime_asr.get("mode")), + ) await self.asr_service.connect() - logger.info("DuplexPipeline services connected") + logger.info("DuplexPipeline services connected (asr_mode={})", self._asr_mode) if not self._outbound_task or self._outbound_task.done(): self._outbound_task = asyncio.create_task(self._outbound_loop()) @@ -1031,7 +1114,11 @@ class DuplexPipeline: if not self._bot_starts_first(): return - if 
self._generated_opener_enabled() and self._resolved_tool_schemas(): + provider_greeting = await self._resolve_provider_initial_greeting() + if provider_greeting: + self.conversation.greeting = provider_greeting + + if not provider_greeting and self._generated_opener_enabled() and self._resolved_tool_schemas(): # Run generated opener as a normal tool-capable assistant turn. # Use an empty user input so the opener can be driven by system prompt policy. if self._current_turn_task and not self._current_turn_task.done(): @@ -1042,13 +1129,13 @@ class DuplexPipeline: return manual_opener_execution: Dict[str, List[Dict[str, Any]]] = {"toolCalls": [], "toolResults": []} - if not self._generated_opener_enabled() and self._resolved_manual_opener_tool_calls(): + if not provider_greeting and not self._generated_opener_enabled() and self._resolved_manual_opener_tool_calls(): self._start_turn() self._start_response() manual_opener_execution = await self._execute_manual_opener_tool_calls() greeting_to_speak = self.conversation.greeting - if self._generated_opener_enabled(): + if not provider_greeting and self._generated_opener_enabled(): generated_greeting = await self._generate_runtime_greeting() if generated_greeting: greeting_to_speak = generated_greeting @@ -1472,6 +1559,9 @@ class DuplexPipeline: text: Transcribed text is_final: Whether this is the final transcription """ + if not is_final and not self._asr_interim_enabled(): + return + # Avoid sending duplicate transcripts if text == self._last_sent_transcript and not is_final: return @@ -1480,6 +1570,7 @@ class DuplexPipeline: self._last_sent_transcript = text if is_final: + self._latest_asr_interim_text = "" self._pending_transcript_delta = "" self._last_transcript_delta_emit_ms = 0.0 await self._send_event( @@ -1495,6 +1586,7 @@ class DuplexPipeline: logger.debug(f"Sent transcript (final): {text[:50]}...") return + self._latest_asr_interim_text = text self._pending_transcript_delta = text should_emit = ( 
self._last_transcript_delta_emit_ms <= 0.0 @@ -1518,14 +1610,16 @@ class DuplexPipeline: await self.conversation.start_user_turn() self._audio_buffer = b"" self._last_sent_transcript = "" + self._latest_asr_interim_text = "" self.eou_detector.reset() self._asr_capture_active = False self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" - # Clear ASR buffer. Interim starts only after ASR capture is activated. - if hasattr(self.asr_service, 'clear_buffer'): - self.asr_service.clear_buffer() + if self._asr_mode == "streaming": + self._streaming_asr().clear_utterance() + else: + self._offline_asr().clear_buffer() logger.debug("User speech started") @@ -1534,8 +1628,11 @@ class DuplexPipeline: if self._asr_capture_active: return - if hasattr(self.asr_service, 'start_interim_transcription'): - await self.asr_service.start_interim_transcription() + if self._asr_mode == "streaming": + await self._streaming_asr().begin_utterance() + else: + if self._asr_interim_enabled(): + await self._offline_asr().start_interim_transcription() # Prime ASR with a short pre-speech context window so the utterance # start isn't lost while waiting for VAD to transition to Speech. @@ -1568,24 +1665,22 @@ class DuplexPipeline: self._pending_speech_audio = b"" return - # Add a tiny trailing silence tail to stabilize final-token decoding. 
- if self._asr_final_tail_bytes > 0: - final_tail = b"\x00" * self._asr_final_tail_bytes - await self.asr_service.send_audio(final_tail) - - # Stop interim transcriptions - if hasattr(self.asr_service, 'stop_interim_transcription'): - await self.asr_service.stop_interim_transcription() - - # Get final transcription from ASR service user_text = "" - - if hasattr(self.asr_service, 'get_final_transcription'): - # SiliconFlow ASR - get final transcription - user_text = await self.asr_service.get_final_transcription() - elif hasattr(self.asr_service, 'get_and_clear_text'): - # Buffered ASR - get accumulated text - user_text = self.asr_service.get_and_clear_text() + if self._asr_mode == "streaming": + streaming_asr = self._streaming_asr() + await streaming_asr.end_utterance() + user_text = await streaming_asr.wait_for_final_transcription( + timeout_ms=self._ASR_STREAM_FINAL_TIMEOUT_MS + ) + if not user_text.strip(): + user_text = self._latest_asr_interim_text + else: + # Add a tiny trailing silence tail to stabilize final-token decoding. 
+ if self._asr_final_tail_bytes > 0: + final_tail = b"\x00" * self._asr_final_tail_bytes + await self.asr_service.send_audio(final_tail) + await self._offline_asr().stop_interim_transcription() + user_text = await self._offline_asr().get_final_transcription() # Skip if no meaningful text if not user_text or not user_text.strip(): @@ -1593,6 +1688,7 @@ class DuplexPipeline: # Reset for next utterance self._audio_buffer = b"" self._last_sent_transcript = "" + self._latest_asr_interim_text = "" self._asr_capture_active = False self._asr_capture_started_ms = 0.0 self._pending_speech_audio = b"" @@ -1617,6 +1713,7 @@ class DuplexPipeline: # Clear buffers self._audio_buffer = b"" self._last_sent_transcript = "" + self._latest_asr_interim_text = "" self._pending_transcript_delta = "" self._last_transcript_delta_emit_ms = 0.0 self._asr_capture_active = False @@ -1879,12 +1976,35 @@ class DuplexPipeline: return bool(self._runtime_tool_wait_for_response.get(normalized, False)) def _tool_executor(self, tool_call: Dict[str, Any]) -> str: + explicit_executor = str(tool_call.get("executor") or "").strip().lower() + if explicit_executor in {"client", "server"}: + return explicit_executor name = self._tool_name(tool_call) if name and name in self._runtime_tool_executor: return self._runtime_tool_executor[name] # Default to server execution unless explicitly marked as client. 
return "server" + def _tool_wait_for_response_for_call(self, tool_name: str, tool_call: Dict[str, Any]) -> bool: + explicit_wait = tool_call.get("wait_for_response") + if explicit_wait is None: + explicit_wait = tool_call.get("waitForResponse") + if isinstance(explicit_wait, bool): + return explicit_wait + return self._tool_wait_for_response(tool_name) + + def _tool_timeout_ms(self, tool_call: Dict[str, Any]) -> int: + raw_timeout = tool_call.get("timeout_ms") + if raw_timeout is None: + raw_timeout = tool_call.get("timeoutMs") + try: + timeout_ms = int(raw_timeout) + except (TypeError, ValueError): + timeout_ms = 0 + if timeout_ms > 0: + return timeout_ms + return int(self._TOOL_WAIT_TIMEOUT_SECONDS * 1000) + def _tool_arguments(self, tool_call: Dict[str, Any]) -> Dict[str, Any]: fn = tool_call.get("function") if not isinstance(fn, dict): @@ -2104,7 +2224,7 @@ class DuplexPipeline: self._early_tool_results[call_id] = item self._completed_tool_call_ids.add(call_id) - async def _wait_for_single_tool_result(self, call_id: str) -> Dict[str, Any]: + async def _wait_for_single_tool_result(self, call_id: str, timeout_seconds: Optional[float] = None) -> Dict[str, Any]: if call_id in self._completed_tool_call_ids and call_id not in self._early_tool_results: return { "tool_call_id": call_id, @@ -2118,8 +2238,10 @@ class DuplexPipeline: loop = asyncio.get_running_loop() future = loop.create_future() self._pending_tool_waiters[call_id] = future + timeout = timeout_seconds if isinstance(timeout_seconds, (int, float)) and timeout_seconds > 0 else self._TOOL_WAIT_TIMEOUT_SECONDS + self._pending_tool_deadlines[call_id] = time.monotonic() + timeout try: - return await asyncio.wait_for(future, timeout=self._TOOL_WAIT_TIMEOUT_SECONDS) + return await asyncio.wait_for(future, timeout=timeout) except asyncio.TimeoutError: self._completed_tool_call_ids.add(call_id) return { @@ -2129,8 +2251,14 @@ class DuplexPipeline: } finally: self._pending_tool_waiters.pop(call_id, None) + 
self._pending_tool_deadlines.pop(call_id, None) self._pending_client_tool_call_ids.discard(call_id) + def pending_client_tool_deadline(self) -> Optional[float]: + if not self._pending_tool_deadlines: + return None + return max(self._pending_tool_deadlines.values()) + def _normalize_stream_event(self, item: Any) -> LLMStreamEvent: if isinstance(item, LLMStreamEvent): return item @@ -2171,7 +2299,8 @@ class DuplexPipeline: messages = self.conversation.get_messages() if system_context and system_context.strip(): messages = [*messages, LLMMessage(role="system", content=system_context.strip())] - max_rounds = 3 + llm_rounds = 0 + provider_rounds_remaining = self._MAX_PROVIDER_MANAGED_ROUNDS await self.conversation.start_assistant_turn() self._is_bot_speaking = True @@ -2181,10 +2310,28 @@ class DuplexPipeline: first_audio_sent = False self._pending_llm_delta = "" self._last_llm_delta_emit_ms = 0.0 - for _ in range(max_rounds): + pending_provider_stream = None + while True: if self._interrupt_event.is_set(): break + if pending_provider_stream is not None: + if provider_rounds_remaining <= 0: + logger.warning( + "Provider-managed tool chain exceeded {} rounds; ending turn early", + self._MAX_PROVIDER_MANAGED_ROUNDS, + ) + break + provider_rounds_remaining -= 1 + else: + if llm_rounds >= self._MAX_LLM_ROUNDS: + logger.warning( + "LLM tool planning exceeded {} rounds; ending turn early", + self._MAX_LLM_ROUNDS, + ) + break + llm_rounds += 1 + sentence_buffer = "" pending_punctuation = "" round_response = "" @@ -2192,7 +2339,10 @@ class DuplexPipeline: allow_text_output = True use_engine_sentence_split = self._use_engine_sentence_split_for_tts() - async for raw_event in self.llm_service.generate_stream(messages): + stream_iter = pending_provider_stream if pending_provider_stream is not None else self.llm_service.generate_stream(messages) + pending_provider_stream = None + + async for raw_event in stream_iter: if self._interrupt_event.is_set(): break @@ -2207,14 +2357,21 @@ 
class DuplexPipeline: if not tool_call: continue allow_text_output = False + tool_name = self._tool_name(tool_call) or "unknown_tool" executor = self._tool_executor(tool_call) enriched_tool_call = dict(tool_call) enriched_tool_call["executor"] = executor - tool_name = self._tool_name(enriched_tool_call) or "unknown_tool" tool_id = self._tool_id_for_name(tool_name) - tool_display_name = self._tool_display_name(tool_name) or tool_name - wait_for_response = self._tool_wait_for_response(tool_name) + tool_display_name = str( + enriched_tool_call.get("displayName") + or enriched_tool_call.get("display_name") + or self._tool_display_name(tool_name) + or tool_name + ).strip() + wait_for_response = self._tool_wait_for_response_for_call(tool_name, enriched_tool_call) enriched_tool_call["wait_for_response"] = wait_for_response + timeout_ms = self._tool_timeout_ms(enriched_tool_call) + enriched_tool_call["timeout_ms"] = timeout_ms call_id = str(enriched_tool_call.get("id") or "").strip() fn_payload = ( dict(enriched_tool_call.get("function")) @@ -2223,6 +2380,15 @@ class DuplexPipeline: ) raw_args = str(fn_payload.get("arguments") or "") if isinstance(fn_payload, dict) else "" tool_arguments = self._tool_arguments(enriched_tool_call) + if tool_name == "fastgpt.interactive": + context_payload = ( + dict(tool_arguments.get("context")) + if isinstance(tool_arguments.get("context"), dict) + else {} + ) + context_payload.setdefault("turn_id", turn_id) + context_payload.setdefault("response_id", response_id) + tool_arguments["context"] = context_payload merged_tool_arguments = self._apply_tool_default_args(tool_name, tool_arguments) try: merged_args_text = json.dumps(merged_tool_arguments, ensure_ascii=False) @@ -2249,9 +2415,9 @@ class DuplexPipeline: tool_id=tool_id, tool_display_name=tool_display_name, wait_for_response=wait_for_response, - arguments=tool_arguments, + arguments=merged_tool_arguments, executor=executor, - timeout_ms=int(self._TOOL_WAIT_TIMEOUT_SECONDS * 1000), + 
timeout_ms=timeout_ms, tool_call=enriched_tool_call, ) }, @@ -2382,6 +2548,8 @@ class DuplexPipeline: break tool_results: List[Dict[str, Any]] = [] + provider_managed_tool = False + provider_resumed = False for call in tool_calls: call_id = str(call.get("id") or "").strip() if not call_id: @@ -2391,9 +2559,27 @@ class DuplexPipeline: tool_id = self._tool_id_for_name(tool_name) logger.info(f"[Tool] execute start name={tool_name} call_id={call_id} executor={executor}") if executor == "client": - result = await self._wait_for_single_tool_result(call_id) + timeout_ms = self._tool_timeout_ms(call) + result = await self._wait_for_single_tool_result( + call_id, + timeout_seconds=(timeout_ms / 1000.0), + ) await self._emit_tool_result(result, source="client") tool_results.append(result) + if ( + hasattr(self.llm_service, "handles_client_tool") + and hasattr(self.llm_service, "resume_after_client_tool_result") + and self.llm_service.handles_client_tool(tool_name) + ): + provider_managed_tool = True + status = result.get("status") if isinstance(result.get("status"), dict) else {} + status_code = int(status.get("code") or 0) if status else 0 + output = result.get("output") if isinstance(result.get("output"), dict) else {} + action = str(output.get("action") or "").strip().lower() + if 200 <= status_code < 300 and action != "cancel": + pending_provider_stream = self.llm_service.resume_after_client_tool_result(call_id, result) + provider_resumed = True + break continue call_for_executor = dict(call) @@ -2420,6 +2606,11 @@ class DuplexPipeline: await self._emit_tool_result(result, source="server") tool_results.append(result) + if provider_resumed: + continue + if provider_managed_tool: + break + messages = [ *messages, LLMMessage( diff --git a/engine/runtime/pipeline/events_out.py b/engine/runtime/pipeline/events_out.py new file mode 100644 index 0000000..dabbafb --- /dev/null +++ b/engine/runtime/pipeline/events_out.py @@ -0,0 +1,12 @@ +"""Output-event shaping helpers for the 
runtime pipeline.""" + +from __future__ import annotations + +from typing import Any, Dict + + +def assistant_text_delta_event(text: str, **extra: Any) -> Dict[str, Any]: + """Build a normalized assistant text delta payload.""" + payload: Dict[str, Any] = {"type": "assistant.text.delta", "text": str(text)} + payload.update(extra) + return payload diff --git a/engine/runtime/pipeline/interrupts.py b/engine/runtime/pipeline/interrupts.py new file mode 100644 index 0000000..1960d56 --- /dev/null +++ b/engine/runtime/pipeline/interrupts.py @@ -0,0 +1,8 @@ +"""Interruption-related helpers extracted from the duplex pipeline.""" + +from __future__ import annotations + + +def should_interrupt(min_duration_ms: int, detected_ms: int) -> bool: + """Decide whether interruption conditions are met.""" + return int(detected_ms) >= max(0, int(min_duration_ms)) diff --git a/engine/runtime/pipeline/llm_flow.py b/engine/runtime/pipeline/llm_flow.py new file mode 100644 index 0000000..d938fb0 --- /dev/null +++ b/engine/runtime/pipeline/llm_flow.py @@ -0,0 +1,13 @@ +"""LLM flow helpers extracted from the duplex pipeline. + +This module is intentionally lightweight for phase-wise migration. 
+""" + +from __future__ import annotations + +from providers.common.base import LLMStreamEvent + + +def is_done_event(event: LLMStreamEvent) -> bool: + """Return whether an LLM stream event signals completion.""" + return str(event.type) == "done" diff --git a/engine/runtime/pipeline/tooling.py b/engine/runtime/pipeline/tooling.py new file mode 100644 index 0000000..459dceb --- /dev/null +++ b/engine/runtime/pipeline/tooling.py @@ -0,0 +1,13 @@ +"""Tooling helpers extracted from the duplex pipeline.""" + +from __future__ import annotations + +from typing import Any + + +def normalize_tool_name(name: Any, aliases: dict[str, str]) -> str: + """Normalize tool name with alias mapping.""" + normalized = str(name or "").strip() + if not normalized: + return "" + return aliases.get(normalized, normalized) diff --git a/engine/runtime/pipeline/tts_flow.py b/engine/runtime/pipeline/tts_flow.py new file mode 100644 index 0000000..156547e --- /dev/null +++ b/engine/runtime/pipeline/tts_flow.py @@ -0,0 +1,15 @@ +"""TTS flow helpers extracted from the duplex pipeline. + +This module is intentionally lightweight for phase-wise migration. 
+""" + +from __future__ import annotations + +from providers.common.base import TTSChunk + + +def chunk_duration_ms(chunk: TTSChunk) -> float: + """Estimate chunk duration in milliseconds for pcm16 mono.""" + if chunk.sample_rate <= 0: + return 0.0 + return (len(chunk.audio) / 2.0 / float(chunk.sample_rate)) * 1000.0 diff --git a/engine/runtime/ports/__init__.py b/engine/runtime/ports/__init__.py new file mode 100644 index 0000000..0ef9fe6 --- /dev/null +++ b/engine/runtime/ports/__init__.py @@ -0,0 +1,46 @@ +"""Port interfaces for runtime integration boundaries.""" + +from runtime.ports.asr import ( + ASRMode, + ASRPort, + ASRServiceSpec, + OfflineASRPort, + StreamingASRPort, +) +from runtime.ports.control_plane import ( + AssistantRuntimeConfigProvider, + ControlPlaneGateway, + ConversationHistoryStore, + KnowledgeRetriever, + ToolCatalog, +) +from runtime.ports.llm import ( + LLMCancellable, + LLMClientToolResumable, + LLMPort, + LLMRuntimeConfigurable, + LLMServiceSpec, +) +from runtime.ports.service_factory import RealtimeServiceFactory +from runtime.ports.tts import TTSPort, TTSServiceSpec + +__all__ = [ + "ASRMode", + "ASRPort", + "ASRServiceSpec", + "OfflineASRPort", + "StreamingASRPort", + "AssistantRuntimeConfigProvider", + "ControlPlaneGateway", + "ConversationHistoryStore", + "KnowledgeRetriever", + "ToolCatalog", + "LLMCancellable", + "LLMClientToolResumable", + "LLMPort", + "LLMRuntimeConfigurable", + "LLMServiceSpec", + "RealtimeServiceFactory", + "TTSPort", + "TTSServiceSpec", +] diff --git a/engine/runtime/ports/asr.py b/engine/runtime/ports/asr.py new file mode 100644 index 0000000..b1310b1 --- /dev/null +++ b/engine/runtime/ports/asr.py @@ -0,0 +1,90 @@ +"""ASR extension port contracts.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Literal, Optional, Protocol + +from providers.common.base import ASRResult + +TranscriptCallback = Callable[[str, 
bool], Awaitable[None]] +ASRMode = Literal["offline", "streaming"] + + +@dataclass(frozen=True) +class ASRServiceSpec: + """Resolved runtime configuration for ASR service creation.""" + + provider: str + sample_rate: int + mode: Optional[ASRMode] = None + language: str = "auto" + api_key: Optional[str] = None + api_url: Optional[str] = None + model: Optional[str] = None + app_id: Optional[str] = None + resource_id: Optional[str] = None + cluster: Optional[str] = None + uid: Optional[str] = None + request_params: Optional[Dict[str, Any]] = None + enable_interim: bool = False + interim_interval_ms: int = 500 + min_audio_for_interim_ms: int = 300 + on_transcript: Optional[TranscriptCallback] = None + + +class ASRPort(Protocol): + """Port for speech recognition providers.""" + + mode: ASRMode + + async def connect(self) -> None: + """Establish connection to ASR provider.""" + + async def disconnect(self) -> None: + """Release ASR resources.""" + + async def send_audio(self, audio: bytes) -> None: + """Push one PCM audio chunk for recognition.""" + + async def receive_transcripts(self) -> AsyncIterator[ASRResult]: + """Stream partial/final recognition results.""" + + +class OfflineASRPort(ASRPort, Protocol): + """Port for offline/buffered ASR providers.""" + + mode: Literal["offline"] + + async def start_interim_transcription(self) -> None: + """Start interim transcription loop.""" + + async def stop_interim_transcription(self) -> None: + """Stop interim transcription loop.""" + + def clear_buffer(self) -> None: + """Clear provider-side ASR buffer.""" + + async def get_final_transcription(self) -> str: + """Return final transcription for the current utterance.""" + + def get_and_clear_text(self) -> str: + """Return buffered text and clear internal state.""" + + +class StreamingASRPort(ASRPort, Protocol): + """Port for streaming ASR providers.""" + + mode: Literal["streaming"] + + async def begin_utterance(self) -> None: + """Start a new utterance stream.""" + + async 
def end_utterance(self) -> None: + """Signal end of current utterance stream.""" + + async def wait_for_final_transcription(self, timeout_ms: int = 800) -> str: + """Wait for final transcript after utterance end.""" + + def clear_utterance(self) -> None: + """Reset utterance-local state.""" diff --git a/engine/core/ports/backend.py b/engine/runtime/ports/control_plane.py similarity index 75% rename from engine/core/ports/backend.py rename to engine/runtime/ports/control_plane.py index 227c743..c50d642 100644 --- a/engine/core/ports/backend.py +++ b/engine/runtime/ports/control_plane.py @@ -1,7 +1,7 @@ -"""Backend integration ports. +"""Control-plane integration ports. These interfaces define the boundary between engine runtime logic and -backend-side capabilities (config lookup, history persistence, retrieval, +control-plane capabilities (config lookup, history persistence, retrieval, and tool resource discovery). """ @@ -10,14 +10,14 @@ from __future__ import annotations from typing import Any, Dict, List, Optional, Protocol -class AssistantConfigProvider(Protocol): +class AssistantRuntimeConfigProvider(Protocol): """Port for loading trusted assistant runtime configuration.""" async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]: """Fetch assistant configuration payload.""" -class HistoryWriter(Protocol): +class ConversationHistoryStore(Protocol): """Port for persisting call and transcript history.""" async def create_call_record( @@ -27,7 +27,7 @@ class HistoryWriter(Protocol): assistant_id: Optional[str], source: str = "debug", ) -> Optional[str]: - """Create a call record and return backend call ID.""" + """Create a call record and return control-plane call ID.""" async def add_transcript( self, @@ -53,7 +53,7 @@ class HistoryWriter(Protocol): """Finalize a call record.""" -class KnowledgeSearcher(Protocol): +class KnowledgeRetriever(Protocol): """Port for RAG / knowledge retrieval operations.""" async def 
search_knowledge_context( @@ -66,19 +66,18 @@ class KnowledgeSearcher(Protocol): """Search a knowledge source and return ranked snippets.""" -class ToolResourceResolver(Protocol): +class ToolCatalog(Protocol): """Port for resolving tool metadata/configuration.""" async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]: """Fetch tool resource configuration.""" -class BackendGateway( - AssistantConfigProvider, - HistoryWriter, - KnowledgeSearcher, - ToolResourceResolver, +class ControlPlaneGateway( + AssistantRuntimeConfigProvider, + ConversationHistoryStore, + KnowledgeRetriever, + ToolCatalog, Protocol, ): - """Composite backend gateway interface used by engine services.""" - + """Composite control-plane gateway used by engine services.""" diff --git a/engine/runtime/ports/llm.py b/engine/runtime/ports/llm.py new file mode 100644 index 0000000..a5480f4 --- /dev/null +++ b/engine/runtime/ports/llm.py @@ -0,0 +1,82 @@ +"""LLM extension port contracts.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Protocol + +from providers.common.base import LLMMessage, LLMStreamEvent + +KnowledgeRetrieverFn = Callable[..., Awaitable[List[Dict[str, Any]]]] + + +@dataclass(frozen=True) +class LLMServiceSpec: + """Resolved runtime configuration for LLM service creation.""" + + provider: str + model: str + api_key: Optional[str] = None + base_url: Optional[str] = None + app_id: Optional[str] = None + system_prompt: Optional[str] = None + temperature: float = 0.7 + knowledge_config: Dict[str, Any] = field(default_factory=dict) + knowledge_searcher: Optional[KnowledgeRetrieverFn] = None + + +class LLMPort(Protocol): + """Port for LLM providers.""" + + async def connect(self) -> None: + """Establish connection to LLM provider.""" + + async def disconnect(self) -> None: + """Release LLM resources.""" + + async def generate( + self, + messages: 
List[LLMMessage], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + ) -> str: + """Generate a complete assistant response.""" + + async def generate_stream( + self, + messages: List[LLMMessage], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + ) -> AsyncIterator[LLMStreamEvent]: + """Generate streaming assistant response events.""" + + +class LLMCancellable(Protocol): + """Optional extension for interrupting in-flight LLM generation.""" + + def cancel(self) -> None: + """Cancel an in-flight generation request.""" + + +class LLMRuntimeConfigurable(Protocol): + """Optional extension for runtime config updates.""" + + def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None: + """Apply runtime knowledge retrieval settings.""" + + def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None: + """Apply runtime tool schemas used for tool calling.""" + + +class LLMClientToolResumable(Protocol): + """Optional extension for providers that pause on client-side tool results.""" + + def handles_client_tool(self, tool_name: str) -> bool: + """Return True when the provider owns the lifecycle of this client tool.""" + + def resume_after_client_tool_result( + self, + tool_call_id: str, + result: Dict[str, Any], + ) -> AsyncIterator[LLMStreamEvent]: + """Resume the provider stream after a correlated client-side tool result.""" diff --git a/engine/runtime/ports/service_factory.py b/engine/runtime/ports/service_factory.py new file mode 100644 index 0000000..7ce8b77 --- /dev/null +++ b/engine/runtime/ports/service_factory.py @@ -0,0 +1,22 @@ +"""Factory port for creating runtime ASR/LLM/TTS services.""" + +from __future__ import annotations + +from typing import Protocol + +from runtime.ports.asr import ASRPort, ASRServiceSpec +from runtime.ports.llm import LLMPort, LLMServiceSpec +from runtime.ports.tts import TTSPort, TTSServiceSpec + + +class RealtimeServiceFactory(Protocol): + """Port for provider-specific service 
construction.""" + + def create_llm_service(self, spec: LLMServiceSpec) -> LLMPort: + """Create an LLM service instance from a resolved spec.""" + + def create_tts_service(self, spec: TTSServiceSpec) -> TTSPort: + """Create a TTS service instance from a resolved spec.""" + + def create_asr_service(self, spec: ASRServiceSpec) -> ASRPort: + """Create an ASR service instance from a resolved spec.""" diff --git a/engine/runtime/ports/tts.py b/engine/runtime/ports/tts.py new file mode 100644 index 0000000..a98e17d --- /dev/null +++ b/engine/runtime/ports/tts.py @@ -0,0 +1,45 @@ +"""TTS extension port contracts.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import AsyncIterator, Optional, Protocol + +from providers.common.base import TTSChunk + + +@dataclass(frozen=True) +class TTSServiceSpec: + """Resolved runtime configuration for TTS service creation.""" + + provider: str + voice: str + sample_rate: int + speed: float = 1.0 + api_key: Optional[str] = None + api_url: Optional[str] = None + model: Optional[str] = None + app_id: Optional[str] = None + resource_id: Optional[str] = None + cluster: Optional[str] = None + uid: Optional[str] = None + mode: str = "commit" + + +class TTSPort(Protocol): + """Port for speech synthesis providers.""" + + async def connect(self) -> None: + """Establish connection to TTS provider.""" + + async def disconnect(self) -> None: + """Release TTS resources.""" + + async def synthesize(self, text: str) -> bytes: + """Synthesize complete PCM payload for text.""" + + async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: + """Stream synthesized PCM chunks for text.""" + + async def cancel(self) -> None: + """Cancel an in-flight synthesis request.""" diff --git a/engine/runtime/session/__init__.py b/engine/runtime/session/__init__.py new file mode 100644 index 0000000..d224fb9 --- /dev/null +++ b/engine/runtime/session/__init__.py @@ -0,0 +1 @@ +"""Runtime session package.""" diff 
--git a/engine/runtime/session/lifecycle.py b/engine/runtime/session/lifecycle.py new file mode 100644 index 0000000..9fd8ebf --- /dev/null +++ b/engine/runtime/session/lifecycle.py @@ -0,0 +1,10 @@ +"""Lifecycle helper utilities for runtime sessions.""" + +from __future__ import annotations + +from datetime import datetime, timezone + + +def utc_now_iso() -> str: + """Return current UTC timestamp in ISO 8601 format.""" + return datetime.now(timezone.utc).isoformat() diff --git a/engine/core/session.py b/engine/runtime/session/manager.py similarity index 92% rename from engine/core/session.py rename to engine/runtime/session/manager.py index de00855..1f65ea7 100644 --- a/engine/core/session.py +++ b/engine/runtime/session/manager.py @@ -9,15 +9,22 @@ from enum import Enum from typing import Optional, Dict, Any, List from loguru import logger -from app.backend_adapters import build_backend_adapter_from_settings -from core.transports import BaseTransport -from core.duplex_pipeline import DuplexPipeline -from core.conversation import ConversationTurn -from core.history_bridge import SessionHistoryBridge -from core.workflow_runner import WorkflowRunner, WorkflowTransition, WorkflowNodeDef, WorkflowEdgeDef +from adapters.control_plane.backend import build_backend_adapter_from_settings +from runtime.transports import BaseTransport +from runtime.ports import ( + AssistantRuntimeConfigProvider, + ControlPlaneGateway, + ConversationHistoryStore, + KnowledgeRetriever, + ToolCatalog, +) +from runtime.pipeline.duplex import DuplexPipeline +from runtime.conversation import ConversationTurn +from runtime.history.bridge import SessionHistoryBridge +from workflow.runner import WorkflowRunner, WorkflowTransition, WorkflowNodeDef, WorkflowEdgeDef from app.config import settings -from services.base import LLMMessage -from models.ws_v1 import ( +from providers.common.base import LLMMessage +from protocol.ws_v1.schema import ( parse_client_message, ev, SessionStartMessage, @@ -54,7 
+61,7 @@ class Session: TRACK_AUDIO_IN = "audio_in" TRACK_AUDIO_OUT = "audio_out" TRACK_CONTROL = "control" - AUDIO_FRAME_BYTES = 640 # 16k mono pcm_s16le, 20ms + AUDIO_FRAME_BYTES = 640 # Legacy fallback: 16k mono pcm_s16le, 20ms _METADATA_ALLOWED_TOP_LEVEL_KEYS = { "overrides", "dynamicVariables", @@ -97,7 +104,11 @@ class Session: session_id: str, transport: BaseTransport, use_duplex: bool = None, - backend_gateway: Optional[Any] = None, + control_plane_gateway: Optional[ControlPlaneGateway] = None, + runtime_config_provider: Optional[AssistantRuntimeConfigProvider] = None, + history_store: Optional[ConversationHistoryStore] = None, + knowledge_retriever: Optional[KnowledgeRetriever] = None, + tool_catalog: Optional[ToolCatalog] = None, assistant_id: Optional[str] = None, ): """ @@ -107,14 +118,24 @@ class Session: session_id: Unique session identifier transport: Transport instance for communication use_duplex: Whether to use duplex pipeline (defaults to settings.duplex_enabled) + control_plane_gateway: Optional composite control-plane dependency + runtime_config_provider: Optional assistant runtime config provider + history_store: Optional conversation history store + knowledge_retriever: Optional knowledge retrieval dependency + tool_catalog: Optional tool resource catalog """ self.id = session_id self.transport = transport self.use_duplex = use_duplex if use_duplex is not None else settings.duplex_enabled + self.audio_frame_bytes = self._compute_audio_frame_bytes() self._assistant_id = str(assistant_id or "").strip() or None - self._backend_gateway = backend_gateway or build_backend_adapter_from_settings() + self._control_plane_gateway = control_plane_gateway or build_backend_adapter_from_settings() + self._runtime_config_provider = runtime_config_provider or self._control_plane_gateway + self._history_store = history_store or self._control_plane_gateway + self._knowledge_retriever = knowledge_retriever or self._control_plane_gateway + self._tool_catalog = 
tool_catalog or self._control_plane_gateway self._history_bridge = SessionHistoryBridge( - history_writer=self._backend_gateway, + history_writer=self._history_store, enabled=settings.history_enabled, queue_max_size=settings.history_queue_max_size, retry_max_attempts=settings.history_retry_max_attempts, @@ -127,8 +148,8 @@ class Session: session_id=session_id, system_prompt=settings.duplex_system_prompt, greeting=settings.duplex_greeting, - knowledge_searcher=getattr(self._backend_gateway, "search_knowledge_context", None), - tool_resource_resolver=getattr(self._backend_gateway, "fetch_tool_resource", None), + knowledge_searcher=getattr(self._knowledge_retriever, "search_knowledge_context", None), + tool_resource_resolver=getattr(self._tool_catalog, "fetch_tool_resource", None), ) # Session state @@ -210,11 +231,14 @@ class Session: ) return - frame_bytes = self.AUDIO_FRAME_BYTES + frame_bytes = getattr(self, "audio_frame_bytes", self._compute_audio_frame_bytes()) if len(audio_bytes) % frame_bytes != 0: await self._send_error( "client", - f"Audio frame size must be a multiple of {frame_bytes} bytes (20ms PCM)", + ( + f"Audio frame size must be a multiple of {frame_bytes} bytes " + f"({settings.chunk_size_ms}ms PCM @ {settings.sample_rate}Hz)" + ), "audio.frame_size_mismatch", stage="audio", retryable=False, @@ -384,6 +408,7 @@ class Session: ev( "session.started", trackId=self.current_track_id, + protocolVersion=self._public_ws_protocol_version(), tracks={ "audio_in": self.TRACK_AUDIO_IN, "audio_out": self.TRACK_AUDIO_OUT, @@ -930,18 +955,18 @@ class Session: self, assistant_id: str, ) -> tuple[Dict[str, Any], Optional[Dict[str, str]]]: - """Load trusted runtime metadata from backend assistant config.""" + """Load trusted runtime metadata from control-plane assistant config.""" if not assistant_id: return {}, { "code": "protocol.assistant_id_required", "message": "Missing required query parameter assistant_id", } - provider = getattr(self._backend_gateway, 
"fetch_assistant_config", None) + provider = getattr(self._runtime_config_provider, "fetch_assistant_config", None) if not callable(provider): return {}, { "code": "assistant.config_unavailable", - "message": "Assistant config backend unavailable", + "message": "Assistant config control plane unavailable", } payload = await provider(str(assistant_id).strip()) @@ -1137,6 +1162,7 @@ class Session: output_mode = str(runtime_output.get("mode") or "").strip().lower() if isinstance(runtime_output, dict) else "" if output_mode not in {"audio", "text"}: output_mode = "audio" + output_codec = str(runtime_output.get("codec") or settings.default_codec or "pcm").strip().lower() or "pcm" tools_allowlist: List[str] = [] runtime_tools = runtime.get("tools", {}) if isinstance(runtime, dict) else {} @@ -1146,7 +1172,11 @@ class Session: tools_allowlist = [str(item) for item in allowlist if item is not None and str(item).strip()] resolved: Dict[str, Any] = { - "output": {"mode": output_mode}, + "protocolVersion": self._public_ws_protocol_version(), + "output": { + "mode": output_mode, + "codec": output_codec, + }, "tools": { "enabled": bool(tools_allowlist), "count": len(tools_allowlist), @@ -1162,6 +1192,24 @@ class Session: return resolved + @staticmethod + def _compute_audio_frame_bytes() -> int: + """Compute expected PCM frame bytes from SAMPLE_RATE and CHUNK_SIZE_MS.""" + sample_rate = max(1, int(getattr(settings, "sample_rate", 16000))) + chunk_ms = max(1, int(getattr(settings, "chunk_size_ms", 20))) + bytes_per_frame = int(round(sample_rate * 2 * (chunk_ms / 1000.0))) + if bytes_per_frame < 2: + bytes_per_frame = 2 + if bytes_per_frame % 2 != 0: + bytes_per_frame += 1 + return bytes_per_frame + + @staticmethod + def _public_ws_protocol_version() -> str: + """Return public protocol version label announced to clients.""" + version = str(getattr(settings, "ws_protocol_version", "v1") or "v1").strip() + return version or "v1" + def _extract_json_obj(self, text: str) -> 
Optional[Dict[str, Any]]: """Best-effort extraction of a JSON object from freeform text.""" try: diff --git a/engine/runtime/session/metadata.py b/engine/runtime/session/metadata.py new file mode 100644 index 0000000..ab32971 --- /dev/null +++ b/engine/runtime/session/metadata.py @@ -0,0 +1,9 @@ +"""Metadata helpers extracted from session manager.""" + +from __future__ import annotations + +import re +from typing import Pattern + +DYNAMIC_VARIABLE_KEY_RE: Pattern[str] = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$") +DYNAMIC_VARIABLE_PLACEHOLDER_RE: Pattern[str] = re.compile(r"\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}") diff --git a/engine/runtime/session/workflow_bridge.py b/engine/runtime/session/workflow_bridge.py new file mode 100644 index 0000000..e03d939 --- /dev/null +++ b/engine/runtime/session/workflow_bridge.py @@ -0,0 +1,12 @@ +"""Workflow bridge helpers for runtime session orchestration.""" + +from __future__ import annotations + +from typing import Optional + +from workflow.runner import WorkflowRunner + + +def has_active_workflow(workflow_runner: Optional[WorkflowRunner]) -> bool: + """Return whether a workflow runner exists and has a current node.""" + return bool(workflow_runner and workflow_runner.current_node is not None) diff --git a/engine/core/transports.py b/engine/runtime/transports.py similarity index 100% rename from engine/core/transports.py rename to engine/runtime/transports.py diff --git a/engine/scripts/generate_test_audio/.env.example b/engine/scripts/generate_test_audio/.env.example new file mode 100644 index 0000000..72b7707 --- /dev/null +++ b/engine/scripts/generate_test_audio/.env.example @@ -0,0 +1 @@ +SILICONFLOW_API_KEY=your-siliconflow-api-key \ No newline at end of file diff --git a/engine/services/__init__.py b/engine/services/__init__.py deleted file mode 100644 index 0e46834..0000000 --- a/engine/services/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -"""AI Services package. 
- -Provides ASR, LLM, TTS, and Realtime API services for voice conversation. -""" - -from services.base import ( - ServiceState, - ASRResult, - LLMMessage, - TTSChunk, - BaseASRService, - BaseLLMService, - BaseTTSService, -) -from services.llm import OpenAILLMService, MockLLMService -from services.dashscope_tts import DashScopeTTSService -from services.tts import EdgeTTSService, MockTTSService -from services.asr import BufferedASRService, MockASRService -from services.openai_compatible_asr import OpenAICompatibleASRService, SiliconFlowASRService -from services.openai_compatible_tts import OpenAICompatibleTTSService, SiliconFlowTTSService -from services.streaming_tts_adapter import StreamingTTSAdapter -from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline - -__all__ = [ - # Base classes - "ServiceState", - "ASRResult", - "LLMMessage", - "TTSChunk", - "BaseASRService", - "BaseLLMService", - "BaseTTSService", - # LLM - "OpenAILLMService", - "MockLLMService", - # TTS - "DashScopeTTSService", - "EdgeTTSService", - "MockTTSService", - # ASR - "BufferedASRService", - "MockASRService", - "OpenAICompatibleASRService", - "SiliconFlowASRService", - # TTS (SiliconFlow) - "OpenAICompatibleTTSService", - "SiliconFlowTTSService", - "StreamingTTSAdapter", - # Realtime - "RealtimeService", - "RealtimeConfig", - "RealtimePipeline", -] diff --git a/engine/services/tts.py b/engine/services/tts.py deleted file mode 100644 index e838f08..0000000 --- a/engine/services/tts.py +++ /dev/null @@ -1,271 +0,0 @@ -"""TTS (Text-to-Speech) Service implementations. - -Provides multiple TTS backend options including edge-tts (free) -and placeholder for cloud services. 
-""" - -import os -import io -import asyncio -import struct -from typing import AsyncIterator, Optional -from loguru import logger - -from services.base import BaseTTSService, TTSChunk, ServiceState - -# Try to import edge-tts -try: - import edge_tts - EDGE_TTS_AVAILABLE = True -except ImportError: - EDGE_TTS_AVAILABLE = False - logger.warning("edge-tts not available - EdgeTTS service will be disabled") - - -class EdgeTTSService(BaseTTSService): - """ - Microsoft Edge TTS service. - - Uses edge-tts library for free, high-quality speech synthesis. - Supports streaming for low-latency playback. - """ - - # Voice mapping for common languages - VOICE_MAP = { - "en": "en-US-JennyNeural", - "en-US": "en-US-JennyNeural", - "en-GB": "en-GB-SoniaNeural", - "zh": "zh-CN-XiaoxiaoNeural", - "zh-CN": "zh-CN-XiaoxiaoNeural", - "zh-TW": "zh-TW-HsiaoChenNeural", - "ja": "ja-JP-NanamiNeural", - "ko": "ko-KR-SunHiNeural", - "fr": "fr-FR-DeniseNeural", - "de": "de-DE-KatjaNeural", - "es": "es-ES-ElviraNeural", - } - - def __init__( - self, - voice: str = "en-US-JennyNeural", - sample_rate: int = 16000, - speed: float = 1.0 - ): - """ - Initialize Edge TTS service. 
- - Args: - voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en") - sample_rate: Target sample rate (will be resampled) - speed: Speech speed multiplier - """ - # Resolve voice from language code if needed - if voice in self.VOICE_MAP: - voice = self.VOICE_MAP[voice] - - super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) - self._cancel_event = asyncio.Event() - - async def connect(self) -> None: - """Edge TTS doesn't require explicit connection.""" - if not EDGE_TTS_AVAILABLE: - raise RuntimeError("edge-tts package not installed") - self.state = ServiceState.CONNECTED - logger.info(f"Edge TTS service ready: voice={self.voice}") - - async def disconnect(self) -> None: - """Edge TTS doesn't require explicit disconnection.""" - self.state = ServiceState.DISCONNECTED - logger.info("Edge TTS service disconnected") - - def _get_rate_string(self) -> str: - """Convert speed to rate string for edge-tts.""" - # edge-tts uses percentage format: "+0%", "-10%", "+20%" - percentage = int((self.speed - 1.0) * 100) - if percentage >= 0: - return f"+{percentage}%" - return f"{percentage}%" - - async def synthesize(self, text: str) -> bytes: - """ - Synthesize complete audio for text. - - Args: - text: Text to synthesize - - Returns: - PCM audio data (16-bit, mono, 16kHz) - """ - if not EDGE_TTS_AVAILABLE: - raise RuntimeError("edge-tts not available") - - # Collect all chunks - audio_data = b"" - async for chunk in self.synthesize_stream(text): - audio_data += chunk.audio - - return audio_data - - async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: - """ - Synthesize audio in streaming mode. 
- - Args: - text: Text to synthesize - - Yields: - TTSChunk objects with PCM audio - """ - if not EDGE_TTS_AVAILABLE: - raise RuntimeError("edge-tts not available") - - self._cancel_event.clear() - - try: - communicate = edge_tts.Communicate( - text, - voice=self.voice, - rate=self._get_rate_string() - ) - - # edge-tts outputs MP3, we need to decode to PCM - # For now, collect MP3 chunks and yield after conversion - mp3_data = b"" - - async for chunk in communicate.stream(): - # Check for cancellation - if self._cancel_event.is_set(): - logger.info("TTS synthesis cancelled") - return - - if chunk["type"] == "audio": - mp3_data += chunk["data"] - - # Convert MP3 to PCM - if mp3_data: - pcm_data = await self._convert_mp3_to_pcm(mp3_data) - if pcm_data: - # Yield in chunks for streaming playback - chunk_size = self.sample_rate * 2 // 10 # 100ms chunks - for i in range(0, len(pcm_data), chunk_size): - if self._cancel_event.is_set(): - return - - chunk_data = pcm_data[i:i + chunk_size] - yield TTSChunk( - audio=chunk_data, - sample_rate=self.sample_rate, - is_final=(i + chunk_size >= len(pcm_data)) - ) - - except asyncio.CancelledError: - logger.info("TTS synthesis cancelled via asyncio") - raise - except Exception as e: - logger.error(f"TTS synthesis error: {e}") - raise - - async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes: - """ - Convert MP3 audio to PCM. - - Uses pydub or ffmpeg for conversion. 
- """ - try: - # Try using pydub (requires ffmpeg) - from pydub import AudioSegment - - # Load MP3 from bytes - audio = AudioSegment.from_mp3(io.BytesIO(mp3_data)) - - # Convert to target format - audio = audio.set_frame_rate(self.sample_rate) - audio = audio.set_channels(1) - audio = audio.set_sample_width(2) # 16-bit - - # Export as raw PCM - return audio.raw_data - - except ImportError: - logger.warning("pydub not available, trying fallback") - # Fallback: Use subprocess to call ffmpeg directly - return await self._ffmpeg_convert(mp3_data) - except Exception as e: - logger.error(f"Audio conversion error: {e}") - return b"" - - async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes: - """Convert MP3 to PCM using ffmpeg subprocess.""" - try: - process = await asyncio.create_subprocess_exec( - "ffmpeg", - "-i", "pipe:0", - "-f", "s16le", - "-acodec", "pcm_s16le", - "-ar", str(self.sample_rate), - "-ac", "1", - "pipe:1", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.DEVNULL - ) - - stdout, _ = await process.communicate(input=mp3_data) - return stdout - - except Exception as e: - logger.error(f"ffmpeg conversion error: {e}") - return b"" - - async def cancel(self) -> None: - """Cancel ongoing synthesis.""" - self._cancel_event.set() - - -class MockTTSService(BaseTTSService): - """ - Mock TTS service for testing without actual synthesis. - - Generates silence or simple tones. 
- """ - - def __init__( - self, - voice: str = "mock", - sample_rate: int = 16000, - speed: float = 1.0 - ): - super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) - - async def connect(self) -> None: - self.state = ServiceState.CONNECTED - logger.info("Mock TTS service connected") - - async def disconnect(self) -> None: - self.state = ServiceState.DISCONNECTED - logger.info("Mock TTS service disconnected") - - async def synthesize(self, text: str) -> bytes: - """Generate silence based on text length.""" - # Approximate: 100ms per word - word_count = len(text.split()) - duration_ms = word_count * 100 - samples = int(self.sample_rate * duration_ms / 1000) - - # Generate silence (zeros) - return bytes(samples * 2) # 16-bit = 2 bytes per sample - - async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: - """Generate silence chunks.""" - audio = await self.synthesize(text) - - # Yield in 100ms chunks - chunk_size = self.sample_rate * 2 // 10 - for i in range(0, len(audio), chunk_size): - chunk_data = audio[i:i + chunk_size] - yield TTSChunk( - audio=chunk_data, - sample_rate=self.sample_rate, - is_final=(i + chunk_size >= len(audio)) - ) - await asyncio.sleep(0.05) # Simulate processing time diff --git a/engine/tests/test_agent_config.py b/engine/tests/test_agent_config.py index 6432581..90bb277 100644 --- a/engine/tests/test_agent_config.py +++ b/engine/tests/test_agent_config.py @@ -1,293 +1,21 @@ -import os -from pathlib import Path +import importlib -import pytest -os.environ.setdefault("LLM_API_KEY", "test-openai-key") -os.environ.setdefault("TTS_API_KEY", "test-tts-key") -os.environ.setdefault("ASR_API_KEY", "test-asr-key") +def test_settings_load_from_environment(monkeypatch): + monkeypatch.setenv("HOST", "127.0.0.1") + monkeypatch.setenv("PORT", "8123") -from app.config import load_settings + import app.config as config_module + importlib.reload(config_module) + settings = config_module.get_settings() + assert settings.host == 
"127.0.0.1" + assert settings.port == 8123 -def _write_yaml(path: Path, content: str) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(content, encoding="utf-8") +def test_assistant_local_config_dir_default_present(): + import app.config as config_module -def _full_agent_yaml(llm_model: str = "gpt-4o-mini", llm_key: str = "test-openai-key") -> str: - return f""" -agent: - vad: - type: silero - model_path: data/vad/silero_vad.onnx - threshold: 0.63 - min_speech_duration_ms: 100 - eou_threshold_ms: 800 - - llm: - provider: openai_compatible - model: {llm_model} - temperature: 0.2 - api_key: {llm_key} - api_url: https://example-llm.invalid/v1 - - tts: - provider: openai_compatible - api_key: test-tts-key - api_url: https://example-tts.invalid/v1/audio/speech - model: FunAudioLLM/CosyVoice2-0.5B - voice: anna - speed: 1.0 - - asr: - provider: openai_compatible - api_key: test-asr-key - api_url: https://example-asr.invalid/v1/audio/transcriptions - model: FunAudioLLM/SenseVoiceSmall - interim_interval_ms: 500 - min_audio_ms: 300 - start_min_speech_ms: 160 - pre_speech_ms: 240 - final_tail_ms: 120 - - duplex: - enabled: true - system_prompt: You are a strict test assistant. 
- - barge_in: - min_duration_ms: 200 - silence_tolerance_ms: 60 -""".strip() - - -def _dashscope_tts_yaml() -> str: - return _full_agent_yaml().replace( - """ tts: - provider: openai_compatible - api_key: test-tts-key - api_url: https://example-tts.invalid/v1/audio/speech - model: FunAudioLLM/CosyVoice2-0.5B - voice: anna - speed: 1.0 -""", - """ tts: - provider: dashscope - api_key: test-dashscope-key - voice: Cherry - speed: 1.0 -""", - ) - - -def test_cli_profile_loads_agent_yaml(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - config_dir = tmp_path / "config" / "agents" - _write_yaml( - config_dir / "support.yaml", - _full_agent_yaml(llm_model="gpt-4.1-mini"), - ) - - settings = load_settings( - argv=["--agent-profile", "support"], - ) - - assert settings.llm_model == "gpt-4.1-mini" - assert settings.llm_temperature == 0.2 - assert settings.vad_threshold == 0.63 - assert settings.agent_config_source == "cli_profile" - assert settings.agent_config_path == str((config_dir / "support.yaml").resolve()) - - -def test_cli_path_has_higher_priority_than_env(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - env_file = tmp_path / "config" / "agents" / "env.yaml" - cli_file = tmp_path / "config" / "agents" / "cli.yaml" - - _write_yaml(env_file, _full_agent_yaml(llm_model="env-model")) - _write_yaml(cli_file, _full_agent_yaml(llm_model="cli-model")) - - monkeypatch.setenv("AGENT_CONFIG_PATH", str(env_file)) - - settings = load_settings(argv=["--agent-config", str(cli_file)]) - - assert settings.llm_model == "cli-model" - assert settings.agent_config_source == "cli_path" - assert settings.agent_config_path == str(cli_file.resolve()) - - -def test_default_yaml_is_loaded_without_args_or_env(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - default_file = tmp_path / "config" / "agents" / "default.yaml" - _write_yaml(default_file, _full_agent_yaml(llm_model="from-default")) - - monkeypatch.delenv("AGENT_CONFIG_PATH", raising=False) - 
monkeypatch.delenv("AGENT_PROFILE", raising=False) - - settings = load_settings(argv=[]) - - assert settings.llm_model == "from-default" - assert settings.agent_config_source == "default" - assert settings.agent_config_path == str(default_file.resolve()) - - -def test_missing_required_agent_settings_fail(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "missing-required.yaml" - _write_yaml( - file_path, - """ -agent: - llm: - model: gpt-4o-mini -""".strip(), - ) - - with pytest.raises(ValueError, match="Missing required agent settings in YAML"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_blank_required_provider_key_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "blank-key.yaml" - _write_yaml(file_path, _full_agent_yaml(llm_key="")) - - with pytest.raises(ValueError, match="Missing required agent settings in YAML"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_missing_tts_api_url_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "missing-tts-url.yaml" - _write_yaml( - file_path, - _full_agent_yaml().replace( - " api_url: https://example-tts.invalid/v1/audio/speech\n", - "", - ), - ) - - with pytest.raises(ValueError, match="Missing required agent settings in YAML"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_dashscope_tts_allows_default_url_and_model(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "dashscope-tts.yaml" - _write_yaml(file_path, _dashscope_tts_yaml()) - - settings = load_settings(argv=["--agent-config", str(file_path)]) - - assert settings.tts_provider == "dashscope" - assert settings.tts_api_key == "test-dashscope-key" - assert settings.tts_api_url is None - assert settings.tts_model is None - - -def test_dashscope_tts_requires_api_key(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / 
"dashscope-tts-missing-key.yaml" - _write_yaml(file_path, _dashscope_tts_yaml().replace(" api_key: test-dashscope-key\n", "")) - - with pytest.raises(ValueError, match="Missing required agent settings in YAML"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_missing_asr_api_url_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "missing-asr-url.yaml" - _write_yaml( - file_path, - _full_agent_yaml().replace( - " api_url: https://example-asr.invalid/v1/audio/transcriptions\n", - "", - ), - ) - - with pytest.raises(ValueError, match="Missing required agent settings in YAML"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_agent_yaml_unknown_key_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "bad-agent.yaml" - _write_yaml(file_path, _full_agent_yaml() + "\n unknown_option: true") - - with pytest.raises(ValueError, match="Unknown agent config keys"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_legacy_siliconflow_section_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "legacy-siliconflow.yaml" - _write_yaml( - file_path, - """ -agent: - siliconflow: - api_key: x -""".strip(), - ) - - with pytest.raises(ValueError, match="Section 'siliconflow' is no longer supported"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_agent_yaml_missing_env_reference_fails(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "bad-ref.yaml" - _write_yaml( - file_path, - _full_agent_yaml(llm_key="${UNSET_LLM_API_KEY}"), - ) - - with pytest.raises(ValueError, match="Missing environment variable"): - load_settings(argv=["--agent-config", str(file_path)]) - - -def test_agent_yaml_tools_list_is_loaded(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "tools-agent.yaml" - _write_yaml( - file_path, - _full_agent_yaml() - + """ - - tools: - - 
current_time - - name: weather - description: Get weather by city. - parameters: - type: object - properties: - city: - type: string - required: [city] - executor: server -""", - ) - - settings = load_settings(argv=["--agent-config", str(file_path)]) - - assert isinstance(settings.tools, list) - assert settings.tools[0] == "current_time" - assert settings.tools[1]["name"] == "weather" - assert settings.tools[1]["executor"] == "server" - - -def test_agent_yaml_tools_must_be_list(monkeypatch, tmp_path): - monkeypatch.chdir(tmp_path) - file_path = tmp_path / "bad-tools-agent.yaml" - _write_yaml( - file_path, - _full_agent_yaml() - + """ - - tools: - weather: - executor: server -""", - ) - - with pytest.raises(ValueError, match="Agent config key 'tools' must be a list"): - load_settings(argv=["--agent-config", str(file_path)]) + settings = config_module.get_settings() + assert isinstance(settings.assistant_local_config_dir, str) + assert settings.assistant_local_config_dir diff --git a/engine/tests/test_asr_factory_modes.py b/engine/tests/test_asr_factory_modes.py new file mode 100644 index 0000000..5cd78f8 --- /dev/null +++ b/engine/tests/test_asr_factory_modes.py @@ -0,0 +1,71 @@ +from providers.asr.buffered import BufferedASRService +from providers.asr.dashscope import DashScopeRealtimeASRService +from providers.asr.openai_compatible import OpenAICompatibleASRService +from providers.asr.volcengine import VolcengineRealtimeASRService +from providers.factory.default import DefaultRealtimeServiceFactory +from runtime.ports import ASRServiceSpec + + +def test_create_asr_service_dashscope_returns_streaming_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_asr_service( + ASRServiceSpec( + provider="dashscope", + mode="streaming", + sample_rate=16000, + api_key="test-key", + model="qwen3-asr-flash-realtime", + ) + ) + assert isinstance(service, DashScopeRealtimeASRService) + assert service.mode == "streaming" + + +def 
test_create_asr_service_openai_compatible_returns_offline_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_asr_service( + ASRServiceSpec( + provider="openai_compatible", + sample_rate=16000, + api_key="test-key", + model="FunAudioLLM/SenseVoiceSmall", + ) + ) + assert isinstance(service, OpenAICompatibleASRService) + assert service.mode == "offline" + assert service.enable_interim is False + + +def test_create_asr_service_volcengine_returns_streaming_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_asr_service( + ASRServiceSpec( + provider="volcengine", + mode="streaming", + sample_rate=16000, + api_key="test-key", + api_url="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + model="bigmodel", + app_id="app-1", + uid="caller-1", + request_params={"end_window_size": 800}, + ) + ) + assert isinstance(service, VolcengineRealtimeASRService) + assert service.mode == "streaming" + assert service.protocol == "seed" + assert service.app_id == "app-1" + assert service.uid == "caller-1" + assert service.request_params["end_window_size"] == 800 + + +def test_create_asr_service_fallback_buffered_for_unsupported_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_asr_service( + ASRServiceSpec( + provider="unknown_provider", + sample_rate=16000, + ) + ) + assert isinstance(service, BufferedASRService) + assert service.mode == "offline" diff --git a/engine/tests/test_backend_adapters.py b/engine/tests/test_backend_adapters.py index d55f5e2..81f1134 100644 --- a/engine/tests/test_backend_adapters.py +++ b/engine/tests/test_backend_adapters.py @@ -1,25 +1,43 @@ import aiohttp import pytest -from app.backend_adapters import ( - HistoryDisabledBackendAdapter, - HttpBackendAdapter, - NullBackendAdapter, +from adapters.control_plane.backend import ( + AssistantConfigSourceAdapter, + LocalYamlAssistantConfigAdapter, build_backend_adapter, ) @pytest.mark.asyncio -async def 
test_build_backend_adapter_without_url_returns_null_adapter(): +async def test_without_backend_url_uses_local_yaml_for_assistant_config(tmp_path): + config_dir = tmp_path / "assistants" + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "dev_local.yaml").write_text( + "\n".join( + [ + "assistant:", + " assistantId: dev_local", + " systemPrompt: local prompt", + " greeting: local greeting", + ] + ), + encoding="utf-8", + ) + adapter = build_backend_adapter( backend_url=None, backend_mode="auto", history_enabled=True, timeout_sec=3, + assistant_local_config_dir=str(config_dir), ) - assert isinstance(adapter, NullBackendAdapter) + assert isinstance(adapter, AssistantConfigSourceAdapter) - assert await adapter.fetch_assistant_config("assistant_1") is None + payload = await adapter.fetch_assistant_config("dev_local") + assert isinstance(payload, dict) + assert payload.get("__error_code") in (None, "") + assert payload["assistant"]["assistantId"] == "dev_local" + assert payload["assistant"]["systemPrompt"] == "local prompt" assert ( await adapter.create_call_record( user_id=1, @@ -54,7 +72,7 @@ async def test_build_backend_adapter_without_url_returns_null_adapter(): @pytest.mark.asyncio -async def test_http_backend_adapter_create_call_record_posts_expected_payload(monkeypatch): +async def test_http_backend_adapter_create_call_record_posts_expected_payload(monkeypatch, tmp_path): captured = {} class _FakeResponse: @@ -90,15 +108,31 @@ async def test_http_backend_adapter_create_call_record_posts_expected_payload(mo captured["json"] = json return _FakeResponse(status=200, payload={"id": "call_123"}) - monkeypatch.setattr("app.backend_adapters.aiohttp.ClientSession", _FakeClientSession) + def get(self, url): + _ = url + return _FakeResponse( + status=200, + payload={ + "assistant": { + "assistantId": "assistant_9", + "systemPrompt": "backend prompt", + } + }, + ) + + monkeypatch.setattr("adapters.control_plane.backend.aiohttp.ClientSession", _FakeClientSession) 
+ + config_dir = tmp_path / "assistants" + config_dir.mkdir(parents=True, exist_ok=True) adapter = build_backend_adapter( backend_url="http://localhost:8100", backend_mode="auto", history_enabled=True, timeout_sec=7, + assistant_local_config_dir=str(config_dir), ) - assert isinstance(adapter, HttpBackendAdapter) + assert isinstance(adapter, AssistantConfigSourceAdapter) call_id = await adapter.create_call_record( user_id=99, @@ -119,25 +153,195 @@ async def test_http_backend_adapter_create_call_record_posts_expected_payload(mo @pytest.mark.asyncio -async def test_backend_mode_disabled_forces_null_even_with_url(): +async def test_with_backend_url_uses_backend_for_assistant_config(monkeypatch, tmp_path): + class _FakeResponse: + def __init__(self, status=200, payload=None): + self.status = status + self._payload = payload if payload is not None else {} + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + async def json(self): + return self._payload + + def raise_for_status(self): + if self.status >= 400: + raise RuntimeError("http_error") + + class _FakeClientSession: + def __init__(self, timeout=None): + self.timeout = timeout + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + def get(self, url): + _ = url + return _FakeResponse( + status=200, + payload={ + "assistant": { + "assistantId": "dev_http", + "systemPrompt": "backend prompt", + } + }, + ) + + def post(self, url, json=None): + _ = (url, json) + return _FakeResponse(status=200, payload={"id": "call_1"}) + + monkeypatch.setattr("adapters.control_plane.backend.aiohttp.ClientSession", _FakeClientSession) + + config_dir = tmp_path / "assistants" + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "dev_http.yaml").write_text( + "\n".join( + [ + "assistant:", + " assistantId: dev_http", + " systemPrompt: local prompt", + ] + ), + encoding="utf-8", + ) + + adapter = 
build_backend_adapter( + backend_url="http://localhost:8100", + backend_mode="auto", + history_enabled=True, + timeout_sec=3, + assistant_local_config_dir=str(config_dir), + ) + assert isinstance(adapter, AssistantConfigSourceAdapter) + + payload = await adapter.fetch_assistant_config("dev_http") + assert payload["assistant"]["assistantId"] == "dev_http" + assert payload["assistant"]["systemPrompt"] == "backend prompt" + + +def test_translate_agent_schema_maps_volcengine_fields(): + payload = { + "agent": { + "tts": { + "provider": "volcengine", + "api_key": "tts-key", + "api_url": "https://openspeech.bytedance.com/api/v3/tts/unidirectional", + "app_id": "app-123", + "resource_id": "seed-tts-2.0", + "uid": "caller-1", + "voice": "zh_female_shuangkuaisisi_moon_bigtts", + "speed": 1.1, + }, + "asr": { + "provider": "volcengine", + "api_key": "asr-key", + "api_url": "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + "model": "bigmodel", + "app_id": "app-123", + "resource_id": "volc.bigasr.sauc.duration", + "uid": "caller-1", + "request_params": { + "end_window_size": 800, + "force_to_speech_time": 1000, + }, + }, + } + } + + translated = LocalYamlAssistantConfigAdapter._translate_agent_schema("assistant_demo", payload) + assert translated is not None + assert translated["services"]["tts"] == { + "provider": "volcengine", + "apiKey": "tts-key", + "baseUrl": "https://openspeech.bytedance.com/api/v3/tts/unidirectional", + "voice": "zh_female_shuangkuaisisi_moon_bigtts", + "appId": "app-123", + "resourceId": "seed-tts-2.0", + "uid": "caller-1", + "speed": 1.1, + } + assert translated["services"]["asr"] == { + "provider": "volcengine", + "model": "bigmodel", + "apiKey": "asr-key", + "baseUrl": "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + "appId": "app-123", + "resourceId": "volc.bigasr.sauc.duration", + "uid": "caller-1", + "requestParams": { + "end_window_size": 800, + "force_to_speech_time": 1000, + }, + } + + +def 
test_translate_agent_schema_maps_llm_app_id(): + payload = { + "agent": { + "llm": { + "provider": "fastgpt", + "model": "fastgpt", + "api_key": "llm-key", + "api_url": "https://cloud.fastgpt.cn/api", + "app_id": "app-fastgpt-123", + }, + } + } + + translated = LocalYamlAssistantConfigAdapter._translate_agent_schema("assistant_demo", payload) + assert translated is not None + assert translated["services"]["llm"] == { + "provider": "fastgpt", + "model": "fastgpt", + "apiKey": "llm-key", + "baseUrl": "https://cloud.fastgpt.cn/api", + "appId": "app-fastgpt-123", + } + + +@pytest.mark.asyncio +async def test_backend_mode_disabled_uses_local_assistant_config_even_with_url(monkeypatch, tmp_path): + class _FailIfCalledClientSession: + def __init__(self, timeout=None): + _ = timeout + raise AssertionError("HTTP client should not be created when backend_mode=disabled") + + monkeypatch.setattr("adapters.control_plane.backend.aiohttp.ClientSession", _FailIfCalledClientSession) + + config_dir = tmp_path / "assistants" + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "dev_disabled.yaml").write_text( + "\n".join( + [ + "assistant:", + " assistantId: dev_disabled", + " systemPrompt: local disabled prompt", + ] + ), + encoding="utf-8", + ) + adapter = build_backend_adapter( backend_url="http://localhost:8100", backend_mode="disabled", history_enabled=True, - timeout_sec=7, + timeout_sec=3, + assistant_local_config_dir=str(config_dir), ) - assert isinstance(adapter, NullBackendAdapter) + assert isinstance(adapter, AssistantConfigSourceAdapter) + payload = await adapter.fetch_assistant_config("dev_disabled") + assert payload["assistant"]["assistantId"] == "dev_disabled" + assert payload["assistant"]["systemPrompt"] == "local disabled prompt" -@pytest.mark.asyncio -async def test_history_disabled_wraps_backend_adapter(): - adapter = build_backend_adapter( - backend_url="http://localhost:8100", - backend_mode="auto", - history_enabled=False, - timeout_sec=7, - ) - 
assert isinstance(adapter, HistoryDisabledBackendAdapter) assert await adapter.create_call_record(user_id=1, assistant_id="a1", source="debug") is None assert await adapter.add_transcript( call_id="c1", @@ -148,3 +352,55 @@ async def test_history_disabled_wraps_backend_adapter(): end_ms=10, duration_ms=10, ) is False + + +@pytest.mark.asyncio +async def test_local_yaml_adapter_rejects_path_traversal_like_assistant_id(tmp_path): + adapter = LocalYamlAssistantConfigAdapter(str(tmp_path)) + payload = await adapter.fetch_assistant_config("../etc/passwd") + assert payload == {"__error_code": "assistant.not_found", "assistantId": "../etc/passwd"} + + +@pytest.mark.asyncio +async def test_local_yaml_translates_agent_schema_with_asr_interim_flag(tmp_path): + config_dir = tmp_path / "assistants" + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "default.yaml").write_text( + "\n".join( + [ + "agent:", + " llm:", + " provider: openai", + " model: gpt-4o-mini", + " api_key: sk-llm", + " api_url: https://api.example.com/v1", + " tts:", + " provider: openai_compatible", + " model: tts-model", + " api_key: sk-tts", + " api_url: https://tts.example.com/v1/audio/speech", + " voice: anna", + " asr:", + " provider: openai_compatible", + " model: asr-model", + " api_key: sk-asr", + " api_url: https://asr.example.com/v1/audio/transcriptions", + " enable_interim: false", + " duplex:", + " system_prompt: You are test assistant", + ] + ), + encoding="utf-8", + ) + + adapter = LocalYamlAssistantConfigAdapter(str(config_dir)) + payload = await adapter.fetch_assistant_config("default") + + assert isinstance(payload, dict) + assistant = payload.get("assistant", {}) + services = assistant.get("services", {}) + assert services.get("llm", {}).get("apiKey") == "sk-llm" + assert services.get("tts", {}).get("apiKey") == "sk-tts" + assert services.get("asr", {}).get("apiKey") == "sk-asr" + assert services.get("asr", {}).get("enableInterim") is False + assert 
assistant.get("systemPrompt") == "You are test assistant" diff --git a/engine/tests/test_dashscope_asr_provider.py b/engine/tests/test_dashscope_asr_provider.py new file mode 100644 index 0000000..123530a --- /dev/null +++ b/engine/tests/test_dashscope_asr_provider.py @@ -0,0 +1,67 @@ +import asyncio + +import pytest + +from providers.asr.dashscope import DashScopeRealtimeASRService + + +@pytest.mark.asyncio +async def test_dashscope_asr_interim_event_emits_interim_transcript(): + received = [] + + async def _on_transcript(text: str, is_final: bool) -> None: + received.append((text, is_final)) + + service = DashScopeRealtimeASRService(api_key="test-key", on_transcript=_on_transcript) + service._loop = asyncio.get_running_loop() + service._running = True + + service._on_ws_event( + { + "type": "conversation.item.input_audio_transcription.text", + "stash": "你好世界", + } + ) + await asyncio.sleep(0.05) + + result = service._transcript_queue.get_nowait() + assert result.text == "你好世界" + assert result.is_final is False + assert received == [("你好世界", False)] + + +@pytest.mark.asyncio +async def test_dashscope_asr_final_event_emits_final_transcript_and_final_queue(): + received = [] + + async def _on_transcript(text: str, is_final: bool) -> None: + received.append((text, is_final)) + + service = DashScopeRealtimeASRService(api_key="test-key", on_transcript=_on_transcript) + service._loop = asyncio.get_running_loop() + service._running = True + service._audio_sent_in_utterance = True + + service._on_ws_event( + { + "type": "conversation.item.input_audio_transcription.completed", + "transcript": "最终识别结果", + } + ) + await asyncio.sleep(0.05) + + result = service._transcript_queue.get_nowait() + assert result.text == "最终识别结果" + assert result.is_final is True + assert service._final_queue.get_nowait() == "最终识别结果" + assert received == [("最终识别结果", True)] + + +@pytest.mark.asyncio +async def test_dashscope_wait_for_final_falls_back_to_latest_interim_on_timeout(): + service = 
DashScopeRealtimeASRService(api_key="test-key") + service._audio_sent_in_utterance = True + service._last_interim_text = "部分结果" + + text = await service.wait_for_final_transcription(timeout_ms=10) + assert text == "部分结果" diff --git a/engine/tests/test_duplex_asr_modes.py b/engine/tests/test_duplex_asr_modes.py new file mode 100644 index 0000000..3e4b1cf --- /dev/null +++ b/engine/tests/test_duplex_asr_modes.py @@ -0,0 +1,260 @@ +import asyncio +from typing import Any, Dict, List + +import pytest + +from runtime.pipeline.duplex import DuplexPipeline + + +class _DummySileroVAD: + def __init__(self, *args, **kwargs): + pass + + def process_audio(self, _pcm: bytes) -> float: + return 0.0 + + +class _DummyVADProcessor: + def __init__(self, *args, **kwargs): + pass + + def process(self, _speech_prob: float): + return "Silence", 0.0 + + +class _DummyEouDetector: + def __init__(self, *args, **kwargs): + self.is_speaking = True + + def process(self, _vad_status: str, force_eligible: bool = False) -> bool: + _ = force_eligible + return False + + def reset(self) -> None: + self.is_speaking = False + + +class _FakeTransport: + async def send_event(self, _event: Dict[str, Any]) -> None: + return None + + async def send_audio(self, _audio: bytes) -> None: + return None + + +class _FakeStreamingASR: + mode = "streaming" + + def __init__(self): + self.begin_calls = 0 + self.end_calls = 0 + self.wait_calls = 0 + self.sent_audio: List[bytes] = [] + self.wait_text = "" + + async def connect(self) -> None: + return None + + async def disconnect(self) -> None: + return None + + async def send_audio(self, audio: bytes) -> None: + self.sent_audio.append(audio) + + async def receive_transcripts(self): + if False: + yield None + + async def begin_utterance(self) -> None: + self.begin_calls += 1 + + async def end_utterance(self) -> None: + self.end_calls += 1 + + async def wait_for_final_transcription(self, timeout_ms: int = 800) -> str: + _ = timeout_ms + self.wait_calls += 1 + return 
self.wait_text + + def clear_utterance(self) -> None: + return None + + +class _FakeOfflineASR: + mode = "offline" + + def __init__(self): + self.start_interim_calls = 0 + self.stop_interim_calls = 0 + self.sent_audio: List[bytes] = [] + self.final_text = "offline final" + + async def connect(self) -> None: + return None + + async def disconnect(self) -> None: + return None + + async def send_audio(self, audio: bytes) -> None: + self.sent_audio.append(audio) + + async def receive_transcripts(self): + if False: + yield None + + async def start_interim_transcription(self) -> None: + self.start_interim_calls += 1 + + async def stop_interim_transcription(self) -> None: + self.stop_interim_calls += 1 + + async def get_final_transcription(self) -> str: + return self.final_text + + def clear_buffer(self) -> None: + return None + + def get_and_clear_text(self) -> str: + return self.final_text + + +def _build_pipeline(monkeypatch, asr_service): + monkeypatch.setattr("runtime.pipeline.duplex.SileroVAD", _DummySileroVAD) + monkeypatch.setattr("runtime.pipeline.duplex.VADProcessor", _DummyVADProcessor) + monkeypatch.setattr("runtime.pipeline.duplex.EouDetector", _DummyEouDetector) + return DuplexPipeline( + transport=_FakeTransport(), + session_id="asr_mode_test", + asr_service=asr_service, + ) + + +@pytest.mark.asyncio +async def test_start_asr_capture_uses_streaming_begin(monkeypatch): + asr = _FakeStreamingASR() + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "streaming" + pipeline._pending_speech_audio = b"\x00" * 320 + pipeline._pre_speech_buffer = b"\x00" * 640 + + await pipeline._start_asr_capture() + + assert asr.begin_calls == 1 + assert asr.sent_audio + assert pipeline._asr_capture_active is True + + +@pytest.mark.asyncio +async def test_start_asr_capture_uses_offline_interim_control_when_enabled(monkeypatch): + asr = _FakeOfflineASR() + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "offline" + 
pipeline._runtime_asr["enableInterim"] = True + pipeline._pending_speech_audio = b"\x00" * 320 + pipeline._pre_speech_buffer = b"\x00" * 640 + + await pipeline._start_asr_capture() + + assert asr.start_interim_calls == 1 + assert asr.sent_audio + assert pipeline._asr_capture_active is True + + +@pytest.mark.asyncio +async def test_start_asr_capture_skips_offline_interim_control_when_disabled(monkeypatch): + asr = _FakeOfflineASR() + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "offline" + pipeline._runtime_asr["enableInterim"] = False + pipeline._pending_speech_audio = b"\x00" * 320 + pipeline._pre_speech_buffer = b"\x00" * 640 + + await pipeline._start_asr_capture() + + assert asr.start_interim_calls == 0 + assert asr.sent_audio + assert pipeline._asr_capture_active is True + + +@pytest.mark.asyncio +async def test_offline_interim_callback_ignored_when_disabled(monkeypatch): + asr = _FakeOfflineASR() + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "offline" + pipeline._runtime_asr["enableInterim"] = False + + captured_events = [] + captured_deltas = [] + + async def _capture_event(event: Dict[str, Any], priority: int = 20): + _ = priority + captured_events.append(event) + + async def _capture_delta(text: str): + captured_deltas.append(text) + + monkeypatch.setattr(pipeline, "_send_event", _capture_event) + monkeypatch.setattr(pipeline, "_emit_transcript_delta", _capture_delta) + + await pipeline._on_transcript_callback("ignored interim", is_final=False) + + assert captured_events == [] + assert captured_deltas == [] + assert pipeline._latest_asr_interim_text == "" + + +@pytest.mark.asyncio +async def test_offline_final_callback_emits_when_interim_disabled(monkeypatch): + asr = _FakeOfflineASR() + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "offline" + pipeline._runtime_asr["enableInterim"] = False + + captured_events = [] + + async def _capture_event(event: Dict[str, Any], priority: int = 20): + 
_ = priority + captured_events.append(event) + + monkeypatch.setattr(pipeline, "_send_event", _capture_event) + + await pipeline._on_transcript_callback("final only", is_final=True) + + assert any(event.get("type") == "transcript.final" for event in captured_events) + + +@pytest.mark.asyncio +async def test_streaming_eou_falls_back_to_latest_interim(monkeypatch): + asr = _FakeStreamingASR() + asr.wait_text = "" + pipeline = _build_pipeline(monkeypatch, asr) + pipeline._asr_mode = "streaming" + pipeline._asr_capture_active = True + pipeline._latest_asr_interim_text = "fallback interim text" + await pipeline.conversation.start_user_turn() + + captured_events = [] + captured_turns = [] + + async def _capture_event(event: Dict[str, Any], priority: int = 20): + _ = priority + captured_events.append(event) + + async def _noop_stop_current_speech() -> None: + return None + + async def _capture_turn(user_text: str, *args, **kwargs) -> None: + _ = (args, kwargs) + captured_turns.append(user_text) + + monkeypatch.setattr(pipeline, "_send_event", _capture_event) + monkeypatch.setattr(pipeline, "_stop_current_speech", _noop_stop_current_speech) + monkeypatch.setattr(pipeline, "_handle_turn", _capture_turn) + + await pipeline._on_end_of_utterance() + await asyncio.sleep(0.05) + + assert asr.end_calls == 1 + assert asr.wait_calls == 1 + assert captured_turns == ["fallback interim text"] + assert any(event.get("type") == "transcript.final" for event in captured_events) diff --git a/engine/tests/test_dynamic_variables.py b/engine/tests/test_dynamic_variables.py index f7982c7..d6744af 100644 --- a/engine/tests/test_dynamic_variables.py +++ b/engine/tests/test_dynamic_variables.py @@ -1,4 +1,4 @@ -from core.session import Session +from runtime.session.manager import Session def _session() -> Session: diff --git a/engine/tests/test_fastgpt_provider.py b/engine/tests/test_fastgpt_provider.py new file mode 100644 index 0000000..9b7d63a --- /dev/null +++ 
b/engine/tests/test_fastgpt_provider.py @@ -0,0 +1,411 @@ +import json +from types import SimpleNamespace +from typing import Any, Dict, List + +import pytest + +from providers.common.base import LLMMessage +from providers.llm.fastgpt import FastGPTLLMService + + +class _FakeResponse: + def __init__(self, events: List[Any]): + self.events = events + self.closed = False + + async def close(self) -> None: + self.closed = True + + +class _FakeJSONResponse: + def __init__(self, payload: Dict[str, Any], status_code: int = 200): + self._payload = payload + self.status_code = status_code + + def json(self) -> Dict[str, Any]: + return dict(self._payload) + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + + +class _FakeAsyncStreamResponse(_FakeResponse): + def __init__(self, events: List[Any]): + super().__init__(events) + self.aclosed = False + + def close(self) -> None: + raise AssertionError("sync close should not be used for async stream responses") + + async def aclose(self) -> None: + self.aclosed = True + + +class _FakeAsyncChatClient: + responses: List[_FakeResponse] = [] + init_payload: Dict[str, Any] | None = None + + def __init__(self, api_key: str, base_url: str): + self.api_key = api_key + self.base_url = base_url + self.requests: List[Dict[str, Any]] = [] + self.init_requests: List[Dict[str, Any]] = [] + + async def create_chat_completion(self, **kwargs): + self.requests.append(dict(kwargs)) + if not self.responses: + raise AssertionError("No fake FastGPT response queued") + return self.responses.pop(0) + + async def get_chat_init(self, **kwargs): + self.init_requests.append(dict(kwargs)) + return _FakeJSONResponse( + self.init_payload or {"data": {"app": {"chatConfig": {"welcomeText": ""}}}}, + ) + + async def close(self) -> None: + return None + + +async def _fake_aiter_stream_events(response: _FakeResponse): + for event in response.events: + yield event + + +@pytest.mark.asyncio +async 
def test_fastgpt_provider_streams_text_from_data_event(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="data", + data={"choices": [{"delta": {"content": "Hello from FastGPT."}}]}, + ), + SimpleNamespace(kind="done", data={}), + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Hi")])] + + assert [event.type for event in events] == ["text_delta", "done"] + assert events[0].text == "Hello from FastGPT." + assert service.client.requests[0]["messages"] == [{"role": "user", "content": "Hi"}] + assert service.client.requests[0]["chatId"] == service._state.chat_id + + +@pytest.mark.asyncio +async def test_fastgpt_provider_streams_text_from_answer_delta_event(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="answer", + data={"choices": [{"delta": {"content": "Hello from answer delta."}}]}, + ), + SimpleNamespace(kind="done", data={}), + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Hi")])] + + assert [event.type for event in events] == ["text_delta", "done"] + assert events[0].text == "Hello from answer delta." 
+ + +@pytest.mark.asyncio +async def test_fastgpt_provider_uses_async_close_for_stream_responses(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + response = _FakeAsyncStreamResponse( + [ + SimpleNamespace( + kind="data", + data={"choices": [{"delta": {"content": "Hello from FastGPT."}}]}, + ), + SimpleNamespace(kind="done", data={}), + ] + ) + _FakeAsyncChatClient.responses = [response] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Hi")])] + + assert [event.type for event in events] == ["text_delta", "done"] + assert response.aclosed is True + + +@pytest.mark.asyncio +async def test_fastgpt_provider_loads_initial_greeting_from_chat_init(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.init_payload = { + "data": { + "app": { + "chatConfig": { + "welcomeText": "Hello from FastGPT init.", + } + } + } + } + + service = FastGPTLLMService( + api_key="key", + base_url="https://fastgpt.example", + app_id="app-123", + ) + await service.connect() + + greeting = await service.get_initial_greeting() + + assert greeting == "Hello from FastGPT init." 
+ assert service.client.init_requests[0] == { + "appId": "app-123", + "chatId": service._state.chat_id, + } + + +@pytest.mark.asyncio +async def test_fastgpt_provider_maps_interactive_event_to_client_tool(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="interactive", + data={ + "type": "userSelect", + "title": "Choose a plan", + "params": { + "description": "Pick the best plan for your team.", + "userSelectOptions": [ + {"id": "basic", "label": "Basic", "value": "basic", "desc": "Starter tier"}, + {"id": "pro", "label": "Pro", "value": "pro", "description": "Advanced tier"}, + ] + }, + }, + ) + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Start")])] + + assert len(events) == 1 + assert events[0].type == "tool_call" + tool_call = events[0].tool_call + assert tool_call["executor"] == "client" + assert tool_call["wait_for_response"] is True + assert tool_call["timeout_ms"] == 300000 + assert tool_call["function"]["name"] == "fastgpt.interactive" + + arguments = json.loads(tool_call["function"]["arguments"]) + assert arguments["provider"] == "fastgpt" + assert arguments["version"] == "fastgpt_interactive_v1" + assert arguments["interaction"]["type"] == "userSelect" + assert arguments["interaction"]["description"] == "Pick the best plan for your team." 
+ assert arguments["interaction"]["options"][0]["description"] == "Starter tier" + assert arguments["interaction"]["options"][1]["value"] == "pro" + assert arguments["interaction"]["options"][1]["description"] == "Advanced tier" + assert arguments["context"]["chat_id"] == service._state.chat_id + assert service._state.pending_interaction is not None + + +@pytest.mark.asyncio +async def test_fastgpt_provider_unwraps_nested_tool_children_interactive(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="interactive", + data={ + "interactive": { + "type": "toolChildrenInteractive", + "params": { + "childrenResponse": { + "type": "userSelect", + "params": { + "description": "Please choose a workflow branch.", + "userSelectOptions": [ + {"value": "A", "description": "Branch A"}, + {"value": "B", "description": "Branch B"}, + ], + }, + } + }, + } + }, + ) + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Start")])] + + assert len(events) == 1 + arguments = json.loads(events[0].tool_call["function"]["arguments"]) + assert arguments["interaction"]["type"] == "userSelect" + assert arguments["interaction"]["description"] == "Please choose a workflow branch." 
+ assert arguments["interaction"]["options"][0]["description"] == "Branch A" + + +@pytest.mark.asyncio +async def test_fastgpt_provider_uses_opener_for_interactive_prompt_when_prompt_missing(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="interactive", + data={ + "type": "userSelect", + "opener": "请确认您是否满意本次服务。", + "params": { + "userSelectOptions": [ + {"value": "是"}, + {"value": "否"}, + ] + }, + }, + ) + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Start")])] + + assert len(events) == 1 + tool_call = events[0].tool_call + arguments = json.loads(tool_call["function"]["arguments"]) + assert tool_call["display_name"] == "请确认您是否满意本次服务。" + assert arguments["interaction"]["prompt"] == "请确认您是否满意本次服务。" + + +@pytest.mark.asyncio +async def test_fastgpt_provider_resumes_same_chat_after_client_result(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="interactive", + data={ + "type": "userSelect", + "params": {"userSelectOptions": [{"label": "Pro", "value": "pro"}]}, + }, + ) + ] + ), + _FakeResponse( + [ + SimpleNamespace(kind="answer", data={"text": "Resumed answer."}), + SimpleNamespace(kind="done", data={}), + ] + ), + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + initial_events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Start")])] + call_id 
= initial_events[0].tool_call["id"] + + resumed_events = [ + event + async for event in service.resume_after_client_tool_result( + call_id, + { + "tool_call_id": call_id, + "name": "fastgpt.interactive", + "output": { + "action": "submit", + "result": {"type": "userSelect", "selected": "pro"}, + }, + "status": {"code": 200, "message": "ok"}, + }, + ) + ] + + assert [event.type for event in resumed_events] == ["text_delta", "done"] + assert resumed_events[0].text == "Resumed answer." + assert service.client.requests[1]["chatId"] == service.client.requests[0]["chatId"] + assert service.client.requests[1]["messages"] == [{"role": "user", "content": "pro"}] + assert service._state.pending_interaction is None + + +@pytest.mark.asyncio +async def test_fastgpt_provider_cancel_result_clears_pending_interaction(monkeypatch): + monkeypatch.setattr("providers.llm.fastgpt.AsyncChatClient", _FakeAsyncChatClient) + monkeypatch.setattr("providers.llm.fastgpt.aiter_stream_events", _fake_aiter_stream_events) + + _FakeAsyncChatClient.responses = [ + _FakeResponse( + [ + SimpleNamespace( + kind="interactive", + data={ + "type": "userInput", + "params": {"inputForm": [{"name": "name", "label": "Name"}]}, + }, + ) + ] + ) + ] + + service = FastGPTLLMService(api_key="key", base_url="https://fastgpt.example") + await service.connect() + + initial_events = [event async for event in service.generate_stream([LLMMessage(role="user", content="Start")])] + call_id = initial_events[0].tool_call["id"] + + resumed_events = [ + event + async for event in service.resume_after_client_tool_result( + call_id, + { + "tool_call_id": call_id, + "name": "fastgpt.interactive", + "output": {"action": "cancel", "result": {}}, + "status": {"code": 499, "message": "user_cancelled"}, + }, + ) + ] + + assert [event.type for event in resumed_events] == ["done"] + assert service._state.pending_interaction is None diff --git a/engine/tests/test_history_bridge.py b/engine/tests/test_history_bridge.py index 
2f9dd80..d70fa6e 100644 --- a/engine/tests/test_history_bridge.py +++ b/engine/tests/test_history_bridge.py @@ -3,7 +3,7 @@ import time import pytest -from core.history_bridge import SessionHistoryBridge +from runtime.history.bridge import SessionHistoryBridge class _FakeHistoryWriter: diff --git a/engine/tests/test_session_timeout.py b/engine/tests/test_session_timeout.py new file mode 100644 index 0000000..54905d8 --- /dev/null +++ b/engine/tests/test_session_timeout.py @@ -0,0 +1,13 @@ +from app.main import _inactivity_deadline + + +def test_inactivity_deadline_uses_default_timeout_without_pending_tool(): + assert _inactivity_deadline(last_received_at=100.0, inactivity_timeout_sec=60) == 160.0 + + +def test_inactivity_deadline_extends_while_waiting_for_client_tool(): + assert _inactivity_deadline( + last_received_at=100.0, + inactivity_timeout_sec=60, + pending_client_tool_deadline=340.0, + ) == 340.0 diff --git a/engine/tests/test_tool_call_flow.py b/engine/tests/test_tool_call_flow.py index 11a7b77..820cd8d 100644 --- a/engine/tests/test_tool_call_flow.py +++ b/engine/tests/test_tool_call_flow.py @@ -1,13 +1,14 @@ import asyncio import json +import time from typing import Any, Dict, List import pytest -from core.conversation import ConversationState -from core.duplex_pipeline import DuplexPipeline -from models.ws_v1 import OutputAudioPlayedMessage, ToolCallResultsMessage, parse_client_message -from services.base import LLMStreamEvent +from runtime.conversation import ConversationState +from runtime.pipeline.duplex import DuplexPipeline +from protocol.ws_v1.schema import OutputAudioPlayedMessage, ToolCallResultsMessage, parse_client_message +from providers.common.base import LLMStreamEvent class _DummySileroVAD: @@ -52,9 +53,33 @@ class _FakeTTS: class _FakeASR: + mode = "offline" + async def connect(self) -> None: return None + async def disconnect(self) -> None: + return None + + async def send_audio(self, _audio: bytes) -> None: + return None + + async def 
receive_transcripts(self): + if False: + yield None + + def clear_buffer(self) -> None: + return None + + async def start_interim_transcription(self) -> None: + return None + + async def stop_interim_transcription(self) -> None: + return None + + async def get_final_transcription(self) -> str: + return "" + class _FakeLLM: def __init__(self, rounds: List[List[LLMStreamEvent]]): @@ -85,10 +110,26 @@ class _CaptureGenerateLLM: yield LLMStreamEvent(type="done") +class _InitGreetingLLM: + def __init__(self, greeting: str): + self.greeting = greeting + self.init_calls = 0 + + async def generate(self, _messages, temperature=0.7, max_tokens=None): + return "" + + async def generate_stream(self, _messages, temperature=0.7, max_tokens=None): + yield LLMStreamEvent(type="done") + + async def get_initial_greeting(self): + self.init_calls += 1 + return self.greeting + + def _build_pipeline(monkeypatch, llm_rounds: List[List[LLMStreamEvent]]) -> tuple[DuplexPipeline, List[Dict[str, Any]]]: - monkeypatch.setattr("core.duplex_pipeline.SileroVAD", _DummySileroVAD) - monkeypatch.setattr("core.duplex_pipeline.VADProcessor", _DummyVADProcessor) - monkeypatch.setattr("core.duplex_pipeline.EouDetector", _DummyEouDetector) + monkeypatch.setattr("runtime.pipeline.duplex.SileroVAD", _DummySileroVAD) + monkeypatch.setattr("runtime.pipeline.duplex.VADProcessor", _DummyVADProcessor) + monkeypatch.setattr("runtime.pipeline.duplex.EouDetector", _DummyEouDetector) pipeline = DuplexPipeline( transport=_FakeTransport(), @@ -112,7 +153,7 @@ def _build_pipeline(monkeypatch, llm_rounds: List[List[LLMStreamEvent]]) -> tupl def test_pipeline_uses_default_tools_from_settings(monkeypatch): monkeypatch.setattr( - "core.duplex_pipeline.settings.tools", + "runtime.pipeline.duplex.settings.tools", [ "current_time", "calculator", @@ -141,7 +182,7 @@ def test_pipeline_uses_default_tools_from_settings(monkeypatch): def test_pipeline_exposes_unknown_string_tools_with_fallback_schema(monkeypatch): - 
monkeypatch.setattr("core.duplex_pipeline.settings.tools", ["custom_system_cmd"]) + monkeypatch.setattr("runtime.pipeline.duplex.settings.tools", ["custom_system_cmd"]) pipeline, _events = _build_pipeline(monkeypatch, [[LLMStreamEvent(type="done")]]) schemas = pipeline._resolved_tool_schemas() @@ -151,7 +192,7 @@ def test_pipeline_exposes_unknown_string_tools_with_fallback_schema(monkeypatch) def test_pipeline_assigns_default_client_executor_for_system_string_tools(monkeypatch): - monkeypatch.setattr("core.duplex_pipeline.settings.tools", ["increase_volume"]) + monkeypatch.setattr("runtime.pipeline.duplex.settings.tools", ["increase_volume"]) pipeline, _events = _build_pipeline(monkeypatch, [[LLMStreamEvent(type="done")]]) tool_call = { @@ -221,9 +262,9 @@ async def test_pipeline_applies_default_args_to_tool_call(monkeypatch): @pytest.mark.asyncio async def test_generated_opener_prompt_uses_system_prompt_only(monkeypatch): - monkeypatch.setattr("core.duplex_pipeline.SileroVAD", _DummySileroVAD) - monkeypatch.setattr("core.duplex_pipeline.VADProcessor", _DummyVADProcessor) - monkeypatch.setattr("core.duplex_pipeline.EouDetector", _DummyEouDetector) + monkeypatch.setattr("runtime.pipeline.duplex.SileroVAD", _DummySileroVAD) + monkeypatch.setattr("runtime.pipeline.duplex.VADProcessor", _DummyVADProcessor) + monkeypatch.setattr("runtime.pipeline.duplex.EouDetector", _DummyEouDetector) llm = _CaptureGenerateLLM("你好") pipeline = DuplexPipeline( @@ -282,6 +323,21 @@ async def test_generated_opener_uses_tool_capable_turn_when_tools_available(monk assert called.get("user_text") == "" +@pytest.mark.asyncio +async def test_provider_initial_greeting_takes_precedence_over_local_opener(monkeypatch): + llm = _InitGreetingLLM("FastGPT init greeting") + pipeline, events = _build_pipeline_with_custom_llm(monkeypatch, llm) + pipeline.apply_runtime_overrides({"output": {"mode": "text"}}) + pipeline.conversation.greeting = "local fallback greeting" + + await 
pipeline.emit_initial_greeting() + + finals = [event for event in events if event.get("type") == "assistant.response.final"] + assert finals + assert finals[-1]["text"] == "FastGPT init greeting" + assert llm.init_calls == 1 + + @pytest.mark.asyncio async def test_manual_opener_tool_calls_emit_assistant_tool_call(monkeypatch): pipeline, events = _build_pipeline(monkeypatch, [[LLMStreamEvent(type="done")]]) @@ -662,7 +718,7 @@ async def test_server_tool_timeout_emits_504_and_continues(monkeypatch): "status": {"code": 200, "message": "ok"}, } - monkeypatch.setattr("core.duplex_pipeline.execute_server_tool", _slow_execute) + monkeypatch.setattr("runtime.pipeline.duplex.execute_server_tool", _slow_execute) pipeline, events = _build_pipeline( monkeypatch, @@ -712,3 +768,268 @@ async def test_eou_early_return_clears_stale_asr_capture(monkeypatch): assert pipeline._asr_capture_active is False assert pipeline._asr_capture_started_ms == 0.0 assert pipeline._pending_speech_audio == b"" + +class _FakeResumableLLM: + def __init__(self, *, timeout_ms: int = 300000): + self.timeout_ms = timeout_ms + self.generate_stream_calls = 0 + self.resumed_results: List[Dict[str, Any]] = [] + + async def generate(self, _messages, temperature=0.7, max_tokens=None): + return "" + + async def generate_stream(self, _messages, temperature=0.7, max_tokens=None): + self.generate_stream_calls += 1 + yield LLMStreamEvent( + type="tool_call", + tool_call={ + "id": "call_fastgpt_1", + "executor": "client", + "wait_for_response": True, + "timeout_ms": self.timeout_ms, + "display_name": "Choose a plan", + "type": "function", + "function": { + "name": "fastgpt.interactive", + "arguments": json.dumps( + { + "provider": "fastgpt", + "version": "fastgpt_interactive_v1", + "interaction": { + "type": "userSelect", + "title": "Choose a plan", + "options": [ + {"id": "basic", "label": "Basic", "value": "basic"}, + {"id": "pro", "label": "Pro", "value": "pro"}, + ], + "form": [], + }, + "context": {"chat_id": 
"fastgpt_chat_1"}, + }, + ensure_ascii=False, + ), + }, + }, + ) + yield LLMStreamEvent(type="done") + + def handles_client_tool(self, tool_name: str) -> bool: + return tool_name == "fastgpt.interactive" + + async def resume_after_client_tool_result(self, tool_call_id: str, result: Dict[str, Any]): + self.resumed_results.append({"tool_call_id": tool_call_id, "result": dict(result)}) + yield LLMStreamEvent(type="text_delta", text="provider resumed answer.") + yield LLMStreamEvent(type="done") + + +class _FakeChainedResumableLLM: + def __init__(self, call_ids: List[str], *, timeout_ms: int = 300000): + self.call_ids = call_ids + self.timeout_ms = timeout_ms + self.generate_stream_calls = 0 + self.resumed_results: List[Dict[str, Any]] = [] + + def _tool_call_event(self, call_id: str) -> LLMStreamEvent: + return LLMStreamEvent( + type="tool_call", + tool_call={ + "id": call_id, + "executor": "client", + "wait_for_response": True, + "timeout_ms": self.timeout_ms, + "display_name": f"Collect {call_id}", + "type": "function", + "function": { + "name": "fastgpt.interactive", + "arguments": json.dumps( + { + "provider": "fastgpt", + "version": "fastgpt_interactive_v1", + "interaction": { + "type": "userInput", + "title": "", + "description": f"Prompt for {call_id}", + "prompt": f"Prompt for {call_id}", + "form": [{"name": "result", "label": "result", "input_type": "input"}], + "options": [], + }, + "context": {"chat_id": "fastgpt_chat_chain"}, + }, + ensure_ascii=False, + ), + }, + }, + ) + + async def generate(self, _messages, temperature=0.7, max_tokens=None): + return "" + + async def generate_stream(self, _messages, temperature=0.7, max_tokens=None): + self.generate_stream_calls += 1 + yield self._tool_call_event(self.call_ids[0]) + yield LLMStreamEvent(type="done") + + def handles_client_tool(self, tool_name: str) -> bool: + return tool_name == "fastgpt.interactive" + + async def resume_after_client_tool_result(self, tool_call_id: str, result: Dict[str, Any]): + 
self.resumed_results.append({"tool_call_id": tool_call_id, "result": dict(result)}) + next_index = len(self.resumed_results) + if next_index < len(self.call_ids): + yield self._tool_call_event(self.call_ids[next_index]) + else: + yield LLMStreamEvent(type="text_delta", text="completed after third interactive input.") + yield LLMStreamEvent(type="done") + + +def _build_pipeline_with_custom_llm(monkeypatch, llm_service) -> tuple[DuplexPipeline, List[Dict[str, Any]]]: + monkeypatch.setattr("runtime.pipeline.duplex.SileroVAD", _DummySileroVAD) + monkeypatch.setattr("runtime.pipeline.duplex.VADProcessor", _DummyVADProcessor) + monkeypatch.setattr("runtime.pipeline.duplex.EouDetector", _DummyEouDetector) + + pipeline = DuplexPipeline( + transport=_FakeTransport(), + session_id="s_fastgpt", + llm_service=llm_service, + tts_service=_FakeTTS(), + asr_service=_FakeASR(), + ) + events: List[Dict[str, Any]] = [] + + async def _capture_event(event: Dict[str, Any], priority: int = 20): + events.append(event) + + async def _noop_speak(_text: str, *args, **kwargs): + return None + + monkeypatch.setattr(pipeline, "_send_event", _capture_event) + monkeypatch.setattr(pipeline, "_speak_sentence", _noop_speak) + return pipeline, events + + +@pytest.mark.asyncio +async def test_fastgpt_provider_managed_tool_resumes_provider_stream(monkeypatch): + llm = _FakeResumableLLM(timeout_ms=300000) + pipeline, events = _build_pipeline_with_custom_llm(monkeypatch, llm) + pipeline.apply_runtime_overrides({"output": {"mode": "text"}}) + + task = asyncio.create_task(pipeline._handle_turn("start fastgpt")) + for _ in range(200): + if any(event.get("type") == "assistant.tool_call" for event in events): + break + await asyncio.sleep(0.005) + + tool_event = next(event for event in events if event.get("type") == "assistant.tool_call") + assert tool_event.get("executor") == "client" + assert tool_event.get("tool_name") == "fastgpt.interactive" + assert tool_event.get("timeout_ms") == 300000 + assert 
tool_event.get("arguments", {}).get("context", {}).get("turn_id") + assert tool_event.get("arguments", {}).get("context", {}).get("response_id") + + await pipeline.handle_tool_call_results( + [ + { + "tool_call_id": "call_fastgpt_1", + "name": "fastgpt.interactive", + "output": { + "action": "submit", + "result": {"type": "userSelect", "selected": "pro"}, + }, + "status": {"code": 200, "message": "ok"}, + } + ] + ) + await task + + finals = [event for event in events if event.get("type") == "assistant.response.final"] + assert finals + assert "provider resumed answer" in finals[-1].get("text", "") + assert llm.generate_stream_calls == 1 + assert len(llm.resumed_results) == 1 + assert llm.resumed_results[0]["tool_call_id"] == "call_fastgpt_1" + + +@pytest.mark.asyncio +async def test_fastgpt_provider_managed_tool_timeout_stops_without_generic_tool_prompt(monkeypatch): + llm = _FakeResumableLLM(timeout_ms=10) + pipeline, events = _build_pipeline_with_custom_llm(monkeypatch, llm) + pipeline.apply_runtime_overrides({"output": {"mode": "text"}}) + + await pipeline._handle_turn("start fastgpt") + + tool_results = [event for event in events if event.get("type") == "assistant.tool_result"] + assert tool_results + assert tool_results[-1].get("result", {}).get("status", {}).get("code") == 504 + finals = [event for event in events if event.get("type") == "assistant.response.final"] + assert not finals + assert llm.generate_stream_calls == 1 + assert llm.resumed_results == [] + + +@pytest.mark.asyncio +async def test_fastgpt_provider_managed_tool_chain_can_continue_after_third_result(monkeypatch): + llm = _FakeChainedResumableLLM(["call_fastgpt_1", "call_fastgpt_2", "call_fastgpt_3"]) + pipeline, events = _build_pipeline_with_custom_llm(monkeypatch, llm) + pipeline.apply_runtime_overrides({"output": {"mode": "text"}}) + + task = asyncio.create_task(pipeline._handle_turn("start chained fastgpt")) + + expected_call_ids = ["call_fastgpt_1", "call_fastgpt_2", "call_fastgpt_3"] + 
for idx, call_id in enumerate(expected_call_ids, start=1): + for _ in range(200): + seen_call_ids = [event.get("tool_call_id") for event in events if event.get("type") == "assistant.tool_call"] + if call_id in seen_call_ids: + break + await asyncio.sleep(0.005) + + await pipeline.handle_tool_call_results( + [ + { + "tool_call_id": call_id, + "name": "fastgpt.interactive", + "output": { + "action": "submit", + "result": {"type": "userInput", "fields": {"result": f"value-{idx}"}}, + }, + "status": {"code": 200, "message": "ok"}, + } + ] + ) + + await task + + finals = [event for event in events if event.get("type") == "assistant.response.final"] + assert finals + assert "completed after third interactive input" in finals[-1].get("text", "") + assert llm.generate_stream_calls == 1 + assert len(llm.resumed_results) == 3 + + +@pytest.mark.asyncio +async def test_pending_client_tool_deadline_tracks_waiting_result(monkeypatch): + pipeline, _events = _build_pipeline(monkeypatch, [[LLMStreamEvent(type="done")]]) + + waiter = asyncio.create_task(pipeline._wait_for_single_tool_result("call_deadline", timeout_seconds=30)) + for _ in range(50): + deadline = pipeline.pending_client_tool_deadline() + if deadline is not None: + break + await asyncio.sleep(0.001) + + deadline = pipeline.pending_client_tool_deadline() + assert deadline is not None + assert deadline > time.monotonic() + 25 + + await pipeline.handle_tool_call_results( + [ + { + "tool_call_id": "call_deadline", + "name": "fastgpt.interactive", + "output": {"action": "submit", "result": {"type": "userInput", "fields": {"name": "Alice"}}}, + "status": {"code": 200, "message": "ok"}, + } + ] + ) + await waiter + + assert pipeline.pending_client_tool_deadline() is None diff --git a/engine/tests/test_tool_executor.py b/engine/tests/test_tool_executor.py index 17345c7..aada0c1 100644 --- a/engine/tests/test_tool_executor.py +++ b/engine/tests/test_tool_executor.py @@ -1,6 +1,6 @@ import pytest -from core.tool_executor import 
execute_server_tool +from tools.executor import execute_server_tool @pytest.mark.asyncio @@ -38,7 +38,7 @@ async def test_current_time_uses_local_system_clock(monkeypatch): async def _should_not_be_called(_tool_id): raise AssertionError("fetch_tool_resource should not be called for current_time") - monkeypatch.setattr("core.tool_executor.fetch_tool_resource", _should_not_be_called) + monkeypatch.setattr("tools.executor.fetch_tool_resource", _should_not_be_called) result = await execute_server_tool( { diff --git a/engine/tests/test_tts_factory_modes.py b/engine/tests/test_tts_factory_modes.py new file mode 100644 index 0000000..987fc10 --- /dev/null +++ b/engine/tests/test_tts_factory_modes.py @@ -0,0 +1,45 @@ +from providers.factory.default import DefaultRealtimeServiceFactory +from providers.tts.mock import MockTTSService +from providers.tts.openai_compatible import OpenAICompatibleTTSService +from providers.tts.volcengine import VolcengineTTSService +from runtime.ports import TTSServiceSpec + + +def test_create_tts_service_volcengine_returns_native_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_tts_service( + TTSServiceSpec( + provider="volcengine", + api_key="test-key", + app_id="app-1", + resource_id="seed-tts-2.0", + voice="zh_female_shuangkuaisisi_moon_bigtts", + sample_rate=16000, + ) + ) + assert isinstance(service, VolcengineTTSService) + + +def test_create_tts_service_openai_compatible_returns_provider(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_tts_service( + TTSServiceSpec( + provider="openai_compatible", + api_key="test-key", + voice="anna", + sample_rate=16000, + ) + ) + assert isinstance(service, OpenAICompatibleTTSService) + + +def test_create_tts_service_fallbacks_to_mock_without_key(): + factory = DefaultRealtimeServiceFactory() + service = factory.create_tts_service( + TTSServiceSpec( + provider="volcengine", + voice="anna", + sample_rate=16000, + ) + ) + assert isinstance(service, 
MockTTSService) diff --git a/engine/tests/test_volcengine_asr_provider.py b/engine/tests/test_volcengine_asr_provider.py new file mode 100644 index 0000000..c5756c0 --- /dev/null +++ b/engine/tests/test_volcengine_asr_provider.py @@ -0,0 +1,86 @@ +import gzip +import json + +from providers.asr.volcengine import VolcengineRealtimeASRService + + +def test_volcengine_seed_protocol_defaults_and_headers(): + service = VolcengineRealtimeASRService( + api_key="access-token", + api_url="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + app_id="app-1", + uid="caller-1", + ) + + assert service.protocol == "seed" + assert service.resource_id == "volc.bigasr.sauc.duration" + + headers = service._build_seed_headers("req-1") + assert headers == { + "X-Api-App-Key": "app-1", + "X-Api-Access-Key": "access-token", + "X-Api-Resource-Id": "volc.bigasr.sauc.duration", + "X-Api-Request-Id": "req-1", + } + + +def test_volcengine_seed_start_payload_merges_request_params(): + service = VolcengineRealtimeASRService( + api_key="access-token", + api_url="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + app_id="app-1", + uid="caller-1", + language="zh-CN", + request_params={ + "request": { + "end_window_size": 800, + "force_to_speech_time": 1000, + "context": "{\"hotwords\":[{\"word\":\"doubao\"}]}", + }, + "audio": {"codec": "raw"}, + }, + ) + + payload = service._build_seed_start_payload() + assert payload["user"] == {"uid": "caller-1"} + assert payload["audio"] == { + "format": "pcm", + "rate": 16000, + "bits": 16, + "channels": 1, + "codec": "raw", + "language": "zh-CN", + } + assert payload["request"]["model_name"] == "bigmodel" + assert payload["request"]["end_window_size"] == 800 + assert payload["request"]["force_to_speech_time"] == 1000 + assert payload["request"]["context"] == "{\"hotwords\":[{\"word\":\"doubao\"}]}" + + +def test_volcengine_seed_start_request_encodes_gzip_json_payload(): + service = VolcengineRealtimeASRService( + api_key="access-token", + 
api_url="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel", + app_id="app-1", + uid="caller-1", + ) + + frame = service._build_seed_start_request() + assert frame[0] == 0x11 + assert frame[1] == 0x11 + + payload_length = int.from_bytes(frame[8:12], "big") + payload = json.loads(gzip.decompress(frame[12 : 12 + payload_length]).decode("utf-8")) + assert payload["user"]["uid"] == "caller-1" + assert payload["request"]["model_name"] == "bigmodel" + + +def test_volcengine_gateway_protocol_keeps_model_query(): + service = VolcengineRealtimeASRService( + api_key="access-token", + api_url="wss://ai-gateway.vei.volces.com/v1/realtime", + model="bigmodel", + ) + + assert service.protocol == "gateway" + assert service.api_url == "wss://ai-gateway.vei.volces.com/v1/realtime?model=bigmodel" diff --git a/engine/tests/test_ws_protocol_session_start.py b/engine/tests/test_ws_protocol_session_start.py index 90ac179..e055c75 100644 --- a/engine/tests/test_ws_protocol_session_start.py +++ b/engine/tests/test_ws_protocol_session_start.py @@ -1,7 +1,7 @@ import pytest -from core.session import Session, WsSessionState -from models.ws_v1 import OutputAudioPlayedMessage, SessionStartMessage, parse_client_message +from runtime.session.manager import Session, WsSessionState +from protocol.ws_v1.schema import OutputAudioPlayedMessage, SessionStartMessage, parse_client_message def _session() -> Session: @@ -139,7 +139,7 @@ async def test_load_server_runtime_metadata_returns_not_found_error(): _ = assistant_id return {"__error_code": "assistant.not_found"} - session._backend_gateway = _Gateway() + session._runtime_config_provider = _Gateway() runtime, error = await session._load_server_runtime_metadata("assistant_demo") assert runtime == {} assert error is not None @@ -155,7 +155,7 @@ async def test_load_server_runtime_metadata_returns_config_unavailable_error(): _ = assistant_id return None - session._backend_gateway = _Gateway() + session._runtime_config_provider = _Gateway() runtime, 
error = await session._load_server_runtime_metadata("assistant_demo") assert runtime == {} assert error is not None @@ -194,7 +194,7 @@ async def test_handle_session_start_requires_assistant_id_and_closes_transport() @pytest.mark.asyncio async def test_handle_session_start_applies_whitelisted_overrides_and_ignores_workflow(monkeypatch): - monkeypatch.setattr("core.session.settings.ws_emit_config_resolved", False) + monkeypatch.setattr("runtime.session.manager.settings.ws_emit_config_resolved", False) session = Session.__new__(Session) session.id = "sess_start_ok" @@ -289,7 +289,9 @@ async def test_handle_session_start_applies_whitelisted_overrides_and_ignores_wo @pytest.mark.asyncio async def test_handle_session_start_emits_config_resolved_when_enabled(monkeypatch): - monkeypatch.setattr("core.session.settings.ws_emit_config_resolved", True) + monkeypatch.setattr("runtime.session.manager.settings.ws_emit_config_resolved", True) + monkeypatch.setattr("runtime.session.manager.settings.ws_protocol_version", "v1-custom") + monkeypatch.setattr("runtime.session.manager.settings.default_codec", "pcmu") session = Session.__new__(Session) session.id = "sess_start_emit_config" @@ -368,10 +370,46 @@ async def test_handle_session_start_emits_config_resolved_when_enabled(monkeypat ) config_event = next(item for item in events if item.get("type") == "config.resolved") + session_started_event = next(item for item in events if item.get("type") == "session.started") + assert session_started_event["protocolVersion"] == "v1-custom" assert "appId" not in config_event["config"] assert "configVersionId" not in config_event["config"] assert "services" not in config_event["config"] + assert config_event["config"]["protocolVersion"] == "v1-custom" assert config_event["config"]["channel"] == "web_debug" assert config_event["config"]["output"]["mode"] == "text" + assert config_event["config"]["output"]["codec"] == "pcmu" assert config_event["config"]["tools"]["enabled"] is True assert 
config_event["config"]["tools"]["count"] == 1 + + +@pytest.mark.asyncio +async def test_handle_audio_uses_chunk_size_for_frame_validation(monkeypatch): + monkeypatch.setattr("runtime.session.manager.settings.sample_rate", 16000) + monkeypatch.setattr("runtime.session.manager.settings.chunk_size_ms", 10) + + session = Session.__new__(Session) + session.id = "sess_chunk_frame" + session.ws_state = WsSessionState.ACTIVE + + class _Pipeline: + def __init__(self): + self.frames = [] + + async def process_audio(self, frame: bytes): + self.frames.append(frame) + + session.pipeline = _Pipeline() + errors = [] + + async def _send_error(sender, message, code, **kwargs): + _ = (sender, kwargs) + errors.append((code, message)) + + session._send_error = _send_error + payload = b"\x00\x01" * 320 # 640 bytes = 2 frames when chunk_size_ms=10 + await session.handle_audio(payload) + + assert errors == [] + assert len(session.pipeline.frames) == 2 + assert all(len(frame) == 320 for frame in session.pipeline.frames) diff --git a/engine/tools/__init__.py b/engine/tools/__init__.py new file mode 100644 index 0000000..29a67f0 --- /dev/null +++ b/engine/tools/__init__.py @@ -0,0 +1 @@ +"""Tools package.""" diff --git a/engine/core/tool_executor.py b/engine/tools/executor.py similarity index 99% rename from engine/core/tool_executor.py rename to engine/tools/executor.py index 899d930..0049cbc 100644 --- a/engine/core/tool_executor.py +++ b/engine/tools/executor.py @@ -8,7 +8,7 @@ from typing import Any, Awaitable, Callable, Dict, Optional import aiohttp -from app.backend_adapters import build_backend_adapter_from_settings +from adapters.control_plane.backend import build_backend_adapter_from_settings ToolResourceFetcher = Callable[[str], Awaitable[Optional[Dict[str, Any]]]] diff --git a/engine/workflow/__init__.py b/engine/workflow/__init__.py new file mode 100644 index 0000000..35ffe2e --- /dev/null +++ b/engine/workflow/__init__.py @@ -0,0 +1 @@ +"""Workflow package.""" diff --git 
a/engine/core/workflow_runner.py b/engine/workflow/runner.py similarity index 100% rename from engine/core/workflow_runner.py rename to engine/workflow/runner.py diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..d7d3b64 --- /dev/null +++ b/examples/README.md @@ -0,0 +1 @@ +# Example Application using RAS \ No newline at end of file diff --git a/web/components.json b/web/components.json new file mode 100644 index 0000000..46aa2f0 --- /dev/null +++ b/web/components.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "base-nova", + "rsc": false, + "tsx": true, + "tailwind": { + "config": "", + "css": "index.css", + "baseColor": "neutral", + "cssVariables": true, + "prefix": "" + }, + "iconLibrary": "lucide", + "rtl": false, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/hooks" + }, + "menuColor": "default", + "menuAccent": "subtle", + "registries": {} +} diff --git a/web/components/UI.tsx b/web/components/UI.tsx index 98c72c9..15b77bc 100644 --- a/web/components/UI.tsx +++ b/web/components/UI.tsx @@ -1,63 +1,37 @@ - import React from 'react'; import { X } from 'lucide-react'; -// Button +// Shadcn UI Imports +import { Button as ShadcnButton } from './ui/button'; +import { Input as ShadcnInput } from './ui/input'; +import { Switch as ShadcnSwitch } from './ui/switch'; +import { Card as ShadcnCard } from './ui/card'; +import { Badge as ShadcnBadge } from './ui/badge'; +import { TableHeader as ShadcnTableHeader, TableRow as ShadcnTableRow, TableHead as ShadcnTableHead, TableCell as ShadcnTableCell } from './ui/table'; +import { Sheet, SheetContent, SheetHeader, SheetTitle } from './ui/sheet'; +import { Dialog as ShadcnDialog, DialogContent, DialogHeader, DialogTitle, DialogFooter } from './ui/dialog'; + +// Button Wrapper to match old API interface ButtonProps extends React.ButtonHTMLAttributes { variant?: 'primary' 
| 'secondary' | 'outline' | 'ghost' | 'destructive'; size?: 'sm' | 'md' | 'lg' | 'icon'; } +export const Button: React.FC = ({ variant = 'primary', size = 'md', className, ...props }) => { + const vMap: any = { primary: 'default', secondary: 'secondary', outline: 'outline', ghost: 'ghost', destructive: 'destructive' }; + const sMap: any = { sm: 'sm', md: 'default', lg: 'lg', icon: 'icon' }; + return ; +} -export const Button: React.FC = ({ - className = '', - variant = 'primary', - size = 'md', - children, - ...props -}) => { - const baseStyles = "inline-flex items-center justify-center rounded-md text-sm font-medium transition-all focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50 active:scale-95"; - - const variants = { - // Primary: Glow effect - primary: "bg-primary text-primary-foreground shadow-[0_0_10px_rgba(6,182,212,0.5)] hover:bg-primary/90 hover:shadow-[0_0_15px_rgba(6,182,212,0.6)]", - secondary: "bg-secondary text-secondary-foreground shadow-sm hover:bg-secondary/80", - outline: "border border-input bg-transparent shadow-sm hover:bg-accent hover:text-accent-foreground hover:border-primary/50", - ghost: "hover:bg-accent hover:text-accent-foreground", - destructive: "bg-destructive text-destructive-foreground shadow-sm hover:bg-destructive/90", - }; - - const sizes = { - sm: "h-8 px-3 text-xs", - md: "h-9 px-4 py-2", - lg: "h-10 px-8", - icon: "h-9 w-9", - }; - - return ( - - ); -}; - -// Input - Removed border, added subtle background -interface InputProps extends React.InputHTMLAttributes {} - -export const Input: React.FC = ({ className = '', ...props }) => { - return ( - - ); -}; - -interface SelectProps extends React.SelectHTMLAttributes {} +// Input and Switch match seamlessly +export const Input = ShadcnInput; +export const Switch = ShadcnSwitch; +// Native Select Wrapper to avoid breaking consumers expecting +interface SelectProps extends React.SelectHTMLAttributes { } export 
const Select: React.FC = ({ className = '', children, ...props }) => { return (