diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py
index e6cdcea..09f338f 100644
--- a/api/app/routers/assistants.py
+++ b/api/app/routers/assistants.py
@@ -34,6 +34,12 @@ def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
     }
 
 
+def _is_dashscope_vendor(vendor: Optional[str]) -> bool:
+    return (vendor or "").strip().lower() in {
+        "dashscope",
+    }
+
+
 def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
     raw = (voice_value or "").strip()
     model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
@@ -121,7 +127,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
     elif assistant.voice:
         voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
         if voice:
-            tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
+            if _is_dashscope_vendor(voice.vendor):
+                tts_provider = "dashscope"
+            elif _is_openai_compatible_vendor(voice.vendor):
+                tts_provider = "openai_compatible"
+            else:
+                tts_provider = "edge"
             model = voice.model
             runtime_voice = voice.voice_key or voice.id
             if tts_provider == "openai_compatible":
@@ -131,8 +142,8 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
         "enabled": True,
         "provider": tts_provider,
         "model": model,
-        "apiKey": voice.api_key if tts_provider == "openai_compatible" else None,
-        "baseUrl": voice.base_url if tts_provider == "openai_compatible" else None,
+        "apiKey": voice.api_key if tts_provider in {"openai_compatible", "dashscope"} else None,
+        "baseUrl": voice.base_url if tts_provider in {"openai_compatible", "dashscope"} else None,
         "voice": runtime_voice,
         "speed": assistant.speed or voice.speed,
     }
diff --git a/api/app/routers/voices.py b/api/app/routers/voices.py
index 1e4a722..2d5ebe9 100644
--- a/api/app/routers/voices.py
+++ b/api/app/routers/voices.py
@@ -1,6 +1,10 @@
 import base64
+import io
+import json
 import os
+import threading
+import wave
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import httpx
 from fastapi import APIRouter, Depends, HTTPException
@@ -14,6 +18,203 @@ from ..schemas import VoiceCreate, VoiceOut, VoicePreviewRequest, VoicePreviewRe
 
 router = APIRouter(prefix="/voices", tags=["Voices"])
 
 OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
+DASHSCOPE_DEFAULT_MODEL = "qwen3-tts-flash-realtime"
+DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry"
+DASHSCOPE_DEFAULT_BASE_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
+
+try:
+    import dashscope
+    from dashscope.audio.qwen_tts_realtime import AudioFormat, QwenTtsRealtime, QwenTtsRealtimeCallback
+
+    DASHSCOPE_SDK_AVAILABLE = True
+except ImportError:
+    dashscope = None  # type: ignore[assignment]
+    AudioFormat = None  # type: ignore[assignment]
+    QwenTtsRealtime = None  # type: ignore[assignment]
+    DASHSCOPE_SDK_AVAILABLE = False
+
+    class QwenTtsRealtimeCallback:  # type: ignore[no-redef]
+        """Fallback callback base when DashScope SDK is unavailable."""
+
+        pass
+
+
+class _DashScopePreviewCallback(QwenTtsRealtimeCallback):
+    """Collect DashScope realtime callback events and PCM chunks."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._open_event = threading.Event()
+        self._done_event = threading.Event()
+        self._lock = threading.Lock()
+        self._audio_chunks: list[bytes] = []
+        self._error_message: Optional[str] = None
+
+    def on_open(self) -> None:
+        self._open_event.set()
+
+    def on_close(self, code: int, reason: str) -> None:
+        if not self._done_event.is_set():
self._error_message = f"DashScope websocket closed unexpectedly: {code} {reason}" + self._done_event.set() + + def on_error(self, message: str) -> None: + self._error_message = str(message) + self._done_event.set() + + def on_event(self, response: Any) -> None: + payload = _coerce_dashscope_event(response) + event_type = str(payload.get("type") or "").strip() + if event_type == "response.audio.delta": + delta = payload.get("delta") + if isinstance(delta, str): + try: + self._append_audio(base64.b64decode(delta)) + except Exception: + return + elif event_type in {"response.done", "session.finished"}: + self._done_event.set() + elif event_type == "error": + self._error_message = _format_dashscope_error_event(payload) + self._done_event.set() + + def on_data(self, data: bytes) -> None: + # Some SDK versions emit raw PCM frames via on_data. + if isinstance(data, (bytes, bytearray)): + self._append_audio(bytes(data)) + + def wait_for_open(self, timeout: float = 10.0) -> None: + if not self._open_event.wait(timeout): + raise TimeoutError("DashScope websocket open timeout") + + def wait_for_done(self, timeout: float = 45.0) -> None: + if not self._done_event.wait(timeout): + raise TimeoutError("DashScope synthesis timeout") + + def raise_if_error(self) -> None: + if self._error_message: + raise RuntimeError(self._error_message) + + def read_audio(self) -> bytes: + with self._lock: + return b"".join(self._audio_chunks) + + def _append_audio(self, chunk: bytes) -> None: + if not chunk: + return + with self._lock: + self._audio_chunks.append(chunk) + + +def _coerce_dashscope_event(response: Any) -> Dict[str, Any]: + if isinstance(response, dict): + return response + if isinstance(response, str): + try: + parsed = json.loads(response) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + pass + return {"type": "raw", "message": str(response)} + + +def _format_dashscope_error_event(payload: Dict[str, Any]) -> str: + error = payload.get("error") + if isinstance(error, dict): + code = str(error.get("code") or "").strip() + message = str(error.get("message") or "").strip() + if code and message: + return f"{code}: {message}" + return message or str(error) + return str(error or "DashScope realtime TTS error") + + +def _create_dashscope_realtime_client(*, model: str, callback: _DashScopePreviewCallback, url: str, api_key: str) -> Any: + if QwenTtsRealtime is None: + raise RuntimeError("DashScope SDK unavailable") + + init_kwargs = { + "model": model, + "callback": callback, + "url": url, + } + try: + return QwenTtsRealtime(api_key=api_key, **init_kwargs) # type: ignore[misc] + except TypeError as exc: + if "api_key" not in str(exc): + raise + return QwenTtsRealtime(**init_kwargs) # type: ignore[misc] + + +def _pcm16_to_wav_bytes(pcm_bytes: bytes, sample_rate: int = 24000) -> bytes: + with io.BytesIO() as buffer: + with wave.open(buffer, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(pcm_bytes) + return buffer.getvalue() + + +def _synthesize_dashscope_preview( + *, + text: str, + api_key: str, + base_url: str, + model: str, + voice_key: str, + speed: Optional[float], +) -> bytes: + if not DASHSCOPE_SDK_AVAILABLE: + raise RuntimeError("dashscope package not installed; install with `pip install dashscope>=1.25.11`") + if not AudioFormat: + raise RuntimeError("DashScope SDK AudioFormat unavailable") + + callback = _DashScopePreviewCallback() + if dashscope is not None: + dashscope.api_key = api_key + 
+    client = _create_dashscope_realtime_client(
+        model=model,
+        callback=callback,
+        url=base_url,
+        api_key=api_key,
+    )
+
+    try:
+        client.connect()
+        callback.wait_for_open()
+        session_kwargs: Dict[str, Any] = {
+            "voice": voice_key,
+            "response_format": AudioFormat.PCM_24000HZ_MONO_16BIT,
+            "mode": "commit",
+        }
+        # speech_rate is supported by qwen3-* realtime models.
+        normalized_model = str(model or "").strip().lower()
+        if speed is not None and normalized_model.startswith("qwen3-"):
+            session_kwargs["speech_rate"] = max(0.5, min(2.0, float(speed)))
+        client.update_session(**session_kwargs)
+        client.append_text(text)
+        client.commit()
+        callback.wait_for_done()
+        callback.raise_if_error()
+        pcm_audio = callback.read_audio()
+        if not pcm_audio:
+            raise RuntimeError("No audio chunk returned from DashScope realtime synthesis")
+        return _pcm16_to_wav_bytes(pcm_audio, sample_rate=24000)
+    finally:
+        finish_fn = getattr(client, "finish", None)
+        if callable(finish_fn):
+            try:
+                finish_fn()
+            except Exception:
+                pass
+        close_fn = getattr(client, "close", None)
+        if callable(close_fn):
+            try:
+                close_fn()
+            except Exception:
+                pass
 
 
 def _is_openai_compatible_vendor(vendor: str) -> bool:
@@ -26,9 +227,18 @@ def _is_openai_compatible_vendor(vendor: str) -> bool:
     }
 
 
+def _is_dashscope_vendor(vendor: str) -> bool:
+    normalized = (vendor or "").strip().lower()
+    return normalized in {
+        "dashscope",
+    }
+
+
 def _default_base_url(vendor: str) -> Optional[str]:
     if _is_openai_compatible_vendor(vendor):
         return "https://api.siliconflow.cn/v1"
+    if _is_dashscope_vendor(vendor):
+        return DASHSCOPE_DEFAULT_BASE_URL
     return None
 
 
@@ -76,6 +286,9 @@ def create_voice(data: VoiceCreate, db: Session = Depends(get_db)):
         if not voice_key:
             raw_id = (data.id or data.name).strip()
             voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}"
+    elif _is_dashscope_vendor(vendor):
+        model = (model or "").strip() or DASHSCOPE_DEFAULT_MODEL
+        voice_key = (voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY
 
     voice = Voice(
         id=unique_short_id("tts", db, Voice),
@@ -126,6 +339,11 @@ def update_voice(id: str, data: VoiceUpdate, db: Session = Depends(get_db)):
         voice_key = update_data.get("voice_key") or voice.voice_key
         update_data["model"] = model
         update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model)
+    elif _is_dashscope_vendor(vendor_for_defaults):
+        model = update_data.get("model") or voice.model or DASHSCOPE_DEFAULT_MODEL
+        voice_key = update_data.get("voice_key") or voice.voice_key or DASHSCOPE_DEFAULT_VOICE_KEY
+        update_data["model"] = model
+        update_data["voice_key"] = voice_key
 
     for field, value in update_data.items():
         setattr(voice, field, value)
@@ -148,7 +366,7 @@ def delete_voice(id: str, db: Session = Depends(get_db)):
 
 @router.post("/{id}/preview", response_model=VoicePreviewResponse)
 def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_db)):
-    """试听指定声音,基于 OpenAI-compatible /audio/speech 接口。"""
+    """试听指定声音,支持 OpenAI-compatible 与 DashScope Realtime。"""
     voice = db.query(Voice).filter(Voice.id == id).first()
     if not voice:
         raise HTTPException(status_code=404, detail="Voice not found")
@@ -157,6 +375,31 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_
     if not text:
         raise HTTPException(status_code=400, detail="Preview text cannot be empty")
 
+    if _is_dashscope_vendor(voice.vendor):
+        api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
+        if not api_key:
os.getenv("TTS_API_KEY", "").strip() + if not api_key: + raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}") + + base_url = (voice.base_url or "").strip() or DASHSCOPE_DEFAULT_BASE_URL + model = (voice.model or "").strip() or DASHSCOPE_DEFAULT_MODEL + voice_key = (voice.voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY + effective_speed = data.speed if data.speed is not None else voice.speed + try: + wav_bytes = _synthesize_dashscope_preview( + text=text, + api_key=api_key, + base_url=base_url, + model=model, + voice_key=voice_key, + speed=effective_speed, + ) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"DashScope preview failed: {exc}") from exc + audio_base64 = base64.b64encode(wav_bytes).decode("utf-8") + return VoicePreviewResponse(success=True, audio_url=f"data:audio/wav;base64,{audio_base64}") + api_key = (data.api_key or "").strip() or (voice.api_key or "").strip() if not api_key and _is_openai_compatible_vendor(voice.vendor): api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() diff --git a/api/init_db.py b/api/init_db.py index 0025c4b..e3373f6 100644 --- a/api/init_db.py +++ b/api/init_db.py @@ -13,12 +13,16 @@ from app.id_generator import short_id from app.models import Voice, Assistant, KnowledgeBase, Workflow, LLMModel, ASRModel, KnowledgeDocument VOICE_MODEL = "FunAudioLLM/CosyVoice2-0.5B" +DASHSCOPE_VOICE_MODEL = "qwen3-tts-flash-realtime" +DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry" +DASHSCOPE_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" SEED_VOICE_IDS = { "alex": short_id("tts"), "david": short_id("tts"), "bella": short_id("tts"), "claire": short_id("tts"), + "dashscope_cherry": short_id("tts"), } SEED_LLM_IDS = { @@ -177,8 +181,20 @@ def init_default_data(): voice_key=f"{VOICE_MODEL}:claire", is_system=True, ), + Voice( + id=SEED_VOICE_IDS["dashscope_cherry"], + name="DashScope Cherry", + vendor="DashScope", + gender="Female", + language="zh", + description="DashScope realtime sample voice.", + model=DASHSCOPE_VOICE_MODEL, + voice_key=DASHSCOPE_DEFAULT_VOICE_KEY, + base_url=DASHSCOPE_REALTIME_URL, + is_system=True, + ), ] - seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible CosyVoice 2.0)") + seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible + DashScope)") def init_default_tools(recreate: bool = False): diff --git a/api/requirements.txt b/api/requirements.txt index 95288af..173875c 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -9,3 +9,4 @@ minio==7.2.0 httpx==0.26.0 chromadb==0.4.22 openai==1.12.0 +dashscope==1.25.11 diff --git a/api/tests/test_assistants.py b/api/tests/test_assistants.py index 10cf93c..6b2d173 100644 --- a/api/tests/test_assistants.py +++ b/api/tests/test_assistants.py @@ -186,9 +186,11 @@ class TestAssistantAPI: sample_asr_model_data["vendor"] = "OpenAI Compatible" llm_resp = client.post("/api/llm", json=sample_llm_model_data) assert llm_resp.status_code == 200 + llm_id = llm_resp.json()["id"] asr_resp = client.post("/api/asr", json=sample_asr_model_data) assert asr_resp.status_code == 200 + asr_id = asr_resp.json()["id"] sample_voice_data["vendor"] = "OpenAI Compatible" sample_voice_data["base_url"] = "https://tts.example.com/v1/audio/speech" @@ -198,8 +200,8 @@ class TestAssistantAPI: voice_id = voice_resp.json()["id"] sample_assistant_data.update({ - "llmModelId": sample_llm_model_data["id"], - "asrModelId": sample_asr_model_data["id"], + "llmModelId": llm_id, + "asrModelId": asr_id, "voice": voice_id, "prompt": 
"runtime prompt", "opener": "runtime opener", @@ -220,7 +222,8 @@ class TestAssistantAPI: assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"] assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"] assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"] - assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"] + expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}" + assert metadata["services"]["tts"]["voice"] == expected_tts_voice assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"] def test_get_engine_config_endpoint(self, client, sample_assistant_data): @@ -252,6 +255,38 @@ class TestAssistantAPI: assert metadata["output"]["mode"] == "text" assert metadata["services"]["tts"]["enabled"] is False + def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data): + """DashScope voices should map to dashscope tts provider in runtime metadata.""" + voice_resp = client.post("/api/voices", json={ + "name": "DashScope Cherry", + "vendor": "DashScope", + "gender": "Female", + "language": "zh", + "description": "dashscope voice", + "api_key": "dashscope-key", + "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime", + }) + assert voice_resp.status_code == 200 + voice_payload = voice_resp.json() + + sample_assistant_data.update({ + "voice": voice_payload["id"], + "voiceOutputEnabled": True, + }) + assistant_resp = client.post("/api/assistants", json=sample_assistant_data) + assert assistant_resp.status_code == 200 + assistant_id = assistant_resp.json()["id"] + + runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config") + assert runtime_resp.status_code == 200 + metadata = runtime_resp.json()["sessionStartMetadata"] + tts = metadata["services"]["tts"] + assert tts["provider"] == "dashscope" + assert tts["voice"] == "Cherry" + assert tts["model"] == "qwen3-tts-flash-realtime" + assert tts["apiKey"] == "dashscope-key" + assert tts["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data): sample_assistant_data.update({ "firstTurnMode": "user_first", diff --git a/api/tests/test_voices.py b/api/tests/test_voices.py index 73ed371..1093b41 100644 --- a/api/tests/test_voices.py +++ b/api/tests/test_voices.py @@ -171,8 +171,9 @@ class TestVoiceAPI: "voice_key": "FunAudioLLM/CosyVoice2-0.5B:anna" }) assert create_resp.status_code == 200 + voice_id = create_resp.json()["id"] - preview_resp = client.post("/api/voices/anna/preview", json={"text": "你好"}) + preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"}) assert preview_resp.status_code == 200 payload = preview_resp.json() assert payload["success"] is True @@ -228,8 +229,103 @@ class TestVoiceAPI: "base_url": "https://api.siliconflow.cn/v1" }) assert create_resp.status_code == 200 + voice_id = create_resp.json()["id"] - preview_resp = client.post("/api/voices/anna2/preview", json={"text": "hello"}) + preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "hello"}) assert preview_resp.status_code == 200 assert captured_auth["value"] == "Bearer voice-key-123" assert captured_url["value"] == "https://api.siliconflow.cn/v1/audio/speech" + + def test_create_voice_dashscope_defaults(self, client): + """Test creating DashScope voice applies model/voice defaults.""" + create_resp = client.post("/api/voices", 
+        create_resp = client.post("/api/voices", json={
+            "name": "DashScope Voice",
+            "vendor": "DashScope",
+            "gender": "Female",
+            "language": "zh",
+            "description": "dashscope",
+        })
+        assert create_resp.status_code == 200
+        payload = create_resp.json()
+        assert payload["vendor"] == "DashScope"
+        assert payload["model"] == "qwen3-tts-flash-realtime"
+        assert payload["voice_key"] == "Cherry"
+
+    def test_preview_voice_dashscope_success(self, client, monkeypatch):
+        """DashScope voice preview should return playable wav data url."""
+        from app.routers import voices as voice_router
+
+        captured = {
+            "api_key": "",
+            "model": "",
+            "url": "",
+            "session": {},
+            "text": "",
+        }
+
+        class DummyAudioFormat:
+            PCM_24000HZ_MONO_16BIT = "pcm24k16mono"
+
+        class DummyDashScopeModule:
+            api_key = ""
+
+        class DummyRealtime:
+            def __init__(self, *args, **kwargs):
+                captured["api_key"] = kwargs.get("api_key", "")
+                captured["model"] = kwargs.get("model", "")
+                captured["url"] = kwargs.get("url", "")
+                self.callback = kwargs["callback"]
+
+            def connect(self):
+                self.callback.on_open()
+
+            def update_session(self, **kwargs):
+                captured["session"] = kwargs
+
+            def append_text(self, text):
+                captured["text"] = text
+
+            def commit(self):
+                # 16-bit PCM mono samples
+                raw_pcm = b"\x00\x00\x01\x00\x02\x00\x03\x00"
+                self.callback.on_event({
+                    "type": "response.audio.delta",
+                    "delta": base64.b64encode(raw_pcm).decode("utf-8"),
+                })
+                self.callback.on_event({"type": "response.done"})
+
+            def finish(self):
+                return None
+
+            def close(self):
+                return None
+
+        monkeypatch.setattr(voice_router, "DASHSCOPE_SDK_AVAILABLE", True)
+        monkeypatch.setattr(voice_router, "AudioFormat", DummyAudioFormat)
+        monkeypatch.setattr(voice_router, "QwenTtsRealtime", DummyRealtime)
+        monkeypatch.setattr(voice_router, "dashscope", DummyDashScopeModule())
+
+        create_resp = client.post("/api/voices", json={
+            "name": "DashScope Voice",
+            "vendor": "DashScope",
+            "gender": "Female",
+            "language": "zh",
+            "description": "dashscope",
+            "api_key": "dashscope-key",
+            "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
+        })
+        assert create_resp.status_code == 200
+        voice_id = create_resp.json()["id"]
+
+        preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"})
+        assert preview_resp.status_code == 200
+        payload = preview_resp.json()
+        assert payload["success"] is True
+        assert payload["audio_url"].startswith("data:audio/wav;base64,")
+        encoded = payload["audio_url"].split(",", 1)[1]
+        wav_bytes = base64.b64decode(encoded)
+        assert wav_bytes.startswith(b"RIFF")
+        assert captured["model"] == "qwen3-tts-flash-realtime"
+        assert captured["url"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
+        assert captured["text"] == "你好"
+        assert captured["session"]["voice"] == "Cherry"
diff --git a/engine/agents/default.yaml b/engine/agents/default.yaml
new file mode 100644
index 0000000..b32a1ae
--- /dev/null
+++ b/engine/agents/default.yaml
@@ -0,0 +1,62 @@
+# Agent behavior configuration (safe to edit per profile)
+# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
+# Infra/server/network settings should stay in .env.
+
+agent:
+  vad:
+    type: silero
+    model_path: data/vad/silero_vad.onnx
+    threshold: 0.5
+    min_speech_duration_ms: 100
+    eou_threshold_ms: 800
+
+  llm:
+    # provider: openai | openai_compatible | siliconflow
+    provider: openai_compatible
+    model: deepseek-v3
+    temperature: 0.7
+    # Required: no fallback. You can still reference env explicitly.
+    api_key: your_llm_api_key
+    # Optional for OpenAI-compatible endpoints:
+    api_url: https://api.qnaigc.com/v1
+
+  tts:
+    # provider: edge | openai_compatible | siliconflow | dashscope
+    # dashscope defaults (if omitted):
+    provider: dashscope
+    api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
+    model: qwen3-tts-flash-realtime
+    api_key: your_dashscope_api_key
+    mode: commit
+    voice: Cherry
+    speed: 1.0
+    # provider: openai_compatible
+    # api_key: your_tts_api_key
+    # api_url: https://api.siliconflow.cn/v1/audio/speech
+    # model: FunAudioLLM/CosyVoice2-0.5B
+    # voice: anna
+    # speed: 1.0
+
+  asr:
+    # provider: buffered | openai_compatible | siliconflow
+    provider: openai_compatible
+    api_key: your_asr_api_key
+    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
+    model: FunAudioLLM/SenseVoiceSmall
+    interim_interval_ms: 500
+    min_audio_ms: 300
+    start_min_speech_ms: 160
+    pre_speech_ms: 240
+    final_tail_ms: 120
+
+  duplex:
+    enabled: true
+    system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
+
+  barge_in:
+    min_duration_ms: 200
+    silence_tolerance_ms: 60
+
+  tools:
+    - calculator
+    - current_time
diff --git a/engine/agents/example.yaml b/engine/agents/example.yaml
new file mode 100644
index 0000000..dd0e927
--- /dev/null
+++ b/engine/agents/example.yaml
@@ -0,0 +1,55 @@
+# Agent behavior configuration (safe to edit per profile)
+# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
+# Infra/server/network settings should stay in .env.
+
+agent:
+  vad:
+    type: silero
+    model_path: data/vad/silero_vad.onnx
+    threshold: 0.5
+    min_speech_duration_ms: 100
+    eou_threshold_ms: 800
+
+  llm:
+    # provider: openai | openai_compatible | siliconflow
+    provider: openai_compatible
+    model: deepseek-v3
+    temperature: 0.7
+    # Required: no fallback. You can still reference env explicitly.
+    api_key: your_llm_api_key
+    # Optional for OpenAI-compatible endpoints:
+    api_url: https://api.qnaigc.com/v1
+
+  tts:
+    # provider: edge | openai_compatible | siliconflow | dashscope
+    # dashscope defaults (if omitted):
+    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
+    #   model: qwen3-tts-flash-realtime
+    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
+    # note: dashscope_mode/mode is ONLY used when provider=dashscope.
+    provider: openai_compatible
+    api_key: your_tts_api_key
+    api_url: https://api.siliconflow.cn/v1/audio/speech
+    model: FunAudioLLM/CosyVoice2-0.5B
+    voice: anna
+    speed: 1.0
+
+  asr:
+    # provider: buffered | openai_compatible | siliconflow
+    provider: openai_compatible
+    api_key: your_asr_api_key
+    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
+    model: FunAudioLLM/SenseVoiceSmall
+    interim_interval_ms: 500
+    min_audio_ms: 300
+    start_min_speech_ms: 160
+    pre_speech_ms: 240
+    final_tail_ms: 120
+
+  duplex:
+    enabled: true
+    system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
+
+  barge_in:
+    min_duration_ms: 200
+    silence_tolerance_ms: 60
diff --git a/engine/agents/tools.yaml b/engine/agents/tools.yaml
new file mode 100644
index 0000000..4d8bd72
--- /dev/null
+++ b/engine/agents/tools.yaml
@@ -0,0 +1,78 @@
+# Agent behavior configuration with tool declarations.
+# This profile is an example only.
+
+agent:
+  vad:
+    type: silero
+    model_path: data/vad/silero_vad.onnx
+    threshold: 0.5
+    min_speech_duration_ms: 100
+    eou_threshold_ms: 800
+
+  llm:
+    # provider: openai | openai_compatible | siliconflow
+    provider: openai_compatible
+    model: deepseek-v3
+    temperature: 0.7
+    api_key: your_llm_api_key
+    api_url: https://api.qnaigc.com/v1
+
+  tts:
+    # provider: edge | openai_compatible | siliconflow | dashscope
+    # dashscope defaults (if omitted):
+    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
+    #   model: qwen3-tts-flash-realtime
+    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
+    # note: dashscope_mode/mode is ONLY used when provider=dashscope.
+    provider: openai_compatible
+    api_key: your_tts_api_key
+    api_url: https://api.siliconflow.cn/v1/audio/speech
+    model: FunAudioLLM/CosyVoice2-0.5B
+    voice: anna
+    speed: 1.0
+
+  asr:
+    # provider: buffered | openai_compatible | siliconflow
+    provider: openai_compatible
+    api_key: your_asr_api_key
+    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
+    model: FunAudioLLM/SenseVoiceSmall
+    interim_interval_ms: 500
+    min_audio_ms: 300
+    start_min_speech_ms: 160
+    pre_speech_ms: 240
+    final_tail_ms: 120
+
+  duplex:
+    enabled: true
+    system_prompt: You are a helpful voice assistant with tool-calling support.
+
+  barge_in:
+    min_duration_ms: 200
+    silence_tolerance_ms: 60
+
+  # Tool declarations consumed by the engine at startup.
+  # - String form enables built-in/default tool schema when available.
+  # - Object form provides OpenAI function schema + executor hint.
+  tools:
+    - current_time
+    - calculator
+    - name: weather
+      description: Get weather by city name.
+      parameters:
+        type: object
+        properties:
+          city:
+            type: string
+            description: City name, for example "San Francisco".
+        required: [city]
+      executor: server
+    - name: open_map
+      description: Open map app on the client device.
+      parameters:
+        type: object
+        properties:
+          query:
+            type: string
+        required: [query]
+      executor: client
diff --git a/web/pages/VoiceLibrary.tsx b/web/pages/VoiceLibrary.tsx
index 4932463..a133bec 100644
--- a/web/pages/VoiceLibrary.tsx
+++ b/web/pages/VoiceLibrary.tsx
@@ -5,6 +5,12 @@ import { Voice } from '../types';
 import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi';
 
 const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
+const OPENAI_COMPATIBLE_DEFAULT_VOICE = 'FunAudioLLM/CosyVoice2-0.5B:anna';
+const DASHSCOPE_DEFAULT_MODEL = 'qwen3-tts-flash-realtime';
+const DASHSCOPE_DEFAULT_VOICE = 'Cherry';
+const DASHSCOPE_DEFAULT_BASE_URL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
+
+type VoiceVendor = 'OpenAI Compatible' | 'DashScope';
 
 const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => {
   const id = (rawId || '').trim();
@@ -249,11 +255,11 @@ const AddVoiceModal: React.FC<{
   onSuccess: (voice: Voice) => Promise<void>;
   initialVoice?: Voice;
 }> = ({ isOpen, onClose, onSuccess, initialVoice }) => {
-  const [vendor, setVendor] = useState<'OpenAI Compatible'>('OpenAI Compatible');
+  const [vendor, setVendor] = useState<VoiceVendor>('OpenAI Compatible');
   const [name, setName] = useState('');
   const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL);
-  const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna');
+  const [sfVoiceId, setSfVoiceId] = useState(OPENAI_COMPATIBLE_DEFAULT_VOICE);
   const [sfSpeed, setSfSpeed] = useState(1);
   const [sfGain, setSfGain] = useState(0);
   const [sfPitch, setSfPitch] = useState(0);
@@ -270,10 +276,33 @@ const AddVoiceModal: React.FC<{
   const testAudioRef = useRef<HTMLAudioElement | null>(null);
 
   useEffect(() => {
-    if (!initialVoice) return;
-    const nextVendor = 'OpenAI Compatible';
-    const nextModel = initialVoice.model || OPENAI_COMPATIBLE_DEFAULT_MODEL;
-    const defaultVoiceKey = buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
+    if (!isOpen) return;
+
+    if (!initialVoice) {
+      setVendor('OpenAI Compatible');
+      setName('');
+      setGender('Female');
+      setLanguage('zh');
+      setDescription('');
+      setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
+      setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
+      setSfSpeed(1);
+      setSfGain(0);
+      setSfPitch(0);
+      setApiKey('');
+      setBaseUrl('');
+      setTestInput('你好,正在测试语音合成效果。');
+      return;
+    }
+
+    const nextVendor: VoiceVendor = String(initialVoice.vendor || '').trim().toLowerCase() === 'dashscope'
+      ? 'DashScope'
+      : 'OpenAI Compatible';
+    const nextModel = (initialVoice.model || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL)).trim();
+    const defaultVoiceKey = nextVendor === 'DashScope'
+      ? DASHSCOPE_DEFAULT_VOICE
+      : buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
+
     setVendor(nextVendor);
     setName(initialVoice.name || '');
     setGender(initialVoice.gender || 'Female');
@@ -285,7 +314,7 @@ const AddVoiceModal: React.FC<{
     setSfGain(initialVoice.gain ?? 0);
     setSfPitch(initialVoice.pitch ?? 0);
     setApiKey(initialVoice.apiKey || '');
-    setBaseUrl(initialVoice.baseUrl || '');
+    setBaseUrl(initialVoice.baseUrl || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : ''));
   }, [initialVoice, isOpen]);
 
   const handleAudition = async () => {
@@ -316,10 +345,23 @@ const AddVoiceModal: React.FC<{
       return;
     }
 
+    const resolvedModel = (() => {
+      const current = (openaiCompatibleModel || '').trim();
+      if (current) return current;
+      return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL;
+    })();
+
     const resolvedVoiceKey = (() => {
       const current = (sfVoiceId || '').trim();
       if (current) return current;
-      return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, openaiCompatibleModel || OPENAI_COMPATIBLE_DEFAULT_MODEL);
+      if (vendor === 'DashScope') return DASHSCOPE_DEFAULT_VOICE;
+      return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, resolvedModel);
+    })();
+
+    const resolvedBaseUrl = (() => {
+      const current = (baseUrl || '').trim();
+      if (current) return current;
+      return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : '';
     })();
 
     const newVoice: Voice = {
@@ -328,11 +370,11 @@ const AddVoiceModal: React.FC<{
       vendor,
       gender,
       language,
-      description: description || `Model: ${openaiCompatibleModel}`,
-      model: openaiCompatibleModel,
+      description: description || `Model: ${resolvedModel}`,
+      model: resolvedModel,
       voiceKey: resolvedVoiceKey,
       apiKey,
-      baseUrl,
+      baseUrl: resolvedBaseUrl,
       speed: sfSpeed,
       gain: sfGain,
       pitch: sfPitch,
@@ -346,6 +388,11 @@ const AddVoiceModal: React.FC<{
       setDescription('');
      setApiKey('');
       setBaseUrl('');
+      setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
+      setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
+      setSfSpeed(1);
+      setSfGain(0);
+      setSfPitch(0);
     } catch (error: any) {
       alert(error?.message || '保存失败');
     } finally {
@@ -370,7 +417,10 @@ const AddVoiceModal: React.FC<{
             <div>
-              <select value={vendor} onChange={(e) => setVendor(e.target.value as VoiceVendor)} />
+              <select value={vendor} onChange={(e) => setVendor(e.target.value as VoiceVendor)}>
+                <option value="OpenAI Compatible">OpenAI Compatible</option>
+                <option value="DashScope">DashScope</option>
+              </select>
             </div>
@@ -388,12 +438,16 @@ const AddVoiceModal: React.FC<{
             <div>
               <Input
                 className="font-mono text-xs"
                 value={openaiCompatibleModel}
                 onChange={(e) => setOpenaiCompatibleModel(e.target.value)}
-                placeholder="例如: FunAudioLLM/CosyVoice2-0.5B"
+                placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL}
               />
             </div>
             <div>
-              <Input value={sfVoiceId} onChange={(e) => setSfVoiceId(e.target.value)} placeholder="FunAudioLLM/CosyVoice2-0.5B:anna" />
+              <Input
+                value={sfVoiceId}
+                onChange={(e) => setSfVoiceId(e.target.value)}
+                placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_VOICE : OPENAI_COMPATIBLE_DEFAULT_VOICE}
+              />
             </div>
@@ -429,7 +483,11 @@ const AddVoiceModal: React.FC<{
             <div>
-              <Input value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} placeholder="https://.../v1" />
+              <Input
+                value={baseUrl}
+                onChange={(e) => setBaseUrl(e.target.value)}
+                placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : 'https://.../v1'}
+              />
             </div>
diff --git a/web/services/backendApi.ts b/web/services/backendApi.ts
index b6a65ab..f57a7a1 100644
--- a/web/services/backendApi.ts
+++ b/web/services/backendApi.ts
@@ -60,6 +60,9 @@ const mapVoice = (raw: AnyRecord): Voice => ({
   name: readField(raw, ['name'], ''),
   vendor: ((): string => {
     const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase();
+    if (vendor === 'dashscope') {
+      return 'DashScope';
+    }
     if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') {
       return 'OpenAI Compatible';
     }