Voice library: support DashScope

Xin Wang
2026-02-26 03:54:52 +08:00
parent b193f91432
commit f77f7c7531
11 changed files with 684 additions and 26 deletions

View File

@@ -34,6 +34,12 @@ def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
    }

def _is_dashscope_vendor(vendor: Optional[str]) -> bool:
    return (vendor or "").strip().lower() in {
        "dashscope",
    }

def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
    raw = (voice_value or "").strip()
    model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
@@ -121,7 +127,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
elif assistant.voice:
    voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
    if voice:
-       tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
+       if _is_dashscope_vendor(voice.vendor):
+           tts_provider = "dashscope"
+       elif _is_openai_compatible_vendor(voice.vendor):
+           tts_provider = "openai_compatible"
+       else:
+           tts_provider = "edge"
        model = voice.model
        runtime_voice = voice.voice_key or voice.id
        if tts_provider == "openai_compatible":
@@ -131,8 +142,8 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
"enabled": True, "enabled": True,
"provider": tts_provider, "provider": tts_provider,
"model": model, "model": model,
"apiKey": voice.api_key if tts_provider == "openai_compatible" else None, "apiKey": voice.api_key if tts_provider in {"openai_compatible", "dashscope"} else None,
"baseUrl": voice.base_url if tts_provider == "openai_compatible" else None, "baseUrl": voice.base_url if tts_provider in {"openai_compatible", "dashscope"} else None,
"voice": runtime_voice, "voice": runtime_voice,
"speed": assistant.speed or voice.speed, "speed": assistant.speed or voice.speed,
} }
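For reference, a DashScope-backed voice now resolves to a TTS service block shaped like this (a sketch assembled from the fields above; the example values match the runtime-config test later in this commit):

# Illustrative shape of metadata["services"]["tts"] for a DashScope voice.
tts_metadata = {
    "enabled": True,
    "provider": "dashscope",
    "model": "qwen3-tts-flash-realtime",
    "apiKey": "dashscope-key",  # forwarded only for openai_compatible/dashscope providers
    "baseUrl": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
    "voice": "Cherry",
    "speed": 1.0,
}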

View File

@@ -1,6 +1,10 @@
import base64
+import io
+import json
import os
-from typing import Optional
+import threading
+import wave
+from typing import Any, Dict, Optional

import httpx
from fastapi import APIRouter, Depends, HTTPException
@@ -14,6 +18,203 @@ from ..schemas import VoiceCreate, VoiceOut, VoicePreviewRequest, VoicePreviewRe
router = APIRouter(prefix="/voices", tags=["Voices"])

OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
DASHSCOPE_DEFAULT_MODEL = "qwen3-tts-flash-realtime"
DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry"
DASHSCOPE_DEFAULT_BASE_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"

try:
    import dashscope
    from dashscope.audio.qwen_tts_realtime import AudioFormat, QwenTtsRealtime, QwenTtsRealtimeCallback

    DASHSCOPE_SDK_AVAILABLE = True
except ImportError:
    dashscope = None  # type: ignore[assignment]
    AudioFormat = None  # type: ignore[assignment]
    QwenTtsRealtime = None  # type: ignore[assignment]
    DASHSCOPE_SDK_AVAILABLE = False

    class QwenTtsRealtimeCallback:  # type: ignore[no-redef]
        """Fallback callback base when DashScope SDK is unavailable."""
        pass

class _DashScopePreviewCallback(QwenTtsRealtimeCallback):
    """Collect DashScope realtime callback events and PCM chunks."""

    def __init__(self) -> None:
        super().__init__()
        self._open_event = threading.Event()
        self._done_event = threading.Event()
        self._lock = threading.Lock()
        self._audio_chunks: list[bytes] = []
        self._error_message: Optional[str] = None

    def on_open(self) -> None:
        self._open_event.set()

    def on_close(self, code: int, reason: str) -> None:
        if not self._done_event.is_set():
            self._error_message = f"DashScope websocket closed unexpectedly: {code} {reason}"
        self._done_event.set()

    def on_error(self, message: str) -> None:
        self._error_message = str(message)
        self._done_event.set()

    def on_event(self, response: Any) -> None:
        payload = _coerce_dashscope_event(response)
        event_type = str(payload.get("type") or "").strip()
        if event_type == "response.audio.delta":
            delta = payload.get("delta")
            if isinstance(delta, str):
                try:
                    self._append_audio(base64.b64decode(delta))
                except Exception:
                    return
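        # Treat either completion-style event as end of synthesis.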
        elif event_type in {"response.done", "session.finished"}:
            self._done_event.set()
        elif event_type == "error":
            self._error_message = _format_dashscope_error_event(payload)
            self._done_event.set()

    def on_data(self, data: bytes) -> None:
        # Some SDK versions emit raw PCM frames via on_data.
        if isinstance(data, (bytes, bytearray)):
            self._append_audio(bytes(data))

    def wait_for_open(self, timeout: float = 10.0) -> None:
        if not self._open_event.wait(timeout):
            raise TimeoutError("DashScope websocket open timeout")

    def wait_for_done(self, timeout: float = 45.0) -> None:
        if not self._done_event.wait(timeout):
            raise TimeoutError("DashScope synthesis timeout")

    def raise_if_error(self) -> None:
        if self._error_message:
            raise RuntimeError(self._error_message)

    def read_audio(self) -> bytes:
        with self._lock:
            return b"".join(self._audio_chunks)

    def _append_audio(self, chunk: bytes) -> None:
        if not chunk:
            return
        with self._lock:
            self._audio_chunks.append(chunk)

def _coerce_dashscope_event(response: Any) -> Dict[str, Any]:
    if isinstance(response, dict):
        return response
    if isinstance(response, str):
        try:
            parsed = json.loads(response)
            if isinstance(parsed, dict):
                return parsed
        except json.JSONDecodeError:
            pass
    return {"type": "raw", "message": str(response)}

def _format_dashscope_error_event(payload: Dict[str, Any]) -> str:
    error = payload.get("error")
    if isinstance(error, dict):
        code = str(error.get("code") or "").strip()
        message = str(error.get("message") or "").strip()
        if code and message:
            return f"{code}: {message}"
        return message or str(error)
    return str(error or "DashScope realtime TTS error")

def _create_dashscope_realtime_client(*, model: str, callback: _DashScopePreviewCallback, url: str, api_key: str) -> Any:
    if QwenTtsRealtime is None:
        raise RuntimeError("DashScope SDK unavailable")
    init_kwargs = {
        "model": model,
        "callback": callback,
        "url": url,
    }
    try:
        return QwenTtsRealtime(api_key=api_key, **init_kwargs)  # type: ignore[misc]
    except TypeError as exc:
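        # Assumed fallback: some SDK builds take no api_key kwarg in the constructor
        # and read the module-level dashscope.api_key set by the caller instead.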
if "api_key" not in str(exc):
raise
return QwenTtsRealtime(**init_kwargs) # type: ignore[misc]
def _pcm16_to_wav_bytes(pcm_bytes: bytes, sample_rate: int = 24000) -> bytes:
with io.BytesIO() as buffer:
with wave.open(buffer, "wb") as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2)
wav_file.setframerate(sample_rate)
wav_file.writeframes(pcm_bytes)
return buffer.getvalue()
def _synthesize_dashscope_preview(
*,
text: str,
api_key: str,
base_url: str,
model: str,
voice_key: str,
speed: Optional[float],
) -> bytes:
if not DASHSCOPE_SDK_AVAILABLE:
raise RuntimeError("dashscope package not installed; install with `pip install dashscope>=1.25.11`")
if not AudioFormat:
raise RuntimeError("DashScope SDK AudioFormat unavailable")
callback = _DashScopePreviewCallback()
if dashscope is not None:
dashscope.api_key = api_key
client = _create_dashscope_realtime_client(
model=model,
callback=callback,
url=base_url,
api_key=api_key,
)
try:
client.connect()
callback.wait_for_open()
session_kwargs: Dict[str, Any] = {
"voice": voice_key,
"response_format": AudioFormat.PCM_24000HZ_MONO_16BIT,
"mode": "commit",
        }
        # speech_rate is supported by qwen3-* realtime models.
        normalized_model = str(model or "").strip().lower()
        if speed is not None and normalized_model.startswith("qwen3-"):
            session_kwargs["speech_rate"] = max(0.5, min(2.0, float(speed)))
        client.update_session(**session_kwargs)

        client.append_text(text)
        client.commit()
        callback.wait_for_done()
        callback.raise_if_error()

        pcm_audio = callback.read_audio()
        if not pcm_audio:
            raise RuntimeError("No audio chunk returned from DashScope realtime synthesis")
        return _pcm16_to_wav_bytes(pcm_audio, sample_rate=24000)
    finally:
        finish_fn = getattr(client, "finish", None)
        if callable(finish_fn):
            try:
                finish_fn()
            except Exception:
                pass
        close_fn = getattr(client, "close", None)
        if callable(close_fn):
            try:
                close_fn()
            except Exception:
                pass
def _is_openai_compatible_vendor(vendor: str) -> bool:
@@ -26,9 +227,18 @@ def _is_openai_compatible_vendor(vendor: str) -> bool:
    }

def _is_dashscope_vendor(vendor: str) -> bool:
    normalized = (vendor or "").strip().lower()
    return normalized in {
        "dashscope",
    }

def _default_base_url(vendor: str) -> Optional[str]:
    if _is_openai_compatible_vendor(vendor):
        return "https://api.siliconflow.cn/v1"
    if _is_dashscope_vendor(vendor):
        return DASHSCOPE_DEFAULT_BASE_URL
    return None
@@ -76,6 +286,9 @@ def create_voice(data: VoiceCreate, db: Session = Depends(get_db)):
    if not voice_key:
        raw_id = (data.id or data.name).strip()
        voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}"
    elif _is_dashscope_vendor(vendor):
        model = (model or "").strip() or DASHSCOPE_DEFAULT_MODEL
        voice_key = (voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY

    voice = Voice(
        id=unique_short_id("tts", db, Voice),
@@ -126,6 +339,11 @@ def update_voice(id: str, data: VoiceUpdate, db: Session = Depends(get_db)):
        voice_key = update_data.get("voice_key") or voice.voice_key
        update_data["model"] = model
        update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model)
    elif _is_dashscope_vendor(vendor_for_defaults):
        model = update_data.get("model") or voice.model or DASHSCOPE_DEFAULT_MODEL
        voice_key = update_data.get("voice_key") or voice.voice_key or DASHSCOPE_DEFAULT_VOICE_KEY
        update_data["model"] = model
        update_data["voice_key"] = voice_key

    for field, value in update_data.items():
        setattr(voice, field, value)
@@ -148,7 +366,7 @@ def delete_voice(id: str, db: Session = Depends(get_db)):
@router.post("/{id}/preview", response_model=VoicePreviewResponse)
def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_db)):
-   """Preview the given voice via the OpenAI-compatible /audio/speech endpoint."""
+   """Preview the given voice; supports OpenAI-compatible and DashScope Realtime."""
    voice = db.query(Voice).filter(Voice.id == id).first()
    if not voice:
        raise HTTPException(status_code=404, detail="Voice not found")
@@ -157,6 +375,31 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_
    if not text:
        raise HTTPException(status_code=400, detail="Preview text cannot be empty")

    if _is_dashscope_vendor(voice.vendor):
        api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
        if not api_key:
            api_key = os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("TTS_API_KEY", "").strip()
        if not api_key:
            raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}")

        base_url = (voice.base_url or "").strip() or DASHSCOPE_DEFAULT_BASE_URL
        model = (voice.model or "").strip() or DASHSCOPE_DEFAULT_MODEL
        voice_key = (voice.voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY
        effective_speed = data.speed if data.speed is not None else voice.speed
        try:
            wav_bytes = _synthesize_dashscope_preview(
                text=text,
                api_key=api_key,
                base_url=base_url,
                model=model,
                voice_key=voice_key,
                speed=effective_speed,
            )
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"DashScope preview failed: {exc}") from exc
        audio_base64 = base64.b64encode(wav_bytes).decode("utf-8")
        return VoicePreviewResponse(success=True, audio_url=f"data:audio/wav;base64,{audio_base64}")

    api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
    if not api_key and _is_openai_compatible_vendor(voice.vendor):
        api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
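Taken together, a preview round-trip for a DashScope voice looks roughly like this (a client-side sketch; host, port, and the voice id are placeholders):

import base64
import httpx

# Hypothetical voice id; the real one comes back from POST /api/voices.
resp = httpx.post(
    "http://localhost:8000/api/voices/tts_abc123/preview",
    json={"text": "你好"},
)
payload = resp.json()
assert payload["success"] is True

# audio_url is a data URL: "data:audio/wav;base64,<...>"
wav_bytes = base64.b64decode(payload["audio_url"].split(",", 1)[1])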

View File

@@ -13,12 +13,16 @@ from app.id_generator import short_id
from app.models import Voice, Assistant, KnowledgeBase, Workflow, LLMModel, ASRModel, KnowledgeDocument

VOICE_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
DASHSCOPE_VOICE_MODEL = "qwen3-tts-flash-realtime"
DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry"
DASHSCOPE_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"

SEED_VOICE_IDS = {
    "alex": short_id("tts"),
    "david": short_id("tts"),
    "bella": short_id("tts"),
    "claire": short_id("tts"),
    "dashscope_cherry": short_id("tts"),
}

SEED_LLM_IDS = {
@@ -177,8 +181,20 @@ def init_default_data():
            voice_key=f"{VOICE_MODEL}:claire",
            is_system=True,
        ),
        Voice(
            id=SEED_VOICE_IDS["dashscope_cherry"],
            name="DashScope Cherry",
            vendor="DashScope",
            gender="Female",
            language="zh",
            description="DashScope realtime sample voice.",
            model=DASHSCOPE_VOICE_MODEL,
            voice_key=DASHSCOPE_DEFAULT_VOICE_KEY,
            base_url=DASHSCOPE_REALTIME_URL,
            is_system=True,
        ),
    ]

-   seed_if_empty(db, Voice, voices, "✅ Default voices initialized (OpenAI Compatible CosyVoice 2.0)")
+   seed_if_empty(db, Voice, voices, "✅ Default voices initialized (OpenAI Compatible + DashScope)")

def init_default_tools(recreate: bool = False):

View File

@@ -9,3 +9,4 @@ minio==7.2.0
httpx==0.26.0
chromadb==0.4.22
openai==1.12.0
dashscope==1.25.11

View File

@@ -186,9 +186,11 @@ class TestAssistantAPI:
sample_asr_model_data["vendor"] = "OpenAI Compatible"

llm_resp = client.post("/api/llm", json=sample_llm_model_data)
assert llm_resp.status_code == 200
llm_id = llm_resp.json()["id"]

asr_resp = client.post("/api/asr", json=sample_asr_model_data)
assert asr_resp.status_code == 200
asr_id = asr_resp.json()["id"]

sample_voice_data["vendor"] = "OpenAI Compatible"
sample_voice_data["base_url"] = "https://tts.example.com/v1/audio/speech"
@@ -198,8 +200,8 @@ class TestAssistantAPI:
voice_id = voice_resp.json()["id"]

sample_assistant_data.update({
-   "llmModelId": sample_llm_model_data["id"],
-   "asrModelId": sample_asr_model_data["id"],
+   "llmModelId": llm_id,
+   "asrModelId": asr_id,
    "voice": voice_id,
    "prompt": "runtime prompt",
    "opener": "runtime opener",
@@ -220,7 +222,8 @@ class TestAssistantAPI:
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
-assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"]
+expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}"
+assert metadata["services"]["tts"]["voice"] == expected_tts_voice
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
def test_get_engine_config_endpoint(self, client, sample_assistant_data): def test_get_engine_config_endpoint(self, client, sample_assistant_data):
@@ -252,6 +255,38 @@ class TestAssistantAPI:
    assert metadata["output"]["mode"] == "text"
    assert metadata["services"]["tts"]["enabled"] is False

def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data):
    """DashScope voices should map to dashscope tts provider in runtime metadata."""
    voice_resp = client.post("/api/voices", json={
        "name": "DashScope Cherry",
        "vendor": "DashScope",
        "gender": "Female",
        "language": "zh",
        "description": "dashscope voice",
        "api_key": "dashscope-key",
        "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
    })
    assert voice_resp.status_code == 200
    voice_payload = voice_resp.json()

    sample_assistant_data.update({
        "voice": voice_payload["id"],
        "voiceOutputEnabled": True,
    })
    assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
    assert assistant_resp.status_code == 200
    assistant_id = assistant_resp.json()["id"]

    runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
    assert runtime_resp.status_code == 200
    metadata = runtime_resp.json()["sessionStartMetadata"]
    tts = metadata["services"]["tts"]
    assert tts["provider"] == "dashscope"
    assert tts["voice"] == "Cherry"
    assert tts["model"] == "qwen3-tts-flash-realtime"
    assert tts["apiKey"] == "dashscope-key"
    assert tts["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
    sample_assistant_data.update({
        "firstTurnMode": "user_first",

View File

@@ -171,8 +171,9 @@ class TestVoiceAPI:
"voice_key": "FunAudioLLM/CosyVoice2-0.5B:anna" "voice_key": "FunAudioLLM/CosyVoice2-0.5B:anna"
}) })
assert create_resp.status_code == 200 assert create_resp.status_code == 200
voice_id = create_resp.json()["id"]
preview_resp = client.post("/api/voices/anna/preview", json={"text": "你好"}) preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"})
assert preview_resp.status_code == 200 assert preview_resp.status_code == 200
payload = preview_resp.json() payload = preview_resp.json()
assert payload["success"] is True assert payload["success"] is True
@@ -228,8 +229,103 @@ class TestVoiceAPI:
"base_url": "https://api.siliconflow.cn/v1" "base_url": "https://api.siliconflow.cn/v1"
}) })
assert create_resp.status_code == 200 assert create_resp.status_code == 200
voice_id = create_resp.json()["id"]
preview_resp = client.post("/api/voices/anna2/preview", json={"text": "hello"}) preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "hello"})
assert preview_resp.status_code == 200 assert preview_resp.status_code == 200
assert captured_auth["value"] == "Bearer voice-key-123" assert captured_auth["value"] == "Bearer voice-key-123"
assert captured_url["value"] == "https://api.siliconflow.cn/v1/audio/speech" assert captured_url["value"] == "https://api.siliconflow.cn/v1/audio/speech"
def test_create_voice_dashscope_defaults(self, client):
    """Test creating DashScope voice applies model/voice defaults."""
    create_resp = client.post("/api/voices", json={
        "name": "DashScope Voice",
        "vendor": "DashScope",
        "gender": "Female",
        "language": "zh",
        "description": "dashscope",
    })
    assert create_resp.status_code == 200
    payload = create_resp.json()
    assert payload["vendor"] == "DashScope"
    assert payload["model"] == "qwen3-tts-flash-realtime"
    assert payload["voice_key"] == "Cherry"

def test_preview_voice_dashscope_success(self, client, monkeypatch):
    """DashScope voice preview should return playable wav data url."""
    from app.routers import voices as voice_router

    captured = {
        "api_key": "",
        "model": "",
        "url": "",
        "session": {},
        "text": "",
    }

    class DummyAudioFormat:
        PCM_24000HZ_MONO_16BIT = "pcm24k16mono"

    class DummyDashScopeModule:
        api_key = ""

    class DummyRealtime:
        def __init__(self, *args, **kwargs):
            captured["api_key"] = kwargs.get("api_key", "")
            captured["model"] = kwargs.get("model", "")
            captured["url"] = kwargs.get("url", "")
            self.callback = kwargs["callback"]

        def connect(self):
            self.callback.on_open()

        def update_session(self, **kwargs):
            captured["session"] = kwargs

        def append_text(self, text):
            captured["text"] = text

        def commit(self):
            # 16-bit PCM mono samples
            raw_pcm = b"\x00\x00\x01\x00\x02\x00\x03\x00"
            self.callback.on_event({
                "type": "response.audio.delta",
                "delta": base64.b64encode(raw_pcm).decode("utf-8"),
            })
            self.callback.on_event({"type": "response.done"})

        def finish(self):
            return None

        def close(self):
            return None

    monkeypatch.setattr(voice_router, "DASHSCOPE_SDK_AVAILABLE", True)
    monkeypatch.setattr(voice_router, "AudioFormat", DummyAudioFormat)
    monkeypatch.setattr(voice_router, "QwenTtsRealtime", DummyRealtime)
    monkeypatch.setattr(voice_router, "dashscope", DummyDashScopeModule())

    create_resp = client.post("/api/voices", json={
        "name": "DashScope Voice",
        "vendor": "DashScope",
        "gender": "Female",
        "language": "zh",
        "description": "dashscope",
        "api_key": "dashscope-key",
        "base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
    })
    assert create_resp.status_code == 200
    voice_id = create_resp.json()["id"]

    preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"})
    assert preview_resp.status_code == 200
    payload = preview_resp.json()
    assert payload["success"] is True
    assert payload["audio_url"].startswith("data:audio/wav;base64,")

    encoded = payload["audio_url"].split(",", 1)[1]
    wav_bytes = base64.b64decode(encoded)
    assert wav_bytes.startswith(b"RIFF")
    assert captured["model"] == "qwen3-tts-flash-realtime"
    assert captured["url"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    assert captured["text"] == "你好"
    assert captured["session"]["voice"] == "Cherry"

View File

@@ -0,0 +1,62 @@
# Agent behavior configuration (safe to edit per profile)
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
  vad:
    type: silero
    model_path: data/vad/silero_vad.onnx
    threshold: 0.5
    min_speech_duration_ms: 100
    eou_threshold_ms: 800
  llm:
    # provider: openai | openai_compatible | siliconflow
    provider: openai_compatible
    model: deepseek-v3
    temperature: 0.7
    # Required: no fallback. You can still reference env explicitly.
    api_key: sk-fc4d59b360475f53401a864db8ce0985010acc4e696723d20a90d6569f38d80a
    # Optional for OpenAI-compatible endpoints:
    api_url: https://api.qnaigc.com/v1
  tts:
    # provider: edge | openai_compatible | siliconflow | dashscope
    # dashscope defaults (if omitted):
    provider: dashscope
    api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
    model: qwen3-tts-flash-realtime
    api_key: sk-391f5126d18345d497c6e8717c8c9ad7
    mode: commit
    voice: Cherry
    speed: 1.0
    # provider: openai_compatible
    # api_key: sk-thmzysdpqqmhqxxshyqoxvjeiflexjdgaftyufrsgrhpjnyx
    # api_url: https://api.siliconflow.cn/v1/audio/speech
    # model: FunAudioLLM/CosyVoice2-0.5B
    # voice: anna
    # speed: 1.0
  asr:
    # provider: buffered | openai_compatible | siliconflow
    provider: openai_compatible
    api_key: sk-thmzysdpqqmhqxxshyqoxvjeiflexjdgaftyufrsgrhpjnyx
    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
    model: FunAudioLLM/SenseVoiceSmall
    interim_interval_ms: 500
    min_audio_ms: 300
    start_min_speech_ms: 160
    pre_speech_ms: 240
    final_tail_ms: 120
  duplex:
    enabled: true
  system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
  barge_in:
    min_duration_ms: 200
    silence_tolerance_ms: 60
  tools:
    - calculator
    - current_time

View File

@@ -0,0 +1,55 @@
# Agent behavior configuration (safe to edit per profile)
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
# Infra/server/network settings should stay in .env.
agent:
  vad:
    type: silero
    model_path: data/vad/silero_vad.onnx
    threshold: 0.5
    min_speech_duration_ms: 100
    eou_threshold_ms: 800
  llm:
    # provider: openai | openai_compatible | siliconflow
    provider: openai_compatible
    model: deepseek-v3
    temperature: 0.7
    # Required: no fallback. You can still reference env explicitly.
    api_key: your_llm_api_key
    # Optional for OpenAI-compatible endpoints:
    api_url: https://api.qnaigc.com/v1
  tts:
    # provider: edge | openai_compatible | siliconflow | dashscope
    # dashscope defaults (if omitted):
    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
    #   model: qwen3-tts-flash-realtime
    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
    # note: dashscope_mode/mode is ONLY used when provider=dashscope.
    provider: openai_compatible
    api_key: your_tts_api_key
    api_url: https://api.siliconflow.cn/v1/audio/speech
    model: FunAudioLLM/CosyVoice2-0.5B
    voice: anna
    speed: 1.0
  asr:
    # provider: buffered | openai_compatible | siliconflow
    provider: openai_compatible
    api_key: your_asr_api_key
    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
    model: FunAudioLLM/SenseVoiceSmall
    interim_interval_ms: 500
    min_audio_ms: 300
    start_min_speech_ms: 160
    pre_speech_ms: 240
    final_tail_ms: 120
  duplex:
    enabled: true
  system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
  barge_in:
    min_duration_ms: 200
    silence_tolerance_ms: 60

engine/agents/tools.yaml Normal file
View File

@@ -0,0 +1,78 @@
# Agent behavior configuration with tool declarations.
# This profile is an example only.
agent:
  vad:
    type: silero
    model_path: data/vad/silero_vad.onnx
    threshold: 0.5
    min_speech_duration_ms: 100
    eou_threshold_ms: 800
  llm:
    # provider: openai | openai_compatible | siliconflow
    provider: openai_compatible
    model: deepseek-v3
    temperature: 0.7
    api_key: your_llm_api_key
    api_url: https://api.qnaigc.com/v1
  tts:
    # provider: edge | openai_compatible | siliconflow | dashscope
    # dashscope defaults (if omitted):
    #   api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
    #   model: qwen3-tts-flash-realtime
    #   dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
    # note: dashscope_mode/mode is ONLY used when provider=dashscope.
    provider: openai_compatible
    api_key: your_tts_api_key
    api_url: https://api.siliconflow.cn/v1/audio/speech
    model: FunAudioLLM/CosyVoice2-0.5B
    voice: anna
    speed: 1.0
  asr:
    # provider: buffered | openai_compatible | siliconflow
    provider: openai_compatible
    api_key: your_asr_api_key
    api_url: https://api.siliconflow.cn/v1/audio/transcriptions
    model: FunAudioLLM/SenseVoiceSmall
    interim_interval_ms: 500
    min_audio_ms: 300
    start_min_speech_ms: 160
    pre_speech_ms: 240
    final_tail_ms: 120
  duplex:
    enabled: true
  system_prompt: You are a helpful voice assistant with tool-calling support.
  barge_in:
    min_duration_ms: 200
    silence_tolerance_ms: 60
  # Tool declarations consumed by the engine at startup.
  # - String form enables built-in/default tool schema when available.
  # - Object form provides OpenAI function schema + executor hint.
  tools:
    - current_time
    - calculator
    - name: weather
      description: Get weather by city name.
      parameters:
        type: object
        properties:
          city:
            type: string
            description: City name, for example "San Francisco".
        required: [city]
      executor: server
    - name: open_map
      description: Open map app on the client device.
      parameters:
        type: object
        properties:
          query:
            type: string
        required: [query]
      executor: client
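For reference, the two declaration forms above could be normalized along these lines (an illustrative sketch only; the engine's actual loader is not part of this commit, and the "builtin" flag and server-side default are assumptions):

from typing import Any, Dict, Union

def normalize_tool(decl: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
    # String form: enable a built-in tool by name with its default schema.
    if isinstance(decl, str):
        return {"name": decl, "builtin": True, "executor": "server"}
    # Object form: OpenAI function schema plus an executor hint (server/client).
    return {
        "name": decl["name"],
        "builtin": False,
        "description": decl.get("description", ""),
        "parameters": decl.get("parameters", {"type": "object", "properties": {}}),
        "executor": decl.get("executor", "server"),
    }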

View File

@@ -5,6 +5,12 @@ import { Voice } from '../types';
import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi';

const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
const OPENAI_COMPATIBLE_DEFAULT_VOICE = 'FunAudioLLM/CosyVoice2-0.5B:anna';
const DASHSCOPE_DEFAULT_MODEL = 'qwen3-tts-flash-realtime';
const DASHSCOPE_DEFAULT_VOICE = 'Cherry';
const DASHSCOPE_DEFAULT_BASE_URL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
type VoiceVendor = 'OpenAI Compatible' | 'DashScope';
const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => {
  const id = (rawId || '').trim();
@@ -249,11 +255,11 @@ const AddVoiceModal: React.FC<{
  onSuccess: (voice: Voice) => Promise<void>;
  initialVoice?: Voice;
}> = ({ isOpen, onClose, onSuccess, initialVoice }) => {
- const [vendor, setVendor] = useState<'OpenAI Compatible'>('OpenAI Compatible');
+ const [vendor, setVendor] = useState<VoiceVendor>('OpenAI Compatible');
  const [name, setName] = useState('');
  const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL);
- const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna');
+ const [sfVoiceId, setSfVoiceId] = useState(OPENAI_COMPATIBLE_DEFAULT_VOICE);
  const [sfSpeed, setSfSpeed] = useState(1);
  const [sfGain, setSfGain] = useState(0);
  const [sfPitch, setSfPitch] = useState(0);
@@ -270,10 +276,33 @@ const AddVoiceModal: React.FC<{
const testAudioRef = useRef<HTMLAudioElement | null>(null);

useEffect(() => {
- if (!initialVoice) return;
- const nextVendor = 'OpenAI Compatible';
- const nextModel = initialVoice.model || OPENAI_COMPATIBLE_DEFAULT_MODEL;
- const defaultVoiceKey = buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
+ if (!isOpen) return;
+ if (!initialVoice) {
+   setVendor('OpenAI Compatible');
+   setName('');
+   setGender('Female');
+   setLanguage('zh');
+   setDescription('');
+   setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
+   setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
+   setSfSpeed(1);
+   setSfGain(0);
+   setSfPitch(0);
+   setApiKey('');
+   setBaseUrl('');
+   setTestInput('你好,正在测试语音合成效果。');
+   return;
+ }
+ const nextVendor: VoiceVendor = String(initialVoice.vendor || '').trim().toLowerCase() === 'dashscope'
+   ? 'DashScope'
+   : 'OpenAI Compatible';
+ const nextModel = (initialVoice.model || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL)).trim();
+ const defaultVoiceKey = nextVendor === 'DashScope'
+   ? DASHSCOPE_DEFAULT_VOICE
+   : buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
  setVendor(nextVendor);
  setName(initialVoice.name || '');
  setGender(initialVoice.gender || 'Female');
@@ -285,7 +314,7 @@ const AddVoiceModal: React.FC<{
  setSfGain(initialVoice.gain ?? 0);
  setSfPitch(initialVoice.pitch ?? 0);
  setApiKey(initialVoice.apiKey || '');
- setBaseUrl(initialVoice.baseUrl || '');
+ setBaseUrl(initialVoice.baseUrl || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : ''));
}, [initialVoice, isOpen]);

const handleAudition = async () => {
@@ -316,10 +345,23 @@ const AddVoiceModal: React.FC<{
    return;
  }
+ const resolvedModel = (() => {
+   const current = (openaiCompatibleModel || '').trim();
+   if (current) return current;
+   return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL;
+ })();
  const resolvedVoiceKey = (() => {
    const current = (sfVoiceId || '').trim();
    if (current) return current;
-   return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, openaiCompatibleModel || OPENAI_COMPATIBLE_DEFAULT_MODEL);
+   if (vendor === 'DashScope') return DASHSCOPE_DEFAULT_VOICE;
+   return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, resolvedModel);
  })();
+ const resolvedBaseUrl = (() => {
+   const current = (baseUrl || '').trim();
+   if (current) return current;
+   return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : '';
+ })();
const newVoice: Voice = {
@@ -328,11 +370,11 @@ const AddVoiceModal: React.FC<{
  vendor,
  gender,
  language,
- description: description || `Model: ${openaiCompatibleModel}`,
- model: openaiCompatibleModel,
+ description: description || `Model: ${resolvedModel}`,
+ model: resolvedModel,
  voiceKey: resolvedVoiceKey,
  apiKey,
- baseUrl,
+ baseUrl: resolvedBaseUrl,
  speed: sfSpeed,
  gain: sfGain,
  pitch: sfPitch,
@@ -346,6 +388,11 @@ const AddVoiceModal: React.FC<{
  setDescription('');
  setApiKey('');
  setBaseUrl('');
  setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
  setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
  setSfSpeed(1);
  setSfGain(0);
  setSfPitch(0);
} catch (error: any) {
  alert(error?.message || '保存失败');
} finally {
@@ -370,7 +417,10 @@ const AddVoiceModal: React.FC<{
<div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar"> <div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar">
<div className="space-y-1.5"> <div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Vendor)</label> <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Vendor)</label>
<Input value={vendor} readOnly className="h-10 border border-white/10 bg-white/5" /> <Select value={vendor} onChange={(e) => setVendor(e.target.value as VoiceVendor)}>
<option value="OpenAI Compatible">OpenAI Compatible</option>
<option value="DashScope">DashScope</option>
</Select>
</div> </div>
<div className="h-px bg-white/5"></div> <div className="h-px bg-white/5"></div>
@@ -388,12 +438,16 @@ const AddVoiceModal: React.FC<{
className="font-mono text-xs" className="font-mono text-xs"
value={openaiCompatibleModel} value={openaiCompatibleModel}
onChange={(e) => setOpenaiCompatibleModel(e.target.value)} onChange={(e) => setOpenaiCompatibleModel(e.target.value)}
placeholder="例如: FunAudioLLM/CosyVoice2-0.5B" placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL}
/> />
</div> </div>
<div className="space-y-1.5"> <div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> ID (Voice)</label> <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> ID (Voice)</label>
<Input value={sfVoiceId} onChange={(e) => setSfVoiceId(e.target.value)} placeholder="FunAudioLLM/CosyVoice2-0.5B:anna" /> <Input
value={sfVoiceId}
onChange={(e) => setSfVoiceId(e.target.value)}
placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_VOICE : OPENAI_COMPATIBLE_DEFAULT_VOICE}
/>
</div> </div>
</div> </div>
@@ -429,7 +483,11 @@ const AddVoiceModal: React.FC<{
</div>

<div className="space-y-1.5">
  <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">Base URL</label>
- <Input value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} placeholder="https://.../v1" />
+ <Input
+   value={baseUrl}
+   onChange={(e) => setBaseUrl(e.target.value)}
+   placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : 'https://.../v1'}
+ />
</div>
</div>

View File

@@ -60,6 +60,9 @@ const mapVoice = (raw: AnyRecord): Voice => ({
name: readField(raw, ['name'], ''),
vendor: ((): string => {
  const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase();
  if (vendor === 'dashscope') {
    return 'DashScope';
  }
  if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') {
    return 'OpenAI Compatible';
  }