Voice library support dashscope
This commit is contained in:
@@ -34,6 +34,12 @@ def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_dashscope_vendor(vendor: Optional[str]) -> bool:
|
||||||
|
return (vendor or "").strip().lower() in {
|
||||||
|
"dashscope",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
|
def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
|
||||||
raw = (voice_value or "").strip()
|
raw = (voice_value or "").strip()
|
||||||
model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
|
model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
|
||||||
@@ -121,7 +127,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
|
|||||||
elif assistant.voice:
|
elif assistant.voice:
|
||||||
voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
|
voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
|
||||||
if voice:
|
if voice:
|
||||||
tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
|
if _is_dashscope_vendor(voice.vendor):
|
||||||
|
tts_provider = "dashscope"
|
||||||
|
elif _is_openai_compatible_vendor(voice.vendor):
|
||||||
|
tts_provider = "openai_compatible"
|
||||||
|
else:
|
||||||
|
tts_provider = "edge"
|
||||||
model = voice.model
|
model = voice.model
|
||||||
runtime_voice = voice.voice_key or voice.id
|
runtime_voice = voice.voice_key or voice.id
|
||||||
if tts_provider == "openai_compatible":
|
if tts_provider == "openai_compatible":
|
||||||
@@ -131,8 +142,8 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
|
|||||||
"enabled": True,
|
"enabled": True,
|
||||||
"provider": tts_provider,
|
"provider": tts_provider,
|
||||||
"model": model,
|
"model": model,
|
||||||
"apiKey": voice.api_key if tts_provider == "openai_compatible" else None,
|
"apiKey": voice.api_key if tts_provider in {"openai_compatible", "dashscope"} else None,
|
||||||
"baseUrl": voice.base_url if tts_provider == "openai_compatible" else None,
|
"baseUrl": voice.base_url if tts_provider in {"openai_compatible", "dashscope"} else None,
|
||||||
"voice": runtime_voice,
|
"voice": runtime_voice,
|
||||||
"speed": assistant.speed or voice.speed,
|
"speed": assistant.speed or voice.speed,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
import base64
|
import base64
|
||||||
|
import io
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
from typing import Optional
|
import threading
|
||||||
|
import wave
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
@@ -14,6 +18,203 @@ from ..schemas import VoiceCreate, VoiceOut, VoicePreviewRequest, VoicePreviewRe
|
|||||||
router = APIRouter(prefix="/voices", tags=["Voices"])
|
router = APIRouter(prefix="/voices", tags=["Voices"])
|
||||||
|
|
||||||
OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
|
OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
|
||||||
|
DASHSCOPE_DEFAULT_MODEL = "qwen3-tts-flash-realtime"
|
||||||
|
DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry"
|
||||||
|
DASHSCOPE_DEFAULT_BASE_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
|
||||||
|
try:
|
||||||
|
import dashscope
|
||||||
|
from dashscope.audio.qwen_tts_realtime import AudioFormat, QwenTtsRealtime, QwenTtsRealtimeCallback
|
||||||
|
|
||||||
|
DASHSCOPE_SDK_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
dashscope = None # type: ignore[assignment]
|
||||||
|
AudioFormat = None # type: ignore[assignment]
|
||||||
|
QwenTtsRealtime = None # type: ignore[assignment]
|
||||||
|
DASHSCOPE_SDK_AVAILABLE = False
|
||||||
|
|
||||||
|
class QwenTtsRealtimeCallback: # type: ignore[no-redef]
|
||||||
|
"""Fallback callback base when DashScope SDK is unavailable."""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class _DashScopePreviewCallback(QwenTtsRealtimeCallback):
|
||||||
|
"""Collect DashScope realtime callback events and PCM chunks."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self._open_event = threading.Event()
|
||||||
|
self._done_event = threading.Event()
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
self._audio_chunks: list[bytes] = []
|
||||||
|
self._error_message: Optional[str] = None
|
||||||
|
|
||||||
|
def on_open(self) -> None:
|
||||||
|
self._open_event.set()
|
||||||
|
|
||||||
|
def on_close(self, code: int, reason: str) -> None:
|
||||||
|
if not self._done_event.is_set():
|
||||||
|
self._error_message = f"DashScope websocket closed unexpectedly: {code} {reason}"
|
||||||
|
self._done_event.set()
|
||||||
|
|
||||||
|
def on_error(self, message: str) -> None:
|
||||||
|
self._error_message = str(message)
|
||||||
|
self._done_event.set()
|
||||||
|
|
||||||
|
def on_event(self, response: Any) -> None:
|
||||||
|
payload = _coerce_dashscope_event(response)
|
||||||
|
event_type = str(payload.get("type") or "").strip()
|
||||||
|
if event_type == "response.audio.delta":
|
||||||
|
delta = payload.get("delta")
|
||||||
|
if isinstance(delta, str):
|
||||||
|
try:
|
||||||
|
self._append_audio(base64.b64decode(delta))
|
||||||
|
except Exception:
|
||||||
|
return
|
||||||
|
elif event_type in {"response.done", "session.finished"}:
|
||||||
|
self._done_event.set()
|
||||||
|
elif event_type == "error":
|
||||||
|
self._error_message = _format_dashscope_error_event(payload)
|
||||||
|
self._done_event.set()
|
||||||
|
|
||||||
|
def on_data(self, data: bytes) -> None:
|
||||||
|
# Some SDK versions emit raw PCM frames via on_data.
|
||||||
|
if isinstance(data, (bytes, bytearray)):
|
||||||
|
self._append_audio(bytes(data))
|
||||||
|
|
||||||
|
def wait_for_open(self, timeout: float = 10.0) -> None:
|
||||||
|
if not self._open_event.wait(timeout):
|
||||||
|
raise TimeoutError("DashScope websocket open timeout")
|
||||||
|
|
||||||
|
def wait_for_done(self, timeout: float = 45.0) -> None:
|
||||||
|
if not self._done_event.wait(timeout):
|
||||||
|
raise TimeoutError("DashScope synthesis timeout")
|
||||||
|
|
||||||
|
def raise_if_error(self) -> None:
|
||||||
|
if self._error_message:
|
||||||
|
raise RuntimeError(self._error_message)
|
||||||
|
|
||||||
|
def read_audio(self) -> bytes:
|
||||||
|
with self._lock:
|
||||||
|
return b"".join(self._audio_chunks)
|
||||||
|
|
||||||
|
def _append_audio(self, chunk: bytes) -> None:
|
||||||
|
if not chunk:
|
||||||
|
return
|
||||||
|
with self._lock:
|
||||||
|
self._audio_chunks.append(chunk)
|
||||||
|
|
||||||
|
|
||||||
|
def _coerce_dashscope_event(response: Any) -> Dict[str, Any]:
|
||||||
|
if isinstance(response, dict):
|
||||||
|
return response
|
||||||
|
if isinstance(response, str):
|
||||||
|
try:
|
||||||
|
parsed = json.loads(response)
|
||||||
|
if isinstance(parsed, dict):
|
||||||
|
return parsed
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
return {"type": "raw", "message": str(response)}
|
||||||
|
|
||||||
|
|
||||||
|
def _format_dashscope_error_event(payload: Dict[str, Any]) -> str:
|
||||||
|
error = payload.get("error")
|
||||||
|
if isinstance(error, dict):
|
||||||
|
code = str(error.get("code") or "").strip()
|
||||||
|
message = str(error.get("message") or "").strip()
|
||||||
|
if code and message:
|
||||||
|
return f"{code}: {message}"
|
||||||
|
return message or str(error)
|
||||||
|
return str(error or "DashScope realtime TTS error")
|
||||||
|
|
||||||
|
|
||||||
|
def _create_dashscope_realtime_client(*, model: str, callback: _DashScopePreviewCallback, url: str, api_key: str) -> Any:
|
||||||
|
if QwenTtsRealtime is None:
|
||||||
|
raise RuntimeError("DashScope SDK unavailable")
|
||||||
|
|
||||||
|
init_kwargs = {
|
||||||
|
"model": model,
|
||||||
|
"callback": callback,
|
||||||
|
"url": url,
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
return QwenTtsRealtime(api_key=api_key, **init_kwargs) # type: ignore[misc]
|
||||||
|
except TypeError as exc:
|
||||||
|
if "api_key" not in str(exc):
|
||||||
|
raise
|
||||||
|
return QwenTtsRealtime(**init_kwargs) # type: ignore[misc]
|
||||||
|
|
||||||
|
|
||||||
|
def _pcm16_to_wav_bytes(pcm_bytes: bytes, sample_rate: int = 24000) -> bytes:
|
||||||
|
with io.BytesIO() as buffer:
|
||||||
|
with wave.open(buffer, "wb") as wav_file:
|
||||||
|
wav_file.setnchannels(1)
|
||||||
|
wav_file.setsampwidth(2)
|
||||||
|
wav_file.setframerate(sample_rate)
|
||||||
|
wav_file.writeframes(pcm_bytes)
|
||||||
|
return buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _synthesize_dashscope_preview(
|
||||||
|
*,
|
||||||
|
text: str,
|
||||||
|
api_key: str,
|
||||||
|
base_url: str,
|
||||||
|
model: str,
|
||||||
|
voice_key: str,
|
||||||
|
speed: Optional[float],
|
||||||
|
) -> bytes:
|
||||||
|
if not DASHSCOPE_SDK_AVAILABLE:
|
||||||
|
raise RuntimeError("dashscope package not installed; install with `pip install dashscope>=1.25.11`")
|
||||||
|
if not AudioFormat:
|
||||||
|
raise RuntimeError("DashScope SDK AudioFormat unavailable")
|
||||||
|
|
||||||
|
callback = _DashScopePreviewCallback()
|
||||||
|
if dashscope is not None:
|
||||||
|
dashscope.api_key = api_key
|
||||||
|
client = _create_dashscope_realtime_client(
|
||||||
|
model=model,
|
||||||
|
callback=callback,
|
||||||
|
url=base_url,
|
||||||
|
api_key=api_key,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
client.connect()
|
||||||
|
callback.wait_for_open()
|
||||||
|
session_kwargs: Dict[str, Any] = {
|
||||||
|
"voice": voice_key,
|
||||||
|
"response_format": AudioFormat.PCM_24000HZ_MONO_16BIT,
|
||||||
|
"mode": "commit",
|
||||||
|
}
|
||||||
|
# speech_rate is supported by qwen3-* realtime models.
|
||||||
|
normalized_model = str(model or "").strip().lower()
|
||||||
|
if speed is not None and normalized_model.startswith("qwen3-"):
|
||||||
|
session_kwargs["speech_rate"] = max(0.5, min(2.0, float(speed)))
|
||||||
|
client.update_session(**session_kwargs)
|
||||||
|
client.append_text(text)
|
||||||
|
client.commit()
|
||||||
|
callback.wait_for_done()
|
||||||
|
callback.raise_if_error()
|
||||||
|
pcm_audio = callback.read_audio()
|
||||||
|
if not pcm_audio:
|
||||||
|
raise RuntimeError("No audio chunk returned from DashScope realtime synthesis")
|
||||||
|
return _pcm16_to_wav_bytes(pcm_audio, sample_rate=24000)
|
||||||
|
finally:
|
||||||
|
finish_fn = getattr(client, "finish", None)
|
||||||
|
if callable(finish_fn):
|
||||||
|
try:
|
||||||
|
finish_fn()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
close_fn = getattr(client, "close", None)
|
||||||
|
if callable(close_fn):
|
||||||
|
try:
|
||||||
|
close_fn()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _is_openai_compatible_vendor(vendor: str) -> bool:
|
def _is_openai_compatible_vendor(vendor: str) -> bool:
|
||||||
@@ -26,9 +227,18 @@ def _is_openai_compatible_vendor(vendor: str) -> bool:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_dashscope_vendor(vendor: str) -> bool:
|
||||||
|
normalized = (vendor or "").strip().lower()
|
||||||
|
return normalized in {
|
||||||
|
"dashscope",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _default_base_url(vendor: str) -> Optional[str]:
|
def _default_base_url(vendor: str) -> Optional[str]:
|
||||||
if _is_openai_compatible_vendor(vendor):
|
if _is_openai_compatible_vendor(vendor):
|
||||||
return "https://api.siliconflow.cn/v1"
|
return "https://api.siliconflow.cn/v1"
|
||||||
|
if _is_dashscope_vendor(vendor):
|
||||||
|
return DASHSCOPE_DEFAULT_BASE_URL
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@@ -76,6 +286,9 @@ def create_voice(data: VoiceCreate, db: Session = Depends(get_db)):
|
|||||||
if not voice_key:
|
if not voice_key:
|
||||||
raw_id = (data.id or data.name).strip()
|
raw_id = (data.id or data.name).strip()
|
||||||
voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}"
|
voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}"
|
||||||
|
elif _is_dashscope_vendor(vendor):
|
||||||
|
model = (model or "").strip() or DASHSCOPE_DEFAULT_MODEL
|
||||||
|
voice_key = (voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY
|
||||||
|
|
||||||
voice = Voice(
|
voice = Voice(
|
||||||
id=unique_short_id("tts", db, Voice),
|
id=unique_short_id("tts", db, Voice),
|
||||||
@@ -126,6 +339,11 @@ def update_voice(id: str, data: VoiceUpdate, db: Session = Depends(get_db)):
|
|||||||
voice_key = update_data.get("voice_key") or voice.voice_key
|
voice_key = update_data.get("voice_key") or voice.voice_key
|
||||||
update_data["model"] = model
|
update_data["model"] = model
|
||||||
update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model)
|
update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model)
|
||||||
|
elif _is_dashscope_vendor(vendor_for_defaults):
|
||||||
|
model = update_data.get("model") or voice.model or DASHSCOPE_DEFAULT_MODEL
|
||||||
|
voice_key = update_data.get("voice_key") or voice.voice_key or DASHSCOPE_DEFAULT_VOICE_KEY
|
||||||
|
update_data["model"] = model
|
||||||
|
update_data["voice_key"] = voice_key
|
||||||
|
|
||||||
for field, value in update_data.items():
|
for field, value in update_data.items():
|
||||||
setattr(voice, field, value)
|
setattr(voice, field, value)
|
||||||
@@ -148,7 +366,7 @@ def delete_voice(id: str, db: Session = Depends(get_db)):
|
|||||||
|
|
||||||
@router.post("/{id}/preview", response_model=VoicePreviewResponse)
|
@router.post("/{id}/preview", response_model=VoicePreviewResponse)
|
||||||
def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_db)):
|
def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_db)):
|
||||||
"""试听指定声音,基于 OpenAI-compatible /audio/speech 接口。"""
|
"""试听指定声音,支持 OpenAI-compatible 与 DashScope Realtime。"""
|
||||||
voice = db.query(Voice).filter(Voice.id == id).first()
|
voice = db.query(Voice).filter(Voice.id == id).first()
|
||||||
if not voice:
|
if not voice:
|
||||||
raise HTTPException(status_code=404, detail="Voice not found")
|
raise HTTPException(status_code=404, detail="Voice not found")
|
||||||
@@ -157,6 +375,31 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_
|
|||||||
if not text:
|
if not text:
|
||||||
raise HTTPException(status_code=400, detail="Preview text cannot be empty")
|
raise HTTPException(status_code=400, detail="Preview text cannot be empty")
|
||||||
|
|
||||||
|
if _is_dashscope_vendor(voice.vendor):
|
||||||
|
api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
|
||||||
|
if not api_key:
|
||||||
|
api_key = os.getenv("DASHSCOPE_API_KEY", "").strip() or os.getenv("TTS_API_KEY", "").strip()
|
||||||
|
if not api_key:
|
||||||
|
raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}")
|
||||||
|
|
||||||
|
base_url = (voice.base_url or "").strip() or DASHSCOPE_DEFAULT_BASE_URL
|
||||||
|
model = (voice.model or "").strip() or DASHSCOPE_DEFAULT_MODEL
|
||||||
|
voice_key = (voice.voice_key or "").strip() or DASHSCOPE_DEFAULT_VOICE_KEY
|
||||||
|
effective_speed = data.speed if data.speed is not None else voice.speed
|
||||||
|
try:
|
||||||
|
wav_bytes = _synthesize_dashscope_preview(
|
||||||
|
text=text,
|
||||||
|
api_key=api_key,
|
||||||
|
base_url=base_url,
|
||||||
|
model=model,
|
||||||
|
voice_key=voice_key,
|
||||||
|
speed=effective_speed,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=502, detail=f"DashScope preview failed: {exc}") from exc
|
||||||
|
audio_base64 = base64.b64encode(wav_bytes).decode("utf-8")
|
||||||
|
return VoicePreviewResponse(success=True, audio_url=f"data:audio/wav;base64,{audio_base64}")
|
||||||
|
|
||||||
api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
|
api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
|
||||||
if not api_key and _is_openai_compatible_vendor(voice.vendor):
|
if not api_key and _is_openai_compatible_vendor(voice.vendor):
|
||||||
api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
|
api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
|
||||||
|
|||||||
@@ -13,12 +13,16 @@ from app.id_generator import short_id
|
|||||||
from app.models import Voice, Assistant, KnowledgeBase, Workflow, LLMModel, ASRModel, KnowledgeDocument
|
from app.models import Voice, Assistant, KnowledgeBase, Workflow, LLMModel, ASRModel, KnowledgeDocument
|
||||||
|
|
||||||
VOICE_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
|
VOICE_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
|
||||||
|
DASHSCOPE_VOICE_MODEL = "qwen3-tts-flash-realtime"
|
||||||
|
DASHSCOPE_DEFAULT_VOICE_KEY = "Cherry"
|
||||||
|
DASHSCOPE_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
|
||||||
SEED_VOICE_IDS = {
|
SEED_VOICE_IDS = {
|
||||||
"alex": short_id("tts"),
|
"alex": short_id("tts"),
|
||||||
"david": short_id("tts"),
|
"david": short_id("tts"),
|
||||||
"bella": short_id("tts"),
|
"bella": short_id("tts"),
|
||||||
"claire": short_id("tts"),
|
"claire": short_id("tts"),
|
||||||
|
"dashscope_cherry": short_id("tts"),
|
||||||
}
|
}
|
||||||
|
|
||||||
SEED_LLM_IDS = {
|
SEED_LLM_IDS = {
|
||||||
@@ -177,8 +181,20 @@ def init_default_data():
|
|||||||
voice_key=f"{VOICE_MODEL}:claire",
|
voice_key=f"{VOICE_MODEL}:claire",
|
||||||
is_system=True,
|
is_system=True,
|
||||||
),
|
),
|
||||||
|
Voice(
|
||||||
|
id=SEED_VOICE_IDS["dashscope_cherry"],
|
||||||
|
name="DashScope Cherry",
|
||||||
|
vendor="DashScope",
|
||||||
|
gender="Female",
|
||||||
|
language="zh",
|
||||||
|
description="DashScope realtime sample voice.",
|
||||||
|
model=DASHSCOPE_VOICE_MODEL,
|
||||||
|
voice_key=DASHSCOPE_DEFAULT_VOICE_KEY,
|
||||||
|
base_url=DASHSCOPE_REALTIME_URL,
|
||||||
|
is_system=True,
|
||||||
|
),
|
||||||
]
|
]
|
||||||
seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible CosyVoice 2.0)")
|
seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible + DashScope)")
|
||||||
|
|
||||||
|
|
||||||
def init_default_tools(recreate: bool = False):
|
def init_default_tools(recreate: bool = False):
|
||||||
|
|||||||
@@ -9,3 +9,4 @@ minio==7.2.0
|
|||||||
httpx==0.26.0
|
httpx==0.26.0
|
||||||
chromadb==0.4.22
|
chromadb==0.4.22
|
||||||
openai==1.12.0
|
openai==1.12.0
|
||||||
|
dashscope==1.25.11
|
||||||
|
|||||||
@@ -186,9 +186,11 @@ class TestAssistantAPI:
|
|||||||
sample_asr_model_data["vendor"] = "OpenAI Compatible"
|
sample_asr_model_data["vendor"] = "OpenAI Compatible"
|
||||||
llm_resp = client.post("/api/llm", json=sample_llm_model_data)
|
llm_resp = client.post("/api/llm", json=sample_llm_model_data)
|
||||||
assert llm_resp.status_code == 200
|
assert llm_resp.status_code == 200
|
||||||
|
llm_id = llm_resp.json()["id"]
|
||||||
|
|
||||||
asr_resp = client.post("/api/asr", json=sample_asr_model_data)
|
asr_resp = client.post("/api/asr", json=sample_asr_model_data)
|
||||||
assert asr_resp.status_code == 200
|
assert asr_resp.status_code == 200
|
||||||
|
asr_id = asr_resp.json()["id"]
|
||||||
|
|
||||||
sample_voice_data["vendor"] = "OpenAI Compatible"
|
sample_voice_data["vendor"] = "OpenAI Compatible"
|
||||||
sample_voice_data["base_url"] = "https://tts.example.com/v1/audio/speech"
|
sample_voice_data["base_url"] = "https://tts.example.com/v1/audio/speech"
|
||||||
@@ -198,8 +200,8 @@ class TestAssistantAPI:
|
|||||||
voice_id = voice_resp.json()["id"]
|
voice_id = voice_resp.json()["id"]
|
||||||
|
|
||||||
sample_assistant_data.update({
|
sample_assistant_data.update({
|
||||||
"llmModelId": sample_llm_model_data["id"],
|
"llmModelId": llm_id,
|
||||||
"asrModelId": sample_asr_model_data["id"],
|
"asrModelId": asr_id,
|
||||||
"voice": voice_id,
|
"voice": voice_id,
|
||||||
"prompt": "runtime prompt",
|
"prompt": "runtime prompt",
|
||||||
"opener": "runtime opener",
|
"opener": "runtime opener",
|
||||||
@@ -220,7 +222,8 @@ class TestAssistantAPI:
|
|||||||
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
||||||
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
||||||
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
|
assert metadata["services"]["asr"]["baseUrl"] == sample_asr_model_data["base_url"]
|
||||||
assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"]
|
expected_tts_voice = f"{sample_voice_data['model']}:{sample_voice_data['voice_key']}"
|
||||||
|
assert metadata["services"]["tts"]["voice"] == expected_tts_voice
|
||||||
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
|
assert metadata["services"]["tts"]["baseUrl"] == sample_voice_data["base_url"]
|
||||||
|
|
||||||
def test_get_engine_config_endpoint(self, client, sample_assistant_data):
|
def test_get_engine_config_endpoint(self, client, sample_assistant_data):
|
||||||
@@ -252,6 +255,38 @@ class TestAssistantAPI:
|
|||||||
assert metadata["output"]["mode"] == "text"
|
assert metadata["output"]["mode"] == "text"
|
||||||
assert metadata["services"]["tts"]["enabled"] is False
|
assert metadata["services"]["tts"]["enabled"] is False
|
||||||
|
|
||||||
|
def test_runtime_config_dashscope_voice_provider(self, client, sample_assistant_data):
|
||||||
|
"""DashScope voices should map to dashscope tts provider in runtime metadata."""
|
||||||
|
voice_resp = client.post("/api/voices", json={
|
||||||
|
"name": "DashScope Cherry",
|
||||||
|
"vendor": "DashScope",
|
||||||
|
"gender": "Female",
|
||||||
|
"language": "zh",
|
||||||
|
"description": "dashscope voice",
|
||||||
|
"api_key": "dashscope-key",
|
||||||
|
"base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
|
||||||
|
})
|
||||||
|
assert voice_resp.status_code == 200
|
||||||
|
voice_payload = voice_resp.json()
|
||||||
|
|
||||||
|
sample_assistant_data.update({
|
||||||
|
"voice": voice_payload["id"],
|
||||||
|
"voiceOutputEnabled": True,
|
||||||
|
})
|
||||||
|
assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
|
||||||
|
assert assistant_resp.status_code == 200
|
||||||
|
assistant_id = assistant_resp.json()["id"]
|
||||||
|
|
||||||
|
runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
|
||||||
|
assert runtime_resp.status_code == 200
|
||||||
|
metadata = runtime_resp.json()["sessionStartMetadata"]
|
||||||
|
tts = metadata["services"]["tts"]
|
||||||
|
assert tts["provider"] == "dashscope"
|
||||||
|
assert tts["voice"] == "Cherry"
|
||||||
|
assert tts["model"] == "qwen3-tts-flash-realtime"
|
||||||
|
assert tts["apiKey"] == "dashscope-key"
|
||||||
|
assert tts["baseUrl"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
|
||||||
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
|
def test_assistant_interrupt_and_generated_opener_flags(self, client, sample_assistant_data):
|
||||||
sample_assistant_data.update({
|
sample_assistant_data.update({
|
||||||
"firstTurnMode": "user_first",
|
"firstTurnMode": "user_first",
|
||||||
|
|||||||
@@ -171,8 +171,9 @@ class TestVoiceAPI:
|
|||||||
"voice_key": "FunAudioLLM/CosyVoice2-0.5B:anna"
|
"voice_key": "FunAudioLLM/CosyVoice2-0.5B:anna"
|
||||||
})
|
})
|
||||||
assert create_resp.status_code == 200
|
assert create_resp.status_code == 200
|
||||||
|
voice_id = create_resp.json()["id"]
|
||||||
|
|
||||||
preview_resp = client.post("/api/voices/anna/preview", json={"text": "你好"})
|
preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"})
|
||||||
assert preview_resp.status_code == 200
|
assert preview_resp.status_code == 200
|
||||||
payload = preview_resp.json()
|
payload = preview_resp.json()
|
||||||
assert payload["success"] is True
|
assert payload["success"] is True
|
||||||
@@ -228,8 +229,103 @@ class TestVoiceAPI:
|
|||||||
"base_url": "https://api.siliconflow.cn/v1"
|
"base_url": "https://api.siliconflow.cn/v1"
|
||||||
})
|
})
|
||||||
assert create_resp.status_code == 200
|
assert create_resp.status_code == 200
|
||||||
|
voice_id = create_resp.json()["id"]
|
||||||
|
|
||||||
preview_resp = client.post("/api/voices/anna2/preview", json={"text": "hello"})
|
preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "hello"})
|
||||||
assert preview_resp.status_code == 200
|
assert preview_resp.status_code == 200
|
||||||
assert captured_auth["value"] == "Bearer voice-key-123"
|
assert captured_auth["value"] == "Bearer voice-key-123"
|
||||||
assert captured_url["value"] == "https://api.siliconflow.cn/v1/audio/speech"
|
assert captured_url["value"] == "https://api.siliconflow.cn/v1/audio/speech"
|
||||||
|
|
||||||
|
def test_create_voice_dashscope_defaults(self, client):
|
||||||
|
"""Test creating DashScope voice applies model/voice defaults."""
|
||||||
|
create_resp = client.post("/api/voices", json={
|
||||||
|
"name": "DashScope Voice",
|
||||||
|
"vendor": "DashScope",
|
||||||
|
"gender": "Female",
|
||||||
|
"language": "zh",
|
||||||
|
"description": "dashscope",
|
||||||
|
})
|
||||||
|
assert create_resp.status_code == 200
|
||||||
|
payload = create_resp.json()
|
||||||
|
assert payload["vendor"] == "DashScope"
|
||||||
|
assert payload["model"] == "qwen3-tts-flash-realtime"
|
||||||
|
assert payload["voice_key"] == "Cherry"
|
||||||
|
|
||||||
|
def test_preview_voice_dashscope_success(self, client, monkeypatch):
|
||||||
|
"""DashScope voice preview should return playable wav data url."""
|
||||||
|
from app.routers import voices as voice_router
|
||||||
|
|
||||||
|
captured = {
|
||||||
|
"api_key": "",
|
||||||
|
"model": "",
|
||||||
|
"url": "",
|
||||||
|
"session": {},
|
||||||
|
"text": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
class DummyAudioFormat:
|
||||||
|
PCM_24000HZ_MONO_16BIT = "pcm24k16mono"
|
||||||
|
|
||||||
|
class DummyDashScopeModule:
|
||||||
|
api_key = ""
|
||||||
|
|
||||||
|
class DummyRealtime:
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
captured["api_key"] = kwargs.get("api_key", "")
|
||||||
|
captured["model"] = kwargs.get("model", "")
|
||||||
|
captured["url"] = kwargs.get("url", "")
|
||||||
|
self.callback = kwargs["callback"]
|
||||||
|
|
||||||
|
def connect(self):
|
||||||
|
self.callback.on_open()
|
||||||
|
|
||||||
|
def update_session(self, **kwargs):
|
||||||
|
captured["session"] = kwargs
|
||||||
|
|
||||||
|
def append_text(self, text):
|
||||||
|
captured["text"] = text
|
||||||
|
|
||||||
|
def commit(self):
|
||||||
|
# 16-bit PCM mono samples
|
||||||
|
raw_pcm = b"\x00\x00\x01\x00\x02\x00\x03\x00"
|
||||||
|
self.callback.on_event({
|
||||||
|
"type": "response.audio.delta",
|
||||||
|
"delta": base64.b64encode(raw_pcm).decode("utf-8"),
|
||||||
|
})
|
||||||
|
self.callback.on_event({"type": "response.done"})
|
||||||
|
|
||||||
|
def finish(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
monkeypatch.setattr(voice_router, "DASHSCOPE_SDK_AVAILABLE", True)
|
||||||
|
monkeypatch.setattr(voice_router, "AudioFormat", DummyAudioFormat)
|
||||||
|
monkeypatch.setattr(voice_router, "QwenTtsRealtime", DummyRealtime)
|
||||||
|
monkeypatch.setattr(voice_router, "dashscope", DummyDashScopeModule())
|
||||||
|
|
||||||
|
create_resp = client.post("/api/voices", json={
|
||||||
|
"name": "DashScope Voice",
|
||||||
|
"vendor": "DashScope",
|
||||||
|
"gender": "Female",
|
||||||
|
"language": "zh",
|
||||||
|
"description": "dashscope",
|
||||||
|
"api_key": "dashscope-key",
|
||||||
|
"base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
|
||||||
|
})
|
||||||
|
assert create_resp.status_code == 200
|
||||||
|
voice_id = create_resp.json()["id"]
|
||||||
|
|
||||||
|
preview_resp = client.post(f"/api/voices/{voice_id}/preview", json={"text": "你好"})
|
||||||
|
assert preview_resp.status_code == 200
|
||||||
|
payload = preview_resp.json()
|
||||||
|
assert payload["success"] is True
|
||||||
|
assert payload["audio_url"].startswith("data:audio/wav;base64,")
|
||||||
|
encoded = payload["audio_url"].split(",", 1)[1]
|
||||||
|
wav_bytes = base64.b64decode(encoded)
|
||||||
|
assert wav_bytes.startswith(b"RIFF")
|
||||||
|
assert captured["model"] == "qwen3-tts-flash-realtime"
|
||||||
|
assert captured["url"] == "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
assert captured["text"] == "你好"
|
||||||
|
assert captured["session"]["voice"] == "Cherry"
|
||||||
|
|||||||
62
engine/agents/default.yaml
Normal file
62
engine/agents/default.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# Agent behavior configuration (safe to edit per profile)
|
||||||
|
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
|
||||||
|
# Infra/server/network settings should stay in .env.
|
||||||
|
|
||||||
|
agent:
|
||||||
|
vad:
|
||||||
|
type: silero
|
||||||
|
model_path: data/vad/silero_vad.onnx
|
||||||
|
threshold: 0.5
|
||||||
|
min_speech_duration_ms: 100
|
||||||
|
eou_threshold_ms: 800
|
||||||
|
|
||||||
|
llm:
|
||||||
|
# provider: openai | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
model: deepseek-v3
|
||||||
|
temperature: 0.7
|
||||||
|
# Required: no fallback. You can still reference env explicitly.
|
||||||
|
api_key: sk-fc4d59b360475f53401a864db8ce0985010acc4e696723d20a90d6569f38d80a
|
||||||
|
# Optional for OpenAI-compatible endpoints:
|
||||||
|
api_url: https://api.qnaigc.com/v1
|
||||||
|
|
||||||
|
tts:
|
||||||
|
# provider: edge | openai_compatible | siliconflow | dashscope
|
||||||
|
# dashscope defaults (if omitted):
|
||||||
|
provider: dashscope
|
||||||
|
api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
|
||||||
|
model: qwen3-tts-flash-realtime
|
||||||
|
api_key: sk-391f5126d18345d497c6e8717c8c9ad7
|
||||||
|
mode: commit
|
||||||
|
voice: Cherry
|
||||||
|
speed: 1.0
|
||||||
|
# provider: openai_compatible
|
||||||
|
# api_key: sk-thmzysdpqqmhqxxshyqoxvjeiflexjdgaftyufrsgrhpjnyx
|
||||||
|
# api_url: https://api.siliconflow.cn/v1/audio/speech
|
||||||
|
# model: FunAudioLLM/CosyVoice2-0.5B
|
||||||
|
# voice: anna
|
||||||
|
# speed: 1.0
|
||||||
|
|
||||||
|
asr:
|
||||||
|
# provider: buffered | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
api_key: sk-thmzysdpqqmhqxxshyqoxvjeiflexjdgaftyufrsgrhpjnyx
|
||||||
|
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
||||||
|
model: FunAudioLLM/SenseVoiceSmall
|
||||||
|
interim_interval_ms: 500
|
||||||
|
min_audio_ms: 300
|
||||||
|
start_min_speech_ms: 160
|
||||||
|
pre_speech_ms: 240
|
||||||
|
final_tail_ms: 120
|
||||||
|
|
||||||
|
duplex:
|
||||||
|
enabled: true
|
||||||
|
system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
|
||||||
|
|
||||||
|
barge_in:
|
||||||
|
min_duration_ms: 200
|
||||||
|
silence_tolerance_ms: 60
|
||||||
|
|
||||||
|
tools:
|
||||||
|
- calculator
|
||||||
|
- current_time
|
||||||
55
engine/agents/example.yaml
Normal file
55
engine/agents/example.yaml
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
# Agent behavior configuration (safe to edit per profile)
|
||||||
|
# This file only controls agent-side behavior (VAD/LLM/TTS/ASR providers).
|
||||||
|
# Infra/server/network settings should stay in .env.
|
||||||
|
|
||||||
|
agent:
|
||||||
|
vad:
|
||||||
|
type: silero
|
||||||
|
model_path: data/vad/silero_vad.onnx
|
||||||
|
threshold: 0.5
|
||||||
|
min_speech_duration_ms: 100
|
||||||
|
eou_threshold_ms: 800
|
||||||
|
|
||||||
|
llm:
|
||||||
|
# provider: openai | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
model: deepseek-v3
|
||||||
|
temperature: 0.7
|
||||||
|
# Required: no fallback. You can still reference env explicitly.
|
||||||
|
api_key: your_llm_api_key
|
||||||
|
# Optional for OpenAI-compatible endpoints:
|
||||||
|
api_url: https://api.qnaigc.com/v1
|
||||||
|
|
||||||
|
tts:
|
||||||
|
# provider: edge | openai_compatible | siliconflow | dashscope
|
||||||
|
# dashscope defaults (if omitted):
|
||||||
|
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
|
||||||
|
# model: qwen3-tts-flash-realtime
|
||||||
|
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
|
||||||
|
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
|
||||||
|
provider: openai_compatible
|
||||||
|
api_key: your_tts_api_key
|
||||||
|
api_url: https://api.siliconflow.cn/v1/audio/speech
|
||||||
|
model: FunAudioLLM/CosyVoice2-0.5B
|
||||||
|
voice: anna
|
||||||
|
speed: 1.0
|
||||||
|
|
||||||
|
asr:
|
||||||
|
# provider: buffered | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
api_key: your_asr_api_key
|
||||||
|
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
||||||
|
model: FunAudioLLM/SenseVoiceSmall
|
||||||
|
interim_interval_ms: 500
|
||||||
|
min_audio_ms: 300
|
||||||
|
start_min_speech_ms: 160
|
||||||
|
pre_speech_ms: 240
|
||||||
|
final_tail_ms: 120
|
||||||
|
|
||||||
|
duplex:
|
||||||
|
enabled: true
|
||||||
|
system_prompt: You are a helpful, friendly voice assistant. Keep your responses concise and conversational.
|
||||||
|
|
||||||
|
barge_in:
|
||||||
|
min_duration_ms: 200
|
||||||
|
silence_tolerance_ms: 60
|
||||||
78
engine/agents/tools.yaml
Normal file
78
engine/agents/tools.yaml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# Agent behavior configuration with tool declarations.
|
||||||
|
# This profile is an example only.
|
||||||
|
|
||||||
|
agent:
|
||||||
|
vad:
|
||||||
|
type: silero
|
||||||
|
model_path: data/vad/silero_vad.onnx
|
||||||
|
threshold: 0.5
|
||||||
|
min_speech_duration_ms: 100
|
||||||
|
eou_threshold_ms: 800
|
||||||
|
|
||||||
|
llm:
|
||||||
|
# provider: openai | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
model: deepseek-v3
|
||||||
|
temperature: 0.7
|
||||||
|
api_key: your_llm_api_key
|
||||||
|
api_url: https://api.qnaigc.com/v1
|
||||||
|
|
||||||
|
tts:
|
||||||
|
# provider: edge | openai_compatible | siliconflow | dashscope
|
||||||
|
# dashscope defaults (if omitted):
|
||||||
|
# api_url: wss://dashscope.aliyuncs.com/api-ws/v1/realtime
|
||||||
|
# model: qwen3-tts-flash-realtime
|
||||||
|
# dashscope_mode: commit (engine splits) | server_commit (dashscope splits)
|
||||||
|
# note: dashscope_mode/mode is ONLY used when provider=dashscope.
|
||||||
|
provider: openai_compatible
|
||||||
|
api_key: your_tts_api_key
|
||||||
|
api_url: https://api.siliconflow.cn/v1/audio/speech
|
||||||
|
model: FunAudioLLM/CosyVoice2-0.5B
|
||||||
|
voice: anna
|
||||||
|
speed: 1.0
|
||||||
|
|
||||||
|
asr:
|
||||||
|
# provider: buffered | openai_compatible | siliconflow
|
||||||
|
provider: openai_compatible
|
||||||
|
api_key: your_asr_api_key
|
||||||
|
api_url: https://api.siliconflow.cn/v1/audio/transcriptions
|
||||||
|
model: FunAudioLLM/SenseVoiceSmall
|
||||||
|
interim_interval_ms: 500
|
||||||
|
min_audio_ms: 300
|
||||||
|
start_min_speech_ms: 160
|
||||||
|
pre_speech_ms: 240
|
||||||
|
final_tail_ms: 120
|
||||||
|
|
||||||
|
duplex:
|
||||||
|
enabled: true
|
||||||
|
system_prompt: You are a helpful voice assistant with tool-calling support.
|
||||||
|
|
||||||
|
barge_in:
|
||||||
|
min_duration_ms: 200
|
||||||
|
silence_tolerance_ms: 60
|
||||||
|
|
||||||
|
# Tool declarations consumed by the engine at startup.
|
||||||
|
# - String form enables built-in/default tool schema when available.
|
||||||
|
# - Object form provides OpenAI function schema + executor hint.
|
||||||
|
tools:
|
||||||
|
- current_time
|
||||||
|
- calculator
|
||||||
|
- name: weather
|
||||||
|
description: Get weather by city name.
|
||||||
|
parameters:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
city:
|
||||||
|
type: string
|
||||||
|
description: City name, for example "San Francisco".
|
||||||
|
required: [city]
|
||||||
|
executor: server
|
||||||
|
- name: open_map
|
||||||
|
description: Open map app on the client device.
|
||||||
|
parameters:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
query:
|
||||||
|
type: string
|
||||||
|
required: [query]
|
||||||
|
executor: client
|
||||||
@@ -5,6 +5,12 @@ import { Voice } from '../types';
|
|||||||
import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi';
|
import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi';
|
||||||
|
|
||||||
const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
|
const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
|
||||||
|
const OPENAI_COMPATIBLE_DEFAULT_VOICE = 'FunAudioLLM/CosyVoice2-0.5B:anna';
|
||||||
|
const DASHSCOPE_DEFAULT_MODEL = 'qwen3-tts-flash-realtime';
|
||||||
|
const DASHSCOPE_DEFAULT_VOICE = 'Cherry';
|
||||||
|
const DASHSCOPE_DEFAULT_BASE_URL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
|
||||||
|
|
||||||
|
type VoiceVendor = 'OpenAI Compatible' | 'DashScope';
|
||||||
|
|
||||||
const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => {
|
const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => {
|
||||||
const id = (rawId || '').trim();
|
const id = (rawId || '').trim();
|
||||||
@@ -249,11 +255,11 @@ const AddVoiceModal: React.FC<{
|
|||||||
onSuccess: (voice: Voice) => Promise<void>;
|
onSuccess: (voice: Voice) => Promise<void>;
|
||||||
initialVoice?: Voice;
|
initialVoice?: Voice;
|
||||||
}> = ({ isOpen, onClose, onSuccess, initialVoice }) => {
|
}> = ({ isOpen, onClose, onSuccess, initialVoice }) => {
|
||||||
const [vendor, setVendor] = useState<'OpenAI Compatible'>('OpenAI Compatible');
|
const [vendor, setVendor] = useState<VoiceVendor>('OpenAI Compatible');
|
||||||
const [name, setName] = useState('');
|
const [name, setName] = useState('');
|
||||||
|
|
||||||
const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL);
|
const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL);
|
||||||
const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna');
|
const [sfVoiceId, setSfVoiceId] = useState(OPENAI_COMPATIBLE_DEFAULT_VOICE);
|
||||||
const [sfSpeed, setSfSpeed] = useState(1);
|
const [sfSpeed, setSfSpeed] = useState(1);
|
||||||
const [sfGain, setSfGain] = useState(0);
|
const [sfGain, setSfGain] = useState(0);
|
||||||
const [sfPitch, setSfPitch] = useState(0);
|
const [sfPitch, setSfPitch] = useState(0);
|
||||||
@@ -270,10 +276,33 @@ const AddVoiceModal: React.FC<{
|
|||||||
const testAudioRef = useRef<HTMLAudioElement | null>(null);
|
const testAudioRef = useRef<HTMLAudioElement | null>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!initialVoice) return;
|
if (!isOpen) return;
|
||||||
const nextVendor = 'OpenAI Compatible';
|
|
||||||
const nextModel = initialVoice.model || OPENAI_COMPATIBLE_DEFAULT_MODEL;
|
if (!initialVoice) {
|
||||||
const defaultVoiceKey = buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
|
setVendor('OpenAI Compatible');
|
||||||
|
setName('');
|
||||||
|
setGender('Female');
|
||||||
|
setLanguage('zh');
|
||||||
|
setDescription('');
|
||||||
|
setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
|
||||||
|
setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
|
||||||
|
setSfSpeed(1);
|
||||||
|
setSfGain(0);
|
||||||
|
setSfPitch(0);
|
||||||
|
setApiKey('');
|
||||||
|
setBaseUrl('');
|
||||||
|
setTestInput('你好,正在测试语音合成效果。');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const nextVendor: VoiceVendor = String(initialVoice.vendor || '').trim().toLowerCase() === 'dashscope'
|
||||||
|
? 'DashScope'
|
||||||
|
: 'OpenAI Compatible';
|
||||||
|
const nextModel = (initialVoice.model || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL)).trim();
|
||||||
|
const defaultVoiceKey = nextVendor === 'DashScope'
|
||||||
|
? DASHSCOPE_DEFAULT_VOICE
|
||||||
|
: buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
|
||||||
|
|
||||||
setVendor(nextVendor);
|
setVendor(nextVendor);
|
||||||
setName(initialVoice.name || '');
|
setName(initialVoice.name || '');
|
||||||
setGender(initialVoice.gender || 'Female');
|
setGender(initialVoice.gender || 'Female');
|
||||||
@@ -285,7 +314,7 @@ const AddVoiceModal: React.FC<{
|
|||||||
setSfGain(initialVoice.gain ?? 0);
|
setSfGain(initialVoice.gain ?? 0);
|
||||||
setSfPitch(initialVoice.pitch ?? 0);
|
setSfPitch(initialVoice.pitch ?? 0);
|
||||||
setApiKey(initialVoice.apiKey || '');
|
setApiKey(initialVoice.apiKey || '');
|
||||||
setBaseUrl(initialVoice.baseUrl || '');
|
setBaseUrl(initialVoice.baseUrl || (nextVendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : ''));
|
||||||
}, [initialVoice, isOpen]);
|
}, [initialVoice, isOpen]);
|
||||||
|
|
||||||
const handleAudition = async () => {
|
const handleAudition = async () => {
|
||||||
@@ -316,10 +345,23 @@ const AddVoiceModal: React.FC<{
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const resolvedModel = (() => {
|
||||||
|
const current = (openaiCompatibleModel || '').trim();
|
||||||
|
if (current) return current;
|
||||||
|
return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL;
|
||||||
|
})();
|
||||||
|
|
||||||
const resolvedVoiceKey = (() => {
|
const resolvedVoiceKey = (() => {
|
||||||
const current = (sfVoiceId || '').trim();
|
const current = (sfVoiceId || '').trim();
|
||||||
if (current) return current;
|
if (current) return current;
|
||||||
return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, openaiCompatibleModel || OPENAI_COMPATIBLE_DEFAULT_MODEL);
|
if (vendor === 'DashScope') return DASHSCOPE_DEFAULT_VOICE;
|
||||||
|
return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, resolvedModel);
|
||||||
|
})();
|
||||||
|
|
||||||
|
const resolvedBaseUrl = (() => {
|
||||||
|
const current = (baseUrl || '').trim();
|
||||||
|
if (current) return current;
|
||||||
|
return vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : '';
|
||||||
})();
|
})();
|
||||||
|
|
||||||
const newVoice: Voice = {
|
const newVoice: Voice = {
|
||||||
@@ -328,11 +370,11 @@ const AddVoiceModal: React.FC<{
|
|||||||
vendor,
|
vendor,
|
||||||
gender,
|
gender,
|
||||||
language,
|
language,
|
||||||
description: description || `Model: ${openaiCompatibleModel}`,
|
description: description || `Model: ${resolvedModel}`,
|
||||||
model: openaiCompatibleModel,
|
model: resolvedModel,
|
||||||
voiceKey: resolvedVoiceKey,
|
voiceKey: resolvedVoiceKey,
|
||||||
apiKey,
|
apiKey,
|
||||||
baseUrl,
|
baseUrl: resolvedBaseUrl,
|
||||||
speed: sfSpeed,
|
speed: sfSpeed,
|
||||||
gain: sfGain,
|
gain: sfGain,
|
||||||
pitch: sfPitch,
|
pitch: sfPitch,
|
||||||
@@ -346,6 +388,11 @@ const AddVoiceModal: React.FC<{
|
|||||||
setDescription('');
|
setDescription('');
|
||||||
setApiKey('');
|
setApiKey('');
|
||||||
setBaseUrl('');
|
setBaseUrl('');
|
||||||
|
setOpenaiCompatibleModel(OPENAI_COMPATIBLE_DEFAULT_MODEL);
|
||||||
|
setSfVoiceId(OPENAI_COMPATIBLE_DEFAULT_VOICE);
|
||||||
|
setSfSpeed(1);
|
||||||
|
setSfGain(0);
|
||||||
|
setSfPitch(0);
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
alert(error?.message || '保存失败');
|
alert(error?.message || '保存失败');
|
||||||
} finally {
|
} finally {
|
||||||
@@ -370,7 +417,10 @@ const AddVoiceModal: React.FC<{
|
|||||||
<div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar">
|
<div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar">
|
||||||
<div className="space-y-1.5">
|
<div className="space-y-1.5">
|
||||||
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">厂商 (Vendor)</label>
|
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">厂商 (Vendor)</label>
|
||||||
<Input value={vendor} readOnly className="h-10 border border-white/10 bg-white/5" />
|
<Select value={vendor} onChange={(e) => setVendor(e.target.value as VoiceVendor)}>
|
||||||
|
<option value="OpenAI Compatible">OpenAI Compatible</option>
|
||||||
|
<option value="DashScope">DashScope</option>
|
||||||
|
</Select>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="h-px bg-white/5"></div>
|
<div className="h-px bg-white/5"></div>
|
||||||
@@ -388,12 +438,16 @@ const AddVoiceModal: React.FC<{
|
|||||||
className="font-mono text-xs"
|
className="font-mono text-xs"
|
||||||
value={openaiCompatibleModel}
|
value={openaiCompatibleModel}
|
||||||
onChange={(e) => setOpenaiCompatibleModel(e.target.value)}
|
onChange={(e) => setOpenaiCompatibleModel(e.target.value)}
|
||||||
placeholder="例如: FunAudioLLM/CosyVoice2-0.5B"
|
placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_MODEL : OPENAI_COMPATIBLE_DEFAULT_MODEL}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<div className="space-y-1.5">
|
<div className="space-y-1.5">
|
||||||
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">声音 ID (Voice)</label>
|
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">声音 ID (Voice)</label>
|
||||||
<Input value={sfVoiceId} onChange={(e) => setSfVoiceId(e.target.value)} placeholder="FunAudioLLM/CosyVoice2-0.5B:anna" />
|
<Input
|
||||||
|
value={sfVoiceId}
|
||||||
|
onChange={(e) => setSfVoiceId(e.target.value)}
|
||||||
|
placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_VOICE : OPENAI_COMPATIBLE_DEFAULT_VOICE}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -429,7 +483,11 @@ const AddVoiceModal: React.FC<{
|
|||||||
</div>
|
</div>
|
||||||
<div className="space-y-1.5">
|
<div className="space-y-1.5">
|
||||||
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">Base URL</label>
|
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">Base URL</label>
|
||||||
<Input value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} placeholder="https://.../v1" />
|
<Input
|
||||||
|
value={baseUrl}
|
||||||
|
onChange={(e) => setBaseUrl(e.target.value)}
|
||||||
|
placeholder={vendor === 'DashScope' ? DASHSCOPE_DEFAULT_BASE_URL : 'https://.../v1'}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -60,6 +60,9 @@ const mapVoice = (raw: AnyRecord): Voice => ({
|
|||||||
name: readField(raw, ['name'], ''),
|
name: readField(raw, ['name'], ''),
|
||||||
vendor: ((): string => {
|
vendor: ((): string => {
|
||||||
const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase();
|
const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase();
|
||||||
|
if (vendor === 'dashscope') {
|
||||||
|
return 'DashScope';
|
||||||
|
}
|
||||||
if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') {
|
if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') {
|
||||||
return 'OpenAI Compatible';
|
return 'OpenAI Compatible';
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user