Fix talking voice error: normalize OpenAI-compatible TTS voice keys

2026-02-12 19:39:26 +08:00
parent 81ed89b84f
commit 56f8aa2191
3 changed files with 82 additions and 12 deletions
--- a/api/app/routers/assistants.py
+++ b/api/app/routers/assistants.py
@@ -12,6 +12,18 @@ from ..schemas import (

 router = APIRouter(prefix="/assistants", tags=["Assistants"])

+OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
+OPENAI_COMPATIBLE_KNOWN_VOICES = {
+    "alex",
+    "anna",
+    "bella",
+    "benjamin",
+    "charles",
+    "claire",
+    "david",
+    "diana",
+}
+

 def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
    return (vendor or "").strip().lower() in {
@@ -22,6 +34,24 @@ def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
    }


+def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
+    raw = (voice_value or "").strip()
+    model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
+    if not raw:
+        return f"{model_name}:anna"
+
+    if ":" in raw:
+        voice_model, voice_id = raw.split(":", 1)
+        voice_model = voice_model.strip() or model_name
+        voice_id = voice_id.strip()
+        if voice_id.lower() in OPENAI_COMPATIBLE_KNOWN_VOICES:
+            voice_id = voice_id.lower()
+        return f"{voice_model}:{voice_id}"
+
+    voice_id = raw.lower() if raw.lower() in OPENAI_COMPATIBLE_KNOWN_VOICES else raw
+    return f"{model_name}:{voice_id}"
+
+
 def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
    metadata = {
        "systemPrompt": assistant.prompt or "",
@@ -67,12 +97,17 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
        voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
        if voice:
            tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
+            model = voice.model
+            runtime_voice = voice.voice_key or voice.id
+            if tts_provider == "openai_compatible":
+                model = model or OPENAI_COMPATIBLE_DEFAULT_MODEL
+                runtime_voice = _normalize_openai_compatible_voice_key(runtime_voice, model)
            metadata["services"]["tts"] = {
                "enabled": True,
                "provider": tts_provider,
-                "model": voice.model,
+                "model": model,
                "apiKey": voice.api_key if tts_provider == "openai_compatible" else None,
-                "voice": voice.voice_key or voice.id,
+                "voice": runtime_voice,
                "speed": assistant.speed or voice.speed,
            }
        else:
--- a/engine/services/openai_compatible_tts.py
+++ b/engine/services/openai_compatible_tts.py
@@ -53,11 +53,20 @@ class OpenAICompatibleTTSService(BaseTTSService):
            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
            speed: Speech speed (0.25 to 4.0)
        """
-        # Resolve voice name
-        if voice in self.VOICES:
-            full_voice = self.VOICES[voice]
+        # Resolve voice name (case-insensitive), and normalize "model:VoiceId" suffix.
+        resolved_voice = (voice or "").strip()
+        voice_lookup = resolved_voice.lower()
+        if voice_lookup in self.VOICES:
+            full_voice = self.VOICES[voice_lookup]
+        elif ":" in resolved_voice:
+            model_part, voice_part = resolved_voice.split(":", 1)
+            normalized_voice_part = voice_part.strip().lower()
+            if normalized_voice_part in self.VOICES:
+                full_voice = f"{(model_part or model).strip()}:{normalized_voice_part}"
            else:
-            full_voice = voice
+                full_voice = resolved_voice
+        else:
+            full_voice = resolved_voice
            
        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
        
--- a/web/pages/Assistants.tsx
+++ b/web/pages/Assistants.tsx
@@ -16,12 +16,37 @@ const isOpenAICompatibleVendor = (vendor?: string) => {
 };

 const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
+const OPENAI_COMPATIBLE_KNOWN_VOICES = new Set([
+  'alex',
+  'anna',
+  'bella',
+  'benjamin',
+  'charles',
+  'claire',
+  'david',
+  'diana',
+]);
+
+const normalizeOpenAICompatibleVoiceKey = (voiceValue: string, model?: string) => {
+  const raw = String(voiceValue || '').trim();
+  const modelName = String(model || '').trim() || OPENAI_COMPATIBLE_DEFAULT_MODEL;
+  if (!raw) return `${modelName}:anna`;
+
+  if (raw.includes(':')) {
+    const [prefix, ...rest] = raw.split(':');
+    const voiceIdRaw = rest.join(':').trim();
+    const voiceIdLower = voiceIdRaw.toLowerCase();
+    const normalizedVoiceId = OPENAI_COMPATIBLE_KNOWN_VOICES.has(voiceIdLower) ? voiceIdLower : voiceIdRaw;
+    return `${(prefix || modelName).trim()}:${normalizedVoiceId}`;
+  }
+
+  const rawLower = raw.toLowerCase();
+  const normalizedVoiceId = OPENAI_COMPATIBLE_KNOWN_VOICES.has(rawLower) ? rawLower : raw;
+  return `${modelName}:${normalizedVoiceId}`;
+};

 const buildOpenAICompatibleVoiceKey = (voiceId: string, model?: string) => {
-  const id = String(voiceId || '').trim();
-  if (!id) return '';
-  if (id.includes(':')) return id;
-  return `${model || OPENAI_COMPATIBLE_DEFAULT_MODEL}:${id}`;
+  return normalizeOpenAICompatibleVoiceKey(voiceId, model);
 };

 const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => {
@@ -29,13 +54,14 @@ const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => {
  if (!isOpenAICompatibleVendor(voice.vendor)) {
    return explicitKey || selectedVoiceId;
  }
+  const resolved = normalizeOpenAICompatibleVoiceKey(explicitKey || selectedVoiceId, voice.model);
  if (voice.isSystem) {
-    const canonical = buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
+    const canonical = normalizeOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
    if (!explicitKey) return canonical;
    const explicitSuffix = explicitKey.includes(':') ? explicitKey.split(':').pop() : explicitKey;
    if (explicitSuffix && explicitSuffix !== selectedVoiceId) return canonical;
  }
-  return explicitKey || buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
+  return resolved;
 };

 const renderToolIcon = (icon: string) => {