Normalize OpenAI-compatible TTS voice keys in the API, engine, and web UI.

  api:    add _normalize_openai_compatible_voice_key and use it, together with
          a default model, when building the runtime TTS metadata.
  engine: look up voice names case-insensitively and lowercase a known voice
          id inside a "model:VoiceId" key.
  web:    route buildOpenAICompatibleVoiceKey and resolveRuntimeTtsVoice
          through normalizeOpenAICompatibleVoiceKey.

  A whitespace-only model prefix (e.g. " :anna") now falls back to the model
  name in the engine and web code, matching the API helper.

  NOTE(review): indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch; confirm against the target tree, or
  apply with `git apply --ignore-whitespace` if context fails to match.

diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py
index 340f01b..815e9f6 100644
--- a/api/app/routers/assistants.py
+++ b/api/app/routers/assistants.py
@@ -12,6 +12,18 @@ from ..schemas import (
 
 router = APIRouter(prefix="/assistants", tags=["Assistants"])
 
+OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
+OPENAI_COMPATIBLE_KNOWN_VOICES = {
+    "alex",
+    "anna",
+    "bella",
+    "benjamin",
+    "charles",
+    "claire",
+    "david",
+    "diana",
+}
+
 
 def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
     return (vendor or "").strip().lower() in {
@@ -22,6 +34,29 @@ def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
     }
 
 
+def _normalize_openai_compatible_voice_key(voice_value: str, model: str) -> str:
+    """Build the canonical ``model:voice`` key for an OpenAI-compatible voice.
+
+    A blank value yields ``<model>:anna``; ids listed in
+    OPENAI_COMPATIBLE_KNOWN_VOICES are lowercased, other ids are kept as given.
+    """
+    raw = (voice_value or "").strip()
+    model_name = (model or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
+    if not raw:
+        return f"{model_name}:anna"
+
+    if ":" in raw:
+        voice_model, voice_id = raw.split(":", 1)
+        voice_model = voice_model.strip() or model_name
+        voice_id = voice_id.strip()
+        if voice_id.lower() in OPENAI_COMPATIBLE_KNOWN_VOICES:
+            voice_id = voice_id.lower()
+        return f"{voice_model}:{voice_id}"
+
+    voice_id = raw.lower() if raw.lower() in OPENAI_COMPATIBLE_KNOWN_VOICES else raw
+    return f"{model_name}:{voice_id}"
+
+
 def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
     metadata = {
         "systemPrompt": assistant.prompt or "",
@@ -67,12 +102,17 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
         voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
         if voice:
             tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
+            model = voice.model
+            runtime_voice = voice.voice_key or voice.id
+            if tts_provider == "openai_compatible":
+                model = model or OPENAI_COMPATIBLE_DEFAULT_MODEL
+                runtime_voice = _normalize_openai_compatible_voice_key(runtime_voice, model)
             metadata["services"]["tts"] = {
                 "enabled": True,
                 "provider": tts_provider,
-                "model": voice.model,
+                "model": model,
                 "apiKey": voice.api_key if tts_provider == "openai_compatible" else None,
-                "voice": voice.voice_key or voice.id,
+                "voice": runtime_voice,
                 "speed": assistant.speed or voice.speed,
             }
         else:
diff --git a/engine/services/openai_compatible_tts.py b/engine/services/openai_compatible_tts.py
index f912ec3..4967557 100644
--- a/engine/services/openai_compatible_tts.py
+++ b/engine/services/openai_compatible_tts.py
@@ -53,11 +53,20 @@ class OpenAICompatibleTTSService(BaseTTSService):
             sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
             speed: Speech speed (0.25 to 4.0)
         """
-        # Resolve voice name
-        if voice in self.VOICES:
-            full_voice = self.VOICES[voice]
+        # Resolve voice name (case-insensitive), and normalize "model:VoiceId" suffix.
+        resolved_voice = (voice or "").strip()
+        voice_lookup = resolved_voice.lower()
+        if voice_lookup in self.VOICES:
+            full_voice = self.VOICES[voice_lookup]
+        elif ":" in resolved_voice:
+            model_part, voice_part = resolved_voice.split(":", 1)
+            normalized_voice_part = voice_part.strip().lower()
+            if normalized_voice_part in self.VOICES:
+                full_voice = f"{(model_part.strip() or model).strip()}:{normalized_voice_part}"
+            else:
+                full_voice = resolved_voice
         else:
-            full_voice = voice
+            full_voice = resolved_voice
 
         super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
 
diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx
index 68a93d4..6b64919 100644
--- a/web/pages/Assistants.tsx
+++ b/web/pages/Assistants.tsx
@@ -16,12 +16,38 @@ const isOpenAICompatibleVendor = (vendor?: string) => {
 };
 
 const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
+const OPENAI_COMPATIBLE_KNOWN_VOICES = new Set([
+  'alex',
+  'anna',
+  'bella',
+  'benjamin',
+  'charles',
+  'claire',
+  'david',
+  'diana',
+]);
+
+// Canonicalize a voice key to "model:voice"; known voice ids are lowercased.
+const normalizeOpenAICompatibleVoiceKey = (voiceValue: string, model?: string) => {
+  const raw = String(voiceValue || '').trim();
+  const modelName = String(model || '').trim() || OPENAI_COMPATIBLE_DEFAULT_MODEL;
+  if (!raw) return `${modelName}:anna`;
+
+  if (raw.includes(':')) {
+    const [prefix, ...rest] = raw.split(':');
+    const voiceIdRaw = rest.join(':').trim();
+    const voiceIdLower = voiceIdRaw.toLowerCase();
+    const normalizedVoiceId = OPENAI_COMPATIBLE_KNOWN_VOICES.has(voiceIdLower) ? voiceIdLower : voiceIdRaw;
+    return `${prefix.trim() || modelName}:${normalizedVoiceId}`;
+  }
+
+  const rawLower = raw.toLowerCase();
+  const normalizedVoiceId = OPENAI_COMPATIBLE_KNOWN_VOICES.has(rawLower) ? rawLower : raw;
+  return `${modelName}:${normalizedVoiceId}`;
+};
 
 const buildOpenAICompatibleVoiceKey = (voiceId: string, model?: string) => {
-  const id = String(voiceId || '').trim();
-  if (!id) return '';
-  if (id.includes(':')) return id;
-  return `${model || OPENAI_COMPATIBLE_DEFAULT_MODEL}:${id}`;
+  return normalizeOpenAICompatibleVoiceKey(voiceId, model);
 };
 
 const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => {
@@ -29,13 +55,14 @@ const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => {
   if (!isOpenAICompatibleVendor(voice.vendor)) {
     return explicitKey || selectedVoiceId;
   }
+  const resolved = normalizeOpenAICompatibleVoiceKey(explicitKey || selectedVoiceId, voice.model);
   if (voice.isSystem) {
-    const canonical = buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
+    const canonical = normalizeOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
     if (!explicitKey) return canonical;
     const explicitSuffix = explicitKey.includes(':') ? explicitKey.split(':').pop() : explicitKey;
     if (explicitSuffix && explicitSuffix !== selectedVoiceId) return canonical;
   }
-  return explicitKey || buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
+  return resolved;
 };
 
 const renderToolIcon = (icon: string) => {