From ff3a03b1ad5fe4ea55d34f05fa66f3b132f1f554 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Thu, 12 Feb 2026 18:44:55 +0800 Subject: [PATCH] Use openai compatible as vendor --- api/app/routers/asr.py | 20 +- api/app/routers/assistants.py | 17 +- api/app/routers/tools.py | 8 +- api/app/routers/voices.py | 32 ++- api/docs/asr.md | 20 +- api/docs/llm.md | 20 +- api/docs/model-access.md | 9 +- api/init_db.py | 24 +- engine/.env.example | 10 +- engine/app/config.py | 10 +- engine/core/duplex_pipeline.py | 117 +-------- engine/docs/ws_v1_schema.md | 4 +- engine/services/__init__.py | 6 +- engine/services/openai_compatible_asr.py | 321 +++++++++++++++++++++++ engine/services/openai_compatible_tts.py | 315 ++++++++++++++++++++++ engine/services/siliconflow_asr.py | 319 +--------------------- engine/services/siliconflow_tts.py | 313 +--------------------- web/pages/ASRLibrary.tsx | 12 +- web/pages/Assistants.tsx | 29 +- web/pages/LLMLibrary.tsx | 13 +- web/pages/VoiceLibrary.tsx | 95 ++----- web/services/backendApi.ts | 11 +- web/services/mockData.ts | 2 +- 23 files changed, 822 insertions(+), 905 deletions(-) create mode 100644 engine/services/openai_compatible_asr.py create mode 100644 engine/services/openai_compatible_tts.py diff --git a/api/app/routers/asr.py b/api/app/routers/asr.py index 5805061..470368d 100644 --- a/api/app/routers/asr.py +++ b/api/app/routers/asr.py @@ -16,16 +16,22 @@ from ..schemas import ( router = APIRouter(prefix="/asr", tags=["ASR Models"]) -SILICONFLOW_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall" +OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall" -def _is_siliconflow_vendor(vendor: str) -> bool: - return (vendor or "").strip().lower() in {"siliconflow", "硅基流动"} +def _is_openai_compatible_vendor(vendor: str) -> bool: + normalized = (vendor or "").strip().lower() + return normalized in { + "openai compatible", + "openai-compatible", + "siliconflow", # backward compatibility + "硅基流动", # backward compatibility + } def 
_default_asr_model(vendor: str) -> str: - if _is_siliconflow_vendor(vendor): - return SILICONFLOW_DEFAULT_ASR_MODEL + if _is_openai_compatible_vendor(vendor): + return OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL return "whisper-1" @@ -129,7 +135,7 @@ def test_asr_model( # 连接性测试优先,避免依赖真实音频输入 headers = {"Authorization": f"Bearer {model.api_key}"} with httpx.Client(timeout=60.0) as client: - if model.vendor.lower() in ["siliconflow", "paraformer"]: + if _is_openai_compatible_vendor(model.vendor) or model.vendor.lower() == "paraformer": response = client.get(f"{model.base_url}/asr", headers=headers) elif model.vendor.lower() == "openai": response = client.get(f"{model.base_url}/audio/models", headers=headers) @@ -258,7 +264,7 @@ async def preview_asr_model( raise HTTPException(status_code=400, detail="Uploaded audio file is empty") effective_api_key = (api_key or "").strip() or (model.api_key or "").strip() - if not effective_api_key and _is_siliconflow_vendor(model.vendor): + if not effective_api_key and _is_openai_compatible_vendor(model.vendor): effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() if not effective_api_key: raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}") diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py index 468cfb9..340f01b 100644 --- a/api/app/routers/assistants.py +++ b/api/app/routers/assistants.py @@ -13,8 +13,13 @@ from ..schemas import ( router = APIRouter(prefix="/assistants", tags=["Assistants"]) -def _is_siliconflow_vendor(vendor: Optional[str]) -> bool: - return (vendor or "").strip().lower() in {"siliconflow", "硅基流动"} +def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool: + return (vendor or "").strip().lower() in { + "siliconflow", + "硅基流动", + "openai compatible", + "openai-compatible", + } def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict: @@ -47,11 +52,11 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> 
dict: if assistant.asr_model_id: asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first() if asr: - asr_provider = "siliconflow" if _is_siliconflow_vendor(asr.vendor) else "buffered" + asr_provider = "openai_compatible" if _is_openai_compatible_vendor(asr.vendor) else "buffered" metadata["services"]["asr"] = { "provider": asr_provider, "model": asr.model_name or asr.name, - "apiKey": asr.api_key if asr_provider == "siliconflow" else None, + "apiKey": asr.api_key if asr_provider == "openai_compatible" else None, } else: warnings.append(f"ASR model not found: {assistant.asr_model_id}") @@ -61,12 +66,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict: elif assistant.voice: voice = db.query(Voice).filter(Voice.id == assistant.voice).first() if voice: - tts_provider = "siliconflow" if _is_siliconflow_vendor(voice.vendor) else "edge" + tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge" metadata["services"]["tts"] = { "enabled": True, "provider": tts_provider, "model": voice.model, - "apiKey": voice.api_key if tts_provider == "siliconflow" else None, + "apiKey": voice.api_key if tts_provider == "openai_compatible" else None, "voice": voice.voice_key or voice.id, "speed": assistant.speed or voice.speed, } diff --git a/api/app/routers/tools.py b/api/app/routers/tools.py index 79f24e9..0af09e0 100644 --- a/api/app/routers/tools.py +++ b/api/app/routers/tools.py @@ -467,7 +467,13 @@ def _test_asr_model(db: Session, model_id: str, result: AutotestResult): headers = {"Authorization": f"Bearer {model.api_key}"} with httpx.Client(timeout=30.0) as client: - if model.vendor.lower() in ["siliconflow", "paraformer"]: + normalized_vendor = (model.vendor or "").strip().lower() + if normalized_vendor in [ + "openai compatible", + "openai-compatible", + "siliconflow", # backward compatibility + "paraformer", + ]: response = client.get( f"{model.base_url}/asr", headers=headers diff --git 
a/api/app/routers/voices.py b/api/app/routers/voices.py index eab2298..6cb4258 100644 --- a/api/app/routers/voices.py +++ b/api/app/routers/voices.py @@ -13,20 +13,26 @@ from ..schemas import VoiceCreate, VoiceOut, VoicePreviewRequest, VoicePreviewRe router = APIRouter(prefix="/voices", tags=["Voices"]) -SILICONFLOW_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B" +OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B" -def _is_siliconflow_vendor(vendor: str) -> bool: - return vendor.strip().lower() in {"siliconflow", "硅基流动"} +def _is_openai_compatible_vendor(vendor: str) -> bool: + normalized = (vendor or "").strip().lower() + return normalized in { + "openai compatible", + "openai-compatible", + "siliconflow", # backward compatibility + "硅基流动", # backward compatibility + } def _default_base_url(vendor: str) -> Optional[str]: - if _is_siliconflow_vendor(vendor): + if _is_openai_compatible_vendor(vendor): return "https://api.siliconflow.cn/v1" return None -def _build_siliconflow_voice_key(voice: Voice, model: str) -> str: +def _build_openai_compatible_voice_key(voice: Voice, model: str) -> str: if voice.voice_key: return voice.voice_key if ":" in voice.id: @@ -65,8 +71,8 @@ def create_voice(data: VoiceCreate, db: Session = Depends(get_db)): model = data.model voice_key = data.voice_key - if _is_siliconflow_vendor(vendor): - model = model or SILICONFLOW_DEFAULT_MODEL + if _is_openai_compatible_vendor(vendor): + model = model or OPENAI_COMPATIBLE_DEFAULT_MODEL if not voice_key: raw_id = (data.id or data.name).strip() voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}" @@ -115,11 +121,11 @@ def update_voice(id: str, data: VoiceUpdate, db: Session = Depends(get_db)): update_data["vendor"] = update_data["vendor"].strip() vendor_for_defaults = update_data.get("vendor", voice.vendor) - if _is_siliconflow_vendor(vendor_for_defaults): - model = update_data.get("model") or voice.model or SILICONFLOW_DEFAULT_MODEL + if 
_is_openai_compatible_vendor(vendor_for_defaults): + model = update_data.get("model") or voice.model or OPENAI_COMPATIBLE_DEFAULT_MODEL voice_key = update_data.get("voice_key") or voice.voice_key update_data["model"] = model - update_data["voice_key"] = voice_key or _build_siliconflow_voice_key(voice, model) + update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model) for field, value in update_data.items(): setattr(voice, field, value) @@ -152,7 +158,7 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_ raise HTTPException(status_code=400, detail="Preview text cannot be empty") api_key = (data.api_key or "").strip() or (voice.api_key or "").strip() - if not api_key and _is_siliconflow_vendor(voice.vendor): + if not api_key and _is_openai_compatible_vendor(voice.vendor): api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() if not api_key: raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}") @@ -161,11 +167,11 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_ if not base_url: raise HTTPException(status_code=400, detail=f"Base URL is required for voice: {voice.name}") - model = voice.model or SILICONFLOW_DEFAULT_MODEL + model = voice.model or OPENAI_COMPATIBLE_DEFAULT_MODEL payload = { "model": model, "input": text, - "voice": voice.voice_key or _build_siliconflow_voice_key(voice, model), + "voice": voice.voice_key or _build_openai_compatible_voice_key(voice, model), "response_format": "mp3", "speed": data.speed if data.speed is not None else voice.speed, } diff --git a/api/docs/asr.md b/api/docs/asr.md index 08767ad..5b9098f 100644 --- a/api/docs/asr.md +++ b/api/docs/asr.md @@ -20,7 +20,7 @@ interface ASRModel { id: string; // 模型唯一标识 (8位UUID) user_id: number; // 所属用户ID name: string; // 模型显示名称 - vendor: string; // 供应商: "OpenAI" | "SiliconFlow" | "Paraformer" | 等 + vendor: string; // 供应商: "OpenAI Compatible" | "Paraformer" | 等 
language: string; // 识别语言: "zh" | "en" | "Multi-lingual" base_url: string; // API Base URL api_key: string; // API Key @@ -64,7 +64,7 @@ GET /api/v1/asr "id": "abc12345", "user_id": 1, "name": "Whisper 多语种识别", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "language": "Multi-lingual", "base_url": "https://api.openai.com/v1", "api_key": "sk-***", @@ -78,7 +78,7 @@ GET /api/v1/asr "id": "def67890", "user_id": 1, "name": "SenseVoice 中文识别", - "vendor": "SiliconFlow", + "vendor": "OpenAI Compatible", "language": "zh", "base_url": "https://api.siliconflow.cn/v1", "api_key": "sf-***", @@ -114,7 +114,7 @@ GET /api/v1/asr/{id} "id": "abc12345", "user_id": 1, "name": "Whisper 多语种识别", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "language": "Multi-lingual", "base_url": "https://api.openai.com/v1", "api_key": "sk-***", @@ -140,7 +140,7 @@ POST /api/v1/asr ```json { "name": "SenseVoice 中文识别", - "vendor": "SiliconFlow", + "vendor": "OpenAI Compatible", "language": "zh", "base_url": "https://api.siliconflow.cn/v1", "api_key": "sk-your-api-key", @@ -157,7 +157,7 @@ POST /api/v1/asr | 字段 | 类型 | 必填 | 说明 | |------|------|------|------| | name | string | 是 | 模型显示名称 | -| vendor | string | 是 | 供应商: "OpenAI" / "SiliconFlow" / "Paraformer" | +| vendor | string | 是 | 供应商: "OpenAI Compatible" / "Paraformer" | | language | string | 是 | 语言: "zh" / "en" / "Multi-lingual" | | base_url | string | 是 | API Base URL | | api_key | string | 是 | API Key | @@ -347,7 +347,7 @@ class ASRTestResponse(BaseModel): ```json { - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "base_url": "https://api.openai.com/v1", "api_key": "sk-xxx", "model_name": "whisper-1", @@ -357,11 +357,11 @@ class ASRTestResponse(BaseModel): } ``` -### SiliconFlow Paraformer +### OpenAI Compatible Paraformer ```json { - "vendor": "SiliconFlow", + "vendor": "OpenAI Compatible", "base_url": "https://api.siliconflow.cn/v1", "api_key": "sf-xxx", "model_name": "paraformer-v2", @@ -393,7 +393,7 @@ class 
ASRTestResponse(BaseModel): | test_filter_asr_models_by_language | 按语言过滤测试 | | test_filter_asr_models_by_enabled | 按启用状态过滤测试 | | test_create_asr_model_with_hotwords | 热词配置测试 | -| test_test_asr_model_siliconflow | SiliconFlow 供应商测试 | +| test_test_asr_model_siliconflow | OpenAI Compatible 供应商测试 | | test_test_asr_model_openai | OpenAI 供应商测试 | | test_different_asr_languages | 多语言测试 | | test_different_asr_vendors | 多供应商测试 | diff --git a/api/docs/llm.md b/api/docs/llm.md index 2b2fce9..86d3c24 100644 --- a/api/docs/llm.md +++ b/api/docs/llm.md @@ -20,7 +20,7 @@ interface LLMModel { id: string; // 模型唯一标识 (8位UUID) user_id: number; // 所属用户ID name: string; // 模型显示名称 - vendor: string; // 供应商: "OpenAI" | "SiliconFlow" | "Dify" | "FastGPT" | 等 + vendor: string; // 供应商: "OpenAI Compatible" | "Dify" | "FastGPT" | 等 type: string; // 类型: "text" | "embedding" | "rerank" base_url: string; // API Base URL api_key: string; // API Key @@ -64,7 +64,7 @@ GET /api/v1/llm "id": "abc12345", "user_id": 1, "name": "GPT-4o", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "type": "text", "base_url": "https://api.openai.com/v1", "api_key": "sk-***", @@ -79,7 +79,7 @@ GET /api/v1/llm "id": "def67890", "user_id": 1, "name": "Embedding-3-Small", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "type": "embedding", "base_url": "https://api.openai.com/v1", "api_key": "sk-***", @@ -111,7 +111,7 @@ GET /api/v1/llm/{id} "id": "abc12345", "user_id": 1, "name": "GPT-4o", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "type": "text", "base_url": "https://api.openai.com/v1", "api_key": "sk-***", @@ -137,7 +137,7 @@ POST /api/v1/llm ```json { "name": "GPT-4o", - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "type": "text", "base_url": "https://api.openai.com/v1", "api_key": "sk-your-api-key", @@ -314,11 +314,11 @@ class LLMModelTestResponse(BaseModel): ## 供应商配置示例 -### OpenAI +### OpenAI Compatible (OpenAI Endpoint) ```json { - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", 
"base_url": "https://api.openai.com/v1", "api_key": "sk-xxx", "model_name": "gpt-4o", @@ -327,11 +327,11 @@ class LLMModelTestResponse(BaseModel): } ``` -### SiliconFlow +### OpenAI Compatible ```json { - "vendor": "SiliconFlow", + "vendor": "OpenAI Compatible", "base_url": "https://api.siliconflow.com/v1", "api_key": "sf-xxx", "model_name": "deepseek-v3", @@ -356,7 +356,7 @@ class LLMModelTestResponse(BaseModel): ```json { - "vendor": "OpenAI", + "vendor": "OpenAI Compatible", "base_url": "https://api.openai.com/v1", "api_key": "sk-xxx", "model_name": "text-embedding-3-small", diff --git a/api/docs/model-access.md b/api/docs/model-access.md index d68f163..9126bc1 100644 --- a/api/docs/model-access.md +++ b/api/docs/model-access.md @@ -20,7 +20,7 @@ interface LLMModel { id: string; // 模型唯一标识 user_id: number; // 所属用户ID name: string; // 模型显示名称 - vendor: string; // 供应商: "OpenAI Compatible" | "SiliconFlow" | "Dify" | "FastGPT" + vendor: string; // 供应商: "OpenAI Compatible" | "Dify" | "FastGPT" type: string; // 类型: "text" | "embedding" | "rerank" base_url: string; // API Base URL api_key: string; // API Key @@ -57,7 +57,7 @@ interface TTSModel { id: string; user_id: number; name: string; - vendor: string; // "Ali" | "Volcano" | "Minimax" | "硅基流动" + vendor: string; // "OpenAI Compatible" | "Ali" | "Volcano" | "Minimax" language: string; // "zh" | "en" voice_list?: string[]; // 支持的声音列表 enabled: boolean; @@ -316,7 +316,6 @@ class LLMModelType(str, Enum): class LLMModelVendor(str, Enum): OPENAI_COMPATIBLE = "OpenAI Compatible" - SILICONFLOW = "SiliconFlow" DIFY = "Dify" FASTGPT = "FastGPT" @@ -389,11 +388,11 @@ class ASRModelOut(ASRModelBase): } ``` -### SiliconFlow +### OpenAI Compatible ```json { - "vendor": "SiliconFlow", + "vendor": "OpenAI Compatible", "base_url": "https://api.siliconflow.com/v1", "api_key": "sf-xxx", "model_name": "deepseek-v3" diff --git a/api/init_db.py b/api/init_db.py index d2e9b8e..2eff147 100644 --- a/api/init_db.py +++ b/api/init_db.py @@ 
-135,21 +135,21 @@ def rebuild_vector_store(reset_doc_status: bool = True): def init_default_data(): with db_session() as db: # 检查是否已有数据 - # SiliconFlow CosyVoice 2.0 预设声音 (8个) + # OpenAI Compatible (SiliconFlow API) CosyVoice 2.0 预设声音 (8个) # 参考: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech voices = [ # 男声 (Male Voices) - Voice(id="alex", name="Alex", vendor="SiliconFlow", gender="Male", language="en", + Voice(id="alex", name="Alex", vendor="OpenAI Compatible", gender="Male", language="en", description="Steady male voice.", is_system=True), - Voice(id="david", name="David", vendor="SiliconFlow", gender="Male", language="en", + Voice(id="david", name="David", vendor="OpenAI Compatible", gender="Male", language="en", description="Cheerful male voice.", is_system=True), # 女声 (Female Voices) - Voice(id="bella", name="Bella", vendor="SiliconFlow", gender="Female", language="en", + Voice(id="bella", name="Bella", vendor="OpenAI Compatible", gender="Female", language="en", description="Passionate female voice.", is_system=True), - Voice(id="claire", name="Claire", vendor="SiliconFlow", gender="Female", language="en", + Voice(id="claire", name="Claire", vendor="OpenAI Compatible", gender="Female", language="en", description="Gentle female voice.", is_system=True), ] - seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (SiliconFlow CosyVoice 2.0)") + seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible CosyVoice 2.0)") def init_default_tools(recreate: bool = False): @@ -181,7 +181,7 @@ def init_default_assistants(): voice="anna", speed=1.0, hotwords=[], - tools=["calculator", "current_time"], + tools=["current_time"], interruption_sensitivity=500, config_mode="platform", llm_model_id="deepseek-chat", @@ -215,7 +215,7 @@ def init_default_assistants(): voice="alex", speed=1.0, hotwords=["grammar", "vocabulary", "practice"], - tools=["calculator"], + tools=["current_time"], interruption_sensitivity=400, config_mode="platform", ), @@ -294,7 +294,7 
@@ def init_default_llm_models(): id="deepseek-chat", user_id=1, name="DeepSeek Chat", - vendor="SiliconFlow", + vendor="OpenAI Compatible", type="text", base_url="https://api.deepseek.com", api_key="YOUR_API_KEY", # 用户需替换 @@ -320,7 +320,7 @@ def init_default_llm_models(): id="text-embedding-3-small", user_id=1, name="Embedding 3 Small", - vendor="OpenAI", + vendor="OpenAI Compatible", type="embedding", base_url="https://api.openai.com/v1", api_key="YOUR_API_KEY", @@ -339,7 +339,7 @@ def init_default_asr_models(): id="FunAudioLLM/SenseVoiceSmall", user_id=1, name="FunAudioLLM/SenseVoiceSmall", - vendor="SiliconFlow", + vendor="OpenAI Compatible", language="Multi-lingual", base_url="https://api.siliconflow.cn/v1", api_key="YOUR_API_KEY", @@ -353,7 +353,7 @@ def init_default_asr_models(): id="TeleAI/TeleSpeechASR", user_id=1, name="TeleAI/TeleSpeechASR", - vendor="SiliconFlow", + vendor="OpenAI Compatible", language="Multi-lingual", base_url="https://api.siliconflow.cn/v1", api_key="YOUR_API_KEY", diff --git a/engine/.env.example b/engine/.env.example index db4aa5a..f62a4c6 100644 --- a/engine/.env.example +++ b/engine/.env.example @@ -41,19 +41,19 @@ LLM_MODEL=gpt-4o-mini LLM_TEMPERATURE=0.7 # TTS -# edge: no SiliconFlow key needed -# siliconflow: requires SILICONFLOW_API_KEY -TTS_PROVIDER=siliconflow +# edge: no API key needed +# openai_compatible: compatible with SiliconFlow-style endpoints +TTS_PROVIDER=openai_compatible TTS_VOICE=anna TTS_SPEED=1.0 -# SiliconFlow (used by TTS and/or ASR when provider=siliconflow) +# SiliconFlow (used by TTS and/or ASR when provider=openai_compatible) SILICONFLOW_API_KEY=your_siliconflow_api_key_here SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall # ASR -ASR_PROVIDER=siliconflow +ASR_PROVIDER=openai_compatible # Interim cadence and minimum audio before interim decode. 
ASR_INTERIM_INTERVAL_MS=500 ASR_MIN_AUDIO_MS=300 diff --git a/engine/app/config.py b/engine/app/config.py index 609b3ad..1e3e1b3 100644 --- a/engine/app/config.py +++ b/engine/app/config.py @@ -44,7 +44,10 @@ class Settings(BaseSettings): llm_temperature: float = Field(default=0.7, description="LLM temperature for response generation") # TTS Configuration - tts_provider: str = Field(default="siliconflow", description="TTS provider (edge, siliconflow)") + tts_provider: str = Field( + default="openai_compatible", + description="TTS provider (edge, openai_compatible; siliconflow alias supported)" + ) tts_voice: str = Field(default="anna", description="TTS voice name") tts_speed: float = Field(default=1.0, description="TTS speech speed multiplier") @@ -53,7 +56,10 @@ class Settings(BaseSettings): siliconflow_tts_model: str = Field(default="FunAudioLLM/CosyVoice2-0.5B", description="SiliconFlow TTS model") # ASR Configuration - asr_provider: str = Field(default="siliconflow", description="ASR provider (siliconflow, buffered)") + asr_provider: str = Field( + default="openai_compatible", + description="ASR provider (openai_compatible, buffered; siliconflow alias supported)" + ) siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model") asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms") asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result") diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 61ea397..508ba2b 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -30,8 +30,8 @@ from processors.vad import SileroVAD, VADProcessor from services.asr import BufferedASRService from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent from services.llm import MockLLMService, OpenAILLMService -from services.siliconflow_asr 
import SiliconFlowASRService -from services.siliconflow_tts import SiliconFlowTTSService +from services.openai_compatible_asr import OpenAICompatibleASRService +from services.openai_compatible_tts import OpenAICompatibleTTSService from services.streaming_text import extract_tts_sentence, has_spoken_content from services.tts import EdgeTTSService, MockTTSService @@ -60,57 +60,6 @@ class DuplexPipeline: _TOOL_WAIT_TIMEOUT_SECONDS = 15.0 _SERVER_TOOL_TIMEOUT_SECONDS = 15.0 _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = { - "search": { - "name": "search", - "description": "Search the internet for recent information", - "parameters": { - "type": "object", - "properties": {"query": {"type": "string"}}, - "required": ["query"], - }, - }, - "calculator": { - "name": "calculator", - "description": "Evaluate a math expression", - "parameters": { - "type": "object", - "properties": {"expression": {"type": "string"}}, - "required": ["expression"], - }, - }, - "weather": { - "name": "weather", - "description": "Get weather by city name", - "parameters": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - }, - "translate": { - "name": "translate", - "description": "Translate text to target language", - "parameters": { - "type": "object", - "properties": { - "text": {"type": "string"}, - "target_lang": {"type": "string"}, - }, - "required": ["text", "target_lang"], - }, - }, - "knowledge": { - "name": "knowledge", - "description": "Query knowledge base by question", - "parameters": { - "type": "object", - "properties": { - "query": {"type": "string"}, - "kb_id": {"type": "string"}, - }, - "required": ["query"], - }, - }, "current_time": { "name": "current_time", "description": "Get current local time", @@ -120,51 +69,6 @@ class DuplexPipeline: "required": [], }, }, - "code_interpreter": { - "name": "code_interpreter", - "description": "Execute Python code in a controlled environment", - "parameters": { - "type": "object", - 
"properties": {"code": {"type": "string"}}, - "required": ["code"], - }, - }, - "turn_on_camera": { - "name": "turn_on_camera", - "description": "Turn on camera on client device", - "parameters": { - "type": "object", - "properties": {}, - "required": [], - }, - }, - "turn_off_camera": { - "name": "turn_off_camera", - "description": "Turn off camera on client device", - "parameters": { - "type": "object", - "properties": {}, - "required": [], - }, - }, - "increase_volume": { - "name": "increase_volume", - "description": "Increase speaker volume", - "parameters": { - "type": "object", - "properties": {"step": {"type": "integer"}}, - "required": [], - }, - }, - "decrease_volume": { - "name": "decrease_volume", - "description": "Decrease speaker volume", - "parameters": { - "type": "object", - "properties": {"step": {"type": "integer"}}, - "required": [], - }, - }, } def __init__( @@ -386,6 +290,11 @@ class DuplexPipeline: return False return None + @staticmethod + def _is_openai_compatible_provider(provider: Any) -> bool: + normalized = str(provider or "").strip().lower() + return normalized in {"openai_compatible", "openai-compatible", "siliconflow"} + def _tts_output_enabled(self) -> bool: enabled = self._coerce_bool(self._runtime_tts.get("enabled")) if enabled is not None: @@ -495,15 +404,15 @@ class DuplexPipeline: tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed) - if tts_provider == "siliconflow" and tts_api_key: - self.tts_service = SiliconFlowTTSService( + if self._is_openai_compatible_provider(tts_provider) and tts_api_key: + self.tts_service = OpenAICompatibleTTSService( api_key=tts_api_key, voice=tts_voice, model=tts_model, sample_rate=settings.sample_rate, speed=tts_speed ) - logger.info("Using SiliconFlow TTS service") + logger.info("Using OpenAI-compatible TTS service (SiliconFlow implementation)") else: self.tts_service = EdgeTTSService( voice=tts_voice, 
@@ -531,8 +440,8 @@ class DuplexPipeline: asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms) asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms) - if asr_provider == "siliconflow" and asr_api_key: - self.asr_service = SiliconFlowASRService( + if self._is_openai_compatible_provider(asr_provider) and asr_api_key: + self.asr_service = OpenAICompatibleASRService( api_key=asr_api_key, model=asr_model, sample_rate=settings.sample_rate, @@ -540,7 +449,7 @@ class DuplexPipeline: min_audio_for_interim_ms=asr_min_audio_ms, on_transcript=self._on_transcript_callback ) - logger.info("Using SiliconFlow ASR service") + logger.info("Using OpenAI-compatible ASR service (SiliconFlow implementation)") else: self.asr_service = BufferedASRService( sample_rate=settings.sample_rate diff --git a/engine/docs/ws_v1_schema.md b/engine/docs/ws_v1_schema.md index 88ac755..9db0900 100644 --- a/engine/docs/ws_v1_schema.md +++ b/engine/docs/ws_v1_schema.md @@ -66,7 +66,7 @@ Rules: "baseUrl": "https://api.openai.com/v1" }, "asr": { - "provider": "siliconflow", + "provider": "openai_compatible", "model": "FunAudioLLM/SenseVoiceSmall", "apiKey": "sf-...", "interimIntervalMs": 500, @@ -74,7 +74,7 @@ Rules: }, "tts": { "enabled": true, - "provider": "siliconflow", + "provider": "openai_compatible", "model": "FunAudioLLM/CosyVoice2-0.5B", "apiKey": "sf-...", "voice": "anna", diff --git a/engine/services/__init__.py b/engine/services/__init__.py index 50301d4..0bab6b3 100644 --- a/engine/services/__init__.py +++ b/engine/services/__init__.py @@ -15,8 +15,8 @@ from services.base import ( from services.llm import OpenAILLMService, MockLLMService from services.tts import EdgeTTSService, MockTTSService from services.asr import BufferedASRService, MockASRService -from services.siliconflow_asr import SiliconFlowASRService -from services.siliconflow_tts import SiliconFlowTTSService +from services.openai_compatible_asr 
import OpenAICompatibleASRService, SiliconFlowASRService
+from services.openai_compatible_tts import OpenAICompatibleTTSService, SiliconFlowTTSService
 from services.streaming_tts_adapter import StreamingTTSAdapter
 from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline
@@ -38,8 +38,10 @@ __all__ = [
     # ASR
     "BufferedASRService",
     "MockASRService",
+    "OpenAICompatibleASRService",
     "SiliconFlowASRService",
     # TTS (SiliconFlow)
+    "OpenAICompatibleTTSService",
     "SiliconFlowTTSService",
     "StreamingTTSAdapter",
     # Realtime
diff --git a/engine/services/openai_compatible_asr.py b/engine/services/openai_compatible_asr.py
new file mode 100644
index 0000000..daf7c04
--- /dev/null
+++ b/engine/services/openai_compatible_asr.py
@@ -0,0 +1,321 @@
+"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
+
+Speech-to-text transcription via an OpenAI-compatible endpoint (SiliconFlow by default).
+API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
+"""
+
+import asyncio
+import io
+import wave
+from typing import AsyncIterator, Optional, Callable, Awaitable
+from loguru import logger
+
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
+
+from services.base import BaseASRService, ASRResult, ServiceState
+
+
+class OpenAICompatibleASRService(BaseASRService):
+    """
+    OpenAI-compatible ASR service for speech-to-text transcription.
+ + Features: + - Buffers incoming audio chunks + - Provides interim transcriptions periodically (for streaming to client) + - Final transcription on EOU + + API Details: + - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions + - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR + - Input: Audio file (multipart/form-data) + - Output: {"text": "transcribed text"} + """ + + # Supported models + MODELS = { + "sensevoice": "FunAudioLLM/SenseVoiceSmall", + "telespeech": "TeleAI/TeleSpeechASR", + } + + API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions" + + def __init__( + self, + api_key: str, + model: str = "FunAudioLLM/SenseVoiceSmall", + sample_rate: int = 16000, + language: str = "auto", + interim_interval_ms: int = 500, # How often to send interim results + min_audio_for_interim_ms: int = 300, # Min audio before first interim + on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None + ): + """ + Initialize OpenAI-compatible ASR service. 
+ + Args: + api_key: Provider API key + model: ASR model name or alias + sample_rate: Audio sample rate (16000 recommended) + language: Language code (auto for automatic detection) + interim_interval_ms: How often to generate interim transcriptions + min_audio_for_interim_ms: Minimum audio duration before first interim + on_transcript: Callback for transcription results (text, is_final) + """ + super().__init__(sample_rate=sample_rate, language=language) + + if not AIOHTTP_AVAILABLE: + raise RuntimeError("aiohttp is required for OpenAICompatibleASRService") + + self.api_key = api_key + self.model = self.MODELS.get(model.lower(), model) + self.interim_interval_ms = interim_interval_ms + self.min_audio_for_interim_ms = min_audio_for_interim_ms + self.on_transcript = on_transcript + + # Session + self._session: Optional[aiohttp.ClientSession] = None + + # Audio buffer + self._audio_buffer: bytes = b"" + self._current_text: str = "" + self._last_interim_time: float = 0 + + # Transcript queue for async iteration + self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue() + + # Background task for interim results + self._interim_task: Optional[asyncio.Task] = None + self._running = False + + logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}") + + async def connect(self) -> None: + """Connect to the service.""" + self._session = aiohttp.ClientSession( + headers={ + "Authorization": f"Bearer {self.api_key}" + } + ) + self._running = True + self.state = ServiceState.CONNECTED + logger.info("OpenAICompatibleASRService connected") + + async def disconnect(self) -> None: + """Disconnect and cleanup.""" + self._running = False + + if self._interim_task: + self._interim_task.cancel() + try: + await self._interim_task + except asyncio.CancelledError: + pass + self._interim_task = None + + if self._session: + await self._session.close() + self._session = None + + self._audio_buffer = b"" + self._current_text = "" + self.state = 
ServiceState.DISCONNECTED + logger.info("OpenAICompatibleASRService disconnected") + + async def send_audio(self, audio: bytes) -> None: + """ + Buffer incoming audio data. + + Args: + audio: PCM audio data (16-bit, mono) + """ + self._audio_buffer += audio + + async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]: + """ + Transcribe current audio buffer. + + Args: + is_final: Whether this is the final transcription + + Returns: + Transcribed text or None if not enough audio + """ + if not self._session: + logger.warning("ASR session not connected") + return None + + # Check minimum audio duration + audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000 + + if not is_final and audio_duration_ms < self.min_audio_for_interim_ms: + return None + + if audio_duration_ms < 100: # Less than 100ms - too short + return None + + try: + # Convert PCM to WAV in memory + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) # 16-bit + wav_file.setframerate(self.sample_rate) + wav_file.writeframes(self._audio_buffer) + + wav_buffer.seek(0) + wav_data = wav_buffer.read() + + # Send to API + form_data = aiohttp.FormData() + form_data.add_field( + 'file', + wav_data, + filename='audio.wav', + content_type='audio/wav' + ) + form_data.add_field('model', self.model) + + async with self._session.post(self.API_URL, data=form_data) as response: + if response.status == 200: + result = await response.json() + text = result.get("text", "").strip() + + if text: + self._current_text = text + + # Notify via callback + if self.on_transcript: + await self.on_transcript(text, is_final) + + # Queue result + await self._transcript_queue.put( + ASRResult(text=text, is_final=is_final) + ) + + logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...") + return text + else: + error_text = await response.text() + logger.error(f"ASR API error {response.status}: {error_text}") 
+ return None + + except Exception as e: + logger.error(f"ASR transcription error: {e}") + return None + + async def get_final_transcription(self) -> str: + """ + Get final transcription and clear buffer. + + Call this when EOU is detected. + + Returns: + Final transcribed text + """ + # Transcribe full buffer as final + text = await self.transcribe_buffer(is_final=True) + + # Clear buffer + result = text or self._current_text + self._audio_buffer = b"" + self._current_text = "" + + return result + + def get_and_clear_text(self) -> str: + """ + Get accumulated text and clear buffer. + + Compatible with BufferedASRService interface. + """ + text = self._current_text + self._current_text = "" + self._audio_buffer = b"" + return text + + def get_audio_buffer(self) -> bytes: + """Get current audio buffer.""" + return self._audio_buffer + + def get_audio_duration_ms(self) -> float: + """Get current audio buffer duration in milliseconds.""" + return len(self._audio_buffer) / (self.sample_rate * 2) * 1000 + + def clear_buffer(self) -> None: + """Clear audio and text buffers.""" + self._audio_buffer = b"" + self._current_text = "" + + async def receive_transcripts(self) -> AsyncIterator[ASRResult]: + """ + Async iterator for transcription results. + + Yields: + ASRResult with text and is_final flag + """ + while self._running: + try: + result = await asyncio.wait_for( + self._transcript_queue.get(), + timeout=0.1 + ) + yield result + except asyncio.TimeoutError: + continue + except asyncio.CancelledError: + break + + async def start_interim_transcription(self) -> None: + """ + Start background task for interim transcriptions. + + This periodically transcribes buffered audio for + real-time feedback to the user. 
+ """ + if self._interim_task and not self._interim_task.done(): + return + + self._interim_task = asyncio.create_task(self._interim_loop()) + + async def stop_interim_transcription(self) -> None: + """Stop interim transcription task.""" + if self._interim_task: + self._interim_task.cancel() + try: + await self._interim_task + except asyncio.CancelledError: + pass + self._interim_task = None + + async def _interim_loop(self) -> None: + """Background loop for interim transcriptions.""" + import time + + while self._running: + try: + await asyncio.sleep(self.interim_interval_ms / 1000) + + # Check if we have enough new audio + current_time = time.time() + time_since_last = (current_time - self._last_interim_time) * 1000 + + if time_since_last >= self.interim_interval_ms: + audio_duration = self.get_audio_duration_ms() + + if audio_duration >= self.min_audio_for_interim_ms: + await self.transcribe_buffer(is_final=False) + self._last_interim_time = current_time + + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"Interim transcription error: {e}") + + +# Backward-compatible alias +SiliconFlowASRService = OpenAICompatibleASRService diff --git a/engine/services/openai_compatible_tts.py b/engine/services/openai_compatible_tts.py new file mode 100644 index 0000000..f912ec3 --- /dev/null +++ b/engine/services/openai_compatible_tts.py @@ -0,0 +1,315 @@ +"""OpenAI-compatible TTS Service with streaming support. + +Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency +text-to-speech synthesis with streaming. 
+ +API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech +""" + +import os +import asyncio +import aiohttp +from typing import AsyncIterator, Optional +from loguru import logger + +from services.base import BaseTTSService, TTSChunk, ServiceState +from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export + + +class OpenAICompatibleTTSService(BaseTTSService): + """ + OpenAI-compatible TTS service with streaming support. + + Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models. + """ + + # Available voices + VOICES = { + "alex": "FunAudioLLM/CosyVoice2-0.5B:alex", + "anna": "FunAudioLLM/CosyVoice2-0.5B:anna", + "bella": "FunAudioLLM/CosyVoice2-0.5B:bella", + "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin", + "charles": "FunAudioLLM/CosyVoice2-0.5B:charles", + "claire": "FunAudioLLM/CosyVoice2-0.5B:claire", + "david": "FunAudioLLM/CosyVoice2-0.5B:david", + "diana": "FunAudioLLM/CosyVoice2-0.5B:diana", + } + + def __init__( + self, + api_key: Optional[str] = None, + voice: str = "anna", + model: str = "FunAudioLLM/CosyVoice2-0.5B", + sample_rate: int = 16000, + speed: float = 1.0 + ): + """ + Initialize OpenAI-compatible TTS service. 
+ + Args: + api_key: Provider API key (defaults to SILICONFLOW_API_KEY env var) + voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana) + model: Model name + sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100) + speed: Speech speed (0.25 to 4.0) + """ + # Resolve voice name + if voice in self.VOICES: + full_voice = self.VOICES[voice] + else: + full_voice = voice + + super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed) + + self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY") + self.model = model + self.api_url = "https://api.siliconflow.cn/v1/audio/speech" + + self._session: Optional[aiohttp.ClientSession] = None + self._cancel_event = asyncio.Event() + + async def connect(self) -> None: + """Initialize HTTP session.""" + if not self.api_key: + raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.") + + self._session = aiohttp.ClientSession( + headers={ + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + ) + self.state = ServiceState.CONNECTED + logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}") + + async def disconnect(self) -> None: + """Close HTTP session.""" + if self._session: + await self._session.close() + self._session = None + self.state = ServiceState.DISCONNECTED + logger.info("SiliconFlow TTS service disconnected") + + async def synthesize(self, text: str) -> bytes: + """Synthesize complete audio for text.""" + audio_data = b"" + async for chunk in self.synthesize_stream(text): + audio_data += chunk.audio + return audio_data + + async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: + """ + Synthesize audio in streaming mode. 
+ + Args: + text: Text to synthesize + + Yields: + TTSChunk objects with PCM audio + """ + if not self._session: + raise RuntimeError("TTS service not connected") + + if not text.strip(): + return + + self._cancel_event.clear() + + payload = { + "model": self.model, + "input": text, + "voice": self.voice, + "response_format": "pcm", + "sample_rate": self.sample_rate, + "stream": True, + "speed": self.speed + } + + try: + async with self._session.post(self.api_url, json=payload) as response: + if response.status != 200: + error_text = await response.text() + logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}") + return + + # Stream audio chunks + chunk_size = self.sample_rate * 2 // 10 # 100ms chunks + buffer = b"" + pending_chunk = None + + async for chunk in response.content.iter_any(): + if self._cancel_event.is_set(): + logger.info("TTS synthesis cancelled") + return + + buffer += chunk + + # Yield complete chunks + while len(buffer) >= chunk_size: + audio_chunk = buffer[:chunk_size] + buffer = buffer[chunk_size:] + + # Keep one full chunk buffered so we can always tag the true + # last full chunk as final when stream length is an exact multiple. + if pending_chunk is not None: + yield TTSChunk( + audio=pending_chunk, + sample_rate=self.sample_rate, + is_final=False + ) + pending_chunk = audio_chunk + + # Flush pending chunk(s) and remaining tail. 
+ if pending_chunk is not None: + if buffer: + yield TTSChunk( + audio=pending_chunk, + sample_rate=self.sample_rate, + is_final=False + ) + pending_chunk = None + else: + yield TTSChunk( + audio=pending_chunk, + sample_rate=self.sample_rate, + is_final=True + ) + pending_chunk = None + + if buffer: + yield TTSChunk( + audio=buffer, + sample_rate=self.sample_rate, + is_final=True + ) + + except asyncio.CancelledError: + logger.info("TTS synthesis cancelled via asyncio") + raise + except Exception as e: + logger.error(f"TTS synthesis error: {e}") + raise + + async def cancel(self) -> None: + """Cancel ongoing synthesis.""" + self._cancel_event.set() + + +class StreamingTTSAdapter: + """ + Adapter for streaming LLM text to TTS with sentence-level chunking. + + This reduces latency by starting TTS as soon as a complete sentence + is received from the LLM, rather than waiting for the full response. + """ + + # Sentence delimiters + SENTENCE_ENDS = {',', '。', '!', '?', '.', '!', '?', '\n'} + + def __init__(self, tts_service: BaseTTSService, transport, session_id: str): + self.tts_service = tts_service + self.transport = transport + self.session_id = session_id + self._buffer = "" + self._cancel_event = asyncio.Event() + self._is_speaking = False + + def _is_non_sentence_period(self, text: str, idx: int) -> bool: + """Check whether '.' should NOT be treated as a sentence delimiter.""" + if text[idx] != ".": + return False + + # Decimal/version segment: 1.2, v1.2.3 + if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit(): + return True + + # Number abbreviations: No.1 / No. 
1 + left_start = idx - 1 + while left_start >= 0 and text[left_start].isalpha(): + left_start -= 1 + left_token = text[left_start + 1:idx].lower() + if left_token == "no": + j = idx + 1 + while j < len(text) and text[j].isspace(): + j += 1 + if j < len(text) and text[j].isdigit(): + return True + + return False + + async def process_text_chunk(self, text_chunk: str) -> None: + """ + Process a text chunk from LLM and trigger TTS when sentence is complete. + + Args: + text_chunk: Text chunk from LLM streaming + """ + if self._cancel_event.is_set(): + return + + self._buffer += text_chunk + + # Check for sentence completion + while True: + split_idx = -1 + for i, char in enumerate(self._buffer): + if char == "." and self._is_non_sentence_period(self._buffer, i): + continue + if char in self.SENTENCE_ENDS: + split_idx = i + break + if split_idx < 0: + break + + end_idx = split_idx + 1 + while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS: + end_idx += 1 + + sentence = self._buffer[:end_idx].strip() + self._buffer = self._buffer[end_idx:] + + if sentence and any(ch.isalnum() for ch in sentence): + await self._speak_sentence(sentence) + + async def flush(self) -> None: + """Flush remaining buffer.""" + if self._buffer.strip() and not self._cancel_event.is_set(): + await self._speak_sentence(self._buffer.strip()) + self._buffer = "" + + async def _speak_sentence(self, text: str) -> None: + """Synthesize and send a sentence.""" + if not text or self._cancel_event.is_set(): + return + + self._is_speaking = True + + try: + async for chunk in self.tts_service.synthesize_stream(text): + if self._cancel_event.is_set(): + break + await self.transport.send_audio(chunk.audio) + await asyncio.sleep(0.01) # Prevent flooding + except Exception as e: + logger.error(f"TTS speak error: {e}") + finally: + self._is_speaking = False + + def cancel(self) -> None: + """Cancel ongoing speech.""" + self._cancel_event.set() + self._buffer = "" + + def reset(self) 
-> None: + """Reset for new turn.""" + self._cancel_event.clear() + self._buffer = "" + self._is_speaking = False + + @property + def is_speaking(self) -> bool: + return self._is_speaking + + +# Backward-compatible alias +SiliconFlowTTSService = OpenAICompatibleTTSService diff --git a/engine/services/siliconflow_asr.py b/engine/services/siliconflow_asr.py index 6d67ad7..2cb95dc 100644 --- a/engine/services/siliconflow_asr.py +++ b/engine/services/siliconflow_asr.py @@ -1,317 +1,8 @@ -"""SiliconFlow ASR (Automatic Speech Recognition) Service. +"""Backward-compatible imports for legacy siliconflow_asr module.""" -Uses the SiliconFlow API for speech-to-text transcription. -API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions -""" +from services.openai_compatible_asr import OpenAICompatibleASRService -import asyncio -import io -import wave -from typing import AsyncIterator, Optional, Callable, Awaitable -from loguru import logger +# Backward-compatible alias +SiliconFlowASRService = OpenAICompatibleASRService -try: - import aiohttp - AIOHTTP_AVAILABLE = True -except ImportError: - AIOHTTP_AVAILABLE = False - logger.warning("aiohttp not available - SiliconFlowASRService will not work") - -from services.base import BaseASRService, ASRResult, ServiceState - - -class SiliconFlowASRService(BaseASRService): - """ - SiliconFlow ASR service for speech-to-text transcription. 
- - Features: - - Buffers incoming audio chunks - - Provides interim transcriptions periodically (for streaming to client) - - Final transcription on EOU - - API Details: - - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions - - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR - - Input: Audio file (multipart/form-data) - - Output: {"text": "transcribed text"} - """ - - # Supported models - MODELS = { - "sensevoice": "FunAudioLLM/SenseVoiceSmall", - "telespeech": "TeleAI/TeleSpeechASR", - } - - API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions" - - def __init__( - self, - api_key: str, - model: str = "FunAudioLLM/SenseVoiceSmall", - sample_rate: int = 16000, - language: str = "auto", - interim_interval_ms: int = 500, # How often to send interim results - min_audio_for_interim_ms: int = 300, # Min audio before first interim - on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None - ): - """ - Initialize SiliconFlow ASR service. 
- - Args: - api_key: SiliconFlow API key - model: ASR model name or alias - sample_rate: Audio sample rate (16000 recommended) - language: Language code (auto for automatic detection) - interim_interval_ms: How often to generate interim transcriptions - min_audio_for_interim_ms: Minimum audio duration before first interim - on_transcript: Callback for transcription results (text, is_final) - """ - super().__init__(sample_rate=sample_rate, language=language) - - if not AIOHTTP_AVAILABLE: - raise RuntimeError("aiohttp is required for SiliconFlowASRService") - - self.api_key = api_key - self.model = self.MODELS.get(model.lower(), model) - self.interim_interval_ms = interim_interval_ms - self.min_audio_for_interim_ms = min_audio_for_interim_ms - self.on_transcript = on_transcript - - # Session - self._session: Optional[aiohttp.ClientSession] = None - - # Audio buffer - self._audio_buffer: bytes = b"" - self._current_text: str = "" - self._last_interim_time: float = 0 - - # Transcript queue for async iteration - self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue() - - # Background task for interim results - self._interim_task: Optional[asyncio.Task] = None - self._running = False - - logger.info(f"SiliconFlowASRService initialized with model: {self.model}") - - async def connect(self) -> None: - """Connect to the service.""" - self._session = aiohttp.ClientSession( - headers={ - "Authorization": f"Bearer {self.api_key}" - } - ) - self._running = True - self.state = ServiceState.CONNECTED - logger.info("SiliconFlowASRService connected") - - async def disconnect(self) -> None: - """Disconnect and cleanup.""" - self._running = False - - if self._interim_task: - self._interim_task.cancel() - try: - await self._interim_task - except asyncio.CancelledError: - pass - self._interim_task = None - - if self._session: - await self._session.close() - self._session = None - - self._audio_buffer = b"" - self._current_text = "" - self.state = ServiceState.DISCONNECTED - 
logger.info("SiliconFlowASRService disconnected") - - async def send_audio(self, audio: bytes) -> None: - """ - Buffer incoming audio data. - - Args: - audio: PCM audio data (16-bit, mono) - """ - self._audio_buffer += audio - - async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]: - """ - Transcribe current audio buffer. - - Args: - is_final: Whether this is the final transcription - - Returns: - Transcribed text or None if not enough audio - """ - if not self._session: - logger.warning("ASR session not connected") - return None - - # Check minimum audio duration - audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000 - - if not is_final and audio_duration_ms < self.min_audio_for_interim_ms: - return None - - if audio_duration_ms < 100: # Less than 100ms - too short - return None - - try: - # Convert PCM to WAV in memory - wav_buffer = io.BytesIO() - with wave.open(wav_buffer, 'wb') as wav_file: - wav_file.setnchannels(1) - wav_file.setsampwidth(2) # 16-bit - wav_file.setframerate(self.sample_rate) - wav_file.writeframes(self._audio_buffer) - - wav_buffer.seek(0) - wav_data = wav_buffer.read() - - # Send to API - form_data = aiohttp.FormData() - form_data.add_field( - 'file', - wav_data, - filename='audio.wav', - content_type='audio/wav' - ) - form_data.add_field('model', self.model) - - async with self._session.post(self.API_URL, data=form_data) as response: - if response.status == 200: - result = await response.json() - text = result.get("text", "").strip() - - if text: - self._current_text = text - - # Notify via callback - if self.on_transcript: - await self.on_transcript(text, is_final) - - # Queue result - await self._transcript_queue.put( - ASRResult(text=text, is_final=is_final) - ) - - logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...") - return text - else: - error_text = await response.text() - logger.error(f"ASR API error {response.status}: {error_text}") - return None - - except 
Exception as e: - logger.error(f"ASR transcription error: {e}") - return None - - async def get_final_transcription(self) -> str: - """ - Get final transcription and clear buffer. - - Call this when EOU is detected. - - Returns: - Final transcribed text - """ - # Transcribe full buffer as final - text = await self.transcribe_buffer(is_final=True) - - # Clear buffer - result = text or self._current_text - self._audio_buffer = b"" - self._current_text = "" - - return result - - def get_and_clear_text(self) -> str: - """ - Get accumulated text and clear buffer. - - Compatible with BufferedASRService interface. - """ - text = self._current_text - self._current_text = "" - self._audio_buffer = b"" - return text - - def get_audio_buffer(self) -> bytes: - """Get current audio buffer.""" - return self._audio_buffer - - def get_audio_duration_ms(self) -> float: - """Get current audio buffer duration in milliseconds.""" - return len(self._audio_buffer) / (self.sample_rate * 2) * 1000 - - def clear_buffer(self) -> None: - """Clear audio and text buffers.""" - self._audio_buffer = b"" - self._current_text = "" - - async def receive_transcripts(self) -> AsyncIterator[ASRResult]: - """ - Async iterator for transcription results. - - Yields: - ASRResult with text and is_final flag - """ - while self._running: - try: - result = await asyncio.wait_for( - self._transcript_queue.get(), - timeout=0.1 - ) - yield result - except asyncio.TimeoutError: - continue - except asyncio.CancelledError: - break - - async def start_interim_transcription(self) -> None: - """ - Start background task for interim transcriptions. - - This periodically transcribes buffered audio for - real-time feedback to the user. 
- """ - if self._interim_task and not self._interim_task.done(): - return - - self._interim_task = asyncio.create_task(self._interim_loop()) - - async def stop_interim_transcription(self) -> None: - """Stop interim transcription task.""" - if self._interim_task: - self._interim_task.cancel() - try: - await self._interim_task - except asyncio.CancelledError: - pass - self._interim_task = None - - async def _interim_loop(self) -> None: - """Background loop for interim transcriptions.""" - import time - - while self._running: - try: - await asyncio.sleep(self.interim_interval_ms / 1000) - - # Check if we have enough new audio - current_time = time.time() - time_since_last = (current_time - self._last_interim_time) * 1000 - - if time_since_last >= self.interim_interval_ms: - audio_duration = self.get_audio_duration_ms() - - if audio_duration >= self.min_audio_for_interim_ms: - await self.transcribe_buffer(is_final=False) - self._last_interim_time = current_time - - except asyncio.CancelledError: - break - except Exception as e: - logger.error(f"Interim transcription error: {e}") +__all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"] diff --git a/engine/services/siliconflow_tts.py b/engine/services/siliconflow_tts.py index 6d2dd8c..3cdf32a 100644 --- a/engine/services/siliconflow_tts.py +++ b/engine/services/siliconflow_tts.py @@ -1,311 +1,8 @@ -"""SiliconFlow TTS Service with streaming support. +"""Backward-compatible imports for legacy siliconflow_tts module.""" -Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency -text-to-speech synthesis with streaming. 
+from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter -API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech -""" +# Backward-compatible alias +SiliconFlowTTSService = OpenAICompatibleTTSService -import os -import asyncio -import aiohttp -from typing import AsyncIterator, Optional -from loguru import logger - -from services.base import BaseTTSService, TTSChunk, ServiceState -from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export - - -class SiliconFlowTTSService(BaseTTSService): - """ - SiliconFlow TTS service with streaming support. - - Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models. - """ - - # Available voices - VOICES = { - "alex": "FunAudioLLM/CosyVoice2-0.5B:alex", - "anna": "FunAudioLLM/CosyVoice2-0.5B:anna", - "bella": "FunAudioLLM/CosyVoice2-0.5B:bella", - "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin", - "charles": "FunAudioLLM/CosyVoice2-0.5B:charles", - "claire": "FunAudioLLM/CosyVoice2-0.5B:claire", - "david": "FunAudioLLM/CosyVoice2-0.5B:david", - "diana": "FunAudioLLM/CosyVoice2-0.5B:diana", - } - - def __init__( - self, - api_key: Optional[str] = None, - voice: str = "anna", - model: str = "FunAudioLLM/CosyVoice2-0.5B", - sample_rate: int = 16000, - speed: float = 1.0 - ): - """ - Initialize SiliconFlow TTS service. 
- - Args: - api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var) - voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana) - model: Model name - sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100) - speed: Speech speed (0.25 to 4.0) - """ - # Resolve voice name - if voice in self.VOICES: - full_voice = self.VOICES[voice] - else: - full_voice = voice - - super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed) - - self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY") - self.model = model - self.api_url = "https://api.siliconflow.cn/v1/audio/speech" - - self._session: Optional[aiohttp.ClientSession] = None - self._cancel_event = asyncio.Event() - - async def connect(self) -> None: - """Initialize HTTP session.""" - if not self.api_key: - raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.") - - self._session = aiohttp.ClientSession( - headers={ - "Authorization": f"Bearer {self.api_key}", - "Content-Type": "application/json" - } - ) - self.state = ServiceState.CONNECTED - logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}") - - async def disconnect(self) -> None: - """Close HTTP session.""" - if self._session: - await self._session.close() - self._session = None - self.state = ServiceState.DISCONNECTED - logger.info("SiliconFlow TTS service disconnected") - - async def synthesize(self, text: str) -> bytes: - """Synthesize complete audio for text.""" - audio_data = b"" - async for chunk in self.synthesize_stream(text): - audio_data += chunk.audio - return audio_data - - async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: - """ - Synthesize audio in streaming mode. 
- - Args: - text: Text to synthesize - - Yields: - TTSChunk objects with PCM audio - """ - if not self._session: - raise RuntimeError("TTS service not connected") - - if not text.strip(): - return - - self._cancel_event.clear() - - payload = { - "model": self.model, - "input": text, - "voice": self.voice, - "response_format": "pcm", - "sample_rate": self.sample_rate, - "stream": True, - "speed": self.speed - } - - try: - async with self._session.post(self.api_url, json=payload) as response: - if response.status != 200: - error_text = await response.text() - logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}") - return - - # Stream audio chunks - chunk_size = self.sample_rate * 2 // 10 # 100ms chunks - buffer = b"" - pending_chunk = None - - async for chunk in response.content.iter_any(): - if self._cancel_event.is_set(): - logger.info("TTS synthesis cancelled") - return - - buffer += chunk - - # Yield complete chunks - while len(buffer) >= chunk_size: - audio_chunk = buffer[:chunk_size] - buffer = buffer[chunk_size:] - - # Keep one full chunk buffered so we can always tag the true - # last full chunk as final when stream length is an exact multiple. - if pending_chunk is not None: - yield TTSChunk( - audio=pending_chunk, - sample_rate=self.sample_rate, - is_final=False - ) - pending_chunk = audio_chunk - - # Flush pending chunk(s) and remaining tail. 
- if pending_chunk is not None: - if buffer: - yield TTSChunk( - audio=pending_chunk, - sample_rate=self.sample_rate, - is_final=False - ) - pending_chunk = None - else: - yield TTSChunk( - audio=pending_chunk, - sample_rate=self.sample_rate, - is_final=True - ) - pending_chunk = None - - if buffer: - yield TTSChunk( - audio=buffer, - sample_rate=self.sample_rate, - is_final=True - ) - - except asyncio.CancelledError: - logger.info("TTS synthesis cancelled via asyncio") - raise - except Exception as e: - logger.error(f"TTS synthesis error: {e}") - raise - - async def cancel(self) -> None: - """Cancel ongoing synthesis.""" - self._cancel_event.set() - - -class StreamingTTSAdapter: - """ - Adapter for streaming LLM text to TTS with sentence-level chunking. - - This reduces latency by starting TTS as soon as a complete sentence - is received from the LLM, rather than waiting for the full response. - """ - - # Sentence delimiters - SENTENCE_ENDS = {',', '。', '!', '?', '.', '!', '?', '\n'} - - def __init__(self, tts_service: BaseTTSService, transport, session_id: str): - self.tts_service = tts_service - self.transport = transport - self.session_id = session_id - self._buffer = "" - self._cancel_event = asyncio.Event() - self._is_speaking = False - - def _is_non_sentence_period(self, text: str, idx: int) -> bool: - """Check whether '.' should NOT be treated as a sentence delimiter.""" - if text[idx] != ".": - return False - - # Decimal/version segment: 1.2, v1.2.3 - if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit(): - return True - - # Number abbreviations: No.1 / No. 
1 - left_start = idx - 1 - while left_start >= 0 and text[left_start].isalpha(): - left_start -= 1 - left_token = text[left_start + 1:idx].lower() - if left_token == "no": - j = idx + 1 - while j < len(text) and text[j].isspace(): - j += 1 - if j < len(text) and text[j].isdigit(): - return True - - return False - - async def process_text_chunk(self, text_chunk: str) -> None: - """ - Process a text chunk from LLM and trigger TTS when sentence is complete. - - Args: - text_chunk: Text chunk from LLM streaming - """ - if self._cancel_event.is_set(): - return - - self._buffer += text_chunk - - # Check for sentence completion - while True: - split_idx = -1 - for i, char in enumerate(self._buffer): - if char == "." and self._is_non_sentence_period(self._buffer, i): - continue - if char in self.SENTENCE_ENDS: - split_idx = i - break - if split_idx < 0: - break - - end_idx = split_idx + 1 - while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS: - end_idx += 1 - - sentence = self._buffer[:end_idx].strip() - self._buffer = self._buffer[end_idx:] - - if sentence and any(ch.isalnum() for ch in sentence): - await self._speak_sentence(sentence) - - async def flush(self) -> None: - """Flush remaining buffer.""" - if self._buffer.strip() and not self._cancel_event.is_set(): - await self._speak_sentence(self._buffer.strip()) - self._buffer = "" - - async def _speak_sentence(self, text: str) -> None: - """Synthesize and send a sentence.""" - if not text or self._cancel_event.is_set(): - return - - self._is_speaking = True - - try: - async for chunk in self.tts_service.synthesize_stream(text): - if self._cancel_event.is_set(): - break - await self.transport.send_audio(chunk.audio) - await asyncio.sleep(0.01) # Prevent flooding - except Exception as e: - logger.error(f"TTS speak error: {e}") - finally: - self._is_speaking = False - - def cancel(self) -> None: - """Cancel ongoing speech.""" - self._cancel_event.set() - self._buffer = "" - - def reset(self) 
-> None: - """Reset for new turn.""" - self._cancel_event.clear() - self._buffer = "" - self._is_speaking = False - - @property - def is_speaking(self) -> bool: - return self._is_speaking +__all__ = ["OpenAICompatibleTTSService", "SiliconFlowTTSService", "StreamingTTSAdapter"] diff --git a/web/pages/ASRLibrary.tsx b/web/pages/ASRLibrary.tsx index fe3b113..fbba731 100644 --- a/web/pages/ASRLibrary.tsx +++ b/web/pages/ASRLibrary.tsx @@ -85,7 +85,7 @@ const convertRecordedBlobToWav = async (blob: Blob): Promise => { export const ASRLibraryPage: React.FC = () => { const [models, setModels] = useState([]); const [searchTerm, setSearchTerm] = useState(''); - const [vendorFilter, setVendorFilter] = useState('all'); + const [vendorFilter, setVendorFilter] = useState('OpenAI Compatible'); const [langFilter, setLangFilter] = useState('all'); const [isAddModalOpen, setIsAddModalOpen] = useState(false); const [editingModel, setEditingModel] = useState(null); @@ -111,7 +111,7 @@ export const ASRLibraryPage: React.FC = () => { const filteredModels = models.filter((m) => { const q = searchTerm.toLowerCase(); const matchesSearch = m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q); - const matchesVendor = vendorFilter === 'all' || m.vendor === vendorFilter; + const matchesVendor = m.vendor === vendorFilter; const matchesLang = langFilter === 'all' || m.language === langFilter || (langFilter !== 'all' && m.language === 'Multi-lingual'); return matchesSearch && matchesVendor && matchesLang; }); @@ -134,8 +134,6 @@ export const ASRLibraryPage: React.FC = () => { setModels((prev) => prev.filter((m) => m.id !== id)); }; - const vendorOptions = Array.from(new Set(models.map((m) => m.vendor).filter(Boolean))); - return (
@@ -162,10 +160,7 @@ export const ASRLibraryPage: React.FC = () => { value={vendorFilter} onChange={(e) => setVendorFilter(e.target.value)} > - - {vendorOptions.map((vendor) => ( - - ))} +
@@ -371,7 +366,6 @@ const ASRModelModal: React.FC<{ onChange={(e) => setVendor(e.target.value)} > -
diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index ac4ef82..68a93d4 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -5,32 +5,37 @@ import { Button, Input, Badge, Drawer, Dialog } from '../components/UI'; import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types'; import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, updateAssistant as updateAssistantApi } from '../services/backendApi'; -const isSiliconflowVendor = (vendor?: string) => { +const isOpenAICompatibleVendor = (vendor?: string) => { const normalized = String(vendor || '').trim().toLowerCase(); - return normalized === 'siliconflow' || normalized === '硅基流动'; + return ( + normalized === 'siliconflow' || + normalized === '硅基流动' || + normalized === 'openai compatible' || + normalized === 'openai-compatible' + ); }; -const SILICONFLOW_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; +const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; -const buildSiliconflowVoiceKey = (voiceId: string, model?: string) => { +const buildOpenAICompatibleVoiceKey = (voiceId: string, model?: string) => { const id = String(voiceId || '').trim(); if (!id) return ''; if (id.includes(':')) return id; - return `${model || SILICONFLOW_DEFAULT_MODEL}:${id}`; + return `${model || OPENAI_COMPATIBLE_DEFAULT_MODEL}:${id}`; }; const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => { const explicitKey = String(voice.voiceKey || '').trim(); - if (!isSiliconflowVendor(voice.vendor)) { + if (!isOpenAICompatibleVendor(voice.vendor)) { return explicitKey || selectedVoiceId; } if (voice.isSystem) { - const canonical = buildSiliconflowVoiceKey(selectedVoiceId, voice.model); + const canonical = buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model); if (!explicitKey) return canonical; const explicitSuffix = explicitKey.includes(':') ? 
explicitKey.split(':').pop() : explicitKey; if (explicitSuffix && explicitSuffix !== selectedVoiceId) return canonical; } - return explicitKey || buildSiliconflowVoiceKey(selectedVoiceId, voice.model); + return explicitKey || buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model); }; const renderToolIcon = (icon: string) => { @@ -1830,11 +1835,11 @@ export const DebugDrawer: React.FC<{ if (assistant.asrModelId) { const asr = asrModels.find((item) => item.id === assistant.asrModelId); if (asr) { - const asrProvider = isSiliconflowVendor(asr.vendor) ? 'siliconflow' : 'buffered'; + const asrProvider = isOpenAICompatibleVendor(asr.vendor) ? 'openai_compatible' : 'buffered'; services.asr = { provider: asrProvider, model: asr.modelName || asr.name, - apiKey: asrProvider === 'siliconflow' ? asr.apiKey : null, + apiKey: asrProvider === 'openai_compatible' ? asr.apiKey : null, }; } else { warnings.push(`ASR model not found in loaded list: ${assistant.asrModelId}`); @@ -1844,12 +1849,12 @@ export const DebugDrawer: React.FC<{ if (assistant.voice) { const voice = voices.find((item) => item.id === assistant.voice); if (voice) { - const ttsProvider = isSiliconflowVendor(voice.vendor) ? 'siliconflow' : 'edge'; + const ttsProvider = isOpenAICompatibleVendor(voice.vendor) ? 'openai_compatible' : 'edge'; services.tts = { enabled: ttsEnabled, provider: ttsProvider, model: voice.model, - apiKey: ttsProvider === 'siliconflow' ? voice.apiKey : null, + apiKey: ttsProvider === 'openai_compatible' ? 
voice.apiKey : null, voice: resolveRuntimeTtsVoice(assistant.voice, voice), speed: assistant.speed || voice.speed || 1.0, }; diff --git a/web/pages/LLMLibrary.tsx b/web/pages/LLMLibrary.tsx index 89e24d6..017eb99 100644 --- a/web/pages/LLMLibrary.tsx +++ b/web/pages/LLMLibrary.tsx @@ -13,7 +13,7 @@ const maskApiKey = (key?: string) => { export const LLMLibraryPage: React.FC = () => { const [models, setModels] = useState([]); const [searchTerm, setSearchTerm] = useState(''); - const [vendorFilter, setVendorFilter] = useState('all'); + const [vendorFilter, setVendorFilter] = useState('OpenAI Compatible'); const [typeFilter, setTypeFilter] = useState('all'); const [isAddModalOpen, setIsAddModalOpen] = useState(false); const [editingModel, setEditingModel] = useState(null); @@ -41,7 +41,7 @@ export const LLMLibraryPage: React.FC = () => { m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q) || (m.baseUrl || '').toLowerCase().includes(q); - const matchesVendor = vendorFilter === 'all' || m.vendor === vendorFilter; + const matchesVendor = m.vendor === vendorFilter; const matchesType = typeFilter === 'all' || m.type === typeFilter; return matchesSearch && matchesVendor && matchesType; }); @@ -64,8 +64,6 @@ export const LLMLibraryPage: React.FC = () => { setModels((prev) => prev.filter((item) => item.id !== id)); }; - const vendorOptions = Array.from(new Set(models.map((m) => m.vendor).filter(Boolean))); - return (
@@ -92,10 +90,7 @@ export const LLMLibraryPage: React.FC = () => { value={vendorFilter} onChange={(e) => setVendorFilter(e.target.value)} > - - {vendorOptions.map((vendor) => ( - - ))} +
@@ -284,8 +279,6 @@ const LLMModelModal: React.FC<{ onChange={(e) => setVendor(e.target.value)} > - -
diff --git a/web/pages/VoiceLibrary.tsx b/web/pages/VoiceLibrary.tsx index 73f7679..cb6f574 100644 --- a/web/pages/VoiceLibrary.tsx +++ b/web/pages/VoiceLibrary.tsx @@ -1,12 +1,12 @@ import React, { useEffect, useState, useRef } from 'react'; -import { Search, Mic2, Play, Pause, Upload, Filter, Plus, Volume2, Sparkles, ChevronDown, Pencil, Trash2 } from 'lucide-react'; +import { Search, Mic2, Play, Pause, Upload, Filter, Plus, Volume2, Pencil, Trash2 } from 'lucide-react'; import { Button, Input, TableHeader, TableRow, TableHead, TableCell, Dialog, Badge } from '../components/UI'; import { Voice } from '../types'; import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi'; -const SILICONFLOW_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; +const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; -const buildSiliconflowVoiceKey = (rawId: string, model: string): string => { +const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => { const id = (rawId || '').trim(); if (!id) return `${model}:anna`; return id.includes(':') ? 
id : `${model}:${id}`; @@ -15,7 +15,7 @@ const buildSiliconflowVoiceKey = (rawId: string, model: string): string => { export const VoiceLibraryPage: React.FC = () => { const [voices, setVoices] = useState([]); const [searchTerm, setSearchTerm] = useState(''); - const [vendorFilter, setVendorFilter] = useState<'all' | 'Ali' | 'Volcano' | 'Minimax' | '硅基流动' | 'SiliconFlow'>('all'); + const [vendorFilter, setVendorFilter] = useState<'OpenAI Compatible'>('OpenAI Compatible'); const [genderFilter, setGenderFilter] = useState<'all' | 'Male' | 'Female'>('all'); const [langFilter, setLangFilter] = useState<'all' | 'zh' | 'en'>('all'); @@ -44,7 +44,7 @@ export const VoiceLibraryPage: React.FC = () => { const filteredVoices = voices.filter((voice) => { const matchesSearch = voice.name.toLowerCase().includes(searchTerm.toLowerCase()); - const matchesVendor = vendorFilter === 'all' || voice.vendor === vendorFilter; + const matchesVendor = voice.vendor === vendorFilter; const matchesGender = genderFilter === 'all' || voice.gender === genderFilter; const matchesLang = langFilter === 'all' || voice.language === langFilter; return matchesSearch && matchesVendor && matchesGender && matchesLang; @@ -138,12 +138,7 @@ export const VoiceLibraryPage: React.FC = () => { value={vendorFilter} onChange={(e) => setVendorFilter(e.target.value as any)} > - - - - - - +
@@ -187,15 +182,12 @@ export const VoiceLibraryPage: React.FC = () => {
- - {voice.vendor === '硅基流动' && } - {voice.name} - + {voice.name} {voice.description && {voice.description}}
- {voice.vendor} + {voice.vendor} {voice.gender === 'Male' ? '男' : '女'} {voice.language === 'zh' ? '中文' : 'English'} @@ -254,17 +246,15 @@ const AddVoiceModal: React.FC<{ onSuccess: (voice: Voice) => Promise; initialVoice?: Voice; }> = ({ isOpen, onClose, onSuccess, initialVoice }) => { - const [vendor, setVendor] = useState<'硅基流动' | 'Ali' | 'Volcano' | 'Minimax'>('硅基流动'); + const [vendor, setVendor] = useState<'OpenAI Compatible'>('OpenAI Compatible'); const [name, setName] = useState(''); - const [sfModel, setSfModel] = useState(SILICONFLOW_DEFAULT_MODEL); + const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL); const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna'); const [sfSpeed, setSfSpeed] = useState(1); const [sfGain, setSfGain] = useState(0); const [sfPitch, setSfPitch] = useState(0); - const [model, setModel] = useState(''); - const [voiceKey, setVoiceKey] = useState(''); const [gender, setGender] = useState('Female'); const [language, setLanguage] = useState('zh'); const [description, setDescription] = useState(''); @@ -278,17 +268,15 @@ const AddVoiceModal: React.FC<{ useEffect(() => { if (!initialVoice) return; - const nextVendor = initialVoice.vendor === 'SiliconFlow' ? 
'硅基流动' : initialVoice.vendor; - const nextModel = initialVoice.model || SILICONFLOW_DEFAULT_MODEL; - const defaultVoiceKey = buildSiliconflowVoiceKey(initialVoice.id || initialVoice.name || '', nextModel); - setVendor((nextVendor as any) || '硅基流动'); + const nextVendor = 'OpenAI Compatible'; + const nextModel = initialVoice.model || OPENAI_COMPATIBLE_DEFAULT_MODEL; + const defaultVoiceKey = buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel); + setVendor(nextVendor); setName(initialVoice.name || ''); setGender(initialVoice.gender || 'Female'); setLanguage(initialVoice.language || 'zh'); setDescription(initialVoice.description || ''); - setModel(initialVoice.model || ''); - setVoiceKey(initialVoice.voiceKey || ''); - setSfModel(nextModel); + setOpenaiCompatibleModel(nextModel); setSfVoiceId((initialVoice.voiceKey || '').trim() || defaultVoiceKey); setSfSpeed(initialVoice.speed ?? 1); setSfGain(initialVoice.gain ?? 0); @@ -325,21 +313,21 @@ const AddVoiceModal: React.FC<{ return; } - const resolvedSiliconflowVoiceKey = (() => { + const resolvedVoiceKey = (() => { const current = (sfVoiceId || '').trim(); if (current) return current; - return buildSiliconflowVoiceKey(initialVoice?.id || name, sfModel || SILICONFLOW_DEFAULT_MODEL); + return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, openaiCompatibleModel || OPENAI_COMPATIBLE_DEFAULT_MODEL); })(); const newVoice: Voice = { - id: initialVoice?.id || `${vendor === '硅基流动' ? 'sf' : 'gen'}-${Date.now()}`, + id: initialVoice?.id || `oa-${Date.now()}`, name, vendor, gender, language, - description: description || (vendor === '硅基流动' ? `Model: ${sfModel}` : `Model: ${model}`), - model: vendor === '硅基流动' ? sfModel : model, - voiceKey: vendor === '硅基流动' ? 
resolvedSiliconflowVoiceKey : voiceKey, + description: description || `Model: ${openaiCompatibleModel}`, + model: openaiCompatibleModel, + voiceKey: resolvedVoiceKey, apiKey, baseUrl, speed: sfSpeed, @@ -351,10 +339,8 @@ const AddVoiceModal: React.FC<{ setIsSaving(true); await onSuccess(newVoice); setName(''); - setVendor('硅基流动'); + setVendor('OpenAI Compatible'); setDescription(''); - setModel(''); - setVoiceKey(''); setApiKey(''); setBaseUrl(''); } catch (error: any) { @@ -381,19 +367,7 @@ const AddVoiceModal: React.FC<{
-
- - -
+
@@ -403,15 +377,14 @@ const AddVoiceModal: React.FC<{ setName(e.target.value)} placeholder="例如: 客服小美" />
- {vendor === '硅基流动' ? ( -
+
setSfModel(e.target.value)} + value={openaiCompatibleModel} + onChange={(e) => setOpenaiCompatibleModel(e.target.value)} placeholder="例如: FunAudioLLM/CosyVoice2-0.5B" />
@@ -445,20 +418,6 @@ const AddVoiceModal: React.FC<{
- ) : ( -
-
-
- - setModel(e.target.value)} placeholder="API Model Key" /> -
-
- - setVoiceKey(e.target.value)} placeholder="Voice Key" /> -
-
-
- )}
@@ -560,7 +519,7 @@ const CloneVoiceModal: React.FC<{ const newVoice: Voice = { id: `v-${Date.now()}`, name, - vendor: 'Volcano', + vendor: 'OpenAI Compatible', gender: 'Female', language: 'zh', description: description || 'User cloned voice', diff --git a/web/services/backendApi.ts b/web/services/backendApi.ts index bea6c58..d892b48 100644 --- a/web/services/backendApi.ts +++ b/web/services/backendApi.ts @@ -55,8 +55,11 @@ const mapVoice = (raw: AnyRecord): Voice => ({ id: String(readField(raw, ['id'], '')), name: readField(raw, ['name'], ''), vendor: ((): string => { - const vendor = String(readField(raw, ['vendor'], '')); - return vendor.toLowerCase() === 'siliconflow' ? '硅基流动' : vendor; + const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase(); + if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') { + return 'OpenAI Compatible'; + } + return String(readField(raw, ['vendor'], 'OpenAI Compatible')) || 'OpenAI Compatible'; })(), gender: readField(raw, ['gender'], ''), language: readField(raw, ['language'], ''), @@ -296,7 +299,7 @@ export const createVoice = async (data: Partial): Promise => { const payload = { id: data.id || undefined, name: data.name || 'New Voice', - vendor: data.vendor === '硅基流动' ? 'SiliconFlow' : (data.vendor || 'SiliconFlow'), + vendor: data.vendor || 'OpenAI Compatible', gender: data.gender || 'Female', language: data.language || 'zh', description: data.description || '', @@ -316,7 +319,7 @@ export const createVoice = async (data: Partial): Promise => { export const updateVoice = async (id: string, data: Partial): Promise => { const payload = { name: data.name, - vendor: data.vendor === '硅基流动' ? 
'SiliconFlow' : data.vendor, + vendor: data.vendor, gender: data.gender, language: data.language, description: data.description, diff --git a/web/services/mockData.ts b/web/services/mockData.ts index 6a095f8..d8c6548 100644 --- a/web/services/mockData.ts +++ b/web/services/mockData.ts @@ -200,7 +200,7 @@ export const mockLLMModels: LLMModel[] = [ { id: 'm1', name: 'GPT-4o', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***', temperature: 0.7 }, { id: 'm2', name: 'DeepSeek-V3', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.deepseek.com', apiKey: 'sk-***', temperature: 0.5 }, { id: 'm3', name: 'text-embedding-3-small', vendor: 'OpenAI Compatible', type: 'embedding', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***' }, - { id: 'm4', name: 'bge-reranker-v2-m3', vendor: 'SiliconFlow', type: 'rerank', baseUrl: 'https://api.siliconflow.cn/v1', apiKey: 'sk-***' }, + { id: 'm4', name: 'bge-reranker-v2-m3', vendor: 'OpenAI Compatible', type: 'rerank', baseUrl: 'https://api.siliconflow.cn/v1', apiKey: 'sk-***' }, ]; export const mockASRModels: ASRModel[] = [