Use openai compatible as vendor

This commit is contained in:
Xin Wang
2026-02-12 18:44:55 +08:00
parent 260ff621bf
commit ff3a03b1ad
23 changed files with 822 additions and 905 deletions

View File

@@ -16,16 +16,22 @@ from ..schemas import (
router = APIRouter(prefix="/asr", tags=["ASR Models"]) router = APIRouter(prefix="/asr", tags=["ASR Models"])
SILICONFLOW_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall" OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
def _is_siliconflow_vendor(vendor: str) -> bool: def _is_openai_compatible_vendor(vendor: str) -> bool:
return (vendor or "").strip().lower() in {"siliconflow", "硅基流动"} normalized = (vendor or "").strip().lower()
return normalized in {
"openai compatible",
"openai-compatible",
"siliconflow", # backward compatibility
"硅基流动", # backward compatibility
}
def _default_asr_model(vendor: str) -> str: def _default_asr_model(vendor: str) -> str:
if _is_siliconflow_vendor(vendor): if _is_openai_compatible_vendor(vendor):
return SILICONFLOW_DEFAULT_ASR_MODEL return OPENAI_COMPATIBLE_DEFAULT_ASR_MODEL
return "whisper-1" return "whisper-1"
@@ -129,7 +135,7 @@ def test_asr_model(
# 连接性测试优先,避免依赖真实音频输入 # 连接性测试优先,避免依赖真实音频输入
headers = {"Authorization": f"Bearer {model.api_key}"} headers = {"Authorization": f"Bearer {model.api_key}"}
with httpx.Client(timeout=60.0) as client: with httpx.Client(timeout=60.0) as client:
if model.vendor.lower() in ["siliconflow", "paraformer"]: if _is_openai_compatible_vendor(model.vendor) or model.vendor.lower() == "paraformer":
response = client.get(f"{model.base_url}/asr", headers=headers) response = client.get(f"{model.base_url}/asr", headers=headers)
elif model.vendor.lower() == "openai": elif model.vendor.lower() == "openai":
response = client.get(f"{model.base_url}/audio/models", headers=headers) response = client.get(f"{model.base_url}/audio/models", headers=headers)
@@ -258,7 +264,7 @@ async def preview_asr_model(
raise HTTPException(status_code=400, detail="Uploaded audio file is empty") raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
effective_api_key = (api_key or "").strip() or (model.api_key or "").strip() effective_api_key = (api_key or "").strip() or (model.api_key or "").strip()
if not effective_api_key and _is_siliconflow_vendor(model.vendor): if not effective_api_key and _is_openai_compatible_vendor(model.vendor):
effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
if not effective_api_key: if not effective_api_key:
raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}") raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}")

View File

@@ -13,8 +13,13 @@ from ..schemas import (
router = APIRouter(prefix="/assistants", tags=["Assistants"]) router = APIRouter(prefix="/assistants", tags=["Assistants"])
def _is_siliconflow_vendor(vendor: Optional[str]) -> bool: def _is_openai_compatible_vendor(vendor: Optional[str]) -> bool:
return (vendor or "").strip().lower() in {"siliconflow", "硅基流动"} return (vendor or "").strip().lower() in {
"siliconflow",
"硅基流动",
"openai compatible",
"openai-compatible",
}
def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict: def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
@@ -47,11 +52,11 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
if assistant.asr_model_id: if assistant.asr_model_id:
asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first() asr = db.query(ASRModel).filter(ASRModel.id == assistant.asr_model_id).first()
if asr: if asr:
asr_provider = "siliconflow" if _is_siliconflow_vendor(asr.vendor) else "buffered" asr_provider = "openai_compatible" if _is_openai_compatible_vendor(asr.vendor) else "buffered"
metadata["services"]["asr"] = { metadata["services"]["asr"] = {
"provider": asr_provider, "provider": asr_provider,
"model": asr.model_name or asr.name, "model": asr.model_name or asr.name,
"apiKey": asr.api_key if asr_provider == "siliconflow" else None, "apiKey": asr.api_key if asr_provider == "openai_compatible" else None,
} }
else: else:
warnings.append(f"ASR model not found: {assistant.asr_model_id}") warnings.append(f"ASR model not found: {assistant.asr_model_id}")
@@ -61,12 +66,12 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
elif assistant.voice: elif assistant.voice:
voice = db.query(Voice).filter(Voice.id == assistant.voice).first() voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
if voice: if voice:
tts_provider = "siliconflow" if _is_siliconflow_vendor(voice.vendor) else "edge" tts_provider = "openai_compatible" if _is_openai_compatible_vendor(voice.vendor) else "edge"
metadata["services"]["tts"] = { metadata["services"]["tts"] = {
"enabled": True, "enabled": True,
"provider": tts_provider, "provider": tts_provider,
"model": voice.model, "model": voice.model,
"apiKey": voice.api_key if tts_provider == "siliconflow" else None, "apiKey": voice.api_key if tts_provider == "openai_compatible" else None,
"voice": voice.voice_key or voice.id, "voice": voice.voice_key or voice.id,
"speed": assistant.speed or voice.speed, "speed": assistant.speed or voice.speed,
} }

View File

@@ -467,7 +467,13 @@ def _test_asr_model(db: Session, model_id: str, result: AutotestResult):
headers = {"Authorization": f"Bearer {model.api_key}"} headers = {"Authorization": f"Bearer {model.api_key}"}
with httpx.Client(timeout=30.0) as client: with httpx.Client(timeout=30.0) as client:
if model.vendor.lower() in ["siliconflow", "paraformer"]: normalized_vendor = (model.vendor or "").strip().lower()
if normalized_vendor in [
"openai compatible",
"openai-compatible",
"siliconflow", # backward compatibility
"paraformer",
]:
response = client.get( response = client.get(
f"{model.base_url}/asr", f"{model.base_url}/asr",
headers=headers headers=headers

View File

@@ -13,20 +13,26 @@ from ..schemas import VoiceCreate, VoiceOut, VoicePreviewRequest, VoicePreviewRe
router = APIRouter(prefix="/voices", tags=["Voices"]) router = APIRouter(prefix="/voices", tags=["Voices"])
SILICONFLOW_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B" OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
def _is_siliconflow_vendor(vendor: str) -> bool: def _is_openai_compatible_vendor(vendor: str) -> bool:
return vendor.strip().lower() in {"siliconflow", "硅基流动"} normalized = (vendor or "").strip().lower()
return normalized in {
"openai compatible",
"openai-compatible",
"siliconflow", # backward compatibility
"硅基流动", # backward compatibility
}
def _default_base_url(vendor: str) -> Optional[str]: def _default_base_url(vendor: str) -> Optional[str]:
if _is_siliconflow_vendor(vendor): if _is_openai_compatible_vendor(vendor):
return "https://api.siliconflow.cn/v1" return "https://api.siliconflow.cn/v1"
return None return None
def _build_siliconflow_voice_key(voice: Voice, model: str) -> str: def _build_openai_compatible_voice_key(voice: Voice, model: str) -> str:
if voice.voice_key: if voice.voice_key:
return voice.voice_key return voice.voice_key
if ":" in voice.id: if ":" in voice.id:
@@ -65,8 +71,8 @@ def create_voice(data: VoiceCreate, db: Session = Depends(get_db)):
model = data.model model = data.model
voice_key = data.voice_key voice_key = data.voice_key
if _is_siliconflow_vendor(vendor): if _is_openai_compatible_vendor(vendor):
model = model or SILICONFLOW_DEFAULT_MODEL model = model or OPENAI_COMPATIBLE_DEFAULT_MODEL
if not voice_key: if not voice_key:
raw_id = (data.id or data.name).strip() raw_id = (data.id or data.name).strip()
voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}" voice_key = raw_id if ":" in raw_id else f"{model}:{raw_id}"
@@ -115,11 +121,11 @@ def update_voice(id: str, data: VoiceUpdate, db: Session = Depends(get_db)):
update_data["vendor"] = update_data["vendor"].strip() update_data["vendor"] = update_data["vendor"].strip()
vendor_for_defaults = update_data.get("vendor", voice.vendor) vendor_for_defaults = update_data.get("vendor", voice.vendor)
if _is_siliconflow_vendor(vendor_for_defaults): if _is_openai_compatible_vendor(vendor_for_defaults):
model = update_data.get("model") or voice.model or SILICONFLOW_DEFAULT_MODEL model = update_data.get("model") or voice.model or OPENAI_COMPATIBLE_DEFAULT_MODEL
voice_key = update_data.get("voice_key") or voice.voice_key voice_key = update_data.get("voice_key") or voice.voice_key
update_data["model"] = model update_data["model"] = model
update_data["voice_key"] = voice_key or _build_siliconflow_voice_key(voice, model) update_data["voice_key"] = voice_key or _build_openai_compatible_voice_key(voice, model)
for field, value in update_data.items(): for field, value in update_data.items():
setattr(voice, field, value) setattr(voice, field, value)
@@ -152,7 +158,7 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_
raise HTTPException(status_code=400, detail="Preview text cannot be empty") raise HTTPException(status_code=400, detail="Preview text cannot be empty")
api_key = (data.api_key or "").strip() or (voice.api_key or "").strip() api_key = (data.api_key or "").strip() or (voice.api_key or "").strip()
if not api_key and _is_siliconflow_vendor(voice.vendor): if not api_key and _is_openai_compatible_vendor(voice.vendor):
api_key = os.getenv("SILICONFLOW_API_KEY", "").strip() api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
if not api_key: if not api_key:
raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}") raise HTTPException(status_code=400, detail=f"API key is required for voice: {voice.name}")
@@ -161,11 +167,11 @@ def preview_voice(id: str, data: VoicePreviewRequest, db: Session = Depends(get_
if not base_url: if not base_url:
raise HTTPException(status_code=400, detail=f"Base URL is required for voice: {voice.name}") raise HTTPException(status_code=400, detail=f"Base URL is required for voice: {voice.name}")
model = voice.model or SILICONFLOW_DEFAULT_MODEL model = voice.model or OPENAI_COMPATIBLE_DEFAULT_MODEL
payload = { payload = {
"model": model, "model": model,
"input": text, "input": text,
"voice": voice.voice_key or _build_siliconflow_voice_key(voice, model), "voice": voice.voice_key or _build_openai_compatible_voice_key(voice, model),
"response_format": "mp3", "response_format": "mp3",
"speed": data.speed if data.speed is not None else voice.speed, "speed": data.speed if data.speed is not None else voice.speed,
} }

View File

@@ -20,7 +20,7 @@ interface ASRModel {
id: string; // 模型唯一标识 (8位UUID) id: string; // 模型唯一标识 (8位UUID)
user_id: number; // 所属用户ID user_id: number; // 所属用户ID
name: string; // 模型显示名称 name: string; // 模型显示名称
vendor: string; // 供应商: "OpenAI" | "SiliconFlow" | "Paraformer" | 等 vendor: string; // 供应商: "OpenAI Compatible" | "Paraformer" | 等
language: string; // 识别语言: "zh" | "en" | "Multi-lingual" language: string; // 识别语言: "zh" | "en" | "Multi-lingual"
base_url: string; // API Base URL base_url: string; // API Base URL
api_key: string; // API Key api_key: string; // API Key
@@ -64,7 +64,7 @@ GET /api/v1/asr
"id": "abc12345", "id": "abc12345",
"user_id": 1, "user_id": 1,
"name": "Whisper 多语种识别", "name": "Whisper 多语种识别",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"language": "Multi-lingual", "language": "Multi-lingual",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-***", "api_key": "sk-***",
@@ -78,7 +78,7 @@ GET /api/v1/asr
"id": "def67890", "id": "def67890",
"user_id": 1, "user_id": 1,
"name": "SenseVoice 中文识别", "name": "SenseVoice 中文识别",
"vendor": "SiliconFlow", "vendor": "OpenAI Compatible",
"language": "zh", "language": "zh",
"base_url": "https://api.siliconflow.cn/v1", "base_url": "https://api.siliconflow.cn/v1",
"api_key": "sf-***", "api_key": "sf-***",
@@ -114,7 +114,7 @@ GET /api/v1/asr/{id}
"id": "abc12345", "id": "abc12345",
"user_id": 1, "user_id": 1,
"name": "Whisper 多语种识别", "name": "Whisper 多语种识别",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"language": "Multi-lingual", "language": "Multi-lingual",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-***", "api_key": "sk-***",
@@ -140,7 +140,7 @@ POST /api/v1/asr
```json ```json
{ {
"name": "SenseVoice 中文识别", "name": "SenseVoice 中文识别",
"vendor": "SiliconFlow", "vendor": "OpenAI Compatible",
"language": "zh", "language": "zh",
"base_url": "https://api.siliconflow.cn/v1", "base_url": "https://api.siliconflow.cn/v1",
"api_key": "sk-your-api-key", "api_key": "sk-your-api-key",
@@ -157,7 +157,7 @@ POST /api/v1/asr
| 字段 | 类型 | 必填 | 说明 | | 字段 | 类型 | 必填 | 说明 |
|------|------|------|------| |------|------|------|------|
| name | string | 是 | 模型显示名称 | | name | string | 是 | 模型显示名称 |
| vendor | string | 是 | 供应商: "OpenAI" / "SiliconFlow" / "Paraformer" | | vendor | string | 是 | 供应商: "OpenAI Compatible" / "Paraformer" |
| language | string | 是 | 语言: "zh" / "en" / "Multi-lingual" | | language | string | 是 | 语言: "zh" / "en" / "Multi-lingual" |
| base_url | string | 是 | API Base URL | | base_url | string | 是 | API Base URL |
| api_key | string | 是 | API Key | | api_key | string | 是 | API Key |
@@ -347,7 +347,7 @@ class ASRTestResponse(BaseModel):
```json ```json
{ {
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-xxx", "api_key": "sk-xxx",
"model_name": "whisper-1", "model_name": "whisper-1",
@@ -357,11 +357,11 @@ class ASRTestResponse(BaseModel):
} }
``` ```
### SiliconFlow Paraformer ### OpenAI Compatible Paraformer
```json ```json
{ {
"vendor": "SiliconFlow", "vendor": "OpenAI Compatible",
"base_url": "https://api.siliconflow.cn/v1", "base_url": "https://api.siliconflow.cn/v1",
"api_key": "sf-xxx", "api_key": "sf-xxx",
"model_name": "paraformer-v2", "model_name": "paraformer-v2",
@@ -393,7 +393,7 @@ class ASRTestResponse(BaseModel):
| test_filter_asr_models_by_language | 按语言过滤测试 | | test_filter_asr_models_by_language | 按语言过滤测试 |
| test_filter_asr_models_by_enabled | 按启用状态过滤测试 | | test_filter_asr_models_by_enabled | 按启用状态过滤测试 |
| test_create_asr_model_with_hotwords | 热词配置测试 | | test_create_asr_model_with_hotwords | 热词配置测试 |
| test_test_asr_model_siliconflow | SiliconFlow 供应商测试 | | test_test_asr_model_siliconflow | OpenAI Compatible 供应商测试 |
| test_test_asr_model_openai | OpenAI 供应商测试 | | test_test_asr_model_openai | OpenAI 供应商测试 |
| test_different_asr_languages | 多语言测试 | | test_different_asr_languages | 多语言测试 |
| test_different_asr_vendors | 多供应商测试 | | test_different_asr_vendors | 多供应商测试 |

View File

@@ -20,7 +20,7 @@ interface LLMModel {
id: string; // 模型唯一标识 (8位UUID) id: string; // 模型唯一标识 (8位UUID)
user_id: number; // 所属用户ID user_id: number; // 所属用户ID
name: string; // 模型显示名称 name: string; // 模型显示名称
vendor: string; // 供应商: "OpenAI" | "SiliconFlow" | "Dify" | "FastGPT" | 等 vendor: string; // 供应商: "OpenAI Compatible" | "Dify" | "FastGPT" | 等
type: string; // 类型: "text" | "embedding" | "rerank" type: string; // 类型: "text" | "embedding" | "rerank"
base_url: string; // API Base URL base_url: string; // API Base URL
api_key: string; // API Key api_key: string; // API Key
@@ -64,7 +64,7 @@ GET /api/v1/llm
"id": "abc12345", "id": "abc12345",
"user_id": 1, "user_id": 1,
"name": "GPT-4o", "name": "GPT-4o",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"type": "text", "type": "text",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-***", "api_key": "sk-***",
@@ -79,7 +79,7 @@ GET /api/v1/llm
"id": "def67890", "id": "def67890",
"user_id": 1, "user_id": 1,
"name": "Embedding-3-Small", "name": "Embedding-3-Small",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"type": "embedding", "type": "embedding",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-***", "api_key": "sk-***",
@@ -111,7 +111,7 @@ GET /api/v1/llm/{id}
"id": "abc12345", "id": "abc12345",
"user_id": 1, "user_id": 1,
"name": "GPT-4o", "name": "GPT-4o",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"type": "text", "type": "text",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-***", "api_key": "sk-***",
@@ -137,7 +137,7 @@ POST /api/v1/llm
```json ```json
{ {
"name": "GPT-4o", "name": "GPT-4o",
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"type": "text", "type": "text",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-your-api-key", "api_key": "sk-your-api-key",
@@ -314,11 +314,11 @@ class LLMModelTestResponse(BaseModel):
## 供应商配置示例 ## 供应商配置示例
### OpenAI ### OpenAI Compatible (OpenAI Endpoint)
```json ```json
{ {
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-xxx", "api_key": "sk-xxx",
"model_name": "gpt-4o", "model_name": "gpt-4o",
@@ -327,11 +327,11 @@ class LLMModelTestResponse(BaseModel):
} }
``` ```
### SiliconFlow ### OpenAI Compatible
```json ```json
{ {
"vendor": "SiliconFlow", "vendor": "OpenAI Compatible",
"base_url": "https://api.siliconflow.com/v1", "base_url": "https://api.siliconflow.com/v1",
"api_key": "sf-xxx", "api_key": "sf-xxx",
"model_name": "deepseek-v3", "model_name": "deepseek-v3",
@@ -356,7 +356,7 @@ class LLMModelTestResponse(BaseModel):
```json ```json
{ {
"vendor": "OpenAI", "vendor": "OpenAI Compatible",
"base_url": "https://api.openai.com/v1", "base_url": "https://api.openai.com/v1",
"api_key": "sk-xxx", "api_key": "sk-xxx",
"model_name": "text-embedding-3-small", "model_name": "text-embedding-3-small",

View File

@@ -20,7 +20,7 @@ interface LLMModel {
id: string; // 模型唯一标识 id: string; // 模型唯一标识
user_id: number; // 所属用户ID user_id: number; // 所属用户ID
name: string; // 模型显示名称 name: string; // 模型显示名称
vendor: string; // 供应商: "OpenAI Compatible" | "SiliconFlow" | "Dify" | "FastGPT" vendor: string; // 供应商: "OpenAI Compatible" | "Dify" | "FastGPT"
type: string; // 类型: "text" | "embedding" | "rerank" type: string; // 类型: "text" | "embedding" | "rerank"
base_url: string; // API Base URL base_url: string; // API Base URL
api_key: string; // API Key api_key: string; // API Key
@@ -57,7 +57,7 @@ interface TTSModel {
id: string; id: string;
user_id: number; user_id: number;
name: string; name: string;
vendor: string; // "Ali" | "Volcano" | "Minimax" | "硅基流动" vendor: string; // "OpenAI Compatible" | "Ali" | "Volcano" | "Minimax"
language: string; // "zh" | "en" language: string; // "zh" | "en"
voice_list?: string[]; // 支持的声音列表 voice_list?: string[]; // 支持的声音列表
enabled: boolean; enabled: boolean;
@@ -316,7 +316,6 @@ class LLMModelType(str, Enum):
class LLMModelVendor(str, Enum): class LLMModelVendor(str, Enum):
OPENAI_COMPATIBLE = "OpenAI Compatible" OPENAI_COMPATIBLE = "OpenAI Compatible"
SILICONFLOW = "SiliconFlow"
DIFY = "Dify" DIFY = "Dify"
FASTGPT = "FastGPT" FASTGPT = "FastGPT"
@@ -389,11 +388,11 @@ class ASRModelOut(ASRModelBase):
} }
``` ```
### SiliconFlow ### OpenAI Compatible
```json ```json
{ {
"vendor": "SiliconFlow", "vendor": "OpenAI Compatible",
"base_url": "https://api.siliconflow.com/v1", "base_url": "https://api.siliconflow.com/v1",
"api_key": "sf-xxx", "api_key": "sf-xxx",
"model_name": "deepseek-v3" "model_name": "deepseek-v3"

View File

@@ -135,21 +135,21 @@ def rebuild_vector_store(reset_doc_status: bool = True):
def init_default_data(): def init_default_data():
with db_session() as db: with db_session() as db:
# 检查是否已有数据 # 检查是否已有数据
# SiliconFlow CosyVoice 2.0 预设声音 (8个) # OpenAI Compatible (SiliconFlow API) CosyVoice 2.0 预设声音 (8个)
# 参考: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech # 参考: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
voices = [ voices = [
# 男声 (Male Voices) # 男声 (Male Voices)
Voice(id="alex", name="Alex", vendor="SiliconFlow", gender="Male", language="en", Voice(id="alex", name="Alex", vendor="OpenAI Compatible", gender="Male", language="en",
description="Steady male voice.", is_system=True), description="Steady male voice.", is_system=True),
Voice(id="david", name="David", vendor="SiliconFlow", gender="Male", language="en", Voice(id="david", name="David", vendor="OpenAI Compatible", gender="Male", language="en",
description="Cheerful male voice.", is_system=True), description="Cheerful male voice.", is_system=True),
# 女声 (Female Voices) # 女声 (Female Voices)
Voice(id="bella", name="Bella", vendor="SiliconFlow", gender="Female", language="en", Voice(id="bella", name="Bella", vendor="OpenAI Compatible", gender="Female", language="en",
description="Passionate female voice.", is_system=True), description="Passionate female voice.", is_system=True),
Voice(id="claire", name="Claire", vendor="SiliconFlow", gender="Female", language="en", Voice(id="claire", name="Claire", vendor="OpenAI Compatible", gender="Female", language="en",
description="Gentle female voice.", is_system=True), description="Gentle female voice.", is_system=True),
] ]
seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (SiliconFlow CosyVoice 2.0)") seed_if_empty(db, Voice, voices, "✅ 默认声音数据已初始化 (OpenAI Compatible CosyVoice 2.0)")
def init_default_tools(recreate: bool = False): def init_default_tools(recreate: bool = False):
@@ -181,7 +181,7 @@ def init_default_assistants():
voice="anna", voice="anna",
speed=1.0, speed=1.0,
hotwords=[], hotwords=[],
tools=["calculator", "current_time"], tools=["current_time"],
interruption_sensitivity=500, interruption_sensitivity=500,
config_mode="platform", config_mode="platform",
llm_model_id="deepseek-chat", llm_model_id="deepseek-chat",
@@ -215,7 +215,7 @@ def init_default_assistants():
voice="alex", voice="alex",
speed=1.0, speed=1.0,
hotwords=["grammar", "vocabulary", "practice"], hotwords=["grammar", "vocabulary", "practice"],
tools=["calculator"], tools=["current_time"],
interruption_sensitivity=400, interruption_sensitivity=400,
config_mode="platform", config_mode="platform",
), ),
@@ -294,7 +294,7 @@ def init_default_llm_models():
id="deepseek-chat", id="deepseek-chat",
user_id=1, user_id=1,
name="DeepSeek Chat", name="DeepSeek Chat",
vendor="SiliconFlow", vendor="OpenAI Compatible",
type="text", type="text",
base_url="https://api.deepseek.com", base_url="https://api.deepseek.com",
api_key="YOUR_API_KEY", # 用户需替换 api_key="YOUR_API_KEY", # 用户需替换
@@ -320,7 +320,7 @@ def init_default_llm_models():
id="text-embedding-3-small", id="text-embedding-3-small",
user_id=1, user_id=1,
name="Embedding 3 Small", name="Embedding 3 Small",
vendor="OpenAI", vendor="OpenAI Compatible",
type="embedding", type="embedding",
base_url="https://api.openai.com/v1", base_url="https://api.openai.com/v1",
api_key="YOUR_API_KEY", api_key="YOUR_API_KEY",
@@ -339,7 +339,7 @@ def init_default_asr_models():
id="FunAudioLLM/SenseVoiceSmall", id="FunAudioLLM/SenseVoiceSmall",
user_id=1, user_id=1,
name="FunAudioLLM/SenseVoiceSmall", name="FunAudioLLM/SenseVoiceSmall",
vendor="SiliconFlow", vendor="OpenAI Compatible",
language="Multi-lingual", language="Multi-lingual",
base_url="https://api.siliconflow.cn/v1", base_url="https://api.siliconflow.cn/v1",
api_key="YOUR_API_KEY", api_key="YOUR_API_KEY",
@@ -353,7 +353,7 @@ def init_default_asr_models():
id="TeleAI/TeleSpeechASR", id="TeleAI/TeleSpeechASR",
user_id=1, user_id=1,
name="TeleAI/TeleSpeechASR", name="TeleAI/TeleSpeechASR",
vendor="SiliconFlow", vendor="OpenAI Compatible",
language="Multi-lingual", language="Multi-lingual",
base_url="https://api.siliconflow.cn/v1", base_url="https://api.siliconflow.cn/v1",
api_key="YOUR_API_KEY", api_key="YOUR_API_KEY",

View File

@@ -41,19 +41,19 @@ LLM_MODEL=gpt-4o-mini
LLM_TEMPERATURE=0.7 LLM_TEMPERATURE=0.7
# TTS # TTS
# edge: no SiliconFlow key needed # edge: no API key needed
# siliconflow: requires SILICONFLOW_API_KEY # openai_compatible: compatible with SiliconFlow-style endpoints
TTS_PROVIDER=siliconflow TTS_PROVIDER=openai_compatible
TTS_VOICE=anna TTS_VOICE=anna
TTS_SPEED=1.0 TTS_SPEED=1.0
# SiliconFlow (used by TTS and/or ASR when provider=siliconflow) # SiliconFlow (used by TTS and/or ASR when provider=openai_compatible)
SILICONFLOW_API_KEY=your_siliconflow_api_key_here SILICONFLOW_API_KEY=your_siliconflow_api_key_here
SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B SILICONFLOW_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall SILICONFLOW_ASR_MODEL=FunAudioLLM/SenseVoiceSmall
# ASR # ASR
ASR_PROVIDER=siliconflow ASR_PROVIDER=openai_compatible
# Interim cadence and minimum audio before interim decode. # Interim cadence and minimum audio before interim decode.
ASR_INTERIM_INTERVAL_MS=500 ASR_INTERIM_INTERVAL_MS=500
ASR_MIN_AUDIO_MS=300 ASR_MIN_AUDIO_MS=300

View File

@@ -44,7 +44,10 @@ class Settings(BaseSettings):
llm_temperature: float = Field(default=0.7, description="LLM temperature for response generation") llm_temperature: float = Field(default=0.7, description="LLM temperature for response generation")
# TTS Configuration # TTS Configuration
tts_provider: str = Field(default="siliconflow", description="TTS provider (edge, siliconflow)") tts_provider: str = Field(
default="openai_compatible",
description="TTS provider (edge, openai_compatible; siliconflow alias supported)"
)
tts_voice: str = Field(default="anna", description="TTS voice name") tts_voice: str = Field(default="anna", description="TTS voice name")
tts_speed: float = Field(default=1.0, description="TTS speech speed multiplier") tts_speed: float = Field(default=1.0, description="TTS speech speed multiplier")
@@ -53,7 +56,10 @@ class Settings(BaseSettings):
siliconflow_tts_model: str = Field(default="FunAudioLLM/CosyVoice2-0.5B", description="SiliconFlow TTS model") siliconflow_tts_model: str = Field(default="FunAudioLLM/CosyVoice2-0.5B", description="SiliconFlow TTS model")
# ASR Configuration # ASR Configuration
asr_provider: str = Field(default="siliconflow", description="ASR provider (siliconflow, buffered)") asr_provider: str = Field(
default="openai_compatible",
description="ASR provider (openai_compatible, buffered; siliconflow alias supported)"
)
siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model") siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms") asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result") asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")

View File

@@ -30,8 +30,8 @@ from processors.vad import SileroVAD, VADProcessor
from services.asr import BufferedASRService from services.asr import BufferedASRService
from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent
from services.llm import MockLLMService, OpenAILLMService from services.llm import MockLLMService, OpenAILLMService
from services.siliconflow_asr import SiliconFlowASRService from services.openai_compatible_asr import OpenAICompatibleASRService
from services.siliconflow_tts import SiliconFlowTTSService from services.openai_compatible_tts import OpenAICompatibleTTSService
from services.streaming_text import extract_tts_sentence, has_spoken_content from services.streaming_text import extract_tts_sentence, has_spoken_content
from services.tts import EdgeTTSService, MockTTSService from services.tts import EdgeTTSService, MockTTSService
@@ -60,57 +60,6 @@ class DuplexPipeline:
_TOOL_WAIT_TIMEOUT_SECONDS = 15.0 _TOOL_WAIT_TIMEOUT_SECONDS = 15.0
_SERVER_TOOL_TIMEOUT_SECONDS = 15.0 _SERVER_TOOL_TIMEOUT_SECONDS = 15.0
_DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = { _DEFAULT_TOOL_SCHEMAS: Dict[str, Dict[str, Any]] = {
"search": {
"name": "search",
"description": "Search the internet for recent information",
"parameters": {
"type": "object",
"properties": {"query": {"type": "string"}},
"required": ["query"],
},
},
"calculator": {
"name": "calculator",
"description": "Evaluate a math expression",
"parameters": {
"type": "object",
"properties": {"expression": {"type": "string"}},
"required": ["expression"],
},
},
"weather": {
"name": "weather",
"description": "Get weather by city name",
"parameters": {
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"],
},
},
"translate": {
"name": "translate",
"description": "Translate text to target language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string"},
"target_lang": {"type": "string"},
},
"required": ["text", "target_lang"],
},
},
"knowledge": {
"name": "knowledge",
"description": "Query knowledge base by question",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"},
"kb_id": {"type": "string"},
},
"required": ["query"],
},
},
"current_time": { "current_time": {
"name": "current_time", "name": "current_time",
"description": "Get current local time", "description": "Get current local time",
@@ -120,51 +69,6 @@ class DuplexPipeline:
"required": [], "required": [],
}, },
}, },
"code_interpreter": {
"name": "code_interpreter",
"description": "Execute Python code in a controlled environment",
"parameters": {
"type": "object",
"properties": {"code": {"type": "string"}},
"required": ["code"],
},
},
"turn_on_camera": {
"name": "turn_on_camera",
"description": "Turn on camera on client device",
"parameters": {
"type": "object",
"properties": {},
"required": [],
},
},
"turn_off_camera": {
"name": "turn_off_camera",
"description": "Turn off camera on client device",
"parameters": {
"type": "object",
"properties": {},
"required": [],
},
},
"increase_volume": {
"name": "increase_volume",
"description": "Increase speaker volume",
"parameters": {
"type": "object",
"properties": {"step": {"type": "integer"}},
"required": [],
},
},
"decrease_volume": {
"name": "decrease_volume",
"description": "Decrease speaker volume",
"parameters": {
"type": "object",
"properties": {"step": {"type": "integer"}},
"required": [],
},
},
} }
def __init__( def __init__(
@@ -386,6 +290,11 @@ class DuplexPipeline:
return False return False
return None return None
@staticmethod
def _is_openai_compatible_provider(provider: Any) -> bool:
normalized = str(provider or "").strip().lower()
return normalized in {"openai_compatible", "openai-compatible", "siliconflow"}
def _tts_output_enabled(self) -> bool: def _tts_output_enabled(self) -> bool:
enabled = self._coerce_bool(self._runtime_tts.get("enabled")) enabled = self._coerce_bool(self._runtime_tts.get("enabled"))
if enabled is not None: if enabled is not None:
@@ -495,15 +404,15 @@ class DuplexPipeline:
tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed) tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
if tts_provider == "siliconflow" and tts_api_key: if self._is_openai_compatible_provider(tts_provider) and tts_api_key:
self.tts_service = SiliconFlowTTSService( self.tts_service = OpenAICompatibleTTSService(
api_key=tts_api_key, api_key=tts_api_key,
voice=tts_voice, voice=tts_voice,
model=tts_model, model=tts_model,
sample_rate=settings.sample_rate, sample_rate=settings.sample_rate,
speed=tts_speed speed=tts_speed
) )
logger.info("Using SiliconFlow TTS service") logger.info("Using OpenAI-compatible TTS service (SiliconFlow implementation)")
else: else:
self.tts_service = EdgeTTSService( self.tts_service = EdgeTTSService(
voice=tts_voice, voice=tts_voice,
@@ -531,8 +440,8 @@ class DuplexPipeline:
asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms) asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms) asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
if asr_provider == "siliconflow" and asr_api_key: if self._is_openai_compatible_provider(asr_provider) and asr_api_key:
self.asr_service = SiliconFlowASRService( self.asr_service = OpenAICompatibleASRService(
api_key=asr_api_key, api_key=asr_api_key,
model=asr_model, model=asr_model,
sample_rate=settings.sample_rate, sample_rate=settings.sample_rate,
@@ -540,7 +449,7 @@ class DuplexPipeline:
min_audio_for_interim_ms=asr_min_audio_ms, min_audio_for_interim_ms=asr_min_audio_ms,
on_transcript=self._on_transcript_callback on_transcript=self._on_transcript_callback
) )
logger.info("Using SiliconFlow ASR service") logger.info("Using OpenAI-compatible ASR service (SiliconFlow implementation)")
else: else:
self.asr_service = BufferedASRService( self.asr_service = BufferedASRService(
sample_rate=settings.sample_rate sample_rate=settings.sample_rate

View File

@@ -66,7 +66,7 @@ Rules:
"baseUrl": "https://api.openai.com/v1" "baseUrl": "https://api.openai.com/v1"
}, },
"asr": { "asr": {
"provider": "siliconflow", "provider": "openai_compatible",
"model": "FunAudioLLM/SenseVoiceSmall", "model": "FunAudioLLM/SenseVoiceSmall",
"apiKey": "sf-...", "apiKey": "sf-...",
"interimIntervalMs": 500, "interimIntervalMs": 500,
@@ -74,7 +74,7 @@ Rules:
}, },
"tts": { "tts": {
"enabled": true, "enabled": true,
"provider": "siliconflow", "provider": "openai_compatible",
"model": "FunAudioLLM/CosyVoice2-0.5B", "model": "FunAudioLLM/CosyVoice2-0.5B",
"apiKey": "sf-...", "apiKey": "sf-...",
"voice": "anna", "voice": "anna",

View File

@@ -15,8 +15,8 @@ from services.base import (
from services.llm import OpenAILLMService, MockLLMService from services.llm import OpenAILLMService, MockLLMService
from services.tts import EdgeTTSService, MockTTSService from services.tts import EdgeTTSService, MockTTSService
from services.asr import BufferedASRService, MockASRService from services.asr import BufferedASRService, MockASRService
from services.siliconflow_asr import SiliconFlowASRService from services.openai_compatible_asr import OpenAICompatibleASRService, SiliconFlowASRService
from services.siliconflow_tts import SiliconFlowTTSService from services.openai_compatible_tts import OpenAICompatibleTTSService, SiliconFlowTTSService
from services.streaming_tts_adapter import StreamingTTSAdapter from services.streaming_tts_adapter import StreamingTTSAdapter
from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline
@@ -38,8 +38,10 @@ __all__ = [
# ASR # ASR
"BufferedASRService", "BufferedASRService",
"MockASRService", "MockASRService",
"OpenAICompatibleASRService",
"SiliconFlowASRService", "SiliconFlowASRService",
# TTS (SiliconFlow) # TTS (SiliconFlow)
"OpenAICompatibleTTSService",
"SiliconFlowTTSService", "SiliconFlowTTSService",
"StreamingTTSAdapter", "StreamingTTSAdapter",
# Realtime # Realtime

View File

@@ -0,0 +1,321 @@
"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
Uses the SiliconFlow API for speech-to-text transcription.
API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
"""
import asyncio
import io
import wave
from typing import AsyncIterator, Optional, Callable, Awaitable
from loguru import logger
try:
import aiohttp
AIOHTTP_AVAILABLE = True
except ImportError:
AIOHTTP_AVAILABLE = False
logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
from services.base import BaseASRService, ASRResult, ServiceState
class OpenAICompatibleASRService(BaseASRService):
    """
    OpenAI-compatible ASR service for speech-to-text transcription.
    Features:
    - Buffers incoming audio chunks
    - Provides interim transcriptions periodically (for streaming to client)
    - Final transcription on EOU
    API Details:
    - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
    - Input: Audio file (multipart/form-data)
    - Output: {"text": "transcribed text"}
    """

    # Supported models: short alias -> fully qualified model id.
    MODELS = {
        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
        "telespeech": "TeleAI/TeleSpeechASR",
    }

    # NOTE(review): despite the vendor-neutral class name, the endpoint is
    # hard-pinned to SiliconFlow's transcription URL — confirm whether a
    # configurable base URL is intended for other OpenAI-compatible vendors.
    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"

    def __init__(
        self,
        api_key: str,
        model: str = "FunAudioLLM/SenseVoiceSmall",
        sample_rate: int = 16000,
        language: str = "auto",
        interim_interval_ms: int = 500,  # How often to send interim results
        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
    ) -> None:
        """
        Initialize OpenAI-compatible ASR service.
        Args:
            api_key: Provider API key
            model: ASR model name or alias
            sample_rate: Audio sample rate (16000 recommended)
            language: Language code (auto for automatic detection)
            interim_interval_ms: How often to generate interim transcriptions
            min_audio_for_interim_ms: Minimum audio duration before first interim
            on_transcript: Callback for transcription results (text, is_final)

        Raises:
            RuntimeError: If aiohttp is not installed.
        """
        super().__init__(sample_rate=sample_rate, language=language)
        if not AIOHTTP_AVAILABLE:
            raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
        self.api_key = api_key
        # Resolve a short alias ("sensevoice", "telespeech") to the full model
        # id; anything not in MODELS is passed through to the API unchanged.
        self.model = self.MODELS.get(model.lower(), model)
        self.interim_interval_ms = interim_interval_ms
        self.min_audio_for_interim_ms = min_audio_for_interim_ms
        self.on_transcript = on_transcript
        # Session (created in connect(), closed in disconnect())
        self._session: Optional[aiohttp.ClientSession] = None
        # Audio buffer: raw PCM bytes (16-bit mono, per the WAV header written
        # in transcribe_buffer); grows via send_audio() until cleared.
        self._audio_buffer: bytes = b""
        self._current_text: str = ""
        self._last_interim_time: float = 0
        # Transcript queue for async iteration (consumed by receive_transcripts)
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
        # Background task for interim results
        self._interim_task: Optional[asyncio.Task] = None
        self._running: bool = False
        logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")

    async def connect(self) -> None:
        """Connect to the service.

        Creates the HTTP session with bearer auth and marks the service
        running so receive_transcripts() and the interim loop can operate.
        """
        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}"
            }
        )
        self._running = True
        self.state = ServiceState.CONNECTED
        logger.info("OpenAICompatibleASRService connected")

    async def disconnect(self) -> None:
        """Disconnect and cleanup.

        Cancels the interim task (awaiting its cancellation), closes the
        HTTP session, and clears audio/text buffers.
        """
        self._running = False
        if self._interim_task:
            self._interim_task.cancel()
            try:
                await self._interim_task
            except asyncio.CancelledError:
                pass
            self._interim_task = None
        if self._session:
            await self._session.close()
            self._session = None
        self._audio_buffer = b""
        self._current_text = ""
        self.state = ServiceState.DISCONNECTED
        logger.info("OpenAICompatibleASRService disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """
        Buffer incoming audio data.
        Args:
            audio: PCM audio data (16-bit, mono)
        """
        self._audio_buffer += audio

    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
        """
        Transcribe current audio buffer.
        Args:
            is_final: Whether this is the final transcription
        Returns:
            Transcribed text or None if not enough audio

        Note: each call re-sends the ENTIRE accumulated buffer, so interim
        results are full re-transcriptions, not incremental deltas.
        """
        if not self._session:
            logger.warning("ASR session not connected")
            return None
        # Check minimum audio duration: bytes / (rate * 2 bytes per 16-bit
        # sample) gives seconds; * 1000 gives milliseconds.
        audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
            return None
        if audio_duration_ms < 100:  # Less than 100ms - too short
            return None
        try:
            # Convert PCM to WAV in memory (API expects a container, not raw PCM)
            wav_buffer = io.BytesIO()
            with wave.open(wav_buffer, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(self.sample_rate)
                wav_file.writeframes(self._audio_buffer)
            wav_buffer.seek(0)
            wav_data = wav_buffer.read()
            # Send to API as multipart/form-data
            form_data = aiohttp.FormData()
            form_data.add_field(
                'file',
                wav_data,
                filename='audio.wav',
                content_type='audio/wav'
            )
            form_data.add_field('model', self.model)
            async with self._session.post(self.API_URL, data=form_data) as response:
                if response.status == 200:
                    result = await response.json()
                    text = result.get("text", "").strip()
                    if text:
                        self._current_text = text
                        # Notify via callback
                        if self.on_transcript:
                            await self.on_transcript(text, is_final)
                        # Queue result for receive_transcripts() consumers
                        await self._transcript_queue.put(
                            ASRResult(text=text, is_final=is_final)
                        )
                        logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
                    return text
                else:
                    # Non-200 responses are logged and swallowed; callers see None.
                    error_text = await response.text()
                    logger.error(f"ASR API error {response.status}: {error_text}")
                    return None
        except Exception as e:
            logger.error(f"ASR transcription error: {e}")
            return None

    async def get_final_transcription(self) -> str:
        """
        Get final transcription and clear buffer.
        Call this when EOU is detected.
        Returns:
            Final transcribed text
        """
        # Transcribe full buffer as final
        text = await self.transcribe_buffer(is_final=True)
        # Fall back to the last interim text if the final request failed or
        # returned nothing; then clear both buffers for the next turn.
        result = text or self._current_text
        self._audio_buffer = b""
        self._current_text = ""
        return result

    def get_and_clear_text(self) -> str:
        """
        Get accumulated text and clear buffer.
        Compatible with BufferedASRService interface.
        """
        text = self._current_text
        self._current_text = ""
        self._audio_buffer = b""
        return text

    def get_audio_buffer(self) -> bytes:
        """Get current audio buffer."""
        return self._audio_buffer

    def get_audio_duration_ms(self) -> float:
        """Get current audio buffer duration in milliseconds."""
        # 2 bytes per sample (16-bit PCM, mono).
        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000

    def clear_buffer(self) -> None:
        """Clear audio and text buffers."""
        self._audio_buffer = b""
        self._current_text = ""

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Async iterator for transcription results.
        Yields:
            ASRResult with text and is_final flag
        """
        while self._running:
            try:
                # Short timeout so the loop re-checks _running and can exit
                # promptly after disconnect() instead of blocking on get().
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

    async def start_interim_transcription(self) -> None:
        """
        Start background task for interim transcriptions.
        This periodically transcribes buffered audio for
        real-time feedback to the user.

        Idempotent: a no-op if the interim task is already running.
        """
        if self._interim_task and not self._interim_task.done():
            return
        self._interim_task = asyncio.create_task(self._interim_loop())

    async def stop_interim_transcription(self) -> None:
        """Stop interim transcription task."""
        if self._interim_task:
            self._interim_task.cancel()
            try:
                await self._interim_task
            except asyncio.CancelledError:
                pass
            self._interim_task = None

    async def _interim_loop(self) -> None:
        """Background loop for interim transcriptions.

        Sleeps interim_interval_ms between iterations, then transcribes the
        buffer if enough wall-clock time has passed AND enough audio has
        accumulated. Exits on task cancellation or when _running goes False.
        """
        import time
        while self._running:
            try:
                await asyncio.sleep(self.interim_interval_ms / 1000)
                # Check if we have enough new audio
                current_time = time.time()
                time_since_last = (current_time - self._last_interim_time) * 1000
                if time_since_last >= self.interim_interval_ms:
                    audio_duration = self.get_audio_duration_ms()
                    if audio_duration >= self.min_audio_for_interim_ms:
                        await self.transcribe_buffer(is_final=False)
                        self._last_interim_time = current_time
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive on transient errors (e.g. network).
                logger.error(f"Interim transcription error: {e}")


# Backward-compatible alias
SiliconFlowASRService = OpenAICompatibleASRService

View File

@@ -0,0 +1,315 @@
"""OpenAI-compatible TTS Service with streaming support.
Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
text-to-speech synthesis with streaming.
API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
"""
import os
import asyncio
import aiohttp
from typing import AsyncIterator, Optional
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export
class OpenAICompatibleTTSService(BaseTTSService):
    """
    OpenAI-compatible TTS service with streaming support.
    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.

    Streams raw PCM from an OpenAI-style ``/audio/speech`` endpoint
    (currently pinned to SiliconFlow's URL).
    """

    # Available voices: short alias -> fully-qualified "model:voice" id.
    VOICES = {
        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice: str = "anna",
        model: str = "FunAudioLLM/CosyVoice2-0.5B",
        sample_rate: int = 16000,
        speed: float = 1.0
    ) -> None:
        """
        Initialize OpenAI-compatible TTS service.
        Args:
            api_key: Provider API key (defaults to SILICONFLOW_API_KEY env var)
            voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
            model: Model name
            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
            speed: Speech speed (0.25 to 4.0)
        """
        # Resolve short voice aliases to their fully-qualified form; names
        # not present in the alias table are passed through unchanged.
        full_voice = self.VOICES.get(voice, voice)
        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
        self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
        self.model = model
        self.api_url = "https://api.siliconflow.cn/v1/audio/speech"
        self._session: Optional[aiohttp.ClientSession] = None
        # Set by cancel() to abort an in-flight synthesize_stream() between chunks.
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Initialize HTTP session.

        Raises:
            ValueError: If no API key was provided or found in the environment.
        """
        if not self.api_key:
            raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")
        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
        )
        self.state = ServiceState.CONNECTED
        # Log label made vendor-neutral for consistency with
        # OpenAICompatibleASRService after the class rename.
        logger.info(f"OpenAI-compatible TTS service ready: voice={self.voice}, model={self.model}")

    async def disconnect(self) -> None:
        """Close HTTP session."""
        if self._session:
            await self._session.close()
            self._session = None
        self.state = ServiceState.DISCONNECTED
        logger.info("OpenAI-compatible TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Synthesize complete audio for text.

        Convenience wrapper that concatenates all streamed chunks.
        """
        audio_data = b""
        async for chunk in self.synthesize_stream(text):
            audio_data += chunk.audio
        return audio_data

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.
        Args:
            text: Text to synthesize
        Yields:
            TTSChunk objects with PCM audio; the last chunk has is_final=True.

        Raises:
            RuntimeError: If connect() was not called first.

        Note: API errors (non-200) are logged and end the stream early
        without raising, so callers may receive no chunks at all.
        """
        if not self._session:
            raise RuntimeError("TTS service not connected")
        if not text.strip():
            return
        self._cancel_event.clear()
        payload = {
            "model": self.model,
            "input": text,
            "voice": self.voice,
            "response_format": "pcm",
            "sample_rate": self.sample_rate,
            "stream": True,
            "speed": self.speed
        }
        try:
            async with self._session.post(self.api_url, json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"OpenAI-compatible TTS error: {response.status} - {error_text}")
                    return
                # Stream audio chunks: 100ms of 16-bit mono PCM per chunk.
                chunk_size = self.sample_rate * 2 // 10
                buffer = b""
                pending_chunk = None
                async for chunk in response.content.iter_any():
                    if self._cancel_event.is_set():
                        logger.info("TTS synthesis cancelled")
                        return
                    buffer += chunk
                    # Yield complete chunks
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]
                        # Keep one full chunk buffered so we can always tag the true
                        # last full chunk as final when stream length is an exact multiple.
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False
                            )
                        pending_chunk = audio_chunk
                # Flush pending chunk(s) and remaining tail.
                if pending_chunk is not None:
                    if buffer:
                        # A partial tail remains: the held chunk is not final.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=False
                        )
                        pending_chunk = None
                    else:
                        # Stream ended on an exact chunk boundary: held chunk is last.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=True
                        )
                        pending_chunk = None
                if buffer:
                    yield TTSChunk(
                        audio=buffer,
                        sample_rate=self.sample_rate,
                        is_final=True
                    )
        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def cancel(self) -> None:
        """Cancel ongoing synthesis.

        Takes effect between streamed chunks in synthesize_stream().
        """
        self._cancel_event.set()
class StreamingTTSAdapter:
    """
    Adapter for streaming LLM text to TTS with sentence-level chunking.
    This reduces latency by starting TTS as soon as a complete sentence
    is received from the LLM, rather than waiting for the full response.
    """
    # NOTE(review): this local definition shadows the StreamingTTSAdapter
    # imported at the top of this module from services.streaming_tts_adapter
    # (the import is labelled a "backward-compatible re-export") — confirm
    # which implementation is meant to be exported and drop the other.

    # Sentence delimiters
    # NOTE(review): the empty-string entries below appear to be CJK sentence
    # punctuation lost to an encoding problem. A single character can never
    # equal '', so only '.', '!', '?' and '\n' are effective as delimiters
    # here — confirm and restore the intended characters.
    SENTENCE_ENDS = {'', '', '', '', '.', '!', '?', '\n'}

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str) -> None:
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        # Accumulates LLM text until a sentence delimiter is seen.
        self._buffer: str = ""
        # Set by cancel(); checked between chunks to stop speaking promptly.
        self._cancel_event = asyncio.Event()
        self._is_speaking: bool = False

    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
        """Check whether '.' should NOT be treated as a sentence delimiter."""
        if text[idx] != ".":
            return False
        # Decimal/version segment: 1.2, v1.2.3
        if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
            return True
        # Number abbreviations: No.1 / No. 1
        # Scan left over letters to extract the token just before the period.
        left_start = idx - 1
        while left_start >= 0 and text[left_start].isalpha():
            left_start -= 1
        left_token = text[left_start + 1:idx].lower()
        if left_token == "no":
            # Skip whitespace after the period, then require a digit.
            j = idx + 1
            while j < len(text) and text[j].isspace():
                j += 1
            if j < len(text) and text[j].isdigit():
                return True
        return False

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Process a text chunk from LLM and trigger TTS when sentence is complete.
        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            return
        self._buffer += text_chunk
        # Check for sentence completion: repeatedly carve off the leading
        # complete sentence until no delimiter remains in the buffer.
        while True:
            split_idx = -1
            for i, char in enumerate(self._buffer):
                if char == "." and self._is_non_sentence_period(self._buffer, i):
                    continue
                if char in self.SENTENCE_ENDS:
                    split_idx = i
                    break
            if split_idx < 0:
                break
            # Extend the cut past any run of consecutive delimiters ("?!", "...").
            end_idx = split_idx + 1
            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
                end_idx += 1
            sentence = self._buffer[:end_idx].strip()
            self._buffer = self._buffer[end_idx:]
            # Skip punctuation-only fragments: speak only if there is content.
            if sentence and any(ch.isalnum() for ch in sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Flush remaining buffer."""
        if self._buffer.strip() and not self._cancel_event.is_set():
            await self._speak_sentence(self._buffer.strip())
        self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """Synthesize and send a sentence."""
        if not text or self._cancel_event.is_set():
            return
        self._is_speaking = True
        try:
            async for chunk in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(chunk.audio)
                await asyncio.sleep(0.01)  # Prevent flooding
        except Exception as e:
            # Best-effort: log and continue; a failed sentence does not abort the turn.
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Cancel ongoing speech."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Reset for new turn."""
        self._cancel_event.clear()
        self._buffer = ""
        self._is_speaking = False

    @property
    def is_speaking(self) -> bool:
        # True only while _speak_sentence is actively streaming audio.
        return self._is_speaking


# Backward-compatible alias
SiliconFlowTTSService = OpenAICompatibleTTSService

View File

@@ -1,317 +1,8 @@
"""SiliconFlow ASR (Automatic Speech Recognition) Service. """Backward-compatible imports for legacy siliconflow_asr module."""
Uses the SiliconFlow API for speech-to-text transcription. from services.openai_compatible_asr import OpenAICompatibleASRService
API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
"""
import asyncio # Backward-compatible alias
import io SiliconFlowASRService = OpenAICompatibleASRService
import wave
from typing import AsyncIterator, Optional, Callable, Awaitable
from loguru import logger
try: __all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"]
import aiohttp
AIOHTTP_AVAILABLE = True
except ImportError:
AIOHTTP_AVAILABLE = False
logger.warning("aiohttp not available - SiliconFlowASRService will not work")
from services.base import BaseASRService, ASRResult, ServiceState
class SiliconFlowASRService(BaseASRService):
"""
SiliconFlow ASR service for speech-to-text transcription.
Features:
- Buffers incoming audio chunks
- Provides interim transcriptions periodically (for streaming to client)
- Final transcription on EOU
API Details:
- Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
- Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
- Input: Audio file (multipart/form-data)
- Output: {"text": "transcribed text"}
"""
# Supported models
MODELS = {
"sensevoice": "FunAudioLLM/SenseVoiceSmall",
"telespeech": "TeleAI/TeleSpeechASR",
}
API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"
def __init__(
self,
api_key: str,
model: str = "FunAudioLLM/SenseVoiceSmall",
sample_rate: int = 16000,
language: str = "auto",
interim_interval_ms: int = 500, # How often to send interim results
min_audio_for_interim_ms: int = 300, # Min audio before first interim
on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
):
"""
Initialize SiliconFlow ASR service.
Args:
api_key: SiliconFlow API key
model: ASR model name or alias
sample_rate: Audio sample rate (16000 recommended)
language: Language code (auto for automatic detection)
interim_interval_ms: How often to generate interim transcriptions
min_audio_for_interim_ms: Minimum audio duration before first interim
on_transcript: Callback for transcription results (text, is_final)
"""
super().__init__(sample_rate=sample_rate, language=language)
if not AIOHTTP_AVAILABLE:
raise RuntimeError("aiohttp is required for SiliconFlowASRService")
self.api_key = api_key
self.model = self.MODELS.get(model.lower(), model)
self.interim_interval_ms = interim_interval_ms
self.min_audio_for_interim_ms = min_audio_for_interim_ms
self.on_transcript = on_transcript
# Session
self._session: Optional[aiohttp.ClientSession] = None
# Audio buffer
self._audio_buffer: bytes = b""
self._current_text: str = ""
self._last_interim_time: float = 0
# Transcript queue for async iteration
self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
# Background task for interim results
self._interim_task: Optional[asyncio.Task] = None
self._running = False
logger.info(f"SiliconFlowASRService initialized with model: {self.model}")
async def connect(self) -> None:
"""Connect to the service."""
self._session = aiohttp.ClientSession(
headers={
"Authorization": f"Bearer {self.api_key}"
}
)
self._running = True
self.state = ServiceState.CONNECTED
logger.info("SiliconFlowASRService connected")
async def disconnect(self) -> None:
"""Disconnect and cleanup."""
self._running = False
if self._interim_task:
self._interim_task.cancel()
try:
await self._interim_task
except asyncio.CancelledError:
pass
self._interim_task = None
if self._session:
await self._session.close()
self._session = None
self._audio_buffer = b""
self._current_text = ""
self.state = ServiceState.DISCONNECTED
logger.info("SiliconFlowASRService disconnected")
async def send_audio(self, audio: bytes) -> None:
"""
Buffer incoming audio data.
Args:
audio: PCM audio data (16-bit, mono)
"""
self._audio_buffer += audio
async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
"""
Transcribe current audio buffer.
Args:
is_final: Whether this is the final transcription
Returns:
Transcribed text or None if not enough audio
"""
if not self._session:
logger.warning("ASR session not connected")
return None
# Check minimum audio duration
audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
return None
if audio_duration_ms < 100: # Less than 100ms - too short
return None
try:
# Convert PCM to WAV in memory
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # 16-bit
wav_file.setframerate(self.sample_rate)
wav_file.writeframes(self._audio_buffer)
wav_buffer.seek(0)
wav_data = wav_buffer.read()
# Send to API
form_data = aiohttp.FormData()
form_data.add_field(
'file',
wav_data,
filename='audio.wav',
content_type='audio/wav'
)
form_data.add_field('model', self.model)
async with self._session.post(self.API_URL, data=form_data) as response:
if response.status == 200:
result = await response.json()
text = result.get("text", "").strip()
if text:
self._current_text = text
# Notify via callback
if self.on_transcript:
await self.on_transcript(text, is_final)
# Queue result
await self._transcript_queue.put(
ASRResult(text=text, is_final=is_final)
)
logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
return text
else:
error_text = await response.text()
logger.error(f"ASR API error {response.status}: {error_text}")
return None
except Exception as e:
logger.error(f"ASR transcription error: {e}")
return None
async def get_final_transcription(self) -> str:
"""
Get final transcription and clear buffer.
Call this when EOU is detected.
Returns:
Final transcribed text
"""
# Transcribe full buffer as final
text = await self.transcribe_buffer(is_final=True)
# Clear buffer
result = text or self._current_text
self._audio_buffer = b""
self._current_text = ""
return result
def get_and_clear_text(self) -> str:
"""
Get accumulated text and clear buffer.
Compatible with BufferedASRService interface.
"""
text = self._current_text
self._current_text = ""
self._audio_buffer = b""
return text
def get_audio_buffer(self) -> bytes:
"""Get current audio buffer."""
return self._audio_buffer
def get_audio_duration_ms(self) -> float:
"""Get current audio buffer duration in milliseconds."""
return len(self._audio_buffer) / (self.sample_rate * 2) * 1000
def clear_buffer(self) -> None:
"""Clear audio and text buffers."""
self._audio_buffer = b""
self._current_text = ""
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
"""
Async iterator for transcription results.
Yields:
ASRResult with text and is_final flag
"""
while self._running:
try:
result = await asyncio.wait_for(
self._transcript_queue.get(),
timeout=0.1
)
yield result
except asyncio.TimeoutError:
continue
except asyncio.CancelledError:
break
async def start_interim_transcription(self) -> None:
"""
Start background task for interim transcriptions.
This periodically transcribes buffered audio for
real-time feedback to the user.
"""
if self._interim_task and not self._interim_task.done():
return
self._interim_task = asyncio.create_task(self._interim_loop())
async def stop_interim_transcription(self) -> None:
"""Stop interim transcription task."""
if self._interim_task:
self._interim_task.cancel()
try:
await self._interim_task
except asyncio.CancelledError:
pass
self._interim_task = None
async def _interim_loop(self) -> None:
"""Background loop for interim transcriptions."""
import time
while self._running:
try:
await asyncio.sleep(self.interim_interval_ms / 1000)
# Check if we have enough new audio
current_time = time.time()
time_since_last = (current_time - self._last_interim_time) * 1000
if time_since_last >= self.interim_interval_ms:
audio_duration = self.get_audio_duration_ms()
if audio_duration >= self.min_audio_for_interim_ms:
await self.transcribe_buffer(is_final=False)
self._last_interim_time = current_time
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Interim transcription error: {e}")

View File

@@ -1,311 +1,8 @@
"""SiliconFlow TTS Service with streaming support. """Backward-compatible imports for legacy siliconflow_tts module."""
Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter
text-to-speech synthesis with streaming.
API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech # Backward-compatible alias
""" SiliconFlowTTSService = OpenAICompatibleTTSService
import os __all__ = ["OpenAICompatibleTTSService", "SiliconFlowTTSService", "StreamingTTSAdapter"]
import asyncio
import aiohttp
from typing import AsyncIterator, Optional
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export
class SiliconFlowTTSService(BaseTTSService):
"""
SiliconFlow TTS service with streaming support.
Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
"""
# Available voices
VOICES = {
"alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
"anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
"bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
"benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
"charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
"claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
"david": "FunAudioLLM/CosyVoice2-0.5B:david",
"diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
}
def __init__(
self,
api_key: Optional[str] = None,
voice: str = "anna",
model: str = "FunAudioLLM/CosyVoice2-0.5B",
sample_rate: int = 16000,
speed: float = 1.0
):
"""
Initialize SiliconFlow TTS service.
Args:
api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var)
voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
model: Model name
sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
speed: Speech speed (0.25 to 4.0)
"""
# Resolve voice name
if voice in self.VOICES:
full_voice = self.VOICES[voice]
else:
full_voice = voice
super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
self.model = model
self.api_url = "https://api.siliconflow.cn/v1/audio/speech"
self._session: Optional[aiohttp.ClientSession] = None
self._cancel_event = asyncio.Event()
async def connect(self) -> None:
"""Initialize HTTP session."""
if not self.api_key:
raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")
self._session = aiohttp.ClientSession(
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
)
self.state = ServiceState.CONNECTED
logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")
async def disconnect(self) -> None:
"""Close HTTP session."""
if self._session:
await self._session.close()
self._session = None
self.state = ServiceState.DISCONNECTED
logger.info("SiliconFlow TTS service disconnected")
async def synthesize(self, text: str) -> bytes:
"""Synthesize complete audio for text."""
audio_data = b""
async for chunk in self.synthesize_stream(text):
audio_data += chunk.audio
return audio_data
    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Issues one streaming POST to the SiliconFlow speech endpoint and
        re-chunks the raw PCM response into ~100ms pieces.  A one-chunk
        lookahead buffer guarantees the true last full chunk is tagged
        ``is_final=True`` even when the stream length is an exact multiple
        of the chunk size.

        Args:
            text: Text to synthesize.  Whitespace-only input is ignored.

        Yields:
            TTSChunk objects with PCM audio; exactly one chunk carries
            ``is_final=True`` (unless the stream is empty or cancelled).

        Raises:
            RuntimeError: if ``connect()`` has not been called.
            asyncio.CancelledError: if the surrounding task is cancelled.
            Exception: any transport-level failure is logged and re-raised.
        """
        if not self._session:
            raise RuntimeError("TTS service not connected")
        if not text.strip():
            return
        # New synthesis run: clear any cancellation left over from a prior turn.
        self._cancel_event.clear()
        payload = {
            "model": self.model,
            "input": text,
            "voice": self.voice,
            "response_format": "pcm",
            "sample_rate": self.sample_rate,
            "stream": True,
            "speed": self.speed
        }
        try:
            async with self._session.post(self.api_url, json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
                    # Best-effort: API errors end the stream silently (no chunks)
                    # rather than raising to the caller.
                    return
                # Stream audio chunks.
                # NOTE(review): assumes 16-bit mono PCM (2 bytes per sample) —
                # confirm against the endpoint's "pcm" response format.
                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                buffer: bytes = b""
                pending_chunk: Optional[bytes] = None
                async for chunk in response.content.iter_any():
                    if self._cancel_event.is_set():
                        logger.info("TTS synthesis cancelled")
                        return
                    buffer += chunk
                    # Yield complete chunks.
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]
                        # Keep one full chunk buffered so we can always tag the true
                        # last full chunk as final when stream length is an exact multiple.
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False
                            )
                        pending_chunk = audio_chunk
                # Flush pending chunk(s) and remaining tail.  If a partial tail
                # exists it becomes the final chunk; otherwise the held-back
                # full chunk is the final one.
                if pending_chunk is not None:
                    if buffer:
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=False
                        )
                        pending_chunk = None
                    else:
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=True
                        )
                        pending_chunk = None
                if buffer:
                    yield TTSChunk(
                        audio=buffer,
                        sample_rate=self.sample_rate,
                        is_final=True
                    )
        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise
async def cancel(self) -> None:
"""Cancel ongoing synthesis."""
self._cancel_event.set()
class StreamingTTSAdapter:
    """
    Bridges streaming LLM text to TTS with sentence-granular dispatch.

    Rather than waiting for the LLM's full response, incoming text is
    buffered and handed to the TTS service as soon as a complete sentence
    is available, which cuts time-to-first-audio.
    """

    # Sentence delimiters (ASCII plus full-width punctuation).
    SENTENCE_ENDS = {'', '', '', '', '.', '!', '?', '\n'}

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        self._buffer = ""
        self._cancel_event = asyncio.Event()
        self._is_speaking = False

    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
        """Return True when the '.' at *idx* should NOT end a sentence.

        Two cases are recognized: digit-dot-digit runs (decimals, version
        numbers like 1.2 or v1.2.3) and the "No." abbreviation followed by
        a number (No.1 / No. 1).
        """
        if text[idx] != ".":
            return False
        # Decimal / version segment: a digit on both sides of the dot.
        if 0 < idx < len(text) - 1:
            if text[idx - 1].isdigit() and text[idx + 1].isdigit():
                return True
        # "No." abbreviation: collect the alphabetic run just before the dot.
        start = idx
        while start > 0 and text[start - 1].isalpha():
            start -= 1
        if text[start:idx].lower() == "no":
            after = idx + 1
            while after < len(text) and text[after].isspace():
                after += 1
            if after < len(text) and text[after].isdigit():
                return True
        return False

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Accumulate a streamed LLM chunk and speak each completed sentence.

        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            return
        self._buffer += text_chunk
        # Repeatedly peel complete sentences off the front of the buffer.
        while True:
            cut = -1
            for pos, ch in enumerate(self._buffer):
                if ch == "." and self._is_non_sentence_period(self._buffer, pos):
                    continue
                if ch in self.SENTENCE_ENDS:
                    cut = pos
                    break
            if cut < 0:
                break
            # Swallow any run of trailing delimiters ("?!", "...", etc.).
            tail = cut + 1
            while tail < len(self._buffer) and self._buffer[tail] in self.SENTENCE_ENDS:
                tail += 1
            sentence, self._buffer = self._buffer[:tail].strip(), self._buffer[tail:]
            # Skip punctuation-only fragments.
            if sentence and any(c.isalnum() for c in sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Speak whatever is left in the buffer (end of the LLM turn)."""
        leftover = self._buffer.strip()
        if leftover and not self._cancel_event.is_set():
            await self._speak_sentence(leftover)
        self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """Stream one sentence through TTS and forward its audio to the transport."""
        if not text or self._cancel_event.is_set():
            return
        self._is_speaking = True
        try:
            async for piece in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(piece.audio)
                # Small pause keeps the transport from being flooded.
                await asyncio.sleep(0.01)
        except Exception as e:
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Abort current speech and drop any buffered text."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Prepare the adapter for a fresh conversational turn."""
        self._cancel_event.clear()
        self._buffer = ""
        self._is_speaking = False

    @property
    def is_speaking(self) -> bool:
        return self._is_speaking

View File

@@ -85,7 +85,7 @@ const convertRecordedBlobToWav = async (blob: Blob): Promise<File> => {
export const ASRLibraryPage: React.FC = () => { export const ASRLibraryPage: React.FC = () => {
const [models, setModels] = useState<ASRModel[]>([]); const [models, setModels] = useState<ASRModel[]>([]);
const [searchTerm, setSearchTerm] = useState(''); const [searchTerm, setSearchTerm] = useState('');
const [vendorFilter, setVendorFilter] = useState<string>('all'); const [vendorFilter, setVendorFilter] = useState<string>('OpenAI Compatible');
const [langFilter, setLangFilter] = useState<string>('all'); const [langFilter, setLangFilter] = useState<string>('all');
const [isAddModalOpen, setIsAddModalOpen] = useState(false); const [isAddModalOpen, setIsAddModalOpen] = useState(false);
const [editingModel, setEditingModel] = useState<ASRModel | null>(null); const [editingModel, setEditingModel] = useState<ASRModel | null>(null);
@@ -111,7 +111,7 @@ export const ASRLibraryPage: React.FC = () => {
const filteredModels = models.filter((m) => { const filteredModels = models.filter((m) => {
const q = searchTerm.toLowerCase(); const q = searchTerm.toLowerCase();
const matchesSearch = m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q); const matchesSearch = m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q);
const matchesVendor = vendorFilter === 'all' || m.vendor === vendorFilter; const matchesVendor = m.vendor === vendorFilter;
const matchesLang = langFilter === 'all' || m.language === langFilter || (langFilter !== 'all' && m.language === 'Multi-lingual'); const matchesLang = langFilter === 'all' || m.language === langFilter || (langFilter !== 'all' && m.language === 'Multi-lingual');
return matchesSearch && matchesVendor && matchesLang; return matchesSearch && matchesVendor && matchesLang;
}); });
@@ -134,8 +134,6 @@ export const ASRLibraryPage: React.FC = () => {
setModels((prev) => prev.filter((m) => m.id !== id)); setModels((prev) => prev.filter((m) => m.id !== id));
}; };
const vendorOptions = Array.from(new Set(models.map((m) => m.vendor).filter(Boolean)));
return ( return (
<div className="space-y-6 animate-in fade-in py-4 pb-10"> <div className="space-y-6 animate-in fade-in py-4 pb-10">
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
@@ -162,10 +160,7 @@ export const ASRLibraryPage: React.FC = () => {
value={vendorFilter} value={vendorFilter}
onChange={(e) => setVendorFilter(e.target.value)} onChange={(e) => setVendorFilter(e.target.value)}
> >
<option value="all"></option> <option value="OpenAI Compatible">OpenAI Compatible</option>
{vendorOptions.map((vendor) => (
<option key={vendor} value={vendor}>{vendor}</option>
))}
</select> </select>
</div> </div>
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
@@ -371,7 +366,6 @@ const ASRModelModal: React.FC<{
onChange={(e) => setVendor(e.target.value)} onChange={(e) => setVendor(e.target.value)}
> >
<option value="OpenAI Compatible">OpenAI Compatible</option> <option value="OpenAI Compatible">OpenAI Compatible</option>
<option value="SiliconFlow">SiliconFlow</option>
</select> </select>
</div> </div>
<div className="space-y-1.5"> <div className="space-y-1.5">

View File

@@ -5,32 +5,37 @@ import { Button, Input, Badge, Drawer, Dialog } from '../components/UI';
import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types'; import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types';
import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, updateAssistant as updateAssistantApi } from '../services/backendApi'; import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, updateAssistant as updateAssistantApi } from '../services/backendApi';
const isSiliconflowVendor = (vendor?: string) => { const isOpenAICompatibleVendor = (vendor?: string) => {
const normalized = String(vendor || '').trim().toLowerCase(); const normalized = String(vendor || '').trim().toLowerCase();
return normalized === 'siliconflow' || normalized === '硅基流动'; return (
normalized === 'siliconflow' ||
normalized === '硅基流动' ||
normalized === 'openai compatible' ||
normalized === 'openai-compatible'
);
}; };
const SILICONFLOW_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
const buildSiliconflowVoiceKey = (voiceId: string, model?: string) => { const buildOpenAICompatibleVoiceKey = (voiceId: string, model?: string) => {
const id = String(voiceId || '').trim(); const id = String(voiceId || '').trim();
if (!id) return ''; if (!id) return '';
if (id.includes(':')) return id; if (id.includes(':')) return id;
return `${model || SILICONFLOW_DEFAULT_MODEL}:${id}`; return `${model || OPENAI_COMPATIBLE_DEFAULT_MODEL}:${id}`;
}; };
const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => { const resolveRuntimeTtsVoice = (selectedVoiceId: string, voice: Voice) => {
const explicitKey = String(voice.voiceKey || '').trim(); const explicitKey = String(voice.voiceKey || '').trim();
if (!isSiliconflowVendor(voice.vendor)) { if (!isOpenAICompatibleVendor(voice.vendor)) {
return explicitKey || selectedVoiceId; return explicitKey || selectedVoiceId;
} }
if (voice.isSystem) { if (voice.isSystem) {
const canonical = buildSiliconflowVoiceKey(selectedVoiceId, voice.model); const canonical = buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
if (!explicitKey) return canonical; if (!explicitKey) return canonical;
const explicitSuffix = explicitKey.includes(':') ? explicitKey.split(':').pop() : explicitKey; const explicitSuffix = explicitKey.includes(':') ? explicitKey.split(':').pop() : explicitKey;
if (explicitSuffix && explicitSuffix !== selectedVoiceId) return canonical; if (explicitSuffix && explicitSuffix !== selectedVoiceId) return canonical;
} }
return explicitKey || buildSiliconflowVoiceKey(selectedVoiceId, voice.model); return explicitKey || buildOpenAICompatibleVoiceKey(selectedVoiceId, voice.model);
}; };
const renderToolIcon = (icon: string) => { const renderToolIcon = (icon: string) => {
@@ -1830,11 +1835,11 @@ export const DebugDrawer: React.FC<{
if (assistant.asrModelId) { if (assistant.asrModelId) {
const asr = asrModels.find((item) => item.id === assistant.asrModelId); const asr = asrModels.find((item) => item.id === assistant.asrModelId);
if (asr) { if (asr) {
const asrProvider = isSiliconflowVendor(asr.vendor) ? 'siliconflow' : 'buffered'; const asrProvider = isOpenAICompatibleVendor(asr.vendor) ? 'openai_compatible' : 'buffered';
services.asr = { services.asr = {
provider: asrProvider, provider: asrProvider,
model: asr.modelName || asr.name, model: asr.modelName || asr.name,
apiKey: asrProvider === 'siliconflow' ? asr.apiKey : null, apiKey: asrProvider === 'openai_compatible' ? asr.apiKey : null,
}; };
} else { } else {
warnings.push(`ASR model not found in loaded list: ${assistant.asrModelId}`); warnings.push(`ASR model not found in loaded list: ${assistant.asrModelId}`);
@@ -1844,12 +1849,12 @@ export const DebugDrawer: React.FC<{
if (assistant.voice) { if (assistant.voice) {
const voice = voices.find((item) => item.id === assistant.voice); const voice = voices.find((item) => item.id === assistant.voice);
if (voice) { if (voice) {
const ttsProvider = isSiliconflowVendor(voice.vendor) ? 'siliconflow' : 'edge'; const ttsProvider = isOpenAICompatibleVendor(voice.vendor) ? 'openai_compatible' : 'edge';
services.tts = { services.tts = {
enabled: ttsEnabled, enabled: ttsEnabled,
provider: ttsProvider, provider: ttsProvider,
model: voice.model, model: voice.model,
apiKey: ttsProvider === 'siliconflow' ? voice.apiKey : null, apiKey: ttsProvider === 'openai_compatible' ? voice.apiKey : null,
voice: resolveRuntimeTtsVoice(assistant.voice, voice), voice: resolveRuntimeTtsVoice(assistant.voice, voice),
speed: assistant.speed || voice.speed || 1.0, speed: assistant.speed || voice.speed || 1.0,
}; };

View File

@@ -13,7 +13,7 @@ const maskApiKey = (key?: string) => {
export const LLMLibraryPage: React.FC = () => { export const LLMLibraryPage: React.FC = () => {
const [models, setModels] = useState<LLMModel[]>([]); const [models, setModels] = useState<LLMModel[]>([]);
const [searchTerm, setSearchTerm] = useState(''); const [searchTerm, setSearchTerm] = useState('');
const [vendorFilter, setVendorFilter] = useState<string>('all'); const [vendorFilter, setVendorFilter] = useState<string>('OpenAI Compatible');
const [typeFilter, setTypeFilter] = useState<string>('all'); const [typeFilter, setTypeFilter] = useState<string>('all');
const [isAddModalOpen, setIsAddModalOpen] = useState(false); const [isAddModalOpen, setIsAddModalOpen] = useState(false);
const [editingModel, setEditingModel] = useState<LLMModel | null>(null); const [editingModel, setEditingModel] = useState<LLMModel | null>(null);
@@ -41,7 +41,7 @@ export const LLMLibraryPage: React.FC = () => {
m.name.toLowerCase().includes(q) || m.name.toLowerCase().includes(q) ||
(m.modelName || '').toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q) ||
(m.baseUrl || '').toLowerCase().includes(q); (m.baseUrl || '').toLowerCase().includes(q);
const matchesVendor = vendorFilter === 'all' || m.vendor === vendorFilter; const matchesVendor = m.vendor === vendorFilter;
const matchesType = typeFilter === 'all' || m.type === typeFilter; const matchesType = typeFilter === 'all' || m.type === typeFilter;
return matchesSearch && matchesVendor && matchesType; return matchesSearch && matchesVendor && matchesType;
}); });
@@ -64,8 +64,6 @@ export const LLMLibraryPage: React.FC = () => {
setModels((prev) => prev.filter((item) => item.id !== id)); setModels((prev) => prev.filter((item) => item.id !== id));
}; };
const vendorOptions = Array.from(new Set(models.map((m) => m.vendor).filter(Boolean)));
return ( return (
<div className="space-y-6 animate-in fade-in py-4 pb-10"> <div className="space-y-6 animate-in fade-in py-4 pb-10">
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
@@ -92,10 +90,7 @@ export const LLMLibraryPage: React.FC = () => {
value={vendorFilter} value={vendorFilter}
onChange={(e) => setVendorFilter(e.target.value)} onChange={(e) => setVendorFilter(e.target.value)}
> >
<option value="all"></option> <option value="OpenAI Compatible">OpenAI Compatible</option>
{vendorOptions.map((vendor) => (
<option key={vendor} value={vendor}>{vendor}</option>
))}
</select> </select>
</div> </div>
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
@@ -284,8 +279,6 @@ const LLMModelModal: React.FC<{
onChange={(e) => setVendor(e.target.value)} onChange={(e) => setVendor(e.target.value)}
> >
<option value="OpenAI Compatible">OpenAI Compatible</option> <option value="OpenAI Compatible">OpenAI Compatible</option>
<option value="OpenAI">OpenAI</option>
<option value="SiliconFlow">SiliconFlow</option>
</select> </select>
</div> </div>

View File

@@ -1,12 +1,12 @@
import React, { useEffect, useState, useRef } from 'react'; import React, { useEffect, useState, useRef } from 'react';
import { Search, Mic2, Play, Pause, Upload, Filter, Plus, Volume2, Sparkles, ChevronDown, Pencil, Trash2 } from 'lucide-react'; import { Search, Mic2, Play, Pause, Upload, Filter, Plus, Volume2, Pencil, Trash2 } from 'lucide-react';
import { Button, Input, TableHeader, TableRow, TableHead, TableCell, Dialog, Badge } from '../components/UI'; import { Button, Input, TableHeader, TableRow, TableHead, TableCell, Dialog, Badge } from '../components/UI';
import { Voice } from '../types'; import { Voice } from '../types';
import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi'; import { createVoice, deleteVoice, fetchVoices, previewVoice, updateVoice } from '../services/backendApi';
const SILICONFLOW_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B'; const OPENAI_COMPATIBLE_DEFAULT_MODEL = 'FunAudioLLM/CosyVoice2-0.5B';
const buildSiliconflowVoiceKey = (rawId: string, model: string): string => { const buildOpenAICompatibleVoiceKey = (rawId: string, model: string): string => {
const id = (rawId || '').trim(); const id = (rawId || '').trim();
if (!id) return `${model}:anna`; if (!id) return `${model}:anna`;
return id.includes(':') ? id : `${model}:${id}`; return id.includes(':') ? id : `${model}:${id}`;
@@ -15,7 +15,7 @@ const buildSiliconflowVoiceKey = (rawId: string, model: string): string => {
export const VoiceLibraryPage: React.FC = () => { export const VoiceLibraryPage: React.FC = () => {
const [voices, setVoices] = useState<Voice[]>([]); const [voices, setVoices] = useState<Voice[]>([]);
const [searchTerm, setSearchTerm] = useState(''); const [searchTerm, setSearchTerm] = useState('');
const [vendorFilter, setVendorFilter] = useState<'all' | 'Ali' | 'Volcano' | 'Minimax' | '硅基流动' | 'SiliconFlow'>('all'); const [vendorFilter, setVendorFilter] = useState<'OpenAI Compatible'>('OpenAI Compatible');
const [genderFilter, setGenderFilter] = useState<'all' | 'Male' | 'Female'>('all'); const [genderFilter, setGenderFilter] = useState<'all' | 'Male' | 'Female'>('all');
const [langFilter, setLangFilter] = useState<'all' | 'zh' | 'en'>('all'); const [langFilter, setLangFilter] = useState<'all' | 'zh' | 'en'>('all');
@@ -44,7 +44,7 @@ export const VoiceLibraryPage: React.FC = () => {
const filteredVoices = voices.filter((voice) => { const filteredVoices = voices.filter((voice) => {
const matchesSearch = voice.name.toLowerCase().includes(searchTerm.toLowerCase()); const matchesSearch = voice.name.toLowerCase().includes(searchTerm.toLowerCase());
const matchesVendor = vendorFilter === 'all' || voice.vendor === vendorFilter; const matchesVendor = voice.vendor === vendorFilter;
const matchesGender = genderFilter === 'all' || voice.gender === genderFilter; const matchesGender = genderFilter === 'all' || voice.gender === genderFilter;
const matchesLang = langFilter === 'all' || voice.language === langFilter; const matchesLang = langFilter === 'all' || voice.language === langFilter;
return matchesSearch && matchesVendor && matchesGender && matchesLang; return matchesSearch && matchesVendor && matchesGender && matchesLang;
@@ -138,12 +138,7 @@ export const VoiceLibraryPage: React.FC = () => {
value={vendorFilter} value={vendorFilter}
onChange={(e) => setVendorFilter(e.target.value as any)} onChange={(e) => setVendorFilter(e.target.value as any)}
> >
<option value="all"></option> <option value="OpenAI Compatible">OpenAI Compatible</option>
<option value="硅基流动"> (SiliconFlow)</option>
<option value="SiliconFlow">SiliconFlow</option>
<option value="Ali"> (Ali)</option>
<option value="Volcano"> (Volcano)</option>
<option value="Minimax">Minimax</option>
</select> </select>
</div> </div>
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
@@ -187,15 +182,12 @@ export const VoiceLibraryPage: React.FC = () => {
<TableRow key={voice.id}> <TableRow key={voice.id}>
<TableCell className="font-medium"> <TableCell className="font-medium">
<div className="flex flex-col"> <div className="flex flex-col">
<span className="flex items-center text-white"> <span className="flex items-center text-white">{voice.name}</span>
{voice.vendor === '硅基流动' && <Sparkles className="w-3 h-3 text-primary mr-1.5" />}
{voice.name}
</span>
{voice.description && <span className="text-xs text-muted-foreground">{voice.description}</span>} {voice.description && <span className="text-xs text-muted-foreground">{voice.description}</span>}
</div> </div>
</TableCell> </TableCell>
<TableCell> <TableCell>
<Badge variant={voice.vendor === '硅基流动' ? 'default' : 'outline'}>{voice.vendor}</Badge> <Badge variant="outline">{voice.vendor}</Badge>
</TableCell> </TableCell>
<TableCell className="text-muted-foreground">{voice.gender === 'Male' ? '男' : '女'}</TableCell> <TableCell className="text-muted-foreground">{voice.gender === 'Male' ? '男' : '女'}</TableCell>
<TableCell className="text-muted-foreground">{voice.language === 'zh' ? '中文' : 'English'}</TableCell> <TableCell className="text-muted-foreground">{voice.language === 'zh' ? '中文' : 'English'}</TableCell>
@@ -254,17 +246,15 @@ const AddVoiceModal: React.FC<{
onSuccess: (voice: Voice) => Promise<void>; onSuccess: (voice: Voice) => Promise<void>;
initialVoice?: Voice; initialVoice?: Voice;
}> = ({ isOpen, onClose, onSuccess, initialVoice }) => { }> = ({ isOpen, onClose, onSuccess, initialVoice }) => {
const [vendor, setVendor] = useState<'硅基流动' | 'Ali' | 'Volcano' | 'Minimax'>('硅基流动'); const [vendor, setVendor] = useState<'OpenAI Compatible'>('OpenAI Compatible');
const [name, setName] = useState(''); const [name, setName] = useState('');
const [sfModel, setSfModel] = useState(SILICONFLOW_DEFAULT_MODEL); const [openaiCompatibleModel, setOpenaiCompatibleModel] = useState(OPENAI_COMPATIBLE_DEFAULT_MODEL);
const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna'); const [sfVoiceId, setSfVoiceId] = useState('FunAudioLLM/CosyVoice2-0.5B:anna');
const [sfSpeed, setSfSpeed] = useState(1); const [sfSpeed, setSfSpeed] = useState(1);
const [sfGain, setSfGain] = useState(0); const [sfGain, setSfGain] = useState(0);
const [sfPitch, setSfPitch] = useState(0); const [sfPitch, setSfPitch] = useState(0);
const [model, setModel] = useState('');
const [voiceKey, setVoiceKey] = useState('');
const [gender, setGender] = useState('Female'); const [gender, setGender] = useState('Female');
const [language, setLanguage] = useState('zh'); const [language, setLanguage] = useState('zh');
const [description, setDescription] = useState(''); const [description, setDescription] = useState('');
@@ -278,17 +268,15 @@ const AddVoiceModal: React.FC<{
useEffect(() => { useEffect(() => {
if (!initialVoice) return; if (!initialVoice) return;
const nextVendor = initialVoice.vendor === 'SiliconFlow' ? '硅基流动' : initialVoice.vendor; const nextVendor = 'OpenAI Compatible';
const nextModel = initialVoice.model || SILICONFLOW_DEFAULT_MODEL; const nextModel = initialVoice.model || OPENAI_COMPATIBLE_DEFAULT_MODEL;
const defaultVoiceKey = buildSiliconflowVoiceKey(initialVoice.id || initialVoice.name || '', nextModel); const defaultVoiceKey = buildOpenAICompatibleVoiceKey(initialVoice.id || initialVoice.name || '', nextModel);
setVendor((nextVendor as any) || '硅基流动'); setVendor(nextVendor);
setName(initialVoice.name || ''); setName(initialVoice.name || '');
setGender(initialVoice.gender || 'Female'); setGender(initialVoice.gender || 'Female');
setLanguage(initialVoice.language || 'zh'); setLanguage(initialVoice.language || 'zh');
setDescription(initialVoice.description || ''); setDescription(initialVoice.description || '');
setModel(initialVoice.model || ''); setOpenaiCompatibleModel(nextModel);
setVoiceKey(initialVoice.voiceKey || '');
setSfModel(nextModel);
setSfVoiceId((initialVoice.voiceKey || '').trim() || defaultVoiceKey); setSfVoiceId((initialVoice.voiceKey || '').trim() || defaultVoiceKey);
setSfSpeed(initialVoice.speed ?? 1); setSfSpeed(initialVoice.speed ?? 1);
setSfGain(initialVoice.gain ?? 0); setSfGain(initialVoice.gain ?? 0);
@@ -325,21 +313,21 @@ const AddVoiceModal: React.FC<{
return; return;
} }
const resolvedSiliconflowVoiceKey = (() => { const resolvedVoiceKey = (() => {
const current = (sfVoiceId || '').trim(); const current = (sfVoiceId || '').trim();
if (current) return current; if (current) return current;
return buildSiliconflowVoiceKey(initialVoice?.id || name, sfModel || SILICONFLOW_DEFAULT_MODEL); return buildOpenAICompatibleVoiceKey(initialVoice?.id || name, openaiCompatibleModel || OPENAI_COMPATIBLE_DEFAULT_MODEL);
})(); })();
const newVoice: Voice = { const newVoice: Voice = {
id: initialVoice?.id || `${vendor === '硅基流动' ? 'sf' : 'gen'}-${Date.now()}`, id: initialVoice?.id || `oa-${Date.now()}`,
name, name,
vendor, vendor,
gender, gender,
language, language,
description: description || (vendor === '硅基流动' ? `Model: ${sfModel}` : `Model: ${model}`), description: description || `Model: ${openaiCompatibleModel}`,
model: vendor === '硅基流动' ? sfModel : model, model: openaiCompatibleModel,
voiceKey: vendor === '硅基流动' ? resolvedSiliconflowVoiceKey : voiceKey, voiceKey: resolvedVoiceKey,
apiKey, apiKey,
baseUrl, baseUrl,
speed: sfSpeed, speed: sfSpeed,
@@ -351,10 +339,8 @@ const AddVoiceModal: React.FC<{
setIsSaving(true); setIsSaving(true);
await onSuccess(newVoice); await onSuccess(newVoice);
setName(''); setName('');
setVendor('硅基流动'); setVendor('OpenAI Compatible');
setDescription(''); setDescription('');
setModel('');
setVoiceKey('');
setApiKey(''); setApiKey('');
setBaseUrl(''); setBaseUrl('');
} catch (error: any) { } catch (error: any) {
@@ -381,19 +367,7 @@ const AddVoiceModal: React.FC<{
<div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar"> <div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar">
<div className="space-y-1.5"> <div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Vendor)</label> <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Vendor)</label>
<div className="relative"> <Input value={vendor} readOnly className="h-10 border border-white/10 bg-white/5" />
<select
className="flex h-10 w-full rounded-md border border-white/10 bg-white/5 px-3 py-1 text-sm shadow-sm transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-primary/50 text-foreground appearance-none cursor-pointer [&>option]:bg-card"
value={vendor}
onChange={(e) => setVendor(e.target.value as any)}
>
<option value="硅基流动"> (SiliconFlow)</option>
<option value="Ali"> (Ali)</option>
<option value="Volcano"> (Volcano)</option>
<option value="Minimax">Minimax</option>
</select>
<ChevronDown className="absolute right-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground pointer-events-none" />
</div>
</div> </div>
<div className="h-px bg-white/5"></div> <div className="h-px bg-white/5"></div>
@@ -403,15 +377,14 @@ const AddVoiceModal: React.FC<{
<Input value={name} onChange={(e) => setName(e.target.value)} placeholder="例如: 客服小美" /> <Input value={name} onChange={(e) => setName(e.target.value)} placeholder="例如: 客服小美" />
</div> </div>
{vendor === '硅基流动' ? (
<div className="space-y-4 animate-in fade-in slide-in-from-top-1 duration-200"> <div className="space-y-4 animate-in fade-in slide-in-from-top-1 duration-200">
<div className="grid grid-cols-2 gap-4"> <div className="grid grid-cols-2 gap-4">
<div className="space-y-1.5"> <div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Model)</label> <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (Model)</label>
<Input <Input
className="font-mono text-xs" className="font-mono text-xs"
value={sfModel} value={openaiCompatibleModel}
onChange={(e) => setSfModel(e.target.value)} onChange={(e) => setOpenaiCompatibleModel(e.target.value)}
placeholder="例如: FunAudioLLM/CosyVoice2-0.5B" placeholder="例如: FunAudioLLM/CosyVoice2-0.5B"
/> />
</div> </div>
@@ -445,20 +418,6 @@ const AddVoiceModal: React.FC<{
</div> </div>
</div> </div>
</div> </div>
) : (
<div className="space-y-4 animate-in fade-in slide-in-from-top-1 duration-200">
<div className="grid grid-cols-2 gap-4">
<div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"></label>
<Input value={model} onChange={(e) => setModel(e.target.value)} placeholder="API Model Key" />
</div>
<div className="space-y-1.5">
<label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"></label>
<Input value={voiceKey} onChange={(e) => setVoiceKey(e.target.value)} placeholder="Voice Key" />
</div>
</div>
</div>
)}
<div className="grid grid-cols-2 gap-4"> <div className="grid grid-cols-2 gap-4">
<div className="space-y-1.5"> <div className="space-y-1.5">
@@ -560,7 +519,7 @@ const CloneVoiceModal: React.FC<{
const newVoice: Voice = { const newVoice: Voice = {
id: `v-${Date.now()}`, id: `v-${Date.now()}`,
name, name,
vendor: 'Volcano', vendor: 'OpenAI Compatible',
gender: 'Female', gender: 'Female',
language: 'zh', language: 'zh',
description: description || 'User cloned voice', description: description || 'User cloned voice',

View File

@@ -55,8 +55,11 @@ const mapVoice = (raw: AnyRecord): Voice => ({
id: String(readField(raw, ['id'], '')), id: String(readField(raw, ['id'], '')),
name: readField(raw, ['name'], ''), name: readField(raw, ['name'], ''),
vendor: ((): string => { vendor: ((): string => {
const vendor = String(readField(raw, ['vendor'], '')); const vendor = String(readField(raw, ['vendor'], '')).trim().toLowerCase();
return vendor.toLowerCase() === 'siliconflow' ? '硅基流动' : vendor; if (vendor === 'siliconflow' || vendor === '硅基流动' || vendor === 'openai-compatible') {
return 'OpenAI Compatible';
}
return String(readField(raw, ['vendor'], 'OpenAI Compatible')) || 'OpenAI Compatible';
})(), })(),
gender: readField(raw, ['gender'], ''), gender: readField(raw, ['gender'], ''),
language: readField(raw, ['language'], ''), language: readField(raw, ['language'], ''),
@@ -296,7 +299,7 @@ export const createVoice = async (data: Partial<Voice>): Promise<Voice> => {
const payload = { const payload = {
id: data.id || undefined, id: data.id || undefined,
name: data.name || 'New Voice', name: data.name || 'New Voice',
vendor: data.vendor === '硅基流动' ? 'SiliconFlow' : (data.vendor || 'SiliconFlow'), vendor: data.vendor || 'OpenAI Compatible',
gender: data.gender || 'Female', gender: data.gender || 'Female',
language: data.language || 'zh', language: data.language || 'zh',
description: data.description || '', description: data.description || '',
@@ -316,7 +319,7 @@ export const createVoice = async (data: Partial<Voice>): Promise<Voice> => {
export const updateVoice = async (id: string, data: Partial<Voice>): Promise<Voice> => { export const updateVoice = async (id: string, data: Partial<Voice>): Promise<Voice> => {
const payload = { const payload = {
name: data.name, name: data.name,
vendor: data.vendor === '硅基流动' ? 'SiliconFlow' : data.vendor, vendor: data.vendor,
gender: data.gender, gender: data.gender,
language: data.language, language: data.language,
description: data.description, description: data.description,

View File

@@ -200,7 +200,7 @@ export const mockLLMModels: LLMModel[] = [
{ id: 'm1', name: 'GPT-4o', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***', temperature: 0.7 }, { id: 'm1', name: 'GPT-4o', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***', temperature: 0.7 },
{ id: 'm2', name: 'DeepSeek-V3', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.deepseek.com', apiKey: 'sk-***', temperature: 0.5 }, { id: 'm2', name: 'DeepSeek-V3', vendor: 'OpenAI Compatible', type: 'text', baseUrl: 'https://api.deepseek.com', apiKey: 'sk-***', temperature: 0.5 },
{ id: 'm3', name: 'text-embedding-3-small', vendor: 'OpenAI Compatible', type: 'embedding', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***' }, { id: 'm3', name: 'text-embedding-3-small', vendor: 'OpenAI Compatible', type: 'embedding', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-***' },
{ id: 'm4', name: 'bge-reranker-v2-m3', vendor: 'SiliconFlow', type: 'rerank', baseUrl: 'https://api.siliconflow.cn/v1', apiKey: 'sk-***' }, { id: 'm4', name: 'bge-reranker-v2-m3', vendor: 'OpenAI Compatible', type: 'rerank', baseUrl: 'https://api.siliconflow.cn/v1', apiKey: 'sk-***' },
]; ];
export const mockASRModels: ASRModel[] = [ export const mockASRModels: ASRModel[] = [