From 4bf2f788ad6174b2d94e5a2434f075d7add58574 Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Sun, 8 Feb 2026 23:38:34 +0800
Subject: [PATCH] Update ASR library preview

Add a POST /asr/{id}/preview endpoint that forwards uploaded audio to an
OpenAI-compatible /audio/transcriptions API, plus upload/record preview
modals and full CRUD wiring in the web ASR library page.

---
 api/app/routers/asr.py     | 123 ++++++-
 api/tests/test_asr.py      |  58 ++++
 web/pages/ASRLibrary.tsx   | 669 +++++++++++++++++++++++++++----------
 web/services/backendApi.ts | 107 +++++-
 web/types.ts               |   7 +-
 5 files changed, 781 insertions(+), 183 deletions(-)

diff --git a/api/app/routers/asr.py b/api/app/routers/asr.py
index e55028e..5805061 100644
--- a/api/app/routers/asr.py
+++ b/api/app/routers/asr.py
@@ -1,12 +1,11 @@
-from fastapi import APIRouter, Depends, HTTPException
-from sqlalchemy.orm import Session
-from typing import List, Optional
-import uuid
-import httpx
+import os
 import time
-import base64
-import json
-from datetime import datetime
+import uuid
+from typing import List, Optional
+
+import httpx
+from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
+from sqlalchemy.orm import Session
 
 from ..db import get_db
 from ..models import ASRModel
@@ -17,6 +16,18 @@ from ..schemas import (
 
 router = APIRouter(prefix="/asr", tags=["ASR Models"])
 
+SILICONFLOW_DEFAULT_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
+
+
+def _is_siliconflow_vendor(vendor: str) -> bool:
+    return (vendor or "").strip().lower() in {"siliconflow", "硅基流动"}
+
+
+def _default_asr_model(vendor: str) -> str:
+    if _is_siliconflow_vendor(vendor):
+        return SILICONFLOW_DEFAULT_ASR_MODEL
+    return "whisper-1"
+
 
 # ============ ASR Models CRUD ============
 
 @router.get("")
@@ -219,3 +230,99 @@ def transcribe_audio(
 
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/{id}/preview", response_model=ASRTestResponse)
+async def preview_asr_model(
+    id: str,
+    file: UploadFile = File(...),
+    language: Optional[str] = Form(None),
+    api_key: Optional[str] = Form(None),
+    db: Session = Depends(get_db),
+):
+    """Preview an ASR model: upload audio and forward it to the OpenAI-compatible /audio/transcriptions API."""
+    model = db.query(ASRModel).filter(ASRModel.id == id).first()
+    if not model:
+        raise HTTPException(status_code=404, detail="ASR Model not found")
+
+    if not file:
+        raise HTTPException(status_code=400, detail="Audio file is required")
+
+    filename = file.filename or "preview.wav"
+    content_type = file.content_type or "application/octet-stream"
+    if not content_type.startswith("audio/"):
+        raise HTTPException(status_code=400, detail="Only audio files are supported")
+
+    audio_bytes = await file.read()
+    if not audio_bytes:
+        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
+
+    effective_api_key = (api_key or "").strip() or (model.api_key or "").strip()
+    if not effective_api_key and _is_siliconflow_vendor(model.vendor):
+        effective_api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
+    if not effective_api_key:
+        raise HTTPException(status_code=400, detail=f"API key is required for ASR model: {model.name}")
+
+    base_url = (model.base_url or "").strip().rstrip("/")
+    if not base_url:
+        raise HTTPException(status_code=400, detail=f"Base URL is required for ASR model: {model.name}")
+
+    selected_model = (model.model_name or "").strip() or _default_asr_model(model.vendor)
+    data = {"model": selected_model}
+    effective_language = (language or "").strip() or None
+    if effective_language:
+        data["language"] = effective_language
+    if model.hotwords:
+        # Hotwords are passed through the OpenAI-style "prompt" field to bias decoding.
+        data["prompt"] = " ".join(model.hotwords)
+
+    headers = {"Authorization": f"Bearer {effective_api_key}"}
+    files = {"file": (filename, audio_bytes, content_type)}
+
+    start_time = time.time()
+    try:
+        with httpx.Client(timeout=90.0) as client:
+            response = client.post(
+                f"{base_url}/audio/transcriptions",
+                headers=headers,
+                data=data,
+                files=files,
+            )
+    except Exception as exc:
+        raise HTTPException(status_code=502, detail=f"ASR request failed: {exc}") from exc
+
+    if response.status_code != 200:
+        detail = response.text
+        try:
+            detail_json = response.json()
+            detail = detail_json.get("error", {}).get("message") or detail_json.get("detail") or detail
+        except Exception:
+            pass
+        raise HTTPException(status_code=502, detail=f"ASR vendor error: {detail}")
+
+    try:
+        payload = response.json()
+    except Exception:
+        payload = {"text": response.text}
+
+    transcript = ""
+    response_language = model.language
+    confidence = None
+    if isinstance(payload, dict):
+        transcript = str(payload.get("text") or payload.get("transcript") or "")
+        response_language = str(payload.get("language") or effective_language or model.language)
+        raw_confidence = payload.get("confidence")
+        if raw_confidence is not None:
+            try:
+                confidence = float(raw_confidence)
+            except (TypeError, ValueError):
+                confidence = None
+
+    latency_ms = int((time.time() - start_time) * 1000)
+    return ASRTestResponse(
+        success=bool(transcript),
+        transcript=transcript,
+        language=response_language,
+        confidence=confidence,
+        latency_ms=latency_ms,
+        message=None if transcript else "No transcript in response",
+    )
diff --git a/api/tests/test_asr.py b/api/tests/test_asr.py
index 3f8cc77..209116c 100644
--- a/api/tests/test_asr.py
+++ b/api/tests/test_asr.py
@@ -287,3 +287,61 @@ class TestASRModelAPI:
         response = client.post("/api/asr", json=data)
         assert response.status_code == 200
         assert response.json()["vendor"] == vendor
+
+    def test_preview_asr_model_success(self, client, sample_asr_model_data, monkeypatch):
+        """Test ASR preview endpoint with OpenAI-compatible transcriptions API."""
+        from app.routers import asr as asr_router
+
+        create_response = client.post("/api/asr", json=sample_asr_model_data)
+        model_id = create_response.json()["id"]
+
+        class DummyResponse:
+            status_code = 200
+
+            def json(self):
+                return {"text": "你好,这是测试转写", "language": "zh", "confidence": 0.98}
+
+            @property
+            def text(self):
+                return '{"text":"ok"}'
+
+        class DummyClient:
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def __enter__(self):
+                return self
+
+            def __exit__(self, exc_type, exc, tb):
+                return False
+
+            def post(self, url, headers=None, data=None, files=None):
+                assert url.endswith("/audio/transcriptions")
+                assert headers["Authorization"] == f"Bearer {sample_asr_model_data['api_key']}"
+                assert data["model"] == sample_asr_model_data["model_name"]
+                assert files["file"][0] == "sample.wav"
+                return DummyResponse()
+
+        monkeypatch.setattr(asr_router.httpx, "Client", DummyClient)
+
+        response = client.post(
+            f"/api/asr/{model_id}/preview",
+            files={"file": ("sample.wav", b"fake-wav-bytes", "audio/wav")},
+        )
+        assert response.status_code == 200
+        payload = response.json()
+        assert payload["success"] is True
+        assert payload["transcript"] == "你好,这是测试转写"
+        assert payload["language"] == "zh"
+
+    def test_preview_asr_model_reject_non_audio(self, client, sample_asr_model_data):
+        """Test ASR preview endpoint rejects non-audio file."""
+        create_response = client.post("/api/asr", json=sample_asr_model_data)
+        model_id = create_response.json()["id"]
+
+        response = client.post(
+            f"/api/asr/{model_id}/preview",
+            files={"file": ("sample.txt", b"text-data", "text/plain")},
+        )
+        assert response.status_code == 400
+        assert "Only audio files are supported" in response.text
diff --git a/web/pages/ASRLibrary.tsx b/web/pages/ASRLibrary.tsx
index a47b845..9611921 100644
--- a/web/pages/ASRLibrary.tsx
+++ b/web/pages/ASRLibrary.tsx
@@ -1,103 +1,122 @@
-
-import React, { useState } from 'react';
-import { Search, Filter, Plus, Trash2, Key, Server, Ear, Globe, Languages } from 'lucide-react';
+import React, { useEffect, useRef, useState } from 'react';
+import { Search, Filter, Plus, Trash2, Key, Server, Ear, Globe, Languages, Pencil, Mic, Square, Upload } from 'lucide-react';
 import { Button, Input, TableHeader, TableRow, TableHead, TableCell, Dialog, Badge } from '../components/UI';
-import { mockASRModels } from '../services/mockData';
 import { ASRModel } from '../types';
+import { createASRModel, deleteASRModel, fetchASRModels, previewASRModel, updateASRModel } from '../services/backendApi';
+
+const maskApiKey = (key?: string) => {
+  if (!key) return '********';
+  if (key.length < 8) return '********';
+  return `${key.slice(0, 3)}****${key.slice(-4)}`;
+};
+
+const parseHotwords = (value: string): string[] => {
+  return value
+    .split(/[\n,]/)
+    .map((item) => item.trim())
+    .filter(Boolean);
+};
+
+const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', ');
 
 export const ASRLibraryPage: React.FC = () => {
-  const [models, setModels] = useState<ASRModel[]>(mockASRModels);
+  const [models, setModels] = useState<ASRModel[]>([]);
   const [searchTerm, setSearchTerm] = useState('');
   const [vendorFilter, setVendorFilter] = useState('all');
   const [langFilter, setLangFilter] = useState('all');
   const [isAddModalOpen, setIsAddModalOpen] = useState(false);
+  const [editingModel, setEditingModel] = useState<ASRModel | null>(null);
+  const [previewingModel, setPreviewingModel] = useState<ASRModel | null>(null);
+  const [isLoading, setIsLoading] = useState(true);
 
-  // Form State
-  const [newModel, setNewModel] = useState<Partial<ASRModel>>({
-    vendor: 'OpenAI Compatible',
-    language: 'zh'
-  });
+  const loadModels = async () => {
+    setIsLoading(true);
+    try {
+      setModels(await fetchASRModels());
+    } catch (error) {
+      console.error(error);
+      setModels([]);
+    } finally {
+      setIsLoading(false);
+    }
+  };
 
-  const filteredModels = models.filter(m => {
-    const matchesSearch = m.name.toLowerCase().includes(searchTerm.toLowerCase());
+  useEffect(() => {
+    loadModels();
+  }, []);
+
+  const filteredModels = models.filter((m) => {
+    const q = searchTerm.toLowerCase();
+    const matchesSearch = m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q);
     const matchesVendor = vendorFilter === 'all' || m.vendor === vendorFilter;
     const matchesLang = langFilter === 'all' || m.language === langFilter || m.language === 'Multi-lingual';
     return matchesSearch && matchesVendor && matchesLang;
   });
 
-  const handleAddModel = () => {
-    if (!newModel.name || !newModel.baseUrl || !newModel.apiKey) {
-      alert("请填写完整信息");
-      return;
-    }
-
-    const model: ASRModel = {
-      id: `asr_${Date.now()}`,
-      name: newModel.name,
-      vendor: newModel.vendor as 'OpenAI Compatible',
-      language: newModel.language || 'zh',
-      baseUrl: newModel.baseUrl,
-      apiKey: newModel.apiKey
-    };
-
-    setModels([model, ...models]);
+  const handleCreate = async (data: Partial<ASRModel>) => {
+    const created = await createASRModel(data);
+    setModels((prev) => [created, ...prev]);
     setIsAddModalOpen(false);
-    setNewModel({ vendor: 'OpenAI Compatible', language: 'zh', name: '', baseUrl: '', apiKey: '' });
   };
 
-  const handleDeleteModel = (id: string) => {
-    if (confirm('确认删除该语音识别模型吗?')) {
-      setModels(prev => prev.filter(m => m.id !== id));
-    }
+  const handleUpdate = async (id: string, data: Partial<ASRModel>) => {
+    const updated = await updateASRModel(id, data);
+    setModels((prev) => prev.map((m) => (m.id === id ? updated : m)));
+    setEditingModel(null);
   };
 
-  const maskApiKey = (key: string) => {
-    if (!key || key.length < 8) return '********';
-    return `${key.substring(0, 3)}****${key.substring(key.length - 4)}`;
+  const handleDelete = async (id: string) => {
+    if (!confirm('确认删除该语音识别模型吗?')) return;
+    await deleteASRModel(id);
+    setModels((prev) => prev.filter((m) => m.id !== id));
   };
 
+  const vendorOptions = Array.from(new Set(models.map((m) => m.vendor).filter(Boolean)));
+
   return (
     <div>
       <div>
         <h1>语音识别</h1>
       </div>
 
       <div>
-        <Search />
-        <Input
-          placeholder="搜索模型..."
-          value={searchTerm}
-          onChange={e => setSearchTerm(e.target.value)}
-        />
-        <Filter />
-        <select value={vendorFilter} onChange={e => setVendorFilter(e.target.value)}>
-          <option value="all">全部厂商</option>
-          <option value="OpenAI Compatible">OpenAI Compatible</option>
-        </select>
-        <select value={langFilter} onChange={e => setLangFilter(e.target.value)}>
-          <option value="all">全部语言</option>
-          <option value="zh">zh</option>
-          <option value="en">en</option>
-          <option value="Multi-lingual">Multi-lingual</option>
-        </select>
-        <Button onClick={() => setIsAddModalOpen(true)}>
-          <Plus /> 添加模型
-        </Button>
+        <Search />
+        <Input
+          placeholder="搜索模型..."
+          value={searchTerm}
+          onChange={(e) => setSearchTerm(e.target.value)}
+        />
+        <Filter />
+        <select value={vendorFilter} onChange={(e) => setVendorFilter(e.target.value)}>
+          <option value="all">全部厂商</option>
+          {vendorOptions.map((v) => (
+            <option key={v} value={v}>{v}</option>
+          ))}
+        </select>
+        <select value={langFilter} onChange={(e) => setLangFilter(e.target.value)}>
+          <option value="all">全部语言</option>
+          <option value="zh">zh</option>
+          <option value="en">en</option>
+          <option value="Multi-lingual">Multi-lingual</option>
+        </select>
+        <Button onClick={() => setIsAddModalOpen(true)}>
+          <Plus /> 添加模型
+        </Button>
       </div>
 
@@ -105,131 +124,435 @@
       <table>
         <TableHeader>
           <TableRow>
             <TableHead>模型名称</TableHead>
-            <TableHead>接口类型</TableHead>
+            <TableHead>厂商</TableHead>
             <TableHead>语言</TableHead>
+            <TableHead>模型标识</TableHead>
             <TableHead>Base URL</TableHead>
             <TableHead>API Key</TableHead>
             <TableHead>操作</TableHead>
           </TableRow>
         </TableHeader>
         <tbody>
-          {filteredModels.map(model => (
-            <TableRow key={model.id}>
-              <TableCell>
-                <Ear />
-                {model.name}
-              </TableCell>
-              <TableCell>
-                <Badge>{model.vendor}</Badge>
-              </TableCell>
-              <TableCell>
-                <Languages />
-                {model.language}
-              </TableCell>
-              <TableCell>
-                <Server />
-                {model.baseUrl}
-              </TableCell>
-              <TableCell>
-                <Key />
-                {maskApiKey(model.apiKey)}
-              </TableCell>
-              <TableCell>
-                <Button onClick={() => handleDeleteModel(model.id)}>
-                  <Trash2 />
-                </Button>
-              </TableCell>
-            </TableRow>
-          ))}
-          {filteredModels.length === 0 && (
-            <TableRow>
-              <TableCell>暂无语音识别模型</TableCell>
-            </TableRow>
-          )}
+          {!isLoading && filteredModels.map((model) => (
+            <TableRow key={model.id}>
+              <TableCell>
+                <Ear />
+                <span>{model.name}</span>
+                {model.hotwords && model.hotwords.length > 0 && (
+                  <span>热词: {model.hotwords.join(', ')}</span>
+                )}
+              </TableCell>
+              <TableCell>
+                <Badge>{model.vendor}</Badge>
+              </TableCell>
+              <TableCell>{model.language}</TableCell>
+              <TableCell>{model.modelName || '-'}</TableCell>
+              <TableCell>{model.baseUrl}</TableCell>
+              <TableCell>{maskApiKey(model.apiKey)}</TableCell>
+              <TableCell>
+                <Button onClick={() => setPreviewingModel(model)}>
+                  <Mic />
+                </Button>
+                <Button onClick={() => setEditingModel(model)}>
+                  <Pencil />
+                </Button>
+                <Button onClick={() => handleDelete(model.id)}>
+                  <Trash2 />
+                </Button>
+              </TableCell>
+            </TableRow>
+          ))}
+          {!isLoading && filteredModels.length === 0 && (
+            <TableRow>
+              <TableCell>暂无语音识别模型</TableCell>
+            </TableRow>
+          )}
+          {isLoading && (
+            <TableRow>
+              <TableCell>加载中...</TableCell>
+            </TableRow>
+          )}
         </tbody>
       </table>
 
-      <Dialog
-        isOpen={isAddModalOpen}
-        onClose={() => setIsAddModalOpen(false)}
-        title="添加语音识别模型"
-        footer={
-          <>
-            <Button onClick={() => setIsAddModalOpen(false)}>取消</Button>
-            <Button onClick={handleAddModel}>确认添加</Button>
-          </>
-        }
-      >
-        <div>
-          <label>接口类型</label>
-          <select
-            value={newModel.vendor}
-            onChange={e => setNewModel({ ...newModel, vendor: e.target.value as 'OpenAI Compatible' })}
-          >
-            <option value="OpenAI Compatible">OpenAI Compatible</option>
-          </select>
-
-          <label>语言</label>
-          {(['zh', 'en', 'Multi-lingual'] as const).map(l => (
-            <Button key={l} onClick={() => setNewModel({ ...newModel, language: l })}>
-              {l}
-            </Button>
-          ))}
-
-          <label>模型名称</label>
-          <Input
-            value={newModel.name || ''}
-            onChange={e => setNewModel({ ...newModel, name: e.target.value })}
-            placeholder="例如: whisper-1, funasr"
-          />
-
-          <label>Base URL</label>
-          <Input
-            value={newModel.baseUrl || ''}
-            onChange={e => setNewModel({ ...newModel, baseUrl: e.target.value })}
-            placeholder="https://api.openai.com/v1"
-            className="font-mono text-xs"
-          />
-
-          <label>API Key</label>
-          <Input
-            value={newModel.apiKey || ''}
-            onChange={e => setNewModel({ ...newModel, apiKey: e.target.value })}
-            placeholder="sk-..."
-            className="font-mono text-xs"
-          />
-        </div>
-      </Dialog>
+      <ASRModelModal
+        isOpen={isAddModalOpen}
+        onClose={() => setIsAddModalOpen(false)}
+        onSubmit={handleCreate}
+      />
+
+      <ASRModelModal
+        isOpen={!!editingModel}
+        onClose={() => setEditingModel(null)}
+        onSubmit={(data) => handleUpdate(editingModel!.id, data)}
+        initialModel={editingModel || undefined}
+      />
+
+      <ASRPreviewModal
+        isOpen={!!previewingModel}
+        onClose={() => setPreviewingModel(null)}
+        model={previewingModel}
+      />
     </div>
   );
 };
+
+const ASRModelModal: React.FC<{
+  isOpen: boolean;
+  onClose: () => void;
+  onSubmit: (model: Partial<ASRModel>) => Promise<void>;
+  initialModel?: ASRModel;
+}> = ({ isOpen, onClose, onSubmit, initialModel }) => {
+  const [name, setName] = useState('');
+  const [vendor, setVendor] = useState('OpenAI Compatible');
+  const [language, setLanguage] = useState('zh');
+  const [modelName, setModelName] = useState('FunAudioLLM/SenseVoiceSmall');
+  const [baseUrl, setBaseUrl] = useState('https://api.siliconflow.cn/v1');
+  const [apiKey, setApiKey] = useState('');
+  const [hotwords, setHotwords] = useState('');
+  const [enablePunctuation, setEnablePunctuation] = useState(true);
+  const [enableNormalization, setEnableNormalization] = useState(true);
+  const [enabled, setEnabled] = useState(true);
+  const [saving, setSaving] = useState(false);
+
+  useEffect(() => {
+    if (!isOpen) return;
+    if (initialModel) {
+      setName(initialModel.name || '');
+      setVendor(initialModel.vendor || 'OpenAI Compatible');
+      setLanguage(initialModel.language || 'zh');
+      setModelName(initialModel.modelName || 'FunAudioLLM/SenseVoiceSmall');
+      setBaseUrl(initialModel.baseUrl || 'https://api.siliconflow.cn/v1');
+      setApiKey(initialModel.apiKey || '');
+      setHotwords(toHotwordsValue(initialModel.hotwords));
+      setEnablePunctuation(initialModel.enablePunctuation ?? true);
+      setEnableNormalization(initialModel.enableNormalization ?? true);
+      setEnabled(initialModel.enabled ?? true);
+      return;
+    }
+
+    setName('');
+    setVendor('OpenAI Compatible');
+    setLanguage('zh');
+    setModelName('FunAudioLLM/SenseVoiceSmall');
+    setBaseUrl('https://api.siliconflow.cn/v1');
+    setApiKey('');
+    setHotwords('');
+    setEnablePunctuation(true);
+    setEnableNormalization(true);
+    setEnabled(true);
+  }, [initialModel, isOpen]);
+
+  const handleSubmit = async () => {
+    if (!name.trim()) {
+      alert('请填写模型名称');
+      return;
+    }
+    if (!baseUrl.trim()) {
+      alert('请填写 Base URL');
+      return;
+    }
+    if (!apiKey.trim()) {
+      alert('请填写 API Key');
+      return;
+    }
+
+    try {
+      setSaving(true);
+      await onSubmit({
+        name: name.trim(),
+        vendor: vendor.trim(),
+        language,
+        modelName: modelName.trim(),
+        baseUrl: baseUrl.trim(),
+        apiKey: apiKey.trim(),
+        hotwords: parseHotwords(hotwords),
+        enablePunctuation,
+        enableNormalization,
+        enabled,
+      });
+    } catch (error: any) {
+      alert(error?.message || '保存失败');
+    } finally {
+      setSaving(false);
+    }
+  };
+
+  return (
+    <Dialog
+      isOpen={isOpen}
+      onClose={onClose}
+      title={initialModel ? '编辑语音识别模型' : '添加语音识别模型'}
+      footer={
+        <>
+          <Button onClick={onClose}>取消</Button>
+          <Button onClick={handleSubmit}>
+            {saving ? '保存中...' : '保存'}
+          </Button>
+        </>
+      }
+    >
+      <div>
+        <label>模型名称</label>
+        <Input value={name} onChange={(e) => setName(e.target.value)} placeholder="例如: SenseVoice CN" />
+
+        <label>厂商</label>
+        <select value={vendor} onChange={(e) => setVendor(e.target.value)}>
+          <option value="OpenAI Compatible">OpenAI Compatible</option>
+          <option value="硅基流动">硅基流动</option>
+        </select>
+
+        <label>语言</label>
+        <select value={language} onChange={(e) => setLanguage(e.target.value)}>
+          <option value="zh">zh</option>
+          <option value="en">en</option>
+          <option value="Multi-lingual">Multi-lingual</option>
+        </select>
+
+        <label>模型标识</label>
+        <Input value={modelName} onChange={(e) => setModelName(e.target.value)} placeholder="FunAudioLLM/SenseVoiceSmall" />
+
+        <label>Base URL</label>
+        <Input value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} placeholder="https://api.siliconflow.cn/v1" className="font-mono text-xs" />
+
+        <label>API Key</label>
+        <Input value={apiKey} onChange={(e) => setApiKey(e.target.value)} type="password" placeholder="sk-..." className="font-mono text-xs" />
+
+        <label>热词</label>
+        <Input value={hotwords} onChange={(e) => setHotwords(e.target.value)} placeholder="品牌名, 人名, 专有词" />
+
+        <label>
+          <input type="checkbox" checked={enablePunctuation} onChange={(e) => setEnablePunctuation(e.target.checked)} />
+          标点
+        </label>
+        <label>
+          <input type="checkbox" checked={enableNormalization} onChange={(e) => setEnableNormalization(e.target.checked)} />
+          文本归一化
+        </label>
+        <label>
+          <input type="checkbox" checked={enabled} onChange={(e) => setEnabled(e.target.checked)} />
+          启用
+        </label>
+      </div>
+    </Dialog>
+ ); +}; + +const ASRPreviewModal: React.FC<{ + isOpen: boolean; + onClose: () => void; + model: ASRModel | null; +}> = ({ isOpen, onClose, model }) => { + const [selectedFile, setSelectedFile] = useState(null); + const [isDragging, setIsDragging] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const [transcript, setTranscript] = useState(''); + const [latency, setLatency] = useState(null); + const [confidence, setConfidence] = useState(null); + const [language, setLanguage] = useState(''); + const [isRecording, setIsRecording] = useState(false); + + const inputRef = useRef(null); + const mediaRecorderRef = useRef(null); + const streamRef = useRef(null); + const chunksRef = useRef([]); + + useEffect(() => { + if (!isOpen) return; + setSelectedFile(null); + setTranscript(''); + setLatency(null); + setConfidence(null); + setLanguage(model?.language || ''); + setIsTranscribing(false); + setIsRecording(false); + }, [isOpen, model]); + + useEffect(() => { + return () => { + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + } + }; + }, []); + + const pickFile = (file: File | null) => { + if (!file) return; + if (!file.type.startsWith('audio/')) { + alert('仅支持音频文件'); + return; + } + setSelectedFile(file); + }; + + const handleDrop = (event: React.DragEvent) => { + event.preventDefault(); + setIsDragging(false); + const file = event.dataTransfer.files?.[0] || null; + pickFile(file); + }; + + const startRecording = async () => { + if (!navigator.mediaDevices?.getUserMedia) { + alert('当前浏览器不支持麦克风录音'); + return; + } + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const recorder = new MediaRecorder(stream); + + chunksRef.current = []; + streamRef.current = stream; + mediaRecorderRef.current = recorder; + + recorder.ondataavailable = (event) => { + if (event.data.size > 0) { + chunksRef.current.push(event.data); + } + }; + + recorder.onstop = () => { + const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' }); + const file = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' }); + setSelectedFile(file); + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + }; + + recorder.start(); + setIsRecording(true); + } catch (error: any) { + alert(error?.message || '无法访问麦克风'); + } + }; + + const stopRecording = () => { + if (!mediaRecorderRef.current) return; + mediaRecorderRef.current.stop(); + setIsRecording(false); + }; + + const runPreview = async () => { + if (!model?.id) return; + if (!selectedFile) { + alert('请先上传或录制音频'); + return; + } + + try { + setIsTranscribing(true); + const result = await previewASRModel(model.id, selectedFile, { language: language || undefined }); + setTranscript(result.transcript || result.message || '无识别内容'); + setLatency(result.latency_ms ?? null); + setConfidence(result.confidence ?? null); + } catch (error: any) { + alert(error?.message || '识别失败'); + } finally { + setIsTranscribing(false); + } + }; + + return ( + + + + + } + > +
+
{ + e.preventDefault(); + setIsDragging(true); + }} + onDragLeave={() => setIsDragging(false)} + onDrop={handleDrop} + > + pickFile(e.target.files?.[0] || null)} + /> +
+ +

拖拽音频文件到这里,或

+ + {selectedFile &&

已选择: {selectedFile.name}

} +
+
+ +
+
麦克风测试
+ {!isRecording ? ( + + ) : ( + + )} +
+ +
+ + setLanguage(e.target.value)} placeholder="zh / en / auto" /> +
+ +
+
+ 识别结果 + + {latency !== null ? `Latency: ${latency}ms` : ''} + {confidence !== null ? ` Confidence: ${confidence.toFixed(3)}` : ''} + +
+
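
Note (not part of the patch): a minimal sketch of exercising the new preview route from a script. It assumes the API is served on localhost:8000 under the /api prefix, as in the tests; the base URL, model id, and audio file name are placeholders.

    import httpx

    API_BASE = "http://localhost:8000/api"  # assumed host; the tests only fix the /api prefix
    MODEL_ID = "your-asr-model-id"          # placeholder: id returned by POST /api/asr

    with open("sample.wav", "rb") as audio:
        response = httpx.post(
            f"{API_BASE}/asr/{MODEL_ID}/preview",
            files={"file": ("sample.wav", audio, "audio/wav")},
            # Optional form fields: "language" hints the vendor decoder, and an
            # "api_key" field may be sent to override the stored key.
            data={"language": "zh"},
            timeout=120.0,
        )

    response.raise_for_status()
    result = response.json()  # ASRTestResponse: success, transcript, language, confidence, latency_ms, message
    print(result["transcript"], f"({result['latency_ms']} ms)")

Non-audio content types are rejected with a 400 before any vendor call is made, matching test_preview_asr_model_reject_non_audio.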