Files
AI-VideoAssistant/web/pages/ASRLibrary.tsx
2026-02-12 19:23:30 +08:00

704 lines
27 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import React, { useEffect, useRef, useState } from 'react';
import { Search, Filter, Plus, Trash2, Key, Server, Ear, Globe, Languages, Pencil, Mic, Square, Upload } from 'lucide-react';
import { Button, Input, Select, TableHeader, TableRow, TableHead, TableCell, Dialog, Badge, LibraryPageShell, TableStatusRow, LibraryActionCell } from '../components/UI';
import { ASRModel } from '../types';
import { createASRModel, deleteASRModel, fetchASRModels, previewASRModel, updateASRModel } from '../services/backendApi';
// Display helper: reveal only a short prefix/suffix of an API key.
// Missing or short keys (< 8 chars) are fully hidden.
const maskApiKey = (key?: string) => {
  if (!key || key.length < 8) {
    return '********';
  }
  const prefix = key.slice(0, 3);
  const suffix = key.slice(-4);
  return `${prefix}****${suffix}`;
};
// Split a free-form hotword string (comma- or newline-separated) into
// trimmed, non-empty entries.
const parseHotwords = (value: string): string[] => {
  const result: string[] = [];
  for (const part of value.split(/[\n,]/)) {
    const trimmed = part.trim();
    if (trimmed) {
      result.push(trimmed);
    }
  }
  return result;
};
// Render a hotword list back into the comma-separated form shown in the input field.
const toHotwordsValue = (hotwords?: string[]): string => {
  const words = hotwords || [];
  return words.join(', ');
};
// Construct an AudioContext, falling back to the prefixed webkitAudioContext
// constructor for older Safari builds.
const createAudioContext = (): AudioContext => {
  const win = window as any;
  const Ctor = win.AudioContext || win.webkitAudioContext;
  return new Ctor();
};
// Encode a decoded AudioBuffer as a 16-bit PCM WAV blob:
// a 44-byte RIFF/WAVE header followed by interleaved little-endian samples.
const encodeWav = (audioBuffer: AudioBuffer): Blob => {
  const channels = audioBuffer.numberOfChannels;
  const rate = audioBuffer.sampleRate;
  const frames = audioBuffer.length;
  const bytesPerSample = 2; // 16-bit PCM
  const blockAlign = channels * bytesPerSample;
  const dataSize = frames * blockAlign;

  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);

  const putAscii = (offset: number, text: string) => {
    for (let i = 0; i < text.length; i += 1) {
      view.setUint8(offset + i, text.charCodeAt(i));
    }
  };

  // --- RIFF/WAVE header ---
  putAscii(0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true); // file size minus the 8-byte RIFF preamble
  putAscii(8, 'WAVE');
  putAscii(12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk length
  view.setUint16(20, 1, true); // audio format 1 = integer PCM
  view.setUint16(22, channels, true);
  view.setUint32(24, rate, true);
  view.setUint32(28, rate * blockAlign, true); // byte rate
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, 16, true); // bits per sample
  putAscii(36, 'data');
  view.setUint32(40, dataSize, true);

  // --- Interleaved sample data ---
  const channelData: Float32Array[] = [];
  for (let ch = 0; ch < channels; ch += 1) {
    channelData.push(audioBuffer.getChannelData(ch));
  }
  let pos = 44;
  for (let frame = 0; frame < frames; frame += 1) {
    for (let ch = 0; ch < channels; ch += 1) {
      // Clamp to [-1, 1] before scaling into the signed 16-bit range.
      const sample = Math.min(1, Math.max(-1, channelData[ch][frame]));
      view.setInt16(pos, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true);
      pos += 2;
    }
  }
  return new Blob([buffer], { type: 'audio/wav' });
};
const convertRecordedBlobToWav = async (blob: Blob): Promise<File> => {
const audioContext = createAudioContext();
try {
const inputArrayBuffer = await blob.arrayBuffer();
const decoded = await audioContext.decodeAudioData(inputArrayBuffer.slice(0));
const wavBlob = encodeWav(decoded);
return new File([wavBlob], `mic-preview-${Date.now()}.wav`, { type: 'audio/wav' });
} finally {
await audioContext.close();
}
};
// Library page listing configured ASR (speech recognition) models with
// search, vendor/language filters, and add/edit/delete/preview actions.
// NOTE(review): several JSX labels and <option> texts are empty — the original
// (likely non-ASCII) label text appears to have been stripped; restore before shipping.
export const ASRLibraryPage: React.FC = () => {
  const [models, setModels] = useState<ASRModel[]>([]);
  const [searchTerm, setSearchTerm] = useState('');
  // Defaults to the only vendor option rendered below.
  const [vendorFilter, setVendorFilter] = useState<string>('OpenAI Compatible');
  const [langFilter, setLangFilter] = useState<string>('all');
  const [isAddModalOpen, setIsAddModalOpen] = useState(false);
  const [editingModel, setEditingModel] = useState<ASRModel | null>(null);
  const [previewingModel, setPreviewingModel] = useState<ASRModel | null>(null);
  const [isLoading, setIsLoading] = useState(true);

  // Fetch the model list from the backend; on failure, log and fall back to an empty table.
  const loadModels = async () => {
    setIsLoading(true);
    try {
      setModels(await fetchASRModels());
    } catch (error) {
      console.error(error);
      setModels([]);
    } finally {
      setIsLoading(false);
    }
  };

  useEffect(() => {
    loadModels();
  }, []);

  // Client-side filtering: search matches display name or backend model name;
  // "Multi-lingual" models match any specific language filter.
  // NOTE(review): the `langFilter !== 'all'` guard in the last clause is redundant
  // (the first clause already short-circuits when langFilter === 'all').
  const filteredModels = models.filter((m) => {
    const q = searchTerm.toLowerCase();
    const matchesSearch = m.name.toLowerCase().includes(q) || (m.modelName || '').toLowerCase().includes(q);
    const matchesVendor = m.vendor === vendorFilter;
    const matchesLang = langFilter === 'all' || m.language === langFilter || (langFilter !== 'all' && m.language === 'Multi-lingual');
    return matchesSearch && matchesVendor && matchesLang;
  });

  // Create via backend, then prepend to the local list and close the dialog.
  const handleCreate = async (data: Partial<ASRModel>) => {
    const created = await createASRModel(data);
    setModels((prev) => [created, ...prev]);
    setIsAddModalOpen(false);
  };

  // Update via backend, then replace the matching row in place.
  const handleUpdate = async (id: string, data: Partial<ASRModel>) => {
    const updated = await updateASRModel(id, data);
    setModels((prev) => prev.map((m) => (m.id === id ? updated : m)));
    setEditingModel(null);
  };

  // Delete with a native confirm prompt; removal is irreversible on the backend.
  const handleDelete = async (id: string) => {
    if (!confirm('确认删除该语音识别模型吗?该操作不可恢复。')) return;
    await deleteASRModel(id);
    setModels((prev) => prev.filter((m) => m.id !== id));
  };

  return (
    <LibraryPageShell
      title="语音识别"
      primaryAction={(
        <Button onClick={() => setIsAddModalOpen(true)} className="shadow-[0_0_15px_rgba(6,182,212,0.4)]">
          <Plus className="mr-2 h-4 w-4" />
        </Button>
      )}
      filterBar={(
        <>
          <div className="relative col-span-1 md:col-span-2">
            <Search className="absolute left-2.5 top-2.5 h-4 w-4 text-muted-foreground" />
            <Input
              placeholder="搜索名称..."
              className="pl-9 border-0 bg-white/5"
              value={searchTerm}
              onChange={(e) => setSearchTerm(e.target.value)}
            />
          </div>
          <div className="flex items-center space-x-2">
            <Filter className="h-4 w-4 text-muted-foreground" />
            <Select
              value={vendorFilter}
              onChange={(e) => setVendorFilter(e.target.value)}
            >
              <option value="OpenAI Compatible">OpenAI Compatible</option>
            </Select>
          </div>
          <div className="flex items-center space-x-2">
            <Select
              value={langFilter}
              onChange={(e) => setLangFilter(e.target.value)}
            >
              <option value="all"></option>
              <option value="zh"> (Chinese)</option>
              <option value="en"> (English)</option>
              <option value="Multi-lingual"> (Multi-lingual)</option>
            </Select>
          </div>
        </>
      )}
    >
      <div className="rounded-md border border-white/5 bg-card/40 backdrop-blur-md overflow-hidden">
        <table className="w-full text-sm">
          <TableHeader>
            <TableRow>
              <TableHead></TableHead>
              <TableHead></TableHead>
              <TableHead></TableHead>
              <TableHead></TableHead>
              <TableHead>Base URL</TableHead>
              <TableHead>API Key</TableHead>
              <TableHead className="text-right"></TableHead>
            </TableRow>
          </TableHeader>
          <tbody>
            {!isLoading && filteredModels.map((model) => (
              <TableRow key={model.id}>
                <TableCell className="font-medium text-white">
                  <div className="flex flex-col">
                    <span className="flex items-center">
                      <Ear className="w-4 h-4 mr-2 text-primary" />
                      {model.name}
                    </span>
                    {model.hotwords && model.hotwords.length > 0 && (
                      <span className="text-xs text-muted-foreground">: {model.hotwords.join(', ')}</span>
                    )}
                  </div>
                </TableCell>
                <TableCell><Badge variant="outline">{model.vendor}</Badge></TableCell>
                <TableCell>{model.language}</TableCell>
                <TableCell className="font-mono text-xs text-muted-foreground">{model.modelName || '-'}</TableCell>
                <TableCell className="font-mono text-xs text-muted-foreground max-w-[220px] truncate">{model.baseUrl}</TableCell>
                <TableCell className="font-mono text-xs text-muted-foreground">{maskApiKey(model.apiKey)}</TableCell>
                <LibraryActionCell
                  previewAction={(
                    <Button variant="ghost" size="icon" onClick={() => setPreviewingModel(model)} title="试听识别">
                      <Ear className="h-4 w-4" />
                    </Button>
                  )}
                  editAction={(
                    <Button variant="ghost" size="icon" onClick={() => setEditingModel(model)} title="编辑模型">
                      <Pencil className="h-4 w-4" />
                    </Button>
                  )}
                  deleteAction={(
                    <Button variant="ghost" size="icon" onClick={() => handleDelete(model.id)} className="text-muted-foreground hover:text-destructive transition-colors" title="删除模型">
                      <Trash2 className="h-4 w-4" />
                    </Button>
                  )}
                />
              </TableRow>
            ))}
            {!isLoading && filteredModels.length === 0 && <TableStatusRow colSpan={7} text="暂无语音识别模型" />}
            {isLoading && <TableStatusRow colSpan={7} text="加载中..." />}
          </tbody>
        </table>
      </div>
      {/* Add dialog (no initialModel => blank form). */}
      <ASRModelModal
        isOpen={isAddModalOpen}
        onClose={() => setIsAddModalOpen(false)}
        onSubmit={handleCreate}
      />
      {/* Edit dialog; `editingModel!` is safe because onSubmit only fires while the dialog is open. */}
      <ASRModelModal
        isOpen={!!editingModel}
        onClose={() => setEditingModel(null)}
        onSubmit={(data) => handleUpdate(editingModel!.id, data)}
        initialModel={editingModel || undefined}
      />
      <ASRPreviewModal
        isOpen={!!previewingModel}
        onClose={() => setPreviewingModel(null)}
        model={previewingModel}
      />
    </LibraryPageShell>
  );
};
// Dialog for creating a new ASR model or editing an existing one.
// The caller decides create-vs-update by what its onSubmit callback does;
// passing initialModel switches the dialog into edit mode.
const ASRModelModal: React.FC<{
  isOpen: boolean;
  onClose: () => void;
  onSubmit: (model: Partial<ASRModel>) => Promise<void>;
  initialModel?: ASRModel;
}> = ({ isOpen, onClose, onSubmit, initialModel }) => {
  const [name, setName] = useState('');
  const [vendor, setVendor] = useState('OpenAI Compatible');
  const [language, setLanguage] = useState('zh');
  const [modelName, setModelName] = useState('FunAudioLLM/SenseVoiceSmall');
  const [baseUrl, setBaseUrl] = useState('https://api.siliconflow.cn/v1');
  const [apiKey, setApiKey] = useState('');
  // Hotwords are edited as one comma/newline-separated string; parsed on submit.
  const [hotwords, setHotwords] = useState('');
  const [enablePunctuation, setEnablePunctuation] = useState(true);
  const [enableNormalization, setEnableNormalization] = useState(true);
  const [enabled, setEnabled] = useState(true);
  const [saving, setSaving] = useState(false);

  // Re-seed the form every time the dialog opens: from initialModel when
  // editing, otherwise reset to defaults for the "add" flow.
  useEffect(() => {
    if (!isOpen) return;
    if (initialModel) {
      setName(initialModel.name || '');
      setVendor(initialModel.vendor || 'OpenAI Compatible');
      setLanguage(initialModel.language || 'zh');
      setModelName(initialModel.modelName || 'FunAudioLLM/SenseVoiceSmall');
      setBaseUrl(initialModel.baseUrl || 'https://api.siliconflow.cn/v1');
      setApiKey(initialModel.apiKey || '');
      setHotwords(toHotwordsValue(initialModel.hotwords));
      setEnablePunctuation(initialModel.enablePunctuation ?? true);
      setEnableNormalization(initialModel.enableNormalization ?? true);
      setEnabled(initialModel.enabled ?? true);
      return;
    }
    setName('');
    setVendor('OpenAI Compatible');
    setLanguage('zh');
    setModelName('FunAudioLLM/SenseVoiceSmall');
    setBaseUrl('https://api.siliconflow.cn/v1');
    setApiKey('');
    setHotwords('');
    setEnablePunctuation(true);
    setEnableNormalization(true);
    setEnabled(true);
  }, [initialModel, isOpen]);

  // Validate required fields, then delegate persistence to the caller.
  // Errors from onSubmit surface via alert; the dialog stays open on failure.
  const handleSubmit = async () => {
    if (!name.trim()) {
      alert('请填写模型名称');
      return;
    }
    if (!baseUrl.trim()) {
      alert('请填写 Base URL');
      return;
    }
    if (!apiKey.trim()) {
      alert('请填写 API Key');
      return;
    }
    try {
      setSaving(true);
      await onSubmit({
        name: name.trim(),
        vendor: vendor.trim(),
        language,
        modelName: modelName.trim(),
        baseUrl: baseUrl.trim(),
        apiKey: apiKey.trim(),
        hotwords: parseHotwords(hotwords),
        enablePunctuation,
        enableNormalization,
        enabled,
      });
    } catch (error: any) {
      alert(error?.message || '保存失败');
    } finally {
      setSaving(false);
    }
  };

  return (
    <Dialog
      isOpen={isOpen}
      onClose={onClose}
      title={initialModel ? '编辑语音识别模型' : '添加语音识别模型'}
      footer={
        <>
          {/* NOTE(review): cancel-button label is empty — original text appears stripped. */}
          <Button variant="ghost" onClick={onClose}></Button>
          <Button onClick={handleSubmit} disabled={saving}>{saving ? '保存中...' : (initialModel ? '保存修改' : '确认添加')}</Button>
        </>
      }
    >
      <div className="space-y-4 max-h-[75vh] overflow-y-auto px-1 custom-scrollbar">
        <div className="space-y-1.5">
          <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"></label>
          <Input value={name} onChange={(e) => setName(e.target.value)} placeholder="例如: SenseVoice CN" />
        </div>
        <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
          <div className="space-y-1.5">
            <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"></label>
            <Select
              value={vendor}
              onChange={(e) => setVendor(e.target.value)}
            >
              <option value="OpenAI Compatible">OpenAI Compatible</option>
            </Select>
          </div>
          <div className="space-y-1.5">
            <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block flex items-center"><Languages className="w-3 h-3 mr-1.5" /></label>
            <Select
              value={language}
              onChange={(e) => setLanguage(e.target.value)}
            >
              <option value="zh"> (Chinese)</option>
              <option value="en"> (English)</option>
              <option value="Multi-lingual"> (Multi-lingual)</option>
            </Select>
          </div>
        </div>
        <div className="space-y-1.5">
          <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block">Model Name</label>
          <Input value={modelName} onChange={(e) => setModelName(e.target.value)} placeholder="FunAudioLLM/SenseVoiceSmall" />
        </div>
        <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
          <div className="space-y-1.5">
            <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block flex items-center"><Server className="w-3 h-3 mr-1.5" />Base URL</label>
            <Input value={baseUrl} onChange={(e) => setBaseUrl(e.target.value)} placeholder="https://api.siliconflow.cn/v1" className="font-mono text-xs" />
          </div>
          <div className="space-y-1.5">
            <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block flex items-center"><Key className="w-3 h-3 mr-1.5" />API Key</label>
            <Input value={apiKey} onChange={(e) => setApiKey(e.target.value)} type="password" placeholder="sk-..." className="font-mono text-xs" />
          </div>
        </div>
        <div className="space-y-1.5">
          <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block"> (comma separated)</label>
          <Input value={hotwords} onChange={(e) => setHotwords(e.target.value)} placeholder="品牌名, 人名, 专有词" />
        </div>
        {/* NOTE(review): the three checkbox <span> labels are empty — original text appears stripped. */}
        <div className="grid grid-cols-1 md:grid-cols-3 gap-2">
          <label className="flex items-center space-x-2 text-xs text-muted-foreground">
            <input type="checkbox" checked={enablePunctuation} onChange={(e) => setEnablePunctuation(e.target.checked)} />
            <span></span>
          </label>
          <label className="flex items-center space-x-2 text-xs text-muted-foreground">
            <input type="checkbox" checked={enableNormalization} onChange={(e) => setEnableNormalization(e.target.checked)} />
            <span></span>
          </label>
          <label className="flex items-center space-x-2 text-xs text-muted-foreground">
            <input type="checkbox" checked={enabled} onChange={(e) => setEnabled(e.target.checked)} />
            <span></span>
          </label>
        </div>
      </div>
    </Dialog>
  );
};
// Preview dialog for an ASR model: accepts an audio file by drag-drop, file
// picker, or microphone recording (with a live input-level meter), then sends
// it to the backend preview endpoint and shows transcript/latency/confidence.
const ASRPreviewModal: React.FC<{
  isOpen: boolean;
  onClose: () => void;
  model: ASRModel | null;
}> = ({ isOpen, onClose, model }) => {
  const [selectedFile, setSelectedFile] = useState<File | null>(null);
  const [isDragging, setIsDragging] = useState(false);
  const [isTranscribing, setIsTranscribing] = useState(false);
  const [transcript, setTranscript] = useState('');
  const [latency, setLatency] = useState<number | null>(null);
  const [confidence, setConfidence] = useState<number | null>(null);
  const [language, setLanguage] = useState('');
  const [isRecording, setIsRecording] = useState(false);
  // True while the recorded blob is being decoded/re-encoded to WAV.
  const [isProcessingRecording, setIsProcessingRecording] = useState(false);
  // Smoothed RMS level [0, 1] driving the level-meter bars.
  const [inputLevel, setInputLevel] = useState(0);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const inputRef = useRef<HTMLInputElement>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  // Analyser + AudioContext exist only for the visual meter, separate from the recorder.
  const analyserRef = useRef<AnalyserNode | null>(null);
  const visualAudioContextRef = useRef<AudioContext | null>(null);
  const rafRef = useRef<number | null>(null);

  // Reset all preview state each time the dialog opens for a model.
  useEffect(() => {
    if (!isOpen) return;
    setSelectedFile(null);
    setTranscript('');
    setLatency(null);
    setConfidence(null);
    setLanguage(model?.language || '');
    setIsTranscribing(false);
    setIsRecording(false);
    setIsProcessingRecording(false);
    setInputLevel(0);
    setIsSpeaking(false);
  }, [isOpen, model]);

  // Tear down the level-meter loop and its AudioContext.
  // Nulling analyserRef first makes the rAF tick exit on its next frame.
  const stopVisualization = () => {
    if (rafRef.current) {
      cancelAnimationFrame(rafRef.current);
      rafRef.current = null;
    }
    analyserRef.current = null;
    if (visualAudioContextRef.current) {
      visualAudioContextRef.current.close().catch(() => undefined);
      visualAudioContextRef.current = null;
    }
    setInputLevel(0);
    setIsSpeaking(false);
  };

  // Release the microphone by stopping every track on the captured stream.
  const stopCurrentStream = () => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => track.stop());
      streamRef.current = null;
    }
  };

  // Cleanup on unmount so the mic and AudioContext are never leaked.
  useEffect(() => {
    return () => {
      stopVisualization();
      stopCurrentStream();
    };
  }, []);

  // Also clean up whenever the dialog closes mid-recording.
  useEffect(() => {
    if (!isOpen) {
      stopVisualization();
      stopCurrentStream();
    }
  }, [isOpen]);

  // Accept a file only if the browser reports an audio/* MIME type.
  const pickFile = (file: File | null) => {
    if (!file) return;
    if (!file.type.startsWith('audio/')) {
      alert('仅支持音频文件');
      return;
    }
    setSelectedFile(file);
  };

  const handleDrop = (event: React.DragEvent<HTMLDivElement>) => {
    event.preventDefault();
    setIsDragging(false);
    const file = event.dataTransfer.files?.[0] || null;
    pickFile(file);
  };

  // Start microphone capture: set up a MediaRecorder (preferring webm/opus),
  // plus a parallel AnalyserNode pipeline that drives the level meter.
  const startRecording = async () => {
    if (!navigator.mediaDevices?.getUserMedia) {
      alert('当前浏览器不支持麦克风录音');
      return;
    }
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      // Pick the best supported container; empty string lets the browser choose.
      const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
        ? 'audio/webm;codecs=opus'
        : (MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : '');
      const recorder = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
      chunksRef.current = [];
      streamRef.current = stream;
      mediaRecorderRef.current = recorder;
      const visualizationContext = createAudioContext();
      const source = visualizationContext.createMediaStreamSource(stream);
      const analyser = visualizationContext.createAnalyser();
      analyser.fftSize = 1024;
      source.connect(analyser);
      analyserRef.current = analyser;
      visualAudioContextRef.current = visualizationContext;
      const timeData = new Uint8Array(analyser.frequencyBinCount);
      // rAF loop: compute RMS of the time-domain signal and map it to a 0..1 level.
      const tick = () => {
        if (!analyserRef.current) return;
        analyserRef.current.getByteTimeDomainData(timeData);
        let sumSquares = 0;
        for (let i = 0; i < timeData.length; i += 1) {
          const normalized = (timeData[i] - 128) / 128;
          sumSquares += normalized * normalized;
        }
        const rms = Math.sqrt(sumSquares / timeData.length);
        // The *4 gain and 0.08 speech threshold are empirically tuned for the meter UI.
        const level = Math.min(1, rms * 4);
        setInputLevel(level);
        setIsSpeaking(level > 0.08);
        rafRef.current = requestAnimationFrame(tick);
      };
      tick();
      recorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          chunksRef.current.push(event.data);
        }
      };
      // On stop: assemble chunks, try WAV conversion, and fall back to the raw
      // container if decoding fails. Always release the meter and the mic.
      recorder.onstop = async () => {
        const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' });
        setIsProcessingRecording(true);
        try {
          let outputFile: File;
          try {
            outputFile = await convertRecordedBlobToWav(blob);
          } catch {
            outputFile = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' });
          }
          setSelectedFile(outputFile);
        } finally {
          setIsProcessingRecording(false);
          stopVisualization();
          stopCurrentStream();
        }
      };
      recorder.start(250); // emit dataavailable every 250ms
      setIsRecording(true);
    } catch (error: any) {
      alert(error?.message || '无法访问麦克风');
    }
  };

  // Stop the recorder; the onstop handler above finishes the file conversion.
  const stopRecording = () => {
    if (!mediaRecorderRef.current) return;
    mediaRecorderRef.current.stop();
    setIsRecording(false);
  };

  // Send the selected/recorded audio to the backend preview endpoint and
  // populate transcript, latency, and confidence from the response.
  const runPreview = async () => {
    if (!model?.id) return;
    if (!selectedFile) {
      alert('请先上传或录制音频');
      return;
    }
    try {
      setIsTranscribing(true);
      const result = await previewASRModel(model.id, selectedFile, { language: language || undefined });
      setTranscript(result.transcript || result.message || '无识别内容');
      setLatency(result.latency_ms ?? null);
      setConfidence(result.confidence ?? null);
    } catch (error: any) {
      alert(error?.message || '识别失败');
    } finally {
      setIsTranscribing(false);
    }
  };

  return (
    <Dialog
      isOpen={isOpen}
      onClose={onClose}
      title={`试听识别: ${model?.name || ''}`}
      footer={
        <>
          {/* NOTE(review): close-button label is empty — original text appears stripped. */}
          <Button variant="ghost" onClick={onClose}></Button>
          <Button onClick={runPreview} disabled={isTranscribing || !selectedFile || isProcessingRecording}>
            {isTranscribing ? '识别中...' : '开始识别'}
          </Button>
        </>
      }
    >
      <div className="space-y-4">
        {/* Drag-and-drop / file-picker zone. */}
        <div
          className={`rounded-lg border-2 border-dashed p-4 transition-colors ${isDragging ? 'border-primary bg-primary/10' : 'border-white/10 bg-white/5'}`}
          onDragOver={(e) => {
            e.preventDefault();
            setIsDragging(true);
          }}
          onDragLeave={() => setIsDragging(false)}
          onDrop={handleDrop}
        >
          <input
            ref={inputRef}
            type="file"
            accept="audio/*"
            className="hidden"
            onChange={(e) => pickFile(e.target.files?.[0] || null)}
          />
          <div className="flex flex-col items-center justify-center gap-2 text-sm text-muted-foreground">
            <Upload className="h-6 w-6 text-primary" />
            <p></p>
            <Button variant="outline" size="sm" onClick={() => inputRef.current?.click()}></Button>
            {selectedFile && <p className="text-primary text-xs">: {selectedFile.name}</p>}
            {isProcessingRecording && <p className="text-yellow-400 text-xs">...</p>}
          </div>
        </div>
        {/* Microphone recording panel with a 20-bar input level meter. */}
        <div className="rounded-lg border border-white/10 bg-white/5 p-3 space-y-3">
          <div className="flex items-center justify-between">
            <div className="text-sm text-muted-foreground"></div>
            <div className={`text-xs font-semibold ${isSpeaking ? 'text-green-400' : 'text-muted-foreground'}`}>
              {isRecording ? (isSpeaking ? '正在说话' : '等待语音') : '未录音'}
            </div>
          </div>
          <div className="h-10 rounded-md bg-black/30 border border-white/10 px-2 flex items-end gap-1">
            {Array.from({ length: 20 }).map((_, index) => {
              const threshold = (index + 1) / 20;
              const active = inputLevel >= threshold;
              const height = 6 + ((index % 5) * 6);
              return (
                <div
                  key={`meter-${index}`}
                  className={`w-1 rounded-sm transition-all ${active ? (isSpeaking ? 'bg-green-400' : 'bg-primary') : 'bg-white/10'}`}
                  style={{ height }}
                />
              );
            })}
          </div>
          {!isRecording ? (
            <Button size="sm" variant="outline" onClick={startRecording}><Mic className="h-4 w-4 mr-1" /></Button>
          ) : (
            <Button size="sm" variant="destructive" onClick={stopRecording}><Square className="h-4 w-4 mr-1" /></Button>
          )}
        </div>
        {/* Optional language hint forwarded to the preview request. */}
        <div className="space-y-1.5">
          <label className="text-[10px] font-black text-muted-foreground uppercase tracking-widest block flex items-center">
            <Globe className="w-3 h-3 mr-1.5" /> (Optional)
          </label>
          <Input value={language} onChange={(e) => setLanguage(e.target.value)} placeholder="zh / en / auto" />
        </div>
        {/* Result panel: latency/confidence summary plus read-only transcript. */}
        <div className="rounded-lg border border-primary/20 bg-primary/5 p-3 space-y-2">
          <div className="flex items-center justify-between text-xs text-primary">
            <span></span>
            <span>
              {latency !== null ? `Latency: ${latency}ms` : ''}
              {confidence !== null ? ` Confidence: ${confidence.toFixed(3)}` : ''}
            </span>
          </div>
          <textarea
            readOnly
            value={transcript}
            className="flex min-h-[120px] w-full rounded-md border-0 bg-black/20 px-3 py-2 text-sm shadow-sm text-white"
            placeholder="识别结果会显示在这里"
          />
        </div>
      </div>
    </Dialog>
  );
};