Update ASR preview using microphone

This commit is contained in:
Xin Wang
2026-02-08 23:48:03 +08:00
parent 4bf2f788ad
commit 6462c4f432

View File

@@ -19,6 +19,69 @@ const parseHotwords = (value: string): string[] => {
const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', '); const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', ');
// Create an AudioContext, falling back to the legacy webkit-prefixed
// constructor for older Safari. Throws a descriptive error when the Web
// Audio API is unavailable, instead of the opaque "Ctx is not a
// constructor" TypeError callers would otherwise surface to the user.
const createAudioContext = (): AudioContext => {
  const Ctx = (window as any).AudioContext || (window as any).webkitAudioContext;
  if (!Ctx) {
    throw new Error('Web Audio API is not supported in this browser');
  }
  return new Ctx();
};
// Serialize an AudioBuffer as a 16-bit little-endian PCM WAV blob.
// Layout: 44-byte RIFF/"fmt "/"data" header followed by interleaved
// int16 samples (frame by frame, channel by channel).
const encodeWav = (audioBuffer: AudioBuffer): Blob => {
  const channels = audioBuffer.numberOfChannels;
  const rate = audioBuffer.sampleRate;
  const frames = audioBuffer.length;
  const bytesPerFrame = channels * 2; // 16-bit samples -> 2 bytes each
  const dataBytes = frames * bytesPerFrame;
  const out = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(out);

  const putAscii = (at: number, text: string) => {
    for (let i = 0; i < text.length; i += 1) {
      view.setUint8(at + i, text.charCodeAt(i));
    }
  };

  // RIFF container header.
  putAscii(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true);
  putAscii(8, 'WAVE');
  // "fmt " sub-chunk: PCM (format code 1), channel count, rates, alignment.
  putAscii(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true);
  view.setUint16(22, channels, true);
  view.setUint32(24, rate, true);
  view.setUint32(28, rate * bytesPerFrame, true);
  view.setUint16(32, bytesPerFrame, true);
  view.setUint16(34, 16, true);
  // "data" sub-chunk header.
  putAscii(36, 'data');
  view.setUint32(40, dataBytes, true);

  // Interleave channels, clamping each float sample to [-1, 1] and
  // scaling to the asymmetric int16 range (-0x8000 .. 0x7fff).
  const perChannel: Float32Array[] = [];
  for (let ch = 0; ch < channels; ch += 1) {
    perChannel.push(audioBuffer.getChannelData(ch));
  }
  let cursor = 44;
  for (let frame = 0; frame < frames; frame += 1) {
    for (let ch = 0; ch < channels; ch += 1) {
      const clamped = Math.min(1, Math.max(-1, perChannel[ch][frame]));
      view.setInt16(cursor, clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff, true);
      cursor += 2;
    }
  }
  return new Blob([out], { type: 'audio/wav' });
};
const convertRecordedBlobToWav = async (blob: Blob): Promise<File> => {
const audioContext = createAudioContext();
try {
const inputArrayBuffer = await blob.arrayBuffer();
const decoded = await audioContext.decodeAudioData(inputArrayBuffer.slice(0));
const wavBlob = encodeWav(decoded);
return new File([wavBlob], `mic-preview-${Date.now()}.wav`, { type: 'audio/wav' });
} finally {
await audioContext.close();
}
};
export const ASRLibraryPage: React.FC = () => { export const ASRLibraryPage: React.FC = () => {
const [models, setModels] = useState<ASRModel[]>([]); const [models, setModels] = useState<ASRModel[]>([]);
const [searchTerm, setSearchTerm] = useState(''); const [searchTerm, setSearchTerm] = useState('');
@@ -378,11 +441,17 @@ const ASRPreviewModal: React.FC<{
const [confidence, setConfidence] = useState<number | null>(null); const [confidence, setConfidence] = useState<number | null>(null);
const [language, setLanguage] = useState(''); const [language, setLanguage] = useState('');
const [isRecording, setIsRecording] = useState(false); const [isRecording, setIsRecording] = useState(false);
const [isProcessingRecording, setIsProcessingRecording] = useState(false);
const [inputLevel, setInputLevel] = useState(0);
const [isSpeaking, setIsSpeaking] = useState(false);
const inputRef = useRef<HTMLInputElement>(null); const inputRef = useRef<HTMLInputElement>(null);
const mediaRecorderRef = useRef<MediaRecorder | null>(null); const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const chunksRef = useRef<Blob[]>([]); const chunksRef = useRef<Blob[]>([]);
const analyserRef = useRef<AnalyserNode | null>(null);
const visualAudioContextRef = useRef<AudioContext | null>(null);
const rafRef = useRef<number | null>(null);
useEffect(() => { useEffect(() => {
if (!isOpen) return; if (!isOpen) return;
@@ -393,16 +462,46 @@ const ASRPreviewModal: React.FC<{
setLanguage(model?.language || ''); setLanguage(model?.language || '');
setIsTranscribing(false); setIsTranscribing(false);
setIsRecording(false); setIsRecording(false);
setIsProcessingRecording(false);
setInputLevel(0);
setIsSpeaking(false);
}, [isOpen, model]); }, [isOpen, model]);
// Tear down the input-level visualization: cancel the rAF loop, drop the
// analyser, close the visualization AudioContext, and reset meter state.
const stopVisualization = () => {
  const frame = rafRef.current;
  if (frame) {
    cancelAnimationFrame(frame);
    rafRef.current = null;
  }
  analyserRef.current = null;
  const ctx = visualAudioContextRef.current;
  if (ctx) {
    // close() rejects if the context is already closed; that is harmless.
    ctx.close().catch(() => undefined);
    visualAudioContextRef.current = null;
  }
  setInputLevel(0);
  setIsSpeaking(false);
};
const stopCurrentStream = () => {
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop());
streamRef.current = null;
}
};
useEffect(() => { useEffect(() => {
return () => { return () => {
if (streamRef.current) { stopVisualization();
streamRef.current.getTracks().forEach((track) => track.stop()); stopCurrentStream();
}
}; };
}, []); }, []);
useEffect(() => {
if (!isOpen) {
stopVisualization();
stopCurrentStream();
}
}, [isOpen]);
const pickFile = (file: File | null) => { const pickFile = (file: File | null) => {
if (!file) return; if (!file) return;
if (!file.type.startsWith('audio/')) { if (!file.type.startsWith('audio/')) {
@@ -427,29 +526,65 @@ const ASRPreviewModal: React.FC<{
try { try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const recorder = new MediaRecorder(stream); const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: (MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : '');
const recorder = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
chunksRef.current = []; chunksRef.current = [];
streamRef.current = stream; streamRef.current = stream;
mediaRecorderRef.current = recorder; mediaRecorderRef.current = recorder;
const visualizationContext = createAudioContext();
const source = visualizationContext.createMediaStreamSource(stream);
const analyser = visualizationContext.createAnalyser();
analyser.fftSize = 1024;
source.connect(analyser);
analyserRef.current = analyser;
visualAudioContextRef.current = visualizationContext;
const timeData = new Uint8Array(analyser.frequencyBinCount);
const tick = () => {
if (!analyserRef.current) return;
analyserRef.current.getByteTimeDomainData(timeData);
let sumSquares = 0;
for (let i = 0; i < timeData.length; i += 1) {
const normalized = (timeData[i] - 128) / 128;
sumSquares += normalized * normalized;
}
const rms = Math.sqrt(sumSquares / timeData.length);
const level = Math.min(1, rms * 4);
setInputLevel(level);
setIsSpeaking(level > 0.08);
rafRef.current = requestAnimationFrame(tick);
};
tick();
recorder.ondataavailable = (event) => { recorder.ondataavailable = (event) => {
if (event.data.size > 0) { if (event.data.size > 0) {
chunksRef.current.push(event.data); chunksRef.current.push(event.data);
} }
}; };
recorder.onstop = () => { recorder.onstop = async () => {
const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' }); const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' });
const file = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' }); setIsProcessingRecording(true);
setSelectedFile(file); try {
if (streamRef.current) { let outputFile: File;
streamRef.current.getTracks().forEach((track) => track.stop()); try {
streamRef.current = null; outputFile = await convertRecordedBlobToWav(blob);
} catch {
outputFile = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' });
}
setSelectedFile(outputFile);
} finally {
setIsProcessingRecording(false);
stopVisualization();
stopCurrentStream();
} }
}; };
recorder.start(); recorder.start(250);
setIsRecording(true); setIsRecording(true);
} catch (error: any) { } catch (error: any) {
alert(error?.message || '无法访问麦克风'); alert(error?.message || '无法访问麦克风');
@@ -490,7 +625,7 @@ const ASRPreviewModal: React.FC<{
footer={ footer={
<> <>
<Button variant="ghost" onClick={onClose}></Button> <Button variant="ghost" onClick={onClose}></Button>
<Button onClick={runPreview} disabled={isTranscribing || !selectedFile}> <Button onClick={runPreview} disabled={isTranscribing || !selectedFile || isProcessingRecording}>
{isTranscribing ? '识别中...' : '开始识别'} {isTranscribing ? '识别中...' : '开始识别'}
</Button> </Button>
</> </>
@@ -518,11 +653,31 @@ const ASRPreviewModal: React.FC<{
<p></p> <p></p>
<Button variant="outline" size="sm" onClick={() => inputRef.current?.click()}></Button> <Button variant="outline" size="sm" onClick={() => inputRef.current?.click()}></Button>
{selectedFile && <p className="text-primary text-xs">: {selectedFile.name}</p>} {selectedFile && <p className="text-primary text-xs">: {selectedFile.name}</p>}
{isProcessingRecording && <p className="text-yellow-400 text-xs">...</p>}
</div> </div>
</div> </div>
<div className="flex items-center justify-between rounded-lg border border-white/10 bg-white/5 p-3"> <div className="rounded-lg border border-white/10 bg-white/5 p-3 space-y-3">
<div className="text-sm text-muted-foreground"></div> <div className="flex items-center justify-between">
<div className="text-sm text-muted-foreground"></div>
<div className={`text-xs font-semibold ${isSpeaking ? 'text-green-400' : 'text-muted-foreground'}`}>
{isRecording ? (isSpeaking ? '正在说话' : '等待语音') : '未录音'}
</div>
</div>
<div className="h-10 rounded-md bg-black/30 border border-white/10 px-2 flex items-end gap-1">
{Array.from({ length: 20 }).map((_, index) => {
const threshold = (index + 1) / 20;
const active = inputLevel >= threshold;
const height = 6 + ((index % 5) * 6);
return (
<div
key={`meter-${index}`}
className={`w-1 rounded-sm transition-all ${active ? (isSpeaking ? 'bg-green-400' : 'bg-primary') : 'bg-white/10'}`}
style={{ height }}
/>
);
})}
</div>
{!isRecording ? ( {!isRecording ? (
<Button size="sm" variant="outline" onClick={startRecording}><Mic className="h-4 w-4 mr-1" /></Button> <Button size="sm" variant="outline" onClick={startRecording}><Mic className="h-4 w-4 mr-1" /></Button>
) : ( ) : (