diff --git a/web/pages/ASRLibrary.tsx b/web/pages/ASRLibrary.tsx index 9611921..fe3b113 100644 --- a/web/pages/ASRLibrary.tsx +++ b/web/pages/ASRLibrary.tsx @@ -19,6 +19,69 @@ const parseHotwords = (value: string): string[] => { const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', '); +/** Instantiates the standard AudioContext constructor, or the webkit-prefixed one when the standard name is absent on window. NOTE(review): nothing guards the case where neither constructor exists. */ const createAudioContext = (): AudioContext => { + const Ctx = (window as any).AudioContext || (window as any).webkitAudioContext; + return new Ctx(); +}; + +/** Serializes an AudioBuffer as a 16-bit PCM WAV Blob: a 44-byte RIFF header followed by little-endian samples interleaved across channels. */ const encodeWav = (audioBuffer: AudioBuffer): Blob => { + const numberOfChannels = audioBuffer.numberOfChannels; + const sampleRate = audioBuffer.sampleRate; + const format = 1; + const bitDepth = 16; + const channelData = Array.from({ length: numberOfChannels }, (_, ch) => audioBuffer.getChannelData(ch)); + const sampleCount = audioBuffer.length; + const blockAlign = numberOfChannels * (bitDepth / 8); + const byteRate = sampleRate * blockAlign; + const dataSize = sampleCount * blockAlign; + const buffer = new ArrayBuffer(44 + dataSize); + const view = new DataView(buffer); + + const writeString = (offset: number, value: string) => { + for (let i = 0; i < value.length; i += 1) { + view.setUint8(offset + i, value.charCodeAt(i)); + } + }; + + /* RIFF descriptor, fmt chunk (format 1, 16 bits per sample), then the data chunk header. */ writeString(0, 'RIFF'); + view.setUint32(4, 36 + dataSize, true); + writeString(8, 'WAVE'); + writeString(12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, format, true); + view.setUint16(22, numberOfChannels, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, bitDepth, true); + writeString(36, 'data'); + view.setUint32(40, dataSize, true); + + /* Sample data starts right after the 44-byte header; each sample is clamped to [-1, 1] and scaled to the signed 16-bit range. */ let offset = 44; + for (let i = 0; i < sampleCount; i += 1) { + for (let ch = 0; ch < numberOfChannels; ch += 1) { + const sample = Math.max(-1, Math.min(1, channelData[ch][i])); + const pcm = sample < 0 ? 
sample * 0x8000 : sample * 0x7fff; + view.setInt16(offset, pcm, true); + offset += 2; + } + } + + return new Blob([buffer], { type: 'audio/wav' }); +}; + +/** Decodes a recorded Blob with a temporary audio context and re-encodes it through encodeWav into a File named mic-preview-{timestamp}.wav; the context is closed in the finally clause on both success and failure. NOTE(review): the return annotation reads as a bare Promise here, presumably a Promise of File - confirm against the real file. */ const convertRecordedBlobToWav = async (blob: Blob): Promise => { + const audioContext = createAudioContext(); + try { + const inputArrayBuffer = await blob.arrayBuffer(); + const decoded = await audioContext.decodeAudioData(inputArrayBuffer.slice(0)); + const wavBlob = encodeWav(decoded); + return new File([wavBlob], `mic-preview-${Date.now()}.wav`, { type: 'audio/wav' }); + } finally { + await audioContext.close(); + } +}; + export const ASRLibraryPage: React.FC = () => { const [models, setModels] = useState([]); const [searchTerm, setSearchTerm] = useState(''); @@ -378,11 +441,17 @@ const ASRPreviewModal: React.FC<{ const [confidence, setConfidence] = useState(null); const [language, setLanguage] = useState(''); const [isRecording, setIsRecording] = useState(false); + const [isProcessingRecording, setIsProcessingRecording] = useState(false); + const [inputLevel, setInputLevel] = useState(0); + const [isSpeaking, setIsSpeaking] = useState(false); const inputRef = useRef(null); const mediaRecorderRef = useRef(null); const streamRef = useRef(null); const chunksRef = useRef([]); + const analyserRef = useRef(null); + const visualAudioContextRef = useRef(null); + const rafRef = useRef(null); useEffect(() => { if (!isOpen) return; @@ -393,16 +462,46 @@ const ASRPreviewModal: React.FC<{ setLanguage(model?.language || ''); setIsTranscribing(false); setIsRecording(false); + setIsProcessingRecording(false); + setInputLevel(0); + setIsSpeaking(false); }, [isOpen, model]); + /** Cancels the scheduled animation frame, drops the analyser reference, closes the visualization audio context (close rejections are ignored), and resets the level and speaking state. */ const stopVisualization = () => { + if (rafRef.current) { + cancelAnimationFrame(rafRef.current); + rafRef.current = null; + } + analyserRef.current = null; + if (visualAudioContextRef.current) { + visualAudioContextRef.current.close().catch(() => undefined); + visualAudioContextRef.current = null; + } + setInputLevel(0); + setIsSpeaking(false); + }; + 
+ /** Stops every track on the stream held in streamRef and clears the ref. */ const stopCurrentStream = () => { + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + }; + useEffect(() => { return () => { - if (streamRef.current) { - streamRef.current.getTracks().forEach((track) => track.stop()); - } + stopVisualization(); + stopCurrentStream(); }; }, []); + /* When isOpen is false, release the visualization resources and the media stream. */ useEffect(() => { + if (!isOpen) { + stopVisualization(); + stopCurrentStream(); + } + }, [isOpen]); + const pickFile = (file: File | null) => { if (!file) return; if (!file.type.startsWith('audio/')) { @@ -427,29 +526,65 @@ const ASRPreviewModal: React.FC<{ try { const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - const recorder = new MediaRecorder(stream); + /* Prefer opus-in-webm, then plain webm; with an empty mimeType the recorder is constructed without options. */ const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus') + ? 'audio/webm;codecs=opus' + : (MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : ''); + const recorder = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); chunksRef.current = []; streamRef.current = stream; mediaRecorderRef.current = recorder; + /* Route the captured stream through an analyser node so the input level can be sampled. */ const visualizationContext = createAudioContext(); + const source = visualizationContext.createMediaStreamSource(stream); + const analyser = visualizationContext.createAnalyser(); + analyser.fftSize = 1024; + source.connect(analyser); + analyserRef.current = analyser; + visualAudioContextRef.current = visualizationContext; + + const timeData = new Uint8Array(analyser.frequencyBinCount); + /* Per-frame meter: RMS of the byte samples centred on 128, scaled by 4 and capped at 1; the speaking flag is level > 0.08. The loop ends once analyserRef has been cleared. */ const tick = () => { + if (!analyserRef.current) return; + analyserRef.current.getByteTimeDomainData(timeData); + let sumSquares = 0; + for (let i = 0; i < timeData.length; i += 1) { + const normalized = (timeData[i] - 128) / 128; + sumSquares += normalized * normalized; + } + const rms = Math.sqrt(sumSquares / timeData.length); + const level = Math.min(1, rms * 4); + setInputLevel(level); + setIsSpeaking(level > 0.08); + rafRef.current = requestAnimationFrame(tick); + }; + tick(); + 
recorder.ondataavailable = (event) => { if (event.data.size > 0) { chunksRef.current.push(event.data); } }; - recorder.onstop = () => { + /* On stop: try to convert the recording to WAV, fall back to the raw blob as a .webm file if conversion throws, then always clear the processing flag and release the visualization and stream. */ recorder.onstop = async () => { const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' }); - const file = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' }); - setSelectedFile(file); - if (streamRef.current) { - streamRef.current.getTracks().forEach((track) => track.stop()); - streamRef.current = null; + setIsProcessingRecording(true); + try { + let outputFile: File; + try { + outputFile = await convertRecordedBlobToWav(blob); + } catch { + outputFile = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' }); + } + setSelectedFile(outputFile); + } finally { + setIsProcessingRecording(false); + stopVisualization(); + stopCurrentStream(); } }; - recorder.start(); + /* Start with 250 passed as the timeslice argument (milliseconds per the MediaRecorder API). */ recorder.start(250); setIsRecording(true); } catch (error: any) { alert(error?.message || '无法访问麦克风'); @@ -490,7 +625,7 @@ const ASRPreviewModal: React.FC<{ footer={ <> - @@ -518,11 +653,31 @@ const ASRPreviewModal: React.FC<{

拖拽音频文件到这里,或

{selectedFile &&

已选择: {selectedFile.name}

} + {isProcessingRecording &&

正在处理录音格式...

} -
-
麦克风测试
+
+
+
麦克风测试
+
+ {isRecording ? (isSpeaking ? '正在说话' : '等待语音') : '未录音'} +
+
+
+ {Array.from({ length: 20 }).map((_, index) => { + const threshold = (index + 1) / 20; + const active = inputLevel >= threshold; + const height = 6 + ((index % 5) * 6); + return ( +
+ ); + })} +
{!isRecording ? ( ) : (