Update asr preview using microphone
This commit is contained in:
@@ -19,6 +19,69 @@ const parseHotwords = (value: string): string[] => {
|
|||||||
|
|
||||||
const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', ');
|
const toHotwordsValue = (hotwords?: string[]): string => (hotwords || []).join(', ');
|
||||||
|
|
||||||
|
/**
 * Creates a new Web Audio context.
 *
 * Looks up the standard `AudioContext` global first and falls back to the
 * `webkitAudioContext` global when the standard one is missing.
 *
 * @returns A freshly constructed audio context. The caller is responsible
 *   for closing it.
 * @throws Error when neither constructor is present on `window`, so callers
 *   get a descriptive message instead of "Ctx is not a constructor".
 */
const createAudioContext = (): AudioContext => {
  // Typed view of the two globals being probed. Both are optional here so the
  // missing-constructor case is visible to the type checker.
  const host: {
    AudioContext?: typeof AudioContext;
    webkitAudioContext?: typeof AudioContext;
  } = window;

  const Ctx = host.AudioContext || host.webkitAudioContext;
  if (!Ctx) {
    throw new Error('Web Audio API is not available in this browser');
  }
  return new Ctx();
};
|
||||||
|
|
||||||
|
/**
 * Serializes a decoded audio buffer into a 16-bit PCM WAV blob.
 *
 * The output is a 44-byte RIFF/WAVE header followed by little-endian signed
 * 16-bit samples, interleaved frame by frame across all channels. The
 * buffer's own sample rate and channel count are written as-is; no
 * resampling or down-mixing is done.
 *
 * @param audioBuffer Decoded audio to serialize.
 * @returns A blob of type `audio/wav` holding the header and sample data.
 */
const encodeWav = (audioBuffer: AudioBuffer): Blob => {
  const HEADER_BYTES = 44;
  const PCM_FORMAT_TAG = 1;
  const BITS_PER_SAMPLE = 16;
  const BYTES_PER_SAMPLE = BITS_PER_SAMPLE / 8;

  const { numberOfChannels, sampleRate } = audioBuffer;
  const channels: Float32Array[] = [];
  for (let c = 0; c < numberOfChannels; c += 1) {
    channels.push(audioBuffer.getChannelData(c));
  }
  const frameCount = audioBuffer.length;

  const frameBytes = numberOfChannels * BYTES_PER_SAMPLE;
  const payloadBytes = frameCount * frameBytes;
  const bytes = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const out = new DataView(bytes);

  // Every field is written sequentially, so one running cursor replaces the
  // hard-coded header offsets.
  let cursor = 0;
  const putAscii = (text: string): void => {
    text.split('').forEach((char) => {
      out.setUint8(cursor, char.charCodeAt(0));
      cursor += 1;
    });
  };
  const putUint32 = (value: number): void => {
    out.setUint32(cursor, value, true);
    cursor += 4;
  };
  const putUint16 = (value: number): void => {
    out.setUint16(cursor, value, true);
    cursor += 2;
  };

  // RIFF container header.
  putAscii('RIFF');
  putUint32(36 + payloadBytes);
  putAscii('WAVE');

  // "fmt " sub-chunk: 16-byte body.
  putAscii('fmt ');
  putUint32(16);
  putUint16(PCM_FORMAT_TAG);
  putUint16(numberOfChannels);
  putUint32(sampleRate);
  putUint32(sampleRate * frameBytes); // byte rate
  putUint16(frameBytes); // block align
  putUint16(BITS_PER_SAMPLE);

  // "data" sub-chunk header; samples follow at byte 44.
  putAscii('data');
  putUint32(payloadBytes);

  for (let frame = 0; frame < frameCount; frame += 1) {
    for (const channel of channels) {
      // Clamp to [-1, 1], then scale asymmetrically so -1 maps to -32768 and
      // +1 maps to 32767.
      const clamped = Math.max(-1, Math.min(1, channel[frame]));
      const scale = clamped < 0 ? 0x8000 : 0x7fff;
      out.setInt16(cursor, clamped * scale, true);
      cursor += 2;
    }
  }

  return new Blob([bytes], { type: 'audio/wav' });
};
|
||||||
|
|
||||||
|
/**
 * Decodes a recorded audio blob and re-encodes it as a 16-bit PCM WAV file.
 *
 * @param blob Recorded audio in any container/codec the audio context can
 *   decode.
 * @returns A `File` of type `audio/wav` named `mic-preview-<timestamp>.wav`.
 * @throws Whatever `createAudioContext`, `Blob.arrayBuffer` or
 *   `decodeAudioData` throws or rejects with (e.g. an undecodable
 *   recording). Errors from closing the temporary context are not propagated.
 */
const convertRecordedBlobToWav = async (blob: Blob): Promise<File> => {
  const audioContext = createAudioContext();
  try {
    // `Blob.arrayBuffer()` resolves with a fresh buffer that is not used
    // again, so it can be handed to `decodeAudioData` without a copy.
    const encodedAudio = await blob.arrayBuffer();
    const decoded = await audioContext.decodeAudioData(encodedAudio);
    const wavBlob = encodeWav(decoded);
    return new File([wavBlob], `mic-preview-${Date.now()}.wav`, { type: 'audio/wav' });
  } finally {
    // Closing is best-effort cleanup: a rejected close() must not discard a
    // successful conversion or replace the original decode error.
    await audioContext.close().catch(() => undefined);
  }
};
|
||||||
|
|
||||||
export const ASRLibraryPage: React.FC = () => {
|
export const ASRLibraryPage: React.FC = () => {
|
||||||
const [models, setModels] = useState<ASRModel[]>([]);
|
const [models, setModels] = useState<ASRModel[]>([]);
|
||||||
const [searchTerm, setSearchTerm] = useState('');
|
const [searchTerm, setSearchTerm] = useState('');
|
||||||
@@ -378,11 +441,17 @@ const ASRPreviewModal: React.FC<{
|
|||||||
const [confidence, setConfidence] = useState<number | null>(null);
|
const [confidence, setConfidence] = useState<number | null>(null);
|
||||||
const [language, setLanguage] = useState('');
|
const [language, setLanguage] = useState('');
|
||||||
const [isRecording, setIsRecording] = useState(false);
|
const [isRecording, setIsRecording] = useState(false);
|
||||||
|
const [isProcessingRecording, setIsProcessingRecording] = useState(false);
|
||||||
|
const [inputLevel, setInputLevel] = useState(0);
|
||||||
|
const [isSpeaking, setIsSpeaking] = useState(false);
|
||||||
|
|
||||||
const inputRef = useRef<HTMLInputElement>(null);
|
const inputRef = useRef<HTMLInputElement>(null);
|
||||||
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
||||||
const streamRef = useRef<MediaStream | null>(null);
|
const streamRef = useRef<MediaStream | null>(null);
|
||||||
const chunksRef = useRef<Blob[]>([]);
|
const chunksRef = useRef<Blob[]>([]);
|
||||||
|
const analyserRef = useRef<AnalyserNode | null>(null);
|
||||||
|
const visualAudioContextRef = useRef<AudioContext | null>(null);
|
||||||
|
const rafRef = useRef<number | null>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!isOpen) return;
|
if (!isOpen) return;
|
||||||
@@ -393,16 +462,46 @@ const ASRPreviewModal: React.FC<{
|
|||||||
setLanguage(model?.language || '');
|
setLanguage(model?.language || '');
|
||||||
setIsTranscribing(false);
|
setIsTranscribing(false);
|
||||||
setIsRecording(false);
|
setIsRecording(false);
|
||||||
|
setIsProcessingRecording(false);
|
||||||
|
setInputLevel(0);
|
||||||
|
setIsSpeaking(false);
|
||||||
}, [isOpen, model]);
|
}, [isOpen, model]);
|
||||||
|
|
||||||
|
/**
 * Tears down the microphone level meter: cancels the pending animation
 * frame, drops the analyser reference, closes the visualization audio
 * context (ignoring close failures), and resets the level and speaking
 * indicators.
 */
const stopVisualization = () => {
  const pendingFrame = rafRef.current;
  if (pendingFrame) {
    cancelAnimationFrame(pendingFrame);
    rafRef.current = null;
  }

  analyserRef.current = null;

  const meterContext = visualAudioContextRef.current;
  if (meterContext) {
    // Best-effort close; a rejection here is deliberately ignored.
    meterContext.close().catch(() => undefined);
    visualAudioContextRef.current = null;
  }

  setInputLevel(0);
  setIsSpeaking(false);
};
|
||||||
|
|
||||||
|
/**
 * Stops every track of the stream currently held in `streamRef` and clears
 * the reference. Does nothing when no stream is held.
 */
const stopCurrentStream = () => {
  const activeStream = streamRef.current;
  if (!activeStream) {
    return;
  }
  for (const track of activeStream.getTracks()) {
    track.stop();
  }
  streamRef.current = null;
};
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
if (streamRef.current) {
|
stopVisualization();
|
||||||
streamRef.current.getTracks().forEach((track) => track.stop());
|
stopCurrentStream();
|
||||||
}
|
|
||||||
};
|
};
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (!isOpen) {
|
||||||
|
stopVisualization();
|
||||||
|
stopCurrentStream();
|
||||||
|
}
|
||||||
|
}, [isOpen]);
|
||||||
|
|
||||||
const pickFile = (file: File | null) => {
|
const pickFile = (file: File | null) => {
|
||||||
if (!file) return;
|
if (!file) return;
|
||||||
if (!file.type.startsWith('audio/')) {
|
if (!file.type.startsWith('audio/')) {
|
||||||
@@ -427,29 +526,65 @@ const ASRPreviewModal: React.FC<{
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
const recorder = new MediaRecorder(stream);
|
const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||||
|
? 'audio/webm;codecs=opus'
|
||||||
|
: (MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : '');
|
||||||
|
const recorder = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
|
||||||
|
|
||||||
chunksRef.current = [];
|
chunksRef.current = [];
|
||||||
streamRef.current = stream;
|
streamRef.current = stream;
|
||||||
mediaRecorderRef.current = recorder;
|
mediaRecorderRef.current = recorder;
|
||||||
|
|
||||||
|
const visualizationContext = createAudioContext();
|
||||||
|
const source = visualizationContext.createMediaStreamSource(stream);
|
||||||
|
const analyser = visualizationContext.createAnalyser();
|
||||||
|
analyser.fftSize = 1024;
|
||||||
|
source.connect(analyser);
|
||||||
|
analyserRef.current = analyser;
|
||||||
|
visualAudioContextRef.current = visualizationContext;
|
||||||
|
|
||||||
|
const timeData = new Uint8Array(analyser.frequencyBinCount);
|
||||||
|
const tick = () => {
|
||||||
|
if (!analyserRef.current) return;
|
||||||
|
analyserRef.current.getByteTimeDomainData(timeData);
|
||||||
|
let sumSquares = 0;
|
||||||
|
for (let i = 0; i < timeData.length; i += 1) {
|
||||||
|
const normalized = (timeData[i] - 128) / 128;
|
||||||
|
sumSquares += normalized * normalized;
|
||||||
|
}
|
||||||
|
const rms = Math.sqrt(sumSquares / timeData.length);
|
||||||
|
const level = Math.min(1, rms * 4);
|
||||||
|
setInputLevel(level);
|
||||||
|
setIsSpeaking(level > 0.08);
|
||||||
|
rafRef.current = requestAnimationFrame(tick);
|
||||||
|
};
|
||||||
|
tick();
|
||||||
|
|
||||||
recorder.ondataavailable = (event) => {
|
recorder.ondataavailable = (event) => {
|
||||||
if (event.data.size > 0) {
|
if (event.data.size > 0) {
|
||||||
chunksRef.current.push(event.data);
|
chunksRef.current.push(event.data);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
recorder.onstop = () => {
|
recorder.onstop = async () => {
|
||||||
const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' });
|
const blob = new Blob(chunksRef.current, { type: recorder.mimeType || 'audio/webm' });
|
||||||
const file = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' });
|
setIsProcessingRecording(true);
|
||||||
setSelectedFile(file);
|
try {
|
||||||
if (streamRef.current) {
|
let outputFile: File;
|
||||||
streamRef.current.getTracks().forEach((track) => track.stop());
|
try {
|
||||||
streamRef.current = null;
|
outputFile = await convertRecordedBlobToWav(blob);
|
||||||
|
} catch {
|
||||||
|
outputFile = new File([blob], `mic-preview-${Date.now()}.webm`, { type: blob.type || 'audio/webm' });
|
||||||
|
}
|
||||||
|
setSelectedFile(outputFile);
|
||||||
|
} finally {
|
||||||
|
setIsProcessingRecording(false);
|
||||||
|
stopVisualization();
|
||||||
|
stopCurrentStream();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
recorder.start();
|
recorder.start(250);
|
||||||
setIsRecording(true);
|
setIsRecording(true);
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
alert(error?.message || '无法访问麦克风');
|
alert(error?.message || '无法访问麦克风');
|
||||||
@@ -490,7 +625,7 @@ const ASRPreviewModal: React.FC<{
|
|||||||
footer={
|
footer={
|
||||||
<>
|
<>
|
||||||
<Button variant="ghost" onClick={onClose}>关闭</Button>
|
<Button variant="ghost" onClick={onClose}>关闭</Button>
|
||||||
<Button onClick={runPreview} disabled={isTranscribing || !selectedFile}>
|
<Button onClick={runPreview} disabled={isTranscribing || !selectedFile || isProcessingRecording}>
|
||||||
{isTranscribing ? '识别中...' : '开始识别'}
|
{isTranscribing ? '识别中...' : '开始识别'}
|
||||||
</Button>
|
</Button>
|
||||||
</>
|
</>
|
||||||
@@ -518,11 +653,31 @@ const ASRPreviewModal: React.FC<{
|
|||||||
<p>拖拽音频文件到这里,或</p>
|
<p>拖拽音频文件到这里,或</p>
|
||||||
<Button variant="outline" size="sm" onClick={() => inputRef.current?.click()}>选择文件</Button>
|
<Button variant="outline" size="sm" onClick={() => inputRef.current?.click()}>选择文件</Button>
|
||||||
{selectedFile && <p className="text-primary text-xs">已选择: {selectedFile.name}</p>}
|
{selectedFile && <p className="text-primary text-xs">已选择: {selectedFile.name}</p>}
|
||||||
|
{isProcessingRecording && <p className="text-yellow-400 text-xs">正在处理录音格式...</p>}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="flex items-center justify-between rounded-lg border border-white/10 bg-white/5 p-3">
|
<div className="rounded-lg border border-white/10 bg-white/5 p-3 space-y-3">
|
||||||
<div className="text-sm text-muted-foreground">麦克风测试</div>
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="text-sm text-muted-foreground">麦克风测试</div>
|
||||||
|
<div className={`text-xs font-semibold ${isSpeaking ? 'text-green-400' : 'text-muted-foreground'}`}>
|
||||||
|
{isRecording ? (isSpeaking ? '正在说话' : '等待语音') : '未录音'}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div className="h-10 rounded-md bg-black/30 border border-white/10 px-2 flex items-end gap-1">
|
||||||
|
{Array.from({ length: 20 }).map((_, index) => {
|
||||||
|
const threshold = (index + 1) / 20;
|
||||||
|
const active = inputLevel >= threshold;
|
||||||
|
const height = 6 + ((index % 5) * 6);
|
||||||
|
return (
|
||||||
|
<div
|
||||||
|
key={`meter-${index}`}
|
||||||
|
className={`w-1 rounded-sm transition-all ${active ? (isSpeaking ? 'bg-green-400' : 'bg-primary') : 'bg-white/10'}`}
|
||||||
|
style={{ height }}
|
||||||
|
/>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
{!isRecording ? (
|
{!isRecording ? (
|
||||||
<Button size="sm" variant="outline" onClick={startRecording}><Mic className="h-4 w-4 mr-1" />开始录音</Button>
|
<Button size="sm" variant="outline" onClick={startRecording}><Mic className="h-4 w-4 mr-1" />开始录音</Button>
|
||||||
) : (
|
) : (
|
||||||
|
|||||||
Reference in New Issue
Block a user