Add audio context in debug drawer

This commit is contained in:
Xin Wang
2026-02-09 09:24:36 +08:00
parent 8edbe14382
commit 7c5b71a101

View File

@@ -1061,11 +1061,14 @@ export const DebugDrawer: React.FC<{
const pendingResolveRef = useRef<(() => void) | null>(null); const pendingResolveRef = useRef<(() => void) | null>(null);
const pendingRejectRef = useRef<((e: Error) => void) | null>(null); const pendingRejectRef = useRef<((e: Error) => void) | null>(null);
const assistantDraftIndexRef = useRef<number | null>(null); const assistantDraftIndexRef = useRef<number | null>(null);
const audioCtxRef = useRef<AudioContext | null>(null);
const playbackTimeRef = useRef<number>(0);
const [devices, setDevices] = useState<MediaDeviceInfo[]>([]); const [devices, setDevices] = useState<MediaDeviceInfo[]>([]);
const [selectedCamera, setSelectedCamera] = useState<string>(''); const [selectedCamera, setSelectedCamera] = useState<string>('');
const [selectedMic, setSelectedMic] = useState<string>(''); const [selectedMic, setSelectedMic] = useState<string>('');
const [isSwapped, setIsSwapped] = useState(false); const [isSwapped, setIsSwapped] = useState(false);
const [textTtsEnabled, setTextTtsEnabled] = useState(true);
// Initialize // Initialize
useEffect(() => { useEffect(() => {
@@ -1080,6 +1083,10 @@ export const DebugDrawer: React.FC<{
setMode('text'); setMode('text');
stopMedia(); stopMedia();
closeWs(); closeWs();
if (audioCtxRef.current) {
void audioCtxRef.current.close();
audioCtxRef.current = null;
}
setIsSwapped(false); setIsSwapped(false);
setCallStatus('idle'); setCallStatus('idle');
} }
@@ -1123,6 +1130,48 @@ export const DebugDrawer: React.FC<{
} }
}; };
/**
 * Lazily create (and cache) the shared AudioContext used for TTS playback.
 * On first creation the playback cursor is aligned to the context clock;
 * if the browser has suspended the context (autoplay policy), resume it
 * before handing it back.
 */
const ensureAudioContext = async () => {
  let ctx = audioCtxRef.current;
  if (ctx === null) {
    ctx = new AudioContext();
    audioCtxRef.current = ctx;
    playbackTimeRef.current = ctx.currentTime;
  }
  if (ctx.state === 'suspended') {
    await ctx.resume();
  }
  return ctx;
};
/**
 * Reset the playback cursor to "now" so the next chunk is scheduled
 * immediately instead of behind stale queued audio.
 * NOTE(review): buffer sources already scheduled via source.start() are
 * not stopped here and will still play — confirm that is intended.
 */
const clearPlaybackQueue = () => {
  const ctx = audioCtxRef.current;
  if (ctx !== null) {
    playbackTimeRef.current = ctx.currentTime;
  }
};
/**
 * Queue a raw PCM16 (mono, signed 16-bit) chunk for gapless playback.
 *
 * Silently no-ops when TTS is disabled or the drawer is not in text mode.
 * Chunks are scheduled back-to-back on a shared playback cursor
 * (playbackTimeRef) so consecutive WebSocket frames play without gaps.
 *
 * @param pcmBuffer  Raw 16-bit PCM samples. A trailing odd byte (torn
 *                   sample from a split frame) is ignored instead of
 *                   letting the Int16Array constructor throw RangeError.
 * @param sampleRate Sample rate of the incoming stream; defaults to
 *                   16000 Hz — presumably the server's TTS output rate,
 *                   TODO confirm against the backend.
 */
const playPcm16Chunk = async (pcmBuffer: ArrayBuffer, sampleRate = 16000) => {
  if (!textTtsEnabled) return;
  if (mode !== 'text') return;
  const ctx = await ensureAudioContext();
  // Guard: new Int16Array(buffer) throws when byteLength is odd, so
  // derive an explicit sample count and drop any trailing partial sample.
  const sampleCount = Math.floor(pcmBuffer.byteLength / 2);
  if (sampleCount === 0) return;
  const int16 = new Int16Array(pcmBuffer, 0, sampleCount);
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i += 1) {
    // Normalize signed 16-bit [-32768, 32767] into Web Audio's [-1, 1).
    float32[i] = int16[i] / 32768;
  }
  const audioBuffer = ctx.createBuffer(1, sampleCount, sampleRate);
  audioBuffer.copyToChannel(float32, 0);
  const source = ctx.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(ctx.destination);
  // Start at least 20 ms in the future so scheduling jitter doesn't clip
  // the chunk's onset; otherwise continue from the current queue tail.
  const startAt = Math.max(ctx.currentTime + 0.02, playbackTimeRef.current);
  source.start(startAt);
  playbackTimeRef.current = startAt + audioBuffer.duration;
};
useEffect(() => { useEffect(() => {
const handleStream = async () => { const handleStream = async () => {
if (isOpen && mode === 'video' && callStatus === 'active') { if (isOpen && mode === 'video' && callStatus === 'active') {
@@ -1173,6 +1222,7 @@ export const DebugDrawer: React.FC<{
try { try {
if (mode === 'text') { if (mode === 'text') {
if (textTtsEnabled) await ensureAudioContext();
await ensureWsSession(); await ensureWsSession();
wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg })); wsRef.current?.send(JSON.stringify({ type: 'input.text', text: userMsg }));
} else { } else {
@@ -1321,6 +1371,7 @@ export const DebugDrawer: React.FC<{
pendingResolveRef.current = null; pendingResolveRef.current = null;
pendingRejectRef.current = null; pendingRejectRef.current = null;
assistantDraftIndexRef.current = null; assistantDraftIndexRef.current = null;
clearPlaybackQueue();
if (isOpen) setWsStatus('disconnected'); if (isOpen) setWsStatus('disconnected');
}; };
@@ -1357,6 +1408,14 @@ export const DebugDrawer: React.FC<{
}; };
ws.onmessage = (event) => { ws.onmessage = (event) => {
if (event.data instanceof ArrayBuffer) {
void playPcm16Chunk(event.data);
return;
}
if (event.data instanceof Blob) {
void event.data.arrayBuffer().then((buf) => playPcm16Chunk(buf));
return;
}
if (typeof event.data !== 'string') return; if (typeof event.data !== 'string') return;
let payload: any; let payload: any;
try { try {
@@ -1376,6 +1435,10 @@ export const DebugDrawer: React.FC<{
); );
return; return;
} }
if (type === 'output.audio.start') {
clearPlaybackQueue();
return;
}
if (type === 'session.started') { if (type === 'session.started') {
wsReadyRef.current = true; wsReadyRef.current = true;
@@ -1457,11 +1520,18 @@ export const DebugDrawer: React.FC<{
ws.onclose = () => { ws.onclose = () => {
wsReadyRef.current = false; wsReadyRef.current = false;
clearPlaybackQueue();
if (wsStatus !== 'error') setWsStatus('disconnected'); if (wsStatus !== 'error') setWsStatus('disconnected');
}; };
}); });
}; };
// When the user turns the TTS toggle off, reset the playback cursor so
// muting takes effect right away instead of after queued audio drains.
useEffect(() => {
  if (textTtsEnabled) return;
  clearPlaybackQueue();
}, [textTtsEnabled]);
useEffect(() => { useEffect(() => {
if (!isOpen) return; if (!isOpen) return;
const localResolved = buildLocalResolvedRuntime(); const localResolved = buildLocalResolvedRuntime();
@@ -1525,6 +1595,15 @@ export const DebugDrawer: React.FC<{
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
WS: {wsStatus} WS: {wsStatus}
</Badge> </Badge>
<label className="inline-flex items-center gap-1 text-xs text-muted-foreground px-2 py-1 rounded border border-white/10">
<input
type="checkbox"
checked={textTtsEnabled}
onChange={(e) => setTextTtsEnabled(e.target.checked)}
className="accent-primary"
/>
TTS
</label>
<Button size="sm" variant="secondary" onClick={() => ensureWsSession()} disabled={wsStatus === 'connecting'}> <Button size="sm" variant="secondary" onClick={() => ensureWsSession()} disabled={wsStatus === 'connecting'}>
Connect Connect
</Button> </Button>