/**
 * Minimal browser client for the AI VideoAssistant engine's product
 * websocket (`/ws-product`, protocol `va.ws.v1`).
 *
 * Responsibilities:
 *  - Open/close the websocket and run the session handshake.
 *  - List/select microphones and capture mic audio with browser AEC enabled.
 *  - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
 *    as binary websocket messages.
 *  - Play `response.audio.delta` frames gaplessly through Web Audio.
 *  - Render a chat-style history of user transcripts and bot text deltas.
 */

const SAMPLE_RATE = 16000;
const CHANNELS = 1;
const FRAME_MS = 20;
const PROTOCOL = "va.ws.v1";
const MAX_WS_LOG_LINES = 120;
const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;

/**
 * Build the default websocket URL from the page origin, using `wss:` when
 * the page itself was loaded over `https:`.
 *
 * @returns {string} URL of the `/ws-product` endpoint on the current host.
 */
function defaultWsUrl() {
  const scheme = location.protocol === "https:" ? "wss:" : "ws:";
  return `${scheme}//${location.host}/ws-product`;
}

// DOM handles, looked up once at load time.
const els = {
  url: document.getElementById("ws-url"),
  connectBtn: document.getElementById("connect-btn"),
  statusDot: document.getElementById("status-dot"),
  statusText: document.getElementById("status-text"),
  chatLog: document.getElementById("chat-log"),
  micBtn: document.getElementById("mic-btn"),
  micSelect: document.getElementById("mic-select"),
  micLabel: document.querySelector(".mic-btn__label"),
  micIndicator: document.getElementById("mic-indicator"),
  botIndicator: document.getElementById("bot-indicator"),
  clearBtn: document.getElementById("clear-btn"),
  clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
  wsLog: document.getElementById("ws-log"),
  meterFill: document.getElementById("meter-fill"),
  composer: document.getElementById("composer"),
  textInput: document.getElementById("text-input"),
  sendBtn: document.getElementById("send-btn"),
};

// Single mutable record shared by every handler in this file.
const state = {
  ws: null,
  connected: false,
  connecting: false,
  audioContext: null,
  micStream: null,
  micSourceNode: null,
  recorderNode: null,
  micEnabled: false,
  micDevices: [],
  selectedMicDeviceId: "",
  // Output scheduling.
  nextPlaybackTime: 0,
  playbackEndsAt: 0,
  scheduledSources: [],
  botActive: false,
  botUiTimer: null,
  // Chat state.
  currentAssistantBubble: null,
  // VU meter smoothing.
  meterLevel: 0,
  // Compact websocket logging.
  audioDeltaLogCount: 0,
  audioDeltaLogBytes: 0,
  lastAudioDeltaLogAt: 0,
  audioSendLogCount: 0,
  audioSendLogBytes: 0,
  lastAudioSendLogAt: 0,
};

/* ------------------------------------------------------------------ UI */

/**
 * Update the connection status dot and label.
 *
 * @param {string} kind - Suffix for the `status__dot--` modifier class.
 * @param {string} text - Label shown next to the dot.
 */
function setStatus(kind, text) {
  els.statusDot.className = `status__dot status__dot--${kind}`;
  els.statusText.textContent = text;
}

/** Sync the connect button's label, enabled state and styling with `state`. */
function setConnectButton() {
  if (state.connecting) {
    els.connectBtn.textContent = "Connecting…";
    els.connectBtn.disabled = true;
    els.connectBtn.classList.remove("is-disconnect");
  } else if (state.connected) {
    els.connectBtn.textContent = "Disconnect";
    els.connectBtn.disabled = false;
    els.connectBtn.classList.add("is-disconnect");
  } else {
    els.connectBtn.textContent = "Connect";
    els.connectBtn.disabled = false;
    els.connectBtn.classList.remove("is-disconnect");
  }
}

/** Sync the mic button, its label and the mic indicator with `state`. */
function setMicButton() {
  els.micBtn.disabled = !state.connected;
  els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
  els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic";
  els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic";
  els.micIndicator.classList.toggle("is-active", state.micEnabled);
}

/** Enable the mic selector only while connected and when media devices exist. */
function setMicSelectEnabled() {
  els.micSelect.disabled = !state.connected || !navigator.mediaDevices;
}

/**
 * Enable or disable the text composer. The send button additionally stays
 * disabled while the input is blank.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  els.textInput.disabled = !enabled;
  els.sendBtn.disabled = !enabled || els.textInput.value.trim().length === 0;
}

/**
 * Toggle the "bot is speaking" indicator.
 *
 * @param {boolean} active - True while bot audio is playing or expected.
 */
function setBotIndicator(active) {
  els.botIndicator.classList.toggle("is-active", active);
}

/**
 * Append a chat bubble and scroll it into view. Any `.chat__empty`
 * placeholder is removed first.
 *
 * @param {string} role - "user", "assistant" or "system"; only "system"
 *   bubbles are rendered without a role tag.
 * @param {string} text - Initial bubble text (inserted as plain text).
 * @returns {HTMLDivElement} The created bubble element.
 */
function addBubble(role, text) {
  if (els.chatLog.querySelector(".chat__empty")) {
    els.chatLog.innerHTML = "";
  }
  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    const tag = document.createElement("span");
    tag.className = "bubble__role";
    tag.textContent = role === "user" ? "You" : "Assistant";
    bubble.appendChild(tag);
  }
  const body = document.createElement("span");
  body.className = "bubble__text";
  body.textContent = text;
  bubble.appendChild(body);
  els.chatLog.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}

/**
 * Append text to an existing bubble's body and keep the chat scrolled down.
 *
 * @param {Element} bubble - Bubble previously returned by `addBubble`.
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const body = bubble.querySelector(".bubble__text");
  body.textContent += text;
  scrollChatToBottom();
}

/** Scroll the chat log to its last entry. */
function scrollChatToBottom() {
  els.chatLog.scrollTop = els.chatLog.scrollHeight;
}

/** Remove every bubble and show the "Chat cleared." placeholder. */
function clearChat() {
  els.chatLog.innerHTML = "";
  state.currentAssistantBubble = null;
  // The placeholder carries `chat__empty` so addBubble() removes it again.
  // NOTE(review): built with DOM APIs rather than an HTML string; the inner
  // paragraph element is an assumption — confirm against the page markup.
  const empty = document.createElement("div");
  empty.className = "chat__empty";
  const message = document.createElement("p");
  message.textContent = "Chat cleared.";
  empty.appendChild(message);
  els.chatLog.appendChild(empty);
}

/**
 * Shorten a value for display in the websocket log.
 *
 * @param {*} value - Value to stringify.
 * @param {number} [maxLength=160] - Maximum length of the result.
 * @returns {string} The string form, cut to `maxLength` with a trailing
 *   ellipsis when it was longer.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  return `${text.slice(0, maxLength - 1)}…`;
}

/**
 * Produce a compact one-line description of a websocket JSON payload,
 * replacing bulky fields with size summaries.
 *
 * @param {*} payload - Parsed message; non-objects are stringified as-is.
 * @returns {string} JSON text of the compacted payload, or the payload type
 *   (or a fixed fallback) when it cannot be serialized.
 */
function compactWsPayload(payload) {
  if (!payload || typeof payload !== "object") return String(payload);
  const compact = { ...payload };
  // NOTE(review): the exact placeholder wording is a reconstruction; only
  // the intent (do not dump base64 / large blobs into the log) is certain.
  if (typeof compact.audio === "string") {
    compact.audio = `[base64 audio, ${compact.audio.length} chars]`;
  }
  if (typeof compact.data === "string" && compact.data.length > 160) {
    compact.data = `[${compact.data.length} chars]`;
  }
  if (typeof compact.text === "string") {
    compact.text = truncateLogValue(compact.text);
  }
  try {
    return JSON.stringify(compact);
  } catch (_) {
    // Serialization failed (e.g. a cyclic value); fall back to the type name.
    return payload.type || "unserializable websocket payload";
  }
}

/**
 * Append one line to the websocket log, trimming the log to
 * `MAX_WS_LOG_LINES` entries and keeping it scrolled to the bottom.
 *
 * @param {string} direction - "send", "recv" or another label such as
 *   "system"/"error"; shown upper-cased.
 * @param {string} detail - Text of the entry (inserted as plain text).
 * @param {string} [kind=direction] - Suffix for the entry's modifier class.
 */
function addWsLog(direction, detail, kind = direction) {
  if (els.wsLog.querySelector(".ws-log__empty")) {
    els.wsLog.innerHTML = "";
  }
  const entry = document.createElement("div");
  entry.className = `ws-log__entry ws-log__entry--${kind}`;

  const time = document.createElement("span");
  time.className = "ws-log__time";
  time.textContent = new Date().toLocaleTimeString([], {
    hour12: false,
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  });

  const dir = document.createElement("span");
  dir.className = "ws-log__direction";
  dir.textContent =
    direction === "send"
      ? "SEND"
      : direction === "recv"
        ? "RECV"
        : direction.toUpperCase();

  const body = document.createElement("span");
  body.className = "ws-log__detail";
  body.textContent = detail;

  entry.append(time, dir, body);
  els.wsLog.appendChild(entry);
  while (els.wsLog.children.length > MAX_WS_LOG_LINES) {
    els.wsLog.firstElementChild.remove();
  }
  els.wsLog.scrollTop = els.wsLog.scrollHeight;
}

/** Emit the pending `response.audio.delta` aggregate line, if any. */
function flushAudioDeltaLog() {
  if (state.audioDeltaLogCount === 0) return;
  addWsLog(
    "recv",
    `response.audio.delta x${state.audioDeltaLogCount} (${state.audioDeltaLogBytes} bytes)`,
  );
  state.audioDeltaLogCount = 0;
  state.audioDeltaLogBytes = 0;
  state.lastAudioDeltaLogAt = performance.now();
}

/** Emit the pending outgoing binary-audio aggregate line, if any. */
function flushAudioSendLog() {
  if (state.audioSendLogCount === 0) return;
  addWsLog(
    "send",
    `input.audio binary x${state.audioSendLogCount} (${state.audioSendLogBytes} bytes)`,
  );
  state.audioSendLogCount = 0;
  state.audioSendLogBytes = 0;
  state.lastAudioSendLogAt = performance.now();
}

/** Flush both audio aggregates (incoming deltas and outgoing frames). */
function flushPendingWsLogs() {
  flushAudioDeltaLog();
  flushAudioSendLog();
}

/**
 * Log a JSON websocket message. Incoming `response.audio.delta` messages are
 * only counted and reported as one aggregate line per
 * `AUDIO_DELTA_LOG_INTERVAL_MS`; every other message is logged individually
 * after the pending aggregate for its direction has been flushed, so the log
 * stays in chronological order.
 *
 * @param {string} direction - "send" or "recv".
 * @param {*} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  // Audio deltas must be handled BEFORE any flush: flushing first would emit
  // the previous delta as its own "x1" line on every message and defeat the
  // aggregation.
  if (direction === "recv" && payload?.type === "response.audio.delta") {
    state.audioDeltaLogCount += 1;
    state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
    const now = performance.now();
    if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
      flushAudioDeltaLog();
    }
    return;
  }

  if (direction === "send") {
    flushAudioSendLog();
  } else {
    flushAudioDeltaLog();
  }
  addWsLog(direction, compactWsPayload(payload));
}

/**
 * Count one outgoing binary audio frame, emitting an aggregate log line at
 * most once per `AUDIO_DELTA_LOG_INTERVAL_MS`.
 *
 * @param {number} byteLength - Size of the frame that is being sent.
 */
function logBinarySend(byteLength) {
  state.audioSendLogCount += 1;
  state.audioSendLogBytes += byteLength;
  const now = performance.now();
  if (now - state.lastAudioSendLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
    flushAudioSendLog();
  }
}

/**
 * Send a message on the current websocket and record it in the log.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - JSON text or a binary
 *   audio frame.
 * @returns {boolean} False when there is no open socket (nothing is sent),
 *   true otherwise.
 */
function wsSend(data) {
  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
  if (typeof data === "string") {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      // Not JSON: log the raw text instead, after flushing the aggregates.
      flushAudioSendLog();
      flushAudioDeltaLog();
      addWsLog("send", truncateLogValue(data));
    }
  } else {
    const byteLength =
      data instanceof ArrayBuffer
        ? data.byteLength
        : ArrayBuffer.isView(data)
          ? data.byteLength
          : 0;
    if (byteLength > 0) {
      logBinarySend(byteLength);
    }
  }
  state.ws.send(data);
  return true;
}

/** Empty the websocket log, drop pending aggregates and show the placeholder. */
function clearWsLog() {
  state.audioDeltaLogCount = 0;
  state.audioDeltaLogBytes = 0;
  state.audioSendLogCount = 0;
  state.audioSendLogBytes = 0;
  els.wsLog.innerHTML = "";
  // The placeholder must carry `ws-log__empty`: addWsLog() looks for that
  // class to remove it when the next entry arrives.
  const empty = document.createElement("div");
  empty.className = "ws-log__empty";
  empty.textContent = "No websocket events yet.";
  els.wsLog.appendChild(empty);
}

/* ---------------------------------------------------------------- Audio */

/**
 * Lazily create the shared AudioContext, load the recorder worklet module
 * and resume the context if it is suspended.
 *
 * @returns {Promise} Resolves with the ready AudioContext.
 * @throws Rejects when the worklet module cannot be loaded or the context
 *   cannot be resumed.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    const ctx = new Ctx();
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      // Never cache a context whose worklet failed to load: later calls
      // would skip addModule() and every AudioWorkletNode would fail.
      try {
        Promise.resolve(ctx.close()).catch(() => {});
      } catch (_) {
        /* best-effort cleanup; the load error below is what matters */
      }
      throw err;
    }
    state.audioContext = ctx;
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}

/**
 * Rebuild the microphone selector from `state.micDevices`, keeping the
 * previous selection when that device is still present and falling back to
 * the default entry otherwise.
 */
function renderMicDevices() {
  const previousValue = state.selectedMicDeviceId || els.micSelect.value;
  els.micSelect.innerHTML = "";

  const defaultOption = document.createElement("option");
  defaultOption.value = "";
  defaultOption.textContent = "Default microphone";
  els.micSelect.appendChild(defaultOption);

  state.micDevices.forEach((device, index) => {
    const option = document.createElement("option");
    option.value = device.deviceId;
    option.textContent = device.label || `Microphone ${index + 1}`;
    els.micSelect.appendChild(option);
  });

  const hasPrevious = state.micDevices.some(
    (device) => device.deviceId === previousValue,
  );
  state.selectedMicDeviceId = hasPrevious ? previousValue : "";
  els.micSelect.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}

/**
 * Re-enumerate audio input devices and refresh the selector. Enumeration
 * failures are logged to the console and otherwise ignored, so this never
 * rejects.
 *
 * @returns {Promise} Resolves once the selector has been updated.
 */
async function refreshMicDevices() {
  if (!navigator.mediaDevices?.enumerateDevices) {
    setMicSelectEnabled();
    return;
  }
  try {
    const devices = await navigator.mediaDevices.enumerateDevices();
    state.micDevices = devices.filter((device) => device.kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}

/**
 * Start microphone capture: request the (optionally selected) device, route
 * it into the `pcm-recorder` worklet and stream each frame the worklet posts
 * over the websocket.
 *
 * If the session ends while permissions/devices are being resolved, or any
 * step after acquiring the stream fails, the stream is released again so the
 * microphone is never left running unattended.
 *
 * @returns {Promise} Resolves once capture is running (or was abandoned
 *   because the session ended meanwhile).
 * @throws Rejects when capture is unavailable, permission is denied or the
 *   audio graph cannot be built.
 */
async function startMic() {
  const ctx = await ensureAudioContext();
  if (!navigator.mediaDevices?.getUserMedia) {
    throw new Error("Microphone capture is not available in this context");
  }

  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }

  state.micStream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
    video: false,
  });

  try {
    await refreshMicDevices();

    // The socket may have closed during the awaits above; its close handler
    // only stops an *enabled* mic, so release the fresh stream here.
    if (!state.connected) {
      stopMic();
      return;
    }

    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // Release the stream and any half-built nodes before reporting.
    stopMic();
    throw err;
  }

  state.micEnabled = true;
  addWsLog("system", "mic capture started (binary input.audio frames)");
  setMicButton();
}

/**
 * Stop microphone capture and release the recorder node, the source node
 * and every track of the captured stream. Safe to call when nothing is
 * running.
 */
function stopMic() {
  const wasEnabled = state.micEnabled;

  if (state.recorderNode) {
    try {
      state.recorderNode.port.onmessage = null;
      state.recorderNode.disconnect();
    } catch (_) {
      /* ignore */
    }
    state.recorderNode = null;
  }

  if (state.micSourceNode) {
    try {
      state.micSourceNode.disconnect();
    } catch (_) {
      /* ignore */
    }
    state.micSourceNode = null;
  }

  if (state.micStream) {
    for (const track of state.micStream.getTracks()) {
      try {
        track.stop();
      } catch (_) {
        /* ignore */
      }
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasEnabled) {
    flushAudioSendLog();
    addWsLog("system", "mic capture stopped");
  }
  setMicButton();
}

/**
 * Update the input level meter.
 *
 * @param {number} rms - RMS level of the latest captured frame.
 */
function updateMeter(rms) {
  // Smooth and convert to a 0..100 width. RMS ~0.3+ is loud speech.
  const target = Math.min(1, rms * 2.4);
  state.meterLevel = state.meterLevel * 0.5 + target * 0.5;
  els.meterFill.style.width = `${Math.round(state.meterLevel * 100)}%`;
}

/* ---------------------------------------------------- Bot audio playback */

/**
 * Queue one chunk of bot audio directly after the previously queued chunk.
 * Does nothing when there is no AudioContext yet or the chunk is empty.
 *
 * @param {Int16Array} int16 - PCM16 samples; the buffer is created with
 *   `CHANNELS` channels at `SAMPLE_RATE`.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  // createBuffer() rejects a zero-length buffer, so skip empty chunks.
  if (int16.length === 0) return;

  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Asymmetric divisor maps both -32768 and 32767 onto exactly -1 / +1.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);

  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);

  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;

  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);

  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    state.botUiTimer = null;
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}

/** Stop every queued chunk immediately and reset the playback clock and UI. */
function stopPlaybackQueue() {
  for (const src of state.scheduledSources) {
    try {
      src.onended = null;
      src.stop();
      src.disconnect();
    } catch (_) {
      /* already stopped */
    }
  }
  state.scheduledSources = [];
  resetPlaybackClock();
  if (state.botUiTimer) {
    clearTimeout(state.botUiTimer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}

/** Re-anchor the playback schedule at the AudioContext's current time. */
function resetPlaybackClock() {
  if (state.audioContext) {
    state.nextPlaybackTime = state.audioContext.currentTime;
    state.playbackEndsAt = state.audioContext.currentTime;
  }
}

/* --------------------------------------------------------- Chat updates */

/**
 * Render a committed user transcript as a new user bubble.
 *
 * @param {string} text - Transcript text; empty values are ignored.
 */
function handleUserTranscript(text) {
  if (!text) return;
  state.currentAssistantBubble = null;
  addBubble("user", text);
}

/**
 * Send typed text to the engine, interrupting any bot audio in flight.
 *
 * @param {string} text - Raw input; it is trimmed before sending.
 * @returns {boolean} True when the message was sent, false when the text is
 *   blank or the socket is not open.
 */
function sendText(text) {
  const value = (text || "").trim();
  if (!value) return false;
  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;

  const message = {
    type: "input.text",
    text: value,
    interrupt: true,
  };
  // The engine does not echo text input back as a transcript event, so we
  // render the user bubble locally. Also interrupt any in-flight bot audio
  // so the next reply is heard cleanly. We deliberately do NOT clear
  // `currentAssistantBubble` here — the engine will emit a
  // `response.text.final(interrupted=true)` for the in-flight assistant
  // turn, which finalizes that bubble in place. A brand-new bubble for the
  // reply will be created when `response.text.started` arrives.
  wsSend(JSON.stringify(message));
  stopPlaybackQueue();
  addBubble("user", value);
  return true;
}

/**
 * Append streamed assistant text to the current assistant bubble, creating
 * the bubble on the first delta of a turn.
 *
 * @param {string} text - Text fragment; empty values are ignored.
 */
function handleAssistantDelta(text) {
  if (!text) return;
  if (!state.currentAssistantBubble) {
    state.currentAssistantBubble = addBubble("assistant", "");
  }
  appendToBubble(state.currentAssistantBubble, text);
}

/** Start a new assistant turn: the next delta opens a fresh bubble. */
function handleAssistantStarted() {
  state.currentAssistantBubble = null;
}

/**
 * Finalize the assistant turn with its authoritative text.
 *
 * @param {string} text - Final text; when empty the turn is simply closed.
 * @param {boolean} interrupted - Marks the bubble as interrupted when truthy.
 */
function handleAssistantFinal(text, interrupted) {
  if (!text) {
    state.currentAssistantBubble = null;
    return;
  }
  if (state.currentAssistantBubble) {
    // Replace the streamed text with the final version in place.
    const body = state.currentAssistantBubble.querySelector(".bubble__text");
    body.textContent = text;
  } else {
    state.currentAssistantBubble = addBubble("assistant", text);
  }
  if (interrupted) {
    state.currentAssistantBubble.classList.add("bubble--interrupted");
  }
  state.currentAssistantBubble = null;
  scrollChatToBottom();
}

/** Close the current assistant bubble so later deltas open a new one. */
function finalizeAssistantBubble() {
  state.currentAssistantBubble = null;
}

/* ---------------------------------------------------------- Websocket IO */

/**
 * Decode a base64 string into 16-bit samples. A trailing odd byte is
 * dropped, because a view over a fractional number of samples cannot be
 * constructed.
 *
 * @param {string} b64 - Base64-encoded bytes.
 * @returns {Int16Array} View over the decoded bytes (platform byte order).
 * @throws {DOMException} When `b64` is not valid base64 (from `atob`).
 */
function decodeBase64ToInt16(b64) {
  const binary = atob(b64);
  const len = binary.length;
  const bytes = new Uint8Array(len);
  for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
}

/**
 * Dispatch one parsed server event to the matching handler.
 *
 * @param {Object} event - Parsed JSON message with a `type` field.
 */
function handleEvent(event) {
  switch (event.type) {
    case "response.audio.delta": {
      if (typeof event.audio !== "string") break;
      let samples;
      try {
        samples = decodeBase64ToInt16(event.audio);
      } catch (err) {
        // A corrupt frame must not escape the message listener.
        console.warn("Bad audio delta from server", err);
        addWsLog(
          "error",
          `undecodable audio delta: ${err.message || err}`,
          "error",
        );
        break;
      }
      schedulePlayback(samples);
      break;
    }
    case "response.audio.started":
      setBotIndicator(true);
      break;
    case "response.audio.stopped":
      finalizeAssistantBubble();
      // The indicator turns off automatically when the playback queue drains.
      break;
    case "response.text.delta":
      handleAssistantDelta(event.text);
      break;
    case "response.text.started":
      handleAssistantStarted();
      break;
    case "response.text.final":
      handleAssistantFinal(event.text, event.interrupted);
      break;
    case "input.transcript.final":
      handleUserTranscript(event.text);
      break;
    case "input.transcript.interim":
      // Ignore partial ASR updates; chat history renders committed user turns.
      break;
    case "transport.message":
      // Reserved for future structured messages; ignore silently.
      break;
    default:
      // Unknown event type: log for debugging.
      console.debug("ws event", event);
  }
}

/**
 * Open the websocket to the URL in the address field and wire its handlers.
 * The `session.start` handshake is sent as soon as the socket opens. Does
 * nothing when a connection already exists or is being established.
 *
 * @returns {Promise} Resolves once the socket has been created (not
 *   necessarily opened) or the attempt has been abandoned.
 */
async function connect() {
  if (state.connected || state.connecting) return;
  const url = (els.url.value || "").trim();
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }

  state.connecting = true;
  setStatus("connecting", "Connecting…");
  setConnectButton();
  addWsLog("system", `connecting ${url}`);

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    state.connecting = false;
    setStatus("error", "Audio init failed");
    setConnectButton();
    addWsLog("error", `audio init failed: ${err.message || err}`, "error");
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    state.connecting = false;
    setStatus("error", "Bad URL");
    setConnectButton();
    addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", () => {
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
    };
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "websocket open");
    setStatus("connected", "Connected");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    // Fire-and-forget: refreshMicDevices() handles its own errors.
    refreshMicDevices();
    wsSend(JSON.stringify(startMessage));
    addBubble("system", "Session started.");
    setComposerEnabled(true);
    els.textInput.focus();
  });

  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // Drop a trailing odd byte: the Int16Array constructor throws when the
      // byte length is not a multiple of two.
      schedulePlayback(new Int16Array(data, 0, Math.floor(data.byteLength / 2)));
    }
  });

  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "Connection error");
    addWsLog("error", "websocket error", "error");
  });

  ws.addEventListener("close", (event) => {
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setBotIndicator(false);
    flushPendingWsLogs();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      addBubble(
        "system",
        `Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
      );
      setStatus("idle", "Disconnected");
    } else {
      // Closed before ever opening: treat as a failed connection attempt.
      setStatus("error", "Connection closed");
    }
  });
}

/**
 * Politely end the session: send `session.stop` when the socket is open,
 * then close it. State cleanup happens in the socket's close handler.
 */
function disconnect() {
  if (!state.ws) return;
  try {
    if (state.ws.readyState === WebSocket.OPEN) {
      const stopMessage = { type: "session.stop", reason: "client_disconnect" };
      wsSend(JSON.stringify(stopMessage));
    }
  } catch (_) {
    /* ignore */
  }
  try {
    state.ws.close(1000, "client_disconnect");
  } catch (_) {
    /* ignore */
  }
}

/* ---------------------------------------------------------------- Wiring */

els.connectBtn.addEventListener("click", () => {
  if (state.connected) disconnect();
  else connect();
});

els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;
  // Block re-entry while the (async) mic start is in flight.
  els.micBtn.disabled = true;
  try {
    if (state.micEnabled) {
      stopMic();
    } else {
      await startMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `Mic error: ${err.message || err}`);
  } finally {
    els.micBtn.disabled = !state.connected;
  }
});

els.micSelect.addEventListener("change", async () => {
  state.selectedMicDeviceId = els.micSelect.value;
  // When the mic is off, the new device is simply used on the next start.
  if (!state.micEnabled) return;
  els.micSelect.disabled = true;
  els.micBtn.disabled = true;
  try {
    stopMic();
    await startMic();
  } catch (err) {
    console.error("Mic switch error", err);
    addBubble("system", `Mic switch error: ${err.message || err}`);
  } finally {
    setMicButton();
    setMicSelectEnabled();
  }
});

if (navigator.mediaDevices?.addEventListener) {
  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}

els.clearBtn.addEventListener("click", () => {
  clearChat();
});

els.clearWsLogBtn.addEventListener("click", () => {
  clearWsLog();
});

/** Grow the text input with its content, capped at 180px. */
function autosizeTextarea() {
  const ta = els.textInput;
  ta.style.height = "auto";
  ta.style.height = `${Math.min(ta.scrollHeight, 180)}px`;
}

/** Send the composer's text and, on success, reset the input. */
function submitText() {
  const value = els.textInput.value;
  if (!sendText(value)) return;
  els.textInput.value = "";
  autosizeTextarea();
  setComposerEnabled(state.connected);
}

els.composer.addEventListener("submit", (event) => {
  event.preventDefault();
  submitText();
});

els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

els.textInput.addEventListener("keydown", (event) => {
  // Enter sends; Shift+Enter inserts a newline; ignore Enter while an IME
  // composition is in progress.
  if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
    event.preventDefault();
    submitText();
  }
});

window.addEventListener("beforeunload", () => {
  if (state.ws) {
    try {
      state.ws.close();
    } catch (_) {
      /* ignore */
    }
  }
  if (state.audioContext) {
    try {
      state.audioContext.close();
    } catch (_) {
      /* ignore */
    }
  }
});

// Initial UI state.
els.url.value = defaultWsUrl();
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);