/**
 * Minimal browser client for the AI VideoAssistant engine's product
 * websocket (`/ws-product`, protocol `va.ws.v1`).
 *
 * Responsibilities:
 *  - Open/close the websocket and run the session handshake.
 *  - Capture mic audio with echoCancellation, noiseSuppression, autoGainControl.
 *  - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
 *    as binary websocket messages.
 *  - Play `response.audio.delta` frames gaplessly through Web Audio.
 *  - Render a chat-style history of user transcripts and bot text deltas.
 */

// Audio format announced in `session.start` and used for playback buffers.
const SAMPLE_RATE = 16000;
const CHANNELS = 1;

// Frame duration (ms) handed to the recorder worklet as `frameMs`.
const FRAME_MS = 20;

// Protocol identifier sent in the `session.start` handshake.
const PROTOCOL = "va.ws.v1";
|
|
|
|
// Shorthand for the id lookups below.
const byId = (id) => document.getElementById(id);

// DOM references, resolved once when the script loads.
const els = {
  url: byId("ws-url"),
  connectBtn: byId("connect-btn"),
  statusDot: byId("status-dot"),
  statusText: byId("status-text"),
  chatLog: byId("chat-log"),
  micBtn: byId("mic-btn"),
  // The label has no id; it is located by class instead.
  micLabel: document.querySelector(".mic-btn__label"),
  micIndicator: byId("mic-indicator"),
  botIndicator: byId("bot-indicator"),
  clearBtn: byId("clear-btn"),
  meterFill: byId("meter-fill"),
  composer: byId("composer"),
  textInput: byId("text-input"),
  sendBtn: byId("send-btn"),
};
|
|
|
|
// Single mutable record shared by every function in this script.
const state = {
  // Websocket connection.
  ws: null,
  connected: false,
  connecting: false,

  // Web Audio graph for microphone capture.
  audioContext: null,
  micStream: null,
  micSourceNode: null,
  recorderNode: null,

  micEnabled: false,

  // Output scheduling: times are on the AudioContext clock.
  nextPlaybackTime: 0,
  playbackEndsAt: 0,
  scheduledSources: [],
  botActive: false,
  botUiTimer: null,

  // Chat state: the assistant bubble currently receiving text deltas.
  currentAssistantBubble: null,

  // VU meter smoothing (0..1).
  meterLevel: 0,
};
|
|
|
|
/* ------------------------------------------------------------------ UI */
|
|
|
|
/**
 * Update the connection status indicator.
 *
 * @param {string} kind - Suffix of the `status__dot--*` modifier class.
 * @param {string} text - Label written next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  const classes = ["status__dot", `status__dot--${kind}`];
  statusDot.className = classes.join(" ");
  statusText.textContent = text;
}
|
|
|
|
/**
 * Sync the connect button's label, disabled flag and `is-disconnect`
 * class with `state.connecting` / `state.connected`.
 */
function setConnectButton() {
  const button = els.connectBtn;

  let label = "Connect";
  let offersDisconnect = false;
  if (state.connecting) {
    label = "Connecting…";
  } else if (state.connected) {
    label = "Disconnect";
    offersDisconnect = true;
  }

  button.textContent = label;
  // The button is only locked while a connection attempt is in flight.
  button.disabled = Boolean(state.connecting);
  button.classList.toggle("is-disconnect", offersDisconnect);
}
|
|
|
|
/**
 * Sync the mic button (enabled state, aria-pressed, tooltip, label) and
 * the mic indicator with `state.connected` / `state.micEnabled`.
 */
function setMicButton() {
  const { micBtn, micLabel, micIndicator } = els;
  const micOn = state.micEnabled;

  let tooltip;
  let label;
  if (micOn) {
    tooltip = "Mute mic";
    label = "Mute mic";
  } else {
    tooltip = "Unmute mic";
    label = "Enable mic";
  }

  micBtn.disabled = !state.connected;
  micBtn.setAttribute("aria-pressed", micOn ? "true" : "false");
  micBtn.title = tooltip;
  micLabel.textContent = label;
  micIndicator.classList.toggle("is-active", micOn);
}
|
|
|
|
/**
 * Enable or disable the text composer.
 *
 * The send button additionally stays disabled while the input holds
 * nothing but whitespace.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  const { textInput, sendBtn } = els;
  const hasText = textInput.value.trim().length > 0;
  textInput.disabled = !enabled;
  sendBtn.disabled = !(enabled && hasText);
}
|
|
|
|
/**
 * Show or hide the bot activity indicator.
 *
 * @param {boolean} active - Whether the `is-active` class should be set.
 */
function setBotIndicator(active) {
  const indicator = els.botIndicator;
  indicator.classList.toggle("is-active", active);
}
|
|
|
|
/**
 * Append a chat bubble to the log and scroll it into view.
 *
 * If the log currently shows the `.chat__empty` placeholder, the log is
 * emptied first. Every role except "system" gets a role tag
 * ("You" for "user", "Assistant" otherwise).
 *
 * @param {string} role - Used in the `bubble--*` modifier class.
 * @param {string} text - Initial bubble text (set via textContent).
 * @returns {HTMLDivElement} The newly created bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;

  if (role !== "system") {
    const roleLabel = role === "user" ? "You" : "Assistant";
    bubble.appendChild(makeSpan("bubble__role", roleLabel));
  }
  bubble.appendChild(makeSpan("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
|
|
|
|
/**
 * Append text to an existing bubble's `.bubble__text` element and keep
 * the log scrolled to the bottom.
 *
 * @param {Element} bubble - Bubble previously returned by `addBubble`.
 * @param {string} text - Text to add at the end.
 */
function appendToBubble(bubble, text) {
  const textEl = bubble.querySelector(".bubble__text");
  textEl.textContent = textEl.textContent + text;
  scrollChatToBottom();
}
|
|
|
|
/** Scroll the chat log so that its last content is visible. */
function scrollChatToBottom() {
  const { chatLog } = els;
  chatLog.scrollTop = chatLog.scrollHeight;
}
|
|
|
|
/**
 * Remove every bubble, forget the in-progress assistant bubble, and show
 * a "Chat cleared." placeholder.
 */
function clearChat() {
  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>Chat cleared.</p>";

  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;
  log.appendChild(placeholder);
}
|
|
|
|
/* ---------------------------------------------------------------- Audio */
|
|
|
|
/**
 * Lazily create the shared AudioContext, load the recorder worklet module
 * into it, and make sure the context is running.
 *
 * The context is stored in `state.audioContext` only after the worklet
 * module has loaded. If loading fails the fresh context is closed and the
 * error is rethrown, so a later call starts over from scratch.
 *
 * @returns {Promise<AudioContext>} The ready-to-use context.
 * @throws {Error} If Web Audio is unavailable or the worklet fails to load.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    if (!Ctx) {
      throw new Error("Web Audio API is not available in this browser");
    }
    const ctx = new Ctx();
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      // Release the half-initialised context; the load error is the one
      // the caller needs to see, so a failing close() is ignored.
      try {
        await ctx.close();
      } catch (_) {
        /* ignore */
      }
      throw err;
    }
    state.audioContext = ctx;
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
|
|
|
|
/**
 * Start microphone capture and stream recorder frames over the websocket.
 *
 * Requests a mono stream with echo cancellation, noise suppression and
 * auto gain control, feeds it into the `pcm-recorder` worklet, and sends
 * each `frame` message's buffer as a binary websocket message while the
 * socket is open. Each frame's `rms` also drives the VU meter.
 *
 * If the session is no longer connected once the stream has been obtained,
 * the stream is released and the mic stays off. If building the audio
 * graph fails, everything acquired so far is torn down before rethrowing.
 *
 * @returns {Promise<void>}
 * @throws Whatever `getUserMedia` or the Web Audio node constructors throw.
 */
async function startMic() {
  const ctx = await ensureAudioContext();

  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      channelCount: 1,
    },
    video: false,
  });

  // The awaits above can take arbitrarily long (permission prompt); do not
  // leave the device open if the session ended in the meantime.
  if (!state.connected) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    return;
  }

  state.micStream = stream;
  try {
    state.micSourceNode = ctx.createMediaStreamSource(stream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected && state.ws && state.ws.readyState === WebSocket.OPEN) {
        state.ws.send(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // stopMic() copes with partially populated state and stops the tracks.
    stopMic();
    throw err;
  }

  state.micEnabled = true;
  setMicButton();
}
|
|
|
|
/**
 * Tear down microphone capture: detach the recorder node, disconnect the
 * source node, stop every track of the stream, then reset the meter and
 * the mic button.
 *
 * Each teardown step is best-effort; an exception from one step does not
 * prevent the following ones.
 */
function stopMic() {
  const attempt = (step) => {
    try {
      step();
    } catch (_) {
      /* ignore */
    }
  };

  const recorder = state.recorderNode;
  if (recorder) {
    attempt(() => {
      recorder.port.onmessage = null;
      recorder.disconnect();
    });
    state.recorderNode = null;
  }

  const source = state.micSourceNode;
  if (source) {
    attempt(() => source.disconnect());
    state.micSourceNode = null;
  }

  const stream = state.micStream;
  if (stream) {
    stream.getTracks().forEach((track) => {
      attempt(() => track.stop());
    });
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  setMicButton();
}
|
|
|
|
/**
 * Feed one RMS reading into the VU meter.
 *
 * The reading is scaled by 2.4 and clamped to 1, blended 50/50 with the
 * previous level, and rendered as a percentage width.
 * RMS ~0.3+ is loud speech.
 *
 * @param {number} rms - RMS level of the latest recorder frame.
 */
function updateMeter(rms) {
  const clamped = Math.min(1, rms * 2.4);
  const level = state.meterLevel * 0.5 + clamped * 0.5;
  state.meterLevel = level;

  const percent = Math.round(level * 100);
  els.meterFill.style.width = `${percent}%`;
}
|
|
|
|
/* ---------------------------------------------------- Bot audio playback */
|
|
|
|
/**
 * Queue one chunk of PCM16 audio for gapless playback.
 *
 * Samples are converted to floats in [-1, 1], wrapped in an AudioBuffer at
 * SAMPLE_RATE, and started right after the previously queued chunk (or
 * 20 ms from now if the queue has run dry). The bot indicator is switched
 * on and a timer is armed to switch it off once the queue has drained.
 *
 * Does nothing when no AudioContext exists yet or the chunk is empty.
 *
 * @param {Int16Array} int16 - Signed 16-bit samples.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  // createBuffer() rejects a zero-length buffer, so drop empty chunks
  // instead of throwing out of the websocket message handler.
  if (!int16 || int16.length === 0) return;

  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Asymmetric divisor so that both -32768 and 32767 map to exactly ±1.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);

  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);

  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;

  // Drop finished sources so stopPlaybackQueue() only sees pending ones.
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);

  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    // Only clear the indicator if no later chunk extended the queue.
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
|
|
|
|
/**
 * Cut off bot audio: stop and disconnect every queued source, reset the
 * playback clock, cancel the indicator timer and switch the indicator off.
 */
function stopPlaybackQueue() {
  const pending = state.scheduledSources;
  pending.forEach((source) => {
    try {
      // Detach the handler first so stopping does not edit the queue.
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];

  resetPlaybackClock();

  const timer = state.botUiTimer;
  if (timer) {
    clearTimeout(timer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
|
|
|
|
/**
 * Re-anchor the playback schedule at the AudioContext's current time.
 * No-op while no AudioContext exists.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
|
|
|
|
/* --------------------------------------------------------- Chat updates */
|
|
|
|
/**
 * Render a user transcript as a new "user" bubble.
 *
 * Also detaches the current assistant bubble, so later assistant text
 * starts a fresh bubble. Empty transcripts are ignored.
 *
 * @param {string} text - Transcript text from the server.
 */
function handleUserTranscript(text) {
  if (text) {
    state.currentAssistantBubble = null;
    addBubble("user", text);
  }
}
|
|
|
|
/**
 * Send typed text to the engine as an interrupting `input.text` message.
 *
 * @param {string} text - Raw composer text; surrounding whitespace is trimmed.
 * @returns {boolean} True if the message was sent; false if the text was
 *   empty or the websocket is not open.
 */
function sendText(text) {
  const message = (text || "").trim();
  const socket = state.ws;
  const socketOpen = Boolean(socket) && socket.readyState === WebSocket.OPEN;
  if (message.length === 0 || !socketOpen) {
    return false;
  }

  const payload = JSON.stringify({
    type: "input.text",
    text: message,
    interrupt: true,
  });
  socket.send(payload);

  // Silence any bot audio still playing so the next reply is heard cleanly.
  stopPlaybackQueue();

  // The engine does not echo text input back as a transcript event, so the
  // user bubble is rendered locally. `currentAssistantBubble` is left alone
  // on purpose: the engine emits `response.text.final(interrupted=true)`
  // for the in-flight assistant turn, which finalizes that bubble in place,
  // and `response.text.started` opens a new bubble for the reply.
  addBubble("user", message);
  return true;
}
|
|
|
|
/**
 * Append a streamed text fragment to the current assistant bubble,
 * creating that bubble first if none is open. Empty fragments are ignored.
 *
 * @param {string} text - Text fragment from `response.text.delta`.
 */
function handleAssistantDelta(text) {
  if (!text) return;

  let bubble = state.currentAssistantBubble;
  if (!bubble) {
    bubble = addBubble("assistant", "");
    state.currentAssistantBubble = bubble;
  }
  appendToBubble(bubble, text);
}
|
|
|
|
/**
 * Mark the start of a new assistant text response: the next delta will
 * open a fresh bubble rather than extend the previous one.
 */
function handleAssistantStarted() {
  state.currentAssistantBubble = null;
}
|
|
|
|
/**
 * Finish the current assistant turn.
 *
 * With non-empty text, the open bubble's text is replaced by the final
 * text (or a new bubble is created if none is open), and the bubble is
 * flagged `bubble--interrupted` when requested. In every case the current
 * assistant bubble is released afterwards.
 *
 * @param {string} text - Final text of the response; may be empty.
 * @param {boolean} interrupted - Whether the response was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  const release = () => {
    state.currentAssistantBubble = null;
  };

  if (!text) {
    release();
    return;
  }

  const open = state.currentAssistantBubble;
  if (open) {
    open.querySelector(".bubble__text").textContent = text;
  } else {
    state.currentAssistantBubble = addBubble("assistant", text);
  }

  if (interrupted) {
    state.currentAssistantBubble.classList.add("bubble--interrupted");
  }

  release();
  scrollChatToBottom();
}
|
|
|
|
/**
 * Release the current assistant bubble without touching its contents, so
 * later text deltas start a new bubble.
 */
function finalizeAssistantBubble() {
  state.currentAssistantBubble = null;
}
|
|
|
|
/* ---------------------------------------------------------- Websocket IO */
|
|
|
|
/**
 * Decode a base64 string holding little-endian signed 16-bit PCM.
 *
 * Bytes are combined explicitly as little-endian (matching the
 * `pcm_s16le` encoding announced in the session handshake) instead of
 * relying on the platform byte order of a typed-array view. A trailing
 * odd byte is dropped.
 *
 * @param {string} b64 - Base64-encoded audio payload.
 * @returns {Int16Array} Decoded samples (empty for an empty payload).
 * @throws {DOMException} From `atob` if the input is not valid base64.
 */
function decodeBase64ToInt16(b64) {
  const binary = atob(b64);
  const sampleCount = Math.floor(binary.length / 2);
  const samples = new Int16Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    const lo = binary.charCodeAt(2 * i);
    const hi = binary.charCodeAt(2 * i + 1);
    // Storing into an Int16Array wraps the 0..65535 value to signed 16-bit.
    samples[i] = lo | (hi << 8);
  }
  return samples;
}
|
|
|
|
/**
 * Dispatch one parsed JSON message from the server by its `type`.
 *
 * Payloads that are not objects (JSON allows `null` and primitives) are
 * logged and ignored, as are unknown event types. An audio frame that
 * cannot be decoded is dropped with a warning rather than throwing out of
 * the websocket message listener.
 *
 * @param {*} event - Result of `JSON.parse` on a text frame.
 */
function handleEvent(event) {
  if (!event || typeof event !== "object") {
    console.debug("ws event", event);
    return;
  }

  switch (event.type) {
    case "response.audio.delta": {
      if (typeof event.audio !== "string") break;
      let samples;
      try {
        samples = decodeBase64ToInt16(event.audio);
      } catch (err) {
        console.warn("Dropping undecodable audio frame", err);
        break;
      }
      schedulePlayback(samples);
      break;
    }
    case "response.audio.started":
      setBotIndicator(true);
      break;
    case "response.audio.stopped":
      finalizeAssistantBubble();
      // The indicator turns off automatically when the playback queue drains.
      break;
    case "response.text.delta":
      handleAssistantDelta(event.text);
      break;
    case "response.text.started":
      handleAssistantStarted();
      break;
    case "response.text.final":
      handleAssistantFinal(event.text, event.interrupted);
      break;
    case "input.transcript.final":
      handleUserTranscript(event.text);
      break;
    case "transport.message":
      // Reserved for future structured messages; ignore silently.
      break;
    default:
      // Unknown event type: log for debugging.
      console.debug("ws event", event);
  }
}
|
|
|
|
/**
 * Open the websocket named in the URL field and start a session.
 *
 * Steps: validate the URL, warm up the AudioContext, create the socket,
 * then wire its listeners:
 *  - open:    mark connected, send `session.start`, enable the composer.
 *  - message: JSON text frames go to `handleEvent`; binary frames are
 *             played as PCM16 (a trailing odd byte is ignored and empty
 *             frames are skipped).
 *  - error:   show a connection error status.
 *  - close:   stop mic and playback, reset the UI, and report whether an
 *             established session ended or the attempt failed.
 *
 * Does nothing if already connected or connecting.
 *
 * @returns {Promise<void>} Settles once the socket has been created and
 *   wired (not when it opens), or after an early failure was reported.
 */
async function connect() {
  if (state.connected || state.connecting) return;
  const url = (els.url.value || "").trim();
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }

  state.connecting = true;
  setStatus("connecting", "Connecting…");
  setConnectButton();

  // Shared bail-out for failures that happen before a socket exists.
  const failBeforeSocket = (statusText) => {
    state.connecting = false;
    setStatus("error", statusText);
    setConnectButton();
  };

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    failBeforeSocket("Audio init failed");
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    failBeforeSocket("Bad URL");
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", () => {
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    setStatus("connected", "Connected");
    setConnectButton();
    setMicButton();

    ws.send(
      JSON.stringify({
        type: "session.start",
        protocol: PROTOCOL,
        audio: {
          encoding: "pcm_s16le",
          sample_rate: SAMPLE_RATE,
          channels: CHANNELS,
        },
      }),
    );
    addBubble("system", "Session started.");
    setComposerEnabled(true);
    els.textInput.focus();
  });

  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        return;
      }
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      // An Int16Array view over an odd-length buffer throws, so the view is
      // sized explicitly to whole samples.
      const sampleCount = Math.floor(data.byteLength / 2);
      if (sampleCount > 0) {
        schedulePlayback(new Int16Array(data, 0, sampleCount));
      }
    }
  });

  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "Connection error");
  });

  ws.addEventListener("close", (event) => {
    // Distinguishes "session ended" from "never managed to connect".
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setComposerEnabled(false);
    setBotIndicator(false);
    if (wasConnected) {
      addBubble(
        "system",
        `Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
      );
      setStatus("idle", "Disconnected");
    } else {
      setStatus("error", "Connection closed");
    }
  });
}
|
|
|
|
/**
 * Ask the server to stop the session and close the websocket.
 *
 * A `session.stop` message is sent only if the socket is open; both the
 * send and the close are best-effort. State and UI are reset by the
 * socket's "close" listener, not here.
 */
function disconnect() {
  const socket = state.ws;
  if (!socket) return;

  const attempt = (step) => {
    try {
      step();
    } catch (_) {
      /* ignore */
    }
  };

  attempt(() => {
    if (socket.readyState !== WebSocket.OPEN) return;
    const farewell = JSON.stringify({
      type: "session.stop",
      reason: "client_disconnect",
    });
    socket.send(farewell);
  });

  attempt(() => socket.close(1000, "client_disconnect"));
}
|
|
|
|
/* ---------------------------------------------------------------- Wiring */
|
|
|
|
// Connect button: toggles between opening and closing the session.
els.connectBtn.addEventListener("click", () => {
  const action = state.connected ? disconnect : connect;
  action();
});

// Mic button: toggles capture. The button is locked while the toggle is
// in progress, and failures are surfaced as a system bubble.
els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;

  const button = els.micBtn;
  button.disabled = true;
  try {
    if (!state.micEnabled) {
      await startMic();
    } else {
      stopMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `Mic error: ${err.message || err}`);
  } finally {
    button.disabled = !state.connected;
  }
});

// Clear button: wipes the chat log.
els.clearBtn.addEventListener("click", () => {
  clearChat();
});
|
|
|
|
/**
 * Resize the text input to fit its content, capped at 180px.
 *
 * The height is reset to "auto" first so that `scrollHeight` reflects the
 * content rather than the previously applied height.
 */
function autosizeTextarea() {
  const maxHeightPx = 180;
  const input = els.textInput;
  input.style.height = "auto";
  const fitted = Math.min(input.scrollHeight, maxHeightPx);
  input.style.height = `${fitted}px`;
}
|
|
|
|
/**
 * Send the composer's current text. On success the input is cleared and
 * resized and the composer controls are refreshed; if nothing was sent
 * the input is left untouched.
 */
function submitText() {
  const input = els.textInput;
  const wasSent = sendText(input.value);
  if (wasSent) {
    input.value = "";
    autosizeTextarea();
    setComposerEnabled(state.connected);
  }
}
|
|
|
|
// Form submission (send button): send without reloading the page.
els.composer.addEventListener("submit", (event) => {
  event.preventDefault();
  submitText();
});

// Typing: keep the textarea height and the send button state in sync.
els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Enter sends; Shift+Enter and Enter during IME composition do not.
els.textInput.addEventListener("keydown", (event) => {
  const isPlainEnter =
    event.key === "Enter" && !event.shiftKey && !event.isComposing;
  if (!isPlainEnter) return;
  event.preventDefault();
  submitText();
});

// Page unload: best-effort release of the socket and the audio context.
window.addEventListener("beforeunload", () => {
  const attempt = (step) => {
    try {
      step();
    } catch (_) {
      /* ignore */
    }
  };

  if (state.ws) {
    attempt(() => state.ws.close());
  }
  if (state.audioContext) {
    attempt(() => state.audioContext.close());
  }
});

// Initial UI: disconnected, with mic and composer locked.
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setComposerEnabled(false);
|