Files
engine-v5-pipecat-core/examples/webpage/app.js
2026-05-31 22:46:48 +08:00

1157 lines
32 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Minimal browser client for the AI VideoAssistant engine's product
* websocket (`/ws-product`, protocol `va.ws.v1`).
*
* Responsibilities:
* - Open/close the websocket and run the session handshake.
* - List/select microphones and capture mic audio with browser AEC enabled.
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
* as binary websocket messages.
* - Play `response.audio.delta` frames gaplessly through Web Audio.
* - Render a chat-style history of user transcripts and bot text deltas.
* - Collapse high-frequency audio frames into expandable websocket log groups.
*/
/** Sample rate (Hz) declared in `session.start`, requested from the recorder worklet and used for playback buffers. */
const SAMPLE_RATE = 16000;
/** Channel count declared in `session.start` and used for playback buffers. */
const CHANNELS = 1;
/** Frame duration in milliseconds handed to the recorder worklet (`processorOptions.frameMs`). */
const FRAME_MS = 20;
/** Protocol identifier sent in the `session.start` message. */
const PROTOCOL = "va.ws.v1";
/** Maximum number of top-level rows kept in the websocket log; older rows are removed. */
const MAX_WS_LOG_LINES = 120;
/** Maximum number of child rows rendered inside one expanded log group. */
const MAX_GROUP_CHILDREN_RENDER = 100;
/**
 * Keys identifying the collapsible groups used for high-frequency frames.
 * Frozen: this is a shared lookup table and must not be mutated at runtime.
 */
const WS_LOG_GROUP_KEYS = Object.freeze({
  AUDIO_DELTA: "recv:response.audio.delta",
  TEXT_DELTA: "recv:response.text.delta",
  AUDIO_SEND: "send:input.audio",
});
/** Text sent as an `input.text` message when the camera "done" button is pressed. */
const CAMERA_DONE_TEXT = "【拍摄完成】";
/**
 * Prompt shown in the camera drawer for each assistant state that has one.
 * A state without an entry keeps the drawer closed. Frozen for the same
 * reason as `WS_LOG_GROUP_KEYS`.
 */
const CAMERA_STATE_PROMPTS = Object.freeze({
  2000: "请对准车辆碰撞部位拍摄照片。",
  2001: "请对准车辆碰撞部位拍摄照片。",
  2002: "请对准被撞物品拍摄照片。",
  2003: "请切换摄像头对准本人拍摄一张正面照片。",
  2010: "请对准第一辆车碰撞部位拍摄。",
  2011: "请对准第一辆车碰撞部位拍摄。",
  2012: "请对准第二辆车碰撞部位拍摄。",
  2013: "请对准第二方车辆侧后方,看清车牌拍摄。",
  2014: "请拍摄另一方驾驶人的正面照片。",
  2015: "请切换前置摄像头对准本人拍摄一张正面照片。",
});
/**
 * Build the default product websocket URL for the page's own host.
 *
 * @returns {string} `wss://<host>/ws-product` on an https page, otherwise
 *   `ws://<host>/ws-product`.
 */
function defaultWsUrl() {
  const isSecurePage = location.protocol === "https:";
  const wsScheme = isSecurePage ? "wss:" : "ws:";
  return [wsScheme, "//", location.host, "/ws-product"].join("");
}
/**
 * DOM elements the client reads and updates, looked up once at load time.
 * Every entry is found by element id except `micLabel`, which is found by
 * class selector.
 */
const els = (() => {
  const byId = (id) => document.getElementById(id);
  return {
    url: byId("ws-url"),
    chatId: byId("chat-id"),
    connectBtn: byId("connect-btn"),
    statusDot: byId("status-dot"),
    statusText: byId("status-text"),
    conversation: byId("conversation"),
    chatLog: byId("chat-log"),
    micBtn: byId("mic-btn"),
    micSelect: byId("mic-select"),
    micLabel: document.querySelector(".mic-btn__label"),
    micIndicator: byId("mic-indicator"),
    botIndicator: byId("bot-indicator"),
    stateIndicator: byId("state-indicator"),
    stateLabel: byId("state-label"),
    cameraDrawer: byId("camera-drawer"),
    cameraState: byId("camera-state"),
    cameraQuestion: byId("camera-question"),
    cameraDoneBtn: byId("camera-done-btn"),
    clearBtn: byId("clear-btn"),
    clearWsLogBtn: byId("clear-ws-log-btn"),
    wsLog: byId("ws-log"),
    meterFill: byId("meter-fill"),
    composer: byId("composer"),
    textInput: byId("text-input"),
    sendBtn: byId("send-btn"),
  };
})();
/**
 * Read the websocket URL field and attach the chat id as a `chatId` query
 * parameter.
 *
 * @returns {string} The trimmed URL unchanged when either field is empty;
 *   otherwise the URL with `chatId` set. If the URL cannot be parsed, the
 *   parameter is appended textually instead.
 */
function wsUrlWithChatId() {
  const endpoint = (els.url.value || "").trim();
  const id = (els.chatId.value || "").trim();
  if (endpoint === "" || id === "") return endpoint;

  try {
    const parsed = new URL(endpoint, location.href);
    parsed.searchParams.set("chatId", id);
    return parsed.href;
  } catch (_) {
    // Unparseable URL: fall back to plain string assembly.
    const joiner = endpoint.includes("?") ? "&" : "?";
    const encodedId = encodeURIComponent(id);
    return `${endpoint}${joiner}chatId=${encodedId}`;
  }
}
/**
 * Mutable client state shared by every handler in this file.
 */
const state = {
// Connection: the active WebSocket (null when closed) and its lifecycle flags.
ws: null,
connected: false,
connecting: false,
// Audio capture: the shared AudioContext, the mic stream, and the nodes
// that feed it into the recorder worklet.
audioContext: null,
micStream: null,
micSourceNode: null,
recorderNode: null,
micEnabled: false,
// Audio input devices found by enumerateDevices() and the selected device
// id ("" selects the default microphone).
micDevices: [],
selectedMicDeviceId: "",
// Output scheduling: AudioContext times for the next chunk start and the
// end of queued playback, the sources still scheduled, and the timer that
// switches the bot indicator off.
nextPlaybackTime: 0,
playbackEndsAt: 0,
scheduledSources: [],
botActive: false,
botUiTimer: null,
// Chat state: the assistant bubble currently receiving text deltas, the
// last assistant state string, and the state that opened the camera drawer
// ("" when the drawer is closed).
currentAssistantBubble: null,
assistantState: "",
cameraState: "",
// VU meter smoothing.
meterLevel: 0,
// The websocket log group currently collecting consecutive high-frequency
// events, or null when the next grouped event must start a new group.
wsLogGroup: null,
};
/* ------------------------------------------------------------------ UI */
/**
 * Update the connection status dot and its label.
 *
 * @param {string} kind - Modifier appended to the `status__dot--` class.
 * @param {string} text - Label shown next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  statusDot.className = ["status__dot", `status__dot--${kind}`].join(" ");
  statusText.textContent = text;
}
/**
 * Sync the connect button and the chat-id field with the connection state.
 * The chat-id field is locked while connecting or connected; the button
 * reads "Connecting…" (disabled), "Disconnect" or "Connect".
 */
function setConnectButton() {
  const { connected, connecting } = state;
  const button = els.connectBtn;
  els.chatId.disabled = connected || connecting;

  if (connecting) {
    button.textContent = "Connecting…";
    button.disabled = true;
    button.classList.remove("is-disconnect");
    return;
  }

  button.textContent = connected ? "Disconnect" : "Connect";
  button.disabled = false;
  if (connected) {
    button.classList.add("is-disconnect");
  } else {
    button.classList.remove("is-disconnect");
  }
}
/**
 * Sync the mic button, its label and the mic indicator with the capture
 * state. The button is only usable while connected.
 */
function setMicButton() {
  const capturing = state.micEnabled;
  const button = els.micBtn;
  button.disabled = !state.connected;
  button.setAttribute("aria-pressed", capturing ? "true" : "false");
  if (capturing) {
    button.title = "Mute mic";
    els.micLabel.textContent = "Mute mic";
  } else {
    button.title = "Unmute mic";
    els.micLabel.textContent = "Enable mic";
  }
  els.micIndicator.classList.toggle("is-active", capturing);
}
/**
 * Enable the microphone selector only while connected and when the browser
 * exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const selectable = state.connected && navigator.mediaDevices;
  els.micSelect.disabled = !selectable;
}
/**
 * Enable or disable the text composer. The send button additionally stays
 * disabled while the input holds only whitespace. Also refreshes the
 * camera button.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  const locked = !enabled;
  els.textInput.disabled = locked;
  els.sendBtn.disabled = locked ? true : els.textInput.value.trim() === "";
  setCameraButtonEnabled();
}
/**
 * Switch the bot indicator on or off.
 *
 * @param {boolean} active - Passed as the force flag for the `is-active` class.
 */
function setBotIndicator(active) {
  const indicator = els.botIndicator;
  indicator.classList.toggle("is-active", active);
}
/**
 * Record the assistant state and reflect it in the state indicator and the
 * camera drawer.
 *
 * Non-string values are treated as "no state". A state longer than 32
 * characters is shortened in the label to 31 characters plus an ellipsis,
 * so the truncation is visible; the tooltip always carries the full text.
 *
 * @param {unknown} value - State reported by the engine.
 */
function setAssistantState(value) {
  const text = typeof value === "string" ? value.trim() : "";
  const label = text.length > 32 ? `${text.slice(0, 31)}…` : text;
  state.assistantState = text;
  els.stateIndicator.classList.toggle("is-active", Boolean(text));
  els.stateLabel.textContent = label ? `State ${label}` : "State -";
  els.stateIndicator.title = label ? `Assistant state: ${text}` : "Assistant state";
  syncCameraDrawer(text);
}
/**
 * Enable the camera "done" button only when connected, a camera state is
 * active and the websocket is open. Does nothing if the button is absent.
 */
function setCameraButtonEnabled() {
  const button = els.cameraDoneBtn;
  if (!button) return;
  const usable = Boolean(
    state.connected &&
      state.cameraState &&
      state.ws &&
      state.ws.readyState === WebSocket.OPEN,
  );
  button.disabled = !usable;
}
/**
 * Open or close the camera drawer for the given assistant state.
 *
 * The drawer opens only when `value` is one of the table's own keys. A
 * plain `CAMERA_STATE_PROMPTS[value]` lookup would also match inherited
 * properties such as "toString" or "constructor" and open the drawer with
 * a function as its prompt.
 *
 * @param {string} value - Trimmed assistant state ("" when there is none).
 */
function syncCameraDrawer(value) {
  const known = Object.prototype.hasOwnProperty.call(CAMERA_STATE_PROMPTS, value);
  const prompt = known ? CAMERA_STATE_PROMPTS[value] : "";
  const open = Boolean(prompt);
  state.cameraState = open ? value : "";
  els.cameraDrawer.classList.toggle("is-open", open);
  els.conversation.classList.toggle("has-camera", open);
  els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
  if (open) {
    els.cameraState.textContent = `State ${value}`;
    els.cameraQuestion.textContent = prompt;
  } else {
    els.cameraState.textContent = "State -";
    els.cameraQuestion.textContent = "";
  }
  setCameraButtonEnabled();
}
/**
 * Replace the camera drawer question with the given text, but only while a
 * camera state is active and the text is a non-blank string.
 *
 * @param {unknown} text - Candidate question text.
 */
function updateCameraQuestion(text) {
  if (typeof text !== "string") return;
  const question = text.trim();
  if (question === "" || !state.cameraState) return;
  els.cameraQuestion.textContent = question;
}
/**
 * Append a chat bubble to the chat log and scroll it into view. Any empty
 * placeholder in the log is cleared first.
 *
 * @param {string} role - "user", "assistant" or "system"; system bubbles
 *   get no role tag.
 * @param {string} text - Initial bubble text.
 * @returns {HTMLDivElement} The new bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    const roleName = role === "user" ? "You" : "Assistant";
    bubble.appendChild(makeSpan("bubble__role", roleName));
  }
  bubble.appendChild(makeSpan("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append text to a bubble's text span and keep the chat scrolled to the end.
 *
 * @param {Element} bubble - Bubble created by `addBubble`.
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const textSpan = bubble.querySelector(".bubble__text");
  textSpan.textContent = textSpan.textContent + text;
  scrollChatToBottom();
}
/** Scroll the chat log to its last entry. */
function scrollChatToBottom() {
  const log = els.chatLog;
  log.scrollTop = log.scrollHeight;
}
/**
 * Empty the chat log, forget the in-progress assistant bubble, reset the
 * assistant state and show a "Chat cleared." placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;
  setAssistantState("");

  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>Chat cleared.</p>";
  log.appendChild(placeholder);
}
/**
 * Convert a value to a string and shorten it for display in the log.
 *
 * A string longer than `maxLength` is cut to `maxLength - 1` characters
 * followed by an ellipsis, so the result is at most `maxLength` long and
 * the truncation is visible.
 *
 * @param {unknown} value - Value to display.
 * @param {number} [maxLength=160] - Maximum length of the returned string.
 * @returns {string} The possibly shortened text; "" when text must be cut
 *   and `maxLength` is below 1.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  // A negative slice end would count from the end of the string instead of
  // truncating, so handle the degenerate limit explicitly.
  if (maxLength < 1) return "";
  return `${text.slice(0, maxLength - 1)}…`;
}
/**
 * Format a timestamp as a 24-hour `HH:MM:SS`-style string in the default
 * locale.
 *
 * `hourCycle: "h23"` is used instead of `hour12: false` because the latter
 * can select the 1-24 cycle and render midnight as "24:xx:xx".
 *
 * @param {Date} [date] - Time to format; defaults to now.
 * @returns {string} Localized time string.
 */
function formatLogTime(date = new Date()) {
  return date.toLocaleTimeString([], {
    hourCycle: "h23",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  });
}
/**
 * Render a byte count for the log: MB with two decimals from 1 MiB up, KB
 * with one decimal from 1 KiB up, otherwise a plain byte count.
 *
 * @param {number} byteCount - Size in bytes.
 * @returns {string} Human-readable size.
 */
function formatLogBytes(byteCount) {
  const units = [
    [1048576, 2, "MB"],
    [1024, 1, "KB"],
  ];
  for (const [size, digits, suffix] of units) {
    if (byteCount >= size) {
      return `${(byteCount / size).toFixed(digits)} ${suffix}`;
    }
  }
  return `${byteCount} bytes`;
}
/**
 * Map a log group key to the label shown in the group summary.
 *
 * @param {string} groupKey - One of the `WS_LOG_GROUP_KEYS` values.
 * @returns {string} Display label; "grouped events" for an unknown key.
 */
function wsLogGroupLabel(groupKey) {
  switch (groupKey) {
    case WS_LOG_GROUP_KEYS.AUDIO_DELTA:
      return "response.audio.delta";
    case WS_LOG_GROUP_KEYS.TEXT_DELTA:
      return "response.text.delta";
    case WS_LOG_GROUP_KEYS.AUDIO_SEND:
      return "input.audio binary";
    default:
      return "grouped events";
  }
}
/** Remove the "empty" placeholder from the websocket log, if it is present. */
function ensureWsLogReady() {
  const log = els.wsLog;
  const placeholder = log.querySelector(".ws-log__empty");
  if (!placeholder) return;
  log.innerHTML = "";
}
/** Scroll the websocket log to its last entry. */
function scrollWsLogToBottom() {
  const log = els.wsLog;
  log.scrollTop = log.scrollHeight;
}
/**
 * Drop the oldest top-level log rows until at most `MAX_WS_LOG_LINES`
 * remain. If the removed row is the active group, the active group is
 * forgotten so later events start a new one.
 */
function trimWsLog() {
  const log = els.wsLog;
  while (log.children.length > MAX_WS_LOG_LINES) {
    const oldest = log.firstElementChild;
    const active = state.wsLogGroup;
    if (active && active.element === oldest) {
      state.wsLogGroup = null;
    }
    oldest.remove();
  }
}
/**
 * Close the active websocket log group. The group's DOM stays in the log;
 * the next grouped event will create a new group.
 */
function finalizeWsLogGroup() {
state.wsLogGroup = null;
}
/**
 * Build one websocket log row made of time, direction and detail spans.
 *
 * @param {string} direction - "send", "recv" or another label (shown
 *   upper-cased).
 * @param {string} detail - Row text.
 * @param {string} kind - Modifier appended to the `ws-log__entry--` class.
 * @param {string} [timeText] - Preformatted time; defaults to the current time.
 * @returns {HTMLDivElement} The row element, not yet attached to the log.
 */
function createWsLogEntry(direction, detail, kind, timeText = formatLogTime()) {
  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  let directionLabel;
  if (direction === "send") {
    directionLabel = "SEND";
  } else if (direction === "recv") {
    directionLabel = "RECV";
  } else {
    directionLabel = direction.toUpperCase();
  }

  const row = document.createElement("div");
  row.className = `ws-log__entry ws-log__entry--${kind}`;
  row.append(
    makeSpan("ws-log__time", timeText),
    makeSpan("ws-log__direction", directionLabel),
    makeSpan("ws-log__detail", detail),
  );
  return row;
}
/**
 * Refresh a group's summary line: label, event count and total size.
 *
 * @param {object} group - Group record created by `appendWsLogGroupItem`.
 */
function updateWsLogGroupSummary(group) {
  const label = wsLogGroupLabel(group.key);
  const size = formatLogBytes(group.totalBytes);
  group.summaryEl.textContent = `${label} ×${group.count} (${size})`;
}
/**
 * Append one child row to a group's children container. When the container
 * then holds more than `MAX_GROUP_CHILDREN_RENDER` rows, the oldest row is
 * removed and an "omitted" notice is inserted at the top if none exists.
 *
 * @param {object} group - Group record created by `appendWsLogGroupItem`.
 * @param {{time: string, detail: string}} item - Event to render.
 */
function appendWsLogGroupChildDom(group, item) {
  const container = group.childrenEl;
  const row = createWsLogEntry(group.direction, item.detail, group.kind, item.time);
  row.classList.add("ws-log__entry--child");
  container.appendChild(row);

  const rows = container.querySelectorAll(".ws-log__entry");
  if (rows.length <= MAX_GROUP_CHILDREN_RENDER) return;

  if (!container.querySelector(".ws-log__group-omit")) {
    const notice = document.createElement("div");
    notice.className = "ws-log__group-omit";
    notice.textContent = "… earlier events omitted";
    container.insertBefore(notice, container.firstElementChild);
  }
  // `rows` is a static snapshot, so index 0 is still the oldest row.
  rows[0].remove();
}
/**
 * Rebuild the child rows of a group from its retained items.
 *
 * Only the newest `MAX_GROUP_CHILDREN_RENDER` items are rendered. The
 * "omitted" notice counts every event that is not shown, including events
 * already evicted from `group.items`.
 *
 * @param {object} group - Group record created by `appendWsLogGroupItem`.
 */
function renderWsLogGroupChildren(group) {
  group.childrenEl.innerHTML = "";
  const items = group.items;
  const start = Math.max(0, items.length - MAX_GROUP_CHILDREN_RENDER);
  const shown = items.length - start;
  const omittedCount = Math.max(group.count, items.length) - shown;
  if (omittedCount > 0) {
    const omitted = document.createElement("div");
    omitted.className = "ws-log__group-omit";
    omitted.textContent = `${omittedCount} earlier events omitted`;
    group.childrenEl.appendChild(omitted);
  }
  for (let i = start; i < items.length; i += 1) {
    appendWsLogGroupChildDom(group, items[i]);
  }
}

/**
 * Expand or collapse a group.
 *
 * Children are re-rendered on every expand: rows are not appended while a
 * group is collapsed, so reusing the DOM from an earlier expand would hide
 * events that arrived in between.
 *
 * @param {object} group - Group record created by `appendWsLogGroupItem`.
 */
function toggleWsLogGroup(group) {
  group.collapsed = !group.collapsed;
  group.childrenEl.hidden = group.collapsed;
  group.chevronEl.textContent = group.collapsed ? "▶" : "▼";
  group.headerEl.setAttribute("aria-expanded", group.collapsed ? "false" : "true");
  if (!group.collapsed) {
    renderWsLogGroupChildren(group);
  }
}

/**
 * Add one high-frequency event to the active log group, starting a new
 * collapsed group when the key differs from the active group's key.
 *
 * The group keeps only the newest `MAX_GROUP_CHILDREN_RENDER` items, since
 * older ones are never rendered; `count` and `totalBytes` still cover every
 * event. This bounds memory during long audio streams.
 *
 * @param {string} groupKey - One of the `WS_LOG_GROUP_KEYS` values.
 * @param {string} direction - "send" or "recv".
 * @param {string} kind - Modifier used in the group and row class names.
 * @param {string} itemDetail - Text of the child row.
 * @param {number} [byteCount=0] - Size added to the group total.
 */
function appendWsLogGroupItem(groupKey, direction, kind, itemDetail, byteCount = 0) {
  ensureWsLogReady();
  let group = state.wsLogGroup;
  if (!group || group.key !== groupKey) {
    finalizeWsLogGroup();
    const groupEl = document.createElement("div");
    groupEl.className = `ws-log__group ws-log__group--${kind}`;
    const header = document.createElement("button");
    header.type = "button";
    header.className = "ws-log__group-header";
    header.setAttribute("aria-expanded", "false");
    const time = document.createElement("span");
    time.className = "ws-log__time";
    time.textContent = formatLogTime();
    const dir = document.createElement("span");
    dir.className = "ws-log__direction";
    dir.textContent = direction === "send" ? "SEND" : "RECV";
    const chevron = document.createElement("span");
    chevron.className = "ws-log__group-chevron";
    chevron.textContent = "▶";
    chevron.setAttribute("aria-hidden", "true");
    const summary = document.createElement("span");
    summary.className = "ws-log__group-summary";
    header.append(time, dir, chevron, summary);
    const children = document.createElement("div");
    children.className = "ws-log__group-children";
    children.hidden = true;
    groupEl.append(header, children);
    els.wsLog.appendChild(groupEl);
    group = {
      key: groupKey,
      direction,
      kind,
      element: groupEl,
      headerEl: header,
      chevronEl: chevron,
      summaryEl: summary,
      childrenEl: children,
      collapsed: true,
      count: 0,
      totalBytes: 0,
      items: [],
    };
    state.wsLogGroup = group;
    header.addEventListener("click", () => toggleWsLogGroup(group));
  }
  group.count += 1;
  group.totalBytes += byteCount;
  const item = { time: formatLogTime(), detail: itemDetail };
  group.items.push(item);
  if (group.items.length > MAX_GROUP_CHILDREN_RENDER) {
    // Evict the oldest retained item; it could never be rendered again.
    group.items.shift();
  }
  updateWsLogGroupSummary(group);
  if (!group.collapsed) {
    appendWsLogGroupChildDom(group, item);
  }
  trimWsLog();
  scrollWsLogToBottom();
}
/**
 * Produce a short, log-friendly JSON string for a websocket payload.
 * String `audio` fields and `data` strings over 160 characters are replaced
 * by length placeholders, and `text` is truncated.
 *
 * @param {unknown} payload - Parsed message; non-objects are stringified.
 * @returns {string} JSON text, or `payload.type` (or a fixed message) when
 *   the payload cannot be serialized.
 */
function compactWsPayload(payload) {
  const isObject = Boolean(payload) && typeof payload === "object";
  if (!isObject) return String(payload);

  const summary = { ...payload };
  const { audio, data, text } = summary;
  if (typeof audio === "string") {
    summary.audio = `<base64 ${audio.length} chars>`;
  }
  if (typeof data === "string" && data.length > 160) {
    summary.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    summary.text = truncateLogValue(text);
  }

  try {
    return JSON.stringify(summary);
  } catch (_) {
    return payload.type || "unserializable websocket payload";
  }
}
/**
 * Append a standalone row to the websocket log. This closes the active
 * group, trims the log and scrolls to the end.
 *
 * @param {string} direction - "send", "recv" or a label such as "system".
 * @param {string} detail - Row text.
 * @param {string} [kind=direction] - Modifier used in the row class name.
 */
function addWsLog(direction, detail, kind = direction) {
  finalizeWsLogGroup();
  ensureWsLogReady();
  const row = createWsLogEntry(direction, detail, kind);
  els.wsLog.appendChild(row);
  trimWsLog();
  scrollWsLogToBottom();
}
/**
 * Log a JSON websocket message. Received `response.audio.delta` and
 * `response.text.delta` events go into their collapsible groups; every
 * other message becomes a standalone row with a compacted payload.
 *
 * @param {string} direction - "send" or "recv".
 * @param {unknown} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  const receivedType = direction === "recv" ? payload?.type : undefined;

  switch (receivedType) {
    case "response.audio.delta": {
      const size = payload.bytes || 0;
      const sizeNote = `(${size} bytes)`;
      const detail = payload.seq != null ? `seq=${payload.seq} ${sizeNote}` : sizeNote;
      appendWsLogGroupItem(WS_LOG_GROUP_KEYS.AUDIO_DELTA, "recv", "recv", detail, size);
      return;
    }
    case "response.text.delta": {
      const text = typeof payload.text === "string" ? payload.text : "";
      const size = new TextEncoder().encode(text).length;
      const quoted = JSON.stringify(truncateLogValue(text, 120));
      const detail = payload.seq != null ? `seq=${payload.seq} ${quoted}` : quoted;
      appendWsLogGroupItem(WS_LOG_GROUP_KEYS.TEXT_DELTA, "recv", "recv", detail, size);
      return;
    }
    default:
      addWsLog(direction, compactWsPayload(payload));
  }
}
/**
 * Record an outgoing binary audio frame in the `input.audio` log group.
 *
 * @param {number} byteLength - Size of the frame in bytes.
 */
function logBinarySend(byteLength) {
  const detail = `(${byteLength} bytes)`;
  appendWsLogGroupItem(WS_LOG_GROUP_KEYS.AUDIO_SEND, "send", "send", detail, byteLength);
}
/**
 * Send a message on the websocket, logging it first. String messages are
 * logged as parsed JSON when possible, otherwise as truncated raw text;
 * binary messages are logged by size.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send.
 * @returns {boolean} False when there is no open websocket, true after
 *   the message has been handed to the socket.
 */
function wsSend(data) {
  const socket = state.ws;
  if (!socket || socket.readyState !== WebSocket.OPEN) return false;

  if (typeof data === "string") {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      addWsLog("send", truncateLogValue(data));
    }
  } else {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const size = isBinary ? data.byteLength : 0;
    if (size > 0) {
      logBinarySend(size);
    }
  }

  socket.send(data);
  return true;
}
/** Reset the websocket log to its empty placeholder and drop the active group. */
function clearWsLog() {
  const placeholderHtml = '<div class="ws-log__empty">No websocket events yet.</div>';
  state.wsLogGroup = null;
  els.wsLog.innerHTML = placeholderHtml;
}
/* ---------------------------------------------------------------- Audio */
/**
 * Return the shared AudioContext, creating it and loading the recorder
 * worklet on first use, and resuming it if it is suspended.
 *
 * The context is stored in `state.audioContext` only after the worklet
 * module has loaded. If loading fails, the half-initialised context is
 * closed and the next call retries from scratch, rather than reusing a
 * context that has no `pcm-recorder` processor.
 *
 * @returns {Promise<AudioContext>} The ready context.
 * @throws {Error} When Web Audio is unavailable or the worklet cannot load.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    if (!Ctx) {
      throw new Error("Web Audio API is not supported in this browser");
    }
    const ctx = new Ctx();
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      try {
        await ctx.close();
      } catch (_) {
        /* best effort: the context is being discarded anyway */
      }
      throw err;
    }
    if (state.audioContext) {
      // Another call finished initialisation while this one was loading.
      try {
        await ctx.close();
      } catch (_) {
        /* best effort */
      }
    } else {
      state.audioContext = ctx;
    }
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
/**
 * Rebuild the microphone selector from `state.micDevices`, keeping the
 * previous selection when that device is still listed and falling back to
 * the default microphone otherwise.
 */
function renderMicDevices() {
  const select = els.micSelect;
  const remembered = state.selectedMicDeviceId || select.value;

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  select.innerHTML = "";
  addOption("", "Default microphone");
  for (const [position, device] of state.micDevices.entries()) {
    addOption(device.deviceId, device.label || `Microphone ${position + 1}`);
  }

  const stillListed = state.micDevices.some(({ deviceId }) => deviceId === remembered);
  state.selectedMicDeviceId = stillListed ? remembered : "";
  select.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
/**
 * Re-enumerate audio input devices and re-render the selector. Without
 * `enumerateDevices`, or when enumeration fails, only the selector's
 * enabled state is refreshed. Never rejects: failures are logged.
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const media = navigator.mediaDevices;
  if (!media?.enumerateDevices) {
    setMicSelectEnabled();
    return;
  }
  try {
    const allDevices = await media.enumerateDevices();
    state.micDevices = allDevices.filter(({ kind }) => kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
/**
 * Start microphone capture and stream recorder frames to the websocket.
 *
 * Requests a mono stream with echo cancellation, noise suppression and
 * auto gain control, optionally pinned to the selected device, then feeds
 * it into the `pcm-recorder` worklet. Each frame updates the VU meter and
 * is sent as a binary message while connected.
 *
 * If the connection closes while permission or device enumeration is
 * pending, or any setup step throws after the stream was acquired, the
 * stream and nodes are released so the microphone is not left open.
 *
 * @returns {Promise<void>}
 * @throws {Error} When capture is unsupported, denied, or setup fails.
 */
async function startMic() {
  const ctx = await ensureAudioContext();
  if (!navigator.mediaDevices?.getUserMedia) {
    throw new Error("Microphone capture is not supported in this browser");
  }
  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }
  state.micStream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
    video: false,
  });
  try {
    // Refresh the device list after the stream has been granted.
    await refreshMicDevices();
    if (!state.connected) {
      // The socket closed during an await; the close handler only stops a
      // mic that is already enabled, so release the stream here.
      stopMic();
      return;
    }
    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
    state.micEnabled = true;
    addWsLog("system", "mic capture started (binary input.audio frames)");
    setMicButton();
  } catch (err) {
    // Release the stream and any partially built nodes before reporting.
    stopMic();
    throw err;
  }
}
/**
 * Stop microphone capture and release the recorder node, the source node
 * and every track of the stream. Teardown errors are ignored. Resets the
 * meter and the mic button, and logs the stop only if capture was active.
 */
function stopMic() {
  const wasCapturing = state.micEnabled;
  const quietly = (action) => {
    try {
      action();
    } catch (_) {
      /* ignore */
    }
  };

  const recorder = state.recorderNode;
  if (recorder) {
    quietly(() => {
      recorder.port.onmessage = null;
      recorder.disconnect();
    });
    state.recorderNode = null;
  }

  const source = state.micSourceNode;
  if (source) {
    quietly(() => source.disconnect());
    state.micSourceNode = null;
  }

  const stream = state.micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      quietly(() => track.stop());
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasCapturing) {
    addWsLog("system", "mic capture stopped");
  }
  setMicButton();
}
/**
 * Update the VU meter from a frame's RMS level.
 *
 * @param {number} rms - RMS level of the latest frame.
 */
function updateMeter(rms) {
  // Scale so that loud speech (RMS around 0.3 and above) nears full width.
  const target = Math.min(1, rms * 2.4);
  // Average with the previous level to smooth the bar.
  const smoothed = state.meterLevel * 0.5 + target * 0.5;
  state.meterLevel = smoothed;
  const percent = Math.round(smoothed * 100);
  els.meterFill.style.width = `${percent}%`;
}
/* ---------------------------------------------------- Bot audio playback */
/**
 * Queue a chunk of PCM16 bot audio for gapless playback.
 *
 * Each chunk starts where the previous one ends, or 20 ms from now if the
 * queue has fallen behind. The bot indicator is switched on and a timer
 * switches it off shortly after the last queued chunk ends.
 *
 * Empty chunks are ignored: `createBuffer` rejects a zero-length buffer,
 * which would otherwise throw out of the websocket message handler.
 *
 * @param {Int16Array} int16 - Mono PCM16 samples at `SAMPLE_RATE`.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx || int16.length === 0) return;
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Map [-32768, 32767] onto [-1, 1] with a separate scale per sign.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);
  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);
  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);
  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    state.botUiTimer = null;
    if (state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
/**
 * Stop and discard every scheduled bot audio source, reset the playback
 * clock, cancel the indicator timer and switch the bot indicator off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];
  resetPlaybackClock();

  const pendingTimer = state.botUiTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
/**
 * Point the playback schedule at the audio context's current time. Does
 * nothing before the context exists.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
/* --------------------------------------------------------- Chat updates */
/**
 * Render a committed user transcript as a user bubble. Closes the current
 * assistant bubble so the next reply starts a new one. Empty text is ignored.
 *
 * @param {string} text - Final transcript text.
 */
function handleUserTranscript(text) {
  const hasText = Boolean(text);
  if (!hasText) return;
  state.currentAssistantBubble = null;
  addBubble("user", text);
}
/**
 * Send typed text to the engine as an interrupting `input.text` message.
 *
 * @param {string} text - Raw text; it is trimmed before sending.
 * @returns {boolean} False when the text is blank or the websocket is not
 *   open; true once the message has been sent.
 */
function sendText(text) {
  const trimmed = (text || "").trim();
  if (trimmed === "") return false;

  const socket = state.ws;
  const socketOpen = Boolean(socket) && socket.readyState === WebSocket.OPEN;
  if (!socketOpen) return false;

  // The engine does not echo text input back as a transcript event, so the
  // user bubble is rendered locally, and any in-flight bot audio is stopped
  // so the next reply is heard cleanly. `currentAssistantBubble` is left
  // untouched on purpose: the engine emits
  // `response.text.final(interrupted=true)` for the in-flight assistant
  // turn, which finalizes that bubble in place, and a new bubble is created
  // once `response.text.started` arrives for the reply.
  const payload = JSON.stringify({
    type: "input.text",
    text: trimmed,
    interrupt: true,
  });
  wsSend(payload);
  stopPlaybackQueue();
  addBubble("user", trimmed);
  return true;
}
/**
 * Append a streamed text delta to the current assistant bubble, creating
 * the bubble on the first delta of a turn. Empty deltas are ignored.
 *
 * @param {string} text - Delta text.
 */
function handleAssistantDelta(text) {
  if (!text) return;
  const bubble = state.currentAssistantBubble || addBubble("assistant", "");
  state.currentAssistantBubble = bubble;
  appendToBubble(bubble, text);
}
/**
 * Handle `response.text.started`: forget the current assistant bubble so
 * the next text delta creates a new one.
 */
function handleAssistantStarted() {
state.currentAssistantBubble = null;
}
/**
 * Finalize the assistant turn when `response.text.final` arrives.
 *
 * With text, the current bubble's content is replaced by the final text
 * (or a new bubble is created), the camera question is refreshed and the
 * bubble is closed. Without text, the bubble is just closed. In both cases
 * an interrupted turn marks the existing bubble with
 * `bubble--interrupted`; previously the marker was dropped when the final
 * text was empty.
 *
 * @param {string} text - Final assistant text; may be empty.
 * @param {boolean} interrupted - Whether the turn was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  if (!text) {
    if (interrupted && state.currentAssistantBubble) {
      state.currentAssistantBubble.classList.add("bubble--interrupted");
    }
    state.currentAssistantBubble = null;
    return;
  }
  if (state.currentAssistantBubble) {
    const body = state.currentAssistantBubble.querySelector(".bubble__text");
    body.textContent = text;
  } else {
    state.currentAssistantBubble = addBubble("assistant", text);
  }
  if (interrupted) {
    state.currentAssistantBubble.classList.add("bubble--interrupted");
  }
  updateCameraQuestion(text);
  state.currentAssistantBubble = null;
  scrollChatToBottom();
}
/**
 * Close the current assistant bubble; it stays in the chat log but no
 * longer receives text deltas.
 */
function finalizeAssistantBubble() {
state.currentAssistantBubble = null;
}
/* ---------------------------------------------------------- Websocket IO */
/**
 * Decode a base64 string into 16-bit samples. The decoded bytes are viewed
 * as an `Int16Array` in the platform's byte order.
 *
 * @param {string} b64 - Base64-encoded bytes.
 * @returns {Int16Array} Samples backed by the decoded byte buffer.
 * @throws {DOMException} When `atob` rejects the input.
 */
function decodeBase64ToInt16(b64) {
  const raw = atob(b64);
  // Every character of `atob` output is a single byte value (0-255).
  const octets = Uint8Array.from(raw, (ch) => ch.charCodeAt(0));
  return new Int16Array(octets.buffer, octets.byteOffset, octets.byteLength / 2);
}
/**
 * Dispatch one parsed server message to its handler.
 *
 * Valid JSON is not necessarily an object (`null`, a number, a string), so
 * non-object payloads are ignored instead of throwing on `event.type`.
 * Audio that fails base64 decoding is logged and skipped so one bad frame
 * cannot throw out of the websocket message handler.
 *
 * @param {unknown} event - Parsed JSON message from the server.
 */
function handleEvent(event) {
  if (!event || typeof event !== "object") {
    console.debug("ws event ignored (not an object)", event);
    return;
  }
  switch (event.type) {
    case "response.audio.delta":
      if (typeof event.audio === "string") {
        let samples;
        try {
          samples = decodeBase64ToInt16(event.audio);
        } catch (err) {
          console.warn("Bad base64 audio from server", err);
          break;
        }
        schedulePlayback(samples);
      }
      break;
    case "response.audio.started":
      setBotIndicator(true);
      break;
    case "response.audio.stopped":
      finalizeAssistantBubble();
      // The indicator turns off automatically when the playback queue drains.
      break;
    case "response.text.delta":
      handleAssistantDelta(event.text);
      break;
    case "response.text.started":
      handleAssistantStarted();
      break;
    case "response.text.final":
      handleAssistantFinal(event.text, event.interrupted);
      break;
    case "response.state":
      setAssistantState(event.state);
      break;
    case "input.transcript.final":
      handleUserTranscript(event.text);
      break;
    case "input.transcript.interim":
      // Ignore partial ASR updates; chat history renders committed user turns.
      break;
    case "transport.message":
      // Reserved for future structured messages; ignore silently.
      break;
    default:
      // Unknown event type: log for debugging.
      console.debug("ws event", event);
  }
}
/**
 * Open the product websocket and run the session handshake.
 *
 * Steps: resolve the URL (with chat id), warm up the AudioContext, create
 * the socket, then wire its `open`, `message`, `error` and `close`
 * handlers. On `open` a `session.start` message is sent. On `close` the
 * mic, playback and UI are reset. Does nothing while already connected or
 * connecting.
 *
 * @returns {Promise<void>} Resolves once the socket has been created and
 *   wired (not when it opens), or after a setup failure has been reported.
 */
async function connect() {
  if (state.connected || state.connecting) return;
  const url = wsUrlWithChatId();
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }
  state.connecting = true;
  setStatus("connecting", "Connecting…");
  setConnectButton();
  addWsLog("system", `connecting ${url}`);
  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    state.connecting = false;
    setStatus("error", "Audio init failed");
    setConnectButton();
    addWsLog("error", `audio init failed: ${err.message || err}`, "error");
    return;
  }
  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    state.connecting = false;
    setStatus("error", "Bad URL");
    setConnectButton();
    addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;
  ws.addEventListener("open", () => {
    const chatId = (els.chatId.value || "").trim();
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
    };
    if (chatId) {
      startMessage.chatId = chatId;
    }
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "websocket open");
    setStatus("connected", "Connected");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    // Not awaited: refreshMicDevices handles its own errors.
    refreshMicDevices();
    wsSend(JSON.stringify(startMessage));
    addBubble("system", "Session started.");
    setComposerEnabled(true);
    setCameraButtonEnabled();
    els.textInput.focus();
  });
  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // An Int16Array over an odd-length buffer throws, so view only the
      // whole samples and drop a trailing odd byte.
      const sampleCount = Math.floor(data.byteLength / 2);
      schedulePlayback(new Int16Array(data, 0, sampleCount));
    }
  });
  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "Connection error");
    addWsLog("error", "websocket error", "error");
  });
  ws.addEventListener("close", (event) => {
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    setAssistantState("");
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setCameraButtonEnabled();
    setBotIndicator(false);
    finalizeWsLogGroup();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      // Separate the reason from the sentence; without the separator the
      // bubble read "Session ended<reason>.".
      addBubble(
        "system",
        `Session ended${event.reason ? `: ${event.reason}` : ""}.`,
      );
      setStatus("idle", "Disconnected");
    } else {
      setStatus("error", "Connection closed");
    }
  });
}
/**
 * Close the websocket from the client side. Sends `session.stop` first
 * when the socket is open, then closes with code 1000. Errors from either
 * step are ignored. State and UI are reset by the socket's `close` handler.
 */
function disconnect() {
  const socket = state.ws;
  if (!socket) return;

  try {
    if (socket.readyState === WebSocket.OPEN) {
      wsSend(JSON.stringify({ type: "session.stop", reason: "client_disconnect" }));
    }
  } catch (_) {
    /* ignore */
  }

  try {
    socket.close(1000, "client_disconnect");
  } catch (_) {
    /* ignore */
  }
}
/* ---------------------------------------------------------------- Wiring */
// Connect button: toggles between connecting and disconnecting.
els.connectBtn.addEventListener("click", () => {
  if (state.connected) disconnect();
  else connect();
});

// Mic button: starts or stops capture. The button is disabled while the
// (possibly asynchronous) change is in progress.
els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;
  els.micBtn.disabled = true;
  try {
    if (state.micEnabled) {
      stopMic();
    } else {
      await startMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `Mic error: ${err.message || err}`);
  } finally {
    els.micBtn.disabled = !state.connected;
  }
});

// Mic selector: remembers the choice and, if capture is running, restarts
// it on the newly selected device.
els.micSelect.addEventListener("change", async () => {
  state.selectedMicDeviceId = els.micSelect.value;
  if (!state.micEnabled) return;
  els.micSelect.disabled = true;
  els.micBtn.disabled = true;
  try {
    stopMic();
    await startMic();
  } catch (err) {
    console.error("Mic switch error", err);
    addBubble("system", `Mic switch error: ${err.message || err}`);
  } finally {
    setMicButton();
    setMicSelectEnabled();
  }
});

// Keep the device list current when devices are plugged in or removed.
if (navigator.mediaDevices?.addEventListener) {
  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}

els.clearBtn.addEventListener("click", () => {
  clearChat();
});

els.clearWsLogBtn.addEventListener("click", () => {
  clearWsLog();
});

// The camera "done" button is treated as optional by
// setCameraButtonEnabled(), so guard here as well: an unguarded call on a
// missing button would throw and abort the rest of the start-up code.
els.cameraDoneBtn?.addEventListener("click", () => {
  if (!state.cameraState) return;
  sendText(CAMERA_DONE_TEXT);
});
/**
 * Resize the text input to fit its content, capped at 180px. The height is
 * reset to "auto" first so the box can also shrink.
 */
function autosizeTextarea() {
  const { style } = els.textInput;
  style.height = "auto";
  const fitted = Math.min(els.textInput.scrollHeight, 180);
  style.height = `${fitted}px`;
}
/**
 * Send the composer's text. On success the input is cleared and resized
 * and the composer state is refreshed; on failure the input is left as is.
 */
function submitText() {
  const input = els.textInput;
  const sent = sendText(input.value);
  if (!sent) return;
  input.value = "";
  autosizeTextarea();
  setComposerEnabled(state.connected);
}
// Composer form: submit sends the text without reloading the page.
els.composer.addEventListener("submit", (submitEvent) => {
  submitEvent.preventDefault();
  submitText();
});

// Typing: resize the box and refresh the send button's enabled state.
els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Enter sends; Shift+Enter and IME composition keep their default behavior.
els.textInput.addEventListener("keydown", (keyEvent) => {
  const isPlainEnter =
    keyEvent.key === "Enter" && !keyEvent.shiftKey && !keyEvent.isComposing;
  if (!isPlainEnter) return;
  keyEvent.preventDefault();
  submitText();
});

// Page unload: close the socket and the audio context, ignoring errors.
window.addEventListener("beforeunload", () => {
  for (const resource of [state.ws, state.audioContext]) {
    if (!resource) continue;
    try {
      resource.close();
    } catch (_) {
      /* ignore */
    }
  }
});

// Initial UI state: default URL, disconnected status, everything disabled.
els.url.value = defaultWsUrl();
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);