Files
engine-v5-pipecat-core/examples/webpage/app.js
2026-05-21 15:42:49 +08:00

916 lines
25 KiB
JavaScript

/**
* Minimal browser client for the AI VideoAssistant engine's product
* websocket (`/ws-product`, protocol `va.ws.v1`).
*
* Responsibilities:
* - Open/close the websocket and run the session handshake.
* - List/select microphones and capture mic audio with browser AEC enabled.
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
* as binary websocket messages.
* - Play `response.audio.delta` frames gaplessly through Web Audio.
* - Render a chat-style history of user transcripts and bot text deltas.
*/
// Audio format used for mic capture, playback buffers and the
// `session.start` handshake.
const SAMPLE_RATE = 16000;
const CHANNELS = 1;
// Frame length handed to the recorder worklet as `frameMs`.
const FRAME_MS = 20;
// Sent as `protocol` in the `session.start` message.
const PROTOCOL = "va.ws.v1";
// Upper bound on the number of entries kept in the websocket log panel.
const MAX_WS_LOG_LINES = 120;
// Minimum time between coalesced audio log lines. Despite the name it is
// used for both received audio deltas and sent binary frames.
const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;
// DOM handles, looked up once when the script is evaluated. Every entry is
// resolved by element id except `micLabel`, which is the first element
// matching the `.mic-btn__label` selector.
const els = {
url: document.getElementById("ws-url"),
connectBtn: document.getElementById("connect-btn"),
statusDot: document.getElementById("status-dot"),
statusText: document.getElementById("status-text"),
chatLog: document.getElementById("chat-log"),
micBtn: document.getElementById("mic-btn"),
micSelect: document.getElementById("mic-select"),
micLabel: document.querySelector(".mic-btn__label"),
micIndicator: document.getElementById("mic-indicator"),
botIndicator: document.getElementById("bot-indicator"),
clearBtn: document.getElementById("clear-btn"),
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
wsLog: document.getElementById("ws-log"),
meterFill: document.getElementById("meter-fill"),
composer: document.getElementById("composer"),
textInput: document.getElementById("text-input"),
sendBtn: document.getElementById("send-btn"),
};
// Mutable runtime state for the page, shared by the functions below.
const state = {
// Websocket connection and the two flags that drive the connect button.
ws: null,
connected: false,
connecting: false,
// Web Audio context; created on first use by ensureAudioContext().
audioContext: null,
// Microphone capture graph (stream -> source node -> recorder worklet).
micStream: null,
micSourceNode: null,
recorderNode: null,
micEnabled: false,
// Audio input devices from enumerateDevices(); "" selects the default mic.
micDevices: [],
selectedMicDeviceId: "",
// Output scheduling.
nextPlaybackTime: 0,
playbackEndsAt: 0,
scheduledSources: [],
// NOTE(review): `botActive` is not read or written anywhere else in the
// visible source — confirm whether it is still needed.
botActive: false,
botUiTimer: null,
// Chat state.
currentUserBubble: null,
currentAssistantBubble: null,
// VU meter smoothing.
meterLevel: 0,
// Compact websocket logging.
audioDeltaLogCount: 0,
audioDeltaLogBytes: 0,
lastAudioDeltaLogAt: 0,
audioSendLogCount: 0,
audioSendLogBytes: 0,
lastAudioSendLogAt: 0,
};
/* ------------------------------------------------------------------ UI */
/**
 * Update the connection status dot and label.
 *
 * @param {string} kind - Suffix for the `status__dot--*` modifier class.
 * @param {string} text - Label shown next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  statusDot.className = `status__dot status__dot--${kind}`;
  statusText.textContent = text;
}
/**
 * Sync the connect button's label, disabled flag and `is-disconnect` class
 * with `state.connecting` / `state.connected`.
 */
function setConnectButton() {
  const button = els.connectBtn;
  let label = "Connect";
  let locked = false;
  let offersDisconnect = false;

  if (state.connecting) {
    // A pending connection cannot be cancelled from the button.
    label = "Connecting…";
    locked = true;
  } else if (state.connected) {
    label = "Disconnect";
    offersDisconnect = true;
  }

  button.textContent = label;
  button.disabled = locked;
  if (offersDisconnect) {
    button.classList.add("is-disconnect");
  } else {
    button.classList.remove("is-disconnect");
  }
}
/**
 * Sync the mic button, its label and the mic indicator with
 * `state.connected` and `state.micEnabled`.
 */
function setMicButton() {
  const { micBtn, micLabel, micIndicator } = els;
  const live = state.micEnabled;

  micBtn.disabled = !state.connected;
  micBtn.setAttribute("aria-pressed", live ? "true" : "false");
  micBtn.title = live ? "Mute mic" : "Unmute mic";
  micLabel.textContent = live ? "Mute mic" : "Enable mic";
  micIndicator.classList.toggle("is-active", live);
}
/**
 * Enable the microphone picker only while connected and only when the
 * browser exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const usable = state.connected && navigator.mediaDevices;
  els.micSelect.disabled = !usable;
}
/**
 * Enable or disable the text composer.
 *
 * The send button additionally stays disabled while the input holds only
 * whitespace.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  const { textInput, sendBtn } = els;
  textInput.disabled = !enabled;
  sendBtn.disabled = !(enabled && textInput.value.trim().length > 0);
}
/**
 * Turn the bot activity indicator on or off.
 *
 * @param {boolean} active - Whether the `is-active` class should be set.
 */
function setBotIndicator(active) {
  const indicator = els.botIndicator;
  indicator.classList.toggle("is-active", active);
}
/**
 * Append a chat bubble to the chat log and scroll it into view.
 *
 * Any `.chat__empty` placeholder is cleared first. Bubbles for roles other
 * than "system" get a role tag ("You" for "user", "Assistant" otherwise).
 *
 * @param {string} role - Used for the `bubble--*` modifier class.
 * @param {string} text - Initial bubble text.
 * @returns {HTMLElement} The bubble element that was appended.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    bubble.appendChild(
      makeSpan("bubble__role", role === "user" ? "You" : "Assistant"),
    );
  }
  bubble.appendChild(makeSpan("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append text to an existing bubble's `.bubble__text` node and keep the
 * chat scrolled to the bottom.
 *
 * @param {HTMLElement} bubble - Bubble previously returned by addBubble().
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const target = bubble.querySelector(".bubble__text");
  target.textContent = target.textContent + text;
  scrollChatToBottom();
}
/** Scroll the chat log so its newest content is visible. */
function scrollChatToBottom() {
  const log = els.chatLog;
  log.scrollTop = log.scrollHeight;
}
/**
 * Empty the chat log, forget the in-progress user/assistant bubbles and
 * show a "Chat cleared." placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentUserBubble = null;
  state.currentAssistantBubble = null;

  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  // Static markup only, so innerHTML is safe here.
  placeholder.innerHTML = "<p>Chat cleared.</p>";
  log.appendChild(placeholder);
}
/**
 * Shorten a value for display in the websocket log.
 *
 * Values no longer than `maxLength` are returned unchanged (after string
 * conversion). Longer values are cut and end in an ellipsis so the reader
 * can tell text was dropped; the result is `maxLength` characters long.
 *
 * @param {*} value - Anything; converted with String().
 * @param {number} [maxLength=160] - Maximum length of the returned string.
 * @returns {string} The original or truncated text.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  // Reserve one character for the ellipsis; clamp so a tiny maxLength cannot
  // turn into a negative (from-the-end) slice index.
  const keep = Math.max(0, maxLength - 1);
  return `${text.slice(0, keep)}…`;
}
/**
 * Render a websocket payload as a short, log-friendly string.
 *
 * Works on a shallow copy: a string `audio` field is replaced by a length
 * summary, a string `data` field longer than 160 characters likewise, and a
 * string `text` field is passed through truncateLogValue(). Non-object
 * payloads are simply stringified.
 *
 * @param {*} payload - Parsed websocket message.
 * @returns {string} JSON of the compacted copy, or a fallback label when the
 *   payload cannot be serialized.
 */
function compactWsPayload(payload) {
  if (payload === null || typeof payload !== "object") {
    return String(payload);
  }

  const summary = { ...payload };
  const { audio, data, text } = summary;
  if (typeof audio === "string") {
    summary.audio = `<base64 ${audio.length} chars>`;
  }
  if (typeof data === "string" && data.length > 160) {
    summary.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    summary.text = truncateLogValue(text);
  }

  let serialized;
  try {
    serialized = JSON.stringify(summary);
  } catch (_) {
    // e.g. circular references; fall back to the message type if there is one.
    serialized = payload.type || "unserializable websocket payload";
  }
  return serialized;
}
/**
 * Append one line (time, direction, detail) to the websocket log panel.
 *
 * Clears the `.ws-log__empty` placeholder if present, drops the oldest
 * entries beyond MAX_WS_LOG_LINES and scrolls to the newest line.
 *
 * @param {string} direction - "send", "recv" or another label; shown
 *   upper-cased.
 * @param {string} detail - Text of the log line.
 * @param {string} [kind=direction] - Suffix for the `ws-log__entry--*` class.
 */
function addWsLog(direction, detail, kind = direction) {
  const log = els.wsLog;
  if (log.querySelector(".ws-log__empty")) {
    log.innerHTML = "";
  }

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  const entry = document.createElement("div");
  entry.className = `ws-log__entry ws-log__entry--${kind}`;

  const stamp = new Date().toLocaleTimeString([], {
    hour12: false,
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  });

  let label;
  if (direction === "send") {
    label = "SEND";
  } else if (direction === "recv") {
    label = "RECV";
  } else {
    label = direction.toUpperCase();
  }

  entry.append(
    makeSpan("ws-log__time", stamp),
    makeSpan("ws-log__direction", label),
    makeSpan("ws-log__detail", detail),
  );
  log.appendChild(entry);

  // Keep the panel bounded by discarding the oldest lines first.
  while (log.children.length > MAX_WS_LOG_LINES) {
    log.firstElementChild.remove();
  }
  log.scrollTop = log.scrollHeight;
}
/**
 * Emit one summary log line for the received audio deltas counted so far,
 * then reset the counters and restart the throttle window. Does nothing
 * when no deltas are pending.
 */
function flushAudioDeltaLog() {
  const { audioDeltaLogCount: count, audioDeltaLogBytes: bytes } = state;
  if (count === 0) return;

  addWsLog("recv", `response.audio.delta x${count} (${bytes} bytes)`);
  state.audioDeltaLogCount = 0;
  state.audioDeltaLogBytes = 0;
  state.lastAudioDeltaLogAt = performance.now();
}
/**
 * Emit one summary log line for the binary audio frames sent so far, then
 * reset the counters and restart the throttle window. Does nothing when no
 * frames are pending.
 */
function flushAudioSendLog() {
  const { audioSendLogCount: count, audioSendLogBytes: bytes } = state;
  if (count === 0) return;

  addWsLog("send", `input.audio binary x${count} (${bytes} bytes)`);
  state.audioSendLogCount = 0;
  state.audioSendLogBytes = 0;
  state.lastAudioSendLogAt = performance.now();
}
/** Flush both coalesced audio log counters: received deltas first, then sent frames. */
function flushPendingWsLogs() {
  for (const flush of [flushAudioDeltaLog, flushAudioSendLog]) {
    flush();
  }
}
/**
 * Log a JSON websocket message in the websocket log panel.
 *
 * Received `response.audio.delta` messages are only counted and are
 * summarized at most once per AUDIO_DELTA_LOG_INTERVAL_MS. Any other
 * message first flushes the pending summary for its direction, so the
 * summary line appears before the message that followed it.
 *
 * Previously the pending delta summary was flushed before *every* received
 * message, audio deltas included, so deltas were never coalesced.
 *
 * @param {string} direction - "send" or "recv".
 * @param {*} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  if (direction === "recv" && payload?.type === "response.audio.delta") {
    state.audioDeltaLogCount += 1;
    state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
    const now = performance.now();
    if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
      flushAudioDeltaLog();
    }
    return;
  }

  // A regular message: emit the pending audio summary for this direction
  // first so the log keeps its chronological order.
  if (direction === "send") {
    flushAudioSendLog();
  } else {
    flushAudioDeltaLog();
  }
  addWsLog(direction, compactWsPayload(payload));
}
/**
 * Count one sent binary audio frame and emit the coalesced summary once
 * AUDIO_DELTA_LOG_INTERVAL_MS has passed since the last summary.
 *
 * @param {number} byteLength - Size of the frame that was sent.
 */
function logBinarySend(byteLength) {
  state.audioSendLogCount += 1;
  state.audioSendLogBytes += byteLength;

  const elapsed = performance.now() - state.lastAudioSendLogAt;
  if (elapsed >= AUDIO_DELTA_LOG_INTERVAL_MS) {
    flushAudioSendLog();
  }
}
/**
 * Send a message over the current websocket and record it in the log.
 *
 * String messages are logged as parsed JSON, or as truncated raw text when
 * they are not valid JSON. Binary messages are only counted towards the
 * coalesced audio-send summary.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send.
 * @returns {boolean} false when there is no open socket, true after sending.
 */
function wsSend(data) {
  const socket = state.ws;
  if (!socket || socket.readyState !== WebSocket.OPEN) return false;

  if (typeof data !== "string") {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const byteLength = isBinary ? data.byteLength : 0;
    if (byteLength > 0) {
      logBinarySend(byteLength);
    }
  } else {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      // Not JSON: flush pending summaries and log the raw text instead.
      flushAudioSendLog();
      flushAudioDeltaLog();
      addWsLog("send", truncateLogValue(data));
    }
  }

  socket.send(data);
  return true;
}
/**
 * Discard pending audio log counters and reset the websocket log panel to
 * its empty-state placeholder.
 */
function clearWsLog() {
  Object.assign(state, {
    audioDeltaLogCount: 0,
    audioDeltaLogBytes: 0,
    audioSendLogCount: 0,
    audioSendLogBytes: 0,
  });
  els.wsLog.innerHTML =
    '<div class="ws-log__empty">No websocket events yet.</div>';
}
/* ---------------------------------------------------------------- Audio */
/**
 * Return the shared AudioContext, creating it and loading the recorder
 * worklet module on first use, and resuming it if it is suspended.
 *
 * The context is stored in `state.audioContext` only after the worklet
 * module has loaded. If loading fails the new context is closed and the
 * error is rethrown, so a later call retries instead of reusing a context
 * that has no `pcm-recorder` processor registered.
 *
 * @returns {Promise<AudioContext>} The ready-to-use context.
 * @throws Whatever `addModule()` or `resume()` rejects with.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    const ctx = new Ctx();
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      // Release the half-initialized context; the load error is what the
      // caller needs to see, so a failing close() is only logged.
      try {
        await ctx.close();
      } catch (closeErr) {
        console.warn("Could not close failed AudioContext", closeErr);
      }
      throw err;
    }
    state.audioContext = ctx;
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
/**
 * Rebuild the microphone `<select>` from `state.micDevices`.
 *
 * A "Default microphone" option (empty value) always comes first. The
 * previous selection is kept when that device is still listed; otherwise
 * the selection falls back to the default.
 */
function renderMicDevices() {
  const select = els.micSelect;
  const wanted = state.selectedMicDeviceId || select.value;

  const makeOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    return option;
  };

  select.innerHTML = "";
  select.appendChild(makeOption("", "Default microphone"));

  let stillPresent = false;
  for (const [index, device] of state.micDevices.entries()) {
    // Devices can come back without a label; fall back to a numbered name.
    const label = device.label || `Microphone ${index + 1}`;
    select.appendChild(makeOption(device.deviceId, label));
    if (device.deviceId === wanted) stillPresent = true;
  }

  state.selectedMicDeviceId = stillPresent ? wanted : "";
  select.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
/**
 * Re-read the list of audio input devices and re-render the picker.
 *
 * When device enumeration is unavailable or fails, only the picker's
 * enabled state is refreshed; failures are logged and never rethrown.
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const canEnumerate = Boolean(navigator.mediaDevices?.enumerateDevices);
  if (!canEnumerate) {
    setMicSelectEnabled();
    return;
  }

  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    state.micDevices = allDevices.filter(({ kind }) => kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
/**
 * Start microphone capture and stream recorder frames to the websocket.
 *
 * Requests a mono stream with echo cancellation, noise suppression and
 * auto gain control (pinned to `state.selectedMicDeviceId` when one is
 * chosen), feeds it into a `pcm-recorder` AudioWorkletNode and forwards
 * each "frame" message's buffer via wsSend() while connected.
 *
 * Cleanup guarantees:
 * - If any step after getUserMedia() fails, the partially built capture
 *   graph is torn down with stopMic() so the microphone does not stay live,
 *   and the error is rethrown for the caller to report.
 * - If the socket closed while permission/enumeration was pending, capture
 *   is torn down and the function returns without enabling the mic (the
 *   close handler only stops a mic that is already enabled).
 *
 * @returns {Promise<void>}
 * @throws Errors from ensureAudioContext(), getUserMedia() or graph setup.
 */
async function startMic() {
  const ctx = await ensureAudioContext();
  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }

  try {
    state.micStream = await navigator.mediaDevices.getUserMedia({
      audio: audioConstraints,
      video: false,
    });
    // Refresh the picker now that a stream is open.
    // NOTE(review): presumably device labels only become available after
    // permission is granted — confirm against the target browsers.
    await refreshMicDevices();

    if (!state.connected) {
      // Disconnected while we were waiting: do not leave the mic running.
      stopMic();
      return;
    }

    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // Release whatever was acquired (tracks, nodes) before reporting.
    stopMic();
    throw err;
  }

  state.micEnabled = true;
  addWsLog("system", "mic capture started (binary input.audio frames)");
  setMicButton();
}
/**
 * Tear down microphone capture: detach the recorder node, disconnect the
 * source node, stop every stream track, zero the level meter and refresh
 * the mic button. Each teardown step is best-effort. A "stopped" log line
 * is written only if the mic had actually been enabled.
 */
function stopMic() {
  const wasEnabled = state.micEnabled;
  // Teardown must never throw; errors from already-released nodes are ignored.
  const bestEffort = (action) => {
    try {
      action();
    } catch (_) {
      /* ignore */
    }
  };

  const recorder = state.recorderNode;
  if (recorder) {
    bestEffort(() => {
      recorder.port.onmessage = null;
      recorder.disconnect();
    });
    state.recorderNode = null;
  }

  const source = state.micSourceNode;
  if (source) {
    bestEffort(() => source.disconnect());
    state.micSourceNode = null;
  }

  const stream = state.micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      bestEffort(() => track.stop());
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasEnabled) {
    flushAudioSendLog();
    addWsLog("system", "mic capture stopped");
  }
  setMicButton();
}
/**
 * Move the level meter towards a new RMS reading.
 *
 * The reading is scaled by 2.4 and capped at 1, then averaged 50/50 with
 * the previous level before being written as a percentage width.
 *
 * @param {number} rms - RMS level of the latest frame (0 resets the target).
 */
function updateMeter(rms) {
  // RMS ~0.3+ is loud speech, hence the 2.4x gain before clamping.
  const scaled = Math.min(1, rms * 2.4);
  const smoothed = state.meterLevel * 0.5 + scaled * 0.5;
  state.meterLevel = smoothed;
  els.meterFill.style.width = `${Math.round(smoothed * 100)}%`;
}
/* ---------------------------------------------------- Bot audio playback */
/**
 * Queue one chunk of PCM16 audio for gapless playback.
 *
 * Samples are converted to floats, wrapped in an AudioBuffer at
 * SAMPLE_RATE and scheduled directly after the previously queued chunk
 * (or slightly in the future if playback has fallen behind). The bot
 * indicator is switched on and a timer switches it off once the queue has
 * drained.
 *
 * Does nothing when there is no AudioContext yet or when the chunk is
 * empty — `createBuffer()` rejects a zero-length buffer, so an empty chunk
 * would otherwise throw.
 *
 * @param {Int16Array} int16 - Mono PCM samples.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  if (int16.length === 0) return;

  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Asymmetric scaling maps -32768 and +32767 onto -1 and +1.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);
  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);

  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;

  // Forget finished sources so stopPlaybackQueue() only touches live ones.
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);

  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    // Only switch off if no later chunk extended the queue in the meantime.
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
/**
 * Cut off all scheduled bot audio: stop and disconnect every queued
 * source, reset the playback clock, cancel the indicator timer and switch
 * the bot indicator off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];
  resetPlaybackClock();

  const timer = state.botUiTimer;
  if (timer) {
    clearTimeout(timer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
/**
 * Re-anchor playback scheduling at the AudioContext's current time.
 * No-op while no AudioContext exists.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
/* --------------------------------------------------------- Chat updates */
/**
 * Show a final user transcript.
 *
 * Replaces the text of the interim bubble if one exists (and drops its
 * interim styling), otherwise adds a new user bubble. The current
 * user/assistant bubble references are cleared either way.
 *
 * @param {string} text - Final transcript; empty values are ignored.
 */
function handleUserTranscript(text) {
  if (!text) return;
  state.currentAssistantBubble = null;

  const interim = state.currentUserBubble;
  if (!interim) {
    addBubble("user", text);
  } else {
    interim.querySelector(".bubble__text").textContent = text;
    interim.classList.remove("bubble--interim");
  }
  state.currentUserBubble = null;
}
/**
 * Show an interim (still changing) user transcript.
 *
 * The first interim result creates a user bubble marked
 * `bubble--interim`; later ones overwrite that bubble's text.
 *
 * @param {string} text - Interim transcript; empty values are ignored.
 */
function handleUserTranscriptInterim(text) {
  if (!text) return;
  state.currentAssistantBubble = null;

  const existing = state.currentUserBubble;
  if (existing) {
    existing.querySelector(".bubble__text").textContent = text;
  } else {
    const created = addBubble("user", text);
    state.currentUserBubble = created;
    created.classList.add("bubble--interim");
  }
  scrollChatToBottom();
}
/**
 * Send typed text to the engine as an interrupting `input.text` message.
 *
 * @param {string} text - Raw composer text; trimmed before sending.
 * @returns {boolean} false when the text is blank or the socket is not
 *   open, true once the message was handed to wsSend().
 */
function sendText(text) {
  const value = (text || "").trim();
  if (!value || !state.ws || state.ws.readyState !== WebSocket.OPEN) {
    return false;
  }

  // The engine does not echo text input back as a transcript event, so the
  // user bubble is rendered locally. In-flight bot audio is cut so the next
  // reply is heard cleanly. `currentAssistantBubble` is deliberately left
  // alone: the engine will emit `response.text.final(interrupted=true)` for
  // the in-flight assistant turn, which finalizes that bubble in place, and
  // a fresh bubble is created when `response.text.started` arrives.
  wsSend(JSON.stringify({ type: "input.text", text: value, interrupt: true }));
  stopPlaybackQueue();
  state.currentUserBubble = null;
  addBubble("user", value);
  return true;
}
/**
 * Append streamed assistant text to the current assistant bubble, creating
 * an empty bubble first when none is open.
 *
 * @param {string} text - Text fragment; empty values are ignored.
 */
function handleAssistantDelta(text) {
  if (!text) return;

  let bubble = state.currentAssistantBubble;
  if (!bubble) {
    bubble = addBubble("assistant", "");
    state.currentAssistantBubble = bubble;
  }
  appendToBubble(bubble, text);
}
/**
 * Mark the start of a new assistant reply: forget the current assistant
 * bubble so the next text delta opens a fresh one.
 */
function handleAssistantStarted() {
  state.currentAssistantBubble = null;
}
/**
 * Finalize an assistant reply with its complete text.
 *
 * The open assistant bubble (if any) has its text replaced; otherwise a
 * new bubble is added. Interrupted replies are marked with
 * `bubble--interrupted`. The current assistant bubble reference is always
 * cleared, including when `text` is empty.
 *
 * @param {string} text - Final reply text.
 * @param {boolean} interrupted - Whether the reply was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  if (!text) {
    state.currentAssistantBubble = null;
    return;
  }

  let bubble = state.currentAssistantBubble;
  if (bubble) {
    bubble.querySelector(".bubble__text").textContent = text;
  } else {
    bubble = addBubble("assistant", text);
  }
  if (interrupted) {
    bubble.classList.add("bubble--interrupted");
  }

  state.currentAssistantBubble = null;
  scrollChatToBottom();
}
/**
 * Close the current assistant bubble so later text deltas start a new one.
 * The bubble element itself is left untouched.
 */
function finalizeAssistantBubble() {
  state.currentAssistantBubble = null;
}
/* ---------------------------------------------------------- Websocket IO */
/**
 * Decode a base64 string into 16-bit samples.
 *
 * The decoded bytes are reinterpreted in place as an Int16Array in
 * platform byte order; a trailing odd byte is not included in the view.
 *
 * @param {string} b64 - Base64-encoded bytes.
 * @returns {Int16Array} View over the decoded bytes.
 * @throws If `b64` is not valid base64 (from atob()).
 */
function decodeBase64ToInt16(b64) {
  const bytes = Uint8Array.from(atob(b64), (ch) => ch.charCodeAt(0));
  const sampleCount = bytes.byteLength / 2;
  return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
}
/**
 * Dispatch one parsed server message to the matching handler.
 *
 * Malformed input is tolerated rather than thrown out of the websocket
 * message listener: a payload that is not an object (e.g. the JSON value
 * `null`) is ignored, and an audio delta whose base64 cannot be decoded is
 * dropped with a warning.
 *
 * @param {object} event - Parsed JSON message; dispatch is on `event.type`.
 */
function handleEvent(event) {
  if (event === null || typeof event !== "object") {
    console.debug("ws event ignored (not an object)", event);
    return;
  }
  switch (event.type) {
    case "response.audio.delta": {
      if (typeof event.audio !== "string") break;
      let pcm;
      try {
        pcm = decodeBase64ToInt16(event.audio);
      } catch (err) {
        // One corrupt frame should not abort handling of later messages.
        console.warn("Dropping undecodable response.audio.delta", err);
        break;
      }
      schedulePlayback(pcm);
      break;
    }
    case "response.audio.started":
      setBotIndicator(true);
      break;
    case "response.audio.stopped":
      finalizeAssistantBubble();
      // The indicator turns off automatically when the playback queue drains.
      break;
    case "response.text.delta":
      handleAssistantDelta(event.text);
      break;
    case "response.text.started":
      handleAssistantStarted();
      break;
    case "response.text.final":
      handleAssistantFinal(event.text, event.interrupted);
      break;
    case "input.transcript.final":
      handleUserTranscript(event.text);
      break;
    case "input.transcript.interim":
      handleUserTranscriptInterim(event.text);
      break;
    case "transport.message":
      // Reserved for future structured messages; ignore silently.
      break;
    default:
      // Unknown event type: log for debugging.
      console.debug("ws event", event);
  }
}
/**
 * Open the product websocket and wire up the session lifecycle.
 *
 * Steps: validate the URL field, pre-warm the AudioContext, create the
 * socket, then register listeners that
 * - on "open": mark the session connected, refresh the UI and send
 *   `session.start` with the PCM16 audio format;
 * - on "message": log and dispatch JSON text frames, and play binary
 *   frames as PCM16 audio;
 * - on "error": surface the error in the status bar and log;
 * - on "close": tear down mic/playback, reset the UI and report the close.
 *
 * Does nothing when already connected or connecting. Failures are reported
 * through the status bar and websocket log rather than thrown.
 *
 * @returns {Promise<void>} Resolves once the socket has been created (not
 *   once it is open) or after an early failure was reported.
 */
async function connect() {
  if (state.connected || state.connecting) return;
  const url = (els.url.value || "").trim();
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }
  state.connecting = true;
  setStatus("connecting", "Connecting…");
  setConnectButton();
  addWsLog("system", `connecting ${url}`);
  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    state.connecting = false;
    setStatus("error", "Audio init failed");
    setConnectButton();
    addWsLog("error", `audio init failed: ${err.message || err}`, "error");
    return;
  }
  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    state.connecting = false;
    setStatus("error", "Bad URL");
    setConnectButton();
    addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
    return;
  }
  // Receive binary frames as ArrayBuffer (checked in the message handler).
  ws.binaryType = "arraybuffer";
  state.ws = ws;
  ws.addEventListener("open", () => {
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
    };
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "websocket open");
    setStatus("connected", "Connected");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    refreshMicDevices();
    wsSend(JSON.stringify(startMessage));
    addBubble("system", "Session started.");
    setComposerEnabled(true);
    els.textInput.focus();
  });
  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // An Int16Array view needs an even byte count, so a stray trailing
      // byte is ignored instead of letting the constructor throw. Frames
      // with no complete sample are skipped.
      const sampleCount = Math.floor(data.byteLength / 2);
      if (sampleCount > 0) {
        schedulePlayback(new Int16Array(data, 0, sampleCount));
      }
    }
  });
  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "Connection error");
    addWsLog("error", "websocket error", "error");
  });
  ws.addEventListener("close", (event) => {
    // Remember whether a session was established: it decides between the
    // "Disconnected" and "Connection closed" outcomes below.
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setBotIndicator(false);
    flushPendingWsLogs();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      // Separate the reason from the sentence; it used to be glued on
      // ("Session endedclient_disconnect.").
      addBubble(
        "system",
        `Session ended${event.reason ? `: ${event.reason}` : ""}.`,
      );
      setStatus("idle", "Disconnected");
    } else {
      setStatus("error", "Connection closed");
    }
  });
}
/**
 * Close the current websocket, first sending a `session.stop` message if
 * the socket is still open. Both steps are best-effort; state and UI
 * cleanup happens in the socket's "close" listener.
 */
function disconnect() {
  const socket = state.ws;
  if (!socket) return;

  try {
    if (socket.readyState === WebSocket.OPEN) {
      wsSend(
        JSON.stringify({ type: "session.stop", reason: "client_disconnect" }),
      );
    }
  } catch (_) {
    /* ignore */
  }

  try {
    socket.close(1000, "client_disconnect");
  } catch (_) {
    /* ignore */
  }
}
/* ---------------------------------------------------------------- Wiring */
// The connect button toggles between connect() and disconnect().
els.connectBtn.addEventListener("click", () => {
if (state.connected) disconnect();
else connect();
});
// The mic button toggles capture. It is disabled while the (possibly slow)
// start is in flight and restored from the connection state afterwards;
// failures are shown as a system bubble.
els.micBtn.addEventListener("click", async () => {
if (!state.connected) return;
els.micBtn.disabled = true;
try {
if (state.micEnabled) {
stopMic();
} else {
await startMic();
}
} catch (err) {
console.error("Mic error", err);
addBubble("system", `Mic error: ${err.message || err}`);
} finally {
els.micBtn.disabled = !state.connected;
}
});
// Picking another microphone records the choice; if capture is running it
// is restarted so the new device takes effect.
els.micSelect.addEventListener("change", async () => {
state.selectedMicDeviceId = els.micSelect.value;
if (!state.micEnabled) return;
els.micSelect.disabled = true;
els.micBtn.disabled = true;
try {
stopMic();
await startMic();
} catch (err) {
console.error("Mic switch error", err);
addBubble("system", `Mic switch error: ${err.message || err}`);
} finally {
setMicButton();
setMicSelectEnabled();
}
});
// Keep the device list current when devices change, where the browser
// supports the event.
if (navigator.mediaDevices?.addEventListener) {
navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}
els.clearBtn.addEventListener("click", () => {
clearChat();
});
els.clearWsLogBtn.addEventListener("click", () => {
clearWsLog();
});
/**
 * Resize the text input to fit its content, up to 180px tall.
 */
function autosizeTextarea() {
  const input = els.textInput;
  // Collapse first so scrollHeight reflects the content, not the old height.
  input.style.height = "auto";
  const fitted = Math.min(input.scrollHeight, 180);
  input.style.height = `${fitted}px`;
}
/**
 * Send the composer's current text. On success the input is cleared,
 * resized and the send button state refreshed; when sendText() declines,
 * the input is left as it was.
 */
function submitText() {
  const sent = sendText(els.textInput.value);
  if (!sent) return;

  els.textInput.value = "";
  autosizeTextarea();
  setComposerEnabled(state.connected);
}
// Submitting the composer form sends the text instead of navigating.
els.composer.addEventListener("submit", (event) => {
event.preventDefault();
submitText();
});
// Typing resizes the input and re-evaluates the send button state.
els.textInput.addEventListener("input", () => {
autosizeTextarea();
setComposerEnabled(state.connected);
});
// Enter sends; Shift+Enter falls through to the default behavior, and Enter
// during IME composition is left alone.
els.textInput.addEventListener("keydown", (event) => {
if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
event.preventDefault();
submitText();
}
});
// Best-effort release of the socket and audio context when the page unloads.
window.addEventListener("beforeunload", () => {
if (state.ws) {
try {
state.ws.close();
} catch (_) {
/* ignore */
}
}
if (state.audioContext) {
try {
state.audioContext.close();
} catch (_) {
/* ignore */
}
}
});
// Initial UI sync for the disconnected state.
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);