Files
engine-v5-pipecat-core/examples/webpage/app.js
2026-06-02 08:24:53 +08:00

1492 lines
42 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Minimal browser client for the AI VideoAssistant engine's product
* websocket (`/ws-product`, protocol `va.ws.v1`).
*
* Responsibilities:
* - Open/close the websocket and run the session handshake.
* - List/select microphones and capture mic audio with browser AEC enabled.
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
* as binary websocket messages.
* - Play `response.audio.delta` frames gaplessly through Web Audio.
* - Render a chat-style history of user transcripts and bot text deltas.
* - Collapse high-frequency audio frames into expandable websocket log groups.
*/
// Audio format used for both capture and playback in this file: the value is
// passed to the recorder worklet as `targetSampleRate` and to
// `createBuffer()` when scheduling bot audio.
const SAMPLE_RATE = 16000;
// Channel count used when creating playback buffers.
const CHANNELS = 1;
// Frame duration in milliseconds, passed to the recorder worklet as `frameMs`.
const FRAME_MS = 20;
// Protocol identifier named in the file header; its use is not visible in
// this part of the file — NOTE(review): confirm where it is sent.
const PROTOCOL = "va.ws.v1";
// Upper bound on top-level children kept in the websocket log; `trimWsLog`
// removes the oldest entries beyond this.
const MAX_WS_LOG_LINES = 120;
// Upper bound on child rows rendered inside one expanded log group.
const MAX_GROUP_CHILDREN_RENDER = 100;
// Keys identifying which kind of high-frequency event a collapsible log
// group is accumulating. Consecutive events with the same key share a group.
const WS_LOG_GROUP_KEYS = {
AUDIO_DELTA: "recv:response.audio.delta",
TEXT_DELTA: "recv:response.text.delta",
AUDIO_SEND: "send:input.audio",
};
// Text sent alongside an image when the user confirms a camera step.
const CAMERA_DONE_TEXT = "【拍摄完成】";
// Sample images shown as thumbnails under the camera preview. Same-origin files
// so they can be drawn to a canvas (for base64 + dimensions) without tainting.
const SAMPLE_IMAGES = [
{ src: "./samples/front-damage.jpg", label: "车辆前部" },
{ src: "./samples/plate.jpg", label: "车牌" },
{ src: "./samples/license.jpg", label: "驾驶证" },
{ src: "./samples/scene.jpg", label: "事故现场" },
];
// Cap the longer edge before JPEG-encoding so payloads stay small.
const IMAGE_MAX_DIM = 1280;
// Quality argument passed to `canvas.toDataURL("image/jpeg", ...)`.
const IMAGE_JPEG_QUALITY = 0.85;
// Assistant state values that open the camera drawer, mapped to the prompt
// shown in it. `syncCameraDrawer` looks the current state up here; any state
// without an entry closes the drawer.
const CAMERA_STATE_PROMPTS = {
2000: "请对准车辆碰撞部位拍摄照片。",
2001: "请对准车辆碰撞部位拍摄照片。",
2002: "请对准被撞物品拍摄照片。",
2003: "请切换摄像头对准本人拍摄一张正面照片。",
2010: "请对准第一辆车碰撞部位拍摄。",
2011: "请对准第一辆车碰撞部位拍摄。",
2012: "请对准第二辆车碰撞部位拍摄。",
2013: "请对准第二方车辆侧后方,看清车牌拍摄。",
2014: "请拍摄另一方驾驶人的正面照片。",
2015: "请切换前置摄像头对准本人拍摄一张正面照片。",
};
/**
 * Build the default product websocket URL from the page's own origin.
 *
 * @returns {string} `wss://<host>/ws-product` when the page is served over
 *   https, otherwise `ws://<host>/ws-product`.
 */
function defaultWsUrl() {
  const isSecurePage = location.protocol === "https:";
  const wsScheme = isSecurePage ? "wss:" : "ws:";
  return [wsScheme, "//", location.host, "/ws-product"].join("");
}
// Cached references to the page elements this client reads and updates,
// looked up once when the script runs. Each entry is whatever
// `getElementById` / `querySelector` returned at that time, so an entry is
// null if the element is missing from the page.
const els = {
url: document.getElementById("ws-url"),
chatId: document.getElementById("chat-id"),
copyChatIdBtn: document.getElementById("copy-chat-id-btn"),
connectBtn: document.getElementById("connect-btn"),
statusDot: document.getElementById("status-dot"),
statusText: document.getElementById("status-text"),
conversation: document.getElementById("conversation"),
chatLog: document.getElementById("chat-log"),
micBtn: document.getElementById("mic-btn"),
micSelect: document.getElementById("mic-select"),
micLabel: document.querySelector(".mic-btn__label"),
micIndicator: document.getElementById("mic-indicator"),
botIndicator: document.getElementById("bot-indicator"),
stateIndicator: document.getElementById("state-indicator"),
stateLabel: document.getElementById("state-label"),
cameraDrawer: document.getElementById("camera-drawer"),
cameraState: document.getElementById("camera-state"),
cameraQuestion: document.getElementById("camera-question"),
cameraDoneBtn: document.getElementById("camera-done-btn"),
cameraPreview: document.getElementById("camera-preview"),
cameraVideo: document.getElementById("camera-video"),
cameraPhoto: document.getElementById("camera-photo"),
cameraCanvas: document.getElementById("camera-canvas"),
cameraStartBtn: document.getElementById("camera-start-btn"),
cameraFlipBtn: document.getElementById("camera-flip-btn"),
cameraUpload: document.getElementById("camera-upload"),
cameraSamples: document.getElementById("camera-samples"),
clearBtn: document.getElementById("clear-btn"),
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
wsLog: document.getElementById("ws-log"),
meterFill: document.getElementById("meter-fill"),
composer: document.getElementById("composer"),
textInput: document.getElementById("text-input"),
sendBtn: document.getElementById("send-btn"),
};
function generateChatId() {
if (typeof crypto !== "undefined" && crypto.randomUUID) {
return `voice_${crypto.randomUUID().replaceAll("-", "").slice(0, 16)}`;
}
return `voice_${Date.now().toString(36)}${Math.random()
.toString(36)
.slice(2, 10)}`;
}
/**
 * Read the chat id typed into the chat-id input.
 *
 * @returns {string} The trimmed input value, or "" when the field is empty.
 */
function currentChatIdInput() {
  const typed = els.chatId.value || "";
  return typed.trim();
}
/**
 * Return the websocket URL from the URL input with `chatId` set as a query
 * parameter.
 *
 * @param {string} chatId - Chat id to attach; when falsy the URL is returned
 *   unchanged.
 * @returns {string} The trimmed URL (possibly "") when either the URL or the
 *   chat id is missing, otherwise the URL carrying `chatId=<id>`.
 */
function wsUrlWithChatId(chatId) {
  const typedUrl = (els.url.value || "").trim();
  if (typedUrl === "" || !chatId) {
    return typedUrl;
  }
  try {
    // Resolve against the page so relative inputs still parse.
    const parsed = new URL(typedUrl, location.href);
    parsed.searchParams.set("chatId", chatId);
    return parsed.href;
  } catch (_) {
    // Unparseable input: append the parameter by hand.
    const joiner = typedUrl.includes("?") ? "&" : "?";
    return typedUrl + joiner + "chatId=" + encodeURIComponent(chatId);
  }
}
// Single mutable record of everything the client tracks at runtime:
// connection status, microphone capture nodes, playback scheduling, chat and
// camera state, and the websocket-log grouping cursor. The functions in this
// file read and write it directly.
const state = {
// Websocket connection and the chat id used for it.
ws: null,
connected: false,
connecting: false,
chatId: "",
// Web Audio context plus the microphone capture nodes and device list.
audioContext: null,
micStream: null,
micSourceNode: null,
recorderNode: null,
micEnabled: false,
micDevices: [],
selectedMicDeviceId: "",
// Output scheduling.
nextPlaybackTime: 0,
playbackEndsAt: 0,
scheduledSources: [],
botActive: false,
botUiTimer: null,
// Chat state.
currentAssistantBubble: null,
assistantState: "",
cameraState: "",
// Camera / image input.
cameraStream: null,
cameraActive: false,
cameraFacing: "environment",
pendingImage: null,
samplesRendered: false,
// VU meter smoothing.
meterLevel: 0,
// Collapsible websocket log groups for high-frequency audio frames.
wsLogGroup: null,
};
/* ------------------------------------------------------------------ UI */
/**
 * Update the connection status dot and its caption.
 *
 * @param {string} kind - Suffix for the `status__dot--<kind>` modifier class.
 * @param {string} text - Caption shown next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  statusDot.className = ["status__dot", `status__dot--${kind}`].join(" ");
  statusText.textContent = text;
}
/**
 * Sync the connect button, chat-id input, and copy button with the current
 * connection state (`state.connecting` / `state.connected` / `state.chatId`).
 */
function setConnectButton() {
  const { connected, connecting, chatId } = state;
  const button = els.connectBtn;

  // The chat id is locked while a session is open or being opened.
  els.chatId.disabled = connected || connecting;
  els.copyChatIdBtn.disabled = !connected || !chatId;

  let caption = "Connect";
  let locked = false;
  let showsDisconnect = false;
  if (connecting) {
    caption = "Connecting…";
    locked = true;
  } else if (connected) {
    caption = "Disconnect";
    showsDisconnect = true;
  }

  button.textContent = caption;
  button.disabled = locked;
  if (showsDisconnect) {
    button.classList.add("is-disconnect");
  } else {
    button.classList.remove("is-disconnect");
  }
}
/**
 * Copy the active chat id to the clipboard and briefly flag the copy button.
 *
 * Tries the async Clipboard API first. If that is unavailable or rejects,
 * falls back to selecting the chat-id input and issuing the legacy "copy"
 * command. The input is temporarily enabled for the fallback and always
 * restored to its previous disabled state, even when the fallback throws.
 * The "copied" feedback is shown only when a copy actually succeeded.
 *
 * Does nothing when there is no connected session or no chat id.
 *
 * @returns {Promise<void>}
 */
async function copyChatId() {
  if (!state.connected || !state.chatId) return;

  let copied = false;
  try {
    await navigator.clipboard.writeText(state.chatId);
    copied = true;
  } catch (_) {
    const input = els.chatId;
    const wasDisabled = input.disabled;
    const { selectionStart, selectionEnd } = input;
    // A disabled input cannot be selected, so enable it just for the copy.
    input.disabled = false;
    try {
      input.select();
      // execCommand reports failure through its return value, not a throw.
      copied = Boolean(document.execCommand("copy"));
      input.setSelectionRange(selectionStart, selectionEnd);
    } catch (err) {
      console.warn("Could not copy chat id", err);
    } finally {
      input.disabled = wasDisabled;
    }
  }
  if (!copied) return;

  els.copyChatIdBtn.classList.add("copied");
  window.setTimeout(() => {
    els.copyChatIdBtn.classList.remove("copied");
  }, 1200);
}
/**
 * Refresh the mic toggle button, its label, and the mic indicator from
 * `state.connected` and `state.micEnabled`.
 */
function setMicButton() {
  const micOn = state.micEnabled;
  const button = els.micBtn;
  button.disabled = !state.connected;
  button.setAttribute("aria-pressed", micOn ? "true" : "false");
  if (micOn) {
    button.title = "Mute mic";
    els.micLabel.textContent = "Mute mic";
  } else {
    button.title = "Unmute mic";
    els.micLabel.textContent = "Enable mic";
  }
  els.micIndicator.classList.toggle("is-active", micOn);
}
/**
 * Enable the microphone picker only while connected and when the browser
 * exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const pickerUsable = state.connected && navigator.mediaDevices;
  els.micSelect.disabled = !pickerUsable;
}
/**
 * Enable or disable the text composer. The send button additionally stays
 * disabled while the input holds only whitespace. Also refreshes the camera
 * submit button.
 *
 * @param {boolean} enabled - Whether text input should be accepted.
 */
function setComposerEnabled(enabled) {
  const blocked = !enabled;
  els.textInput.disabled = blocked;
  els.sendBtn.disabled = blocked || els.textInput.value.trim() === "";
  setCameraButtonEnabled();
}
/**
 * Turn the "bot is speaking" indicator on or off.
 *
 * @param {boolean} active - Whether the indicator should show as active.
 */
function setBotIndicator(active) {
  const indicator = els.botIndicator;
  indicator.classList.toggle("is-active", active);
}
/**
 * Record the assistant's workflow state and reflect it in the state
 * indicator and the camera drawer.
 *
 * Non-string values are treated as "no state". The visible label is capped
 * at 32 characters; a longer state is cut to 31 characters plus an ellipsis
 * so the truncation is visible. The tooltip always carries the full text.
 *
 * @param {unknown} value - State reported by the engine.
 */
function setAssistantState(value) {
  const MAX_LABEL_CHARS = 32;
  const text = typeof value === "string" ? value.trim() : "";
  const label =
    text.length > MAX_LABEL_CHARS
      ? `${text.slice(0, MAX_LABEL_CHARS - 1)}…`
      : text;
  state.assistantState = text;
  els.stateIndicator.classList.toggle("is-active", Boolean(text));
  els.stateLabel.textContent = label ? `State ${label}` : "State -";
  els.stateIndicator.title = label ? `Assistant state: ${text}` : "Assistant state";
  // The camera drawer opens or closes based on the full state value.
  syncCameraDrawer(text);
}
/**
 * Enable the camera "done" button only when the websocket is open, a camera
 * step is active, and there is an image source (live camera or a pending
 * image). No-op when the button is not on the page.
 */
function setCameraButtonEnabled() {
  const button = els.cameraDoneBtn;
  if (!button) return;
  const socketOpen =
    state.connected && state.ws && state.ws.readyState === WebSocket.OPEN;
  const imageAvailable = state.cameraActive || Boolean(state.pendingImage);
  button.disabled = !(socketOpen && state.cameraState && imageAvailable);
}
/**
 * Open or close the camera drawer for the given assistant state.
 *
 * The drawer opens only when `value` is one of `CAMERA_STATE_PROMPTS`' own
 * keys. An own-property check is used so that state strings which happen to
 * match inherited object members (e.g. "constructor", "toString") are not
 * mistaken for camera steps. Closing a previously open drawer resets the
 * camera input.
 *
 * @param {string} value - Current assistant state.
 */
function syncCameraDrawer(value) {
  const isCameraStep =
    value != null && Object.hasOwn(CAMERA_STATE_PROMPTS, value);
  const prompt = isCameraStep ? CAMERA_STATE_PROMPTS[value] : undefined;
  const open = Boolean(prompt);
  const wasOpen = Boolean(state.cameraState);
  state.cameraState = open ? value : "";
  els.cameraDrawer.classList.toggle("is-open", open);
  els.conversation.classList.toggle("has-camera", open);
  els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
  if (open) {
    els.cameraState.textContent = `State ${value}`;
    els.cameraQuestion.textContent = prompt;
    renderSampleThumbnails();
  } else {
    els.cameraState.textContent = "State -";
    els.cameraQuestion.textContent = "";
    // Only tear the camera down on an open -> closed transition.
    if (wasOpen) resetCameraInput();
  }
  setCameraButtonEnabled();
}
/**
 * Replace the camera drawer's question text, but only while a camera step
 * is active and the new text is non-empty after trimming.
 *
 * @param {unknown} text - Candidate question text; non-strings are ignored.
 */
function updateCameraQuestion(text) {
  const question = typeof text === "string" ? text.trim() : "";
  const drawerActive = Boolean(state.cameraState);
  if (drawerActive && question) {
    els.cameraQuestion.textContent = question;
  }
}
/**
 * Append a text bubble to the chat log and scroll it into view.
 *
 * Clears the empty-state placeholder first if it is present. Bubbles for any
 * role other than "system" get a role tag ("You" for "user", otherwise
 * "Assistant").
 *
 * @param {string} role - Bubble role, used in the `bubble--<role>` class.
 * @param {string} text - Initial bubble text.
 * @returns {HTMLElement} The created bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  const placeholder = log.querySelector(".chat__empty");
  if (placeholder) {
    log.innerHTML = "";
  }

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    const who = role === "user" ? "You" : "Assistant";
    bubble.appendChild(makeSpan("bubble__role", who));
  }
  bubble.appendChild(makeSpan("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append one chat bubble that holds an image followed by (possibly empty)
 * text, then scroll the chat to the bottom.
 *
 * Clears the empty-state placeholder first if it is present. Bubbles for any
 * role other than "system" get a role tag.
 *
 * @param {string} role - Bubble role, used in the `bubble--<role>` class.
 * @param {string} imageUrl - Source for the image element.
 * @param {string} [text] - Caption; also used as the image's alt text
 *   ("image" when empty).
 * @returns {HTMLElement} The created bubble element.
 */
function addImageBubble(role, imageUrl, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;

  if (role !== "system") {
    const roleTag = document.createElement("span");
    roleTag.className = "bubble__role";
    roleTag.textContent = role === "user" ? "You" : "Assistant";
    bubble.appendChild(roleTag);
  }

  const picture = document.createElement("img");
  picture.className = "bubble__image";
  picture.src = imageUrl;
  picture.alt = text || "image";
  bubble.appendChild(picture);

  const caption = document.createElement("span");
  caption.className = "bubble__text";
  caption.textContent = text || "";
  bubble.appendChild(caption);

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append text to an existing bubble's text span and keep the chat scrolled
 * to the bottom.
 *
 * @param {HTMLElement} bubble - Bubble containing a `.bubble__text` element.
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const textSpan = bubble.querySelector(".bubble__text");
  textSpan.textContent = textSpan.textContent + text;
  scrollChatToBottom();
}
/** Scroll the chat log so the most recent bubble is visible. */
function scrollChatToBottom() {
  const log = els.chatLog;
  log.scrollTop = log.scrollHeight;
}
/**
 * Empty the chat log, forget the in-progress assistant bubble, reset the
 * assistant state, and show a "Chat cleared." placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;
  setAssistantState("");

  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>Chat cleared.</p>";
  log.appendChild(placeholder);
}
/**
 * Stringify a value and shorten it for display in the websocket log.
 *
 * Values that fit within `maxLength` are returned unchanged. Longer values
 * are cut to `maxLength - 1` characters followed by an ellipsis, so the
 * result is exactly `maxLength` characters and the truncation is visible.
 * A non-positive `maxLength` yields "" (previously it produced a negative
 * slice index and returned almost the entire string).
 *
 * @param {unknown} value - Value to display.
 * @param {number} [maxLength=160] - Maximum length of the returned string.
 * @returns {string} The possibly-shortened text.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  if (maxLength <= 0) return "";
  return `${text.slice(0, maxLength - 1)}…`;
}
/**
 * Format a timestamp for the websocket log as a 24-hour, two-digit
 * hour/minute/second string in the runtime's default locale.
 *
 * @param {Date} [date=new Date()] - Time to format.
 * @returns {string} The formatted time.
 */
function formatLogTime(date = new Date()) {
  const clockOptions = {
    hour12: false,
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  };
  return date.toLocaleTimeString([], clockOptions);
}
/**
 * Render a byte count for the websocket log.
 *
 * @param {number} byteCount - Number of bytes.
 * @returns {string} `"<n> bytes"` below 1 KiB, `"<n.n> KB"` below 1 MiB,
 *   otherwise `"<n.nn> MB"` (both using 1024-based units).
 */
function formatLogBytes(byteCount) {
  const KIB = 1024;
  const MIB = 1048576;
  if (byteCount < KIB) {
    return `${byteCount} bytes`;
  }
  if (byteCount < MIB) {
    const kib = byteCount / KIB;
    return `${kib.toFixed(1)} KB`;
  }
  const mib = byteCount / MIB;
  return `${mib.toFixed(2)} MB`;
}
/**
 * Map a websocket log group key to the label shown in the group summary.
 *
 * @param {string} groupKey - One of the `WS_LOG_GROUP_KEYS` values.
 * @returns {string} The label, or "grouped events" for an unknown key.
 */
function wsLogGroupLabel(groupKey) {
  switch (groupKey) {
    case WS_LOG_GROUP_KEYS.AUDIO_DELTA:
      return "response.audio.delta";
    case WS_LOG_GROUP_KEYS.TEXT_DELTA:
      return "response.text.delta";
    case WS_LOG_GROUP_KEYS.AUDIO_SEND:
      return "input.audio binary";
    default:
      return "grouped events";
  }
}
/** Remove the "no events yet" placeholder from the websocket log, if shown. */
function ensureWsLogReady() {
  const log = els.wsLog;
  const placeholder = log.querySelector(".ws-log__empty");
  if (!placeholder) return;
  log.innerHTML = "";
}
/** Scroll the websocket log so the newest entry is visible. */
function scrollWsLogToBottom() {
  const log = els.wsLog;
  log.scrollTop = log.scrollHeight;
}
/**
 * Drop the oldest top-level websocket log entries until at most
 * `MAX_WS_LOG_LINES` remain. If the entry being dropped is the currently
 * open group, the group cursor is cleared so later items start a new group.
 */
function trimWsLog() {
  const log = els.wsLog;
  while (log.children.length > MAX_WS_LOG_LINES) {
    const oldest = log.firstElementChild;
    const openGroup = state.wsLogGroup;
    if (openGroup && openGroup.element === oldest) {
      state.wsLogGroup = null;
    }
    oldest.remove();
  }
}
/**
 * Close the currently open websocket log group so the next grouped item
 * starts a new one.
 */
function finalizeWsLogGroup() {
  Object.assign(state, { wsLogGroup: null });
}
/**
 * Build (but do not attach) one websocket log row with time, direction, and
 * detail columns.
 *
 * @param {string} direction - "send", "recv", or another tag that is shown
 *   upper-cased.
 * @param {string} detail - Text for the detail column.
 * @param {string} kind - Suffix for the `ws-log__entry--<kind>` class.
 * @param {string} [timeText=formatLogTime()] - Preformatted timestamp.
 * @returns {HTMLDivElement} The row element.
 */
function createWsLogEntry(direction, detail, kind, timeText = formatLogTime()) {
  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  let directionLabel;
  if (direction === "send") {
    directionLabel = "SEND";
  } else if (direction === "recv") {
    directionLabel = "RECV";
  } else {
    directionLabel = direction.toUpperCase();
  }

  const row = document.createElement("div");
  row.className = `ws-log__entry ws-log__entry--${kind}`;
  row.append(
    makeSpan("ws-log__time", timeText),
    makeSpan("ws-log__direction", directionLabel),
    makeSpan("ws-log__detail", detail),
  );
  return row;
}
/**
 * Refresh a log group's header summary: label, item count, and total bytes.
 *
 * @param {object} group - Group record with `key`, `count`, `totalBytes`,
 *   and `summaryEl`.
 */
function updateWsLogGroupSummary(group) {
  const label = wsLogGroupLabel(group.key);
  const size = formatLogBytes(group.totalBytes);
  group.summaryEl.textContent = `${label} ×${group.count} (${size})`;
}
/**
 * Append one child row to an expanded log group, keeping at most
 * `MAX_GROUP_CHILDREN_RENDER` rows in the DOM.
 *
 * When the cap is exceeded, the oldest row is removed and an
 * "earlier events omitted" marker is inserted at the top if one is not
 * already present.
 *
 * @param {object} group - Group record (`direction`, `kind`, `childrenEl`).
 * @param {{time: string, detail: string}} item - Item to render.
 */
function appendWsLogGroupChildDom(group, item) {
  const container = group.childrenEl;
  const row = createWsLogEntry(group.direction, item.detail, group.kind, item.time);
  row.classList.add("ws-log__entry--child");
  container.appendChild(row);

  const rows = container.querySelectorAll(".ws-log__entry");
  if (rows.length <= MAX_GROUP_CHILDREN_RENDER) return;

  const existingMarker = container.querySelector(".ws-log__group-omit");
  if (!existingMarker) {
    const marker = document.createElement("div");
    marker.className = "ws-log__group-omit";
    marker.textContent = "… earlier events omitted";
    container.insertBefore(marker, container.firstElementChild);
  }
  rows[0].remove();
}
/**
 * Rebuild a log group's child list from its stored items, rendering only
 * the most recent `MAX_GROUP_CHILDREN_RENDER` and prefixing a note with the
 * number of older items left out.
 *
 * @param {object} group - Group record with `items` and `childrenEl`.
 */
function renderWsLogGroupChildren(group) {
  const container = group.childrenEl;
  container.innerHTML = "";

  const allItems = group.items;
  const firstShown = Math.max(0, allItems.length - MAX_GROUP_CHILDREN_RENDER);
  if (firstShown > 0) {
    const note = document.createElement("div");
    note.className = "ws-log__group-omit";
    note.textContent = `${firstShown} earlier events omitted`;
    container.appendChild(note);
  }

  for (const item of allItems.slice(firstShown)) {
    appendWsLogGroupChildDom(group, item);
  }
}
/**
 * Expand or collapse a websocket log group. Children are rendered lazily
 * the first time a group with no rendered children is expanded.
 *
 * @param {object} group - Group record to toggle.
 */
function toggleWsLogGroup(group) {
  const nowCollapsed = !group.collapsed;
  group.collapsed = nowCollapsed;
  group.childrenEl.hidden = nowCollapsed;
  if (nowCollapsed) {
    group.chevronEl.textContent = "▶";
    group.headerEl.setAttribute("aria-expanded", "false");
  } else {
    group.chevronEl.textContent = "▼";
    group.headerEl.setAttribute("aria-expanded", "true");
  }
  const needsRender = !nowCollapsed && group.childrenEl.childElementCount === 0;
  if (needsRender) {
    renderWsLogGroupChildren(group);
  }
}
/**
 * Add one high-frequency event to the websocket log, folding it into the
 * currently open group when the group key matches, or starting a new
 * collapsed group otherwise.
 *
 * Every item is stored on the group; a DOM row is only added immediately
 * when the group is expanded. Afterwards the log is trimmed and scrolled to
 * the bottom.
 *
 * @param {string} groupKey - Key identifying the kind of grouped event.
 * @param {string} direction - "send" or anything else (shown as RECV).
 * @param {string} kind - Suffix for the group's/rows' modifier class.
 * @param {string} itemDetail - Detail text for this item.
 * @param {number} [byteCount=0] - Bytes to add to the group's total.
 */
function appendWsLogGroupItem(groupKey, direction, kind, itemDetail, byteCount = 0) {
  ensureWsLogReady();

  const openGroup = state.wsLogGroup;
  const group =
    openGroup && openGroup.key === groupKey ? openGroup : startGroup();

  group.count += 1;
  group.totalBytes += byteCount;
  const item = { time: formatLogTime(), detail: itemDetail };
  group.items.push(item);
  updateWsLogGroupSummary(group);
  if (!group.collapsed) {
    appendWsLogGroupChildDom(group, item);
  }
  trimWsLog();
  scrollWsLogToBottom();

  // Build a new collapsed group, attach it to the log, and make it current.
  function startGroup() {
    finalizeWsLogGroup();

    const makeSpan = (className) => {
      const span = document.createElement("span");
      span.className = className;
      return span;
    };

    const groupEl = document.createElement("div");
    groupEl.className = `ws-log__group ws-log__group--${kind}`;

    const headerEl = document.createElement("button");
    headerEl.type = "button";
    headerEl.className = "ws-log__group-header";
    headerEl.setAttribute("aria-expanded", "false");

    const timeEl = makeSpan("ws-log__time");
    timeEl.textContent = formatLogTime();

    const directionEl = makeSpan("ws-log__direction");
    directionEl.textContent = direction === "send" ? "SEND" : "RECV";

    const chevronEl = makeSpan("ws-log__group-chevron");
    chevronEl.textContent = "▶";
    chevronEl.setAttribute("aria-hidden", "true");

    const summaryEl = makeSpan("ws-log__group-summary");
    headerEl.append(timeEl, directionEl, chevronEl, summaryEl);

    const childrenEl = document.createElement("div");
    childrenEl.className = "ws-log__group-children";
    childrenEl.hidden = true;

    groupEl.append(headerEl, childrenEl);
    els.wsLog.appendChild(groupEl);

    const created = {
      key: groupKey,
      direction,
      kind,
      element: groupEl,
      headerEl,
      chevronEl,
      summaryEl,
      childrenEl,
      collapsed: true,
      count: 0,
      totalBytes: 0,
      items: [],
    };
    state.wsLogGroup = created;
    headerEl.addEventListener("click", () => toggleWsLogGroup(created));
    return created;
  }
}
/**
 * Produce a short, log-friendly JSON string for a websocket payload.
 *
 * Works on a shallow copy: string `audio` / `image` fields are replaced by
 * a length placeholder, a string `data` field longer than 160 characters is
 * replaced by a length placeholder, and a string `text` field is passed
 * through `truncateLogValue`.
 *
 * @param {unknown} payload - Parsed message; non-objects are stringified.
 * @returns {string} JSON text, or the payload's `type` (or a fixed message)
 *   when it cannot be serialized.
 */
function compactWsPayload(payload) {
  const isObject = Boolean(payload) && typeof payload === "object";
  if (!isObject) {
    return String(payload);
  }

  const summary = { ...payload };
  for (const field of ["audio", "image"]) {
    const raw = summary[field];
    if (typeof raw === "string") {
      summary[field] = `<base64 ${raw.length} chars>`;
    }
  }
  const { data, text } = summary;
  if (typeof data === "string" && data.length > 160) {
    summary.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    summary.text = truncateLogValue(text);
  }

  try {
    return JSON.stringify(summary);
  } catch (_) {
    return payload.type || "unserializable websocket payload";
  }
}
/**
 * Append a standalone (ungrouped) row to the websocket log. Any open group
 * is closed first so later grouped items start a fresh group below this row.
 *
 * @param {string} direction - Direction tag for the row.
 * @param {string} detail - Detail text.
 * @param {string} [kind=direction] - Modifier class suffix for the row.
 */
function addWsLog(direction, detail, kind = direction) {
  finalizeWsLogGroup();
  ensureWsLogReady();
  const row = createWsLogEntry(direction, detail, kind);
  els.wsLog.appendChild(row);
  trimWsLog();
  scrollWsLogToBottom();
}
/**
 * Log a parsed websocket message. Received `response.audio.delta` and
 * `response.text.delta` events are folded into collapsible groups;
 * everything else becomes a standalone row with a compacted payload.
 *
 * @param {string} direction - "send" or "recv".
 * @param {object} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  const inboundType = direction === "recv" ? payload?.type : undefined;

  if (inboundType === "response.audio.delta") {
    const size = payload.bytes || 0;
    const seqTag = payload.seq != null ? `seq=${payload.seq} ` : "";
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.AUDIO_DELTA,
      "recv",
      "recv",
      `${seqTag}(${size} bytes)`,
      size,
    );
    return;
  }

  if (inboundType === "response.text.delta") {
    const delta = typeof payload.text === "string" ? payload.text : "";
    // Size is the UTF-8 length of the delta text.
    const size = new TextEncoder().encode(delta).length;
    const quoted = JSON.stringify(truncateLogValue(delta, 120));
    const seqTag = payload.seq != null ? `seq=${payload.seq} ` : "";
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.TEXT_DELTA,
      "recv",
      "recv",
      `${seqTag}${quoted}`,
      size,
    );
    return;
  }

  addWsLog(direction, compactWsPayload(payload));
}
/**
 * Log an outgoing binary audio frame into the "input.audio" send group.
 *
 * @param {number} byteLength - Size of the frame in bytes.
 */
function logBinarySend(byteLength) {
  const detail = `(${byteLength} bytes)`;
  const groupKey = WS_LOG_GROUP_KEYS.AUDIO_SEND;
  appendWsLogGroupItem(groupKey, "send", "send", detail, byteLength);
}
/**
 * Send data over the websocket and mirror it into the websocket log.
 *
 * Strings are logged as parsed JSON when possible, otherwise as truncated
 * raw text. ArrayBuffers and typed-array views with a non-zero size are
 * logged as binary audio frames.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send.
 * @returns {boolean} false when there is no open socket (nothing is sent or
 *   logged), true after the message has been handed to the socket.
 */
function wsSend(data) {
  const socket = state.ws;
  const socketOpen = Boolean(socket) && socket.readyState === WebSocket.OPEN;
  if (!socketOpen) {
    return false;
  }

  if (typeof data !== "string") {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const size = isBinary ? data.byteLength : 0;
    if (size > 0) {
      logBinarySend(size);
    }
  } else {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      addWsLog("send", truncateLogValue(data));
    }
  }

  state.ws.send(data);
  return true;
}
/**
 * Reset the websocket log to its empty-state placeholder and forget the
 * open group.
 */
function clearWsLog() {
  const placeholderHtml =
    '<div class="ws-log__empty">No websocket events yet.</div>';
  state.wsLogGroup = null;
  els.wsLog.innerHTML = placeholderHtml;
}
/* ---------------------------------------------------------------- Audio */
/**
 * Return the shared AudioContext, creating it (and loading the PCM recorder
 * worklet module) on first use, and resuming it if it is suspended.
 *
 * If the worklet module fails to load, the half-initialized context is
 * closed and discarded so that a later call retries from scratch instead of
 * reusing a context that has no recorder worklet registered.
 *
 * @returns {Promise<AudioContext>} The ready audio context.
 * @throws {Error} When the browser has no AudioContext implementation, or
 *   when loading the worklet module / resuming the context fails.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    if (!Ctx) {
      throw new Error("Web Audio API is not available in this browser");
    }
    const ctx = new Ctx();
    state.audioContext = ctx;
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      // Do not cache a context without the worklet; let the next call retry.
      if (state.audioContext === ctx) {
        state.audioContext = null;
      }
      try {
        await ctx.close();
      } catch (_) {
        /* best effort: the original failure is what matters */
      }
      throw err;
    }
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
/**
 * Rebuild the microphone picker from `state.micDevices`, keeping the
 * previous selection when that device is still listed and falling back to
 * the "Default microphone" entry otherwise.
 */
function renderMicDevices() {
  const picker = els.micSelect;
  const rememberedId = state.selectedMicDeviceId || picker.value;

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    picker.appendChild(option);
  };

  picker.innerHTML = "";
  addOption("", "Default microphone");
  for (const [index, device] of state.micDevices.entries()) {
    // Labels may be empty (e.g. before permission is granted).
    addOption(device.deviceId, device.label || `Microphone ${index + 1}`);
  }

  const stillListed = state.micDevices.some(
    ({ deviceId }) => deviceId === rememberedId,
  );
  state.selectedMicDeviceId = stillListed ? rememberedId : "";
  picker.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
/**
 * Re-enumerate audio input devices and re-render the microphone picker.
 *
 * When device enumeration is unsupported or fails, only the picker's
 * enabled state is refreshed; failures are logged as warnings and never
 * thrown.
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const canEnumerate = Boolean(navigator.mediaDevices?.enumerateDevices);
  if (!canEnumerate) {
    setMicSelectEnabled();
    return;
  }
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    state.micDevices = allDevices.filter(({ kind }) => kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
/**
 * Start microphone capture and stream PCM frames over the websocket.
 *
 * Requests a mono mic stream with echo cancellation, noise suppression, and
 * auto gain control (pinned to the selected device when one is chosen),
 * feeds it into the `pcm-recorder` AudioWorklet, and forwards each emitted
 * frame to the websocket while connected. The VU meter is updated from each
 * frame's RMS value.
 *
 * If anything fails after the mic stream has been acquired, the partially
 * built capture graph is torn down via `stopMic()` so the microphone is
 * released, and the error is rethrown.
 *
 * @returns {Promise<void>}
 * @throws Whatever `ensureAudioContext`, `getUserMedia`, or the audio graph
 *   setup throws.
 */
async function startMic() {
  const ctx = await ensureAudioContext();
  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }
  state.micStream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
    video: false,
  });
  try {
    // Refresh after permission is granted so the picker reflects the
    // current device list.
    await refreshMicDevices();
    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // Release the already-acquired mic tracks and any nodes created so far.
    stopMic();
    throw err;
  }
  state.micEnabled = true;
  addWsLog("system", "mic capture started (binary input.audio frames)");
  setMicButton();
}
/**
 * Stop microphone capture and release everything `startMic` set up: the
 * recorder worklet node, the media stream source node, and the mic tracks.
 *
 * Teardown errors are ignored. Resets the VU meter, logs "mic capture
 * stopped" only if capture had actually been enabled, and refreshes the mic
 * button.
 */
function stopMic() {
  const wasEnabled = state.micEnabled;
  const { recorderNode, micSourceNode, micStream } = state;

  if (recorderNode) {
    try {
      recorderNode.port.onmessage = null;
      recorderNode.disconnect();
    } catch (_) {
      /* ignore */
    }
    state.recorderNode = null;
  }

  if (micSourceNode) {
    try {
      micSourceNode.disconnect();
    } catch (_) {
      /* ignore */
    }
    state.micSourceNode = null;
  }

  if (micStream) {
    micStream.getTracks().forEach((track) => {
      try {
        track.stop();
      } catch (_) {
        /* ignore */
      }
    });
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasEnabled) {
    addWsLog("system", "mic capture stopped");
  }
  setMicButton();
}
/**
 * Update the mic level meter from a frame's RMS value.
 *
 * The RMS is scaled by 2.4 and clamped to 1, then blended 50/50 with the
 * previous level for smoothing; the result sets the meter fill width as a
 * whole-number percentage.
 *
 * @param {number} rms - RMS amplitude of the latest frame.
 */
function updateMeter(rms) {
  const GAIN = 2.4;
  const scaled = Math.min(1, rms * GAIN);
  state.meterLevel = state.meterLevel * 0.5 + scaled * 0.5;
  const percent = Math.round(state.meterLevel * 100);
  els.meterFill.style.width = `${percent}%`;
}
/* ---------------------------------------------------- Bot audio playback */
/**
 * Queue one chunk of bot audio for gapless playback.
 *
 * Converts PCM16 samples to float, wraps them in a one-channel buffer at
 * `SAMPLE_RATE`, and starts it right after the previously scheduled chunk
 * (or 20 ms from now if playback has fallen behind). Also keeps the bot
 * indicator lit until shortly after the queued audio is due to finish.
 *
 * Does nothing when there is no audio context or when the chunk is empty;
 * an empty chunk cannot be turned into an AudioBuffer (a zero-length
 * `createBuffer` call throws), so it is skipped rather than allowed to
 * abort event handling.
 *
 * @param {Int16Array} int16 - PCM16 mono samples to play.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  if (!int16 || int16.length === 0) return;
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Asymmetric divisor maps -32768 to -1 and 32767 to +1.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);
  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);
  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;
  // Forget finished sources so the stop queue only holds pending ones.
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);
  setBotIndicator(true);
  // Re-arm the indicator timer for the new end of the queue.
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
/**
 * Immediately stop all scheduled bot audio, reset the playback clock, cancel
 * the indicator timer, and turn the bot indicator off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];
  resetPlaybackClock();

  const timer = state.botUiTimer;
  if (timer) {
    clearTimeout(timer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
/**
 * Move the playback scheduling cursor and end marker to the audio context's
 * current time. No-op when there is no audio context.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
/* ------------------------------------------------------ Camera / image */
/**
 * Switch the camera preview between showing the live camera, a still photo,
 * or neither.
 *
 * @param {"camera"|"photo"|"idle"} mode - Preview mode; any value other than
 *   "camera" or "photo" clears both modifier classes.
 */
function setPreviewMode(mode) {
  const { classList } = els.cameraPreview;
  for (const candidate of ["camera", "photo"]) {
    classList.toggle(`is-${candidate}`, mode === candidate);
  }
}
/**
 * Draw an <img>/<video> source onto the shared canvas and return a
 * normalized image payload suitable for an `input.image` message.
 *
 * The image is scaled down so its longer edge is at most `IMAGE_MAX_DIM`
 * (never scaled up) and encoded as JPEG at `IMAGE_JPEG_QUALITY`.
 *
 * @param {HTMLImageElement|HTMLVideoElement} source - Media to capture.
 * @returns {{dataUrl: string, mime: string, width: number, height: number}|null}
 *   The payload, or null when the source has no dimensions yet or encoding
 *   fails (the failure is written to the websocket log).
 */
function mediaToPayload(source) {
  const nativeW = source.videoWidth || source.naturalWidth || source.width;
  const nativeH = source.videoHeight || source.naturalHeight || source.height;
  if (!nativeW || !nativeH) {
    return null;
  }

  const longEdge = Math.max(nativeW, nativeH);
  const mustShrink = longEdge > IMAGE_MAX_DIM;
  const ratio = IMAGE_MAX_DIM / longEdge;
  const width = mustShrink ? Math.round(nativeW * ratio) : nativeW;
  const height = mustShrink ? Math.round(nativeH * ratio) : nativeH;

  const canvas = els.cameraCanvas;
  canvas.width = width;
  canvas.height = height;
  canvas.getContext("2d").drawImage(source, 0, 0, width, height);

  try {
    const dataUrl = canvas.toDataURL("image/jpeg", IMAGE_JPEG_QUALITY);
    return { dataUrl, mime: "image/jpeg", width, height };
  } catch (err) {
    addWsLog("system", `image encode failed: ${err.message || err}`);
    return null;
  }
}
/**
 * Store the image that will be sent on the next camera submit. When a
 * payload is given it is also shown in the photo preview. Always refreshes
 * the camera submit button.
 *
 * @param {{dataUrl: string}|null} payload - Image payload, or null to clear.
 */
function setPendingImage(payload) {
  state.pendingImage = payload;
  if (!payload) {
    setCameraButtonEnabled();
    return;
  }
  els.cameraPhoto.src = payload.dataUrl;
  setPreviewMode("photo");
  setCameraButtonEnabled();
}
/**
 * Start (or restart) the live camera preview using the current facing mode.
 *
 * Any existing camera stream is stopped first. On success the pending image
 * is cleared, the preview switches to camera mode, the start button is
 * relabelled, the flip button is shown, and any sample selection is
 * cleared. Failures are written to the websocket log and leave the camera
 * off.
 *
 * @returns {Promise<void>}
 */
async function startCamera() {
  const media = navigator.mediaDevices;
  if (!media || !media.getUserMedia) {
    addWsLog("system", "getUserMedia not available in this browser");
    return;
  }

  stopCameraStream();

  const constraints = {
    video: { facingMode: state.cameraFacing },
    audio: false,
  };
  try {
    state.cameraStream = await navigator.mediaDevices.getUserMedia(constraints);
  } catch (err) {
    addWsLog("system", `camera error: ${err.message || err}`);
    return;
  }

  const video = els.cameraVideo;
  video.srcObject = state.cameraStream;
  try {
    await video.play();
  } catch (_) {
    /* autoplay may resolve later */
  }

  state.cameraActive = true;
  state.pendingImage = null;
  setPreviewMode("camera");

  const startButton = els.cameraStartBtn;
  startButton.classList.add("is-active");
  startButton.textContent = "重新拍摄";
  els.cameraFlipBtn.hidden = false;

  clearSampleSelection();
  setCameraButtonEnabled();
}
/**
 * Stop the live camera (if running), detach it from the video element, and
 * restore the camera controls to their idle appearance.
 */
function stopCameraStream() {
  const stream = state.cameraStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    state.cameraStream = null;
  }
  els.cameraVideo.srcObject = null;
  state.cameraActive = false;

  const startButton = els.cameraStartBtn;
  startButton.classList.remove("is-active");
  startButton.textContent = "使用摄像头";
  els.cameraFlipBtn.hidden = true;
}
/**
 * Grab the current camera frame as an image payload. On success the camera
 * is stopped and the frame becomes the pending image.
 *
 * @returns {object|null} The captured payload, or null when no frame could
 *   be captured (the camera is left running in that case).
 */
function captureFromCamera() {
  const frame = mediaToPayload(els.cameraVideo);
  if (frame) {
    stopCameraStream();
    setPendingImage(frame);
    return frame;
  }
  return null;
}
/**
 * Load a same-origin or object URL into a new Image.
 *
 * @param {string} src - URL to load.
 * @returns {Promise<HTMLImageElement>} Resolves with the image once its
 *   load event fires; rejects with an Error naming the URL if it fails.
 */
function loadImage(src) {
  const image = new Image();
  const loaded = new Promise((resolve, reject) => {
    image.onload = () => {
      resolve(image);
    };
    image.onerror = () => {
      reject(new Error(`failed to load image: ${src}`));
    };
  });
  // Assign the source only after both handlers are attached.
  image.src = src;
  return loaded;
}
/**
 * Use a user-chosen file as the pending image.
 *
 * The file is loaded through a temporary object URL (always revoked
 * afterwards), converted to a payload, and, if that succeeds, replaces the
 * camera/sample selection. Load errors are written to the websocket log.
 *
 * @param {File} [file] - File picked by the user; ignored when missing.
 * @returns {Promise<void>}
 */
async function selectFileImage(file) {
  if (!file) return;
  const temporaryUrl = URL.createObjectURL(file);
  try {
    const picture = await loadImage(temporaryUrl);
    const payload = mediaToPayload(picture);
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `upload error: ${err.message || err}`);
  } finally {
    URL.revokeObjectURL(temporaryUrl);
  }
}
/**
 * Use one of the bundled sample images as the pending image and mark its
 * thumbnail as selected.
 *
 * @param {string} src - Sample image URL.
 * @param {HTMLElement} [buttonEl] - Thumbnail button to highlight.
 * @returns {Promise<void>} Load errors are written to the websocket log.
 */
async function selectSampleImage(src, buttonEl) {
  try {
    const picture = await loadImage(src);
    const payload = mediaToPayload(picture);
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      if (buttonEl) {
        buttonEl.classList.add("is-selected");
      }
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `sample error: ${err.message || err}`);
  }
}
/** Remove the "selected" highlight from every sample thumbnail. */
function clearSampleSelection() {
  const selected = els.cameraSamples.querySelectorAll(
    ".camera-drawer__sample.is-selected",
  );
  selected.forEach((thumb) => {
    thumb.classList.remove("is-selected");
  });
}
/**
 * Build the sample-image thumbnail buttons under the camera preview. Runs
 * only once; later calls return immediately.
 */
function renderSampleThumbnails() {
  if (state.samplesRendered) return;
  state.samplesRendered = true;

  const strip = els.cameraSamples;
  strip.innerHTML = "";
  SAMPLE_IMAGES.forEach(({ src, label }) => {
    const thumb = document.createElement("img");
    thumb.src = src;
    thumb.alt = label;

    const button = document.createElement("button");
    button.type = "button";
    button.className = "camera-drawer__sample";
    button.title = label;
    button.appendChild(thumb);
    button.addEventListener("click", () => selectSampleImage(src, button));

    strip.appendChild(button);
  });
}
/**
 * Return the camera drawer to its idle state: stop the camera, drop the
 * pending image and its preview, clear the sample selection, and refresh
 * the submit button.
 */
function resetCameraInput() {
  stopCameraStream();
  Object.assign(state, { pendingImage: null });
  clearSampleSelection();
  const photo = els.cameraPhoto;
  photo.removeAttribute("src");
  setPreviewMode("idle");
  setCameraButtonEnabled();
}
/**
 * Send an image (with accompanying text) to the engine as an `input.image`
 * message and mirror it locally as a user bubble.
 *
 * @param {{dataUrl: string, mime: string, width: number, height: number}|null} payload
 *   Image to send.
 * @param {string} [text] - Accompanying text; defaults to `CAMERA_DONE_TEXT`.
 * @returns {boolean} false when there is no payload or no open socket,
 *   true once the message has been sent.
 */
function sendImage(payload, text) {
  const socket = state.ws;
  const canSend =
    Boolean(payload) && Boolean(socket) && socket.readyState === WebSocket.OPEN;
  if (!canSend) return false;

  const caption = text || CAMERA_DONE_TEXT;
  const { dataUrl, mime, width, height } = payload;
  wsSend(
    JSON.stringify({
      type: "input.image",
      image: dataUrl,
      mime_type: mime,
      width,
      height,
      text: caption,
      interrupt: true,
    }),
  );

  // Same treatment as typed text: cut off any bot audio still playing, and
  // show the image and its caption as a single local bubble because the
  // engine does not echo image input back as a transcript event.
  stopPlaybackQueue();
  state.currentAssistantBubble = null;
  addImageBubble("user", dataUrl, caption);
  return true;
}
/**
 * Submit the current camera step: send the chosen image with the
 * `CAMERA_DONE_TEXT` marker and, if sending succeeded, reset the camera
 * input.
 *
 * While the live camera is on, the current frame is captured and preferred;
 * if capture fails, the previously selected (uploaded / sample / captured)
 * image is used instead. Does nothing when no image is available.
 */
function submitCameraImage() {
  const previouslyChosen = state.pendingImage;
  const image = state.cameraActive
    ? captureFromCamera() || previouslyChosen
    : previouslyChosen;
  if (!image) return;

  // The text stays the "【拍摄完成】" marker that advances the FastGPT camera
  // step; the image rides along as the multimodal attachment.
  const delivered = sendImage(image, CAMERA_DONE_TEXT);
  if (delivered) {
    resetCameraInput();
  }
}
/* --------------------------------------------------------- Chat updates */
/**
 * Render a committed user transcript as a user bubble and end the current
 * assistant bubble so the next reply starts a new one.
 *
 * @param {string} text - Final transcript text; empty values are ignored.
 */
function handleUserTranscript(text) {
  if (text) {
    state.currentAssistantBubble = null;
    addBubble("user", text);
  }
}
/**
 * Send typed text to the engine as an interrupting `input.text` message and
 * render it locally as a user bubble.
 *
 * @param {string} text - Text to send; trimmed first.
 * @returns {boolean} false when the text is empty or the socket is not
 *   open, true once the message has been sent.
 */
function sendText(text) {
  const trimmed = (text || "").trim();
  if (trimmed === "") return false;

  const socket = state.ws;
  if (!socket || socket.readyState !== WebSocket.OPEN) return false;

  // Typed text is not echoed back by the engine as a transcript event, so
  // the user bubble is drawn here. In-flight bot audio is stopped so the
  // next reply is heard cleanly. `currentAssistantBubble` is intentionally
  // left alone: the engine follows up with
  // `response.text.final(interrupted=true)` for the interrupted turn, which
  // finalizes that bubble in place, and `response.text.started` later opens
  // a fresh bubble for the reply.
  const outgoing = { type: "input.text", text: trimmed, interrupt: true };
  wsSend(JSON.stringify(outgoing));
  stopPlaybackQueue();
  addBubble("user", trimmed);
  return true;
}
/**
 * Append a streamed text delta to the current assistant bubble, creating
 * the bubble first if none is open.
 *
 * @param {string} text - Delta text; empty values are ignored.
 */
function handleAssistantDelta(text) {
  if (!text) return;
  state.currentAssistantBubble ??= addBubble("assistant", "");
  appendToBubble(state.currentAssistantBubble, text);
}
/**
 * Mark the start of a new assistant reply: drop the reference to the
 * current bubble so the next delta opens a new one.
 */
function handleAssistantStarted() {
  Object.assign(state, { currentAssistantBubble: null });
}
/**
 * Finalize an assistant reply.
 *
 * With non-empty text, the open bubble's text is replaced by the final text
 * (or a new bubble is created), the bubble is flagged when the reply was
 * interrupted, and the camera question is updated from the text. In every
 * case the current-bubble reference is cleared afterwards.
 *
 * @param {string} text - Final reply text; may be empty.
 * @param {boolean} interrupted - Whether the reply was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  if (text) {
    const openBubble = state.currentAssistantBubble;
    if (openBubble) {
      openBubble.querySelector(".bubble__text").textContent = text;
    } else {
      state.currentAssistantBubble = addBubble("assistant", text);
    }
    if (interrupted) {
      state.currentAssistantBubble.classList.add("bubble--interrupted");
    }
    updateCameraQuestion(text);
    state.currentAssistantBubble = null;
    scrollChatToBottom();
    return;
  }
  state.currentAssistantBubble = null;
}
/**
 * Close the current assistant bubble so later deltas start a new bubble.
 */
function finalizeAssistantBubble() {
  Object.assign(state, { currentAssistantBubble: null });
}
/* ---------------------------------------------------------- Websocket IO */
/**
 * Decode a base64 string into 16-bit samples.
 *
 * The decoded bytes are viewed in place as an Int16Array using the
 * platform's byte order.
 *
 * @param {string} b64 - Base64-encoded bytes.
 * @returns {Int16Array} View over the decoded bytes.
 */
function decodeBase64ToInt16(b64) {
  const bytes = Uint8Array.from(atob(b64), (ch) => ch.charCodeAt(0));
  return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2);
}
/**
 * Handlers for parsed server events, keyed by `event.type`.
 *
 * Every entry is an arrow wrapper so the target function is resolved when the
 * event fires, not when this table is built.
 */
const WS_EVENT_HANDLERS = new Map([
  [
    "response.audio.delta",
    (event) => {
      // Only string payloads are decoded and queued; anything else is dropped.
      if (typeof event.audio === "string") {
        schedulePlayback(decodeBase64ToInt16(event.audio));
      }
    },
  ],
  ["response.audio.started", () => setBotIndicator(true)],
  // The indicator turns off automatically when the playback queue drains.
  ["response.audio.stopped", () => finalizeAssistantBubble()],
  ["response.text.delta", (event) => handleAssistantDelta(event.text)],
  ["response.text.started", () => handleAssistantStarted()],
  [
    "response.text.final",
    (event) => handleAssistantFinal(event.text, event.interrupted),
  ],
  ["response.state", (event) => setAssistantState(event.state)],
  ["input.transcript.final", (event) => handleUserTranscript(event.text)],
  // Partial ASR updates are ignored; chat history renders committed turns.
  ["input.transcript.interim", () => {}],
  // Reserved for future structured messages; ignored silently.
  ["transport.message", () => {}],
]);

/**
 * Route one parsed server event to its handler. Event types without a handler
 * are logged with `console.debug` for debugging.
 *
 * @param {{type: string}} event - Parsed JSON message from the websocket.
 */
function handleEvent(event) {
  const handler = WS_EVENT_HANDLERS.get(event.type);
  if (handler) {
    handler(event);
    return;
  }
  console.debug("ws event", event);
}
/**
 * Open the product websocket and wire up the session lifecycle.
 *
 * Flow:
 *  1. Resolve the chat id (typed by the user or freshly generated) and the
 *     websocket URL; bail out with an error status when no URL is available.
 *  2. Flip the UI into the "connecting" state.
 *  3. Initialize the audio context, then construct the socket. A failure in
 *     either step rolls the connecting state back and is logged.
 *  4. Register `open` / `message` / `error` / `close` listeners:
 *     - `open` sends `session.start` and enables the controls;
 *     - `message` parses JSON events or plays raw binary PCM16 frames;
 *     - `close` resets session state and reports how the session ended.
 *
 * Does nothing when a session is already connected or connecting.
 *
 * @returns {Promise<void>} Resolves once the socket has been constructed and
 *   its listeners attached (not when it opens), or after an early failure has
 *   been reported.
 */
async function connect() {
  if (state.connected || state.connecting) return;

  const inputChatId = currentChatIdInput();
  const chatId = inputChatId || generateChatId();
  const url = wsUrlWithChatId(chatId);
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }

  state.connecting = true;
  state.chatId = chatId;
  els.chatId.value = chatId;
  setStatus("connecting", "Connecting…");
  setConnectButton();
  addWsLog("system", `connecting ${url}`);

  // Shared rollback for failures that happen before a socket exists. A chat
  // id that was generated here (rather than typed by the user) is removed
  // from the input again.
  const abortConnect = (statusText, logPrefix, err) => {
    state.connecting = false;
    state.chatId = "";
    if (!inputChatId) els.chatId.value = "";
    setStatus("error", statusText);
    setConnectButton();
    addWsLog("error", `${logPrefix}: ${err.message || err}`, "error");
  };

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    abortConnect("Audio init failed", "audio init failed", err);
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    abortConnect("Bad URL", "bad websocket URL", err);
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", () => {
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
    };
    startMessage.chatId = state.chatId;
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "websocket open");
    setStatus("connected", "Connected");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    refreshMicDevices();
    wsSend(JSON.stringify(startMessage));
    addBubble("system", "Session started.");
    setComposerEnabled(true);
    setCameraButtonEnabled();
    els.textInput.focus();
  });

  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // PCM16 samples are two bytes each. `new Int16Array(buffer)` throws a
      // RangeError for an odd byte length, so size the view explicitly and
      // drop a trailing odd byte instead of throwing inside the listener.
      const sampleCount = Math.floor(data.byteLength / 2);
      schedulePlayback(new Int16Array(data, 0, sampleCount));
    }
  });

  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "Connection error");
    addWsLog("error", "websocket error", "error");
  });

  ws.addEventListener("close", (event) => {
    // Remember whether the session ever opened: a close before `open` is
    // reported as a connection failure rather than a normal session end.
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    state.chatId = "";
    setAssistantState("");
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setCameraButtonEnabled();
    setBotIndicator(false);
    finalizeWsLogGroup();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      // Separate the reason from the sentence so it does not read as
      // "Session endedclient_disconnect.".
      addBubble(
        "system",
        `Session ended${event.reason ? `: ${event.reason}` : ""}.`,
      );
      setStatus("idle", "Disconnected");
    } else {
      setStatus("error", "Connection closed");
    }
  });
}
/**
 * Close the current websocket, if there is one.
 *
 * While the socket is open, a `session.stop` message is sent first. Both the
 * stop message and the close call are best-effort: whatever they throw is
 * ignored.
 */
function disconnect() {
  if (!state.ws) return;

  // Run one step and deliberately discard any error it throws.
  const bestEffort = (step) => {
    try {
      step();
    } catch {
      /* ignore */
    }
  };

  bestEffort(() => {
    if (state.ws.readyState !== WebSocket.OPEN) return;
    wsSend(
      JSON.stringify({ type: "session.stop", reason: "client_disconnect" }),
    );
  });
  bestEffort(() => {
    state.ws.close(1000, "client_disconnect");
  });
}
/* ---------------------------------------------------------------- Wiring */
// The connect button toggles the session: it disconnects while connected and
// starts a connection otherwise.
els.connectBtn.addEventListener("click", () => {
  if (!state.connected) {
    connect();
    return;
  }
  disconnect();
});

els.copyChatIdBtn.addEventListener("click", copyChatId);
// Toggle microphone capture. The button stays disabled for the duration of
// the toggle and is re-enabled afterwards only while still connected.
els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;

  els.micBtn.disabled = true;
  try {
    if (!state.micEnabled) {
      await startMic();
    } else {
      stopMic();
    }
  } catch (error) {
    console.error("Mic error", error);
    addBubble("system", `Mic error: ${error.message || error}`);
  } finally {
    els.micBtn.disabled = !state.connected;
  }
});
// Remember the chosen microphone. If capture is running, restart it on the
// newly selected device, with the mic controls disabled during the restart.
els.micSelect.addEventListener("change", async () => {
  state.selectedMicDeviceId = els.micSelect.value;

  if (state.micEnabled) {
    els.micSelect.disabled = true;
    els.micBtn.disabled = true;
    try {
      stopMic();
      await startMic();
    } catch (error) {
      console.error("Mic switch error", error);
      addBubble("system", `Mic switch error: ${error.message || error}`);
    } finally {
      setMicButton();
      setMicSelectEnabled();
    }
  }
});

// Refresh the device list on `devicechange`, where the browser exposes
// `navigator.mediaDevices` with event support.
const { mediaDevices } = navigator;
if (mediaDevices?.addEventListener) {
  mediaDevices.addEventListener("devicechange", refreshMicDevices);
}
// Clear buttons. The handlers are wrapped so the click event is not passed
// through as an argument.
els.clearBtn.addEventListener("click", () => {
  clearChat();
});

els.clearWsLogBtn.addEventListener("click", () => {
  clearWsLog();
});

// Submit the captured or selected image, but only while a camera state is set.
els.cameraDoneBtn.addEventListener("click", () => {
  if (state.cameraState) {
    submitCameraImage();
  }
});

els.cameraStartBtn.addEventListener("click", () => {
  startCamera();
});

// Swap between the "environment" and "user" facing modes; restart the camera
// when it is currently active.
els.cameraFlipBtn.addEventListener("click", () => {
  const wasEnvironment = state.cameraFacing === "environment";
  state.cameraFacing = wasEnvironment ? "user" : "environment";
  if (state.cameraActive) {
    startCamera();
  }
});

// Hand the first chosen file to the image selector, then clear the input's
// value (presumably so picking the same file again still fires `change`).
els.cameraUpload.addEventListener("change", (event) => {
  const input = event.target;
  const chosen = input.files && input.files[0];
  selectFileImage(chosen);
  input.value = "";
});
/**
 * Resize the composer textarea to fit its content, capped at 180px.
 *
 * The height is set to `auto` before `scrollHeight` is read, and the capped
 * value is then written back as the explicit height.
 */
function autosizeTextarea() {
  const maxHeightPx = 180;
  const field = els.textInput;
  const { style } = field;
  style.height = "auto";
  const fitted = Math.min(field.scrollHeight, maxHeightPx);
  style.height = `${fitted}px`;
}
/**
 * Submit the composer's current text. When `sendText` accepts it, the
 * textarea is cleared and resized and the composer's enabled state is
 * refreshed; otherwise the input is left untouched.
 */
function submitText() {
  const wasSent = sendText(els.textInput.value);
  if (wasSent) {
    els.textInput.value = "";
    autosizeTextarea();
    setComposerEnabled(state.connected);
  }
}
// Submit through the shared text path instead of a native form submission.
els.composer.addEventListener("submit", (event) => {
  event.preventDefault();
  submitText();
});

// Keep the textarea height and the composer's enabled state in sync with
// what has been typed.
els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Plain Enter submits. Shift+Enter and Enter during IME composition fall
// through to the default behavior.
els.textInput.addEventListener("keydown", (event) => {
  if (event.key !== "Enter" || event.shiftKey || event.isComposing) return;
  event.preventDefault();
  submitText();
});
// Release the camera, websocket and audio context when the page unloads.
window.addEventListener("beforeunload", () => {
  stopCameraStream();

  // Best-effort close: skip missing resources and ignore synchronous errors.
  const closeQuietly = (resource) => {
    if (!resource) return;
    try {
      resource.close();
    } catch {
      /* ignore */
    }
  };

  closeQuietly(state.ws);
  closeQuietly(state.audioContext);
});
// Initial page state: pre-fill the websocket URL field with the default
// endpoint, show the "Disconnected" idle status, refresh the connect and mic
// controls, and disable the text composer until a session opens.
els.url.value = defaultWsUrl();
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);