Files
ZNJJ-api-server/static/voice-demo/app.js
2026-06-03 12:36:18 +08:00

1545 lines
44 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Minimal browser client for the AI VideoAssistant engine's product
* websocket (`/ws-product`, protocol `va.ws.v1`).
*
* Responsibilities:
* - Open/close the websocket and run the session handshake.
* - List/select microphones and capture mic audio with browser AEC enabled.
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
* as binary websocket messages.
* - Play `response.audio.delta` frames gaplessly through Web Audio.
* - Render a chat-style history of user transcripts and bot text deltas.
* - Collapse high-frequency audio frames into expandable websocket log groups.
*/
// Sample rate (Hz) requested from the recorder worklet and used for the
// playback buffers created for bot audio.
const SAMPLE_RATE = 16000;
// Channel count of the playback buffers (mono).
const CHANNELS = 1;
// Frame duration in milliseconds handed to the recorder worklet as `frameMs`.
const FRAME_MS = 20;
// Websocket protocol identifier (see the file header comment).
const PROTOCOL = "va.ws.v1";
// Maximum number of top-level rows kept in the websocket log; older rows are
// removed by trimWsLog().
const MAX_WS_LOG_LINES = 120;
// Maximum number of child rows rendered inside one expanded log group.
const MAX_GROUP_CHILDREN_RENDER = 100;
// Keys identifying the collapsible log groups used for high-frequency
// messages (received audio/text deltas and sent binary audio frames).
const WS_LOG_GROUP_KEYS = {
AUDIO_DELTA: "recv:response.audio.delta",
TEXT_DELTA: "recv:response.text.delta",
AUDIO_SEND: "send:input.audio",
};
// Text sent together with a submitted camera image (see sendImage() and
// submitCameraImage()).
const CAMERA_DONE_TEXT = "【拍摄完成】";
// Sample images shown as thumbnails under the camera preview. Same-origin files
// so they can be drawn to a canvas (for base64 + dimensions) without tainting.
const SAMPLE_IMAGES = [
{ src: "./samples/damage1.png", label: "车辆前部" },
{ src: "./samples/damage2.png", label: "车辆后部" },
{ src: "./samples/plate1.jpg", label: "车牌 1" },
{ src: "./samples/plate2.jpg", label: "车牌 2" },
{ src: "./samples/user1.jpg", label: "人物 1" },
{ src: "./samples/user2.jpg", label: "人物 2" },
];
// Cap the longer edge before JPEG-encoding so payloads stay small.
const IMAGE_MAX_DIM = 1280;
// Quality argument passed to canvas.toDataURL("image/jpeg", …).
const IMAGE_JPEG_QUALITY = 0.85;
// Assistant state values that open the camera drawer, mapped to the prompt
// shown in it. syncCameraDrawer() looks the current assistant state up here;
// a state without an entry closes the drawer.
const CAMERA_STATE_PROMPTS = {
2000: "请对准车辆碰撞部位拍摄照片。",
2001: "请对准车辆碰撞部位拍摄照片。",
2002: "请对准被撞物品拍摄照片。",
2003: "请切换摄像头对准本人拍摄一张正面照片。",
2010: "请对准第一辆车碰撞部位拍摄。",
2011: "请对准第一辆车碰撞部位拍摄。",
2012: "请对准第二辆车碰撞部位拍摄。",
2013: "请对准第二方车辆侧后方,看清车牌拍摄。",
2014: "请拍摄另一方驾驶人的正面照片。",
2015: "请切换前置摄像头对准本人拍摄一张正面照片。",
};
/**
 * Build the default product-websocket URL for the page's own host.
 *
 * @returns {string} `wss://<host>/ws-product` when the page is served over
 *   HTTPS, otherwise `ws://<host>/ws-product`.
 */
function defaultWsUrl() {
  let scheme = "ws:";
  if (location.protocol === "https:") {
    scheme = "wss:";
  }
  const { host } = location;
  return `${scheme}//${host}/ws-product`;
}
// References to the DOM elements this client reads and updates, resolved once
// when the script is evaluated. All are looked up by id except `micLabel`,
// which is resolved by class selector.
const els = {
url: document.getElementById("ws-url"),
chatId: document.getElementById("chat-id"),
copyChatIdBtn: document.getElementById("copy-chat-id-btn"),
connectBtn: document.getElementById("connect-btn"),
statusDot: document.getElementById("status-dot"),
statusText: document.getElementById("status-text"),
conversation: document.getElementById("conversation"),
chatLog: document.getElementById("chat-log"),
micBtn: document.getElementById("mic-btn"),
micSelect: document.getElementById("mic-select"),
micLabel: document.querySelector(".mic-btn__label"),
micIndicator: document.getElementById("mic-indicator"),
botIndicator: document.getElementById("bot-indicator"),
stateIndicator: document.getElementById("state-indicator"),
stateLabel: document.getElementById("state-label"),
cameraDrawer: document.getElementById("camera-drawer"),
cameraState: document.getElementById("camera-state"),
cameraQuestion: document.getElementById("camera-question"),
cameraDoneBtn: document.getElementById("camera-done-btn"),
cameraPreview: document.getElementById("camera-preview"),
cameraVideo: document.getElementById("camera-video"),
cameraPhoto: document.getElementById("camera-photo"),
cameraCanvas: document.getElementById("camera-canvas"),
cameraStartBtn: document.getElementById("camera-start-btn"),
cameraDeviceRow: document.getElementById("camera-device-row"),
cameraDeviceSelect: document.getElementById("camera-device-select"),
cameraUpload: document.getElementById("camera-upload"),
cameraSamples: document.getElementById("camera-samples"),
clearBtn: document.getElementById("clear-btn"),
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
wsLog: document.getElementById("ws-log"),
meterFill: document.getElementById("meter-fill"),
composer: document.getElementById("composer"),
textInput: document.getElementById("text-input"),
sendBtn: document.getElementById("send-btn"),
};
/**
 * Create a fresh chat id of the form `voice_<suffix>`.
 *
 * Uses the first 16 hex characters of a random UUID when
 * `crypto.randomUUID` is available; otherwise falls back to the current
 * timestamp plus a `Math.random()` fragment, both in base 36.
 *
 * @returns {string} The generated chat id.
 */
function generateChatId() {
  const canUseUuid = typeof crypto !== "undefined" && crypto.randomUUID;
  if (!canUseUuid) {
    const timePart = Date.now().toString(36);
    const randomPart = Math.random().toString(36).slice(2, 10);
    return `voice_${timePart}${randomPart}`;
  }
  const compactUuid = crypto.randomUUID().replaceAll("-", "");
  return `voice_${compactUuid.slice(0, 16)}`;
}
/**
 * Read the chat id typed into the chat-id field.
 *
 * @returns {string} The field value with surrounding whitespace removed, or
 *   an empty string when the field has no value.
 */
function currentChatIdInput() {
  const raw = els.chatId.value || "";
  return raw.trim();
}
/**
 * Return the websocket URL from the URL field with a `chatId` query
 * parameter set.
 *
 * @param {string} chatId - Chat id to attach; when falsy the trimmed URL is
 *   returned unchanged.
 * @returns {string} The URL including `chatId`, or the trimmed field value
 *   when either the URL or the chat id is empty.
 */
function wsUrlWithChatId(chatId) {
  const typedUrl = (els.url.value || "").trim();
  const nothingToAdd = !typedUrl || !chatId;
  if (nothingToAdd) {
    return typedUrl;
  }
  try {
    // Resolve against the page URL so relative inputs work as well.
    const parsed = new URL(typedUrl, location.href);
    parsed.searchParams.set("chatId", chatId);
    return parsed.href;
  } catch (_) {
    // Unparseable input: append the parameter by hand.
    const joiner = typedUrl.includes("?") ? "&" : "?";
    return `${typedUrl}${joiner}chatId=${encodeURIComponent(chatId)}`;
  }
}
// Mutable client state shared by every function in this file: websocket
// connection, microphone capture nodes, playback scheduling, chat rendering,
// camera/image input, the VU meter and the websocket log grouping.
const state = {
ws: null,
connected: false,
connecting: false,
chatId: "",
audioContext: null,
micStream: null,
micSourceNode: null,
recorderNode: null,
micEnabled: false,
micDevices: [],
selectedMicDeviceId: "",
// Output scheduling.
nextPlaybackTime: 0,
playbackEndsAt: 0,
scheduledSources: [],
botActive: false,
botUiTimer: null,
// Chat state.
currentAssistantBubble: null,
assistantState: "",
cameraState: "",
// Camera / image input.
cameraStream: null,
cameraActive: false,
cameraFacing: "environment",
videoDevices: [],
pendingImage: null,
samplesRendered: false,
// VU meter smoothing.
meterLevel: 0,
// Collapsible websocket log groups for high-frequency audio frames.
wsLogGroup: null,
};
/* ------------------------------------------------------------------ UI */
/**
 * Update the connection status dot and its text.
 *
 * @param {string} kind - Suffix for the `status__dot--<kind>` modifier class.
 * @param {string} text - Text shown next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  statusDot.className = `status__dot status__dot--${kind}`;
  statusText.textContent = text;
}
/**
 * Sync the connect button, the chat-id field and the copy button with the
 * current connection state (`state.connecting` / `state.connected`).
 *
 * The connect button is disabled only while connecting, and carries the
 * `is-disconnect` class only while connected.
 */
function setConnectButton() {
  const isConnecting = Boolean(state.connecting);
  const isConnected = Boolean(state.connected);

  els.chatId.disabled = state.connected || state.connecting;
  els.copyChatIdBtn.disabled = !state.connected || !state.chatId;

  let caption = "连接";
  if (isConnecting) {
    caption = "连接中…";
  } else if (isConnected) {
    caption = "断开连接";
  }

  const button = els.connectBtn;
  button.textContent = caption;
  button.disabled = isConnecting;
  button.classList.toggle("is-disconnect", !isConnecting && isConnected);
}
/**
 * Copy the active chat id to the clipboard and flash the copy button.
 *
 * Tries the async Clipboard API first. If that fails, falls back to selecting
 * the chat-id field and running `document.execCommand("copy")`; the field is
 * temporarily enabled for that and always restored afterwards, even when the
 * fallback throws. The "copied" feedback is shown only when a copy actually
 * succeeded.
 *
 * Does nothing when not connected or when there is no chat id.
 *
 * @returns {Promise<void>}
 */
async function copyChatId() {
  if (!state.connected || !state.chatId) return;

  let copied = false;
  try {
    await navigator.clipboard.writeText(state.chatId);
    copied = true;
  } catch (_) {
    const input = els.chatId;
    const selectionStart = input.selectionStart;
    const selectionEnd = input.selectionEnd;
    const wasDisabled = input.disabled;
    // A disabled input cannot be selected, so enable it just for the copy.
    input.disabled = false;
    try {
      input.select();
      copied = Boolean(document.execCommand("copy"));
    } catch (_fallbackErr) {
      copied = false;
    } finally {
      try {
        input.setSelectionRange(selectionStart, selectionEnd);
      } catch (_rangeErr) {
        /* restoring the selection is best-effort */
      }
      input.disabled = wasDisabled;
    }
  }

  if (!copied) return;
  els.copyChatIdBtn.classList.add("copied");
  window.setTimeout(() => {
    els.copyChatIdBtn.classList.remove("copied");
  }, 1200);
}
/**
 * Sync the microphone button (enabled state, pressed state, caption) and the
 * mic indicator with `state.connected` and `state.micEnabled`.
 */
function setMicButton() {
  const micOn = state.micEnabled;
  const caption = micOn ? "关闭麦克风" : "开启麦克风";
  const button = els.micBtn;

  button.disabled = !state.connected;
  button.setAttribute("aria-pressed", micOn ? "true" : "false");
  button.title = caption;
  els.micLabel.textContent = caption;
  els.micIndicator.classList.toggle("is-active", micOn);
}
/**
 * Enable the microphone dropdown only while connected and when the browser
 * exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const selectable = state.connected && navigator.mediaDevices;
  els.micSelect.disabled = !selectable;
}
/**
 * Enable or disable the text composer, then refresh the camera submit button.
 *
 * The send button stays disabled while the input is blank, even when the
 * composer itself is enabled.
 *
 * @param {boolean} enabled - Whether the composer should accept input.
 */
function setComposerEnabled(enabled) {
  const { textInput, sendBtn } = els;
  textInput.disabled = !enabled;
  sendBtn.disabled = !(enabled && textInput.value.trim().length !== 0);
  setCameraButtonEnabled();
}
/**
 * Turn the bot indicator's `is-active` class on or off.
 *
 * @param {boolean} active - Whether the indicator should be shown as active.
 */
function setBotIndicator(active) {
  const indicator = els.botIndicator;
  indicator.classList.toggle("is-active", active);
}
/**
 * Record the assistant's state and reflect it in the state indicator.
 *
 * The full trimmed value is stored in `state.assistantState`, used for the
 * indicator tooltip and passed to `syncCameraDrawer()`. The visible label is
 * capped at 32 characters; a longer value is cut to 31 characters plus an
 * ellipsis so the truncation is visible to the user.
 *
 * @param {*} value - New state; anything that is not a string is treated as
 *   an empty state.
 */
function setAssistantState(value) {
  const text = typeof value === "string" ? value.trim() : "";
  const label = text.length > 32 ? `${text.slice(0, 31)}…` : text;
  state.assistantState = text;
  els.stateIndicator.classList.toggle("is-active", Boolean(text));
  els.stateLabel.textContent = label ? `状态 ${label}` : "状态 -";
  els.stateIndicator.title = label ? `助手状态:${text}` : "助手状态";
  syncCameraDrawer(text);
}
/**
 * Enable the camera "done" button only when an image can actually be
 * submitted: the websocket is open, a camera state is active, and there is
 * either a live camera or a pending image.
 *
 * Does nothing when the button element is missing.
 */
function setCameraButtonEnabled() {
  const button = els.cameraDoneBtn;
  if (!button) return;

  const socket = state.ws;
  const wsReady =
    state.connected && socket && socket.readyState === WebSocket.OPEN;
  const hasImageSource = state.cameraActive || Boolean(state.pendingImage);
  const canSubmit = wsReady && state.cameraState && hasImageSource;
  button.disabled = !canSubmit;
}
/**
 * Open or close the camera drawer for the given assistant state.
 *
 * The drawer opens when `value` has its own entry in `CAMERA_STATE_PROMPTS`.
 * The lookup is restricted to own properties so that a state named like an
 * inherited object member (for example "toString" or "constructor") cannot
 * open the drawer.
 *
 * When opening: shows the state and its prompt, renders the sample
 * thumbnails and pre-selects a default image. When closing a drawer that was
 * open: resets the camera input. In both cases the submit button is
 * refreshed.
 *
 * @param {string} value - Current assistant state.
 */
function syncCameraDrawer(value) {
  const isCameraState = Object.prototype.hasOwnProperty.call(
    CAMERA_STATE_PROMPTS,
    value,
  );
  const prompt = isCameraState ? CAMERA_STATE_PROMPTS[value] : undefined;
  const open = Boolean(prompt);
  const wasOpen = Boolean(state.cameraState);
  state.cameraState = open ? value : "";
  els.cameraDrawer.classList.toggle("is-open", open);
  els.conversation.classList.toggle("has-camera", open);
  els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
  if (open) {
    els.cameraState.textContent = `状态 ${value}`;
    els.cameraQuestion.textContent = prompt;
    renderSampleThumbnails();
    selectDefaultImage();
  } else {
    els.cameraState.textContent = "状态 -";
    els.cameraQuestion.textContent = "";
    // Only tear the camera down on an actual open -> closed transition.
    if (wasOpen) resetCameraInput();
  }
  setCameraButtonEnabled();
}
/**
 * Append a text bubble to the chat log and scroll it into view.
 *
 * If the chat log currently shows the "empty" placeholder it is cleared
 * first. Bubbles for any role other than "system" get a role tag ("你" for
 * the user, "助手" otherwise).
 *
 * @param {string} role - Bubble role, used for the `bubble--<role>` class.
 * @param {string} text - Initial bubble text.
 * @returns {HTMLElement} The created bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const makeSpan = (className, content) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = content;
    return span;
  };

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    bubble.appendChild(makeSpan("bubble__role", role === "user" ? "你" : "助手"));
  }
  bubble.appendChild(makeSpan("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append a chat bubble that shows an image followed by (optional) text, then
 * scroll it into view.
 *
 * Clears the "empty" placeholder first if it is present. Bubbles for any role
 * other than "system" get a role tag. The image's alt text is the given text,
 * or "image" when no text is supplied.
 *
 * @param {string} role - Bubble role, used for the `bubble--<role>` class.
 * @param {string} imageUrl - Value assigned to the image's `src`.
 * @param {string} [text] - Caption shown under the image.
 * @returns {HTMLElement} The created bubble element.
 */
function addImageBubble(role, imageUrl, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;

  if (role !== "system") {
    const roleTag = document.createElement("span");
    roleTag.className = "bubble__role";
    roleTag.textContent = role === "user" ? "你" : "助手";
    bubble.appendChild(roleTag);
  }

  const picture = document.createElement("img");
  picture.className = "bubble__image";
  picture.src = imageUrl;
  picture.alt = text || "image";
  bubble.appendChild(picture);

  const caption = document.createElement("span");
  caption.className = "bubble__text";
  caption.textContent = text || "";
  bubble.appendChild(caption);

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
/**
 * Append text to a bubble's text element and keep the chat scrolled down.
 *
 * @param {HTMLElement} bubble - Bubble containing a `.bubble__text` element.
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const textEl = bubble.querySelector(".bubble__text");
  textEl.textContent = textEl.textContent + text;
  scrollChatToBottom();
}
/** Scroll the chat log so its newest content is visible. */
function scrollChatToBottom() {
  const { chatLog } = els;
  chatLog.scrollTop = chatLog.scrollHeight;
}
/**
 * Remove every chat bubble, forget the in-progress assistant bubble, reset
 * the assistant state and show the "conversation cleared" placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;
  setAssistantState("");

  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>对话已清空。</p>";
  log.appendChild(placeholder);
}
/**
 * Shorten a value for display in the websocket log.
 *
 * The value is converted to a string. Strings longer than `maxLength` are cut
 * and terminated with an ellipsis so that the result is exactly `maxLength`
 * characters long and the truncation is visible in the log.
 *
 * @param {*} value - Value to display.
 * @param {number} [maxLength=160] - Maximum length of the returned string.
 * @returns {string} The original text, or its truncated form ending in "…".
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  // Reserve one character for the ellipsis; never pass a negative end index.
  const keep = Math.max(0, maxLength - 1);
  return `${text.slice(0, keep)}…`;
}
/**
 * Format a timestamp for the websocket log as a 24-hour `HH:MM:SS`-style
 * time in the browser's default locale.
 *
 * @param {Date} [date] - Time to format; defaults to now.
 * @returns {string} The formatted time.
 */
function formatLogTime(date = new Date()) {
  const twoDigit = "2-digit";
  const options = {
    hour12: false,
    hour: twoDigit,
    minute: twoDigit,
    second: twoDigit,
  };
  return date.toLocaleTimeString([], options);
}
/**
 * Format a byte count for the websocket log.
 *
 * @param {number} byteCount - Number of bytes.
 * @returns {string} `"<n> bytes"` below 1024, `"<n.n> KB"` below 1 MiB, and
 *   `"<n.nn> MB"` from 1 MiB upwards.
 */
function formatLogBytes(byteCount) {
  // Largest unit first: [threshold in bytes, decimals, suffix].
  const units = [
    [1048576, 2, "MB"],
    [1024, 1, "KB"],
  ];
  for (const [size, decimals, suffix] of units) {
    if (byteCount >= size) {
      return `${(byteCount / size).toFixed(decimals)} ${suffix}`;
    }
  }
  return `${byteCount} bytes`;
}
/**
 * Map a websocket log group key to the label shown in the group summary.
 *
 * @param {string} groupKey - One of the `WS_LOG_GROUP_KEYS` values.
 * @returns {string} The label, or "grouped events" for an unknown key.
 */
function wsLogGroupLabel(groupKey) {
  switch (groupKey) {
    case WS_LOG_GROUP_KEYS.AUDIO_DELTA:
      return "response.audio.delta";
    case WS_LOG_GROUP_KEYS.TEXT_DELTA:
      return "response.text.delta";
    case WS_LOG_GROUP_KEYS.AUDIO_SEND:
      return "input.audio binary";
    default:
      return "grouped events";
  }
}
/** Remove the "no events" placeholder from the websocket log, if present. */
function ensureWsLogReady() {
  const log = els.wsLog;
  const placeholder = log.querySelector(".ws-log__empty");
  if (!placeholder) return;
  log.innerHTML = "";
}
/** Scroll the websocket log so its newest row is visible. */
function scrollWsLogToBottom() {
  const { wsLog } = els;
  wsLog.scrollTop = wsLog.scrollHeight;
}
/**
 * Drop the oldest websocket log rows until at most `MAX_WS_LOG_LINES` remain.
 *
 * If the row being removed is the element of the currently open log group,
 * the group reference is cleared so later items start a new group.
 */
function trimWsLog() {
  const log = els.wsLog;
  while (log.children.length > MAX_WS_LOG_LINES) {
    const oldest = log.firstElementChild;
    const activeGroup = state.wsLogGroup;
    if (activeGroup && activeGroup.element === oldest) {
      state.wsLogGroup = null;
    }
    oldest.remove();
  }
}
/**
 * Close the current websocket log group by dropping the reference to it, so
 * the next grouped item starts a new group.
 */
function finalizeWsLogGroup() {
state.wsLogGroup = null;
}
/**
 * Build one websocket log row: time, direction badge and detail text.
 *
 * @param {string} direction - "send", "recv", or any other tag (shown
 *   upper-cased).
 * @param {string} detail - Row text.
 * @param {string} kind - Suffix for the `ws-log__entry--<kind>` class.
 * @param {string} [timeText] - Time label; defaults to the current time.
 * @returns {HTMLElement} The row element (not yet attached to the document).
 */
function createWsLogEntry(direction, detail, kind, timeText = formatLogTime()) {
  const makeCell = (className, content) => {
    const cell = document.createElement("span");
    cell.className = className;
    cell.textContent = content;
    return cell;
  };

  let badge;
  if (direction === "send") {
    badge = "SEND";
  } else if (direction === "recv") {
    badge = "RECV";
  } else {
    badge = direction.toUpperCase();
  }

  const entry = document.createElement("div");
  entry.className = `ws-log__entry ws-log__entry--${kind}`;
  entry.append(
    makeCell("ws-log__time", timeText),
    makeCell("ws-log__direction", badge),
    makeCell("ws-log__detail", detail),
  );
  return entry;
}
/**
 * Refresh a log group's summary line: label, item count and total size.
 *
 * @param {object} group - Log group as created by appendWsLogGroupItem().
 */
function updateWsLogGroupSummary(group) {
  const label = wsLogGroupLabel(group.key);
  const size = formatLogBytes(group.totalBytes);
  group.summaryEl.textContent = `${label} ×${group.count} (${size})`;
}
function appendWsLogGroupChildDom(group, item) {
const entry = createWsLogEntry(
group.direction,
item.detail,
group.kind,
item.time,
);
entry.classList.add("ws-log__entry--child");
group.childrenEl.appendChild(entry);
const childEntries = group.childrenEl.querySelectorAll(".ws-log__entry");
if (childEntries.length > MAX_GROUP_CHILDREN_RENDER) {
const omit = group.childrenEl.querySelector(".ws-log__group-omit");
if (!omit) {
const omitted = document.createElement("div");
omitted.className = "ws-log__group-omit";
omitted.textContent = "… earlier events omitted";
group.childrenEl.insertBefore(omitted, group.childrenEl.firstElementChild);
}
childEntries[0].remove();
}
}
/**
 * Rebuild the child rows of a log group from its stored items.
 *
 * Only the newest `MAX_GROUP_CHILDREN_RENDER` items are rendered; if older
 * items exist, a marker stating how many were omitted is shown first.
 *
 * @param {object} group - Log group with `items` and `childrenEl`.
 */
function renderWsLogGroupChildren(group) {
  const container = group.childrenEl;
  container.innerHTML = "";

  const { items } = group;
  const omittedCount = Math.max(0, items.length - MAX_GROUP_CHILDREN_RENDER);
  if (omittedCount > 0) {
    const marker = document.createElement("div");
    marker.className = "ws-log__group-omit";
    marker.textContent = `${omittedCount} earlier events omitted`;
    container.appendChild(marker);
  }

  let index = omittedCount;
  while (index < items.length) {
    appendWsLogGroupChildDom(group, items[index]);
    index += 1;
  }
}
/**
 * Expand or collapse a websocket log group.
 *
 * Child rows are not added to the DOM while a group is collapsed (see
 * appendWsLogGroupItem), so the rows are rebuilt from `group.items` on every
 * expand. Rebuilding only when the container was empty would leave out any
 * items logged while a previously expanded group was collapsed.
 *
 * @param {object} group - Log group to toggle.
 */
function toggleWsLogGroup(group) {
  group.collapsed = !group.collapsed;
  group.childrenEl.hidden = group.collapsed;
  group.chevronEl.textContent = group.collapsed ? "▶" : "▼";
  group.headerEl.setAttribute("aria-expanded", group.collapsed ? "false" : "true");
  if (!group.collapsed) {
    renderWsLogGroupChildren(group);
  }
}
/**
 * Record one high-frequency websocket message in a collapsible log group.
 *
 * Consecutive items with the same `groupKey` share one group; a different key
 * closes the current group and starts a new, collapsed one. The group summary
 * is updated for every item, while a child row is added to the DOM only when
 * the group is expanded.
 *
 * @param {string} groupKey - Group identity (a `WS_LOG_GROUP_KEYS` value).
 * @param {string} direction - "send" or anything else (shown as RECV).
 * @param {string} kind - Suffix for the `ws-log__group--<kind>` class.
 * @param {string} itemDetail - Text of the child row.
 * @param {number} [byteCount=0] - Size added to the group's byte total.
 */
function appendWsLogGroupItem(groupKey, direction, kind, itemDetail, byteCount = 0) {
  ensureWsLogReady();

  // Creates the DOM for a new collapsed group and makes it the current one.
  const openGroup = () => {
    finalizeWsLogGroup();

    const makeSpan = (className, text) => {
      const span = document.createElement("span");
      span.className = className;
      if (text !== undefined) span.textContent = text;
      return span;
    };

    const element = document.createElement("div");
    element.className = `ws-log__group ws-log__group--${kind}`;

    const headerEl = document.createElement("button");
    headerEl.type = "button";
    headerEl.className = "ws-log__group-header";
    headerEl.setAttribute("aria-expanded", "false");

    const timeEl = makeSpan("ws-log__time", formatLogTime());
    const directionEl = makeSpan(
      "ws-log__direction",
      direction === "send" ? "SEND" : "RECV",
    );
    const chevronEl = makeSpan("ws-log__group-chevron", "▶");
    chevronEl.setAttribute("aria-hidden", "true");
    const summaryEl = makeSpan("ws-log__group-summary");
    headerEl.append(timeEl, directionEl, chevronEl, summaryEl);

    const childrenEl = document.createElement("div");
    childrenEl.className = "ws-log__group-children";
    childrenEl.hidden = true;

    element.append(headerEl, childrenEl);
    els.wsLog.appendChild(element);

    const created = {
      key: groupKey,
      direction,
      kind,
      element,
      headerEl,
      chevronEl,
      summaryEl,
      childrenEl,
      collapsed: true,
      count: 0,
      totalBytes: 0,
      items: [],
    };
    state.wsLogGroup = created;
    headerEl.addEventListener("click", () => toggleWsLogGroup(created));
    return created;
  };

  const current = state.wsLogGroup;
  const group = current && current.key === groupKey ? current : openGroup();

  group.count += 1;
  group.totalBytes += byteCount;
  const item = { time: formatLogTime(), detail: itemDetail };
  group.items.push(item);
  updateWsLogGroupSummary(group);
  if (!group.collapsed) {
    appendWsLogGroupChildDom(group, item);
  }

  trimWsLog();
  scrollWsLogToBottom();
}
/**
 * Produce a short, log-friendly JSON string for a websocket payload.
 *
 * Works on a shallow copy: string `audio` / `image` fields are replaced by a
 * length placeholder, a string `data` field longer than 160 characters
 * likewise, and a string `text` field is truncated.
 *
 * @param {*} payload - Parsed message; non-objects are stringified as-is.
 * @returns {string} JSON text, or the payload's `type` (or a fixed notice)
 *   when it cannot be serialised.
 */
function compactWsPayload(payload) {
  const isObject = Boolean(payload) && typeof payload === "object";
  if (!isObject) return String(payload);

  const compact = { ...payload };
  for (const field of ["audio", "image"]) {
    const value = compact[field];
    if (typeof value === "string") {
      compact[field] = `<base64 ${value.length} chars>`;
    }
  }
  const { data, text } = compact;
  if (typeof data === "string" && data.length > 160) {
    compact.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    compact.text = truncateLogValue(text);
  }

  try {
    return JSON.stringify(compact);
  } catch (_) {
    return payload.type || "unserializable websocket payload";
  }
}
/**
 * Append a single (ungrouped) row to the websocket log.
 *
 * Closes any open log group first, then trims the log to its maximum size
 * and scrolls to the bottom.
 *
 * @param {string} direction - "send", "recv", or another tag such as "system".
 * @param {string} detail - Row text.
 * @param {string} [kind=direction] - Styling kind for the row.
 */
function addWsLog(direction, detail, kind = direction) {
  finalizeWsLogGroup();
  ensureWsLogReady();

  const entry = createWsLogEntry(direction, detail, kind);
  els.wsLog.appendChild(entry);

  trimWsLog();
  scrollWsLogToBottom();
}
/**
 * Log a parsed websocket message.
 *
 * Received `response.audio.delta` and `response.text.delta` messages are
 * folded into collapsible groups (with their sequence number when present);
 * everything else becomes a normal log row with a compacted payload.
 *
 * @param {string} direction - "send" or "recv".
 * @param {*} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  const receivedType = direction === "recv" ? payload?.type : undefined;

  if (receivedType === "response.audio.delta") {
    const bytes = payload.bytes || 0;
    const seqPrefix = payload.seq != null ? `seq=${payload.seq} ` : "";
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.AUDIO_DELTA,
      "recv",
      "recv",
      `${seqPrefix}(${bytes} bytes)`,
      bytes,
    );
    return;
  }

  if (receivedType === "response.text.delta") {
    const text = typeof payload.text === "string" ? payload.text : "";
    // Size is reported as the UTF-8 length of the delta text.
    const bytes = new TextEncoder().encode(text).length;
    const seqPrefix = payload.seq != null ? `seq=${payload.seq} ` : "";
    const quoted = JSON.stringify(truncateLogValue(text, 120));
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.TEXT_DELTA,
      "recv",
      "recv",
      `${seqPrefix}${quoted}`,
      bytes,
    );
    return;
  }

  addWsLog(direction, compactWsPayload(payload));
}
/**
 * Log a sent binary audio frame in the "input.audio" log group.
 *
 * @param {number} byteLength - Size of the frame in bytes.
 */
function logBinarySend(byteLength) {
  const detail = `(${byteLength} bytes)`;
  appendWsLogGroupItem(WS_LOG_GROUP_KEYS.AUDIO_SEND, "send", "send", detail, byteLength);
}
/**
 * Send data over the websocket and record it in the websocket log.
 *
 * String data is logged as parsed JSON when possible, otherwise as truncated
 * raw text. Binary data (ArrayBuffer or typed-array/DataView) is logged by
 * size; empty binary data is sent but not logged.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send.
 * @returns {boolean} `false` when there is no open websocket (nothing is
 *   sent), otherwise `true`.
 */
function wsSend(data) {
  const socket = state.ws;
  if (!socket || socket.readyState !== WebSocket.OPEN) {
    return false;
  }

  if (typeof data !== "string") {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const byteLength = isBinary ? data.byteLength : 0;
    if (byteLength > 0) {
      logBinarySend(byteLength);
    }
  } else {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      addWsLog("send", truncateLogValue(data));
    }
  }

  socket.send(data);
  return true;
}
/**
 * Empty the websocket log, forget the open log group and show the
 * "no events" placeholder.
 */
function clearWsLog() {
  const placeholder = '<div class="ws-log__empty">暂无 WebSocket 事件。</div>';
  state.wsLogGroup = null;
  els.wsLog.innerHTML = placeholder;
}
/* ---------------------------------------------------------------- Audio */
/**
 * Return the shared AudioContext, creating it and loading the PCM recorder
 * worklet module on first use, and resuming it if it is suspended.
 *
 * If loading the worklet module fails, the half-initialised context is
 * discarded (and closed, best-effort) instead of being cached; otherwise
 * every later call would skip the module load and hand out a context on
 * which the recorder node cannot be constructed.
 *
 * @returns {Promise<AudioContext>} The ready audio context.
 * @throws Whatever `audioWorklet.addModule()` or `resume()` rejects with.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    const created = new Ctx();
    // Published before the module load, as before, so playback can already
    // use the context while the worklet is loading.
    state.audioContext = created;
    try {
      await created.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      if (state.audioContext === created) {
        state.audioContext = null;
      }
      try {
        await created.close();
      } catch (_) {
        /* closing a failed context is best-effort */
      }
      throw err;
    }
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
/**
 * Rebuild the microphone dropdown from `state.micDevices`.
 *
 * The first option is always the default microphone. The previous selection
 * is kept when that device is still present; otherwise the selection falls
 * back to the default.
 */
function renderMicDevices() {
  const select = els.micSelect;
  // Read the selection before the options are cleared.
  const previousValue = state.selectedMicDeviceId || select.value;
  select.innerHTML = "";

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  addOption("", "默认麦克风");
  let stillPresent = false;
  state.micDevices.forEach((device, index) => {
    addOption(device.deviceId, device.label || `麦克风 ${index + 1}`);
    if (device.deviceId === previousValue) {
      stillPresent = true;
    }
  });

  state.selectedMicDeviceId = stillPresent ? previousValue : "";
  select.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
/**
 * Re-enumerate audio input devices and rebuild the microphone dropdown.
 *
 * When device enumeration is unavailable or fails, only the dropdown's
 * enabled state is refreshed (a failure is logged to the console).
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const canEnumerate = Boolean(navigator.mediaDevices?.enumerateDevices);
  if (!canEnumerate) {
    setMicSelectEnabled();
    return;
  }
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    const microphones = allDevices.filter(({ kind }) => kind === "audioinput");
    state.micDevices = microphones;
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
/**
 * Start microphone capture and stream PCM frames over the websocket.
 *
 * Requests the microphone with echo cancellation, noise suppression and auto
 * gain enabled (using the selected device when one is chosen), routes it into
 * the `pcm-recorder` worklet, and forwards every "frame" message from the
 * worklet to the VU meter and, while connected, to the websocket.
 *
 * If anything fails after the microphone was acquired, the stream and any
 * nodes created so far are released via stopMic() before the error is
 * rethrown, so a failed start never leaves the microphone open.
 *
 * @returns {Promise<void>}
 * @throws Whatever `getUserMedia()`, the audio context setup or node creation
 *   rejects/throws with.
 */
async function startMic() {
  const ctx = await ensureAudioContext();
  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }
  state.micStream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
    video: false,
  });
  try {
    // Device labels are only exposed once permission is granted.
    await refreshMicDevices();
    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };
    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // Release the already-acquired stream/nodes; micEnabled is still false
    // here, so stopMic() does not log a "mic closed" line.
    stopMic();
    throw err;
  }
  state.micEnabled = true;
  addWsLog("system", "麦克风已开启(PCM 音频流)");
  setMicButton();
}
/**
 * Stop microphone capture and release everything startMic() set up.
 *
 * Disconnects the recorder and source nodes, stops every track of the mic
 * stream (errors from these teardown calls are ignored), resets the VU meter
 * and refreshes the mic button. A "mic closed" log line is written only if
 * the microphone was actually enabled.
 */
function stopMic() {
  const wasEnabled = state.micEnabled;

  // Teardown is best-effort: a node or track that is already gone is fine.
  const quietly = (action) => {
    try {
      action();
    } catch (_) {
      /* ignore */
    }
  };

  const recorder = state.recorderNode;
  if (recorder) {
    quietly(() => {
      recorder.port.onmessage = null;
      recorder.disconnect();
    });
    state.recorderNode = null;
  }

  const source = state.micSourceNode;
  if (source) {
    quietly(() => source.disconnect());
    state.micSourceNode = null;
  }

  const stream = state.micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      quietly(() => track.stop());
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasEnabled) {
    addWsLog("system", "麦克风已关闭");
  }
  setMicButton();
}
/**
 * Update the VU meter from a frame's RMS level.
 *
 * The RMS is scaled by 2.4 and clamped to 1, then averaged 50/50 with the
 * previous meter level for smoothing; the result sets the meter width in
 * percent.
 *
 * @param {number} rms - RMS level of the latest audio frame.
 */
function updateMeter(rms) {
  // RMS ~0.3+ is loud speech, hence the gain before clamping.
  const target = Math.min(1, rms * 2.4);
  const smoothed = state.meterLevel * 0.5 + target * 0.5;
  state.meterLevel = smoothed;
  const percent = Math.round(smoothed * 100);
  els.meterFill.style.width = `${percent}%`;
}
/* ---------------------------------------------------- Bot audio playback */
/**
 * Queue a chunk of bot audio for gapless playback.
 *
 * Converts the PCM16 samples to float, wraps them in an AudioBuffer at
 * `SAMPLE_RATE`, and schedules it right after the previously scheduled chunk
 * (or slightly in the future if playback has fallen behind). Also turns the
 * bot indicator on and arms a timer that turns it off once the queue has
 * drained.
 *
 * Does nothing when there is no audio context yet, or when the chunk is
 * empty: `createBuffer()` rejects a zero-length buffer, so an empty frame
 * must be skipped rather than scheduled.
 *
 * @param {Int16Array} int16 - PCM16 samples to play.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  if (!int16 || int16.length === 0) return;
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Negative samples scale by 32768, positive by 32767, giving [-1, 1].
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);
  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);
  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);
  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    // Only switch the indicator off if no later chunk extended the queue.
    if (state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
/**
 * Immediately stop all scheduled bot audio.
 *
 * Every queued source is detached from its `onended` handler, stopped and
 * disconnected (errors are ignored), the playback clock is reset, the pending
 * indicator timer is cancelled and the bot indicator is turned off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];
  resetPlaybackClock();

  const timer = state.botUiTimer;
  if (timer) {
    clearTimeout(timer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
/**
 * Reset the playback schedule to "now" on the audio context's clock, so the
 * next chunk is not queued behind audio that is no longer playing.
 *
 * Does nothing when no audio context exists yet.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
/* ------------------------------------------------------ Camera / image */
/**
 * Switch the camera preview between live camera, still photo and idle.
 *
 * @param {"camera"|"photo"|"idle"} mode - Preview mode; any value other than
 *   "camera" or "photo" clears both mode classes.
 */
function setPreviewMode(mode) {
  const { classList } = els.cameraPreview;
  for (const candidate of ["camera", "photo"]) {
    classList.toggle(`is-${candidate}`, mode === candidate);
  }
}
/**
 * Draw an <img>/<video> source to the shared canvas and return a normalized
 * payload (JPEG data URL + dimensions) suitable for an `input.image` message.
 *
 * The longer edge is capped at `IMAGE_MAX_DIM`, preserving aspect ratio. Each
 * scaled dimension is kept at a minimum of 1 pixel: for extreme aspect
 * ratios the shorter edge would otherwise round to 0 and produce an empty
 * canvas.
 *
 * @param {HTMLImageElement|HTMLVideoElement|HTMLCanvasElement} source -
 *   Element to capture.
 * @returns {{dataUrl: string, mime: string, width: number, height: number}|null}
 *   The encoded image, or `null` when the source has no dimensions yet, no 2D
 *   context is available, or encoding fails (the latter two are written to
 *   the websocket log).
 */
function mediaToPayload(source) {
  const srcW = source.videoWidth || source.naturalWidth || source.width;
  const srcH = source.videoHeight || source.naturalHeight || source.height;
  if (!srcW || !srcH) return null;
  let w = srcW;
  let h = srcH;
  const longest = Math.max(w, h);
  if (longest > IMAGE_MAX_DIM) {
    const scale = IMAGE_MAX_DIM / longest;
    w = Math.max(1, Math.round(w * scale));
    h = Math.max(1, Math.round(h * scale));
  }
  const canvas = els.cameraCanvas;
  canvas.width = w;
  canvas.height = h;
  const ctx = canvas.getContext("2d");
  if (!ctx) {
    addWsLog("system", "图片编码失败:canvas 2d context unavailable");
    return null;
  }
  let dataUrl;
  try {
    ctx.drawImage(source, 0, 0, w, h);
    // Throws a SecurityError if the canvas was tainted by a cross-origin source.
    dataUrl = canvas.toDataURL("image/jpeg", IMAGE_JPEG_QUALITY);
  } catch (err) {
    addWsLog("system", `图片编码失败:${err.message || err}`);
    return null;
  }
  return { dataUrl, mime: "image/jpeg", width: w, height: h };
}
/**
 * Store the image waiting to be submitted and refresh the submit button.
 *
 * When a payload is given, it is also shown in the preview as a still photo.
 *
 * @param {{dataUrl: string}|null} payload - Image payload, or null to clear.
 */
function setPendingImage(payload) {
  state.pendingImage = payload;
  const hasImage = Boolean(payload);
  if (hasImage) {
    els.cameraPhoto.src = payload.dataUrl;
    setPreviewMode("photo");
  }
  setCameraButtonEnabled();
}
/**
 * Re-enumerate video input devices into `state.videoDevices`.
 *
 * If enumeration is unavailable or fails, the list is set to empty.
 *
 * @returns {Promise<void>}
 */
async function refreshVideoDevices() {
  let cameras;
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    cameras = allDevices.filter(({ kind }) => kind === "videoinput");
  } catch (_) {
    cameras = [];
  }
  state.videoDevices = cameras;
}
/**
 * Fill the camera dropdown from the enumerated video devices.
 *
 * Labels are only exposed after camera permission has been granted, so before
 * that generic names ("摄像头 1", …) are shown. With no devices at all the
 * dropdown shows a single default option and is disabled.
 *
 * @param {string} [activeDeviceId] - Device to select, when given.
 */
function populateDeviceSelect(activeDeviceId) {
  const select = els.cameraDeviceSelect;
  select.innerHTML = "";

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  const cameras = state.videoDevices;
  if (cameras.length === 0) {
    addOption("", "默认摄像头");
    select.disabled = true;
    return;
  }

  cameras.forEach((camera, position) => {
    addOption(camera.deviceId, camera.label || `摄像头 ${position + 1}`);
  });
  select.disabled = false;
  if (activeDeviceId) {
    select.value = activeDeviceId;
  }
}
/**
 * Start the live camera preview.
 *
 * Stops any running camera first, then requests either the given device or
 * the preferred facing mode (`state.cameraFacing`). On success the stream is
 * shown in the preview, any pending image and sample selection are cleared,
 * and the device dropdown is refreshed and revealed.
 *
 * The function awaits several times (permission prompt, `play()`, device
 * enumeration). After each await it re-checks that its own stream is still
 * the active one:
 * - a stream installed by an overlapping startCamera() call is stopped
 *   before this one is installed, so no camera track is left running
 *   without an owner;
 * - if the camera was stopped or replaced in the meantime, the remaining UI
 *   updates are skipped instead of dereferencing a cleared stream.
 *
 * Failures to access the camera are written to the websocket log.
 *
 * @param {string} [deviceId] - Exact camera device to open.
 * @returns {Promise<void>}
 */
async function startCamera(deviceId) {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    addWsLog("system", "该浏览器不支持摄像头访问");
    return;
  }
  stopCameraStream();
  const video = deviceId
    ? { deviceId: { exact: deviceId } }
    : { facingMode: state.cameraFacing };
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({
      video,
      audio: false,
    });
  } catch (err) {
    addWsLog("system", `摄像头错误:${err.message || err}`);
    return;
  }
  // Another startCamera() call may have installed a stream while we waited
  // for permission; release it before taking over.
  stopCameraStream();
  state.cameraStream = stream;
  els.cameraVideo.srcObject = stream;
  try {
    await els.cameraVideo.play();
  } catch (_) {
    /* autoplay may resolve later */
  }
  if (state.cameraStream !== stream) return;
  state.cameraActive = true;
  state.pendingImage = null;
  setPreviewMode("camera");
  els.cameraStartBtn.classList.add("is-active");
  clearSampleSelection();
  // Device labels become available only after permission is granted; refresh
  // the dropdown now and select whichever camera is actually streaming.
  await refreshVideoDevices();
  if (state.cameraStream !== stream) return;
  const activeId =
    stream.getVideoTracks?.()[0]?.getSettings?.().deviceId || deviceId;
  populateDeviceSelect(activeId);
  // Reveal the camera device dropdown only while the camera is in use.
  els.cameraDeviceRow.hidden = false;
  setCameraButtonEnabled();
}
/**
 * Stop the live camera: stop all tracks of the current stream (if any),
 * detach it from the video element, mark the camera inactive and hide the
 * device dropdown row.
 */
function stopCameraStream() {
  const stream = state.cameraStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    state.cameraStream = null;
  }
  els.cameraVideo.srcObject = null;
  state.cameraActive = false;
  els.cameraStartBtn.classList.remove("is-active");
  els.cameraDeviceRow.hidden = true;
}
/**
 * Grab the current frame of the live camera as the pending image.
 *
 * On success the camera is stopped and the frame becomes the pending image.
 *
 * @returns {object|null} The captured payload, or `null` when the video frame
 *   could not be converted (the camera keeps running in that case).
 */
function captureFromCamera() {
  const frame = mediaToPayload(els.cameraVideo);
  if (frame) {
    stopCameraStream();
    setPendingImage(frame);
    return frame;
  }
  return null;
}
/**
 * Load a same-origin/object URL into an <img> and resolve once it has loaded.
 *
 * @param {string} src - Image URL.
 * @returns {Promise<HTMLImageElement>} Resolves with the loaded image;
 *   rejects with an Error naming the URL when loading fails.
 */
function loadImage(src) {
  const image = new Image();
  const loaded = new Promise((resolve, reject) => {
    image.onload = () => resolve(image);
    image.onerror = () => reject(new Error(`failed to load image: ${src}`));
  });
  // Handlers are attached before the load is started.
  image.src = src;
  return loaded;
}
/**
 * Use an uploaded file as the pending image.
 *
 * The file is loaded through a temporary object URL (always revoked
 * afterwards), converted to a payload, and on success replaces the live
 * camera and any sample selection. Failures are written to the websocket log.
 *
 * @param {Blob|File} [file] - Uploaded file; nothing happens when missing.
 * @returns {Promise<void>}
 */
async function selectFileImage(file) {
  if (!file) return;
  const tempUrl = URL.createObjectURL(file);
  try {
    const image = await loadImage(tempUrl);
    const payload = mediaToPayload(image);
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `上传错误:${err.message || err}`);
  } finally {
    URL.revokeObjectURL(tempUrl);
  }
}
/**
 * Use one of the sample images as the pending image.
 *
 * On success the live camera is stopped, the previous sample highlight is
 * cleared and the given thumbnail button (if any) is highlighted. Failures
 * are written to the websocket log.
 *
 * @param {string} src - URL of the sample image.
 * @param {HTMLElement} [buttonEl] - Thumbnail button to mark as selected.
 * @returns {Promise<void>}
 */
async function selectSampleImage(src, buttonEl) {
  try {
    const image = await loadImage(src);
    const payload = mediaToPayload(image);
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      if (buttonEl) {
        buttonEl.classList.add("is-selected");
      }
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `示例图加载错误:${err.message || err}`);
  }
}
/** Remove the "selected" highlight from every sample thumbnail. */
function clearSampleSelection() {
  const selected = els.cameraSamples.querySelectorAll(
    ".camera-drawer__sample.is-selected",
  );
  selected.forEach((thumbnail) => {
    thumbnail.classList.remove("is-selected");
  });
}
/**
 * Render the sample-image thumbnails under the camera preview.
 *
 * Runs only once: later calls return immediately. Each thumbnail is a button
 * that selects its sample image when clicked.
 */
function renderSampleThumbnails() {
  if (state.samplesRendered) return;
  state.samplesRendered = true;

  const container = els.cameraSamples;
  container.innerHTML = "";

  SAMPLE_IMAGES.forEach(({ src, label }) => {
    const button = document.createElement("button");
    button.type = "button";
    button.className = "camera-drawer__sample";
    button.title = label;

    const thumbnail = document.createElement("img");
    thumbnail.src = src;
    thumbnail.alt = label;
    button.appendChild(thumbnail);

    button.addEventListener("click", () => selectSampleImage(src, button));
    container.appendChild(button);
  });
}
/**
 * Return the camera input to its idle state: stop the live camera, drop the
 * pending image and sample selection, clear the preview photo and refresh the
 * submit button.
 */
function resetCameraInput() {
  stopCameraStream();
  state.pendingImage = null;
  clearSampleSelection();

  const photo = els.cameraPhoto;
  photo.removeAttribute("src");
  setPreviewMode("idle");
  setCameraButtonEnabled();
}
/**
 * Pre-select the first sample image so "拍摄完成" is immediately pressable
 * when the drawer opens, without requiring the user to capture or pick first.
 *
 * Does nothing when an image is already pending, the camera is live, or no
 * sample thumbnail / sample entry exists.
 */
function selectDefaultImage() {
  const alreadyHasSource = state.pendingImage || state.cameraActive;
  if (alreadyHasSource) return;

  const firstButton = els.cameraSamples.querySelector(".camera-drawer__sample");
  const firstSample = SAMPLE_IMAGES[0];
  if (!firstButton || !firstSample) return;
  selectSampleImage(firstSample.src, firstButton);
}
/**
 * Send an image to the engine as an `input.image` message and mirror it in
 * the chat.
 *
 * Like the text-input path, this interrupts in-flight bot audio and renders
 * the user's image and text together as one local bubble (the engine does
 * not echo image input back as a transcript event).
 *
 * @param {{dataUrl: string, mime: string, width: number, height: number}} payload
 *   Image to send.
 * @param {string} [text] - Accompanying text; defaults to `CAMERA_DONE_TEXT`.
 * @returns {boolean} `false` when there is no payload or no open websocket,
 *   otherwise `true`.
 */
function sendImage(payload, text) {
  if (!payload) return false;
  const socket = state.ws;
  const socketOpen = socket && socket.readyState === WebSocket.OPEN;
  if (!socketOpen) return false;

  const caption = text || CAMERA_DONE_TEXT;
  const { dataUrl, mime, width, height } = payload;
  wsSend(
    JSON.stringify({
      type: "input.image",
      image: dataUrl,
      mime_type: mime,
      width,
      height,
      text: caption,
      interrupt: true,
    }),
  );

  stopPlaybackQueue();
  state.currentAssistantBubble = null;
  addImageBubble("user", dataUrl, caption);
  return true;
}
/**
 * Submit the current camera image to the engine.
 *
 * If the live camera is on, the current frame is captured first; otherwise
 * (or if that capture fails) the already pending image is used. The image is
 * sent with the `CAMERA_DONE_TEXT` marker as its text, and the camera input
 * is reset once it was sent.
 */
function submitCameraImage() {
  const pending = state.pendingImage;
  const payload = state.cameraActive ? captureFromCamera() || pending : pending;
  if (!payload) return;

  // Keep the existing workflow contract: the accompanying text stays the
  // "【拍摄完成】" marker; the image is the multimodal attachment.
  const sent = sendImage(payload, CAMERA_DONE_TEXT);
  if (sent) {
    resetCameraInput();
  }
}
/* --------------------------------------------------------- Chat updates */
/**
 * Show a user transcript as a new user bubble.
 *
 * Also forgets the in-progress assistant bubble, so the next assistant text
 * starts a new bubble. Empty text is ignored.
 *
 * @param {string} text - Transcribed user speech.
 */
function handleUserTranscript(text) {
  if (text) {
    state.currentAssistantBubble = null;
    addBubble("user", text);
  }
}
/**
 * Send typed text to the engine as an interrupting `input.text` message.
 *
 * The engine does not echo text input back as a transcript event, so the
 * user bubble is rendered locally. In-flight bot audio is stopped so the next
 * reply is heard cleanly. `currentAssistantBubble` is deliberately NOT
 * cleared here: the engine emits `response.text.final(interrupted=true)` for
 * the in-flight assistant turn, which finalizes that bubble in place, and a
 * new bubble for the reply is created when `response.text.started` arrives.
 *
 * @param {string} text - Text to send; surrounding whitespace is trimmed.
 * @returns {boolean} `false` when the text is blank or there is no open
 *   websocket, otherwise `true`.
 */
function sendText(text) {
  const value = (text || "").trim();
  if (value === "") return false;

  const socket = state.ws;
  const socketOpen = socket && socket.readyState === WebSocket.OPEN;
  if (!socketOpen) return false;

  const payload = JSON.stringify({
    type: "input.text",
    text: value,
    interrupt: true,
  });
  wsSend(payload);
  stopPlaybackQueue();
  addBubble("user", value);
  return true;
}
/**
 * Append a streamed text delta to the current assistant bubble, creating an
 * empty assistant bubble first when none is active.
 *
 * @param {string} text Delta text; empty or missing deltas are ignored.
 */
function handleAssistantDelta(text) {
  if (!text) return;

  let bubble = state.currentAssistantBubble;
  if (!bubble) {
    bubble = addBubble("assistant", "");
    state.currentAssistantBubble = bubble;
  }
  appendToBubble(bubble, text);
}
/**
 * Handle `response.text.started`: forget the current assistant bubble so the
 * next text delta or final creates a new one instead of reusing the old one.
 */
function handleAssistantStarted() {
state.currentAssistantBubble = null;
}
/**
 * Finalize the assistant turn when `response.text.final` arrives.
 *
 * With non-empty `text`, the current assistant bubble's `.bubble__text` is
 * replaced by the final text (or a new assistant bubble is created when none
 * is active), and the chat is scrolled to the bottom. With empty `text`, the
 * streamed content is left as is.
 *
 * In both cases an `interrupted` turn marks the bubble with
 * `bubble--interrupted`, and the current-bubble pointer is cleared.
 *
 * @param {string} text Final text of the assistant turn; may be empty.
 * @param {boolean} interrupted Whether the turn was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  const active = state.currentAssistantBubble;

  if (!text) {
    // No replacement text: keep whatever was streamed, but do not lose the
    // interruption marker on a partially streamed bubble.
    if (active && interrupted) {
      active.classList.add("bubble--interrupted");
    }
    state.currentAssistantBubble = null;
    return;
  }

  let bubble = active;
  if (bubble) {
    const body = bubble.querySelector(".bubble__text");
    body.textContent = text;
  } else {
    bubble = addBubble("assistant", text);
  }
  if (interrupted) {
    bubble.classList.add("bubble--interrupted");
  }
  state.currentAssistantBubble = null;
  scrollChatToBottom();
}
/**
 * Drop the reference to the current assistant bubble; the bubble element
 * itself is not modified. Called by handleEvent on `response.audio.stopped`.
 */
function finalizeAssistantBubble() {
state.currentAssistantBubble = null;
}
/* ---------------------------------------------------------- Websocket IO */
/**
 * Decode a base64 string into 16-bit samples.
 *
 * The decoded bytes are viewed (not copied) as an Int16Array in the
 * platform's native byte order; a trailing odd byte is not included in the
 * view.
 *
 * @param {string} b64 Base64-encoded bytes.
 * @returns {Int16Array} View over the decoded bytes.
 */
function decodeBase64ToInt16(b64) {
  const decoded = atob(b64);
  const byteCount = decoded.length;
  const raw = new Uint8Array(byteCount);
  let index = 0;
  while (index < byteCount) {
    raw[index] = decoded.charCodeAt(index);
    index += 1;
  }
  // Two bytes per sample; the shift drops a dangling odd byte.
  const sampleCount = byteCount >> 1;
  return new Int16Array(raw.buffer, raw.byteOffset, sampleCount);
}
// Routes each known `event.type` to its handler. Types that are
// intentionally ignored are listed with a no-op so they do not fall through
// to the unknown-event debug log.
const WS_EVENT_ROUTES = new Map([
  [
    "response.audio.delta",
    (event) => {
      // Only string payloads are decoded and scheduled for playback.
      if (typeof event.audio !== "string") return;
      schedulePlayback(decodeBase64ToInt16(event.audio));
    },
  ],
  ["response.audio.started", () => setBotIndicator(true)],
  // The indicator turns off automatically when the playback queue drains.
  ["response.audio.stopped", () => finalizeAssistantBubble()],
  ["response.text.delta", (event) => handleAssistantDelta(event.text)],
  ["response.text.started", () => handleAssistantStarted()],
  [
    "response.text.final",
    (event) => handleAssistantFinal(event.text, event.interrupted),
  ],
  ["response.state", (event) => setAssistantState(event.state)],
  ["input.transcript.final", (event) => handleUserTranscript(event.text)],
  // Ignore partial ASR updates; chat history renders committed user turns.
  ["input.transcript.interim", () => {}],
  // Reserved for future structured messages; ignore silently.
  ["transport.message", () => {}],
]);

/**
 * Dispatch one parsed websocket event to its handler.
 *
 * @param {{type: string}} event Parsed JSON event from the server.
 */
function handleEvent(event) {
  const route = WS_EVENT_ROUTES.get(event.type);
  if (route) {
    route(event);
    return;
  }
  // Unknown event type: log for debugging.
  console.debug("ws event", event);
}
/**
 * Open the product websocket and wire up its lifecycle handlers.
 *
 * Steps, in order:
 *  1. Bail out if a connection is already established or in progress.
 *  2. Resolve the chat id (input field value, else a generated one) and the
 *     websocket URL; a missing URL is reported as an error status.
 *  3. Initialize the audio context (must happen on the user gesture so
 *     playback works on Safari), then construct the WebSocket. A failure in
 *     either step rolls the connection state back and is logged.
 *  4. Register `open` / `message` / `error` / `close` listeners that send
 *     `session.start`, dispatch server events, and reset the UI on close.
 *
 * @returns {Promise<void>} Resolves once the socket has been created and its
 *   listeners registered (not when the connection is open), or earlier when
 *   the attempt is skipped or fails.
 */
async function connect() {
  if (state.connected || state.connecting) return;

  const inputChatId = currentChatIdInput();
  const chatId = inputChatId || generateChatId();
  const url = wsUrlWithChatId(chatId);
  if (!url) {
    setStatus("error", "缺少服务器地址");
    return;
  }

  // Shared rollback for a failed attempt: undo the "connecting" state and
  // clear the chat id field only when this attempt generated the id.
  const abortAttempt = (statusText, logText) => {
    state.connecting = false;
    state.chatId = "";
    if (!inputChatId) els.chatId.value = "";
    setStatus("error", statusText);
    setConnectButton();
    addWsLog("error", logText, "error");
  };

  state.connecting = true;
  state.chatId = chatId;
  els.chatId.value = chatId;
  setStatus("connecting", "连接中…");
  setConnectButton();
  addWsLog("system", `正在连接 ${url}`);

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    abortAttempt("音频初始化失败", `音频初始化失败:${err.message || err}`);
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    abortAttempt("服务器地址无效", `WebSocket 地址无效:${err.message || err}`);
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", () => {
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
      chatId: state.chatId,
    };
    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "连接已建立");
    setStatus("connected", "已连接");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    refreshMicDevices();
    wsSend(JSON.stringify(startMessage));
    addBubble("system", "会话已开始。");
    setComposerEnabled(true);
    setCameraButtonEnabled();
    els.textInput.focus();
  });

  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // `new Int16Array(buffer)` throws a RangeError when the byte length is
      // odd, so view only the whole 16-bit samples and drop a stray byte.
      const sampleCount = Math.floor(data.byteLength / 2);
      schedulePlayback(new Int16Array(data, 0, sampleCount));
    }
  });

  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "连接错误");
    addWsLog("error", "websocket error", "error");
  });

  ws.addEventListener("close", (event) => {
    // Captured before the reset below: distinguishes "session ended" from
    // "never managed to connect".
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    state.chatId = "";
    setAssistantState("");
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setCameraButtonEnabled();
    setBotIndicator(false);
    finalizeWsLogGroup();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      // Separate the close reason from the message; without a separator the
      // two were rendered as one run-on string.
      addBubble(
        "system",
        `会话已结束${event.reason ? `:${event.reason}` : ""}`,
      );
      setStatus("idle", "未连接");
    } else {
      setStatus("error", "连接已断开");
    }
  });
}
/**
 * Close the current websocket, if any.
 *
 * When the socket is open, a `session.stop` message is sent first. Both the
 * send and the close are best-effort: an exception from either is ignored so
 * that one failing step never prevents the other.
 */
function disconnect() {
  if (!state.ws) return;

  // Run one teardown step, deliberately ignoring any exception it throws.
  const bestEffort = (step) => {
    try {
      step();
    } catch (_) {
      /* ignore */
    }
  };

  bestEffort(() => {
    if (state.ws.readyState !== WebSocket.OPEN) return;
    wsSend(
      JSON.stringify({ type: "session.stop", reason: "client_disconnect" }),
    );
  });
  bestEffort(() => {
    state.ws.close(1000, "client_disconnect");
  });
}
/* ---------------------------------------------------------------- Wiring */
// Connect button toggles the session: disconnect when connected, otherwise
// start a new connection attempt.
els.connectBtn.addEventListener("click", () => {
  if (state.connected) {
    disconnect();
    return;
  }
  connect();
});

els.copyChatIdBtn.addEventListener("click", copyChatId);

// Mic button toggles capture. The button is disabled while the toggle is in
// flight and re-enabled afterwards only if the session is still connected.
els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;
  const { micBtn } = els;
  micBtn.disabled = true;
  try {
    if (!state.micEnabled) {
      await startMic();
    } else {
      stopMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `麦克风错误:${err.message || err}`);
  } finally {
    micBtn.disabled = !state.connected;
  }
});

// Picking another microphone always records the choice; capture is restarted
// on the new device only when the mic is currently enabled.
els.micSelect.addEventListener("change", async () => {
  const { micSelect, micBtn } = els;
  state.selectedMicDeviceId = micSelect.value;
  if (!state.micEnabled) return;

  micSelect.disabled = true;
  micBtn.disabled = true;
  try {
    stopMic();
    await startMic();
  } catch (err) {
    console.error("Mic switch error", err);
    addBubble("system", `麦克风切换错误:${err.message || err}`);
  } finally {
    setMicButton();
    setMicSelectEnabled();
  }
});

// Refresh the microphone list on device changes, where the API is available.
if (navigator.mediaDevices?.addEventListener) {
  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}

els.clearBtn.addEventListener("click", () => {
  clearChat();
});

els.clearWsLogBtn.addEventListener("click", () => {
  clearWsLog();
});

// The "done" button only submits while a camera state is set.
els.cameraDoneBtn.addEventListener("click", () => {
  if (state.cameraState) {
    submitCameraImage();
  }
});

els.cameraStartBtn.addEventListener("click", () => {
  // An empty selection is passed on as `undefined`.
  const deviceId = els.cameraDeviceSelect.value || undefined;
  startCamera(deviceId);
});

els.cameraDeviceSelect.addEventListener("change", () => {
  // Switching device only restarts the stream when the camera is already live;
  // otherwise the choice is applied when "使用摄像头" is pressed.
  if (!state.cameraActive) return;
  const deviceId = els.cameraDeviceSelect.value || undefined;
  startCamera(deviceId);
});

els.cameraUpload.addEventListener("change", (event) => {
  const input = event.target;
  const file = input.files && input.files[0];
  selectFileImage(file);
  // Clear the input's value after handing the file over.
  input.value = "";
});
/**
 * Resize the text input to fit its content, capped at 180px.
 *
 * The height is first reset to "auto" and `scrollHeight` is read afterwards,
 * so the measurement is taken after the previously applied height is cleared.
 */
function autosizeTextarea() {
  const maxHeightPx = 180;
  const input = els.textInput;
  input.style.height = "auto";
  const fitted = Math.min(input.scrollHeight, maxHeightPx);
  input.style.height = `${fitted}px`;
}
/**
 * Send the composer's current text. When `sendText` accepts it, clear the
 * input, re-fit its height, and refresh the composer's enabled state.
 * Nothing is changed when `sendText` rejects the value.
 */
function submitText() {
  const wasSent = sendText(els.textInput.value);
  if (!wasSent) return;

  els.textInput.value = "";
  autosizeTextarea();
  setComposerEnabled(state.connected);
}
// Submitting the composer form sends the text instead of navigating.
els.composer.addEventListener("submit", (event) => {
  event.preventDefault();
  submitText();
});

// Typing re-fits the textarea and refreshes the composer's enabled state.
els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Plain Enter submits; Shift+Enter and Enter during IME composition are left
// to the textarea's default handling.
els.textInput.addEventListener("keydown", (event) => {
  const isPlainEnter =
    event.key === "Enter" && !event.shiftKey && !event.isComposing;
  if (!isPlainEnter) return;
  event.preventDefault();
  submitText();
});

// Best-effort teardown on page unload: stop the camera, then close the
// websocket and the audio context if they exist. Close errors are ignored.
window.addEventListener("beforeunload", () => {
  stopCameraStream();
  for (const resource of [state.ws, state.audioContext]) {
    if (!resource) continue;
    try {
      resource.close();
    } catch (_) {
      /* ignore */
    }
  }
});

// Initial UI state: default server URL, idle status, and every control
// rendered for the disconnected state.
els.url.value = defaultWsUrl();
setStatus("idle", "未连接");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);