1545 lines
44 KiB
JavaScript
1545 lines
44 KiB
JavaScript
/**
 * Minimal browser client for the AI VideoAssistant engine's product
 * websocket (`/ws-product`, protocol `va.ws.v1`).
 *
 * Responsibilities:
 *   - Open/close the websocket and run the session handshake.
 *   - List/select microphones and capture mic audio with browser AEC enabled.
 *   - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
 *     as binary websocket messages.
 *   - Play `response.audio.delta` frames gaplessly through Web Audio.
 *   - Render a chat-style history of user transcripts and bot text deltas.
 *   - Collapse high-frequency audio frames into expandable websocket log groups.
 */
|
||
|
||
// Audio format shared by capture and playback (see the file header):
// 16 kHz, single channel, 20 ms frames.
const SAMPLE_RATE = 16000;
const CHANNELS = 1;
const FRAME_MS = 20;

// Websocket protocol identifier.
const PROTOCOL = "va.ws.v1";

// Websocket log panel limits: top-level rows kept, and child rows rendered
// inside one expanded group.
const MAX_WS_LOG_LINES = 120;
const MAX_GROUP_CHILDREN_RENDER = 100;

// Keys identifying the runs of high-frequency messages that get collapsed
// into a single expandable log group.
const WS_LOG_GROUP_KEYS = {
  AUDIO_DELTA: "recv:response.audio.delta",
  TEXT_DELTA: "recv:response.text.delta",
  AUDIO_SEND: "send:input.audio",
};

const CAMERA_DONE_TEXT = "【拍摄完成】";

// Sample images shown as thumbnails under the camera preview. They are
// same-origin files, so drawing them to a canvas (for base64 + dimensions)
// does not taint it.
const SAMPLE_IMAGES = [
  { src: "./samples/damage1.png", label: "车辆前部" },
  { src: "./samples/damage2.png", label: "车辆后部" },
  { src: "./samples/plate1.jpg", label: "车牌 1" },
  { src: "./samples/plate2.jpg", label: "车牌 2" },
  { src: "./samples/user1.jpg", label: "人物 1" },
  { src: "./samples/user2.jpg", label: "人物 2" },
];

// The longer image edge is capped at this size before JPEG encoding so that
// payloads stay small.
const IMAGE_MAX_DIM = 1280;
const IMAGE_JPEG_QUALITY = 0.85;

// Assistant states that open the camera drawer, mapped to the instruction
// shown to the user while that state is active.
const CAMERA_STATE_PROMPTS = {
  2000: "请对准车辆碰撞部位拍摄照片。",
  2001: "请对准车辆碰撞部位拍摄照片。",
  2002: "请对准被撞物品拍摄照片。",
  2003: "请切换摄像头对准本人拍摄一张正面照片。",
  2010: "请对准第一辆车碰撞部位拍摄。",
  2011: "请对准第一辆车碰撞部位拍摄。",
  2012: "请对准第二辆车碰撞部位拍摄。",
  2013: "请对准第二方车辆侧后方,看清车牌拍摄。",
  2014: "请拍摄另一方驾驶人的正面照片。",
  2015: "请切换前置摄像头对准本人拍摄一张正面照片。",
};
|
||
|
||
/**
 * Build the default websocket endpoint for the host serving this page.
 *
 * @returns {string} `wss://<host>/ws-product` on https pages, otherwise
 *   `ws://<host>/ws-product`.
 */
function defaultWsUrl() {
  const pageIsSecure = location.protocol === "https:";
  const wsScheme = pageIsSecure ? "wss:" : "ws:";
  return [wsScheme, "//", location.host, "/ws-product"].join("");
}
|
||
|
||
// Cached references to the DOM nodes this script reads or updates. All are
// looked up by id except the mic button label, which is found by class.
const els = (() => {
  const byId = (id) => document.getElementById(id);
  return {
    url: byId("ws-url"),
    chatId: byId("chat-id"),
    copyChatIdBtn: byId("copy-chat-id-btn"),
    connectBtn: byId("connect-btn"),
    statusDot: byId("status-dot"),
    statusText: byId("status-text"),
    conversation: byId("conversation"),
    chatLog: byId("chat-log"),
    micBtn: byId("mic-btn"),
    micSelect: byId("mic-select"),
    micLabel: document.querySelector(".mic-btn__label"),
    micIndicator: byId("mic-indicator"),
    botIndicator: byId("bot-indicator"),
    stateIndicator: byId("state-indicator"),
    stateLabel: byId("state-label"),
    cameraDrawer: byId("camera-drawer"),
    cameraState: byId("camera-state"),
    cameraQuestion: byId("camera-question"),
    cameraDoneBtn: byId("camera-done-btn"),
    cameraPreview: byId("camera-preview"),
    cameraVideo: byId("camera-video"),
    cameraPhoto: byId("camera-photo"),
    cameraCanvas: byId("camera-canvas"),
    cameraStartBtn: byId("camera-start-btn"),
    cameraDeviceRow: byId("camera-device-row"),
    cameraDeviceSelect: byId("camera-device-select"),
    cameraUpload: byId("camera-upload"),
    cameraSamples: byId("camera-samples"),
    clearBtn: byId("clear-btn"),
    clearWsLogBtn: byId("clear-ws-log-btn"),
    wsLog: byId("ws-log"),
    meterFill: byId("meter-fill"),
    composer: byId("composer"),
    textInput: byId("text-input"),
    sendBtn: byId("send-btn"),
  };
})();
|
||
|
||
/**
 * Generate a fresh chat id of the form `voice_` + 16 lowercase hex characters.
 *
 * `crypto.randomUUID` is only exposed in secure contexts, so on plain-http
 * pages the id is built from `crypto.getRandomValues` instead. `Math.random`
 * is the last resort when no Web Crypto API exists at all. Every path yields
 * the same id shape.
 *
 * @returns {string} The new chat id.
 */
function generateChatId() {
  const hasCrypto = typeof crypto !== "undefined" && crypto !== null;
  if (hasCrypto && typeof crypto.randomUUID === "function") {
    return `voice_${crypto.randomUUID().replaceAll("-", "").slice(0, 16)}`;
  }

  const bytes = new Uint8Array(8);
  if (hasCrypto && typeof crypto.getRandomValues === "function") {
    crypto.getRandomValues(bytes);
  } else {
    for (let i = 0; i < bytes.length; i += 1) {
      bytes[i] = Math.floor(Math.random() * 256);
    }
  }
  const hex = Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0"));
  return `voice_${hex.join("")}`;
}
|
||
|
||
/**
 * Read the chat id typed into the chat-id input.
 *
 * @returns {string} The trimmed input value, or "" when the input is empty.
 */
function currentChatIdInput() {
  const typed = els.chatId.value;
  return typed ? typed.trim() : "";
}
|
||
|
||
/**
 * Return the websocket URL from the URL input with `chatId` set as a query
 * parameter.
 *
 * @param {string} chatId - Chat id to attach; when falsy the URL is returned
 *   unchanged.
 * @returns {string} The resulting URL (or the trimmed raw input when either
 *   the input or `chatId` is empty).
 */
function wsUrlWithChatId(chatId) {
  const typed = els.url.value;
  const rawUrl = typed ? typed.trim() : "";
  if (rawUrl === "" || !chatId) return rawUrl;

  try {
    // Relative inputs are resolved against the current page.
    const parsed = new URL(rawUrl, location.href);
    parsed.searchParams.set("chatId", chatId);
    return parsed.href;
  } catch (_) {
    // Unparseable input: append the parameter by hand.
    const joiner = rawUrl.includes("?") ? "&" : "?";
    return `${rawUrl}${joiner}chatId=${encodeURIComponent(chatId)}`;
  }
}
|
||
|
||
// Mutable, module-wide client state.
const state = {
  // Websocket connection.
  ws: null,
  connected: false,
  connecting: false,
  chatId: "",

  // Web Audio objects used for microphone capture.
  audioContext: null,
  micStream: null,
  micSourceNode: null,
  recorderNode: null,

  // Microphone selection.
  micEnabled: false,
  micDevices: [],
  selectedMicDeviceId: "",

  // Scheduling of bot audio output.
  nextPlaybackTime: 0,
  playbackEndsAt: 0,
  scheduledSources: [],
  botActive: false,
  botUiTimer: null,

  // Chat view.
  currentAssistantBubble: null,
  assistantState: "",
  cameraState: "",

  // Camera / image input.
  cameraStream: null,
  cameraActive: false,
  cameraFacing: "environment",
  videoDevices: [],
  pendingImage: null,
  samplesRendered: false,

  // Smoothed VU meter level.
  meterLevel: 0,

  // Currently open collapsible websocket log group (high-frequency frames).
  wsLogGroup: null,
};
|
||
|
||
/* ------------------------------------------------------------------ UI */
|
||
|
||
/**
 * Update the connection status dot and its label.
 *
 * @param {string} kind - Suffix for the `status__dot--<kind>` modifier class.
 * @param {string} text - Text shown next to the dot.
 */
function setStatus(kind, text) {
  const dotClasses = ["status__dot", `status__dot--${kind}`];
  els.statusDot.className = dotClasses.join(" ");
  els.statusText.textContent = text;
}
|
||
|
||
/**
 * Sync the connect button, chat-id input and copy button with the current
 * connection state (connecting / connected / idle).
 */
function setConnectButton() {
  const { connected, connecting, chatId } = state;
  els.chatId.disabled = connected || connecting;
  els.copyChatIdBtn.disabled = !connected || !chatId;

  let caption = "连接";
  if (connecting) {
    caption = "连接中…";
  } else if (connected) {
    caption = "断开连接";
  }

  const button = els.connectBtn;
  button.textContent = caption;
  // The button is only blocked while a connection attempt is in flight.
  button.disabled = Boolean(connecting);
  button.classList.toggle("is-disconnect", !connecting && Boolean(connected));
}
|
||
|
||
/**
 * Copy the active chat id to the clipboard and briefly flag the copy button.
 *
 * Uses the async Clipboard API first. If that is unavailable or rejected,
 * falls back to selecting the chat-id input and `document.execCommand("copy")`.
 * The input's disabled flag is always restored, and the "copied" feedback is
 * shown only when a copy actually succeeded.
 *
 * @returns {Promise<void>}
 */
async function copyChatId() {
  if (!state.connected || !state.chatId) return;

  let copied = false;
  try {
    await navigator.clipboard.writeText(state.chatId);
    copied = true;
  } catch (_) {
    const input = els.chatId;
    const wasDisabled = input.disabled;
    const { selectionStart, selectionEnd } = input;
    try {
      // The input is enabled temporarily so that it can be selected.
      input.disabled = false;
      input.select();
      copied = Boolean(document.execCommand("copy"));
      input.setSelectionRange(selectionStart, selectionEnd);
    } catch (err) {
      console.warn("Could not copy chat id", err);
    } finally {
      input.disabled = wasDisabled;
    }
  }

  if (!copied) return;

  els.copyChatIdBtn.classList.add("copied");
  window.setTimeout(() => {
    els.copyChatIdBtn.classList.remove("copied");
  }, 1200);
}
|
||
|
||
/**
 * Sync the mic toggle button (enabled flag, pressed state, tooltip, label)
 * and the mic indicator with `state.connected` / `state.micEnabled`.
 */
function setMicButton() {
  const micOn = state.micEnabled;
  const caption = micOn ? "关闭麦克风" : "开启麦克风";

  const button = els.micBtn;
  button.disabled = !state.connected;
  button.setAttribute("aria-pressed", String(Boolean(micOn)));
  button.title = caption;

  els.micLabel.textContent = caption;
  els.micIndicator.classList.toggle("is-active", micOn);
}
|
||
|
||
/**
 * Enable the microphone dropdown only while connected and while the browser
 * exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const selectable = state.connected && Boolean(navigator.mediaDevices);
  els.micSelect.disabled = !selectable;
}
|
||
|
||
/**
 * Enable or disable the text composer. The send button additionally stays
 * disabled while the draft is blank. Also refreshes the camera "done" button.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  const draftIsBlank = els.textInput.value.trim().length === 0;
  els.textInput.disabled = !enabled;
  els.sendBtn.disabled = !enabled || draftIsBlank;
  setCameraButtonEnabled();
}
|
||
|
||
/**
 * Switch the bot activity indicator on or off.
 *
 * @param {boolean} active - Whether the indicator carries the `is-active` class.
 */
function setBotIndicator(active) {
  const { classList } = els.botIndicator;
  classList.toggle("is-active", active);
}
|
||
|
||
/**
 * Record the assistant state, show it in the state indicator, and let the
 * camera drawer react to it.
 *
 * @param {*} value - New state; anything that is not a string is treated as "".
 */
function setAssistantState(value) {
  let text = "";
  if (typeof value === "string") {
    text = value.trim();
  }

  // The visible label is capped at 32 characters; the tooltip keeps the full text.
  let label = text;
  if (text.length > 32) {
    label = `${text.slice(0, 31)}…`;
  }
  const hasState = text !== "";

  state.assistantState = text;
  els.stateIndicator.classList.toggle("is-active", hasState);
  els.stateLabel.textContent = hasState ? `状态 ${label}` : "状态 -";
  els.stateIndicator.title = hasState ? `助手状态:${text}` : "助手状态";
  syncCameraDrawer(text);
}
|
||
|
||
/**
 * Enable the camera "done" button only when the socket is open, a camera
 * state is active, and there is a live camera or a pending image to send.
 * Does nothing when the button is not in the page.
 */
function setCameraButtonEnabled() {
  const doneBtn = els.cameraDoneBtn;
  if (!doneBtn) return;

  const socketOpen =
    Boolean(state.connected && state.ws) &&
    state.ws.readyState === WebSocket.OPEN;
  const imageAvailable = state.cameraActive || Boolean(state.pendingImage);
  const canSubmit = socketOpen && Boolean(state.cameraState) && imageAvailable;
  doneBtn.disabled = !canSubmit;
}
|
||
|
||
/**
 * Open or close the camera drawer for the given assistant state.
 *
 * The drawer opens only when `value` is one of the keys defined in
 * CAMERA_STATE_PROMPTS. The lookup is restricted to own properties so that
 * inherited names such as "constructor" or "toString" can never be mistaken
 * for a camera state.
 *
 * @param {string} value - Current assistant state.
 */
function syncCameraDrawer(value) {
  const isCameraState = Object.prototype.hasOwnProperty.call(
    CAMERA_STATE_PROMPTS,
    value,
  );
  const prompt = isCameraState ? CAMERA_STATE_PROMPTS[value] : undefined;
  const open = Boolean(prompt);
  const wasOpen = Boolean(state.cameraState);

  state.cameraState = open ? value : "";
  els.cameraDrawer.classList.toggle("is-open", open);
  els.conversation.classList.toggle("has-camera", open);
  els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");

  if (open) {
    els.cameraState.textContent = `状态 ${value}`;
    els.cameraQuestion.textContent = prompt;
    renderSampleThumbnails();
    selectDefaultImage();
  } else {
    els.cameraState.textContent = "状态 -";
    els.cameraQuestion.textContent = "";
    // Camera input is reset only on an actual open -> closed transition.
    if (wasOpen) resetCameraInput();
  }
  setCameraButtonEnabled();
}
|
||
|
||
/**
 * Append a text bubble to the chat log and scroll it into view.
 *
 * @param {string} role - Used for the `bubble--<role>` class. "system"
 *   bubbles get no role tag, "user" is tagged "你", anything else "助手".
 * @param {string} text - Bubble text.
 * @returns {HTMLElement} The created bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  // The first real bubble replaces the empty-state placeholder.
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = ["bubble", `bubble--${role}`].join(" ");

  if (role !== "system") {
    const roleTag = document.createElement("span");
    roleTag.className = "bubble__role";
    roleTag.textContent = role === "user" ? "你" : "助手";
    bubble.appendChild(roleTag);
  }

  const textEl = document.createElement("span");
  textEl.className = "bubble__text";
  textEl.textContent = text;
  bubble.appendChild(textEl);

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
|
||
|
||
// Render a single chat bubble holding an image and (optionally) text together.
|
||
/**
 * Append a chat bubble that shows an image followed by optional text.
 *
 * @param {string} role - Used for the `bubble--<role>` class. "system"
 *   bubbles get no role tag, "user" is tagged "你", anything else "助手".
 * @param {string} imageUrl - Value for the image's `src`.
 * @param {string} [text] - Caption; also used as alt text ("image" when empty).
 * @returns {HTMLElement} The created bubble element.
 */
function addImageBubble(role, imageUrl, text) {
  const log = els.chatLog;
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = ["bubble", `bubble--${role}`].join(" ");

  if (role !== "system") {
    const roleTag = document.createElement("span");
    roleTag.className = "bubble__role";
    roleTag.textContent = role === "user" ? "你" : "助手";
    bubble.appendChild(roleTag);
  }

  const picture = document.createElement("img");
  picture.className = "bubble__image";
  picture.src = imageUrl;
  picture.alt = text || "image";
  bubble.appendChild(picture);

  // The text span is always present, even when there is no caption.
  const caption = document.createElement("span");
  caption.className = "bubble__text";
  caption.textContent = text || "";
  bubble.appendChild(caption);

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
|
||
|
||
/**
 * Append text to an existing bubble's text span and keep the chat scrolled
 * to the bottom.
 *
 * @param {HTMLElement} bubble - Bubble containing a `.bubble__text` element.
 * @param {string} text - Text to add at the end.
 */
function appendToBubble(bubble, text) {
  const textEl = bubble.querySelector(".bubble__text");
  textEl.textContent = textEl.textContent + text;
  scrollChatToBottom();
}
|
||
|
||
/** Scroll the chat log to its bottom. */
function scrollChatToBottom() {
  const log = els.chatLog;
  log.scrollTop = log.scrollHeight;
}
|
||
|
||
/**
 * Empty the chat log, forget the in-progress assistant bubble, reset the
 * assistant state, and show the "cleared" placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;
  setAssistantState("");

  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>对话已清空。</p>";
  log.appendChild(placeholder);
}
|
||
|
||
/**
 * Stringify a value and shorten it for display in the websocket log.
 *
 * @param {*} value - Value to show; converted with `String()`.
 * @param {number} [maxLength=160] - Maximum length of the result in UTF-16
 *   code units, including the trailing "…".
 * @returns {string} The text unchanged when it fits, otherwise a prefix
 *   followed by "…". Returns "" when `maxLength` is below 1.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  if (text.length <= maxLength) return text;
  // No room even for the ellipsis (also covers NaN).
  if (!(maxLength >= 1)) return "";

  let head = text.slice(0, maxLength - 1);
  // Do not leave a lone high surrogate behind when the cut lands inside an
  // astral character such as an emoji.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }
  return `${head}…`;
}
|
||
|
||
/**
 * Format a timestamp as a 24-hour, two-digit hour/minute/second string in
 * the default locale, for websocket log rows.
 *
 * `hourCycle: "h23"` is used rather than `hour12: false` because the latter
 * can select the 1-24 hour cycle in some engine/locale combinations, which
 * renders midnight as "24:xx:xx".
 *
 * @param {Date} [date=new Date()] - Moment to format.
 * @returns {string} The formatted time.
 */
function formatLogTime(date = new Date()) {
  return date.toLocaleTimeString([], {
    hourCycle: "h23",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  });
}
|
||
|
||
/**
 * Format a byte count for the websocket log.
 *
 * @param {number} byteCount - Size in bytes.
 * @returns {string} "<n> bytes" (or "1 byte") below 1 KiB, "<n.n> KB" below
 *   1 MiB, otherwise "<n.nn> MB". A value that would round up to "1024.0 KB"
 *   is shown in MB instead.
 */
function formatLogBytes(byteCount) {
  const KIB = 1024;
  const MIB = KIB * KIB;

  if (byteCount >= KIB) {
    const kilobytes = (byteCount / KIB).toFixed(1);
    if (byteCount < MIB && Number(kilobytes) < KIB) {
      return `${kilobytes} KB`;
    }
    return `${(byteCount / MIB).toFixed(2)} MB`;
  }
  return byteCount === 1 ? "1 byte" : `${byteCount} bytes`;
}
|
||
|
||
/**
 * Map a websocket log group key to the label shown in the group summary.
 *
 * @param {string} groupKey - One of the WS_LOG_GROUP_KEYS values.
 * @returns {string} The label, or "grouped events" for an unknown key.
 */
function wsLogGroupLabel(groupKey) {
  switch (groupKey) {
    case WS_LOG_GROUP_KEYS.AUDIO_DELTA:
      return "response.audio.delta";
    case WS_LOG_GROUP_KEYS.TEXT_DELTA:
      return "response.text.delta";
    case WS_LOG_GROUP_KEYS.AUDIO_SEND:
      return "input.audio binary";
    default:
      return "grouped events";
  }
}
|
||
|
||
/** Remove the empty-state placeholder from the websocket log, if present. */
function ensureWsLogReady() {
  const log = els.wsLog;
  const placeholder = log.querySelector(".ws-log__empty");
  if (placeholder) {
    log.innerHTML = "";
  }
}
|
||
|
||
/** Scroll the websocket log to its bottom. */
function scrollWsLogToBottom() {
  const log = els.wsLog;
  log.scrollTop = log.scrollHeight;
}
|
||
|
||
/**
 * Drop the oldest top-level websocket log rows until at most
 * MAX_WS_LOG_LINES remain. If the open group is among the removed rows it is
 * forgotten, so the next grouped message starts a new group.
 */
function trimWsLog() {
  const log = els.wsLog;
  while (log.children.length > MAX_WS_LOG_LINES) {
    const oldest = log.firstElementChild;
    const openGroup = state.wsLogGroup;
    if (openGroup && openGroup.element === oldest) {
      state.wsLogGroup = null;
    }
    oldest.remove();
  }
}
|
||
|
||
/**
 * Close the currently open websocket log group so that the next grouped
 * message starts a new one.
 */
function finalizeWsLogGroup() {
  state.wsLogGroup = null;
}
|
||
|
||
/**
 * Build one websocket log row made of time, direction and detail spans.
 *
 * @param {string} direction - "send", "recv", or another tag that is shown
 *   upper-cased.
 * @param {string} detail - Row text.
 * @param {string} kind - Suffix for the `ws-log__entry--<kind>` class.
 * @param {string} [timeText=formatLogTime()] - Preformatted timestamp.
 * @returns {HTMLDivElement} The row element (not yet attached to the DOM).
 */
function createWsLogEntry(direction, detail, kind, timeText = formatLogTime()) {
  const makeSpan = (className, text) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = text;
    return span;
  };

  let directionLabel;
  if (direction === "send") {
    directionLabel = "SEND";
  } else if (direction === "recv") {
    directionLabel = "RECV";
  } else {
    directionLabel = direction.toUpperCase();
  }

  const entry = document.createElement("div");
  entry.className = `ws-log__entry ws-log__entry--${kind}`;
  entry.append(
    makeSpan("ws-log__time", timeText),
    makeSpan("ws-log__direction", directionLabel),
    makeSpan("ws-log__detail", detail),
  );
  return entry;
}
|
||
|
||
/**
 * Refresh a group's summary text: "<label> ×<count> (<total size>)".
 *
 * @param {object} group - Group record from appendWsLogGroupItem.
 */
function updateWsLogGroupSummary(group) {
  const label = wsLogGroupLabel(group.key);
  const size = formatLogBytes(group.totalBytes);
  group.summaryEl.textContent = `${label} ×${group.count} (${size})`;
}
|
||
|
||
/**
 * Append one child row to an expanded group, keeping at most
 * MAX_GROUP_CHILDREN_RENDER rows in the DOM.
 *
 * When the limit is exceeded the oldest row is removed. A notice at the top
 * reports how many earlier events are hidden, in the same
 * "… N earlier events omitted" format that renderWsLogGroupChildren uses,
 * and is updated on every append.
 *
 * @param {object} group - Group record (direction, kind, childrenEl, items…).
 * @param {{time: string, detail: string}} item - Event to render.
 */
function appendWsLogGroupChildDom(group, item) {
  const container = group.childrenEl;
  const entry = createWsLogEntry(
    group.direction,
    item.detail,
    group.kind,
    item.time,
  );
  entry.classList.add("ws-log__entry--child");
  container.appendChild(entry);

  const childEntries = container.querySelectorAll(".ws-log__entry");
  if (childEntries.length <= MAX_GROUP_CHILDREN_RENDER) return;

  childEntries[0].remove();

  let notice = container.querySelector(".ws-log__group-omit");
  if (!notice) {
    notice = document.createElement("div");
    notice.className = "ws-log__group-omit";
    container.insertBefore(notice, container.firstElementChild);
  }
  // Everything recorded for the group beyond the rendered window is hidden.
  const total = Array.isArray(group.items) ? group.items.length : group.count;
  const omitted = total - MAX_GROUP_CHILDREN_RENDER;
  notice.textContent =
    omitted > 0
      ? `… ${omitted} earlier events omitted`
      : "… earlier events omitted";
}
|
||
|
||
/**
 * Rebuild a group's child list from `group.items`, showing only the newest
 * MAX_GROUP_CHILDREN_RENDER entries plus a notice counting the older ones.
 *
 * @param {object} group - Group record from appendWsLogGroupItem.
 */
function renderWsLogGroupChildren(group) {
  const container = group.childrenEl;
  container.innerHTML = "";

  const skipped = Math.max(0, group.items.length - MAX_GROUP_CHILDREN_RENDER);
  if (skipped > 0) {
    const notice = document.createElement("div");
    notice.className = "ws-log__group-omit";
    notice.textContent = `… ${skipped} earlier events omitted`;
    container.appendChild(notice);
  }

  for (const item of group.items.slice(skipped)) {
    appendWsLogGroupChildDom(group, item);
  }
}
|
||
|
||
/**
 * Expand or collapse a websocket log group.
 *
 * Child rows are appended to the DOM only while a group is expanded, so the
 * list is rebuilt from `group.items` on every expand. Otherwise events that
 * arrived while the group was collapsed would be missing from the view.
 *
 * @param {object} group - Group record from appendWsLogGroupItem.
 */
function toggleWsLogGroup(group) {
  group.collapsed = !group.collapsed;
  group.childrenEl.hidden = group.collapsed;
  group.chevronEl.textContent = group.collapsed ? "▶" : "▼";
  group.headerEl.setAttribute("aria-expanded", group.collapsed ? "false" : "true");

  if (!group.collapsed) {
    renderWsLogGroupChildren(group);
  }
}
|
||
|
||
/**
 * Record one high-frequency websocket event inside a collapsible log group.
 *
 * Consecutive events with the same `groupKey` share one group row; a
 * different key (or no open group) starts a new collapsed group.
 *
 * @param {string} groupKey - Key identifying the run.
 * @param {string} direction - "send" or anything else (shown as RECV).
 * @param {string} kind - Suffix for the group/entry modifier classes.
 * @param {string} itemDetail - Text for the child row.
 * @param {number} [byteCount=0] - Size added to the group's running total.
 */
function appendWsLogGroupItem(groupKey, direction, kind, itemDetail, byteCount = 0) {
  ensureWsLogReady();

  const build = (tag, className) => {
    const node = document.createElement(tag);
    node.className = className;
    return node;
  };

  let group = state.wsLogGroup;
  const continuesRun = Boolean(group) && group.key === groupKey;
  if (!continuesRun) {
    finalizeWsLogGroup();

    const element = build("div", `ws-log__group ws-log__group--${kind}`);

    const headerEl = build("button", "ws-log__group-header");
    headerEl.type = "button";
    headerEl.setAttribute("aria-expanded", "false");

    const timeEl = build("span", "ws-log__time");
    timeEl.textContent = formatLogTime();

    const directionEl = build("span", "ws-log__direction");
    directionEl.textContent = direction === "send" ? "SEND" : "RECV";

    const chevronEl = build("span", "ws-log__group-chevron");
    chevronEl.textContent = "▶";
    chevronEl.setAttribute("aria-hidden", "true");

    const summaryEl = build("span", "ws-log__group-summary");

    headerEl.append(timeEl, directionEl, chevronEl, summaryEl);

    // Children start hidden and are rendered when the group is expanded.
    const childrenEl = build("div", "ws-log__group-children");
    childrenEl.hidden = true;

    element.append(headerEl, childrenEl);
    els.wsLog.appendChild(element);

    const created = {
      key: groupKey,
      direction,
      kind,
      element,
      headerEl,
      chevronEl,
      summaryEl,
      childrenEl,
      collapsed: true,
      count: 0,
      totalBytes: 0,
      items: [],
    };
    state.wsLogGroup = created;
    headerEl.addEventListener("click", () => toggleWsLogGroup(created));
    group = created;
  }

  group.count += 1;
  group.totalBytes += byteCount;
  const item = { time: formatLogTime(), detail: itemDetail };
  group.items.push(item);
  updateWsLogGroupSummary(group);

  // Collapsed groups only accumulate items.
  if (!group.collapsed) {
    appendWsLogGroupChildDom(group, item);
  }

  trimWsLog();
  scrollWsLogToBottom();
}
|
||
|
||
/**
 * Produce a short, log-friendly string for a websocket payload.
 *
 * String `audio`/`image` fields are replaced by a length marker, a `data`
 * string longer than 160 characters likewise, and `text` is truncated. The
 * input object is not modified.
 *
 * @param {*} payload - Parsed message; non-objects are passed to `String()`.
 * @returns {string} JSON of the compacted copy, or the payload's `type` (or a
 *   fixed notice) when it cannot be serialized.
 */
function compactWsPayload(payload) {
  if (payload === null || typeof payload !== "object") {
    return String(payload);
  }

  const compact = { ...payload };

  for (const field of ["audio", "image"]) {
    const blob = compact[field];
    if (typeof blob === "string") {
      compact[field] = `<base64 ${blob.length} chars>`;
    }
  }

  const { data, text } = compact;
  if (typeof data === "string" && data.length > 160) {
    compact.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    compact.text = truncateLogValue(text);
  }

  try {
    return JSON.stringify(compact);
  } catch (_) {
    return payload.type || "unserializable websocket payload";
  }
}
|
||
|
||
/**
 * Append a plain (ungrouped) row to the websocket log. Any open group is
 * closed first, so later grouped events start a new group.
 *
 * @param {string} direction - "send", "recv", or another tag such as "system".
 * @param {string} detail - Row text.
 * @param {string} [kind=direction] - Suffix for the entry modifier class.
 */
function addWsLog(direction, detail, kind = direction) {
  finalizeWsLogGroup();
  ensureWsLogReady();

  const row = createWsLogEntry(direction, detail, kind);
  els.wsLog.appendChild(row);

  trimWsLog();
  scrollWsLogToBottom();
}
|
||
|
||
/**
 * Log a JSON websocket message. Received `response.audio.delta` and
 * `response.text.delta` messages go into collapsible groups; everything else
 * becomes a single compacted row.
 *
 * @param {string} direction - "send" or "recv".
 * @param {*} payload - Parsed message.
 */
function logWsPayload(direction, payload) {
  const receivedType = direction === "recv" ? payload?.type : undefined;

  if (receivedType === "response.audio.delta") {
    const bytes = payload.bytes || 0;
    const sizeNote = `(${bytes} bytes)`;
    const detail =
      payload.seq != null ? `seq=${payload.seq} ${sizeNote}` : sizeNote;
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.AUDIO_DELTA,
      "recv",
      "recv",
      detail,
      bytes,
    );
    return;
  }

  if (receivedType === "response.text.delta") {
    const text = typeof payload.text === "string" ? payload.text : "";
    // The group total counts the UTF-8 size of the text.
    const bytes = new TextEncoder().encode(text).length;
    const quoted = JSON.stringify(truncateLogValue(text, 120));
    const detail =
      payload.seq != null ? `seq=${payload.seq} ${quoted}` : quoted;
    appendWsLogGroupItem(
      WS_LOG_GROUP_KEYS.TEXT_DELTA,
      "recv",
      "recv",
      detail,
      bytes,
    );
    return;
  }

  addWsLog(direction, compactWsPayload(payload));
}
|
||
|
||
/**
 * Log an outgoing binary audio frame inside the "input.audio" group.
 *
 * @param {number} byteLength - Size of the frame in bytes.
 */
function logBinarySend(byteLength) {
  const detail = `(${byteLength} bytes)`;
  appendWsLogGroupItem(
    WS_LOG_GROUP_KEYS.AUDIO_SEND,
    "send",
    "send",
    detail,
    byteLength,
  );
}
|
||
|
||
/**
 * Send data over the websocket and record it in the websocket log.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send. Strings
 *   are logged as JSON when parseable, otherwise as truncated text; non-empty
 *   binary frames are logged in the audio-send group.
 * @returns {boolean} `false` when there is no open socket (nothing is sent or
 *   logged), `true` after the data has been handed to the socket.
 */
function wsSend(data) {
  const socket = state.ws;
  if (!socket || socket.readyState !== WebSocket.OPEN) return false;

  if (typeof data === "string") {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      addWsLog("send", truncateLogValue(data));
    }
  } else {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const byteLength = isBinary ? data.byteLength : 0;
    if (byteLength > 0) {
      logBinarySend(byteLength);
    }
  }

  socket.send(data);
  return true;
}
|
||
|
||
/** Forget the open log group and reset the websocket log to its empty state. */
function clearWsLog() {
  state.wsLogGroup = null;
  const emptyMarkup = '<div class="ws-log__empty">暂无 WebSocket 事件。</div>';
  els.wsLog.innerHTML = emptyMarkup;
}
|
||
|
||
/* ---------------------------------------------------------------- Audio */
|
||
|
||
/**
 * Return the shared AudioContext, creating it and loading the PCM recorder
 * worklet module on first use, and resuming it if it is suspended.
 *
 * If loading the worklet module fails, the half-initialized context is
 * discarded so that the next call retries from scratch. Without this, a
 * context lacking the module would stay cached.
 *
 * @returns {Promise<AudioContext>} The ready audio context.
 * @throws Whatever context creation or `audioWorklet.addModule` throws.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    const ctx = new Ctx();
    state.audioContext = ctx;
    try {
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      if (state.audioContext === ctx) {
        state.audioContext = null;
      }
      try {
        await ctx.close();
      } catch (_) {
        /* best-effort cleanup; the original error is what matters */
      }
      throw err;
    }
  }
  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
|
||
|
||
/**
 * Rebuild the microphone dropdown from `state.micDevices`, keeping the
 * previous selection when that device is still listed and falling back to
 * the default entry otherwise.
 */
function renderMicDevices() {
  const select = els.micSelect;
  const previousValue = state.selectedMicDeviceId || select.value;

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  select.innerHTML = "";
  addOption("", "默认麦克风");
  state.micDevices.forEach((device, index) => {
    // Devices without a label get a numbered generic name.
    addOption(device.deviceId, device.label || `麦克风 ${index + 1}`);
  });

  const stillListed = state.micDevices.some(
    ({ deviceId }) => deviceId === previousValue,
  );
  state.selectedMicDeviceId = stillListed ? previousValue : "";
  select.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
|
||
|
||
/**
 * Re-enumerate audio input devices and re-render the microphone dropdown.
 * When enumeration is unsupported or fails, only the dropdown's enabled
 * state is refreshed.
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const mediaDevices = navigator.mediaDevices;
  if (!mediaDevices?.enumerateDevices) {
    setMicSelectEnabled();
    return;
  }

  try {
    const allDevices = await mediaDevices.enumerateDevices();
    state.micDevices = allDevices.filter(({ kind }) => kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
|
||
|
||
/**
 * Start microphone capture: open the (optionally selected) mic with echo
 * cancellation, noise suppression and auto gain, feed it into the
 * "pcm-recorder" worklet, and forward each frame to the websocket while
 * connected. Also drives the VU meter from each frame's RMS.
 *
 * @returns {Promise<void>}
 * @throws Whatever `ensureAudioContext`, `getUserMedia` or node creation throws.
 */
async function startMic() {
  const ctx = await ensureAudioContext();

  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
    ...(state.selectedMicDeviceId
      ? { deviceId: { exact: state.selectedMicDeviceId } }
      : {}),
  };

  const stream = await navigator.mediaDevices.getUserMedia({
    audio: audioConstraints,
    video: false,
  });
  state.micStream = stream;
  await refreshMicDevices();

  const sourceNode = ctx.createMediaStreamSource(stream);
  state.micSourceNode = sourceNode;

  const recorder = new AudioWorkletNode(ctx, "pcm-recorder", {
    numberOfInputs: 1,
    numberOfOutputs: 0,
    channelCount: 1,
    processorOptions: {
      targetSampleRate: SAMPLE_RATE,
      frameMs: FRAME_MS,
    },
  });
  state.recorderNode = recorder;

  recorder.port.onmessage = ({ data }) => {
    if (!data || data.type !== "frame") return;
    updateMeter(data.rms || 0);
    if (state.connected) {
      wsSend(data.buffer);
    }
  };

  sourceNode.connect(recorder);
  state.micEnabled = true;
  addWsLog("system", "麦克风已开启(PCM 音频流)");
  setMicButton();
}
|
||
|
||
/**
 * Stop microphone capture: detach the recorder and source nodes, stop every
 * track of the mic stream, reset the meter, and refresh the mic button.
 * Errors from individual teardown steps are ignored. A "mic closed" log row
 * is added only when the mic was actually enabled.
 */
function stopMic() {
  const wasEnabled = state.micEnabled;
  const quietly = (action) => {
    try {
      action();
    } catch (_) {
      /* ignore */
    }
  };

  const { recorderNode, micSourceNode, micStream } = state;

  if (recorderNode) {
    quietly(() => {
      recorderNode.port.onmessage = null;
      recorderNode.disconnect();
    });
    state.recorderNode = null;
  }

  if (micSourceNode) {
    quietly(() => micSourceNode.disconnect());
    state.micSourceNode = null;
  }

  if (micStream) {
    for (const track of micStream.getTracks()) {
      quietly(() => track.stop());
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);
  if (wasEnabled) {
    addWsLog("system", "麦克风已关闭");
  }
  setMicButton();
}
|
||
|
||
/**
 * Update the VU meter from a frame's RMS level.
 *
 * The RMS is scaled by 2.4 and clamped to 1, then averaged 50/50 with the
 * previous level for smoothing. The result sets the meter width in percent.
 *
 * @param {number} rms - RMS amplitude of the latest frame.
 */
function updateMeter(rms) {
  const target = Math.min(1, rms * 2.4);
  const smoothed = state.meterLevel * 0.5 + target * 0.5;
  state.meterLevel = smoothed;
  els.meterFill.style.width = `${Math.round(smoothed * 100)}%`;
}
|
||
|
||
/* ---------------------------------------------------- Bot audio playback */
|
||
|
||
/**
 * Queue a chunk of PCM16 bot audio for gapless playback.
 *
 * Each chunk is scheduled to start right after the previous one (or 20 ms
 * from now if playback has fallen behind). The bot indicator is switched on
 * and a timer switches it off shortly after the last scheduled chunk ends.
 *
 * Empty chunks are ignored, because creating an AudioBuffer with zero frames
 * throws. Nothing happens when no audio context exists yet.
 *
 * @param {Int16Array} int16 - Mono PCM16 samples at SAMPLE_RATE.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  if (!int16 || int16.length === 0) return;

  // Convert to floats in [-1, 1]; negative and positive ranges are scaled
  // separately because int16 is asymmetric.
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i += 1) {
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);

  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);

  const now = ctx.currentTime;
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;

  // Finished sources remove themselves from the list.
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);

  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    // Only switch the indicator off if no later chunk extended playback.
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
|
||
|
||
/**
 * Stop all scheduled bot audio, reset the playback clock, cancel the
 * indicator timer, and switch the bot indicator off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];

  resetPlaybackClock();

  const timer = state.botUiTimer;
  if (timer) {
    clearTimeout(timer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
|
||
|
||
/**
 * Reset the playback schedule to the audio context's current time. Does
 * nothing when no audio context exists.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
|
||
|
||
/* ------------------------------------------------------ Camera / image */
|
||
|
||
/**
 * Switch what the camera preview shows.
 *
 * @param {string} mode - "camera" or "photo" sets the matching `is-<mode>`
 *   class; any other value (e.g. "idle") clears both.
 */
function setPreviewMode(mode) {
  const { classList } = els.cameraPreview;
  for (const candidate of ["camera", "photo"]) {
    classList.toggle(`is-${candidate}`, mode === candidate);
  }
}
|
||
|
||
// Draw an <img>/<video> source to the canvas and return a normalized payload
|
||
// (JPEG data URL + dimensions) suitable for an `input.image` message.
|
||
/**
 * Draw an <img>/<video> to the shared canvas and return a JPEG payload.
 *
 * The longer edge is capped at IMAGE_MAX_DIM with the aspect ratio kept.
 * Each side is kept at 1 px or more, so that extreme aspect ratios cannot
 * round to an empty canvas. Failures to get a 2D context, draw, or encode
 * are logged and reported as `null`.
 *
 * @param {HTMLImageElement|HTMLVideoElement} source - Media to encode.
 * @returns {{dataUrl: string, mime: string, width: number, height: number}|null}
 *   The payload, or `null` when the source has no dimensions or encoding fails.
 */
function mediaToPayload(source) {
  const srcW = source.videoWidth || source.naturalWidth || source.width;
  const srcH = source.videoHeight || source.naturalHeight || source.height;
  if (!srcW || !srcH) return null;

  let w = srcW;
  let h = srcH;
  const longest = Math.max(w, h);
  if (longest > IMAGE_MAX_DIM) {
    const scale = IMAGE_MAX_DIM / longest;
    w = Math.max(1, Math.round(w * scale));
    h = Math.max(1, Math.round(h * scale));
  }

  const canvas = els.cameraCanvas;
  canvas.width = w;
  canvas.height = h;

  let dataUrl;
  try {
    const ctx = canvas.getContext("2d");
    if (!ctx) throw new Error("2D canvas context unavailable");
    ctx.drawImage(source, 0, 0, w, h);
    dataUrl = canvas.toDataURL("image/jpeg", IMAGE_JPEG_QUALITY);
  } catch (err) {
    addWsLog("system", `图片编码失败:${err.message || err}`);
    return null;
  }
  return { dataUrl, mime: "image/jpeg", width: w, height: h };
}
|
||
|
||
/**
 * Store the image waiting to be sent. A non-empty payload is also shown in
 * the photo preview. The camera "done" button is refreshed either way.
 *
 * @param {{dataUrl: string}|null} payload - Image payload, or null to clear.
 */
function setPendingImage(payload) {
  state.pendingImage = payload;
  const hasImage = Boolean(payload);
  if (hasImage) {
    els.cameraPhoto.src = payload.dataUrl;
    setPreviewMode("photo");
  }
  setCameraButtonEnabled();
}
|
||
|
||
/**
 * Refresh `state.videoDevices` with the enumerated video inputs. Any failure
 * leaves the list empty.
 *
 * @returns {Promise<void>}
 */
async function refreshVideoDevices() {
  let cameras;
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    cameras = allDevices.filter(({ kind }) => kind === "videoinput");
  } catch (_) {
    cameras = [];
  }
  state.videoDevices = cameras;
}
|
||
|
||
// Fill the camera dropdown from the enumerated devices. Labels are only exposed
|
||
// after camera permission has been granted, so before that we show generic
|
||
// names ("摄像头 1", …) or just the default option.
|
||
/**
 * Fill the camera dropdown from `state.videoDevices`.
 *
 * With no devices, a single disabled "default camera" entry is shown.
 * Devices without a label get a numbered generic name.
 *
 * @param {string} [activeDeviceId] - Device to preselect, when given.
 */
function populateDeviceSelect(activeDeviceId) {
  const select = els.cameraDeviceSelect;
  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  select.innerHTML = "";

  const cameras = state.videoDevices;
  if (cameras.length === 0) {
    addOption("", "默认摄像头");
    select.disabled = true;
    return;
  }

  cameras.forEach((camera, index) => {
    addOption(camera.deviceId, camera.label || `摄像头 ${index + 1}`);
  });
  select.disabled = false;
  if (activeDeviceId) {
    select.value = activeDeviceId;
  }
}
|
||
|
||
/**
 * Start the camera preview.
 *
 * The stream is held in a local variable and re-checked after every await.
 * If the camera was stopped or replaced in the meantime (for example the
 * drawer closed during `play()`), the call ends quietly. It does not
 * dereference a cleared `state.cameraStream` or mark a dead camera active.
 *
 * @param {string} [deviceId] - Exact camera to open; when omitted, the camera
 *   matching `state.cameraFacing` is requested.
 * @returns {Promise<void>}
 */
async function startCamera(deviceId) {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    addWsLog("system", "该浏览器不支持摄像头访问");
    return;
  }
  stopCameraStream();

  const video = deviceId
    ? { deviceId: { exact: deviceId } }
    : { facingMode: state.cameraFacing };

  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ video, audio: false });
  } catch (err) {
    addWsLog("system", `摄像头错误:${err.message || err}`);
    return;
  }

  // A stream published by an overlapping call is released before this one
  // takes its place, so that no camera stays open unreferenced.
  if (state.cameraStream && state.cameraStream !== stream) {
    state.cameraStream.getTracks().forEach((track) => track.stop());
  }
  state.cameraStream = stream;
  els.cameraVideo.srcObject = stream;
  try {
    await els.cameraVideo.play();
  } catch (_) {
    /* autoplay may resolve later */
  }
  if (state.cameraStream !== stream) return;

  state.cameraActive = true;
  state.pendingImage = null;
  setPreviewMode("camera");
  els.cameraStartBtn.classList.add("is-active");
  clearSampleSelection();

  // Device labels become available only after permission is granted, so the
  // dropdown is refreshed now and set to the camera that is streaming.
  await refreshVideoDevices();
  if (state.cameraStream !== stream) return;

  const activeId =
    stream.getVideoTracks?.()[0]?.getSettings?.().deviceId || deviceId;
  populateDeviceSelect(activeId);
  // The device dropdown is visible only while the camera is in use.
  els.cameraDeviceRow.hidden = false;
  setCameraButtonEnabled();
}
|
||
|
||
// Tear down the live camera: stop every track of the current stream (if any),
// detach it from the preview <video>, and reset the camera-related UI flags.
// Safe to call when no stream is active.
function stopCameraStream() {
  const activeStream = state.cameraStream;
  if (activeStream) {
    for (const track of activeStream.getTracks()) {
      track.stop();
    }
    state.cameraStream = null;
  }

  els.cameraVideo.srcObject = null;
  state.cameraActive = false;
  els.cameraStartBtn.classList.remove("is-active");
  els.cameraDeviceRow.hidden = true;
}
|
||
|
||
// Grab the current preview frame as an image payload. On success the camera
// is stopped and the frame becomes the pending image; returns the payload, or
// null when `mediaToPayload` could not produce one (camera left running).
function captureFromCamera() {
  const frame = mediaToPayload(els.cameraVideo);
  if (frame) {
    stopCameraStream();
    setPendingImage(frame);
    return frame;
  }
  return null;
}
|
||
|
||
// Load `src` (a same-origin or object URL) into a detached <img>.
// Resolves with the image element once its `load` event fires and rejects
// with an Error naming the URL if loading fails.
function loadImage(src) {
  return new Promise((resolve, reject) => {
    const element = new Image();
    element.onload = () => {
      resolve(element);
    };
    element.onerror = () => {
      reject(new Error(`failed to load image: ${src}`));
    };
    // Assign the source last so both handlers are in place before loading starts.
    element.src = src;
  });
}
|
||
|
||
// Turn an uploaded File into the pending image. The file is decoded through a
// temporary object URL, which is always revoked afterwards. Decode failures
// are written to the websocket log rather than thrown.
async function selectFileImage(file) {
  if (!file) return;

  const blobUrl = URL.createObjectURL(file);
  try {
    const decoded = await loadImage(blobUrl);
    const payload = mediaToPayload(decoded);
    if (payload) {
      // An uploaded image replaces both the live camera and any chosen sample.
      stopCameraStream();
      clearSampleSelection();
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `上传错误：${err.message || err}`);
  } finally {
    URL.revokeObjectURL(blobUrl);
  }
}
|
||
|
||
// Use one of the bundled sample images as the pending image.
// `src` is the sample URL; `buttonEl` (optional) is its thumbnail button,
// which gets the `is-selected` highlight. Load failures are written to the
// websocket log rather than thrown.
async function selectSampleImage(src, buttonEl) {
  try {
    const decoded = await loadImage(src);
    const payload = mediaToPayload(decoded);
    if (payload) {
      stopCameraStream();
      // Drop the previous highlight before marking the new thumbnail.
      clearSampleSelection();
      if (buttonEl) {
        buttonEl.classList.add("is-selected");
      }
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `示例图加载错误：${err.message || err}`);
  }
}
|
||
|
||
// Remove the `is-selected` highlight from every sample thumbnail.
function clearSampleSelection() {
  const highlighted = els.cameraSamples.querySelectorAll(
    ".camera-drawer__sample.is-selected",
  );
  for (const thumbnail of highlighted) {
    thumbnail.classList.remove("is-selected");
  }
}
|
||
|
||
// Build one thumbnail button per entry of SAMPLE_IMAGES inside the samples
// container. Runs at most once: `state.samplesRendered` guards repeat calls.
function renderSampleThumbnails() {
  if (state.samplesRendered) return;
  state.samplesRendered = true;

  els.cameraSamples.innerHTML = "";
  SAMPLE_IMAGES.forEach((sample) => {
    const button = document.createElement("button");
    button.type = "button";
    button.className = "camera-drawer__sample";
    button.title = sample.label;

    const thumbnail = document.createElement("img");
    thumbnail.src = sample.src;
    thumbnail.alt = sample.label;
    button.appendChild(thumbnail);

    // Clicking a thumbnail makes that sample the pending image.
    button.addEventListener("click", () => selectSampleImage(sample.src, button));
    els.cameraSamples.appendChild(button);
  });
}
|
||
|
||
// Return the camera drawer to its idle state: no live stream, no pending
// image, no highlighted sample, empty photo preview.
function resetCameraInput() {
  stopCameraStream();
  state.pendingImage = null;
  clearSampleSelection();

  // Remove the attribute (rather than setting an empty src) so the <img> holds
  // no image reference at all.
  els.cameraPhoto.removeAttribute("src");
  setPreviewMode("idle");

  setCameraButtonEnabled();
}
|
||
|
||
// Pre-select the first sample image so the camera-done button is immediately
// usable when the drawer opens, without requiring a capture or a pick first.
// Does nothing when an image is already pending or the camera is live.
function selectDefaultImage() {
  const alreadyHasInput = state.pendingImage || state.cameraActive;
  if (alreadyHasInput) return;

  const firstButton = els.cameraSamples.querySelector(".camera-drawer__sample");
  const firstSample = SAMPLE_IMAGES[0];
  if (!firstButton || !firstSample) return;

  // Not awaited: selectSampleImage logs its own failures.
  selectSampleImage(firstSample.src, firstButton);
}
|
||
|
||
// Send an image (plus accompanying text) to the engine as an `input.image`
// message with `interrupt: true`.
//
// `payload` is the object produced for the pending image; this function reads
// its `dataUrl`, `mime`, `width` and `height` fields. `text` is optional and
// falls back to CAMERA_DONE_TEXT.
//
// Returns true when the message was handed to the socket, false when there is
// no payload or the websocket is not open.
function sendImage(payload, text) {
  if (!payload) return false;
  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;

  const caption = text || CAMERA_DONE_TEXT;
  const message = {
    type: "input.image",
    image: payload.dataUrl,
    mime_type: payload.mime,
    width: payload.width,
    height: payload.height,
    text: caption,
    interrupt: true,
  };

  wsSend(JSON.stringify(message));
  // Mirror the text-input path: interrupt in-flight bot audio and render the
  // user's image + text together as one local bubble (the engine does not echo
  // image input back as a transcript event).
  //
  // As in sendText, `state.currentAssistantBubble` is deliberately left
  // untouched: the `response.text.final(interrupted=true)` for the in-flight
  // assistant turn must finalize that existing bubble in place. Clearing it
  // here would make that final event render a second, duplicate assistant
  // bubble below the user's image.
  stopPlaybackQueue();
  addImageBubble("user", payload.dataUrl, caption);
  return true;
}
|
||
|
||
// Handler behind the camera-done action: pick the image to send, send it, and
// reset the drawer on success.
function submitCameraImage() {
  // The previously selected (uploaded / sample / captured) image is the
  // fallback; while the live camera is on, a freshly captured frame wins.
  const selected = state.pendingImage;
  const payload = state.cameraActive
    ? captureFromCamera() || selected
    : selected;
  if (!payload) return;

  // Keep the existing workflow contract: the accompanying text stays the
  // CAMERA_DONE_TEXT marker that advances the FastGPT camera step; the image
  // is the new multimodal attachment.
  const delivered = sendImage(payload, CAMERA_DONE_TEXT);
  if (delivered) {
    resetCameraInput();
  }
}
|
||
|
||
/* --------------------------------------------------------- Chat updates */
|
||
|
||
// Render a committed user transcript as a chat bubble. Empty transcripts are
// ignored. The current assistant bubble is released first so later assistant
// text starts a new bubble.
function handleUserTranscript(text) {
  if (text) {
    state.currentAssistantBubble = null;
    addBubble("user", text);
  }
}
|
||
|
||
// Send typed text to the engine as an interrupting `input.text` message.
// Returns true when the message was sent, false when the trimmed text is
// empty or the websocket is not open.
function sendText(text) {
  const trimmed = (text || "").trim();
  if (!trimmed) return false;

  const socketOpen = Boolean(state.ws) && state.ws.readyState === WebSocket.OPEN;
  if (!socketOpen) return false;

  // The engine does not echo text input back as a transcript event, so the
  // user bubble is rendered locally, and in-flight bot audio is interrupted so
  // the next reply is heard cleanly. `currentAssistantBubble` is deliberately
  // NOT cleared here: the engine will emit a
  // `response.text.final(interrupted=true)` for the in-flight assistant turn,
  // which finalizes that bubble in place. A brand-new bubble for the reply is
  // created when `response.text.started` arrives.
  wsSend(
    JSON.stringify({
      type: "input.text",
      text: trimmed,
      interrupt: true,
    }),
  );
  stopPlaybackQueue();
  addBubble("user", trimmed);
  return true;
}
|
||
|
||
// Append a streamed text fragment to the assistant bubble for the current
// turn, creating that bubble on the first fragment. Empty fragments are
// ignored.
function handleAssistantDelta(text) {
  if (!text) return;

  let bubble = state.currentAssistantBubble;
  if (!bubble) {
    bubble = addBubble("assistant", "");
    state.currentAssistantBubble = bubble;
  }
  appendToBubble(bubble, text);
}
|
||
|
||
// A new assistant text turn is starting: forget the previous bubble so the
// first delta of this turn creates a fresh one.
function handleAssistantStarted() {
  state.currentAssistantBubble = null;
}
|
||
|
||
// Finalize the assistant turn with its complete text.
//
// `text` replaces whatever the streamed deltas put in the current bubble (or
// is rendered in a new bubble when none exists); an empty `text` just
// releases the current bubble. `interrupted` marks the bubble with the
// `bubble--interrupted` class.
function handleAssistantFinal(text, interrupted) {
  if (!text) {
    state.currentAssistantBubble = null;
    return;
  }

  let bubble = state.currentAssistantBubble;
  if (bubble) {
    bubble.querySelector(".bubble__text").textContent = text;
  } else {
    bubble = addBubble("assistant", text);
  }

  if (interrupted) {
    bubble.classList.add("bubble--interrupted");
  }

  // The turn is over; the next delta must open a new bubble.
  state.currentAssistantBubble = null;
  scrollChatToBottom();
}
|
||
|
||
// Release the current assistant bubble without touching its content, so any
// later assistant text opens a new bubble.
function finalizeAssistantBubble() {
  state.currentAssistantBubble = null;
}
|
||
|
||
/* ---------------------------------------------------------- Websocket IO */
|
||
|
||
// Decode a base64 string into 16-bit samples. The decoded bytes are viewed in
// place as an Int16Array (platform byte order), two bytes per sample.
function decodeBase64ToInt16(b64) {
  const decoded = atob(b64);
  // atob yields one character per byte, so each char code is the byte value.
  const bytes = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
  const sampleCount = bytes.byteLength / 2;
  return new Int16Array(bytes.buffer, bytes.byteOffset, sampleCount);
}
|
||
|
||
// Dispatch one parsed JSON message from the websocket to its handler, keyed
// on `event.type`.
//
// The caller passes whatever `JSON.parse` produced, which can legally be
// `null` or a primitive. Those carry no `type`, so they are logged for
// debugging and dropped instead of throwing inside the socket's message
// listener.
function handleEvent(event) {
  if (event === null || typeof event !== "object") {
    console.debug("ws event", event);
    return;
  }

  switch (event.type) {
    case "response.audio.delta":
      // Audio arrives as a base64 string; anything else is ignored.
      if (typeof event.audio === "string") {
        schedulePlayback(decodeBase64ToInt16(event.audio));
      }
      break;
    case "response.audio.started":
      setBotIndicator(true);
      break;
    case "response.audio.stopped":
      finalizeAssistantBubble();
      // The indicator turns off automatically when the playback queue drains.
      break;
    case "response.text.delta":
      handleAssistantDelta(event.text);
      break;
    case "response.text.started":
      handleAssistantStarted();
      break;
    case "response.text.final":
      handleAssistantFinal(event.text, event.interrupted);
      break;
    case "response.state":
      setAssistantState(event.state);
      break;
    case "input.transcript.final":
      handleUserTranscript(event.text);
      break;
    case "input.transcript.interim":
      // Ignore partial ASR updates; chat history renders committed user turns.
      break;
    case "transport.message":
      // Reserved for future structured messages; ignore silently.
      break;
    default:
      // Unknown event type: log for debugging.
      console.debug("ws event", event);
  }
}
|
||
|
||
// Open the product websocket and run the session handshake.
//
// Flow: resolve the chat id (user-supplied or generated) and the socket URL,
// flip the UI to "connecting", pre-warm the audio context, create the socket,
// then register the open / message / error / close listeners. Does nothing
// when a connection already exists or is being established. Failures are
// reported through the status line and the websocket log; the function does
// not throw for them.
async function connect() {
  if (state.connected || state.connecting) return;

  const inputChatId = currentChatIdInput();
  const chatId = inputChatId || generateChatId();
  const url = wsUrlWithChatId(chatId);
  if (!url) {
    setStatus("error", "缺少服务器地址");
    return;
  }

  // Undo the "connecting" state after a failure that happens before a socket
  // exists. A generated chat id is removed from the input again; one the user
  // typed is left in place.
  const abortConnect = (statusText, logText) => {
    state.connecting = false;
    state.chatId = "";
    if (!inputChatId) els.chatId.value = "";
    setStatus("error", statusText);
    setConnectButton();
    addWsLog("error", logText, "error");
  };

  state.connecting = true;
  state.chatId = chatId;
  els.chatId.value = chatId;
  setStatus("connecting", "连接中…");
  setConnectButton();
  addWsLog("system", `正在连接 ${url}`);

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    abortConnect("音频初始化失败", `音频初始化失败：${err.message || err}`);
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    abortConnect("服务器地址无效", `WebSocket 地址无效：${err.message || err}`);
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", () => {
    const startMessage = {
      type: "session.start",
      protocol: PROTOCOL,
      audio: {
        encoding: "pcm_s16le",
        sample_rate: SAMPLE_RATE,
        channels: CHANNELS,
      },
      chatId: state.chatId,
    };

    state.connecting = false;
    state.connected = true;
    resetPlaybackClock();
    addWsLog("system", "连接已建立");
    setStatus("connected", "已连接");
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    refreshMicDevices();

    wsSend(JSON.stringify(startMessage));
    addBubble("system", "会话已开始。");
    setComposerEnabled(true);
    setCameraButtonEnabled();
    els.textInput.focus();
  });

  ws.addEventListener("message", (event) => {
    const data = event.data;
    if (typeof data === "string") {
      let parsed;
      try {
        parsed = JSON.parse(data);
      } catch (err) {
        console.warn("Bad JSON from server", err, data);
        addWsLog(
          "error",
          `invalid JSON from server: ${truncateLogValue(data)}`,
          "error",
        );
        return;
      }
      logWsPayload("recv", parsed);
      handleEvent(parsed);
    } else if (data instanceof ArrayBuffer) {
      // Server doesn't currently send binary, but handle it just in case.
      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
      // `new Int16Array(buffer)` throws a RangeError when the byte length is
      // odd. View only the whole 2-byte samples so a malformed frame loses its
      // trailing byte instead of raising inside this listener, matching how
      // the base64 audio path treats an odd-length payload.
      const sampleCount = Math.floor(data.byteLength / 2);
      schedulePlayback(new Int16Array(data, 0, sampleCount));
    }
  });

  ws.addEventListener("error", (err) => {
    console.error("WebSocket error", err);
    setStatus("error", "连接错误");
    addWsLog("error", "websocket error", "error");
  });

  ws.addEventListener("close", (event) => {
    // Remember whether the session was ever established: a close before
    // "open" is reported as an error rather than a normal session end.
    const wasConnected = state.connected;
    state.ws = null;
    state.connected = false;
    state.connecting = false;
    state.chatId = "";
    setAssistantState("");
    if (state.micEnabled) stopMic();
    stopPlaybackQueue();
    setConnectButton();
    setMicButton();
    setMicSelectEnabled();
    setComposerEnabled(false);
    setCameraButtonEnabled();
    setBotIndicator(false);
    finalizeWsLogGroup();
    addWsLog(
      "system",
      `websocket close code=${event.code}${
        event.reason ? ` reason=${event.reason}` : ""
      }`,
    );
    if (wasConnected) {
      addBubble(
        "system",
        `会话已结束${event.reason ? `：${event.reason}` : ""}。`,
      );
      setStatus("idle", "未连接");
    } else {
      setStatus("error", "连接已断开");
    }
  });
}
|
||
|
||
// Close the websocket from the client side. When the socket is open a
// `session.stop` message is sent first. Both steps are best-effort: errors
// are swallowed so a failing send never prevents the close attempt.
function disconnect() {
  if (!state.ws) return;

  try {
    const isOpen = state.ws.readyState === WebSocket.OPEN;
    if (isOpen) {
      wsSend(
        JSON.stringify({ type: "session.stop", reason: "client_disconnect" }),
      );
    }
  } catch (_) {
    /* ignore */
  }

  try {
    // 1000 is the websocket "normal closure" status code.
    state.ws.close(1000, "client_disconnect");
  } catch (_) {
    /* ignore */
  }
}
|
||
|
||
/* ---------------------------------------------------------------- Wiring */
|
||
|
||
// Connect button toggles the session: dial when idle, hang up when connected.
els.connectBtn.addEventListener("click", () => {
  if (!state.connected) {
    connect();
    return;
  }
  disconnect();
});

els.copyChatIdBtn.addEventListener("click", copyChatId);

// Mic button toggles capture. The button is disabled while the toggle is in
// flight and re-enabled afterwards only if the session is still connected.
els.micBtn.addEventListener("click", async () => {
  if (!state.connected) return;

  els.micBtn.disabled = true;
  try {
    if (!state.micEnabled) {
      await startMic();
    } else {
      stopMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `麦克风错误：${err.message || err}`);
  } finally {
    els.micBtn.disabled = !state.connected;
  }
});

// Picking another microphone always records the choice; capture is restarted
// on the new device only when the mic is currently on.
els.micSelect.addEventListener("change", async () => {
  state.selectedMicDeviceId = els.micSelect.value;
  if (!state.micEnabled) return;

  els.micSelect.disabled = true;
  els.micBtn.disabled = true;
  try {
    stopMic();
    await startMic();
  } catch (err) {
    console.error("Mic switch error", err);
    addBubble("system", `麦克风切换错误：${err.message || err}`);
  } finally {
    setMicButton();
    setMicSelectEnabled();
  }
});

// Keep the microphone list current when devices are plugged in or removed.
if (navigator.mediaDevices?.addEventListener) {
  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}

els.clearBtn.addEventListener("click", () => {
  clearChat();
});

els.clearWsLogBtn.addEventListener("click", () => {
  clearWsLog();
});

// The camera-done button only acts while a camera state is set.
els.cameraDoneBtn.addEventListener("click", () => {
  if (state.cameraState) {
    submitCameraImage();
  }
});

els.cameraStartBtn.addEventListener("click", () => {
  const chosenDevice = els.cameraDeviceSelect.value || undefined;
  startCamera(chosenDevice);
});

els.cameraDeviceSelect.addEventListener("change", () => {
  // Switching device only restarts the stream when the camera is already live;
  // otherwise the choice is applied when the start-camera button is pressed.
  if (!state.cameraActive) return;
  startCamera(els.cameraDeviceSelect.value || undefined);
});

els.cameraUpload.addEventListener("change", (event) => {
  const input = event.target;
  const file = input.files && input.files[0];
  selectFileImage(file);
  // Clear the input so choosing the same file again still fires "change".
  input.value = "";
});
|
||
|
||
// Grow the composer textarea to fit its content, capped at 180px.
function autosizeTextarea() {
  const field = els.textInput;
  // Collapse first so scrollHeight reflects the content, not the old height.
  field.style.height = "auto";
  const fittedHeight = Math.min(field.scrollHeight, 180);
  field.style.height = `${fittedHeight}px`;
}
|
||
|
||
// Send the composer's current text. When the send succeeds the input is
// cleared, resized, and the composer state refreshed; when it fails the text
// is left untouched.
function submitText() {
  const wasSent = sendText(els.textInput.value);
  if (wasSent) {
    els.textInput.value = "";
    autosizeTextarea();
    setComposerEnabled(state.connected);
  }
}
|
||
|
||
// Submitting the composer form sends the text instead of reloading the page.
els.composer.addEventListener("submit", (submitEvent) => {
  submitEvent.preventDefault();
  submitText();
});

els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Plain Enter sends; Shift+Enter inserts a newline, and Enter pressed while
// an IME composition is active is left to the input method.
els.textInput.addEventListener("keydown", (keyEvent) => {
  const isPlainEnter =
    keyEvent.key === "Enter" && !keyEvent.shiftKey && !keyEvent.isComposing;
  if (!isPlainEnter) return;
  keyEvent.preventDefault();
  submitText();
});

// Best-effort release of camera, socket and audio context when the page is
// going away; failures here are deliberately ignored.
window.addEventListener("beforeunload", () => {
  stopCameraStream();

  if (state.ws) {
    try {
      state.ws.close();
    } catch (_) {
      /* ignore */
    }
  }

  if (state.audioContext) {
    try {
      state.audioContext.close();
    } catch (_) {
      /* ignore */
    }
  }
});
|
||
|
||
els.url.value = defaultWsUrl();
|
||
|
||
setStatus("idle", "未连接");
|
||
setConnectButton();
|
||
setMicButton();
|
||
setMicSelectEnabled();
|
||
setComposerEnabled(false);
|