Add src/voice Pipecat pipeline, browser demo at /voice-demo, and config/voice.json. Co-authored-by: Cursor <cursoragent@cursor.com>
900 lines
24 KiB
JavaScript
900 lines
24 KiB
JavaScript
/**
 * Minimal browser client for the AI VideoAssistant engine's product
 * websocket (`/ws-product`, protocol `va.ws.v1`).
 *
 * Responsibilities:
 *   - Open/close the websocket and run the session handshake.
 *   - List/select microphones and capture mic audio with browser AEC enabled.
 *   - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
 *     as binary websocket messages.
 *   - Play `response.audio.delta` frames gaplessly through Web Audio.
 *   - Render a chat-style history of user transcripts and bot text deltas.
 */
|
|
|
|
// Audio format shared by mic capture, playback buffers and `session.start`.
const SAMPLE_RATE = 16000; // Hz
const CHANNELS = 1; // mono

// Frame length (ms) handed to the recorder worklet via `processorOptions`.
const FRAME_MS = 20;

// Protocol identifier announced in the `session.start` handshake.
const PROTOCOL = "va.ws.v1";

// Upper bound on the number of entries kept in the websocket log panel.
const MAX_WS_LOG_LINES = 120;

// Minimum spacing between aggregated audio log lines (used for both the
// received-delta and the sent-binary counters).
const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;
|
|
|
|
/**
 * Build the default websocket URL for the page's own host.
 *
 * @returns {string} `wss://<host>/ws-product` when the page was loaded over
 *   https, otherwise `ws://<host>/ws-product`.
 */
function defaultWsUrl() {
  const isSecurePage = location.protocol === "https:";
  const wsScheme = isSecurePage ? "wss:" : "ws:";
  return [wsScheme, "//", location.host, "/ws-product"].join("");
}
|
|
|
|
// Handles to every DOM node this script touches, resolved once at load time.
const els = (() => {
  const byId = (id) => document.getElementById(id);
  return {
    url: byId("ws-url"),
    connectBtn: byId("connect-btn"),
    statusDot: byId("status-dot"),
    statusText: byId("status-text"),
    chatLog: byId("chat-log"),
    micBtn: byId("mic-btn"),
    micSelect: byId("mic-select"),
    // The label lives inside the mic button and has no id of its own.
    micLabel: document.querySelector(".mic-btn__label"),
    micIndicator: byId("mic-indicator"),
    botIndicator: byId("bot-indicator"),
    clearBtn: byId("clear-btn"),
    clearWsLogBtn: byId("clear-ws-log-btn"),
    wsLog: byId("ws-log"),
    meterFill: byId("meter-fill"),
    composer: byId("composer"),
    textInput: byId("text-input"),
    sendBtn: byId("send-btn"),
  };
})();
|
|
|
|
const state = {
|
|
ws: null,
|
|
connected: false,
|
|
connecting: false,
|
|
|
|
audioContext: null,
|
|
micStream: null,
|
|
micSourceNode: null,
|
|
recorderNode: null,
|
|
|
|
micEnabled: false,
|
|
micDevices: [],
|
|
selectedMicDeviceId: "",
|
|
|
|
// Output scheduling.
|
|
nextPlaybackTime: 0,
|
|
playbackEndsAt: 0,
|
|
scheduledSources: [],
|
|
botActive: false,
|
|
botUiTimer: null,
|
|
|
|
// Chat state.
|
|
currentAssistantBubble: null,
|
|
|
|
// VU meter smoothing.
|
|
meterLevel: 0,
|
|
|
|
// Compact websocket logging.
|
|
audioDeltaLogCount: 0,
|
|
audioDeltaLogBytes: 0,
|
|
lastAudioDeltaLogAt: 0,
|
|
audioSendLogCount: 0,
|
|
audioSendLogBytes: 0,
|
|
lastAudioSendLogAt: 0,
|
|
};
|
|
|
|
/* ------------------------------------------------------------------ UI */
|
|
|
|
/**
 * Update the connection status dot and its caption.
 *
 * @param {string} kind - Modifier appended to `status__dot--` for styling.
 * @param {string} text - Caption shown next to the dot.
 */
function setStatus(kind, text) {
  const { statusDot, statusText } = els;
  statusDot.className = ["status__dot", `status__dot--${kind}`].join(" ");
  statusText.textContent = text;
}
|
|
|
|
/**
 * Sync the connect button's label, enabled state and `is-disconnect` class
 * with `state.connecting` / `state.connected`.
 */
function setConnectButton() {
  const button = els.connectBtn;
  let label = "Connect";
  let disabled = false;
  let showsDisconnect = false;

  if (state.connecting) {
    // A connection attempt is in flight: lock the button.
    label = "Connecting…";
    disabled = true;
  } else if (state.connected) {
    label = "Disconnect";
    showsDisconnect = true;
  }

  button.textContent = label;
  button.disabled = disabled;
  button.classList.toggle("is-disconnect", showsDisconnect);
}
|
|
|
|
/**
 * Sync the mic button (enabled state, pressed state, tooltip, label) and the
 * mic activity indicator with `state.connected` / `state.micEnabled`.
 */
function setMicButton() {
  const { micBtn, micLabel, micIndicator } = els;
  const micOn = Boolean(state.micEnabled);

  micBtn.disabled = !state.connected;
  micBtn.setAttribute("aria-pressed", micOn ? "true" : "false");
  micBtn.title = micOn ? "Mute mic" : "Unmute mic";
  micLabel.textContent = micOn ? "Mute mic" : "Enable mic";
  micIndicator.classList.toggle("is-active", micOn);
}
|
|
|
|
/**
 * Enable the microphone picker only while connected and when the browser
 * exposes `navigator.mediaDevices`.
 */
function setMicSelectEnabled() {
  const canPickMic = state.connected && navigator.mediaDevices;
  els.micSelect.disabled = !canPickMic;
}
|
|
|
|
/**
 * Enable or disable the text composer. The send button additionally stays
 * disabled while the input holds only whitespace.
 *
 * @param {boolean} enabled - Whether text input is currently allowed.
 */
function setComposerEnabled(enabled) {
  const { textInput, sendBtn } = els;
  const hasDraft = textInput.value.trim().length > 0;
  textInput.disabled = !enabled;
  sendBtn.disabled = !(enabled && hasDraft);
}
|
|
|
|
/**
 * Show or hide the "bot is speaking" indicator.
 *
 * @param {boolean} active - Whether the `is-active` class should be present.
 */
function setBotIndicator(active) {
  const { botIndicator } = els;
  botIndicator.classList.toggle("is-active", active);
}
|
|
|
|
/**
 * Append a chat bubble to the chat log and scroll it into view.
 *
 * @param {string} role - "user", "assistant" or "system"; system bubbles get
 *   no role tag, any non-"user" role is tagged "Assistant".
 * @param {string} text - Initial bubble text (set via `textContent`).
 * @returns {HTMLDivElement} The newly created bubble element.
 */
function addBubble(role, text) {
  const log = els.chatLog;
  const span = (className, content) => {
    const node = document.createElement("span");
    node.className = className;
    node.textContent = content;
    return node;
  };

  // Drop the "empty chat" placeholder before the first real bubble.
  if (log.querySelector(".chat__empty") !== null) {
    log.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;
  if (role !== "system") {
    bubble.appendChild(span("bubble__role", role === "user" ? "You" : "Assistant"));
  }
  bubble.appendChild(span("bubble__text", text));

  log.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
|
|
|
|
/**
 * Append streamed text to an existing bubble and keep the chat scrolled to
 * the bottom.
 *
 * @param {Element} bubble - Bubble element containing a `.bubble__text` node.
 * @param {string} text - Text to append.
 */
function appendToBubble(bubble, text) {
  const textNode = bubble.querySelector(".bubble__text");
  textNode.textContent = textNode.textContent + text;
  scrollChatToBottom();
}
|
|
|
|
/** Scroll the chat log so that its newest content is visible. */
function scrollChatToBottom() {
  const log = els.chatLog;
  log.scrollTop = log.scrollHeight;
}
|
|
|
|
/**
 * Remove every bubble, forget the streaming assistant bubble, and show the
 * "Chat cleared." placeholder.
 */
function clearChat() {
  const log = els.chatLog;
  log.innerHTML = "";
  state.currentAssistantBubble = null;

  // `addBubble` looks for this class to know the log holds only a placeholder.
  const placeholder = document.createElement("div");
  placeholder.className = "chat__empty";
  placeholder.innerHTML = "<p>Chat cleared.</p>";
  log.appendChild(placeholder);
}
|
|
|
|
/**
 * Stringify a value and cap it for display in the websocket log.
 *
 * @param {*} value - Anything; converted with `String()`.
 * @param {number} [maxLength=160] - Maximum length of the returned string.
 * @returns {string} The text itself, or its first `maxLength - 1` characters
 *   followed by an ellipsis when it is too long.
 */
function truncateLogValue(value, maxLength = 160) {
  const text = String(value);
  return text.length > maxLength ? `${text.slice(0, maxLength - 1)}…` : text;
}
|
|
|
|
/**
 * Produce a short, single-line description of a websocket payload for the
 * log panel. The payload itself is never modified.
 *
 * @param {*} payload - Parsed message; non-objects are simply stringified.
 * @returns {string} JSON of a shallow copy in which a string `audio` field is
 *   replaced by a size marker, a string `data` field longer than 160 chars by
 *   a size marker, and a string `text` field is truncated. Falls back to
 *   `payload.type` (or a fixed notice) when serialization throws.
 */
function compactWsPayload(payload) {
  if (payload === null || typeof payload !== "object") {
    return String(payload);
  }

  const summary = Object.assign({}, payload);
  const { audio, data, text } = summary;

  if (typeof audio === "string") {
    summary.audio = `<base64 ${audio.length} chars>`;
  }
  if (typeof data === "string" && data.length > 160) {
    summary.data = `<string ${data.length} chars>`;
  }
  if (typeof text === "string") {
    summary.text = truncateLogValue(text);
  }

  try {
    return JSON.stringify(summary);
  } catch (_) {
    // e.g. circular structures
    return payload.type || "unserializable websocket payload";
  }
}
|
|
|
|
/**
 * Append one line (time, direction, detail) to the websocket log panel,
 * trimming the oldest entries beyond `MAX_WS_LOG_LINES` and scrolling to the
 * newest one.
 *
 * @param {string} direction - e.g. "send", "recv", "system", "error"; shown
 *   upper-cased.
 * @param {string} detail - Message text (set via `textContent`).
 * @param {string} [kind=direction] - CSS modifier for the entry.
 */
function addWsLog(direction, detail, kind = direction) {
  const panel = els.wsLog;
  const makeSpan = (className, content) => {
    const node = document.createElement("span");
    node.className = className;
    node.textContent = content;
    return node;
  };

  // Replace the "no events yet" placeholder with the first real entry.
  if (panel.querySelector(".ws-log__empty") !== null) {
    panel.innerHTML = "";
  }

  const timestamp = new Date().toLocaleTimeString([], {
    hour12: false,
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  });

  const entry = document.createElement("div");
  entry.className = `ws-log__entry ws-log__entry--${kind}`;
  entry.append(
    makeSpan("ws-log__time", timestamp),
    makeSpan("ws-log__direction", direction.toUpperCase()),
    makeSpan("ws-log__detail", detail),
  );
  panel.appendChild(entry);

  while (panel.children.length > MAX_WS_LOG_LINES) {
    panel.firstElementChild.remove();
  }
  panel.scrollTop = panel.scrollHeight;
}
|
|
|
|
/**
 * Emit one aggregated log line for the `response.audio.delta` events counted
 * since the last flush, then reset the counters and the flush timestamp.
 * Does nothing when no deltas are pending.
 */
function flushAudioDeltaLog() {
  const { audioDeltaLogCount: count, audioDeltaLogBytes: bytes } = state;
  if (count === 0) return;

  addWsLog("recv", `response.audio.delta x${count} (${bytes} bytes)`);
  Object.assign(state, {
    audioDeltaLogCount: 0,
    audioDeltaLogBytes: 0,
    lastAudioDeltaLogAt: performance.now(),
  });
}
|
|
|
|
/**
 * Emit one aggregated log line for the binary mic frames counted since the
 * last flush, then reset the counters and the flush timestamp.
 * Does nothing when no frames are pending.
 */
function flushAudioSendLog() {
  const { audioSendLogCount: count, audioSendLogBytes: bytes } = state;
  if (count === 0) return;

  addWsLog("send", `input.audio binary x${count} (${bytes} bytes)`);
  Object.assign(state, {
    audioSendLogCount: 0,
    audioSendLogBytes: 0,
    lastAudioSendLogAt: performance.now(),
  });
}
|
|
|
|
/** Flush both aggregated audio counters (received first, then sent). */
function flushPendingWsLogs() {
  for (const flush of [flushAudioDeltaLog, flushAudioSendLog]) {
    flush();
  }
}
|
|
|
|
/**
 * Log a JSON websocket message.
 *
 * `response.audio.delta` events arrive many times per second, so they are
 * only counted here and written as one aggregated line at most every
 * `AUDIO_DELTA_LOG_INTERVAL_MS`. Any other message first flushes the pending
 * audio aggregate for its direction (so log lines stay in order) and is then
 * logged on its own line.
 *
 * @param {string} direction - "send" or "recv".
 * @param {*} payload - Parsed message object.
 */
function logWsPayload(direction, payload) {
  const isAudioDelta =
    direction === "recv" && payload?.type === "response.audio.delta";

  if (isAudioDelta) {
    // Must be handled BEFORE any flush: flushing first would emit the
    // previous delta on every call and defeat the aggregation entirely.
    state.audioDeltaLogCount += 1;
    // Size is the server-provided `bytes` when present, otherwise the length
    // of the base64 string.
    state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
    const elapsedMs = performance.now() - state.lastAudioDeltaLogAt;
    if (elapsedMs >= AUDIO_DELTA_LOG_INTERVAL_MS) {
      flushAudioDeltaLog();
    }
    return;
  }

  if (direction === "send") {
    flushAudioSendLog();
  } else {
    flushAudioDeltaLog();
  }
  addWsLog(direction, compactWsPayload(payload));
}
|
|
|
|
/**
 * Count one outgoing binary audio frame and write the aggregated log line
 * once `AUDIO_DELTA_LOG_INTERVAL_MS` has passed since the previous flush.
 *
 * @param {number} byteLength - Size of the frame that is being sent.
 */
function logBinarySend(byteLength) {
  state.audioSendLogCount += 1;
  state.audioSendLogBytes += byteLength;

  const elapsedMs = performance.now() - state.lastAudioSendLogAt;
  if (elapsedMs >= AUDIO_DELTA_LOG_INTERVAL_MS) {
    flushAudioSendLog();
  }
}
|
|
|
|
/**
 * Send a message over the current websocket, recording it in the log panel.
 *
 * Strings are parsed as JSON for compact logging (or logged truncated when
 * they are not JSON); binary payloads are only counted for the aggregated
 * audio-send line.
 *
 * @param {string|ArrayBuffer|ArrayBufferView} data - Message to send.
 * @returns {boolean} `false` when there is no open socket (nothing is sent or
 *   logged), `true` after the message was handed to the socket.
 */
function wsSend(data) {
  const socket = state.ws;
  if (socket?.readyState !== WebSocket.OPEN) return false;

  if (typeof data !== "string") {
    const isBinary = data instanceof ArrayBuffer || ArrayBuffer.isView(data);
    const size = isBinary ? data.byteLength : 0;
    if (size > 0) {
      logBinarySend(size);
    }
  } else {
    try {
      logWsPayload("send", JSON.parse(data));
    } catch (_) {
      // Not JSON (or logging failed): fall back to a raw, truncated line.
      flushAudioSendLog();
      flushAudioDeltaLog();
      addWsLog("send", truncateLogValue(data));
    }
  }

  socket.send(data);
  return true;
}
|
|
|
|
/**
 * Empty the websocket log panel, discard any pending aggregated audio
 * counters, and restore the "no events" placeholder.
 */
function clearWsLog() {
  Object.assign(state, {
    audioDeltaLogCount: 0,
    audioDeltaLogBytes: 0,
    audioSendLogCount: 0,
    audioSendLogBytes: 0,
  });
  els.wsLog.innerHTML =
    '<div class="ws-log__empty">No websocket events yet.</div>';
}
|
|
|
|
/* ---------------------------------------------------------------- Audio */
|
|
|
|
/**
 * Return the shared AudioContext, creating it (and loading the PCM recorder
 * worklet module) on first use and resuming it when suspended.
 *
 * The context is stored in `state.audioContext` only after the worklet module
 * has loaded. A failed load therefore leaves no half-initialised context
 * behind: the temporary context is closed and the next call retries.
 *
 * @returns {Promise<AudioContext>} The ready-to-use context.
 * @throws {Error} When Web Audio / AudioWorklet is unavailable, or whatever
 *   `audioWorklet.addModule()` rejected with.
 */
async function ensureAudioContext() {
  if (!state.audioContext) {
    const Ctx = window.AudioContext || window.webkitAudioContext;
    if (!Ctx) {
      throw new Error("Web Audio API is not available in this browser");
    }

    const ctx = new Ctx();
    try {
      if (!ctx.audioWorklet) {
        throw new Error(
          "AudioWorklet is not available (it may require a secure context)",
        );
      }
      await ctx.audioWorklet.addModule("./pcm-recorder.worklet.js");
    } catch (err) {
      // Release the unusable context so a later retry starts from scratch.
      try {
        await ctx.close();
      } catch (_) {
        /* ignore */
      }
      throw err;
    }

    if (state.audioContext) {
      // A concurrent call finished first; keep its context and drop ours.
      try {
        await ctx.close();
      } catch (_) {
        /* ignore */
      }
    } else {
      state.audioContext = ctx;
    }
  }

  if (state.audioContext.state === "suspended") {
    await state.audioContext.resume();
  }
  return state.audioContext;
}
|
|
|
|
/**
 * Rebuild the microphone `<select>` from `state.micDevices`, keeping the
 * previous selection when that device is still listed and falling back to
 * the "Default microphone" entry otherwise.
 */
function renderMicDevices() {
  const select = els.micSelect;
  const wantedId = state.selectedMicDeviceId || select.value;
  const makeOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    return option;
  };

  select.innerHTML = "";
  select.appendChild(makeOption("", "Default microphone"));

  let stillListed = false;
  for (const [index, device] of state.micDevices.entries()) {
    // Labels can be empty, so fall back to a numbered placeholder.
    const label = device.label || `Microphone ${index + 1}`;
    select.appendChild(makeOption(device.deviceId, label));
    if (device.deviceId === wantedId) stillListed = true;
  }

  state.selectedMicDeviceId = stillListed ? wantedId : "";
  select.value = state.selectedMicDeviceId;
  setMicSelectEnabled();
}
|
|
|
|
/**
 * Re-enumerate audio input devices and re-render the microphone picker.
 * Never rejects on enumeration failure: the error is logged to the console
 * and only the picker's enabled state is refreshed.
 *
 * @returns {Promise<void>}
 */
async function refreshMicDevices() {
  const media = navigator.mediaDevices;
  if (!media?.enumerateDevices) {
    setMicSelectEnabled();
    return;
  }

  try {
    const allDevices = await media.enumerateDevices();
    state.micDevices = allDevices.filter(({ kind }) => kind === "audioinput");
    renderMicDevices();
  } catch (err) {
    console.warn("Could not enumerate microphones", err);
    setMicSelectEnabled();
  }
}
|
|
|
|
/**
 * Start microphone capture: request the (optionally selected) input device
 * with echo cancellation, noise suppression and auto gain enabled, feed it
 * into the `pcm-recorder` worklet, and forward each produced frame over the
 * websocket while connected.
 *
 * If any step after the stream has been acquired fails, everything that was
 * set up so far is released again via `stopMic()` so no live capture track is
 * left behind.
 *
 * @returns {Promise<void>}
 * @throws {Error} When capture is unsupported, permission is denied, or the
 *   audio graph cannot be built.
 */
async function startMic() {
  if (!navigator.mediaDevices?.getUserMedia) {
    throw new Error("Microphone capture is not available in this browser context");
  }

  const ctx = await ensureAudioContext();
  const audioConstraints = {
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true,
    channelCount: 1,
  };
  if (state.selectedMicDeviceId) {
    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
  }

  try {
    state.micStream = await navigator.mediaDevices.getUserMedia({
      audio: audioConstraints,
      video: false,
    });
    // Refresh the picker now that a stream has been granted.
    await refreshMicDevices();

    state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
    state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      processorOptions: {
        targetSampleRate: SAMPLE_RATE,
        frameMs: FRAME_MS,
      },
    });
    state.recorderNode.port.onmessage = (event) => {
      const data = event.data;
      if (!data || data.type !== "frame") return;
      updateMeter(data.rms || 0);
      if (state.connected) {
        wsSend(data.buffer);
      }
    };

    state.micSourceNode.connect(state.recorderNode);
  } catch (err) {
    // Partial setup: stop tracks / disconnect nodes before reporting.
    // `state.micEnabled` is still false, so no "stopped" line is logged.
    stopMic();
    throw err;
  }

  state.micEnabled = true;
  addWsLog("system", "mic capture started (binary input.audio frames)");
  setMicButton();
}
|
|
|
|
/**
 * Stop microphone capture and release the recorder node, the source node and
 * every track of the mic stream. Safe to call when capture is only partially
 * set up or not running; individual teardown errors are ignored.
 */
function stopMic() {
  const wasCapturing = state.micEnabled;
  const quietly = (action) => {
    try {
      action();
    } catch (_) {
      /* ignore */
    }
  };
  const { recorderNode, micSourceNode, micStream } = state;

  if (recorderNode) {
    quietly(() => {
      recorderNode.port.onmessage = null;
      recorderNode.disconnect();
    });
    state.recorderNode = null;
  }

  if (micSourceNode) {
    quietly(() => micSourceNode.disconnect());
    state.micSourceNode = null;
  }

  if (micStream) {
    for (const track of micStream.getTracks()) {
      quietly(() => track.stop());
    }
    state.micStream = null;
  }

  state.micEnabled = false;
  updateMeter(0);

  // Only report a stop when capture had actually been running.
  if (wasCapturing) {
    flushAudioSendLog();
    addWsLog("system", "mic capture stopped");
  }
  setMicButton();
}
|
|
|
|
/**
 * Move the VU meter towards a new level.
 *
 * @param {number} rms - RMS of the latest mic frame; scaled by 2.4 and
 *   clamped to 1, then averaged 50/50 with the previous level for smoothing.
 */
function updateMeter(rms) {
  const target = Math.min(1, rms * 2.4);
  const smoothed = state.meterLevel * 0.5 + target * 0.5;
  state.meterLevel = smoothed;

  const percent = Math.round(smoothed * 100);
  els.meterFill.style.width = `${percent}%`;
}
|
|
|
|
/* ---------------------------------------------------- Bot audio playback */
|
|
|
|
/**
 * Queue one chunk of bot audio for gapless playback.
 *
 * The chunk is converted from PCM16 to float, wrapped in an AudioBuffer at
 * `SAMPLE_RATE`, and started right after the previously scheduled chunk (or
 * 20 ms from now when the queue has run dry). The bot indicator is switched
 * on and a timer is armed to switch it off once the queue has drained.
 *
 * @param {Int16Array} int16 - Mono PCM16 samples. Empty or missing chunks are
 *   ignored, as is any call made before the AudioContext exists.
 */
function schedulePlayback(int16) {
  const ctx = state.audioContext;
  if (!ctx) return;
  // `createBuffer()` rejects a zero-length buffer, so an empty frame must be
  // dropped here rather than allowed to throw.
  if (!int16 || int16.length === 0) return;

  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Asymmetric scaling maps -32768 to -1 and 32767 to +1.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
  buffer.copyToChannel(float32, 0);

  const src = ctx.createBufferSource();
  src.buffer = buffer;
  src.connect(ctx.destination);

  const now = ctx.currentTime;
  // Schedule immediately after the previously scheduled chunk to keep
  // playback contiguous, with a tiny safety margin if we fell behind.
  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
  src.start(startAt);
  state.nextPlaybackTime = startAt + buffer.duration;
  state.playbackEndsAt = state.nextPlaybackTime;

  // Track the source so `stopPlaybackQueue()` can cancel it; forget it once
  // it has finished on its own.
  src.onended = () => {
    const idx = state.scheduledSources.indexOf(src);
    if (idx >= 0) state.scheduledSources.splice(idx, 1);
  };
  state.scheduledSources.push(src);

  setBotIndicator(true);
  if (state.botUiTimer) clearTimeout(state.botUiTimer);
  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
  state.botUiTimer = setTimeout(() => {
    // Only switch off if no later chunk extended the queue in the meantime.
    if (
      state.audioContext &&
      state.audioContext.currentTime >= state.playbackEndsAt - 0.01
    ) {
      setBotIndicator(false);
    }
  }, msUntilEnd);
}
|
|
|
|
/**
 * Cancel every scheduled bot audio chunk, reset the playback clock, cancel
 * the pending indicator timer and switch the bot indicator off.
 */
function stopPlaybackQueue() {
  state.scheduledSources.forEach((source) => {
    try {
      // Detach the handler first so stopping does not mutate the list we
      // are iterating.
      source.onended = null;
      source.stop();
      source.disconnect();
    } catch (_) {
      /* already stopped */
    }
  });
  state.scheduledSources = [];
  resetPlaybackClock();

  const pendingTimer = state.botUiTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    state.botUiTimer = null;
  }
  setBotIndicator(false);
}
|
|
|
|
/**
 * Re-anchor the playback schedule to the AudioContext's current time so the
 * next chunk starts from "now". No-op while no AudioContext exists.
 */
function resetPlaybackClock() {
  const ctx = state.audioContext;
  if (!ctx) return;
  state.nextPlaybackTime = ctx.currentTime;
  state.playbackEndsAt = ctx.currentTime;
}
|
|
|
|
/* --------------------------------------------------------- Chat updates */
|
|
|
|
/**
 * Render a committed user transcript as a chat bubble. Any assistant bubble
 * that was still streaming is detached first so the next assistant text
 * starts a fresh bubble.
 *
 * @param {string} text - Final transcript; empty values are ignored.
 */
function handleUserTranscript(text) {
  if (text) {
    state.currentAssistantBubble = null;
    addBubble("user", text);
  }
}
|
|
|
|
/**
 * Send typed user text to the engine as an interrupting `input.text` message.
 *
 * @param {string} text - Raw composer text; trimmed before sending.
 * @returns {boolean} `true` when the message was sent, `false` when the text
 *   is blank or the websocket is not open.
 */
function sendText(text) {
  const value = (text || "").trim();
  const socketOpen = Boolean(state.ws) && state.ws.readyState === WebSocket.OPEN;
  if (!value || !socketOpen) return false;

  wsSend(
    JSON.stringify({
      type: "input.text",
      text: value,
      interrupt: true,
    }),
  );

  // Cut off any bot audio still playing so the next reply is heard cleanly.
  stopPlaybackQueue();

  // Typed input is not echoed back as a transcript event, so the user bubble
  // is rendered locally. `currentAssistantBubble` is intentionally left
  // alone: the engine is expected to send
  // `response.text.final(interrupted=true)` for the in-flight assistant turn,
  // finalizing that bubble in place, and `response.text.started` will begin a
  // new bubble for the reply.
  addBubble("user", value);
  return true;
}
|
|
|
|
/**
 * Append a streamed chunk of assistant text to the active assistant bubble,
 * creating that bubble on the first chunk.
 *
 * @param {string} text - Text delta; empty values are ignored.
 */
function handleAssistantDelta(text) {
  if (!text) return;
  const bubble = state.currentAssistantBubble || addBubble("assistant", "");
  state.currentAssistantBubble = bubble;
  appendToBubble(bubble, text);
}
|
|
|
|
/**
 * Mark the start of a new assistant turn: forget the previous bubble so the
 * first text delta of this turn opens a new one.
 */
function handleAssistantStarted() {
  Object.assign(state, { currentAssistantBubble: null });
}
|
|
|
|
/**
 * Finalize the current assistant turn.
 *
 * With text: the streaming bubble's content is replaced by the final text
 * (or a new bubble is created when none is active) and the chat scrolls to
 * the bottom. Without text: the streamed content is kept as is.
 * In both cases an active bubble is tagged `bubble--interrupted` when the
 * turn was interrupted, and the bubble is then released.
 *
 * @param {string} [text] - Final, complete assistant text (may be empty).
 * @param {boolean} [interrupted] - Whether the turn was cut short.
 */
function handleAssistantFinal(text, interrupted) {
  let bubble = state.currentAssistantBubble;

  if (text) {
    if (bubble) {
      bubble.querySelector(".bubble__text").textContent = text;
    } else {
      bubble = addBubble("assistant", text);
    }
  }

  // Previously an interrupted final without text left the partially streamed
  // bubble unmarked; tag it whenever there is a bubble to tag.
  if (bubble && interrupted) {
    bubble.classList.add("bubble--interrupted");
  }

  state.currentAssistantBubble = null;
  if (text) {
    scrollChatToBottom();
  }
}
|
|
|
|
/**
 * Release the active assistant bubble so later text deltas open a new one.
 * The bubble's content is left untouched.
 */
function finalizeAssistantBubble() {
  Object.assign(state, { currentAssistantBubble: null });
}
|
|
|
|
/* ---------------------------------------------------------- Websocket IO */
|
|
|
|
/**
 * Decode a base64 string holding little-endian PCM16 audio.
 *
 * Samples are assembled byte by byte as little-endian, so the result does not
 * depend on the platform's native byte order. A dangling odd byte at the end
 * of the payload is dropped.
 *
 * @param {string} b64 - Base64-encoded `pcm_s16le` data.
 * @returns {Int16Array} Decoded samples (length 0 for an empty payload).
 * @throws {DOMException} When `b64` is not valid base64 (thrown by `atob`).
 */
function decodeBase64ToInt16(b64) {
  const binary = atob(b64);
  const sampleCount = Math.floor(binary.length / 2);
  const samples = new Int16Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    const low = binary.charCodeAt(2 * i);
    const high = binary.charCodeAt(2 * i + 1);
    // Storing into an Int16Array wraps 0x8000..0xffff to negative values.
    samples[i] = low | (high << 8);
  }
  return samples;
}
|
|
|
|
// Dispatch table for server events, keyed by `event.type`.
const WS_EVENT_HANDLERS = new Map([
  [
    "response.audio.delta",
    (event) => {
      if (typeof event.audio === "string") {
        schedulePlayback(decodeBase64ToInt16(event.audio));
      }
    },
  ],
  ["response.audio.started", () => setBotIndicator(true)],
  // The indicator turns off automatically when the playback queue drains.
  ["response.audio.stopped", () => finalizeAssistantBubble()],
  ["response.text.delta", (event) => handleAssistantDelta(event.text)],
  ["response.text.started", () => handleAssistantStarted()],
  [
    "response.text.final",
    (event) => handleAssistantFinal(event.text, event.interrupted),
  ],
  ["input.transcript.final", (event) => handleUserTranscript(event.text)],
  // Partial ASR updates are ignored; the chat shows committed turns only.
  ["input.transcript.interim", () => {}],
  // Reserved for future structured messages; ignored silently.
  ["transport.message", () => {}],
]);

/**
 * Route one parsed server event to its handler. Unknown event types are
 * written to the debug console.
 *
 * @param {{type: string}} event - Parsed JSON message from the websocket.
 */
function handleEvent(event) {
  const handler = WS_EVENT_HANDLERS.get(event.type);
  if (handler) {
    handler(event);
  } else {
    console.debug("ws event", event);
  }
}
|
|
|
|
/** Abort a connection attempt: reset the flag and surface the error in the UI. */
function failConnect(statusText, logDetail) {
  state.connecting = false;
  setStatus("error", statusText);
  setConnectButton();
  addWsLog("error", logDetail, "error");
}

/** Socket "open": mark the session connected and send the handshake. */
function onSocketOpen() {
  const startMessage = {
    type: "session.start",
    protocol: PROTOCOL,
    audio: {
      encoding: "pcm_s16le",
      sample_rate: SAMPLE_RATE,
      channels: CHANNELS,
    },
  };

  state.connecting = false;
  state.connected = true;
  resetPlaybackClock();
  addWsLog("system", "websocket open");
  setStatus("connected", "Connected");
  setConnectButton();
  setMicButton();
  setMicSelectEnabled();
  // Fire-and-forget: refreshMicDevices handles its own enumeration errors.
  void refreshMicDevices();

  wsSend(JSON.stringify(startMessage));
  addBubble("system", "Session started.");
  setComposerEnabled(true);
  els.textInput.focus();
}

/** Socket "message": log and dispatch JSON events, play binary PCM16 audio. */
function onSocketMessage(event) {
  const data = event.data;

  if (typeof data === "string") {
    let parsed;
    try {
      parsed = JSON.parse(data);
    } catch (err) {
      console.warn("Bad JSON from server", err, data);
      addWsLog(
        "error",
        `invalid JSON from server: ${truncateLogValue(data)}`,
        "error",
      );
      return;
    }
    logWsPayload("recv", parsed);
    // Valid JSON that is not an object (e.g. `null`, a number) carries no
    // event type; dispatching it would throw on `event.type`.
    if (parsed !== null && typeof parsed === "object") {
      handleEvent(parsed);
    }
    return;
  }

  if (data instanceof ArrayBuffer) {
    // Server doesn't currently send binary, but handle it just in case.
    addWsLog("recv", `binary audio ${data.byteLength} bytes`);
    // `new Int16Array(buffer)` throws a RangeError for odd byte lengths, so
    // view only the whole samples and skip frames that contain none.
    const sampleCount = Math.floor(data.byteLength / 2);
    if (sampleCount > 0) {
      schedulePlayback(new Int16Array(data, 0, sampleCount));
    }
  }
}

/** Socket "error": report it; the subsequent "close" event does the cleanup. */
function onSocketError(err) {
  console.error("WebSocket error", err);
  setStatus("error", "Connection error");
  addWsLog("error", "websocket error", "error");
}

/** Socket "close": tear down mic/playback and return the UI to idle. */
function onSocketClose(event) {
  const wasConnected = state.connected;
  state.ws = null;
  state.connected = false;
  state.connecting = false;
  if (state.micEnabled) stopMic();
  stopPlaybackQueue();
  setConnectButton();
  setMicButton();
  setMicSelectEnabled();
  setComposerEnabled(false);
  setBotIndicator(false);
  flushPendingWsLogs();
  addWsLog(
    "system",
    `websocket close code=${event.code}${
      event.reason ? ` reason=${event.reason}` : ""
    }`,
  );

  if (wasConnected) {
    addBubble(
      "system",
      `Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
    );
    setStatus("idle", "Disconnected");
  } else {
    // Closed before "open" ever fired: the connection attempt failed.
    setStatus("error", "Connection closed");
  }
}

/**
 * Open the product websocket using the URL from the URL input and wire up
 * its event handlers. Does nothing while already connected or connecting.
 *
 * The AudioContext is initialised first (while still inside the user
 * gesture) so playback works on Safari. Failures are reported through the
 * status indicator and the websocket log; this function does not throw for
 * them.
 *
 * @returns {Promise<void>} Resolves once the socket has been created (not
 *   necessarily opened) or the attempt has been abandoned.
 */
async function connect() {
  if (state.connected || state.connecting) return;

  const url = (els.url.value || "").trim();
  if (!url) {
    setStatus("error", "Missing URL");
    return;
  }

  state.connecting = true;
  setStatus("connecting", "Connecting…");
  setConnectButton();
  addWsLog("system", `connecting ${url}`);

  try {
    // Pre-warm audio context on user gesture so playback works on Safari.
    await ensureAudioContext();
  } catch (err) {
    console.error("AudioContext failed", err);
    failConnect("Audio init failed", `audio init failed: ${err.message || err}`);
    return;
  }

  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    console.error("WebSocket constructor failed", err);
    failConnect("Bad URL", `bad websocket URL: ${err.message || err}`);
    return;
  }
  ws.binaryType = "arraybuffer";
  state.ws = ws;

  ws.addEventListener("open", onSocketOpen);
  ws.addEventListener("message", onSocketMessage);
  ws.addEventListener("error", onSocketError);
  ws.addEventListener("close", onSocketClose);
}
|
|
|
|
/**
 * Politely end the session: send `session.stop` when the socket is open,
 * then close it with code 1000. Errors from either step are ignored; the
 * socket's "close" handler performs the actual state cleanup.
 */
function disconnect() {
  const socket = state.ws;
  if (!socket) return;

  try {
    if (socket.readyState === WebSocket.OPEN) {
      wsSend(JSON.stringify({ type: "session.stop", reason: "client_disconnect" }));
    }
  } catch (_) {
    /* ignore */
  }

  try {
    socket.close(1000, "client_disconnect");
  } catch (_) {
    /* ignore */
  }
}
|
|
|
|
/* ---------------------------------------------------------------- Wiring */
|
|
|
|
// Connect button toggles between opening and closing the session.
function onConnectClick() {
  if (state.connected) {
    disconnect();
  } else {
    connect();
  }
}

// Mic button toggles capture; it is locked while the toggle is in progress.
async function onMicClick() {
  if (!state.connected) return;

  els.micBtn.disabled = true;
  try {
    if (state.micEnabled) {
      stopMic();
    } else {
      await startMic();
    }
  } catch (err) {
    console.error("Mic error", err);
    addBubble("system", `Mic error: ${err.message || err}`);
  } finally {
    els.micBtn.disabled = !state.connected;
  }
}

// Picking another device restarts capture when the mic is currently on.
async function onMicSelectChange() {
  state.selectedMicDeviceId = els.micSelect.value;
  if (!state.micEnabled) return;

  els.micSelect.disabled = true;
  els.micBtn.disabled = true;
  try {
    stopMic();
    await startMic();
  } catch (err) {
    console.error("Mic switch error", err);
    addBubble("system", `Mic switch error: ${err.message || err}`);
  } finally {
    setMicButton();
    setMicSelectEnabled();
  }
}

els.connectBtn.addEventListener("click", onConnectClick);
els.micBtn.addEventListener("click", onMicClick);
els.micSelect.addEventListener("change", onMicSelectChange);

// Keep the picker in sync when devices are plugged in or removed.
if (navigator.mediaDevices?.addEventListener) {
  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}

els.clearBtn.addEventListener("click", () => clearChat());
els.clearWsLogBtn.addEventListener("click", () => clearWsLog());
|
|
|
|
/**
 * Fit the composer textarea to its content, capped at 180px. The height is
 * reset to "auto" first so `scrollHeight` is read after that reset.
 */
function autosizeTextarea() {
  const { textInput } = els;
  textInput.style.height = "auto";
  const fittedHeight = Math.min(textInput.scrollHeight, 180);
  textInput.style.height = `${fittedHeight}px`;
}
|
|
|
|
/**
 * Send the composer's current text. When it was sent, the input is cleared,
 * resized, and the send button state refreshed; otherwise nothing changes.
 */
function submitText() {
  const sent = sendText(els.textInput.value);
  if (sent) {
    els.textInput.value = "";
    autosizeTextarea();
    setComposerEnabled(state.connected);
  }
}
|
|
|
|
// Form submission sends the text instead of navigating.
els.composer.addEventListener("submit", (event) => {
  event.preventDefault();
  submitText();
});

// Typing resizes the textarea and re-evaluates the send button.
els.textInput.addEventListener("input", () => {
  autosizeTextarea();
  setComposerEnabled(state.connected);
});

// Enter sends; Shift+Enter inserts a newline; IME composition is left alone.
els.textInput.addEventListener("keydown", (event) => {
  if (event.key !== "Enter" || event.shiftKey || event.isComposing) return;
  event.preventDefault();
  submitText();
});

// Best-effort release of the socket and the audio context on page unload.
window.addEventListener("beforeunload", () => {
  const closeQuietly = (resource) => {
    if (!resource) return;
    try {
      resource.close();
    } catch (_) {
      /* ignore */
    }
  };
  closeQuietly(state.ws);
  closeQuietly(state.audioContext);
});

// Initial UI: default URL, disconnected status, all controls idle.
els.url.value = defaultWsUrl();

setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);
|