Files
py-active-call/examples/web_client.html
2026-02-06 11:25:05 +08:00

736 lines
22 KiB
HTML

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Duplex Voice Web Client</title>
<style>
  /* Web fonts: Fraunces (display serif for headings) + Recursive (UI sans). */
  @import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");

  /* Design tokens for the dark theme. */
  :root {
    --bg: #0b0b0f;
    --panel: #14141c;
    --panel-2: #101018;
    --ink: #f2f3f7;
    --muted: #a7acba;
    --accent: #ff6b6b;
    --accent-2: #ffd166;
    --good: #2dd4bf;
    --bad: #f87171;
    --grid: rgba(255, 255, 255, 0.06);
    --shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
  }
  * {
    box-sizing: border-box;
  }
  html,
  body {
    height: 100%;
    margin: 0;
    color: var(--ink);
    /* Two soft radial glows layered above the flat base color. */
    background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
      radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
      var(--bg);
    font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
  }
  /* Full-screen SVG turbulence overlay used as subtle film grain. */
  .noise {
    position: fixed;
    inset: 0;
    background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
    pointer-events: none;
    mix-blend-mode: soft-light;
  }
  header {
    padding: 32px 28px 18px;
    border-bottom: 1px solid var(--grid);
  }
  h1 {
    font-family: "Fraunces", serif;
    font-weight: 600;
    margin: 0 0 6px;
    letter-spacing: 0.4px;
  }
  .subtitle {
    color: var(--muted);
    font-size: 0.95rem;
  }
  /* Two-column layout: controls on the left, chat/log panes on the right. */
  main {
    display: grid;
    grid-template-columns: 1.1fr 1.4fr;
    gap: 24px;
    padding: 24px 28px 40px;
  }
  .panel {
    background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
      var(--panel);
    border: 1px solid var(--grid);
    border-radius: 16px;
    padding: 20px;
    box-shadow: var(--shadow);
  }
  .panel h2 {
    margin: 0 0 12px;
    font-size: 1.05rem;
    font-weight: 600;
  }
  /* Vertical grid used to space stacked controls. */
  .stack {
    display: grid;
    gap: 12px;
  }
  label {
    display: block;
    font-size: 0.85rem;
    color: var(--muted);
    margin-bottom: 6px;
  }
  input,
  select,
  button,
  textarea {
    font-family: inherit;
  }
  input,
  select,
  textarea {
    width: 100%;
    padding: 10px 12px;
    border-radius: 10px;
    border: 1px solid var(--grid);
    background: var(--panel-2);
    color: var(--ink);
    outline: none;
  }
  textarea {
    min-height: 80px;
    resize: vertical;
  }
  .row {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 12px;
  }
  .btn-row {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
  }
  /* Pill buttons; the color variants below signal intent (accent/good/bad). */
  button {
    border: none;
    border-radius: 999px;
    padding: 10px 16px;
    font-weight: 600;
    background: var(--ink);
    color: #111;
    cursor: pointer;
    transition: transform 0.2s ease, box-shadow 0.2s ease;
  }
  button.secondary {
    background: transparent;
    color: var(--ink);
    border: 1px solid var(--grid);
  }
  button.accent {
    background: linear-gradient(120deg, var(--accent), #f97316);
    color: #0b0b0f;
  }
  button.good {
    background: linear-gradient(120deg, var(--good), #22c55e);
    color: #07261f;
  }
  button.bad {
    background: linear-gradient(120deg, var(--bad), #f97316);
    color: #2a0b0b;
  }
  button:active {
    transform: translateY(1px) scale(0.99);
  }
  /* Connection status pill with a colored indicator dot. */
  .status {
    display: flex;
    align-items: center;
    gap: 12px;
    padding: 12px;
    background: rgba(255, 255, 255, 0.03);
    border-radius: 12px;
    border: 1px dashed var(--grid);
    font-size: 0.9rem;
  }
  .dot {
    width: 10px;
    height: 10px;
    border-radius: 999px;
    background: var(--bad);
    box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
  }
  /* .on is toggled by setStatus() when the WebSocket is connected. */
  .dot.on {
    background: var(--good);
    box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
  }
  /* Scrolling event-log and chat-transcript panes. */
  .log {
    height: 320px;
    overflow: auto;
    padding: 12px;
    background: #0d0d14;
    border-radius: 12px;
    border: 1px solid var(--grid);
    font-size: 0.85rem;
    line-height: 1.4;
  }
  .chat {
    height: 260px;
    overflow: auto;
    padding: 12px;
    background: #0d0d14;
    border-radius: 12px;
    border: 1px solid var(--grid);
    font-size: 0.9rem;
    line-height: 1.45;
  }
  .chat-entry {
    padding: 8px 10px;
    margin-bottom: 8px;
    border-radius: 10px;
    background: rgba(255, 255, 255, 0.04);
    border: 1px solid rgba(255, 255, 255, 0.06);
  }
  .chat-entry.user {
    border-left: 3px solid var(--accent-2);
  }
  .chat-entry.ai {
    border-left: 3px solid var(--good);
  }
  /* Interim (streaming, not yet final) chat text is dimmed and italic. */
  .chat-entry.interim {
    opacity: 0.7;
    font-style: italic;
  }
  .log-entry {
    padding: 6px 8px;
    border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
  }
  .log-entry:last-child {
    border-bottom: none;
  }
  /* Small uppercase badges labelling log entries (EVENT / AUDIO / SYS). */
  .tag {
    display: inline-flex;
    align-items: center;
    gap: 6px;
    padding: 2px 8px;
    border-radius: 999px;
    font-size: 0.7rem;
    text-transform: uppercase;
    letter-spacing: 0.6px;
    background: rgba(255, 255, 255, 0.08);
    color: var(--muted);
  }
  .tag.event {
    background: rgba(255, 107, 107, 0.18);
    color: #ffc1c1;
  }
  .tag.audio {
    background: rgba(45, 212, 191, 0.2);
    color: #c5f9f0;
  }
  .tag.sys {
    background: rgba(255, 209, 102, 0.2);
    color: #ffefb0;
  }
  .muted {
    color: var(--muted);
  }
  footer {
    padding: 0 28px 28px;
    color: var(--muted);
    font-size: 0.8rem;
  }
  /* Single-column layout on narrow viewports. */
  @media (max-width: 1100px) {
    main {
      grid-template-columns: 1fr;
    }
    .log {
      height: 360px;
    }
    .chat {
      height: 260px;
    }
  }
</style>
</head>
<body>
  <!-- Decorative grain overlay; non-interactive (pointer-events: none in CSS). -->
  <div class="noise"></div>
  <header>
    <h1>Duplex Voice Client</h1>
    <div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
  </header>
  <main>
    <!-- Left panel: connection controls, device pickers, and text chat input. -->
    <section class="panel stack">
      <h2>Connection</h2>
      <div>
        <label for="wsUrl">WebSocket URL</label>
        <input id="wsUrl" value="ws://localhost:8000/ws" />
      </div>
      <div class="btn-row">
        <button class="accent" id="connectBtn">Connect</button>
        <button class="secondary" id="disconnectBtn">Disconnect</button>
      </div>
      <!-- Status pill: the dot gets class "on" while the socket is open. -->
      <div class="status">
        <div id="statusDot" class="dot"></div>
        <div>
          <div id="statusText">Disconnected</div>
          <div class="muted" id="statusSub">Waiting for connection</div>
        </div>
      </div>
      <h2>Devices</h2>
      <!-- Options are filled by refreshDevices() from enumerateDevices(). -->
      <div class="row">
        <div>
          <label for="inputSelect">Input (Mic)</label>
          <select id="inputSelect"></select>
        </div>
        <div>
          <label for="outputSelect">Output (Speaker)</label>
          <select id="outputSelect"></select>
        </div>
      </div>
      <div class="btn-row">
        <button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
        <button class="good" id="startMicBtn">Start Mic</button>
        <button class="secondary" id="stopMicBtn">Stop Mic</button>
      </div>
      <h2>Chat</h2>
      <div class="stack">
        <textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
        <div class="btn-row">
          <button class="accent" id="sendChatBtn">Send Chat</button>
          <button class="secondary" id="clearLogBtn">Clear Log</button>
        </div>
      </div>
    </section>
    <!-- Right column: chat transcript and raw server-event log. -->
    <section class="stack">
      <div class="panel stack">
        <h2>Chat History</h2>
        <div class="chat" id="chatHistory"></div>
      </div>
      <div class="panel stack">
        <h2>Event Log</h2>
        <div class="log" id="log"></div>
      </div>
    </section>
  </main>
  <footer>
    Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
    Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
  </footer>
  <!-- Playback sink: the script assigns a MediaStream to this element so
       setSinkId() can route bot audio to the chosen output device. -->
  <audio id="audioOut" autoplay></audio>
<script>
// --- DOM element references ---
const wsUrl = document.getElementById("wsUrl");
const connectBtn = document.getElementById("connectBtn");
const disconnectBtn = document.getElementById("disconnectBtn");
const inputSelect = document.getElementById("inputSelect");
const outputSelect = document.getElementById("outputSelect");
const startMicBtn = document.getElementById("startMicBtn");
const stopMicBtn = document.getElementById("stopMicBtn");
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
const sendChatBtn = document.getElementById("sendChatBtn");
const clearLogBtn = document.getElementById("clearLogBtn");
const chatInput = document.getElementById("chatInput");
const logEl = document.getElementById("log");
const chatHistory = document.getElementById("chatHistory");
const statusDot = document.getElementById("statusDot");
const statusText = document.getElementById("statusText");
const statusSub = document.getElementById("statusSub");
const audioOut = document.getElementById("audioOut");
// --- Session / audio state ---
let ws = null; // active WebSocket, or null when disconnected
let audioCtx = null; // shared AudioContext, created lazily by ensureAudioContext()
let micStream = null; // MediaStream from getUserMedia while the mic is running
let processor = null; // ScriptProcessorNode capturing mic frames
let micSource = null; // MediaStreamAudioSourceNode feeding the processor
let playbackDest = null; // MediaStreamDestination routed into <audio id="audioOut">
let playbackTime = 0; // AudioContext time at which the next queued chunk starts
let discardAudio = false; // when true, incoming audio chunks are dropped (barge-in)
let playbackSources = []; // live AudioBufferSourceNodes, kept so stopPlayback() can cancel them
let interimUserEl = null; // streaming (non-final) user transcript element, or null
let interimAiEl = null; // streaming (non-final) AI response element, or null
let interimUserText = ""; // accumulated interim user text
let interimAiText = ""; // accumulated interim AI text
const targetSampleRate = 16000; // PCM rate used on the wire, both directions
function logLine(type, text, data) {
  // Append one timestamped row to the event log: a colored type badge,
  // the message, and (optionally) a JSON dump of `data`.
  const stamp = new Date().toLocaleTimeString();

  const row = document.createElement("div");
  row.className = "log-entry";

  const badge = document.createElement("span");
  badge.className = `tag ${type}`;
  badge.textContent = type.toUpperCase();
  row.appendChild(badge);

  const message = document.createElement("span");
  message.style.marginLeft = "10px";
  message.textContent = `[${stamp}] ${text}`;
  row.appendChild(message);

  if (data) {
    const detail = document.createElement("div");
    detail.className = "muted";
    detail.style.marginTop = "4px";
    detail.textContent = JSON.stringify(data);
    row.appendChild(detail);
  }

  logEl.appendChild(row);
  // Keep the newest entry in view.
  logEl.scrollTop = logEl.scrollHeight;
}
function addChat(role, text) {
  // Append a finalized chat line; "AI" turns get the teal border, everything
  // else is styled as a user turn.
  const roleClass = role === "AI" ? "ai" : "user";
  const line = document.createElement("div");
  line.className = `chat-entry ${roleClass}`;
  line.textContent = `${role}: ${text}`;
  chatHistory.appendChild(line);
  chatHistory.scrollTop = chatHistory.scrollHeight;
}
function setInterim(role, text) {
  // Render (or clear) the single streaming "interim" line for a role.
  // Passing empty text removes the interim element and resets its buffer;
  // otherwise the element is created on first use and its text replaced.
  const isAi = role === "AI";
  let el = isAi ? interimAiEl : interimUserEl;
  if (!text) {
    // Clear: drop the element and forget the accumulated interim text.
    if (el) el.remove();
    if (isAi) interimAiEl = null;
    else interimUserEl = null;
    if (isAi) interimAiText = "";
    else interimUserText = "";
    return;
  }
  if (!el) {
    // Lazily create the interim entry when the first chunk arrives.
    el = document.createElement("div");
    el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
    chatHistory.appendChild(el);
    if (isAi) interimAiEl = el;
    else interimUserEl = el;
  }
  el.textContent = `${role} (interim): ${text}`;
  chatHistory.scrollTop = chatHistory.scrollHeight;
}
function stopPlayback() {
  // Drop any queued audio and silence everything already scheduled.
  // discardAudio stays true until the next trackStart event re-enables playback.
  discardAudio = true;
  playbackTime = audioCtx ? audioCtx.currentTime : 0;
  for (const source of playbackSources) {
    try {
      source.stop();
    } catch (err) {
      // Sources that already ended throw on stop(); safe to ignore.
    }
  }
  playbackSources = [];
}
function setStatus(connected, detail) {
  // Reflect connection state in the status pill: dot color, headline, subtext.
  statusDot.classList.toggle("on", connected);
  if (connected) {
    statusText.textContent = "Connected";
  } else {
    statusText.textContent = "Disconnected";
  }
  statusSub.textContent = detail || "";
}
async function ensureAudioContext() {
  // Lazily create the shared AudioContext and route playback through a
  // MediaStreamDestination into the <audio> element, so setSinkId() can
  // choose the output device. Safe to call repeatedly (no-op once created).
  if (audioCtx) return;
  audioCtx = new (window.AudioContext || window.webkitAudioContext)();
  playbackDest = audioCtx.createMediaStreamDestination();
  audioOut.srcObject = playbackDest.stream;
  try {
    await audioOut.play();
  } catch (err) {
    // Autoplay policy: play() may reject until a user gesture occurs.
    logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
  }
  // Apply any output device the user already picked in the dropdown.
  if (outputSelect.value) {
    await setOutputDevice(outputSelect.value);
  }
}
function downsampleBuffer(buffer, inRate, outRate) {
  // Resample a mono Float32 audio buffer from inRate to outRate by averaging
  // each window of source samples (a simple box filter).
  //
  // buffer  - Float32Array of samples in [-1, 1].
  // inRate  - source sample rate in Hz.
  // outRate - target sample rate in Hz.
  // Returns the same buffer when the rates match, otherwise a new Float32Array.
  if (outRate === inRate) return buffer;
  const ratio = inRate / outRate;
  const outLength = Math.round(buffer.length / ratio);
  const result = new Float32Array(outLength);
  let outIndex = 0;
  let srcIndex = 0;
  while (outIndex < result.length) {
    const windowEnd = Math.round((outIndex + 1) * ratio);
    let sum = 0;
    let count = 0;
    for (let i = srcIndex; i < windowEnd && i < buffer.length; i++) {
      sum += buffer[i];
      count++;
    }
    // Guard the empty-window case (possible when upsampling, or at the tail
    // due to rounding): fall back to the nearest source sample instead of
    // producing NaN from 0/0.
    result[outIndex] =
      count > 0 ? sum / count : buffer[Math.min(srcIndex, buffer.length - 1)];
    outIndex++;
    srcIndex = windowEnd;
  }
  return result;
}
function floatTo16BitPCM(float32) {
  // Convert float samples in [-1, 1] to signed 16-bit PCM. Values outside
  // the range are clamped; negative full scale maps to -32768, positive
  // full scale to 32767 (asymmetric int16 range).
  const pcm = new Int16Array(float32.length);
  float32.forEach((sample, i) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    pcm[i] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  });
  return pcm;
}
function schedulePlayback(int16Data) {
  // Queue one PCM16 chunk for gapless playback: convert to float, wrap it in
  // an AudioBuffer at 16 kHz, and schedule it right after the previous chunk.
  if (!audioCtx || !playbackDest) return;
  if (discardAudio) return; // barge-in active: drop audio until next trackStart
  const float32 = new Float32Array(int16Data.length);
  for (let i = 0; i < int16Data.length; i++) {
    float32[i] = int16Data[i] / 32768;
  }
  const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
  buffer.copyToChannel(float32, 0);
  const source = audioCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackDest);
  // Start at the tail of the queue, with a small 20 ms safety margin so the
  // first chunk is not scheduled in the past.
  const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
  source.start(startTime);
  playbackTime = startTime + buffer.duration;
  // Track live sources so stopPlayback() can cancel them on interrupt.
  playbackSources.push(source);
  source.onended = () => {
    playbackSources = playbackSources.filter((s) => s !== source);
  };
}
async function connect() {
  // Open the duplex WebSocket session and send the initial "invite" command.
  // No-op when a connection is already open.
  if (ws && ws.readyState === WebSocket.OPEN) return;
  ws = new WebSocket(wsUrl.value.trim());
  ws.binaryType = "arraybuffer"; // server pushes raw PCM frames as binary
  ws.onopen = () => {
    setStatus(true, "Session open");
    logLine("sys", "WebSocket connected");
    ensureAudioContext();
    sendCommand({ command: "invite", option: { codec: "pcm", sampleRate: targetSampleRate } });
  };
  ws.onclose = () => {
    setStatus(false, "Connection closed");
    logLine("sys", "WebSocket closed");
    ws = null;
  };
  ws.onerror = (err) => {
    logLine("sys", "WebSocket error", { err: String(err) });
  };
  ws.onmessage = (msg) => {
    if (typeof msg.data === "string") {
      // Text frames carry JSON events. Guard the parse so one malformed
      // frame does not throw inside the message handler.
      let event;
      try {
        event = JSON.parse(msg.data);
      } catch (err) {
        logLine("sys", "Ignoring malformed JSON event", { err: String(err) });
        return;
      }
      handleEvent(event);
    } else {
      // Binary frames are 16-bit PCM @ 16 kHz.
      const int16 = new Int16Array(msg.data);
      schedulePlayback(int16);
      logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
    }
  };
}
function disconnect() {
  // Close the socket (if any) and reset the status pill immediately,
  // without waiting for the onclose callback.
  const socket = ws;
  ws = null;
  if (socket) socket.close();
  setStatus(false, "Disconnected");
}
function sendCommand(cmd) {
  // Serialize a command object to JSON and send it; logs instead of
  // throwing when no connection is open.
  const isOpen = ws && ws.readyState === WebSocket.OPEN;
  if (!isOpen) {
    logLine("sys", "Not connected");
    return;
  }
  ws.send(JSON.stringify(cmd));
  logLine("sys", `${cmd.command}`, cmd);
}
function handleEvent(event) {
  // Dispatch one server event: update transcript/chat panes and manage
  // audio playback around barge-in.
  const type = event.event || "unknown";
  logLine("event", type, event);
  switch (type) {
    case "transcript":
      // Streaming user speech: accumulate interim chunks until isFinal.
      if (event.isFinal && event.text) {
        setInterim("You", "");
        addChat("You", event.text);
      } else if (event.text) {
        interimUserText += event.text;
        setInterim("You", interimUserText);
      }
      break;
    case "llmResponse":
      // Streaming AI reply; same interim/final protocol as transcripts.
      if (event.isFinal && event.text) {
        setInterim("AI", "");
        addChat("AI", event.text);
      } else if (event.text) {
        interimAiText += event.text;
        setInterim("AI", interimAiText);
      }
      break;
    case "trackStart":
      // New bot audio track: flush older audio, then accept the new stream.
      stopPlayback();
      discardAudio = false;
      break;
    case "speaking":
    case "interrupt":
      // User barge-in: drop any in-flight bot audio to avoid overlap.
      stopPlayback();
      break;
  }
}
async function startMic() {
  // Capture microphone audio, downsample it to 16 kHz PCM16, and stream the
  // frames over the open WebSocket. Requires an active connection.
  if (!ws || ws.readyState !== WebSocket.OPEN) {
    logLine("sys", "Connect before starting mic");
    return;
  }
  // Restart cleanly if a capture chain is already running, instead of
  // leaking the previous stream/processor on a second click.
  if (micStream) stopMic();
  await ensureAudioContext();
  const deviceId = inputSelect.value || undefined;
  try {
    micStream = await navigator.mediaDevices.getUserMedia({
      audio: deviceId ? { deviceId: { exact: deviceId } } : true,
    });
  } catch (err) {
    // Surface permission/device failures instead of leaving the rejection
    // unhandled (click handlers do not await this function).
    logLine("sys", "Failed to open microphone", { err: String(err) });
    return;
  }
  micSource = audioCtx.createMediaStreamSource(micStream);
  // NOTE(review): ScriptProcessorNode is deprecated; an AudioWorklet would
  // move processing off the main thread. Kept here for simplicity.
  processor = audioCtx.createScriptProcessor(2048, 1, 1);
  processor.onaudioprocess = (e) => {
    if (!ws || ws.readyState !== WebSocket.OPEN) return;
    const input = e.inputBuffer.getChannelData(0);
    const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
    const pcm16 = floatTo16BitPCM(downsampled);
    ws.send(pcm16.buffer);
  };
  micSource.connect(processor);
  // A ScriptProcessor must be connected to a destination to fire callbacks.
  processor.connect(audioCtx.destination);
  logLine("sys", "Microphone started");
}
function stopMic() {
  // Dismantle the capture chain: script processor, source node, then the
  // underlying media tracks. Null the globals first so callbacks see a
  // consistent "stopped" state.
  const node = processor;
  const source = micSource;
  const stream = micStream;
  processor = null;
  micSource = null;
  micStream = null;
  if (node) node.disconnect();
  if (source) source.disconnect();
  if (stream) stream.getTracks().forEach((track) => track.stop());
  logLine("sys", "Microphone stopped");
}
async function refreshDevices() {
  // Repopulate both device dropdowns from enumerateDevices(). Labels are
  // blank until the user grants microphone permission, so a numbered
  // fallback name is used.
  const devices = await navigator.mediaDevices.enumerateDevices();
  inputSelect.innerHTML = "";
  outputSelect.innerHTML = "";
  for (const device of devices) {
    let select = null;
    let fallback = "";
    if (device.kind === "audioinput") {
      select = inputSelect;
      fallback = `Mic ${inputSelect.length + 1}`;
    } else if (device.kind === "audiooutput") {
      select = outputSelect;
      fallback = `Output ${outputSelect.length + 1}`;
    }
    if (!select) continue;
    const option = document.createElement("option");
    option.value = device.deviceId;
    option.textContent = device.label || fallback;
    select.appendChild(option);
  }
}
async function requestDeviceAccess() {
  // Prime microphone permission so enumerateDevices() returns real labels,
  // then immediately release the temporary stream.
  try {
    const probe = await navigator.mediaDevices.getUserMedia({ audio: true });
    for (const track of probe.getTracks()) track.stop();
    logLine("sys", "Microphone permission granted");
  } catch (err) {
    logLine("sys", "Microphone permission denied", { err: String(err) });
  }
}
async function setOutputDevice(deviceId) {
  // Route playback to the chosen output device via setSinkId.
  // Requires a secure context and browser support (see the page footer).
  if (!audioOut.setSinkId) {
    logLine("sys", "setSinkId not supported in this browser");
    return;
  }
  try {
    await audioOut.setSinkId(deviceId);
    logLine("sys", `Output device set`, { deviceId });
  } catch (err) {
    // setSinkId rejects (e.g. NotAllowedError or a stale deviceId); without
    // this catch the change-handler's promise would reject unhandled.
    logLine("sys", "Failed to set output device", { deviceId, err: String(err) });
  }
}
// --- UI wiring ---
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);
// Request mic permission first so device labels are populated.
refreshDevicesBtn.addEventListener("click", async () => {
  await requestDeviceAccess();
  await refreshDevices();
});
startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);
sendChatBtn.addEventListener("click", () => {
  const text = chatInput.value.trim();
  if (!text) return;
  // User gesture: a good moment to unlock audio playback (autoplay policy).
  ensureAudioContext();
  addChat("You", text);
  sendCommand({ command: "chat", text });
  chatInput.value = "";
});
clearLogBtn.addEventListener("click", () => (logEl.innerHTML = ""));
// Switching input while capturing restarts the mic on the new device.
inputSelect.addEventListener("change", () => {
  if (micStream) {
    stopMic();
    startMic();
  }
});
outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
// Initial population; may yield unlabeled devices before permission — ignore errors.
refreshDevices().catch(() => {});
</script>
</body>
</html>