Compare commits

...

16 Commits

Author SHA1 Message Date
Xin Wang
7be8fda424 Fix microphone talk eou missing and clean chat log 2026-02-06 11:36:39 +08:00
Xin Wang
c8c0e30bc3 Update web client 2026-02-06 11:25:05 +08:00
Xin Wang
960690ba80 Remove invite button, correct stream asr tts transcription 2026-02-06 11:20:52 +08:00
Xin Wang
cb35d87eb4 Update web client 2026-02-06 10:46:24 +08:00
Xin Wang
5c03cf2b1f Update web client layout 2026-02-06 10:34:09 +08:00
Xin Wang
876ca8221c Put web client together 2026-02-06 09:57:45 +08:00
Xin Wang
a8e7c7e2ef Add web client to app server 2026-02-06 09:54:23 +08:00
Xin Wang
9d42f3cca1 Fix list devices on web client 2026-02-06 09:40:52 +08:00
Xin Wang
f81a561e0e Fix indent error 2026-02-06 08:40:42 +08:00
Xin Wang
a70970fee5 Add web client 2026-02-06 08:36:00 +08:00
Xin Wang
e511cf9077 Fix Potential state duplication on barge-in. 2026-02-06 08:30:37 +08:00
Xin Wang
0576231d8d Fix Race risks if process_audio is called concurrently. 2026-02-06 08:26:56 +08:00
Xin Wang
26458faa6c Fix Unbounded _audio_buffer growth. 2026-02-06 08:11:14 +08:00
Xin Wang
605968a639 Fix _stop_current_speech doesn’t cancel LLM/TTS services. 2026-02-06 08:05:33 +08:00
Xin Wang
31d24a7428 Merge branch 'master' of https://gitea.xiaowang.eu.org/wx44wx/py-active-call 2026-02-06 08:00:39 +08:00
Xin Wang
7846e4cebc Fix No cancellation of existing turn on new EOU. 2026-02-06 07:59:31 +08:00
5 changed files with 952 additions and 69 deletions

View File

@@ -4,10 +4,11 @@ import asyncio
import json
import time
import uuid
from pathlib import Path
from typing import Dict, Any, Optional, List
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, FileResponse
from loguru import logger
# Try to import aiortc (optional for WebRTC functionality)
@@ -64,6 +65,7 @@ async def heartbeat_and_timeout_task(
# Initialize FastAPI
app = FastAPI(title="Python Active-Call", version="0.1.0")
_WEB_CLIENT_PATH = Path(__file__).resolve().parent.parent / "examples" / "web_client.html"
# Configure CORS
app.add_middleware(
@@ -99,6 +101,24 @@ async def health_check():
return {"status": "healthy", "sessions": len(active_sessions)}
@app.get("/")
async def web_client_root():
"""Serve the web client."""
if not _WEB_CLIENT_PATH.exists():
raise HTTPException(status_code=404, detail="Web client not found")
return FileResponse(_WEB_CLIENT_PATH)
@app.get("/client")
async def web_client_alias():
"""Alias for the web client."""
if not _WEB_CLIENT_PATH.exists():
raise HTTPException(status_code=404, detail="Web client not found")
return FileResponse(_WEB_CLIENT_PATH)
@app.get("/iceservers")
async def get_ice_servers():
"""Get ICE servers configuration for WebRTC."""

View File

@@ -108,7 +108,10 @@ class DuplexPipeline:
self._is_bot_speaking = False
self._current_turn_task: Optional[asyncio.Task] = None
self._audio_buffer: bytes = b""
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
self._last_vad_status: str = "Silence"
self._process_lock = asyncio.Lock()
# Interruption handling
self._interrupt_event = asyncio.Event()
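
The cap added above is a plain byte count: 16-bit mono PCM is 2 bytes per sample, so the limit is sample_rate * 2 * seconds. A minimal sketch (the 30-second default mirrors the fallback in the diff; 16 kHz is the rate used elsewhere in this changeset):

def max_audio_buffer_bytes(sample_rate: int, max_buffer_seconds: int = 30) -> int:
    # 2 bytes per sample for 16-bit mono PCM
    return int(sample_rate * 2 * max_buffer_seconds)

# 16000 * 2 * 30 = 960,000 bytes, i.e. roughly 0.9 MiB per session
assert max_audio_buffer_bytes(16000) == 960_000
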
@@ -206,71 +209,75 @@ class DuplexPipeline:
return
try:
# 1. Process through VAD
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
async with self._process_lock:
# 1. Process through VAD
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
vad_status = "Silence"
if vad_result:
event_type, probability = vad_result
vad_status = "Speech" if event_type == "speaking" else "Silence"
# Emit VAD event
await self.event_bus.publish(event_type, {
"trackId": self.session_id,
"probability": probability
})
else:
# No state change - keep previous status
vad_status = self._last_vad_status
# Update state based on VAD
if vad_status == "Speech" and self._last_vad_status != "Speech":
await self._on_speech_start()
self._last_vad_status = vad_status
# 2. Check for barge-in (user speaking while bot speaking)
# Filter false interruptions by requiring minimum speech duration
if self._is_bot_speaking:
if vad_status == "Speech":
# User is speaking while bot is speaking
self._barge_in_silence_frames = 0 # Reset silence counter
vad_status = "Silence"
if vad_result:
event_type, probability = vad_result
vad_status = "Speech" if event_type == "speaking" else "Silence"
if self._barge_in_speech_start_time is None:
# Start tracking speech duration
self._barge_in_speech_start_time = time.time()
self._barge_in_speech_frames = 1
logger.debug("Potential barge-in detected, tracking duration...")
else:
self._barge_in_speech_frames += 1
# Check if speech duration exceeds threshold
speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
if speech_duration_ms >= self._barge_in_min_duration_ms:
logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
await self._handle_barge_in()
# Emit VAD event
await self.event_bus.publish(event_type, {
"trackId": self.session_id,
"probability": probability
})
else:
# Silence frame during potential barge-in
if self._barge_in_speech_start_time is not None:
self._barge_in_silence_frames += 1
# Allow brief silence gaps (VAD flickering)
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
# Too much silence - reset barge-in tracking
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
self._barge_in_speech_start_time = None
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# 3. Buffer audio for ASR
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
self._audio_buffer += pcm_bytes
await self.asr_service.send_audio(pcm_bytes)
# No state change - keep previous status
vad_status = self._last_vad_status
# For SiliconFlow ASR, trigger interim transcription periodically
# The service handles timing internally via start_interim_transcription()
# 4. Check for End of Utterance - this triggers LLM response
if self.eou_detector.process(vad_status):
await self._on_end_of_utterance()
# Update state based on VAD
if vad_status == "Speech" and self._last_vad_status != "Speech":
await self._on_speech_start()
self._last_vad_status = vad_status
# 2. Check for barge-in (user speaking while bot speaking)
# Filter false interruptions by requiring minimum speech duration
if self._is_bot_speaking:
if vad_status == "Speech":
# User is speaking while bot is speaking
self._barge_in_silence_frames = 0 # Reset silence counter
if self._barge_in_speech_start_time is None:
# Start tracking speech duration
self._barge_in_speech_start_time = time.time()
self._barge_in_speech_frames = 1
logger.debug("Potential barge-in detected, tracking duration...")
else:
self._barge_in_speech_frames += 1
# Check if speech duration exceeds threshold
speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
if speech_duration_ms >= self._barge_in_min_duration_ms:
logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
await self._handle_barge_in()
else:
# Silence frame during potential barge-in
if self._barge_in_speech_start_time is not None:
self._barge_in_silence_frames += 1
# Allow brief silence gaps (VAD flickering)
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
# Too much silence - reset barge-in tracking
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
self._barge_in_speech_start_time = None
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# 3. Buffer audio for ASR
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
self._audio_buffer += pcm_bytes
if len(self._audio_buffer) > self._max_audio_buffer_bytes:
# Keep only the most recent audio to cap memory usage
self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
await self.asr_service.send_audio(pcm_bytes)
# For SiliconFlow ASR, trigger interim transcription periodically
# The service handles timing internally via start_interim_transcription()
# 4. Check for End of Utterance - this triggers LLM response
if self.eou_detector.process(vad_status):
await self._on_end_of_utterance()
except Exception as e:
logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
@@ -388,6 +395,8 @@ class DuplexPipeline:
self._last_sent_transcript = ""
# Process the turn - trigger LLM response
# Cancel any existing turn to avoid overlapping assistant responses
await self._stop_current_speech()
await self.conversation.end_user_turn(user_text)
self._current_turn_task = asyncio.create_task(self._handle_turn(user_text))
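
The added await self._stop_current_speech() enforces a single in-flight turn: any previous assistant turn is cancelled and drained before the new one is scheduled, so two responses never overlap. The same pattern in isolation (names are illustrative, not the project's):

import asyncio
from typing import Coroutine, Optional

class TurnRunner:
    """Keep at most one turn task alive; cancel the previous one first."""

    def __init__(self) -> None:
        self._task: Optional[asyncio.Task] = None

    async def start_turn(self, coro: Coroutine) -> None:
        if self._task and not self._task.done():
            self._task.cancel()
            try:
                await self._task   # drain so cleanup inside the old task completes
            except asyncio.CancelledError:
                pass
        self._task = asyncio.create_task(coro)
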
@@ -650,8 +659,10 @@ class DuplexPipeline:
if self.llm_service and hasattr(self.llm_service, 'cancel'):
self.llm_service.cancel()
# Interrupt conversation
await self.conversation.interrupt()
# Interrupt conversation only if there is no active turn task.
# When a turn task exists, it will handle end_assistant_turn() to avoid double callbacks.
if not (self._current_turn_task and not self._current_turn_task.done()):
await self.conversation.interrupt()
# Reset for new user turn
await self.conversation.start_user_turn()
@@ -667,6 +678,12 @@ class DuplexPipeline:
await self._current_turn_task
except asyncio.CancelledError:
pass
# Ensure underlying services are cancelled to avoid leaking work/audio
if self.tts_service:
await self.tts_service.cancel()
if self.llm_service and hasattr(self.llm_service, 'cancel'):
self.llm_service.cancel()
self._is_bot_speaking = False
self._interrupt_event.clear()
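
The extra cancellation calls assume the streaming services expose a cancel() that stops emitting work promptly (awaitable on the TTS side, a plain call guarded by hasattr on the LLM side, as the code above suggests). An illustrative shape of that contract, not the project's actual service classes:

import asyncio

class CancellableTTS:
    """Streaming synthesis that stops as soon as cancel() is called."""

    def __init__(self) -> None:
        self._cancelled = asyncio.Event()

    async def synthesize(self, text: str):
        self._cancelled.clear()
        for _ in range(len(text)):            # stand-in for real synthesis chunks
            if self._cancelled.is_set():
                return                        # stop streaming immediately
            yield b"\x00" * 640               # 20 ms of 16 kHz 16-bit mono PCM
            await asyncio.sleep(0)            # yield control between chunks

    async def cancel(self) -> None:
        self._cancelled.set()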

View File

@@ -0,0 +1,96 @@
<svg width="1200" height="620" viewBox="0 0 1200 620" xmlns="http://www.w3.org/2000/svg">
<defs>
<style>
.box { fill:#11131a; stroke:#3a3f4b; stroke-width:1.2; rx:10; ry:10; }
.title { font: 600 14px 'Arial'; fill:#f2f3f7; }
.text { font: 12px 'Arial'; fill:#c8ccd8; }
.arrow { stroke:#7aa2ff; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.arrow2 { stroke:#2dd4bf; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.arrow3 { stroke:#ff6b6b; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.label { font: 11px 'Arial'; fill:#9aa3b2; }
</style>
<marker id="arrow" markerWidth="8" markerHeight="8" refX="7" refY="4" orient="auto">
<path d="M0,0 L8,4 L0,8 Z" fill="#7aa2ff"/>
</marker>
</defs>
<rect x="40" y="40" width="250" height="120" class="box"/>
<text x="60" y="70" class="title">Web Client</text>
<text x="60" y="95" class="text">WS JSON commands</text>
<text x="60" y="115" class="text">WS binary PCM audio</text>
<rect x="350" y="40" width="250" height="120" class="box"/>
<text x="370" y="70" class="title">FastAPI /ws</text>
<text x="370" y="95" class="text">Session + Transport</text>
<rect x="660" y="40" width="250" height="120" class="box"/>
<text x="680" y="70" class="title">DuplexPipeline</text>
<text x="680" y="95" class="text">process_audio / process_text</text>
<rect x="920" y="40" width="240" height="120" class="box"/>
<text x="940" y="70" class="title">ConversationManager</text>
<text x="940" y="95" class="text">turns + state</text>
<rect x="660" y="200" width="180" height="100" class="box"/>
<text x="680" y="230" class="title">VADProcessor</text>
<text x="680" y="255" class="text">speech/silence</text>
<rect x="860" y="200" width="180" height="100" class="box"/>
<text x="880" y="230" class="title">EOU Detector</text>
<text x="880" y="255" class="text">end-of-utterance</text>
<rect x="1060" y="200" width="120" height="100" class="box"/>
<text x="1075" y="230" class="title">ASR</text>
<text x="1075" y="255" class="text">transcripts</text>
<rect x="920" y="350" width="240" height="110" class="box"/>
<text x="940" y="380" class="title">LLM (stream)</text>
<text x="940" y="405" class="text">llmResponse events</text>
<rect x="660" y="350" width="220" height="110" class="box"/>
<text x="680" y="380" class="title">TTS (stream)</text>
<text x="680" y="405" class="text">PCM audio</text>
<rect x="40" y="350" width="250" height="110" class="box"/>
<text x="60" y="380" class="title">Web Client</text>
<text x="60" y="405" class="text">audio playback + UI</text>
<path d="M290 80 L350 80" class="arrow"/>
<text x="300" y="70" class="label">JSON / PCM</text>
<path d="M600 80 L660 80" class="arrow"/>
<text x="615" y="70" class="label">dispatch</text>
<path d="M910 80 L920 80" class="arrow"/>
<text x="880" y="70" class="label">turn mgmt</text>
<path d="M750 160 L750 200" class="arrow"/>
<text x="705" y="190" class="label">audio chunks</text>
<path d="M840 250 L860 250" class="arrow"/>
<text x="835" y="240" class="label">vad status</text>
<path d="M1040 250 L1060 250" class="arrow"/>
<text x="1010" y="240" class="label">audio buffer</text>
<path d="M950 300 L950 350" class="arrow2"/>
<text x="930" y="340" class="label">EOU -> LLM</text>
<path d="M880 405 L920 405" class="arrow2"/>
<text x="870" y="395" class="label">text stream</text>
<path d="M660 405 L290 405" class="arrow2"/>
<text x="430" y="395" class="label">PCM audio</text>
<path d="M660 450 L350 450" class="arrow"/>
<text x="420" y="440" class="label">events: trackStart/End</text>
<path d="M350 450 L290 450" class="arrow"/>
<text x="315" y="440" class="label">UI updates</text>
<path d="M750 200 L750 160" class="arrow3"/>
<text x="700" y="145" class="label">barge-in detection</text>
<path d="M760 170 L920 170" class="arrow3"/>
<text x="820" y="160" class="label">interrupt event + cancel</text>
</svg>
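
The left edge of the diagram is the wire protocol: JSON commands and events travel as text frames, and raw 16-bit / 16 kHz PCM travels as binary frames on the same WebSocket. A minimal Python sketch of that handshake, assuming the third-party websockets package and the default local URL (both placeholders, not part of this changeset):

import asyncio
import json

import websockets

async def demo(url: str = "ws://localhost:8000/ws") -> None:
    async with websockets.connect(url) as ws:
        # Text frame: start a session, matching the web client's invite command
        await ws.send(json.dumps({
            "command": "invite",
            "option": {"codec": "pcm", "sampleRate": 16000},
        }))
        # Binary frame: one 20 ms chunk of silence (320 samples * 2 bytes)
        await ws.send(b"\x00" * 640)
        reply = await ws.recv()
        if isinstance(reply, bytes):
            print(f"received {len(reply)} bytes of PCM audio")
        else:
            print("event:", json.loads(reply))

if __name__ == "__main__":
    asyncio.run(demo())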

examples/web_client.html (new file, 742 lines added)
View File

@@ -0,0 +1,742 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Duplex Voice Web Client</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");
:root {
--bg: #0b0b0f;
--panel: #14141c;
--panel-2: #101018;
--ink: #f2f3f7;
--muted: #a7acba;
--accent: #ff6b6b;
--accent-2: #ffd166;
--good: #2dd4bf;
--bad: #f87171;
--grid: rgba(255, 255, 255, 0.06);
--shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
}
* {
box-sizing: border-box;
}
html,
body {
height: 100%;
margin: 0;
color: var(--ink);
background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
var(--bg);
font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
}
.noise {
position: fixed;
inset: 0;
background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
pointer-events: none;
mix-blend-mode: soft-light;
}
header {
padding: 32px 28px 18px;
border-bottom: 1px solid var(--grid);
}
h1 {
font-family: "Fraunces", serif;
font-weight: 600;
margin: 0 0 6px;
letter-spacing: 0.4px;
}
.subtitle {
color: var(--muted);
font-size: 0.95rem;
}
main {
display: grid;
grid-template-columns: 1.1fr 1.4fr;
gap: 24px;
padding: 24px 28px 40px;
}
.panel {
background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
var(--panel);
border: 1px solid var(--grid);
border-radius: 16px;
padding: 20px;
box-shadow: var(--shadow);
}
.panel h2 {
margin: 0 0 12px;
font-size: 1.05rem;
font-weight: 600;
}
.stack {
display: grid;
gap: 12px;
}
label {
display: block;
font-size: 0.85rem;
color: var(--muted);
margin-bottom: 6px;
}
input,
select,
button,
textarea {
font-family: inherit;
}
input,
select,
textarea {
width: 100%;
padding: 10px 12px;
border-radius: 10px;
border: 1px solid var(--grid);
background: var(--panel-2);
color: var(--ink);
outline: none;
}
textarea {
min-height: 80px;
resize: vertical;
}
.row {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.btn-row {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
button {
border: none;
border-radius: 999px;
padding: 10px 16px;
font-weight: 600;
background: var(--ink);
color: #111;
cursor: pointer;
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
button.secondary {
background: transparent;
color: var(--ink);
border: 1px solid var(--grid);
}
button.accent {
background: linear-gradient(120deg, var(--accent), #f97316);
color: #0b0b0f;
}
button.good {
background: linear-gradient(120deg, var(--good), #22c55e);
color: #07261f;
}
button.bad {
background: linear-gradient(120deg, var(--bad), #f97316);
color: #2a0b0b;
}
button:active {
transform: translateY(1px) scale(0.99);
}
.status {
display: flex;
align-items: center;
gap: 12px;
padding: 12px;
background: rgba(255, 255, 255, 0.03);
border-radius: 12px;
border: 1px dashed var(--grid);
font-size: 0.9rem;
}
.dot {
width: 10px;
height: 10px;
border-radius: 999px;
background: var(--bad);
box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
}
.dot.on {
background: var(--good);
box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
}
.log {
height: 320px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.85rem;
line-height: 1.4;
}
.chat {
height: 260px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.9rem;
line-height: 1.45;
}
.chat-entry {
padding: 8px 10px;
margin-bottom: 8px;
border-radius: 10px;
background: rgba(255, 255, 255, 0.04);
border: 1px solid rgba(255, 255, 255, 0.06);
}
.chat-entry.user {
border-left: 3px solid var(--accent-2);
}
.chat-entry.ai {
border-left: 3px solid var(--good);
}
.chat-entry.interim {
opacity: 0.7;
font-style: italic;
}
.log-entry {
padding: 6px 8px;
border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
}
.log-entry:last-child {
border-bottom: none;
}
.tag {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 2px 8px;
border-radius: 999px;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.6px;
background: rgba(255, 255, 255, 0.08);
color: var(--muted);
}
.tag.event {
background: rgba(255, 107, 107, 0.18);
color: #ffc1c1;
}
.tag.audio {
background: rgba(45, 212, 191, 0.2);
color: #c5f9f0;
}
.tag.sys {
background: rgba(255, 209, 102, 0.2);
color: #ffefb0;
}
.muted {
color: var(--muted);
}
footer {
padding: 0 28px 28px;
color: var(--muted);
font-size: 0.8rem;
}
@media (max-width: 1100px) {
main {
grid-template-columns: 1fr;
}
.log {
height: 360px;
}
.chat {
height: 260px;
}
}
</style>
</head>
<body>
<div class="noise"></div>
<header>
<h1>Duplex Voice Client</h1>
<div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
</header>
<main>
<section class="panel stack">
<h2>Connection</h2>
<div>
<label for="wsUrl">WebSocket URL</label>
<input id="wsUrl" value="ws://localhost:8000/ws" />
</div>
<div class="btn-row">
<button class="accent" id="connectBtn">Connect</button>
<button class="secondary" id="disconnectBtn">Disconnect</button>
</div>
<div class="status">
<div id="statusDot" class="dot"></div>
<div>
<div id="statusText">Disconnected</div>
<div class="muted" id="statusSub">Waiting for connection</div>
</div>
</div>
<h2>Devices</h2>
<div class="row">
<div>
<label for="inputSelect">Input (Mic)</label>
<select id="inputSelect"></select>
</div>
<div>
<label for="outputSelect">Output (Speaker)</label>
<select id="outputSelect"></select>
</div>
</div>
<div class="btn-row">
<button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
<button class="good" id="startMicBtn">Start Mic</button>
<button class="secondary" id="stopMicBtn">Stop Mic</button>
</div>
<h2>Chat</h2>
<div class="stack">
<textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
<div class="btn-row">
<button class="accent" id="sendChatBtn">Send Chat</button>
<button class="secondary" id="clearLogBtn">Clear Log</button>
</div>
</div>
</section>
<section class="stack">
<div class="panel stack">
<h2>Chat History</h2>
<div class="chat" id="chatHistory"></div>
</div>
<div class="panel stack">
<h2>Event Log</h2>
<div class="log" id="log"></div>
</div>
</section>
</main>
<footer>
Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
</footer>
<audio id="audioOut" autoplay></audio>
<script>
const wsUrl = document.getElementById("wsUrl");
const connectBtn = document.getElementById("connectBtn");
const disconnectBtn = document.getElementById("disconnectBtn");
const inputSelect = document.getElementById("inputSelect");
const outputSelect = document.getElementById("outputSelect");
const startMicBtn = document.getElementById("startMicBtn");
const stopMicBtn = document.getElementById("stopMicBtn");
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
const sendChatBtn = document.getElementById("sendChatBtn");
const clearLogBtn = document.getElementById("clearLogBtn");
const chatInput = document.getElementById("chatInput");
const logEl = document.getElementById("log");
const chatHistory = document.getElementById("chatHistory");
const statusDot = document.getElementById("statusDot");
const statusText = document.getElementById("statusText");
const statusSub = document.getElementById("statusSub");
const audioOut = document.getElementById("audioOut");
let ws = null;
let audioCtx = null;
let micStream = null;
let processor = null;
let micSource = null;
let playbackDest = null;
let playbackTime = 0;
let discardAudio = false;
let playbackSources = [];
let interimUserEl = null;
let interimAiEl = null;
let interimUserText = "";
let interimAiText = "";
const targetSampleRate = 16000;
function logLine(type, text, data) {
const time = new Date().toLocaleTimeString();
const entry = document.createElement("div");
entry.className = "log-entry";
const tag = document.createElement("span");
tag.className = `tag ${type}`;
tag.textContent = type.toUpperCase();
const msg = document.createElement("span");
msg.style.marginLeft = "10px";
msg.textContent = `[${time}] ${text}`;
entry.appendChild(tag);
entry.appendChild(msg);
if (data) {
const pre = document.createElement("div");
pre.className = "muted";
pre.textContent = JSON.stringify(data);
pre.style.marginTop = "4px";
entry.appendChild(pre);
}
logEl.appendChild(entry);
logEl.scrollTop = logEl.scrollHeight;
}
function addChat(role, text) {
const entry = document.createElement("div");
entry.className = `chat-entry ${role === "AI" ? "ai" : "user"}`;
entry.textContent = `${role}: ${text}`;
chatHistory.appendChild(entry);
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function setInterim(role, text) {
const isAi = role === "AI";
let el = isAi ? interimAiEl : interimUserEl;
if (!text) {
if (el) el.remove();
if (isAi) interimAiEl = null;
else interimUserEl = null;
if (isAi) interimAiText = "";
else interimUserText = "";
return;
}
if (!el) {
el = document.createElement("div");
el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
chatHistory.appendChild(el);
if (isAi) interimAiEl = el;
else interimUserEl = el;
}
el.textContent = `${role} (interim): ${text}`;
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function stopPlayback() {
discardAudio = true;
playbackTime = audioCtx ? audioCtx.currentTime : 0;
playbackSources.forEach((s) => {
try {
s.stop();
} catch (err) {}
});
playbackSources = [];
}
function setStatus(connected, detail) {
statusDot.classList.toggle("on", connected);
statusText.textContent = connected ? "Connected" : "Disconnected";
statusSub.textContent = detail || "";
}
async function ensureAudioContext() {
if (audioCtx) return;
audioCtx = new (window.AudioContext || window.webkitAudioContext)();
playbackDest = audioCtx.createMediaStreamDestination();
audioOut.srcObject = playbackDest.stream;
try {
await audioOut.play();
} catch (err) {
logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
}
if (outputSelect.value) {
await setOutputDevice(outputSelect.value);
}
}
function downsampleBuffer(buffer, inRate, outRate) {
if (outRate === inRate) return buffer;
const ratio = inRate / outRate;
const newLength = Math.round(buffer.length / ratio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = accum / count;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
function floatTo16BitPCM(float32) {
const out = new Int16Array(float32.length);
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
}
return out;
}
function schedulePlayback(int16Data) {
if (!audioCtx || !playbackDest) return;
if (discardAudio) return;
const float32 = new Float32Array(int16Data.length);
for (let i = 0; i < int16Data.length; i++) {
float32[i] = int16Data[i] / 32768;
}
const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
buffer.copyToChannel(float32, 0);
const source = audioCtx.createBufferSource();
source.buffer = buffer;
source.connect(playbackDest);
const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
source.start(startTime);
playbackTime = startTime + buffer.duration;
playbackSources.push(source);
source.onended = () => {
playbackSources = playbackSources.filter((s) => s !== source);
};
}
async function connect() {
if (ws && ws.readyState === WebSocket.OPEN) return;
ws = new WebSocket(wsUrl.value.trim());
ws.binaryType = "arraybuffer";
ws.onopen = () => {
setStatus(true, "Session open");
logLine("sys", "WebSocket connected");
ensureAudioContext();
sendCommand({ command: "invite", option: { codec: "pcm", sampleRate: targetSampleRate } });
};
ws.onclose = () => {
setStatus(false, "Connection closed");
logLine("sys", "WebSocket closed");
ws = null;
};
ws.onerror = (err) => {
logLine("sys", "WebSocket error", { err: String(err) });
};
ws.onmessage = (msg) => {
if (typeof msg.data === "string") {
const event = JSON.parse(msg.data);
handleEvent(event);
} else {
const audioBuf = msg.data;
const int16 = new Int16Array(audioBuf);
schedulePlayback(int16);
logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
}
};
}
function disconnect() {
if (ws) ws.close();
ws = null;
setStatus(false, "Disconnected");
}
function sendCommand(cmd) {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Not connected");
return;
}
ws.send(JSON.stringify(cmd));
logLine("sys", `${cmd.command}`, cmd);
}
function handleEvent(event) {
const type = event.event || "unknown";
logLine("event", type, event);
if (type === "transcript") {
if (event.isFinal && event.text) {
setInterim("You", "");
addChat("You", event.text);
} else if (event.text) {
interimUserText += event.text;
setInterim("You", interimUserText);
}
}
if (type === "llmResponse") {
if (event.isFinal && event.text) {
setInterim("AI", "");
addChat("AI", event.text);
} else if (event.text) {
interimAiText += event.text;
setInterim("AI", interimAiText);
}
}
if (type === "trackStart") {
// New bot audio: stop any previous playback to avoid overlap
stopPlayback();
discardAudio = false;
}
if (type === "speaking") {
// User started speaking: clear any in-flight audio to avoid overlap
stopPlayback();
}
if (type === "interrupt") {
stopPlayback();
}
}
async function startMic() {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Connect before starting mic");
return;
}
await ensureAudioContext();
const deviceId = inputSelect.value || undefined;
micStream = await navigator.mediaDevices.getUserMedia({
audio: deviceId ? { deviceId: { exact: deviceId } } : true,
});
micSource = audioCtx.createMediaStreamSource(micStream);
processor = audioCtx.createScriptProcessor(2048, 1, 1);
processor.onaudioprocess = (e) => {
if (!ws || ws.readyState !== WebSocket.OPEN) return;
const input = e.inputBuffer.getChannelData(0);
const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
const pcm16 = floatTo16BitPCM(downsampled);
ws.send(pcm16.buffer);
};
micSource.connect(processor);
processor.connect(audioCtx.destination);
logLine("sys", "Microphone started");
}
function stopMic() {
if (processor) {
processor.disconnect();
processor = null;
}
if (micSource) {
micSource.disconnect();
micSource = null;
}
if (micStream) {
micStream.getTracks().forEach((t) => t.stop());
micStream = null;
}
logLine("sys", "Microphone stopped");
}
async function refreshDevices() {
const devices = await navigator.mediaDevices.enumerateDevices();
inputSelect.innerHTML = "";
outputSelect.innerHTML = "";
devices.forEach((d) => {
if (d.kind === "audioinput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Mic ${inputSelect.length + 1}`;
inputSelect.appendChild(opt);
}
if (d.kind === "audiooutput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Output ${outputSelect.length + 1}`;
outputSelect.appendChild(opt);
}
});
}
async function requestDeviceAccess() {
// Needed to reveal device labels in most browsers
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getTracks().forEach((t) => t.stop());
logLine("sys", "Microphone permission granted");
} catch (err) {
logLine("sys", "Microphone permission denied", { err: String(err) });
}
}
async function setOutputDevice(deviceId) {
if (!audioOut.setSinkId) {
logLine("sys", "setSinkId not supported in this browser");
return;
}
await audioOut.setSinkId(deviceId);
logLine("sys", `Output device set`, { deviceId });
}
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);
refreshDevicesBtn.addEventListener("click", async () => {
await requestDeviceAccess();
await refreshDevices();
});
startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);
sendChatBtn.addEventListener("click", () => {
const text = chatInput.value.trim();
if (!text) return;
ensureAudioContext();
addChat("You", text);
sendCommand({ command: "chat", text });
chatInput.value = "";
});
clearLogBtn.addEventListener("click", () => {
logEl.innerHTML = "";
chatHistory.innerHTML = "";
setInterim("You", "");
setInterim("AI", "");
interimUserText = "";
interimAiText = "";
});
inputSelect.addEventListener("change", () => {
if (micStream) {
stopMic();
startMic();
}
});
outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
refreshDevices().catch(() => {});
</script>
</body>
</html>
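
Script-side senders need the same framing the client produces above (average-pool down to 16 kHz, clamp to [-1, 1], scale to int16). A NumPy equivalent of downsampleBuffer + floatTo16BitPCM, offered as a sketch rather than a drop-in for examples/mic_client.py:

import numpy as np

def float_to_pcm16(samples: np.ndarray, in_rate: int, out_rate: int = 16000) -> bytes:
    """Downsample float32 audio in [-1, 1] and encode as little-endian int16."""
    if in_rate != out_rate:
        ratio = in_rate / out_rate
        n_out = int(round(len(samples) / ratio))
        edges = (np.arange(n_out + 1) * ratio).round().astype(int)
        # Average each input window onto one output sample, mirroring the
        # script-processor code in the client.
        samples = np.array(
            [samples[a:b].mean() if b > a else 0.0 for a, b in zip(edges[:-1], edges[1:])],
            dtype=np.float32,
        )
    clipped = np.clip(samples, -1.0, 1.0)
    pcm = np.where(clipped < 0, clipped * 0x8000, clipped * 0x7FFF).astype(np.int16)
    return pcm.tobytes()

# 10 ms of a 440 Hz tone at 48 kHz becomes 160 samples (320 bytes) at 16 kHz
tone = np.sin(2 * np.pi * 440 * np.arange(480) / 48000).astype(np.float32)
assert len(float_to_pcm16(tone, 48000)) == 320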

View File

@@ -63,6 +63,7 @@ class SileroVAD:
self.min_chunk_size = 512
self.last_label = "Silence"
self.last_probability = 0.0
self._energy_noise_floor = 1e-4
def _reset_state(self):
# Silero VAD V4+ expects state shape [2, 1, 128]
@@ -81,8 +82,7 @@ class SileroVAD:
Tuple of (label, probability) where label is "Speech" or "Silence"
"""
if self.session is None or not ONNX_AVAILABLE:
# Fallback energy-based VAD when model isn't available.
# Map RMS energy to a pseudo-probability so the existing threshold works.
# Fallback energy-based VAD with adaptive noise floor.
if not pcm_bytes:
return "Silence", 0.0
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
@@ -90,9 +90,17 @@ class SileroVAD:
return "Silence", 0.0
audio_float = audio_int16.astype(np.float32) / 32768.0
rms = float(np.sqrt(np.mean(audio_float * audio_float)))
# Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
# Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
probability = min(1.0, rms / 0.05)
# Update adaptive noise floor (slowly rises, faster to fall)
if rms < self._energy_noise_floor:
self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
else:
self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
# Compute SNR-like ratio and map to probability
denom = max(self._energy_noise_floor, 1e-6)
snr = max(0.0, (rms - denom) / denom)
probability = min(1.0, snr / 3.0) # ~3x above noise => strong speech
label = "Speech" if probability >= 0.5 else "Silence"
return label, probability
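
For experiments outside the pipeline, the fallback can be lifted into a standalone class. The EMA coefficients and the "~3x above the noise floor" mapping come from the diff; the packaging around them is a sketch:

import numpy as np

class EnergyVAD:
    """Adaptive-noise-floor energy VAD, mirroring the fallback path above."""

    def __init__(self, initial_floor: float = 1e-4) -> None:
        self._noise_floor = initial_floor

    def process(self, pcm_bytes: bytes) -> tuple[str, float]:
        if not pcm_bytes:
            return "Silence", 0.0
        audio = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        if audio.size == 0:
            return "Silence", 0.0
        rms = float(np.sqrt(np.mean(audio * audio)))
        # The floor follows quiet frames quickly and loud frames slowly,
        # so sustained speech does not drag it upward.
        if rms < self._noise_floor:
            self._noise_floor = 0.95 * self._noise_floor + 0.05 * rms
        else:
            self._noise_floor = 0.995 * self._noise_floor + 0.005 * rms
        denom = max(self._noise_floor, 1e-6)
        snr = max(0.0, (rms - denom) / denom)
        probability = min(1.0, snr / 3.0)   # ~3x above the floor => strong speech
        return ("Speech" if probability >= 0.5 else "Silence"), probability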