Compare commits

20 commits: `9954e8d18f...master`

| Author | SHA1 | Date |
|---|---|---|
| | 7be8fda424 | |
| | c8c0e30bc3 | |
| | 960690ba80 | |
| | cb35d87eb4 | |
| | 5c03cf2b1f | |
| | 876ca8221c | |
| | a8e7c7e2ef | |
| | 9d42f3cca1 | |
| | f81a561e0e | |
| | a70970fee5 | |
| | e511cf9077 | |
| | 0576231d8d | |
| | 26458faa6c | |
| | 605968a639 | |
| | 31d24a7428 | |
| | 7846e4cebc | |
| | d9dc14d03a | |
| | 294a3e405c | |
| | 6831f5316c | |
| | 65128b0eb0 | |
app/main.py (22 changed lines)
```diff
@@ -4,10 +4,11 @@ import asyncio
 import json
 import time
+import uuid
 from pathlib import Path
 from typing import Dict, Any, Optional, List
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, FileResponse
 from loguru import logger

 # Try to import aiortc (optional for WebRTC functionality)
@@ -64,6 +65,7 @@ async def heartbeat_and_timeout_task(

 # Initialize FastAPI
 app = FastAPI(title="Python Active-Call", version="0.1.0")
+_WEB_CLIENT_PATH = Path(__file__).resolve().parent.parent / "examples" / "web_client.html"

 # Configure CORS
 app.add_middleware(
@@ -99,6 +101,24 @@ async def health_check():
     return {"status": "healthy", "sessions": len(active_sessions)}


+@app.get("/")
+async def web_client_root():
+    """Serve the web client."""
+    if not _WEB_CLIENT_PATH.exists():
+        raise HTTPException(status_code=404, detail="Web client not found")
+    return FileResponse(_WEB_CLIENT_PATH)
+
+
+@app.get("/client")
+async def web_client_alias():
+    """Alias for the web client."""
+    if not _WEB_CLIENT_PATH.exists():
+        raise HTTPException(status_code=404, detail="Web client not found")
+    return FileResponse(_WEB_CLIENT_PATH)
+
+
+
+
 @app.get("/iceservers")
 async def get_ice_servers():
     """Get ICE servers configuration for WebRTC."""
```
```diff
@@ -108,7 +108,10 @@ class DuplexPipeline:
         self._is_bot_speaking = False
         self._current_turn_task: Optional[asyncio.Task] = None
         self._audio_buffer: bytes = b""
+        max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
+        self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
         self._last_vad_status: str = "Silence"
+        self._process_lock = asyncio.Lock()

         # Interruption handling
         self._interrupt_event = asyncio.Event()
```
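The cap is 2 bytes per sample (16-bit PCM) times the session sample rate. A quick check of the arithmetic at the 16 kHz rate the web client uses:

```python
# Sketch of the buffer-cap arithmetic: 16-bit mono PCM is 2 bytes per sample.
sample_rate = 16000          # Hz, matching the web client's targetSampleRate
max_buffer_seconds = 30      # default when settings.max_audio_buffer_seconds is absent
max_bytes = sample_rate * 2 * max_buffer_seconds
assert max_bytes == 960_000  # under 1 MiB of buffered audio per session
```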
```diff
@@ -206,71 +209,75 @@ class DuplexPipeline:
             return

         try:
-            # 1. Process through VAD
-            vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
+            async with self._process_lock:
+                # 1. Process through VAD
+                vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)

                 vad_status = "Silence"
                 if vad_result:
                     event_type, probability = vad_result
                     vad_status = "Speech" if event_type == "speaking" else "Silence"

                     # Emit VAD event
                     await self.event_bus.publish(event_type, {
                         "trackId": self.session_id,
                         "probability": probability
                     })
                 else:
                     # No state change - keep previous status
                     vad_status = self._last_vad_status

                 # Update state based on VAD
                 if vad_status == "Speech" and self._last_vad_status != "Speech":
                     await self._on_speech_start()

                 self._last_vad_status = vad_status

                 # 2. Check for barge-in (user speaking while bot speaking)
                 # Filter false interruptions by requiring minimum speech duration
                 if self._is_bot_speaking:
                     if vad_status == "Speech":
                         # User is speaking while bot is speaking
                         self._barge_in_silence_frames = 0  # Reset silence counter

                         if self._barge_in_speech_start_time is None:
                             # Start tracking speech duration
                             self._barge_in_speech_start_time = time.time()
                             self._barge_in_speech_frames = 1
                             logger.debug("Potential barge-in detected, tracking duration...")
                         else:
                             self._barge_in_speech_frames += 1
                             # Check if speech duration exceeds threshold
                             speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
                             if speech_duration_ms >= self._barge_in_min_duration_ms:
                                 logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
                                 await self._handle_barge_in()
                     else:
                         # Silence frame during potential barge-in
                         if self._barge_in_speech_start_time is not None:
                             self._barge_in_silence_frames += 1
                             # Allow brief silence gaps (VAD flickering)
                             if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
                                 # Too much silence - reset barge-in tracking
                                 logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
                                 self._barge_in_speech_start_time = None
                                 self._barge_in_speech_frames = 0
                                 self._barge_in_silence_frames = 0

                 # 3. Buffer audio for ASR
                 if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
                     self._audio_buffer += pcm_bytes
+                    if len(self._audio_buffer) > self._max_audio_buffer_bytes:
+                        # Keep only the most recent audio to cap memory usage
+                        self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
                     await self.asr_service.send_audio(pcm_bytes)

                 # For SiliconFlow ASR, trigger interim transcription periodically
                 # The service handles timing internally via start_interim_transcription()

                 # 4. Check for End of Utterance - this triggers LLM response
                 if self.eou_detector.process(vad_status):
                     await self._on_end_of_utterance()

         except Exception as e:
             logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
```
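The barge-in filter is a debounce over VAD frames: speech must persist for a minimum duration, and brief silence gaps are tolerated before the tracking resets. A standalone sketch of the same logic (class and parameter names here are illustrative, mirroring the pipeline's `_barge_in_min_duration_ms` and `_barge_in_silence_tolerance` fields):

```python
import time
from typing import Optional


class BargeInDetector:
    """Debounces VAD flicker: confirm a barge-in only after sustained speech."""

    def __init__(self, min_duration_ms: float = 300, silence_tolerance: int = 3):
        self.min_duration_ms = min_duration_ms      # required sustained speech
        self.silence_tolerance = silence_tolerance  # silence frames allowed inside it
        self._start: Optional[float] = None
        self._silence_frames = 0

    def feed(self, is_speech: bool) -> bool:
        """Feed one VAD frame; returns True when a barge-in is confirmed."""
        if is_speech:
            self._silence_frames = 0
            if self._start is None:
                self._start = time.time()
            elif (time.time() - self._start) * 1000 >= self.min_duration_ms:
                self._start = None
                return True
        elif self._start is not None:
            self._silence_frames += 1
            if self._silence_frames > self.silence_tolerance:
                self._start = None          # too much silence: reset tracking
                self._silence_frames = 0
        return False
```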
```diff
@@ -388,6 +395,8 @@ class DuplexPipeline:
         self._last_sent_transcript = ""

         # Process the turn - trigger LLM response
+        # Cancel any existing turn to avoid overlapping assistant responses
+        await self._stop_current_speech()
         await self.conversation.end_user_turn(user_text)
         self._current_turn_task = asyncio.create_task(self._handle_turn(user_text))

```
```diff
@@ -415,7 +424,7 @@ class DuplexPipeline:

         # Sentence buffer for streaming TTS
         sentence_buffer = ""
-        sentence_ends = {'.', '!', '?', '。', '!', '?', ';', '\n'}
+        sentence_ends = {',', '。', '!', '?', '\n'}
         first_audio_sent = False

         # Stream LLM response and TTS sentence by sentence
```
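With the delimiter set narrowed to fullwidth punctuation plus newline, the buffer flushes on CJK clause boundaries rather than ASCII sentence ends, so TTS chunks start earlier on Chinese output. A sketch of the flush loop this buffer implies (function name illustrative):

```python
# Sketch of the sentence-buffer flush used for streaming TTS:
# accumulate LLM tokens and emit a chunk whenever a delimiter arrives.
SENTENCE_ENDS = {',', '。', '!', '?', '\n'}


def split_for_tts(token_stream):
    buffer = ""
    for token in token_stream:
        for ch in token:
            buffer += ch
            if ch in SENTENCE_ENDS and buffer.strip():
                yield buffer.strip()
                buffer = ""
    if buffer.strip():
        yield buffer.strip()


# Two TTS chunks: flushed at the fullwidth comma and at the period.
assert list(split_for_tts(["你好,", "今天天气", "不错。"])) == ["你好,", "今天天气不错。"]
```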
```diff
@@ -650,8 +659,10 @@ class DuplexPipeline:
         if self.llm_service and hasattr(self.llm_service, 'cancel'):
             self.llm_service.cancel()

-        # Interrupt conversation
-        await self.conversation.interrupt()
+        # Interrupt conversation only if there is no active turn task.
+        # When a turn task exists, it will handle end_assistant_turn() to avoid double callbacks.
+        if not (self._current_turn_task and not self._current_turn_task.done()):
+            await self.conversation.interrupt()

         # Reset for new user turn
         await self.conversation.start_user_turn()
```
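The guard is the stock asyncio liveness check on a task handle; a tiny sketch of the same predicate (helper name hypothetical):

```python
import asyncio
from typing import Optional


def has_active_turn(task: Optional[asyncio.Task]) -> bool:
    """True while a turn task exists and has not finished or been cancelled."""
    return task is not None and not task.done()

# Only interrupt the conversation when no live turn task will call
# end_assistant_turn() itself, which would otherwise fire callbacks twice.
```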
```diff
@@ -667,6 +678,12 @@ class DuplexPipeline:
                 await self._current_turn_task
             except asyncio.CancelledError:
                 pass

+        # Ensure underlying services are cancelled to avoid leaking work/audio
+        if self.tts_service:
+            await self.tts_service.cancel()
+        if self.llm_service and hasattr(self.llm_service, 'cancel'):
+            self.llm_service.cancel()
+
         self._is_bot_speaking = False
         self._interrupt_event.clear()
```
docs/duplex_interaction.svg (new file, 96 lines)
```svg
<svg width="1200" height="620" viewBox="0 0 1200 620" xmlns="http://www.w3.org/2000/svg">
  <defs>
    <style>
      .box { fill:#11131a; stroke:#3a3f4b; stroke-width:1.2; rx:10; ry:10; }
      .title { font: 600 14px 'Arial'; fill:#f2f3f7; }
      .text { font: 12px 'Arial'; fill:#c8ccd8; }
      .arrow { stroke:#7aa2ff; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
      .arrow2 { stroke:#2dd4bf; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
      .arrow3 { stroke:#ff6b6b; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
      .label { font: 11px 'Arial'; fill:#9aa3b2; }
    </style>
    <marker id="arrow" markerWidth="8" markerHeight="8" refX="7" refY="4" orient="auto">
      <path d="M0,0 L8,4 L0,8 Z" fill="#7aa2ff"/>
    </marker>
  </defs>

  <rect x="40" y="40" width="250" height="120" class="box"/>
  <text x="60" y="70" class="title">Web Client</text>
  <text x="60" y="95" class="text">WS JSON commands</text>
  <text x="60" y="115" class="text">WS binary PCM audio</text>

  <rect x="350" y="40" width="250" height="120" class="box"/>
  <text x="370" y="70" class="title">FastAPI /ws</text>
  <text x="370" y="95" class="text">Session + Transport</text>

  <rect x="660" y="40" width="250" height="120" class="box"/>
  <text x="680" y="70" class="title">DuplexPipeline</text>
  <text x="680" y="95" class="text">process_audio / process_text</text>

  <rect x="920" y="40" width="240" height="120" class="box"/>
  <text x="940" y="70" class="title">ConversationManager</text>
  <text x="940" y="95" class="text">turns + state</text>

  <rect x="660" y="200" width="180" height="100" class="box"/>
  <text x="680" y="230" class="title">VADProcessor</text>
  <text x="680" y="255" class="text">speech/silence</text>

  <rect x="860" y="200" width="180" height="100" class="box"/>
  <text x="880" y="230" class="title">EOU Detector</text>
  <text x="880" y="255" class="text">end-of-utterance</text>

  <rect x="1060" y="200" width="120" height="100" class="box"/>
  <text x="1075" y="230" class="title">ASR</text>
  <text x="1075" y="255" class="text">transcripts</text>

  <rect x="920" y="350" width="240" height="110" class="box"/>
  <text x="940" y="380" class="title">LLM (stream)</text>
  <text x="940" y="405" class="text">llmResponse events</text>

  <rect x="660" y="350" width="220" height="110" class="box"/>
  <text x="680" y="380" class="title">TTS (stream)</text>
  <text x="680" y="405" class="text">PCM audio</text>

  <rect x="40" y="350" width="250" height="110" class="box"/>
  <text x="60" y="380" class="title">Web Client</text>
  <text x="60" y="405" class="text">audio playback + UI</text>

  <path d="M290 80 L350 80" class="arrow"/>
  <text x="300" y="70" class="label">JSON / PCM</text>

  <path d="M600 80 L660 80" class="arrow"/>
  <text x="615" y="70" class="label">dispatch</text>

  <path d="M910 80 L920 80" class="arrow"/>
  <text x="880" y="70" class="label">turn mgmt</text>

  <path d="M750 160 L750 200" class="arrow"/>
  <text x="705" y="190" class="label">audio chunks</text>

  <path d="M840 250 L860 250" class="arrow"/>
  <text x="835" y="240" class="label">vad status</text>

  <path d="M1040 250 L1060 250" class="arrow"/>
  <text x="1010" y="240" class="label">audio buffer</text>

  <path d="M950 300 L950 350" class="arrow2"/>
  <text x="930" y="340" class="label">EOU -> LLM</text>

  <path d="M880 405 L920 405" class="arrow2"/>
  <text x="870" y="395" class="label">text stream</text>

  <path d="M660 405 L290 405" class="arrow2"/>
  <text x="430" y="395" class="label">PCM audio</text>

  <path d="M660 450 L350 450" class="arrow"/>
  <text x="420" y="440" class="label">events: trackStart/End</text>

  <path d="M350 450 L290 450" class="arrow"/>
  <text x="315" y="440" class="label">UI updates</text>

  <path d="M750 200 L750 160" class="arrow3"/>
  <text x="700" y="145" class="label">barge-in detection</text>

  <path d="M760 170 L920 170" class="arrow3"/>
  <text x="820" y="160" class="label">interrupt event + cancel</text>
</svg>
```
examples/web_client.html (new file, 742 lines)
```html
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Duplex Voice Web Client</title>
  <style>
    @import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");

    :root {
      --bg: #0b0b0f;
      --panel: #14141c;
      --panel-2: #101018;
      --ink: #f2f3f7;
      --muted: #a7acba;
      --accent: #ff6b6b;
      --accent-2: #ffd166;
      --good: #2dd4bf;
      --bad: #f87171;
      --grid: rgba(255, 255, 255, 0.06);
      --shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
    }

    * {
      box-sizing: border-box;
    }

    html,
    body {
      height: 100%;
      margin: 0;
      color: var(--ink);
      background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
        radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
        var(--bg);
      font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
    }

    .noise {
      position: fixed;
      inset: 0;
      background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
      pointer-events: none;
      mix-blend-mode: soft-light;
    }

    header {
      padding: 32px 28px 18px;
      border-bottom: 1px solid var(--grid);
    }

    h1 {
      font-family: "Fraunces", serif;
      font-weight: 600;
      margin: 0 0 6px;
      letter-spacing: 0.4px;
    }

    .subtitle {
      color: var(--muted);
      font-size: 0.95rem;
    }

    main {
      display: grid;
      grid-template-columns: 1.1fr 1.4fr;
      gap: 24px;
      padding: 24px 28px 40px;
    }

    .panel {
      background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
        var(--panel);
      border: 1px solid var(--grid);
      border-radius: 16px;
      padding: 20px;
      box-shadow: var(--shadow);
    }

    .panel h2 {
      margin: 0 0 12px;
      font-size: 1.05rem;
      font-weight: 600;
    }

    .stack {
      display: grid;
      gap: 12px;
    }

    label {
      display: block;
      font-size: 0.85rem;
      color: var(--muted);
      margin-bottom: 6px;
    }

    input,
    select,
    button,
    textarea {
      font-family: inherit;
    }

    input,
    select,
    textarea {
      width: 100%;
      padding: 10px 12px;
      border-radius: 10px;
      border: 1px solid var(--grid);
      background: var(--panel-2);
      color: var(--ink);
      outline: none;
    }

    textarea {
      min-height: 80px;
      resize: vertical;
    }

    .row {
      display: grid;
      grid-template-columns: 1fr 1fr;
      gap: 12px;
    }

    .btn-row {
      display: flex;
      flex-wrap: wrap;
      gap: 10px;
    }

    button {
      border: none;
      border-radius: 999px;
      padding: 10px 16px;
      font-weight: 600;
      background: var(--ink);
      color: #111;
      cursor: pointer;
      transition: transform 0.2s ease, box-shadow 0.2s ease;
    }

    button.secondary {
      background: transparent;
      color: var(--ink);
      border: 1px solid var(--grid);
    }

    button.accent {
      background: linear-gradient(120deg, var(--accent), #f97316);
      color: #0b0b0f;
    }

    button.good {
      background: linear-gradient(120deg, var(--good), #22c55e);
      color: #07261f;
    }

    button.bad {
      background: linear-gradient(120deg, var(--bad), #f97316);
      color: #2a0b0b;
    }

    button:active {
      transform: translateY(1px) scale(0.99);
    }

    .status {
      display: flex;
      align-items: center;
      gap: 12px;
      padding: 12px;
      background: rgba(255, 255, 255, 0.03);
      border-radius: 12px;
      border: 1px dashed var(--grid);
      font-size: 0.9rem;
    }

    .dot {
      width: 10px;
      height: 10px;
      border-radius: 999px;
      background: var(--bad);
      box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
    }

    .dot.on {
      background: var(--good);
      box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
    }

    .log {
      height: 320px;
      overflow: auto;
      padding: 12px;
      background: #0d0d14;
      border-radius: 12px;
      border: 1px solid var(--grid);
      font-size: 0.85rem;
      line-height: 1.4;
    }

    .chat {
      height: 260px;
      overflow: auto;
      padding: 12px;
      background: #0d0d14;
      border-radius: 12px;
      border: 1px solid var(--grid);
      font-size: 0.9rem;
      line-height: 1.45;
    }

    .chat-entry {
      padding: 8px 10px;
      margin-bottom: 8px;
      border-radius: 10px;
      background: rgba(255, 255, 255, 0.04);
      border: 1px solid rgba(255, 255, 255, 0.06);
    }

    .chat-entry.user {
      border-left: 3px solid var(--accent-2);
    }

    .chat-entry.ai {
      border-left: 3px solid var(--good);
    }

    .chat-entry.interim {
      opacity: 0.7;
      font-style: italic;
    }

    .log-entry {
      padding: 6px 8px;
      border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
    }

    .log-entry:last-child {
      border-bottom: none;
    }

    .tag {
      display: inline-flex;
      align-items: center;
      gap: 6px;
      padding: 2px 8px;
      border-radius: 999px;
      font-size: 0.7rem;
      text-transform: uppercase;
      letter-spacing: 0.6px;
      background: rgba(255, 255, 255, 0.08);
      color: var(--muted);
    }

    .tag.event {
      background: rgba(255, 107, 107, 0.18);
      color: #ffc1c1;
    }

    .tag.audio {
      background: rgba(45, 212, 191, 0.2);
      color: #c5f9f0;
    }

    .tag.sys {
      background: rgba(255, 209, 102, 0.2);
      color: #ffefb0;
    }

    .muted {
      color: var(--muted);
    }

    footer {
      padding: 0 28px 28px;
      color: var(--muted);
      font-size: 0.8rem;
    }

    @media (max-width: 1100px) {
      main {
        grid-template-columns: 1fr;
      }
      .log {
        height: 360px;
      }
      .chat {
        height: 260px;
      }
    }
  </style>
</head>
<body>
  <div class="noise"></div>
  <header>
    <h1>Duplex Voice Client</h1>
    <div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
  </header>

  <main>
    <section class="panel stack">
      <h2>Connection</h2>
      <div>
        <label for="wsUrl">WebSocket URL</label>
        <input id="wsUrl" value="ws://localhost:8000/ws" />
      </div>
      <div class="btn-row">
        <button class="accent" id="connectBtn">Connect</button>
        <button class="secondary" id="disconnectBtn">Disconnect</button>
      </div>
      <div class="status">
        <div id="statusDot" class="dot"></div>
        <div>
          <div id="statusText">Disconnected</div>
          <div class="muted" id="statusSub">Waiting for connection</div>
        </div>
      </div>

      <h2>Devices</h2>
      <div class="row">
        <div>
          <label for="inputSelect">Input (Mic)</label>
          <select id="inputSelect"></select>
        </div>
        <div>
          <label for="outputSelect">Output (Speaker)</label>
          <select id="outputSelect"></select>
        </div>
      </div>
      <div class="btn-row">
        <button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
        <button class="good" id="startMicBtn">Start Mic</button>
        <button class="secondary" id="stopMicBtn">Stop Mic</button>
      </div>

      <h2>Chat</h2>
      <div class="stack">
        <textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
        <div class="btn-row">
          <button class="accent" id="sendChatBtn">Send Chat</button>
          <button class="secondary" id="clearLogBtn">Clear Log</button>
        </div>
      </div>
    </section>

    <section class="stack">
      <div class="panel stack">
        <h2>Chat History</h2>
        <div class="chat" id="chatHistory"></div>
      </div>
      <div class="panel stack">
        <h2>Event Log</h2>
        <div class="log" id="log"></div>
      </div>
    </section>
  </main>

  <footer>
    Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
    Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
  </footer>

  <audio id="audioOut" autoplay></audio>

  <script>
    const wsUrl = document.getElementById("wsUrl");
    const connectBtn = document.getElementById("connectBtn");
    const disconnectBtn = document.getElementById("disconnectBtn");
    const inputSelect = document.getElementById("inputSelect");
    const outputSelect = document.getElementById("outputSelect");
    const startMicBtn = document.getElementById("startMicBtn");
    const stopMicBtn = document.getElementById("stopMicBtn");
    const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
    const sendChatBtn = document.getElementById("sendChatBtn");
    const clearLogBtn = document.getElementById("clearLogBtn");
    const chatInput = document.getElementById("chatInput");
    const logEl = document.getElementById("log");
    const chatHistory = document.getElementById("chatHistory");
    const statusDot = document.getElementById("statusDot");
    const statusText = document.getElementById("statusText");
    const statusSub = document.getElementById("statusSub");
    const audioOut = document.getElementById("audioOut");

    let ws = null;
    let audioCtx = null;
    let micStream = null;
    let processor = null;
    let micSource = null;
    let playbackDest = null;
    let playbackTime = 0;
    let discardAudio = false;
    let playbackSources = [];
    let interimUserEl = null;
    let interimAiEl = null;
    let interimUserText = "";
    let interimAiText = "";

    const targetSampleRate = 16000;

    function logLine(type, text, data) {
      const time = new Date().toLocaleTimeString();
      const entry = document.createElement("div");
      entry.className = "log-entry";
      const tag = document.createElement("span");
      tag.className = `tag ${type}`;
      tag.textContent = type.toUpperCase();
      const msg = document.createElement("span");
      msg.style.marginLeft = "10px";
      msg.textContent = `[${time}] ${text}`;
      entry.appendChild(tag);
      entry.appendChild(msg);
      if (data) {
        const pre = document.createElement("div");
        pre.className = "muted";
        pre.textContent = JSON.stringify(data);
        pre.style.marginTop = "4px";
        entry.appendChild(pre);
      }
      logEl.appendChild(entry);
      logEl.scrollTop = logEl.scrollHeight;
    }

    function addChat(role, text) {
      const entry = document.createElement("div");
      entry.className = `chat-entry ${role === "AI" ? "ai" : "user"}`;
      entry.textContent = `${role}: ${text}`;
      chatHistory.appendChild(entry);
      chatHistory.scrollTop = chatHistory.scrollHeight;
    }

    function setInterim(role, text) {
      const isAi = role === "AI";
      let el = isAi ? interimAiEl : interimUserEl;
      if (!text) {
        if (el) el.remove();
        if (isAi) interimAiEl = null;
        else interimUserEl = null;
        if (isAi) interimAiText = "";
        else interimUserText = "";
        return;
      }
      if (!el) {
        el = document.createElement("div");
        el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
        chatHistory.appendChild(el);
        if (isAi) interimAiEl = el;
        else interimUserEl = el;
      }
      el.textContent = `${role} (interim): ${text}`;
      chatHistory.scrollTop = chatHistory.scrollHeight;
    }

    function stopPlayback() {
      discardAudio = true;
      playbackTime = audioCtx ? audioCtx.currentTime : 0;
      playbackSources.forEach((s) => {
        try {
          s.stop();
        } catch (err) {}
      });
      playbackSources = [];
    }

    function setStatus(connected, detail) {
      statusDot.classList.toggle("on", connected);
      statusText.textContent = connected ? "Connected" : "Disconnected";
      statusSub.textContent = detail || "";
    }

    async function ensureAudioContext() {
      if (audioCtx) return;
      audioCtx = new (window.AudioContext || window.webkitAudioContext)();
      playbackDest = audioCtx.createMediaStreamDestination();
      audioOut.srcObject = playbackDest.stream;
      try {
        await audioOut.play();
      } catch (err) {
        logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
      }
      if (outputSelect.value) {
        await setOutputDevice(outputSelect.value);
      }
    }

    function downsampleBuffer(buffer, inRate, outRate) {
      if (outRate === inRate) return buffer;
      const ratio = inRate / outRate;
      const newLength = Math.round(buffer.length / ratio);
      const result = new Float32Array(newLength);
      let offsetResult = 0;
      let offsetBuffer = 0;
      while (offsetResult < result.length) {
        const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
        let accum = 0;
        let count = 0;
        for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
          accum += buffer[i];
          count++;
        }
        result[offsetResult] = accum / count;
        offsetResult++;
        offsetBuffer = nextOffsetBuffer;
      }
      return result;
    }

    function floatTo16BitPCM(float32) {
      const out = new Int16Array(float32.length);
      for (let i = 0; i < float32.length; i++) {
        const s = Math.max(-1, Math.min(1, float32[i]));
        out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      return out;
    }

    function schedulePlayback(int16Data) {
      if (!audioCtx || !playbackDest) return;
      if (discardAudio) return;
      const float32 = new Float32Array(int16Data.length);
      for (let i = 0; i < int16Data.length; i++) {
        float32[i] = int16Data[i] / 32768;
      }
      const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
      buffer.copyToChannel(float32, 0);
      const source = audioCtx.createBufferSource();
      source.buffer = buffer;
      source.connect(playbackDest);
      const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
      source.start(startTime);
      playbackTime = startTime + buffer.duration;
      playbackSources.push(source);
      source.onended = () => {
        playbackSources = playbackSources.filter((s) => s !== source);
      };
    }

    async function connect() {
      if (ws && ws.readyState === WebSocket.OPEN) return;
      ws = new WebSocket(wsUrl.value.trim());
      ws.binaryType = "arraybuffer";

      ws.onopen = () => {
        setStatus(true, "Session open");
        logLine("sys", "WebSocket connected");
        ensureAudioContext();
        sendCommand({ command: "invite", option: { codec: "pcm", sampleRate: targetSampleRate } });
      };

      ws.onclose = () => {
        setStatus(false, "Connection closed");
        logLine("sys", "WebSocket closed");
        ws = null;
      };

      ws.onerror = (err) => {
        logLine("sys", "WebSocket error", { err: String(err) });
      };

      ws.onmessage = (msg) => {
        if (typeof msg.data === "string") {
          const event = JSON.parse(msg.data);
          handleEvent(event);
        } else {
          const audioBuf = msg.data;
          const int16 = new Int16Array(audioBuf);
          schedulePlayback(int16);
          logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
        }
      };
    }

    function disconnect() {
      if (ws) ws.close();
      ws = null;
      setStatus(false, "Disconnected");
    }

    function sendCommand(cmd) {
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        logLine("sys", "Not connected");
        return;
      }
      ws.send(JSON.stringify(cmd));
      logLine("sys", `→ ${cmd.command}`, cmd);
    }

    function handleEvent(event) {
      const type = event.event || "unknown";
      logLine("event", type, event);
      if (type === "transcript") {
        if (event.isFinal && event.text) {
          setInterim("You", "");
          addChat("You", event.text);
        } else if (event.text) {
          interimUserText += event.text;
          setInterim("You", interimUserText);
        }
      }
      if (type === "llmResponse") {
        if (event.isFinal && event.text) {
          setInterim("AI", "");
          addChat("AI", event.text);
        } else if (event.text) {
          interimAiText += event.text;
          setInterim("AI", interimAiText);
        }
      }
      if (type === "trackStart") {
        // New bot audio: stop any previous playback to avoid overlap
        stopPlayback();
        discardAudio = false;
      }
      if (type === "speaking") {
        // User started speaking: clear any in-flight audio to avoid overlap
        stopPlayback();
      }
      if (type === "interrupt") {
        stopPlayback();
      }
    }

    async function startMic() {
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        logLine("sys", "Connect before starting mic");
        return;
      }
      await ensureAudioContext();
      const deviceId = inputSelect.value || undefined;
      micStream = await navigator.mediaDevices.getUserMedia({
        audio: deviceId ? { deviceId: { exact: deviceId } } : true,
      });
      micSource = audioCtx.createMediaStreamSource(micStream);
      processor = audioCtx.createScriptProcessor(2048, 1, 1);
      processor.onaudioprocess = (e) => {
        if (!ws || ws.readyState !== WebSocket.OPEN) return;
        const input = e.inputBuffer.getChannelData(0);
        const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
        const pcm16 = floatTo16BitPCM(downsampled);
        ws.send(pcm16.buffer);
      };
      micSource.connect(processor);
      processor.connect(audioCtx.destination);
      logLine("sys", "Microphone started");
    }

    function stopMic() {
      if (processor) {
        processor.disconnect();
        processor = null;
      }
      if (micSource) {
        micSource.disconnect();
        micSource = null;
      }
      if (micStream) {
        micStream.getTracks().forEach((t) => t.stop());
        micStream = null;
      }
      logLine("sys", "Microphone stopped");
    }

    async function refreshDevices() {
      const devices = await navigator.mediaDevices.enumerateDevices();
      inputSelect.innerHTML = "";
      outputSelect.innerHTML = "";
      devices.forEach((d) => {
        if (d.kind === "audioinput") {
          const opt = document.createElement("option");
          opt.value = d.deviceId;
          opt.textContent = d.label || `Mic ${inputSelect.length + 1}`;
          inputSelect.appendChild(opt);
        }
        if (d.kind === "audiooutput") {
          const opt = document.createElement("option");
          opt.value = d.deviceId;
          opt.textContent = d.label || `Output ${outputSelect.length + 1}`;
          outputSelect.appendChild(opt);
        }
      });
    }

    async function requestDeviceAccess() {
      // Needed to reveal device labels in most browsers
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        stream.getTracks().forEach((t) => t.stop());
        logLine("sys", "Microphone permission granted");
      } catch (err) {
        logLine("sys", "Microphone permission denied", { err: String(err) });
      }
    }

    async function setOutputDevice(deviceId) {
      if (!audioOut.setSinkId) {
        logLine("sys", "setSinkId not supported in this browser");
        return;
      }
      await audioOut.setSinkId(deviceId);
      logLine("sys", `Output device set`, { deviceId });
    }

    connectBtn.addEventListener("click", connect);
    disconnectBtn.addEventListener("click", disconnect);
    refreshDevicesBtn.addEventListener("click", async () => {
      await requestDeviceAccess();
      await refreshDevices();
    });
    startMicBtn.addEventListener("click", startMic);
    stopMicBtn.addEventListener("click", stopMic);
    sendChatBtn.addEventListener("click", () => {
      const text = chatInput.value.trim();
      if (!text) return;
      ensureAudioContext();
      addChat("You", text);
      sendCommand({ command: "chat", text });
      chatInput.value = "";
    });
    clearLogBtn.addEventListener("click", () => {
      logEl.innerHTML = "";
      chatHistory.innerHTML = "";
      setInterim("You", "");
      setInterim("AI", "");
      interimUserText = "";
      interimAiText = "";
    });
    inputSelect.addEventListener("change", () => {
      if (micStream) {
        stopMic();
        startMic();
      }
    });
    outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));

    navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
    refreshDevices().catch(() => {});
  </script>
</body>
</html>
```
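On the wire, each binary WebSocket frame from this client is raw 16-bit little-endian mono PCM at 16 kHz, exactly what `floatTo16BitPCM` produces. A sketch of the matching decode on the Python side, assuming NumPy is available (the helper name is illustrative):

```python
import numpy as np


def decode_pcm_frame(frame: bytes) -> np.ndarray:
    """Decode one binary WebSocket frame (16-bit little-endian mono PCM @ 16 kHz)
    into float32 samples in [-1.0, 1.0], the inverse of the client's floatTo16BitPCM."""
    samples = np.frombuffer(frame, dtype="<i2")
    return samples.astype(np.float32) / 32768.0

# A 2048-sample ScriptProcessor buffer downsampled from 48 kHz yields ~683
# samples per frame, i.e. roughly 43 ms of audio at 16 kHz.
```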
```diff
@@ -63,6 +63,7 @@ class SileroVAD:
         self.min_chunk_size = 512
         self.last_label = "Silence"
         self.last_probability = 0.0
+        self._energy_noise_floor = 1e-4

     def _reset_state(self):
         # Silero VAD V4+ expects state shape [2, 1, 128]
@@ -81,8 +82,7 @@ class SileroVAD:
             Tuple of (label, probability) where label is "Speech" or "Silence"
         """
         if self.session is None or not ONNX_AVAILABLE:
-            # Fallback energy-based VAD when model isn't available.
-            # Map RMS energy to a pseudo-probability so the existing threshold works.
+            # Fallback energy-based VAD with adaptive noise floor.
             if not pcm_bytes:
                 return "Silence", 0.0
             audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
@@ -90,9 +90,17 @@ class SileroVAD:
                 return "Silence", 0.0
             audio_float = audio_int16.astype(np.float32) / 32768.0
             rms = float(np.sqrt(np.mean(audio_float * audio_float)))
-            # Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
-            # Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
-            probability = min(1.0, rms / 0.05)
+
+            # Update adaptive noise floor (slowly rises, faster to fall)
+            if rms < self._energy_noise_floor:
+                self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
+            else:
+                self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
+
+            # Compute SNR-like ratio and map to probability
+            denom = max(self._energy_noise_floor, 1e-6)
+            snr = max(0.0, (rms - denom) / denom)
+            probability = min(1.0, snr / 3.0)  # ~3x above noise => strong speech
             label = "Speech" if probability >= 0.5 else "Silence"
             return label, probability
```
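The asymmetric smoothing lets the floor drop quickly in quiet and rise only slowly under sustained sound, so steady background hum is eventually absorbed into the floor while speech bursts still stand well above it. A small sketch of the same update rule on synthetic frames (standalone function, names illustrative):

```python
def energy_vad_step(rms: float, noise_floor: float) -> tuple[float, float]:
    """One step of the adaptive-noise-floor fallback: returns (probability, new_floor)."""
    if rms < noise_floor:
        noise_floor = 0.95 * noise_floor + 0.05 * rms    # falls quickly in quiet
    else:
        noise_floor = 0.995 * noise_floor + 0.005 * rms  # rises slowly under sound
    snr = max(0.0, (rms - noise_floor) / max(noise_floor, 1e-6))
    return min(1.0, snr / 3.0), noise_floor


floor = 1e-4
for rms in [0.001] * 200:                # steady ambient hum: floor adapts upward
    p, floor = energy_vad_step(rms, floor)
assert p < 0.5                           # hum no longer reads as speech
p, floor = energy_vad_step(0.03, floor)  # speech burst well above the floor
assert p >= 0.5
```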