Compare commits

...

16 Commits

Author SHA1 Message Date
Xin Wang
7be8fda424 Fix microphone talk eou missing and clean chat log 2026-02-06 11:36:39 +08:00
Xin Wang
c8c0e30bc3 Update web client 2026-02-06 11:25:05 +08:00
Xin Wang
960690ba80 Remove invite button, correct stream asr tts transcription 2026-02-06 11:20:52 +08:00
Xin Wang
cb35d87eb4 Update web client 2026-02-06 10:46:24 +08:00
Xin Wang
5c03cf2b1f Update web client layout 2026-02-06 10:34:09 +08:00
Xin Wang
876ca8221c Put web client together 2026-02-06 09:57:45 +08:00
Xin Wang
a8e7c7e2ef Add web client to app server 2026-02-06 09:54:23 +08:00
Xin Wang
9d42f3cca1 Fix list devices on web client 2026-02-06 09:40:52 +08:00
Xin Wang
f81a561e0e Fix indent error 2026-02-06 08:40:42 +08:00
Xin Wang
a70970fee5 Add web client 2026-02-06 08:36:00 +08:00
Xin Wang
e511cf9077 Fix Potential state duplication on barge-in. 2026-02-06 08:30:37 +08:00
Xin Wang
0576231d8d Fix Race risks if process_audio is called concurrently. 2026-02-06 08:26:56 +08:00
Xin Wang
26458faa6c Fix Unbounded _audio_buffer growth. 2026-02-06 08:11:14 +08:00
Xin Wang
605968a639 Fix _stop_current_speech doesn’t cancel LLM/TTS services. 2026-02-06 08:05:33 +08:00
Xin Wang
31d24a7428 Merge branch 'master' of https://gitea.xiaowang.eu.org/wx44wx/py-active-call 2026-02-06 08:00:39 +08:00
Xin Wang
7846e4cebc Fix No cancellation of existing turn on new EOU. 2026-02-06 07:59:31 +08:00
5 changed files with 952 additions and 69 deletions

View File

@@ -4,10 +4,11 @@ import asyncio
import json
import time
import uuid
from pathlib import Path
from typing import Dict, Any, Optional, List
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, FileResponse
from loguru import logger
# Try to import aiortc (optional for WebRTC functionality)
@@ -64,6 +65,7 @@ async def heartbeat_and_timeout_task(
# Initialize FastAPI
app = FastAPI(title="Python Active-Call", version="0.1.0")
_WEB_CLIENT_PATH = Path(__file__).resolve().parent.parent / "examples" / "web_client.html"
# Configure CORS
app.add_middleware(
@@ -99,6 +101,24 @@ async def health_check():
return {"status": "healthy", "sessions": len(active_sessions)}
@app.get("/")
async def web_client_root():
"""Serve the web client."""
if not _WEB_CLIENT_PATH.exists():
raise HTTPException(status_code=404, detail="Web client not found")
return FileResponse(_WEB_CLIENT_PATH)
@app.get("/client")
async def web_client_alias():
"""Alias for the web client."""
if not _WEB_CLIENT_PATH.exists():
raise HTTPException(status_code=404, detail="Web client not found")
return FileResponse(_WEB_CLIENT_PATH)
@app.get("/iceservers")
async def get_ice_servers():
"""Get ICE servers configuration for WebRTC."""

View File

@@ -108,7 +108,10 @@ class DuplexPipeline:
self._is_bot_speaking = False
self._current_turn_task: Optional[asyncio.Task] = None
self._audio_buffer: bytes = b""
max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
self._last_vad_status: str = "Silence"
self._process_lock = asyncio.Lock()
# Interruption handling
self._interrupt_event = asyncio.Event()
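
The cap added above is a plain byte count: 16-bit mono PCM is 2 bytes per sample, so the limit is sample_rate * 2 * seconds. A minimal sketch (the 30-second default mirrors the fallback in the diff; 16 kHz is the rate used elsewhere in this changeset):

def max_audio_buffer_bytes(sample_rate: int, max_buffer_seconds: int = 30) -> int:
    # 2 bytes per sample for 16-bit mono PCM
    return int(sample_rate * 2 * max_buffer_seconds)

# 16000 * 2 * 30 = 960,000 bytes, i.e. roughly 0.9 MiB per session
assert max_audio_buffer_bytes(16000) == 960_000
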
@@ -206,71 +209,75 @@ class DuplexPipeline:
return
try:
# 1. Process through VAD
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
async with self._process_lock:
# 1. Process through VAD
vad_result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
vad_status = "Silence"
if vad_result:
event_type, probability = vad_result
vad_status = "Speech" if event_type == "speaking" else "Silence"
# Emit VAD event
await self.event_bus.publish(event_type, {
"trackId": self.session_id,
"probability": probability
})
else:
# No state change - keep previous status
vad_status = self._last_vad_status
# Update state based on VAD
if vad_status == "Speech" and self._last_vad_status != "Speech":
await self._on_speech_start()
self._last_vad_status = vad_status
# 2. Check for barge-in (user speaking while bot speaking)
# Filter false interruptions by requiring minimum speech duration
if self._is_bot_speaking:
if vad_status == "Speech":
# User is speaking while bot is speaking
self._barge_in_silence_frames = 0 # Reset silence counter
vad_status = "Silence"
if vad_result:
event_type, probability = vad_result
vad_status = "Speech" if event_type == "speaking" else "Silence"
if self._barge_in_speech_start_time is None:
# Start tracking speech duration
self._barge_in_speech_start_time = time.time()
self._barge_in_speech_frames = 1
logger.debug("Potential barge-in detected, tracking duration...")
else:
self._barge_in_speech_frames += 1
# Check if speech duration exceeds threshold
speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
if speech_duration_ms >= self._barge_in_min_duration_ms:
logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
await self._handle_barge_in()
# Emit VAD event
await self.event_bus.publish(event_type, {
"trackId": self.session_id,
"probability": probability
})
else:
# Silence frame during potential barge-in
if self._barge_in_speech_start_time is not None:
self._barge_in_silence_frames += 1
# Allow brief silence gaps (VAD flickering)
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
# Too much silence - reset barge-in tracking
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
self._barge_in_speech_start_time = None
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# 3. Buffer audio for ASR
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
self._audio_buffer += pcm_bytes
await self.asr_service.send_audio(pcm_bytes)
# No state change - keep previous status
vad_status = self._last_vad_status
# For SiliconFlow ASR, trigger interim transcription periodically
# The service handles timing internally via start_interim_transcription()
# 4. Check for End of Utterance - this triggers LLM response
if self.eou_detector.process(vad_status):
await self._on_end_of_utterance()
# Update state based on VAD
if vad_status == "Speech" and self._last_vad_status != "Speech":
await self._on_speech_start()
self._last_vad_status = vad_status
# 2. Check for barge-in (user speaking while bot speaking)
# Filter false interruptions by requiring minimum speech duration
if self._is_bot_speaking:
if vad_status == "Speech":
# User is speaking while bot is speaking
self._barge_in_silence_frames = 0 # Reset silence counter
if self._barge_in_speech_start_time is None:
# Start tracking speech duration
self._barge_in_speech_start_time = time.time()
self._barge_in_speech_frames = 1
logger.debug("Potential barge-in detected, tracking duration...")
else:
self._barge_in_speech_frames += 1
# Check if speech duration exceeds threshold
speech_duration_ms = (time.time() - self._barge_in_speech_start_time) * 1000
if speech_duration_ms >= self._barge_in_min_duration_ms:
logger.info(f"Barge-in confirmed after {speech_duration_ms:.0f}ms of speech ({self._barge_in_speech_frames} frames)")
await self._handle_barge_in()
else:
# Silence frame during potential barge-in
if self._barge_in_speech_start_time is not None:
self._barge_in_silence_frames += 1
# Allow brief silence gaps (VAD flickering)
if self._barge_in_silence_frames > self._barge_in_silence_tolerance:
# Too much silence - reset barge-in tracking
logger.debug(f"Barge-in cancelled after {self._barge_in_silence_frames} silence frames")
self._barge_in_speech_start_time = None
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# 3. Buffer audio for ASR
if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
self._audio_buffer += pcm_bytes
if len(self._audio_buffer) > self._max_audio_buffer_bytes:
# Keep only the most recent audio to cap memory usage
self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
await self.asr_service.send_audio(pcm_bytes)
# For SiliconFlow ASR, trigger interim transcription periodically
# The service handles timing internally via start_interim_transcription()
# 4. Check for End of Utterance - this triggers LLM response
if self.eou_detector.process(vad_status):
await self._on_end_of_utterance()
except Exception as e:
logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
@@ -388,6 +395,8 @@ class DuplexPipeline:
self._last_sent_transcript = ""
# Process the turn - trigger LLM response
# Cancel any existing turn to avoid overlapping assistant responses
await self._stop_current_speech()
await self.conversation.end_user_turn(user_text)
self._current_turn_task = asyncio.create_task(self._handle_turn(user_text))
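
The added await self._stop_current_speech() enforces a single in-flight turn: any previous assistant turn is cancelled and drained before the new one is scheduled, so two responses never overlap. The same pattern in isolation (names are illustrative, not the project's):

import asyncio
from typing import Coroutine, Optional

class TurnRunner:
    """Keep at most one turn task alive; cancel the previous one first."""

    def __init__(self) -> None:
        self._task: Optional[asyncio.Task] = None

    async def start_turn(self, coro: Coroutine) -> None:
        if self._task and not self._task.done():
            self._task.cancel()
            try:
                await self._task   # drain so cleanup inside the old task completes
            except asyncio.CancelledError:
                pass
        self._task = asyncio.create_task(coro)
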
@@ -650,8 +659,10 @@ class DuplexPipeline:
if self.llm_service and hasattr(self.llm_service, 'cancel'):
self.llm_service.cancel()
# Interrupt conversation
await self.conversation.interrupt()
# Interrupt conversation only if there is no active turn task.
# When a turn task exists, it will handle end_assistant_turn() to avoid double callbacks.
if not (self._current_turn_task and not self._current_turn_task.done()):
await self.conversation.interrupt()
# Reset for new user turn
await self.conversation.start_user_turn()
@@ -667,6 +678,12 @@ class DuplexPipeline:
await self._current_turn_task
except asyncio.CancelledError:
pass
# Ensure underlying services are cancelled to avoid leaking work/audio
if self.tts_service:
await self.tts_service.cancel()
if self.llm_service and hasattr(self.llm_service, 'cancel'):
self.llm_service.cancel()
self._is_bot_speaking = False
self._interrupt_event.clear()
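
The extra cancellation calls assume the streaming services expose a cancel() that stops emitting work promptly (awaitable on the TTS side, a plain call guarded by hasattr on the LLM side, as the code above suggests). An illustrative shape of that contract, not the project's actual service classes:

import asyncio

class CancellableTTS:
    """Streaming synthesis that stops as soon as cancel() is called."""

    def __init__(self) -> None:
        self._cancelled = asyncio.Event()

    async def synthesize(self, text: str):
        self._cancelled.clear()
        for _ in range(len(text)):            # stand-in for real synthesis chunks
            if self._cancelled.is_set():
                return                        # stop streaming immediately
            yield b"\x00" * 640               # 20 ms of 16 kHz 16-bit mono PCM
            await asyncio.sleep(0)            # yield control between chunks

    async def cancel(self) -> None:
        self._cancelled.set()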

View File

@@ -0,0 +1,96 @@
<svg width="1200" height="620" viewBox="0 0 1200 620" xmlns="http://www.w3.org/2000/svg">
<defs>
<style>
.box { fill:#11131a; stroke:#3a3f4b; stroke-width:1.2; rx:10; ry:10; }
.title { font: 600 14px 'Arial'; fill:#f2f3f7; }
.text { font: 12px 'Arial'; fill:#c8ccd8; }
.arrow { stroke:#7aa2ff; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.arrow2 { stroke:#2dd4bf; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.arrow3 { stroke:#ff6b6b; stroke-width:1.6; marker-end:url(#arrow); fill:none; }
.label { font: 11px 'Arial'; fill:#9aa3b2; }
</style>
<marker id="arrow" markerWidth="8" markerHeight="8" refX="7" refY="4" orient="auto">
<path d="M0,0 L8,4 L0,8 Z" fill="#7aa2ff"/>
</marker>
</defs>
<rect x="40" y="40" width="250" height="120" class="box"/>
<text x="60" y="70" class="title">Web Client</text>
<text x="60" y="95" class="text">WS JSON commands</text>
<text x="60" y="115" class="text">WS binary PCM audio</text>
<rect x="350" y="40" width="250" height="120" class="box"/>
<text x="370" y="70" class="title">FastAPI /ws</text>
<text x="370" y="95" class="text">Session + Transport</text>
<rect x="660" y="40" width="250" height="120" class="box"/>
<text x="680" y="70" class="title">DuplexPipeline</text>
<text x="680" y="95" class="text">process_audio / process_text</text>
<rect x="920" y="40" width="240" height="120" class="box"/>
<text x="940" y="70" class="title">ConversationManager</text>
<text x="940" y="95" class="text">turns + state</text>
<rect x="660" y="200" width="180" height="100" class="box"/>
<text x="680" y="230" class="title">VADProcessor</text>
<text x="680" y="255" class="text">speech/silence</text>
<rect x="860" y="200" width="180" height="100" class="box"/>
<text x="880" y="230" class="title">EOU Detector</text>
<text x="880" y="255" class="text">end-of-utterance</text>
<rect x="1060" y="200" width="120" height="100" class="box"/>
<text x="1075" y="230" class="title">ASR</text>
<text x="1075" y="255" class="text">transcripts</text>
<rect x="920" y="350" width="240" height="110" class="box"/>
<text x="940" y="380" class="title">LLM (stream)</text>
<text x="940" y="405" class="text">llmResponse events</text>
<rect x="660" y="350" width="220" height="110" class="box"/>
<text x="680" y="380" class="title">TTS (stream)</text>
<text x="680" y="405" class="text">PCM audio</text>
<rect x="40" y="350" width="250" height="110" class="box"/>
<text x="60" y="380" class="title">Web Client</text>
<text x="60" y="405" class="text">audio playback + UI</text>
<path d="M290 80 L350 80" class="arrow"/>
<text x="300" y="70" class="label">JSON / PCM</text>
<path d="M600 80 L660 80" class="arrow"/>
<text x="615" y="70" class="label">dispatch</text>
<path d="M910 80 L920 80" class="arrow"/>
<text x="880" y="70" class="label">turn mgmt</text>
<path d="M750 160 L750 200" class="arrow"/>
<text x="705" y="190" class="label">audio chunks</text>
<path d="M840 250 L860 250" class="arrow"/>
<text x="835" y="240" class="label">vad status</text>
<path d="M1040 250 L1060 250" class="arrow"/>
<text x="1010" y="240" class="label">audio buffer</text>
<path d="M950 300 L950 350" class="arrow2"/>
<text x="930" y="340" class="label">EOU -> LLM</text>
<path d="M880 405 L920 405" class="arrow2"/>
<text x="870" y="395" class="label">text stream</text>
<path d="M660 405 L290 405" class="arrow2"/>
<text x="430" y="395" class="label">PCM audio</text>
<path d="M660 450 L350 450" class="arrow"/>
<text x="420" y="440" class="label">events: trackStart/End</text>
<path d="M350 450 L290 450" class="arrow"/>
<text x="315" y="440" class="label">UI updates</text>
<path d="M750 200 L750 160" class="arrow3"/>
<text x="700" y="145" class="label">barge-in detection</text>
<path d="M760 170 L920 170" class="arrow3"/>
<text x="820" y="160" class="label">interrupt event + cancel</text>
</svg>
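
The left edge of the diagram is the wire protocol: JSON commands and events travel as text frames, and raw 16-bit / 16 kHz PCM travels as binary frames on the same WebSocket. A minimal Python sketch of that handshake, assuming the third-party websockets package and the default local URL (both placeholders, not part of this changeset):

import asyncio
import json

import websockets

async def demo(url: str = "ws://localhost:8000/ws") -> None:
    async with websockets.connect(url) as ws:
        # Text frame: start a session, matching the web client's invite command
        await ws.send(json.dumps({
            "command": "invite",
            "option": {"codec": "pcm", "sampleRate": 16000},
        }))
        # Binary frame: one 20 ms chunk of silence (320 samples * 2 bytes)
        await ws.send(b"\x00" * 640)
        reply = await ws.recv()
        if isinstance(reply, bytes):
            print(f"received {len(reply)} bytes of PCM audio")
        else:
            print("event:", json.loads(reply))

if __name__ == "__main__":
    asyncio.run(demo())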

examples/web_client.html (new file, 742 lines added)
View File

@@ -0,0 +1,742 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Duplex Voice Web Client</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");
:root {
--bg: #0b0b0f;
--panel: #14141c;
--panel-2: #101018;
--ink: #f2f3f7;
--muted: #a7acba;
--accent: #ff6b6b;
--accent-2: #ffd166;
--good: #2dd4bf;
--bad: #f87171;
--grid: rgba(255, 255, 255, 0.06);
--shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
}
* {
box-sizing: border-box;
}
html,
body {
height: 100%;
margin: 0;
color: var(--ink);
background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
var(--bg);
font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
}
.noise {
position: fixed;
inset: 0;
background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
pointer-events: none;
mix-blend-mode: soft-light;
}
header {
padding: 32px 28px 18px;
border-bottom: 1px solid var(--grid);
}
h1 {
font-family: "Fraunces", serif;
font-weight: 600;
margin: 0 0 6px;
letter-spacing: 0.4px;
}
.subtitle {
color: var(--muted);
font-size: 0.95rem;
}
main {
display: grid;
grid-template-columns: 1.1fr 1.4fr;
gap: 24px;
padding: 24px 28px 40px;
}
.panel {
background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
var(--panel);
border: 1px solid var(--grid);
border-radius: 16px;
padding: 20px;
box-shadow: var(--shadow);
}
.panel h2 {
margin: 0 0 12px;
font-size: 1.05rem;
font-weight: 600;
}
.stack {
display: grid;
gap: 12px;
}
label {
display: block;
font-size: 0.85rem;
color: var(--muted);
margin-bottom: 6px;
}
input,
select,
button,
textarea {
font-family: inherit;
}
input,
select,
textarea {
width: 100%;
padding: 10px 12px;
border-radius: 10px;
border: 1px solid var(--grid);
background: var(--panel-2);
color: var(--ink);
outline: none;
}
textarea {
min-height: 80px;
resize: vertical;
}
.row {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.btn-row {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
button {
border: none;
border-radius: 999px;
padding: 10px 16px;
font-weight: 600;
background: var(--ink);
color: #111;
cursor: pointer;
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
button.secondary {
background: transparent;
color: var(--ink);
border: 1px solid var(--grid);
}
button.accent {
background: linear-gradient(120deg, var(--accent), #f97316);
color: #0b0b0f;
}
button.good {
background: linear-gradient(120deg, var(--good), #22c55e);
color: #07261f;
}
button.bad {
background: linear-gradient(120deg, var(--bad), #f97316);
color: #2a0b0b;
}
button:active {
transform: translateY(1px) scale(0.99);
}
.status {
display: flex;
align-items: center;
gap: 12px;
padding: 12px;
background: rgba(255, 255, 255, 0.03);
border-radius: 12px;
border: 1px dashed var(--grid);
font-size: 0.9rem;
}
.dot {
width: 10px;
height: 10px;
border-radius: 999px;
background: var(--bad);
box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
}
.dot.on {
background: var(--good);
box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
}
.log {
height: 320px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.85rem;
line-height: 1.4;
}
.chat {
height: 260px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.9rem;
line-height: 1.45;
}
.chat-entry {
padding: 8px 10px;
margin-bottom: 8px;
border-radius: 10px;
background: rgba(255, 255, 255, 0.04);
border: 1px solid rgba(255, 255, 255, 0.06);
}
.chat-entry.user {
border-left: 3px solid var(--accent-2);
}
.chat-entry.ai {
border-left: 3px solid var(--good);
}
.chat-entry.interim {
opacity: 0.7;
font-style: italic;
}
.log-entry {
padding: 6px 8px;
border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
}
.log-entry:last-child {
border-bottom: none;
}
.tag {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 2px 8px;
border-radius: 999px;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.6px;
background: rgba(255, 255, 255, 0.08);
color: var(--muted);
}
.tag.event {
background: rgba(255, 107, 107, 0.18);
color: #ffc1c1;
}
.tag.audio {
background: rgba(45, 212, 191, 0.2);
color: #c5f9f0;
}
.tag.sys {
background: rgba(255, 209, 102, 0.2);
color: #ffefb0;
}
.muted {
color: var(--muted);
}
footer {
padding: 0 28px 28px;
color: var(--muted);
font-size: 0.8rem;
}
@media (max-width: 1100px) {
main {
grid-template-columns: 1fr;
}
.log {
height: 360px;
}
.chat {
height: 260px;
}
}
</style>
</head>
<body>
<div class="noise"></div>
<header>
<h1>Duplex Voice Client</h1>
<div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
</header>
<main>
<section class="panel stack">
<h2>Connection</h2>
<div>
<label for="wsUrl">WebSocket URL</label>
<input id="wsUrl" value="ws://localhost:8000/ws" />
</div>
<div class="btn-row">
<button class="accent" id="connectBtn">Connect</button>
<button class="secondary" id="disconnectBtn">Disconnect</button>
</div>
<div class="status">
<div id="statusDot" class="dot"></div>
<div>
<div id="statusText">Disconnected</div>
<div class="muted" id="statusSub">Waiting for connection</div>
</div>
</div>
<h2>Devices</h2>
<div class="row">
<div>
<label for="inputSelect">Input (Mic)</label>
<select id="inputSelect"></select>
</div>
<div>
<label for="outputSelect">Output (Speaker)</label>
<select id="outputSelect"></select>
</div>
</div>
<div class="btn-row">
<button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
<button class="good" id="startMicBtn">Start Mic</button>
<button class="secondary" id="stopMicBtn">Stop Mic</button>
</div>
<h2>Chat</h2>
<div class="stack">
<textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
<div class="btn-row">
<button class="accent" id="sendChatBtn">Send Chat</button>
<button class="secondary" id="clearLogBtn">Clear Log</button>
</div>
</div>
</section>
<section class="stack">
<div class="panel stack">
<h2>Chat History</h2>
<div class="chat" id="chatHistory"></div>
</div>
<div class="panel stack">
<h2>Event Log</h2>
<div class="log" id="log"></div>
</div>
</section>
</main>
<footer>
Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
</footer>
<audio id="audioOut" autoplay></audio>
<script>
const wsUrl = document.getElementById("wsUrl");
const connectBtn = document.getElementById("connectBtn");
const disconnectBtn = document.getElementById("disconnectBtn");
const inputSelect = document.getElementById("inputSelect");
const outputSelect = document.getElementById("outputSelect");
const startMicBtn = document.getElementById("startMicBtn");
const stopMicBtn = document.getElementById("stopMicBtn");
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
const sendChatBtn = document.getElementById("sendChatBtn");
const clearLogBtn = document.getElementById("clearLogBtn");
const chatInput = document.getElementById("chatInput");
const logEl = document.getElementById("log");
const chatHistory = document.getElementById("chatHistory");
const statusDot = document.getElementById("statusDot");
const statusText = document.getElementById("statusText");
const statusSub = document.getElementById("statusSub");
const audioOut = document.getElementById("audioOut");
let ws = null;
let audioCtx = null;
let micStream = null;
let processor = null;
let micSource = null;
let playbackDest = null;
let playbackTime = 0;
let discardAudio = false;
let playbackSources = [];
let interimUserEl = null;
let interimAiEl = null;
let interimUserText = "";
let interimAiText = "";
const targetSampleRate = 16000;
function logLine(type, text, data) {
const time = new Date().toLocaleTimeString();
const entry = document.createElement("div");
entry.className = "log-entry";
const tag = document.createElement("span");
tag.className = `tag ${type}`;
tag.textContent = type.toUpperCase();
const msg = document.createElement("span");
msg.style.marginLeft = "10px";
msg.textContent = `[${time}] ${text}`;
entry.appendChild(tag);
entry.appendChild(msg);
if (data) {
const pre = document.createElement("div");
pre.className = "muted";
pre.textContent = JSON.stringify(data);
pre.style.marginTop = "4px";
entry.appendChild(pre);
}
logEl.appendChild(entry);
logEl.scrollTop = logEl.scrollHeight;
}
function addChat(role, text) {
const entry = document.createElement("div");
entry.className = `chat-entry ${role === "AI" ? "ai" : "user"}`;
entry.textContent = `${role}: ${text}`;
chatHistory.appendChild(entry);
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function setInterim(role, text) {
const isAi = role === "AI";
let el = isAi ? interimAiEl : interimUserEl;
if (!text) {
if (el) el.remove();
if (isAi) interimAiEl = null;
else interimUserEl = null;
if (isAi) interimAiText = "";
else interimUserText = "";
return;
}
if (!el) {
el = document.createElement("div");
el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
chatHistory.appendChild(el);
if (isAi) interimAiEl = el;
else interimUserEl = el;
}
el.textContent = `${role} (interim): ${text}`;
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function stopPlayback() {
discardAudio = true;
playbackTime = audioCtx ? audioCtx.currentTime : 0;
playbackSources.forEach((s) => {
try {
s.stop();
} catch (err) {}
});
playbackSources = [];
}
function setStatus(connected, detail) {
statusDot.classList.toggle("on", connected);
statusText.textContent = connected ? "Connected" : "Disconnected";
statusSub.textContent = detail || "";
}
async function ensureAudioContext() {
if (audioCtx) return;
audioCtx = new (window.AudioContext || window.webkitAudioContext)();
playbackDest = audioCtx.createMediaStreamDestination();
audioOut.srcObject = playbackDest.stream;
try {
await audioOut.play();
} catch (err) {
logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
}
if (outputSelect.value) {
await setOutputDevice(outputSelect.value);
}
}
function downsampleBuffer(buffer, inRate, outRate) {
if (outRate === inRate) return buffer;
const ratio = inRate / outRate;
const newLength = Math.round(buffer.length / ratio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = accum / count;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
function floatTo16BitPCM(float32) {
const out = new Int16Array(float32.length);
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
}
return out;
}
function schedulePlayback(int16Data) {
if (!audioCtx || !playbackDest) return;
if (discardAudio) return;
const float32 = new Float32Array(int16Data.length);
for (let i = 0; i < int16Data.length; i++) {
float32[i] = int16Data[i] / 32768;
}
const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
buffer.copyToChannel(float32, 0);
const source = audioCtx.createBufferSource();
source.buffer = buffer;
source.connect(playbackDest);
const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
source.start(startTime);
playbackTime = startTime + buffer.duration;
playbackSources.push(source);
source.onended = () => {
playbackSources = playbackSources.filter((s) => s !== source);
};
}
async function connect() {
if (ws && ws.readyState === WebSocket.OPEN) return;
ws = new WebSocket(wsUrl.value.trim());
ws.binaryType = "arraybuffer";
ws.onopen = () => {
setStatus(true, "Session open");
logLine("sys", "WebSocket connected");
ensureAudioContext();
sendCommand({ command: "invite", option: { codec: "pcm", sampleRate: targetSampleRate } });
};
ws.onclose = () => {
setStatus(false, "Connection closed");
logLine("sys", "WebSocket closed");
ws = null;
};
ws.onerror = (err) => {
logLine("sys", "WebSocket error", { err: String(err) });
};
ws.onmessage = (msg) => {
if (typeof msg.data === "string") {
const event = JSON.parse(msg.data);
handleEvent(event);
} else {
const audioBuf = msg.data;
const int16 = new Int16Array(audioBuf);
schedulePlayback(int16);
logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
}
};
}
function disconnect() {
if (ws) ws.close();
ws = null;
setStatus(false, "Disconnected");
}
function sendCommand(cmd) {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Not connected");
return;
}
ws.send(JSON.stringify(cmd));
logLine("sys", `${cmd.command}`, cmd);
}
function handleEvent(event) {
const type = event.event || "unknown";
logLine("event", type, event);
if (type === "transcript") {
if (event.isFinal && event.text) {
setInterim("You", "");
addChat("You", event.text);
} else if (event.text) {
interimUserText += event.text;
setInterim("You", interimUserText);
}
}
if (type === "llmResponse") {
if (event.isFinal && event.text) {
setInterim("AI", "");
addChat("AI", event.text);
} else if (event.text) {
interimAiText += event.text;
setInterim("AI", interimAiText);
}
}
if (type === "trackStart") {
// New bot audio: stop any previous playback to avoid overlap
stopPlayback();
discardAudio = false;
}
if (type === "speaking") {
// User started speaking: clear any in-flight audio to avoid overlap
stopPlayback();
}
if (type === "interrupt") {
stopPlayback();
}
}
async function startMic() {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Connect before starting mic");
return;
}
await ensureAudioContext();
const deviceId = inputSelect.value || undefined;
micStream = await navigator.mediaDevices.getUserMedia({
audio: deviceId ? { deviceId: { exact: deviceId } } : true,
});
micSource = audioCtx.createMediaStreamSource(micStream);
processor = audioCtx.createScriptProcessor(2048, 1, 1);
processor.onaudioprocess = (e) => {
if (!ws || ws.readyState !== WebSocket.OPEN) return;
const input = e.inputBuffer.getChannelData(0);
const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
const pcm16 = floatTo16BitPCM(downsampled);
ws.send(pcm16.buffer);
};
micSource.connect(processor);
processor.connect(audioCtx.destination);
logLine("sys", "Microphone started");
}
function stopMic() {
if (processor) {
processor.disconnect();
processor = null;
}
if (micSource) {
micSource.disconnect();
micSource = null;
}
if (micStream) {
micStream.getTracks().forEach((t) => t.stop());
micStream = null;
}
logLine("sys", "Microphone stopped");
}
async function refreshDevices() {
const devices = await navigator.mediaDevices.enumerateDevices();
inputSelect.innerHTML = "";
outputSelect.innerHTML = "";
devices.forEach((d) => {
if (d.kind === "audioinput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Mic ${inputSelect.length + 1}`;
inputSelect.appendChild(opt);
}
if (d.kind === "audiooutput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Output ${outputSelect.length + 1}`;
outputSelect.appendChild(opt);
}
});
}
async function requestDeviceAccess() {
// Needed to reveal device labels in most browsers
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getTracks().forEach((t) => t.stop());
logLine("sys", "Microphone permission granted");
} catch (err) {
logLine("sys", "Microphone permission denied", { err: String(err) });
}
}
async function setOutputDevice(deviceId) {
if (!audioOut.setSinkId) {
logLine("sys", "setSinkId not supported in this browser");
return;
}
await audioOut.setSinkId(deviceId);
logLine("sys", `Output device set`, { deviceId });
}
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);
refreshDevicesBtn.addEventListener("click", async () => {
await requestDeviceAccess();
await refreshDevices();
});
startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);
sendChatBtn.addEventListener("click", () => {
const text = chatInput.value.trim();
if (!text) return;
ensureAudioContext();
addChat("You", text);
sendCommand({ command: "chat", text });
chatInput.value = "";
});
clearLogBtn.addEventListener("click", () => {
logEl.innerHTML = "";
chatHistory.innerHTML = "";
setInterim("You", "");
setInterim("AI", "");
interimUserText = "";
interimAiText = "";
});
inputSelect.addEventListener("change", () => {
if (micStream) {
stopMic();
startMic();
}
});
outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
refreshDevices().catch(() => {});
</script>
</body>
</html>
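
Script-side senders need the same framing the client produces above (average-pool down to 16 kHz, clamp to [-1, 1], scale to int16). A NumPy equivalent of downsampleBuffer + floatTo16BitPCM, offered as a sketch rather than a drop-in for examples/mic_client.py:

import numpy as np

def float_to_pcm16(samples: np.ndarray, in_rate: int, out_rate: int = 16000) -> bytes:
    """Downsample float32 audio in [-1, 1] and encode as little-endian int16."""
    if in_rate != out_rate:
        ratio = in_rate / out_rate
        n_out = int(round(len(samples) / ratio))
        edges = (np.arange(n_out + 1) * ratio).round().astype(int)
        # Average each input window onto one output sample, mirroring the
        # script-processor code in the client.
        samples = np.array(
            [samples[a:b].mean() if b > a else 0.0 for a, b in zip(edges[:-1], edges[1:])],
            dtype=np.float32,
        )
    clipped = np.clip(samples, -1.0, 1.0)
    pcm = np.where(clipped < 0, clipped * 0x8000, clipped * 0x7FFF).astype(np.int16)
    return pcm.tobytes()

# 10 ms of a 440 Hz tone at 48 kHz becomes 160 samples (320 bytes) at 16 kHz
tone = np.sin(2 * np.pi * 440 * np.arange(480) / 48000).astype(np.float32)
assert len(float_to_pcm16(tone, 48000)) == 320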

View File

@@ -63,6 +63,7 @@ class SileroVAD:
self.min_chunk_size = 512
self.last_label = "Silence"
self.last_probability = 0.0
self._energy_noise_floor = 1e-4
def _reset_state(self):
# Silero VAD V4+ expects state shape [2, 1, 128]
@@ -81,8 +82,7 @@ class SileroVAD:
Tuple of (label, probability) where label is "Speech" or "Silence"
"""
if self.session is None or not ONNX_AVAILABLE:
# Fallback energy-based VAD when model isn't available.
# Map RMS energy to a pseudo-probability so the existing threshold works.
# Fallback energy-based VAD with adaptive noise floor.
if not pcm_bytes:
return "Silence", 0.0
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
@@ -90,9 +90,17 @@ class SileroVAD:
return "Silence", 0.0
audio_float = audio_int16.astype(np.float32) / 32768.0
rms = float(np.sqrt(np.mean(audio_float * audio_float)))
# Typical speech RMS is ~0.02-0.05 at 16-bit normalized scale.
# Normalize so threshold=0.5 roughly corresponds to ~0.025 RMS.
probability = min(1.0, rms / 0.05)
# Update adaptive noise floor (slowly rises, faster to fall)
if rms < self._energy_noise_floor:
self._energy_noise_floor = 0.95 * self._energy_noise_floor + 0.05 * rms
else:
self._energy_noise_floor = 0.995 * self._energy_noise_floor + 0.005 * rms
# Compute SNR-like ratio and map to probability
denom = max(self._energy_noise_floor, 1e-6)
snr = max(0.0, (rms - denom) / denom)
probability = min(1.0, snr / 3.0) # ~3x above noise => strong speech
label = "Speech" if probability >= 0.5 else "Silence"
return label, probability
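
For experiments outside the pipeline, the fallback can be lifted into a standalone class. The EMA coefficients and the "~3x above the noise floor" mapping come from the diff; the packaging around them is a sketch:

import numpy as np

class EnergyVAD:
    """Adaptive-noise-floor energy VAD, mirroring the fallback path above."""

    def __init__(self, initial_floor: float = 1e-4) -> None:
        self._noise_floor = initial_floor

    def process(self, pcm_bytes: bytes) -> tuple[str, float]:
        if not pcm_bytes:
            return "Silence", 0.0
        audio = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        if audio.size == 0:
            return "Silence", 0.0
        rms = float(np.sqrt(np.mean(audio * audio)))
        # The floor follows quiet frames quickly and loud frames slowly,
        # so sustained speech does not drag it upward.
        if rms < self._noise_floor:
            self._noise_floor = 0.95 * self._noise_floor + 0.05 * rms
        else:
            self._noise_floor = 0.995 * self._noise_floor + 0.005 * rms
        denom = max(self._noise_floor, 1e-6)
        snr = max(0.0, (rms - denom) / denom)
        probability = min(1.0, snr / 3.0)   # ~3x above the floor => strong speech
        return ("Speech" if probability >= 0.5 else "Silence"), probability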