Init commit

This commit is contained in:
Xin Wang
2026-02-17 10:39:23 +08:00
commit 30eb4397c2
56 changed files with 11983 additions and 0 deletions

601
examples/mic_client.py Normal file
View File

@@ -0,0 +1,601 @@
#!/usr/bin/env python3
"""
Microphone client for testing duplex voice conversation.
This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.
It also displays the LLM's text responses in the console.
Usage:
python examples/mic_client.py --url ws://localhost:8000/ws
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
python examples/mic_client.py --url ws://localhost:8000/ws --verbose
Requirements:
pip install sounddevice soundfile websockets numpy
"""
import argparse
import asyncio
import json
import sys
import time
import threading
import queue
from pathlib import Path
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice: pip install sounddevice")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class MicrophoneClient:
    """
    Full-duplex microphone client for voice conversation.

    Features:
    - Real-time microphone capture
    - Real-time speaker playback
    - WebSocket communication
    - Text chat support
    """
def __init__(
    self,
    url: str,
    sample_rate: int = 16000,
    chunk_duration_ms: int = 20,
    input_device: int = None,
    output_device: int = None
):
    """
    Initialize microphone client.

    Args:
        url: WebSocket server URL
        sample_rate: Audio sample rate (Hz)
        chunk_duration_ms: Audio chunk duration (ms)
        input_device: Input device ID (None for default)
        output_device: Output device ID (None for default)
    """
    self.url = url
    self.sample_rate = sample_rate
    self.chunk_duration_ms = chunk_duration_ms
    # Samples per outgoing chunk, e.g. 16000 Hz * 20 ms -> 320 samples.
    self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
    self.input_device = input_device
    self.output_device = output_device
    # WebSocket connection
    self.ws = None
    self.running = False
    # Audio buffers: mic chunks queued for sending, received PCM concatenated
    # into one bytes buffer consumed by the playback thread.
    self.audio_input_queue = queue.Queue()
    self.audio_output_buffer = b""  # Continuous buffer for smooth playback
    self.audio_output_lock = threading.Lock()
    # Statistics
    self.bytes_sent = 0
    self.bytes_received = 0
    # State
    self.is_recording = True
    self.is_playing = True
    # TTFB tracking (Time to First Byte)
    self.request_start_time = None
    self.first_audio_received = False
    # Interrupt handling - discard audio until next trackStart
    self._discard_audio = False
    self._audio_sequence = 0  # Track audio sequence to detect stale chunks
    # Verbose mode for streaming LLM responses
    self.verbose = False
async def connect(self) -> None:
    """Open the WebSocket session and send the initial invite command."""
    print(f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    print("Connected!")
    # The invite negotiates codec and sample rate with the server.
    invite = {
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    }
    await self.send_command(invite)
async def send_command(self, cmd: dict) -> None:
    """JSON-serialize *cmd* and send it over the socket, logging the name."""
    if not self.ws:
        return
    await self.ws.send(json.dumps(cmd))
    print(f"→ Command: {cmd.get('command', 'unknown')}")
async def send_chat(self, text: str) -> None:
    """Send a text chat message and arm TTFB (time-to-first-audio) tracking."""
    # New request: restart the time-to-first-audio clock.
    self.first_audio_received = False
    self.request_start_time = time.time()
    await self.send_command({"command": "chat", "text": text})
    print(f"→ Chat: {text}")
async def send_interrupt(self) -> None:
    """Ask the server to cut off the assistant's current speech."""
    await self.send_command({"command": "interrupt"})
async def send_hangup(self, reason: str = "User quit") -> None:
    """Terminate the session, passing *reason* to the server."""
    await self.send_command({"command": "hangup", "reason": reason})
def _audio_input_callback(self, indata, frames, time_info, status):
    """sounddevice InputStream callback: convert float capture to s16 PCM.

    Args:
        indata: float ndarray of shape (frames, channels) from sounddevice.
        frames: number of frames in this block.
        time_info: sounddevice timing struct. Was named ``time``, which
            shadowed the ``time`` module inside the callback - renamed.
        status: CallbackFlags reporting over-/underruns, if any.
    """
    if status:
        print(f"Input status: {status}")
    if self.is_recording and self.running:
        # Clip before scaling so float peaks above 1.0 cannot wrap around
        # when cast to int16; then convert channel 0 to 16-bit PCM bytes.
        mono = np.clip(indata[:, 0], -1.0, 1.0)
        audio_data = (mono * 32767).astype(np.int16).tobytes()
        self.audio_input_queue.put(audio_data)
def _add_audio_to_buffer(self, audio_data: bytes):
    """Append received PCM bytes to the playback buffer (thread-safe)."""
    with self.audio_output_lock:
        self.audio_output_buffer = self.audio_output_buffer + audio_data
def _playback_thread_func(self):
    """Thread target: drain the shared PCM buffer into the output device.

    Runs until self.running goes False. When the buffer holds less than one
    50 ms chunk, silence is emitted so the stream stays fed and timing stays
    smooth. (Removed an unused ``import time`` that shadowed the module.)
    """
    # Chunk size: 50ms of audio (2 bytes per int16 sample).
    chunk_samples = int(self.sample_rate * 0.05)
    chunk_bytes = chunk_samples * 2
    print(f"Audio playback thread started (device: {self.output_device or 'default'})")
    try:
        # Blocking-write output stream; stream.write() paces the loop.
        with sd.OutputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype='int16',
            blocksize=chunk_samples,
            device=self.output_device,
            latency='low'
        ) as stream:
            while self.running:
                # Take at most one chunk from the buffer under the lock.
                with self.audio_output_lock:
                    if len(self.audio_output_buffer) >= chunk_bytes:
                        audio_data = self.audio_output_buffer[:chunk_bytes]
                        self.audio_output_buffer = self.audio_output_buffer[chunk_bytes:]
                    else:
                        # Not enough audio - output silence
                        audio_data = b'\x00' * chunk_bytes
                # Convert to a (samples, 1) int16 array and write to stream.
                samples = np.frombuffer(audio_data, dtype=np.int16).reshape(-1, 1)
                stream.write(samples)
    except Exception as e:
        print(f"Playback thread error: {e}")
        import traceback
        traceback.print_exc()
async def _playback_task(self):
    """Spawn the playback thread, then poll until it dies or we stop."""
    worker = threading.Thread(target=self._playback_thread_func, daemon=True)
    worker.start()
    # Stay cooperative with the event loop while the thread runs.
    while self.running and worker.is_alive():
        await asyncio.sleep(0.1)
    print("Audio playback stopped")
async def audio_sender(self) -> None:
    """Forward captured microphone chunks from the queue to the server."""
    loop = asyncio.get_event_loop()
    while self.running:
        try:
            try:
                # Block in a worker thread so the event loop stays responsive.
                audio_data = await loop.run_in_executor(
                    None, lambda: self.audio_input_queue.get(timeout=0.1)
                )
            except queue.Empty:
                # Nothing captured within the timeout - poll again.
                continue
            if self.ws and self.is_recording:
                await self.ws.send(audio_data)
                self.bytes_sent += len(audio_data)
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f"Audio sender error: {e}")
            break
async def receiver(self) -> None:
    """Receive messages from server.

    Binary frames are PCM audio queued for playback (unless a recent
    interrupt told us to discard until the next trackStart); text frames
    are JSON events dispatched to _handle_event().
    """
    try:
        while self.running:
            try:
                # Short timeout so the loop re-checks self.running regularly.
                message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                if isinstance(message, bytes):
                    # Audio data received
                    self.bytes_received += len(message)
                    # Check if we should discard this audio (after interrupt)
                    if self._discard_audio:
                        # 2 bytes per int16 sample -> milliseconds of audio.
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        print(f"← Audio: {duration_ms:.0f}ms (DISCARDED - waiting for new track)")
                        continue
                    if self.is_playing:
                        self._add_audio_to_buffer(message)
                        # Calculate and display TTFB for first audio packet
                        if not self.first_audio_received and self.request_start_time:
                            client_ttfb_ms = (time.time() - self.request_start_time) * 1000
                            self.first_audio_received = True
                            print(f"← [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
                        # Show progress (less verbose)
                        with self.audio_output_lock:
                            buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
                else:
                    # JSON event
                    event = json.loads(message)
                    await self._handle_event(event)
            except asyncio.TimeoutError:
                continue
            except websockets.ConnectionClosed:
                print("Connection closed")
                self.running = False
                break
    except asyncio.CancelledError:
        pass
    except Exception as e:
        print(f"Receiver error: {e}")
        self.running = False
async def _handle_event(self, event: dict) -> None:
    """Handle incoming event.

    Dispatches on the "event" field of the server's JSON message: session
    lifecycle (answer/hangup), VAD markers (speaking/silence), transcripts,
    LLM text, TTFB reports, and track control (trackStart/trackEnd/interrupt)
    which drives the discard-until-next-track logic for stale audio.
    """
    event_type = event.get("event", "unknown")
    if event_type == "answer":
        print("← Session ready!")
    elif event_type == "speaking":
        print("← User speech detected")
    elif event_type == "silence":
        print("← User silence detected")
    elif event_type == "transcript":
        # Display user speech transcription
        text = event.get("text", "")
        is_final = event.get("isFinal", False)
        if is_final:
            # Clear the interim line and print final
            print(" " * 80, end="\r")  # Clear previous interim text
            print(f"→ You: {text}")
        else:
            # Interim result - show with indicator (overwrite same line)
            display_text = text[:60] + "..." if len(text) > 60 else text
            print(f" [listening] {display_text}".ljust(80), end="\r")
    elif event_type == "ttfb":
        # Server-side TTFB event
        latency_ms = event.get("latencyMs", 0)
        print(f"← [TTFB] Server reported latency: {latency_ms}ms")
    elif event_type == "llmResponse":
        # LLM text response
        text = event.get("text", "")
        is_final = event.get("isFinal", False)
        if is_final:
            # Print final LLM response
            print(f"← AI: {text}")
        elif self.verbose:
            # Show streaming chunks only in verbose mode
            display_text = text[:60] + "..." if len(text) > 60 else text
            print(f" [streaming] {display_text}")
    elif event_type == "trackStart":
        print("← Bot started speaking")
        # IMPORTANT: Accept audio again after trackStart
        self._discard_audio = False
        self._audio_sequence += 1
        # Reset TTFB tracking for voice responses (when no chat was sent)
        if self.request_start_time is None:
            self.request_start_time = time.time()
            self.first_audio_received = False
        # Clear any old audio in buffer
        with self.audio_output_lock:
            self.audio_output_buffer = b""
    elif event_type == "trackEnd":
        print("← Bot finished speaking")
        # Reset TTFB tracking after response completes
        self.request_start_time = None
        self.first_audio_received = False
    elif event_type == "interrupt":
        print("← Bot interrupted!")
        # IMPORTANT: Discard all audio until next trackStart
        self._discard_audio = True
        # Clear audio buffer immediately
        with self.audio_output_lock:
            buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
            self.audio_output_buffer = b""
        print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
    elif event_type == "error":
        print(f"← Error: {event.get('error')}")
    elif event_type == "hangup":
        print(f"← Hangup: {event.get('reason')}")
        self.running = False
    else:
        print(f"← Event: {event_type}")
async def interactive_mode(self) -> None:
    """Run interactive mode for text chat.

    Reads stdin in an executor (so the event loop keeps servicing audio)
    and either executes a /command or forwards the line as a chat message.
    """
    print("\n" + "=" * 50)
    print("Voice Conversation Client")
    print("=" * 50)
    print("Speak into your microphone to talk to the AI.")
    print("Or type messages to send text.")
    print("")
    print("Commands:")
    print(" /quit - End conversation")
    print(" /mute - Mute microphone")
    print(" /unmute - Unmute microphone")
    print(" /interrupt - Interrupt AI speech")
    print(" /stats - Show statistics")
    print("=" * 50 + "\n")
    while self.running:
        try:
            # input() would block the loop; run it in a worker thread.
            user_input = await asyncio.get_event_loop().run_in_executor(
                None, input, ""
            )
            if not user_input:
                continue
            # Handle commands
            if user_input.startswith("/"):
                cmd = user_input.lower().strip()
                if cmd == "/quit":
                    await self.send_hangup("User quit")
                    break
                elif cmd == "/mute":
                    self.is_recording = False
                    print("Microphone muted")
                elif cmd == "/unmute":
                    self.is_recording = True
                    print("Microphone unmuted")
                elif cmd == "/interrupt":
                    await self.send_interrupt()
                elif cmd == "/stats":
                    print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
                    print(f"Received: {self.bytes_received / 1024:.1f} KB")
                else:
                    print(f"Unknown command: {cmd}")
            else:
                # Send as chat message
                await self.send_chat(user_input)
        except EOFError:
            break
        except Exception as e:
            print(f"Input error: {e}")
async def run(self, chat_message: str = None, interactive: bool = True) -> None:
    """
    Run the client.

    Connects, starts the mic input stream plus the sender/receiver/playback
    tasks, then either sends one chat message, runs the interactive loop, or
    idles until stopped. Always cleans up tasks and closes on exit.

    Args:
        chat_message: Optional single chat message to send
        interactive: Whether to run in interactive mode
    """
    try:
        await self.connect()
        # Wait for answer
        await asyncio.sleep(0.5)
        # Start audio input stream
        print("Starting audio streams...")
        input_stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype=np.float32,
            blocksize=self.chunk_samples,
            device=self.input_device,
            callback=self._audio_input_callback
        )
        input_stream.start()
        print("Audio streams started")
        # Start background tasks
        sender_task = asyncio.create_task(self.audio_sender())
        receiver_task = asyncio.create_task(self.receiver())
        playback_task = asyncio.create_task(self._playback_task())
        if chat_message:
            # Send single message and wait (fixed 15s) for the reply to play.
            await self.send_chat(chat_message)
            await asyncio.sleep(15)
        elif interactive:
            # Run interactive mode
            await self.interactive_mode()
        else:
            # Just wait
            while self.running:
                await asyncio.sleep(0.1)
        # Cleanup: stop loops, cancel tasks, swallow the cancellations.
        self.running = False
        sender_task.cancel()
        receiver_task.cancel()
        playback_task.cancel()
        try:
            await sender_task
        except asyncio.CancelledError:
            pass
        try:
            await receiver_task
        except asyncio.CancelledError:
            pass
        try:
            await playback_task
        except asyncio.CancelledError:
            pass
        input_stream.stop()
    except ConnectionRefusedError:
        print(f"Error: Could not connect to {self.url}")
        print("Make sure the server is running.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        await self.close()
async def close(self) -> None:
    """Stop the client, close the socket, and print transfer totals."""
    self.running = False
    ws = self.ws
    if ws:
        await ws.close()
    print("\nSession ended")
    print(f" Total sent: {self.bytes_sent / 1024:.1f} KB")
    print(f" Total received: {self.bytes_received / 1024:.1f} KB")
def list_devices():
    """Print an indexed table of audio devices with their IO capabilities."""
    print("\nAvailable audio devices:")
    print("-" * 60)
    default_in = sd.default.device[0]
    default_out = sd.default.device[1]
    for idx, dev in enumerate(sd.query_devices()):
        caps = []
        if dev['max_input_channels'] > 0:
            caps.append("IN")
        if dev['max_output_channels'] > 0:
            caps.append("OUT")
        caps_str = "/".join(caps) if caps else "N/A"
        tags = ""
        if idx == default_in:
            tags += " [DEFAULT INPUT]"
        if idx == default_out:
            tags += " [DEFAULT OUTPUT]"
        print(f" {idx:2d}: {dev['name'][:40]:40s} ({caps_str}){tags}")
    print("-" * 60)
async def main():
    """Parse CLI options, optionally list devices, then run the client."""
    parser = argparse.ArgumentParser(
        description="Microphone client for duplex voice conversation"
    )
    parser.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL"
    )
    parser.add_argument(
        "--chat",
        help="Send a single chat message instead of using microphone"
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Audio sample rate (default: 16000)"
    )
    parser.add_argument(
        "--input-device",
        type=int,
        help="Input device ID"
    )
    parser.add_argument(
        "--output-device",
        type=int,
        help="Output device ID"
    )
    parser.add_argument(
        "--list-devices",
        action="store_true",
        help="List available audio devices and exit"
    )
    parser.add_argument(
        "--no-interactive",
        action="store_true",
        help="Disable interactive mode"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show streaming LLM response chunks"
    )
    args = parser.parse_args()
    if args.list_devices:
        list_devices()
        return
    client = MicrophoneClient(
        url=args.url,
        sample_rate=args.sample_rate,
        input_device=args.input_device,
        output_device=args.output_device
    )
    client.verbose = args.verbose
    await client.run(
        chat_message=args.chat,
        interactive=not args.no_interactive
    )
# Script entry point: run the async main() and exit quietly on Ctrl+C.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")

285
examples/simple_client.py Normal file
View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
Simple WebSocket client for testing voice conversation.
Uses PyAudio for more reliable audio playback on Windows.
Usage:
python examples/simple_client.py
python examples/simple_client.py --text "Hello"
"""
import argparse
import asyncio
import json
import sys
import time
import wave
import io
try:
import numpy as np
except ImportError:
print("pip install numpy")
sys.exit(1)
try:
import websockets
except ImportError:
print("pip install websockets")
sys.exit(1)
# Try PyAudio first (more reliable on Windows)
try:
import pyaudio
PYAUDIO_AVAILABLE = True
except ImportError:
PYAUDIO_AVAILABLE = False
print("PyAudio not available, trying sounddevice...")
try:
import sounddevice as sd
SD_AVAILABLE = True
except ImportError:
SD_AVAILABLE = False
if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
print("Please install pyaudio or sounddevice:")
print(" pip install pyaudio")
print(" or: pip install sounddevice")
sys.exit(1)
class SimpleVoiceClient:
    """Simple voice client with reliable audio playback."""
def __init__(self, url: str, sample_rate: int = 16000):
    """Set up client state; no I/O happens until connect().

    Args:
        url: WebSocket server URL.
        sample_rate: PCM sample rate in Hz (16-bit mono assumed).
    """
    self.url = url
    self.sample_rate = sample_rate
    self.ws = None
    self.running = False
    # Audio buffer
    self.audio_buffer = b""
    # PyAudio setup (output stream is opened lazily on first playback)
    if PYAUDIO_AVAILABLE:
        self.pa = pyaudio.PyAudio()
        self.stream = None
    # Stats
    self.bytes_received = 0
    # TTFB tracking (Time to First Byte)
    self.request_start_time = None
    self.first_audio_received = False
    # Interrupt handling - discard audio until next trackStart
    self._discard_audio = False
async def connect(self):
    """Open the WebSocket and send the initial invite command."""
    print(f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    print("Connected!")
    # Negotiate codec and sample rate.
    invite = {
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    }
    await self.ws.send(json.dumps(invite))
    print("-> invite")
async def send_chat(self, text: str):
    """Send a text chat message and arm TTFB measurement."""
    # New request: restart the time-to-first-audio clock.
    self.first_audio_received = False
    self.request_start_time = time.time()
    payload = json.dumps({"command": "chat", "text": text})
    await self.ws.send(payload)
    print(f"-> chat: {text}")
def play_audio(self, audio_data: bytes):
    """Play audio data immediately.

    Blocks until this chunk has been written/played. Prefers PyAudio when
    available; falls back to sounddevice otherwise.
    """
    if len(audio_data) == 0:
        return
    if PYAUDIO_AVAILABLE:
        # Use PyAudio - more reliable on Windows
        if self.stream is None:
            # Lazily open a mono 16-bit output stream on first use.
            self.stream = self.pa.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                output=True,
                frames_per_buffer=1024
            )
        self.stream.write(audio_data)
    elif SD_AVAILABLE:
        # Use sounddevice: convert s16 PCM to float32 in [-1, 1].
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
        sd.play(samples, self.sample_rate, blocking=True)
async def receive_loop(self):
    """Receive and play audio.

    Binary frames are PCM audio (played synchronously in an executor);
    text frames are JSON events. Honors the interrupt/trackStart protocol
    so stale audio is dropped after an interrupt.
    """
    print("\nWaiting for response...")
    while self.running:
        try:
            msg = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
            if isinstance(msg, bytes):
                # Audio data
                self.bytes_received += len(msg)
                # 2 bytes per int16 sample -> milliseconds of audio.
                duration_ms = len(msg) / (self.sample_rate * 2) * 1000
                # Check if we should discard this audio (after interrupt)
                if self._discard_audio:
                    print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms) [DISCARDED]")
                    continue
                # Calculate and display TTFB for first audio packet
                if not self.first_audio_received and self.request_start_time:
                    client_ttfb_ms = (time.time() - self.request_start_time) * 1000
                    self.first_audio_received = True
                    print(f"<- [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
                print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)")
                # Play immediately in executor to not block
                loop = asyncio.get_event_loop()
                await loop.run_in_executor(None, self.play_audio, msg)
            else:
                # JSON event
                event = json.loads(msg)
                etype = event.get("event", "?")
                if etype == "transcript":
                    # User speech transcription
                    text = event.get("text", "")
                    is_final = event.get("isFinal", False)
                    if is_final:
                        print(f"<- You said: {text}")
                    else:
                        print(f"<- [listening] {text}", end="\r")
                elif etype == "ttfb":
                    # Server-side TTFB event
                    latency_ms = event.get("latencyMs", 0)
                    print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
                elif etype == "trackStart":
                    # New track starting - accept audio again
                    self._discard_audio = False
                    print(f"<- {etype}")
                elif etype == "interrupt":
                    # Interrupt - discard audio until next trackStart
                    self._discard_audio = True
                    print(f"<- {etype} (discarding audio until new track)")
                elif etype == "hangup":
                    print(f"<- {etype}")
                    self.running = False
                    break
                else:
                    print(f"<- {etype}")
        except asyncio.TimeoutError:
            continue
        except websockets.ConnectionClosed:
            print("Connection closed")
            self.running = False
            break
async def run(self, text: str = None):
    """Run the client.

    With *text*: send one chat message and wait a fixed 30s for the spoken
    reply. Without: interactive loop reading messages from stdin.
    """
    try:
        await self.connect()
        await asyncio.sleep(0.5)
        # Start receiver
        recv_task = asyncio.create_task(self.receive_loop())
        if text:
            await self.send_chat(text)
            # Wait for response
            await asyncio.sleep(30)
        else:
            # Interactive mode
            print("\nType a message and press Enter (or 'quit' to exit):")
            while self.running:
                try:
                    # input() would block the loop; run in a worker thread.
                    user_input = await asyncio.get_event_loop().run_in_executor(
                        None, input, "> "
                    )
                    if user_input.lower() == 'quit':
                        break
                    if user_input.strip():
                        await self.send_chat(user_input)
                except EOFError:
                    break
        self.running = False
        recv_task.cancel()
        try:
            await recv_task
        except asyncio.CancelledError:
            pass
    finally:
        await self.close()
async def close(self):
    """Tear down audio resources and the WebSocket, then print stats."""
    self.running = False
    if PYAUDIO_AVAILABLE:
        stream = self.stream
        if stream:
            stream.stop_stream()
            stream.close()
        self.pa.terminate()
    if self.ws:
        await self.ws.close()
    print(f"\nTotal audio received: {self.bytes_received / 1024:.1f} KB")
def list_audio_devices():
    """List available audio output devices for whichever backends loaded."""
    print("\n=== Audio Devices ===")
    if PYAUDIO_AVAILABLE:
        pa = pyaudio.PyAudio()
        print("\nPyAudio devices:")
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            if info['maxOutputChannels'] > 0:
                default = " [DEFAULT]" if i == pa.get_default_output_device_info()['index'] else ""
                print(f" {i}: {info['name']}{default}")
        pa.terminate()
    if SD_AVAILABLE:
        print("\nSounddevice devices:")
        for i, d in enumerate(sd.query_devices()):
            if d['max_output_channels'] > 0:
                # sd.default.device[1] is the default output device index.
                default = " [DEFAULT]" if i == sd.default.device[1] else ""
                print(f" {i}: {d['name']}{default}")
async def main():
    """Parse CLI arguments and run the simple voice client."""
    parser = argparse.ArgumentParser(description="Simple voice client")
    parser.add_argument("--url", default="ws://localhost:8000/ws")
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
    args = parser.parse_args()
    if args.list_devices:
        list_audio_devices()
        return
    await SimpleVoiceClient(args.url, args.sample_rate).run(args.text)
# Script entry point.
if __name__ == "__main__":
    asyncio.run(main())

176
examples/test_websocket.py Normal file
View File

@@ -0,0 +1,176 @@
"""WebSocket endpoint test client.
Tests the /ws endpoint with sine wave or file audio streaming.
Based on reference/py-active-call/exec/test_ws_endpoint/test_ws.py
"""
import asyncio
import aiohttp
import json
import struct
import math
import argparse
import os
from datetime import datetime
# Configuration (defaults; URL and input are overridable via CLI flags)
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
FREQUENCY = 440  # 440Hz Sine Wave
CHUNK_DURATION_MS = 20
# 16kHz * 16-bit (2 bytes) * 20ms = 640 bytes per chunk
CHUNK_SIZE_BYTES = int(SAMPLE_RATE * 2 * (CHUNK_DURATION_MS / 1000.0))
def generate_sine_wave(duration_ms=1000, sample_rate=None, frequency=None):
    """Generate mono 16-bit little-endian PCM containing a pure sine tone.

    Generalized: rate and frequency are now parameters; the defaults fall
    back to the module-level SAMPLE_RATE / FREQUENCY, so existing callers
    are unaffected.

    Args:
        duration_ms: Length of audio to generate, in milliseconds.
        sample_rate: Samples per second (None -> module SAMPLE_RATE).
        frequency: Tone frequency in Hz (None -> module FREQUENCY).

    Returns:
        bytearray of '<h'-packed samples (2 bytes per sample).
    """
    rate = SAMPLE_RATE if sample_rate is None else sample_rate
    freq = FREQUENCY if frequency is None else frequency
    num_samples = int(rate * (duration_ms / 1000.0))
    audio_data = bytearray()
    for x in range(num_samples):
        # Full-scale (32767) sine sample at time x / rate seconds.
        value = int(32767.0 * math.sin(2 * math.pi * freq * x / rate))
        # Pack as little-endian 16-bit integer
        audio_data.extend(struct.pack('<h', value))
    return audio_data
async def receive_loop(ws, ready_event: asyncio.Event):
    """Listen for incoming messages from the server.

    Sets *ready_event* when a "session.started" event arrives so the
    sender side knows the handshake completed.
    """
    print("👂 Listening for server responses...")
    async for msg in ws:
        timestamp = datetime.now().strftime("%H:%M:%S")
        if msg.type == aiohttp.WSMsgType.TEXT:
            try:
                data = json.loads(msg.data)
                event_type = data.get('type', 'Unknown')
                print(f"[{timestamp}] 📨 Event: {event_type} | {msg.data[:150]}...")
                if event_type == "session.started":
                    ready_event.set()
            except json.JSONDecodeError:
                print(f"[{timestamp}] 📨 Text: {msg.data[:100]}...")
        elif msg.type == aiohttp.WSMsgType.BINARY:
            # Received audio chunk back (e.g., TTS or echo)
            print(f"[{timestamp}] 🔊 Audio: {len(msg.data)} bytes", end="\r")
        elif msg.type == aiohttp.WSMsgType.CLOSED:
            print(f"\n[{timestamp}] ❌ Socket Closed")
            break
        elif msg.type == aiohttp.WSMsgType.ERROR:
            print(f"\n[{timestamp}] ⚠️ Socket Error")
            break
async def send_file_loop(ws, file_path):
    """Stream a raw PCM/WAV file to the server.

    NOTE(review): assumes a canonical 44-byte WAV header; files with extra
    chunks (LIST/fact) would need real header parsing - confirm inputs.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: File '{file_path}' not found.")
        return
    print(f"📂 Streaming file: {file_path} ...")
    with open(file_path, "rb") as f:
        # Skip WAV header if present (first 44 bytes)
        if file_path.endswith('.wav'):
            f.read(44)
        while True:
            chunk = f.read(CHUNK_SIZE_BYTES)
            if not chunk:
                break
            # Send binary frame
            await ws.send_bytes(chunk)
            # Sleep to simulate real-time playback
            await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
    print(f"\n✅ Finished streaming {file_path}")
async def send_sine_loop(ws):
    """Stream generated sine wave to the server."""
    print("🎙️ Starting Audio Stream (Sine Wave)...")
    # Generate 5 seconds of audio up front, then stream it in real time.
    audio_buffer = generate_sine_wave(5000)
    cursor = 0
    while cursor < len(audio_buffer):
        chunk = audio_buffer[cursor:cursor + CHUNK_SIZE_BYTES]
        if not chunk:
            break
        await ws.send_bytes(chunk)
        cursor += len(chunk)
        # Pace at one chunk per CHUNK_DURATION_MS to mimic live capture.
        await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
    print("\n✅ Finished streaming test audio.")
async def run_client(url, file_path=None, use_sine=False):
    """Run the WebSocket test client.

    Connects, performs the v1 hello/session.start handshake, waits for
    session.started (signalled by receive_loop), streams audio (sine wave
    by default, or *file_path*), then sends session.stop and cleans up.
    """
    session = aiohttp.ClientSession()
    try:
        print(f"🔌 Connecting to {url}...")
        async with session.ws_connect(url) as ws:
            print("✅ Connected!")
            session_ready = asyncio.Event()
            recv_task = asyncio.create_task(receive_loop(ws, session_ready))
            # Send v1 hello + session.start handshake
            await ws.send_json({"type": "hello", "version": "v1"})
            await ws.send_json({
                "type": "session.start",
                "audio": {
                    "encoding": "pcm_s16le",
                    "sample_rate_hz": SAMPLE_RATE,
                    "channels": 1
                }
            })
            print("📤 Sent v1 hello/session.start")
            # Give the server up to 8s to acknowledge the session.
            await asyncio.wait_for(session_ready.wait(), timeout=8)
            # Select sender based on args
            if use_sine:
                await send_sine_loop(ws)
            elif file_path:
                await send_file_loop(ws, file_path)
            else:
                # Default to sine wave
                await send_sine_loop(ws)
            await ws.send_json({"type": "session.stop", "reason": "test_complete"})
            # Linger briefly so trailing server messages are received.
            await asyncio.sleep(1)
            recv_task.cancel()
            try:
                await recv_task
            except asyncio.CancelledError:
                pass
    except aiohttp.ClientConnectorError:
        print(f"❌ Connection Failed. Is the server running at {url}?")
    except asyncio.TimeoutError:
        print("❌ Timeout waiting for session.started")
    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        await session.close()
# Script entry point: parse args and run the client until done or Ctrl+C.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WebSocket Audio Test Client")
    parser.add_argument("--url", default=SERVER_URL, help="WebSocket endpoint URL")
    parser.add_argument("--file", help="Path to PCM/WAV file to stream")
    parser.add_argument("--sine", action="store_true", help="Use sine wave generation (default)")
    args = parser.parse_args()
    try:
        asyncio.run(run_client(args.url, args.file, args.sine))
    except KeyboardInterrupt:
        print("\n👋 Client stopped.")

504
examples/wav_client.py Normal file
View File

@@ -0,0 +1,504 @@
#!/usr/bin/env python3
"""
WAV file client for testing duplex voice conversation.
This client reads audio from a WAV file, sends it to the server,
and saves the AI's voice response to an output WAV file.
Usage:
python examples/wav_client.py --input input.wav --output response.wav
python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
Requirements:
pip install soundfile websockets numpy
"""
import argparse
import asyncio
import json
import sys
import time
import wave
from pathlib import Path
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
try:
import soundfile as sf
except ImportError:
print("Please install soundfile: pip install soundfile")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """
def __init__(
    self,
    url: str,
    input_file: str,
    output_file: str,
    sample_rate: int = 16000,
    chunk_duration_ms: int = 20,
    wait_time: float = 15.0,
    verbose: bool = False
):
    """
    Initialize WAV file client.

    Args:
        url: WebSocket server URL
        input_file: Input WAV file path
        output_file: Output WAV file path
        sample_rate: Audio sample rate (Hz)
        chunk_duration_ms: Audio chunk duration (ms) for sending
        wait_time: Time to wait for response after sending (seconds)
        verbose: Enable verbose output
    """
    self.url = url
    self.input_file = Path(input_file)
    self.output_file = Path(output_file)
    self.sample_rate = sample_rate
    self.chunk_duration_ms = chunk_duration_ms
    # Samples per outgoing chunk at the target rate.
    self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
    self.wait_time = wait_time
    self.verbose = verbose
    # WebSocket connection
    self.ws = None
    self.running = False
    # Audio buffers
    self.received_audio = bytearray()
    # Statistics
    self.bytes_sent = 0
    self.bytes_received = 0
    # TTFB tracking (per response)
    self.send_start_time = None
    self.response_start_time = None  # set on each trackStart
    self.waiting_for_first_audio = False
    self.ttfb_ms = None  # last TTFB for summary
    self.ttfb_list = []  # TTFB for each response
    # State tracking
    self.track_started = False
    self.track_ended = False
    self.send_completed = False
    # Events log
    self.events_log = []
def log_event(self, direction: str, message: str):
    """Record an event with a wall-clock timestamp and echo it to stdout."""
    self.events_log.append({
        "timestamp": time.time(),
        "direction": direction,
        "message": message,
    })
    # Handle encoding errors on Windows
    try:
        print(f"{direction} {message}")
    except UnicodeEncodeError:
        # Console codepage cannot render the text: degrade to ASCII with
        # replacement characters.
        fallback = message.encode('ascii', errors='replace').decode('ascii')
        print(f"{direction} {fallback}")
async def connect(self) -> None:
    """Connect to the WebSocket server and send the invite command."""
    self.log_event("", f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    self.log_event("", "Connected!")
    # Negotiate codec and sample rate.
    await self.send_command({
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    })
async def send_command(self, cmd: dict) -> None:
    """JSON-encode *cmd*, send it if connected, and log the command name."""
    if not self.ws:
        return
    await self.ws.send(json.dumps(cmd))
    self.log_event("", f"Command: {cmd.get('command', 'unknown')}")
async def send_hangup(self, reason: str = "Session complete") -> None:
    """Ask the server to terminate the session with the given reason."""
    await self.send_command({"command": "hangup", "reason": reason})
def load_wav_file(self) -> tuple[np.ndarray, int]:
    """
    Load and prepare WAV file for sending.

    Converts stereo to mono, resamples to self.sample_rate (naive linear
    interpolation - no anti-alias filter; presumably fine for speech test
    clips), and normalizes/casts to int16.

    Returns:
        Tuple of (audio_data as int16 numpy array, original sample rate)

    Raises:
        FileNotFoundError: if the input file does not exist.
    """
    if not self.input_file.exists():
        raise FileNotFoundError(f"Input file not found: {self.input_file}")
    # Load audio file
    audio_data, file_sample_rate = sf.read(self.input_file)
    self.log_event("", f"Loaded: {self.input_file}")
    self.log_event("", f" Original sample rate: {file_sample_rate} Hz")
    self.log_event("", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")
    # Convert stereo to mono if needed
    if len(audio_data.shape) > 1:
        audio_data = audio_data.mean(axis=1)
        self.log_event("", " Converted stereo to mono")
    # Resample if needed
    if file_sample_rate != self.sample_rate:
        # Simple resampling using numpy (linear interpolation)
        duration = len(audio_data) / file_sample_rate
        num_samples = int(duration * self.sample_rate)
        indices = np.linspace(0, len(audio_data) - 1, num_samples)
        audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
        self.log_event("", f" Resampled to {self.sample_rate} Hz")
    # Convert to int16
    if audio_data.dtype != np.int16:
        # Normalize to [-1, 1] if needed
        max_val = np.max(np.abs(audio_data))
        if max_val > 1.0:
            audio_data = audio_data / max_val
        audio_data = (audio_data * 32767).astype(np.int16)
    self.log_event("", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
    return audio_data, file_sample_rate
async def audio_sender(self, audio_data: np.ndarray) -> None:
"""Send audio data to server in chunks."""
total_samples = len(audio_data)
chunk_size = self.chunk_samples
sent_samples = 0
self.send_start_time = time.time()
self.log_event("", f"Starting audio transmission ({total_samples} samples)...")
while sent_samples < total_samples and self.running:
# Get next chunk
end_sample = min(sent_samples + chunk_size, total_samples)
chunk = audio_data[sent_samples:end_sample]
chunk_bytes = chunk.tobytes()
# Send to server
if self.ws:
await self.ws.send(chunk_bytes)
self.bytes_sent += len(chunk_bytes)
sent_samples = end_sample
# Progress logging (every 500ms worth of audio)
if self.verbose and sent_samples % (self.sample_rate // 2) == 0:
progress = (sent_samples / total_samples) * 100
print(f" Sending: {progress:.0f}%", end="\r")
# Delay to simulate real-time streaming
# Server expects audio at real-time pace for VAD/ASR to work properly
await asyncio.sleep(self.chunk_duration_ms / 1000)
self.send_completed = True
elapsed = time.time() - self.send_start_time
self.log_event("", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")
    async def receiver(self) -> None:
        """Receive messages from server.

        Binary frames are 16-bit PCM audio and are appended to
        ``self.received_audio``; text frames are JSON events dispatched to
        ``_handle_event``. Runs until ``self.running`` goes false, the
        connection closes, or the task is cancelled.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop can notice self.running changes.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        self.received_audio.extend(message)
                        # Calculate TTFB on first audio of each response
                        if self.waiting_for_first_audio and self.response_start_time is not None:
                            ttfb_ms = (time.time() - self.response_start_time) * 1000
                            self.ttfb_ms = ttfb_ms
                            self.ttfb_list.append(ttfb_ms)
                            self.waiting_for_first_audio = False
                            self.log_event("", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")
                        # Log progress
                        # 2 bytes per sample: 16-bit mono PCM.
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000
                        if self.verbose:
                            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
                    else:
                        # JSON event
                        event = json.loads(message)
                        await self._handle_event(event)
                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("", "Connection closed")
                    self.running = False
                    break
        except asyncio.CancelledError:
            # Normal shutdown path: run() cancels this task during cleanup.
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False
async def _handle_event(self, event: dict) -> None:
"""Handle incoming event."""
event_type = event.get("event", "unknown")
if event_type == "answer":
self.log_event("", "Session ready!")
elif event_type == "speaking":
self.log_event("", "Speech detected")
elif event_type == "silence":
self.log_event("", "Silence detected")
elif event_type == "transcript":
# ASR transcript (interim = asrDelta-style, final = asrFinal-style)
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
# Clear interim line and print final
print(" " * 80, end="\r")
self.log_event("", f"→ You: {text}")
else:
# Interim result - show with indicator (overwrite same line, as in mic_client)
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [listening] {display_text}".ljust(80), end="\r")
elif event_type == "ttfb":
latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "llmResponse":
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif self.verbose:
# Show streaming chunks only in verbose mode
self.log_event("", f"LLM: {text}")
elif event_type == "trackStart":
self.track_started = True
self.response_start_time = time.time()
self.waiting_for_first_audio = True
self.log_event("", "Bot started speaking")
elif event_type == "trackEnd":
self.track_ended = True
self.log_event("", "Bot finished speaking")
elif event_type == "interrupt":
self.log_event("", "Bot interrupted!")
elif event_type == "error":
self.log_event("!", f"Error: {event.get('error')}")
elif event_type == "hangup":
self.log_event("", f"Hangup: {event.get('reason')}")
self.running = False
else:
self.log_event("", f"Event: {event_type}")
def save_output_wav(self) -> None:
"""Save received audio to output WAV file."""
if not self.received_audio:
self.log_event("!", "No audio received to save")
return
# Convert bytes to numpy array
audio_data = np.frombuffer(bytes(self.received_audio), dtype=np.int16)
# Ensure output directory exists
self.output_file.parent.mkdir(parents=True, exist_ok=True)
# Save using wave module for compatibility
with wave.open(str(self.output_file), 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # 16-bit
wav_file.setframerate(self.sample_rate)
wav_file.writeframes(audio_data.tobytes())
duration = len(audio_data) / self.sample_rate
self.log_event("", f"Saved output: {self.output_file}")
self.log_event("", f" Duration: {duration:.2f}s ({len(audio_data)} samples)")
self.log_event("", f" Size: {len(self.received_audio)/1024:.1f} KB")
    async def run(self) -> None:
        """Run the WAV file test.

        Full session lifecycle: load the input WAV, connect, stream the
        audio, wait for the bot's spoken response, save it to the output
        file, and print a summary. Exits the process on fatal errors.
        """
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()
            # Connect to server
            await self.connect()
            # Wait for answer
            await asyncio.sleep(0.5)
            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())
            # Send audio
            await self.audio_sender(audio_data)
            # Wait for response
            self.log_event("", f"Waiting {self.wait_time}s for response...")
            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)
            # Cleanup
            self.running = False
            receiver_task.cancel()
            try:
                await receiver_task
            except asyncio.CancelledError:
                pass
            # Save output
            self.save_output_wav()
            # Print summary
            self._print_summary()
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            # Always close the WebSocket, even on error/exit paths.
            await self.close()
def _print_summary(self):
"""Print session summary."""
print("\n" + "=" * 50)
print("Session Summary")
print("=" * 50)
print(f" Input file: {self.input_file}")
print(f" Output file: {self.output_file}")
print(f" Bytes sent: {self.bytes_sent / 1024:.1f} KB")
print(f" Bytes received: {self.bytes_received / 1024:.1f} KB")
if self.ttfb_list:
if len(self.ttfb_list) == 1:
print(f" TTFB: {self.ttfb_list[0]:.0f} ms")
else:
print(f" TTFB (per response): {', '.join(f'{t:.0f}ms' for t in self.ttfb_list)}")
if self.received_audio:
duration = len(self.received_audio) / (self.sample_rate * 2)
print(f" Response duration: {duration:.2f}s")
print("=" * 50)
async def close(self) -> None:
"""Close the connection."""
self.running = False
if self.ws:
try:
await self.ws.close()
except:
pass
async def main():
    """Parse CLI arguments, build a WavFileClient, and run one session."""
    parser = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    parser.add_argument("--input", "-i", required=True, help="Input WAV file path")
    parser.add_argument("--output", "-o", required=True, help="Output WAV file path for response")
    parser.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL (default: ws://localhost:8000/ws)",
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Target sample rate for audio (default: 16000)",
    )
    parser.add_argument(
        "--chunk-duration",
        type=int,
        default=20,
        help="Chunk duration in ms for sending (default: 20)",
    )
    parser.add_argument(
        "--wait-time", "-w",
        type=float,
        default=15.0,
        help="Time to wait for response after sending (default: 15.0)",
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    args = parser.parse_args()

    client = WavFileClient(
        url=args.url,
        input_file=args.input,
        output_file=args.output,
        sample_rate=args.sample_rate,
        chunk_duration_ms=args.chunk_duration,
        wait_time=args.wait_time,
        verbose=args.verbose,
    )
    await client.run()
# Script entry point: run the async client; Ctrl-C exits with a short notice.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")

766
examples/web_client.html Normal file
View File

@@ -0,0 +1,766 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Duplex Voice Web Client</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");
:root {
--bg: #0b0b0f;
--panel: #14141c;
--panel-2: #101018;
--ink: #f2f3f7;
--muted: #a7acba;
--accent: #ff6b6b;
--accent-2: #ffd166;
--good: #2dd4bf;
--bad: #f87171;
--grid: rgba(255, 255, 255, 0.06);
--shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
}
* {
box-sizing: border-box;
}
html,
body {
height: 100%;
margin: 0;
color: var(--ink);
background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
var(--bg);
font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
}
.noise {
position: fixed;
inset: 0;
background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
pointer-events: none;
mix-blend-mode: soft-light;
}
header {
padding: 32px 28px 18px;
border-bottom: 1px solid var(--grid);
}
h1 {
font-family: "Fraunces", serif;
font-weight: 600;
margin: 0 0 6px;
letter-spacing: 0.4px;
}
.subtitle {
color: var(--muted);
font-size: 0.95rem;
}
main {
display: grid;
grid-template-columns: 1.1fr 1.4fr;
gap: 24px;
padding: 24px 28px 40px;
}
.panel {
background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
var(--panel);
border: 1px solid var(--grid);
border-radius: 16px;
padding: 20px;
box-shadow: var(--shadow);
}
.panel h2 {
margin: 0 0 12px;
font-size: 1.05rem;
font-weight: 600;
}
.stack {
display: grid;
gap: 12px;
}
label {
display: block;
font-size: 0.85rem;
color: var(--muted);
margin-bottom: 6px;
}
input,
select,
button,
textarea {
font-family: inherit;
}
input,
select,
textarea {
width: 100%;
padding: 10px 12px;
border-radius: 10px;
border: 1px solid var(--grid);
background: var(--panel-2);
color: var(--ink);
outline: none;
}
textarea {
min-height: 80px;
resize: vertical;
}
.row {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.btn-row {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
button {
border: none;
border-radius: 999px;
padding: 10px 16px;
font-weight: 600;
background: var(--ink);
color: #111;
cursor: pointer;
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
button.secondary {
background: transparent;
color: var(--ink);
border: 1px solid var(--grid);
}
button.accent {
background: linear-gradient(120deg, var(--accent), #f97316);
color: #0b0b0f;
}
button.good {
background: linear-gradient(120deg, var(--good), #22c55e);
color: #07261f;
}
button.bad {
background: linear-gradient(120deg, var(--bad), #f97316);
color: #2a0b0b;
}
button:active {
transform: translateY(1px) scale(0.99);
}
.status {
display: flex;
align-items: center;
gap: 12px;
padding: 12px;
background: rgba(255, 255, 255, 0.03);
border-radius: 12px;
border: 1px dashed var(--grid);
font-size: 0.9rem;
}
.dot {
width: 10px;
height: 10px;
border-radius: 999px;
background: var(--bad);
box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
}
.dot.on {
background: var(--good);
box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
}
.log {
height: 320px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.85rem;
line-height: 1.4;
}
.chat {
height: 260px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.9rem;
line-height: 1.45;
}
.chat-entry {
padding: 8px 10px;
margin-bottom: 8px;
border-radius: 10px;
background: rgba(255, 255, 255, 0.04);
border: 1px solid rgba(255, 255, 255, 0.06);
}
.chat-entry.user {
border-left: 3px solid var(--accent-2);
}
.chat-entry.ai {
border-left: 3px solid var(--good);
}
.chat-entry.interim {
opacity: 0.7;
font-style: italic;
}
.log-entry {
padding: 6px 8px;
border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
}
.log-entry:last-child {
border-bottom: none;
}
.tag {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 2px 8px;
border-radius: 999px;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.6px;
background: rgba(255, 255, 255, 0.08);
color: var(--muted);
}
.tag.event {
background: rgba(255, 107, 107, 0.18);
color: #ffc1c1;
}
.tag.audio {
background: rgba(45, 212, 191, 0.2);
color: #c5f9f0;
}
.tag.sys {
background: rgba(255, 209, 102, 0.2);
color: #ffefb0;
}
.muted {
color: var(--muted);
}
footer {
padding: 0 28px 28px;
color: var(--muted);
font-size: 0.8rem;
}
@media (max-width: 1100px) {
main {
grid-template-columns: 1fr;
}
.log {
height: 360px;
}
.chat {
height: 260px;
}
}
</style>
</head>
<body>
<div class="noise"></div>
<header>
<h1>Duplex Voice Client</h1>
<div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
</header>
<main>
<section class="panel stack">
<h2>Connection</h2>
<div>
<label for="wsUrl">WebSocket URL</label>
<input id="wsUrl" value="ws://localhost:8000/ws" />
</div>
<div class="btn-row">
<button class="accent" id="connectBtn">Connect</button>
<button class="secondary" id="disconnectBtn">Disconnect</button>
</div>
<div class="status">
<div id="statusDot" class="dot"></div>
<div>
<div id="statusText">Disconnected</div>
<div class="muted" id="statusSub">Waiting for connection</div>
</div>
</div>
<h2>Devices</h2>
<div class="row">
<div>
<label for="inputSelect">Input (Mic)</label>
<select id="inputSelect"></select>
</div>
<div>
<label for="outputSelect">Output (Speaker)</label>
<select id="outputSelect"></select>
</div>
</div>
<div class="btn-row">
<button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
<button class="good" id="startMicBtn">Start Mic</button>
<button class="secondary" id="stopMicBtn">Stop Mic</button>
</div>
<h2>Chat</h2>
<div class="stack">
<textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
<div class="btn-row">
<button class="accent" id="sendChatBtn">Send Chat</button>
<button class="secondary" id="clearLogBtn">Clear Log</button>
</div>
</div>
</section>
<section class="stack">
<div class="panel stack">
<h2>Chat History</h2>
<div class="chat" id="chatHistory"></div>
</div>
<div class="panel stack">
<h2>Event Log</h2>
<div class="log" id="log"></div>
</div>
</section>
</main>
<footer>
Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
</footer>
<audio id="audioOut" autoplay></audio>
<script>
const wsUrl = document.getElementById("wsUrl");
const connectBtn = document.getElementById("connectBtn");
const disconnectBtn = document.getElementById("disconnectBtn");
const inputSelect = document.getElementById("inputSelect");
const outputSelect = document.getElementById("outputSelect");
const startMicBtn = document.getElementById("startMicBtn");
const stopMicBtn = document.getElementById("stopMicBtn");
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
const sendChatBtn = document.getElementById("sendChatBtn");
const clearLogBtn = document.getElementById("clearLogBtn");
const chatInput = document.getElementById("chatInput");
const logEl = document.getElementById("log");
const chatHistory = document.getElementById("chatHistory");
const statusDot = document.getElementById("statusDot");
const statusText = document.getElementById("statusText");
const statusSub = document.getElementById("statusSub");
const audioOut = document.getElementById("audioOut");
let ws = null;
let audioCtx = null;
let micStream = null;
let processor = null;
let micSource = null;
let playbackDest = null;
let playbackTime = 0;
let discardAudio = false;
let playbackSources = [];
let interimUserEl = null;
let interimAiEl = null;
let interimUserText = "";
let interimAiText = "";
const targetSampleRate = 16000;
const playbackStopRampSec = 0.008;
function logLine(type, text, data) {
const time = new Date().toLocaleTimeString();
const entry = document.createElement("div");
entry.className = "log-entry";
const tag = document.createElement("span");
tag.className = `tag ${type}`;
tag.textContent = type.toUpperCase();
const msg = document.createElement("span");
msg.style.marginLeft = "10px";
msg.textContent = `[${time}] ${text}`;
entry.appendChild(tag);
entry.appendChild(msg);
if (data) {
const pre = document.createElement("div");
pre.className = "muted";
pre.textContent = JSON.stringify(data);
pre.style.marginTop = "4px";
entry.appendChild(pre);
}
logEl.appendChild(entry);
logEl.scrollTop = logEl.scrollHeight;
}
function addChat(role, text) {
const entry = document.createElement("div");
entry.className = `chat-entry ${role === "AI" ? "ai" : "user"}`;
entry.textContent = `${role}: ${text}`;
chatHistory.appendChild(entry);
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function setInterim(role, text) {
const isAi = role === "AI";
let el = isAi ? interimAiEl : interimUserEl;
if (!text) {
if (el) el.remove();
if (isAi) interimAiEl = null;
else interimUserEl = null;
if (isAi) interimAiText = "";
else interimUserText = "";
return;
}
if (!el) {
el = document.createElement("div");
el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
chatHistory.appendChild(el);
if (isAi) interimAiEl = el;
else interimUserEl = el;
}
el.textContent = `${role} (interim): ${text}`;
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function stopPlayback() {
discardAudio = true;
const now = audioCtx ? audioCtx.currentTime : 0;
playbackTime = now;
playbackSources.forEach((node) => {
try {
if (audioCtx && node.gainNode && node.source) {
node.gainNode.gain.cancelScheduledValues(now);
node.gainNode.gain.setValueAtTime(node.gainNode.gain.value || 1, now);
node.gainNode.gain.linearRampToValueAtTime(0, now + playbackStopRampSec);
node.source.stop(now + playbackStopRampSec + 0.002);
} else if (node.source) {
node.source.stop();
}
} catch (err) {}
});
playbackSources = [];
}
function setStatus(connected, detail) {
statusDot.classList.toggle("on", connected);
statusText.textContent = connected ? "Connected" : "Disconnected";
statusSub.textContent = detail || "";
}
async function ensureAudioContext() {
if (audioCtx) return;
audioCtx = new (window.AudioContext || window.webkitAudioContext)();
playbackDest = audioCtx.createMediaStreamDestination();
audioOut.srcObject = playbackDest.stream;
try {
await audioOut.play();
} catch (err) {
logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
}
if (outputSelect.value) {
await setOutputDevice(outputSelect.value);
}
}
function downsampleBuffer(buffer, inRate, outRate) {
if (outRate === inRate) return buffer;
const ratio = inRate / outRate;
const newLength = Math.round(buffer.length / ratio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = accum / count;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
function floatTo16BitPCM(float32) {
const out = new Int16Array(float32.length);
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
}
return out;
}
function schedulePlayback(int16Data) {
if (!audioCtx || !playbackDest) return;
if (discardAudio) return;
const float32 = new Float32Array(int16Data.length);
for (let i = 0; i < int16Data.length; i++) {
float32[i] = int16Data[i] / 32768;
}
const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
buffer.copyToChannel(float32, 0);
const source = audioCtx.createBufferSource();
const gainNode = audioCtx.createGain();
source.buffer = buffer;
source.connect(gainNode);
gainNode.connect(playbackDest);
const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
gainNode.gain.setValueAtTime(1, startTime);
source.start(startTime);
playbackTime = startTime + buffer.duration;
const playbackNode = { source, gainNode };
playbackSources.push(playbackNode);
source.onended = () => {
playbackSources = playbackSources.filter((s) => s !== playbackNode);
};
}
async function connect() {
if (ws && ws.readyState === WebSocket.OPEN) return;
ws = new WebSocket(wsUrl.value.trim());
ws.binaryType = "arraybuffer";
ws.onopen = () => {
setStatus(true, "Session open");
logLine("sys", "WebSocket connected");
ensureAudioContext();
sendCommand({ type: "hello", version: "v1" });
};
ws.onclose = () => {
setStatus(false, "Connection closed");
logLine("sys", "WebSocket closed");
ws = null;
};
ws.onerror = (err) => {
logLine("sys", "WebSocket error", { err: String(err) });
};
ws.onmessage = (msg) => {
if (typeof msg.data === "string") {
const event = JSON.parse(msg.data);
handleEvent(event);
} else {
const audioBuf = msg.data;
const int16 = new Int16Array(audioBuf);
schedulePlayback(int16);
logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
}
};
}
function disconnect() {
if (ws && ws.readyState === WebSocket.OPEN) {
sendCommand({ type: "session.stop", reason: "client_disconnect" });
ws.close();
}
ws = null;
setStatus(false, "Disconnected");
}
function sendCommand(cmd) {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Not connected");
return;
}
ws.send(JSON.stringify(cmd));
logLine("sys", `${cmd.type}`, cmd);
}
function handleEvent(event) {
const type = event.type || "unknown";
logLine("event", type, event);
if (type === "hello.ack") {
sendCommand({
type: "session.start",
audio: { encoding: "pcm_s16le", sample_rate_hz: targetSampleRate, channels: 1 },
});
}
if (type === "transcript.final") {
if (event.text) {
setInterim("You", "");
addChat("You", event.text);
}
}
if (type === "transcript.delta" && event.text) {
setInterim("You", event.text);
}
if (type === "assistant.response.final") {
if (event.text) {
setInterim("AI", "");
addChat("AI", event.text);
}
}
if (type === "assistant.response.delta" && event.text) {
interimAiText += event.text;
setInterim("AI", interimAiText);
}
if (type === "output.audio.start") {
// New bot audio: stop any previous playback to avoid overlap
stopPlayback();
discardAudio = false;
interimAiText = "";
}
if (type === "input.speech_started") {
// User started speaking: clear any in-flight audio to avoid overlap
stopPlayback();
}
if (type === "response.interrupted") {
stopPlayback();
}
}
async function startMic() {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Connect before starting mic");
return;
}
await ensureAudioContext();
const deviceId = inputSelect.value || undefined;
micStream = await navigator.mediaDevices.getUserMedia({
audio: deviceId ? { deviceId: { exact: deviceId } } : true,
});
micSource = audioCtx.createMediaStreamSource(micStream);
processor = audioCtx.createScriptProcessor(2048, 1, 1);
processor.onaudioprocess = (e) => {
if (!ws || ws.readyState !== WebSocket.OPEN) return;
const input = e.inputBuffer.getChannelData(0);
const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
const pcm16 = floatTo16BitPCM(downsampled);
ws.send(pcm16.buffer);
};
micSource.connect(processor);
processor.connect(audioCtx.destination);
logLine("sys", "Microphone started");
}
function stopMic() {
if (processor) {
processor.disconnect();
processor = null;
}
if (micSource) {
micSource.disconnect();
micSource = null;
}
if (micStream) {
micStream.getTracks().forEach((t) => t.stop());
micStream = null;
}
logLine("sys", "Microphone stopped");
}
async function refreshDevices() {
const devices = await navigator.mediaDevices.enumerateDevices();
inputSelect.innerHTML = "";
outputSelect.innerHTML = "";
devices.forEach((d) => {
if (d.kind === "audioinput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Mic ${inputSelect.length + 1}`;
inputSelect.appendChild(opt);
}
if (d.kind === "audiooutput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Output ${outputSelect.length + 1}`;
outputSelect.appendChild(opt);
}
});
}
async function requestDeviceAccess() {
// Needed to reveal device labels in most browsers
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getTracks().forEach((t) => t.stop());
logLine("sys", "Microphone permission granted");
} catch (err) {
logLine("sys", "Microphone permission denied", { err: String(err) });
}
}
async function setOutputDevice(deviceId) {
if (!audioOut.setSinkId) {
logLine("sys", "setSinkId not supported in this browser");
return;
}
await audioOut.setSinkId(deviceId);
logLine("sys", `Output device set`, { deviceId });
}
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);
refreshDevicesBtn.addEventListener("click", async () => {
await requestDeviceAccess();
await refreshDevices();
});
startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);
sendChatBtn.addEventListener("click", () => {
const text = chatInput.value.trim();
if (!text) return;
ensureAudioContext();
addChat("You", text);
sendCommand({ type: "input.text", text });
chatInput.value = "";
});
clearLogBtn.addEventListener("click", () => {
logEl.innerHTML = "";
chatHistory.innerHTML = "";
setInterim("You", "");
setInterim("AI", "");
interimUserText = "";
interimAiText = "";
});
inputSelect.addEventListener("change", () => {
if (micStream) {
stopMic();
startMic();
}
});
outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
refreshDevices().catch(() => {});
</script>
</body>
</html>