Init commit

This commit is contained in:
Xin Wang
2026-02-17 10:39:23 +08:00
commit 30eb4397c2
56 changed files with 11983 additions and 0 deletions

601
examples/mic_client.py Normal file
View File

@@ -0,0 +1,601 @@
#!/usr/bin/env python3
"""
Microphone client for testing duplex voice conversation.
This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.
It also displays the LLM's text responses in the console.
Usage:
python examples/mic_client.py --url ws://localhost:8000/ws
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
python examples/mic_client.py --url ws://localhost:8000/ws --verbose
Requirements:
pip install sounddevice soundfile websockets numpy
"""
import argparse
import asyncio
import json
import sys
import time
import threading
import queue
from pathlib import Path
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice: pip install sounddevice")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class MicrophoneClient:
    """
    Full-duplex microphone client for voice conversation.

    Features:
    - Real-time microphone capture
    - Real-time speaker playback
    - WebSocket communication
    - Text chat support
    """
def __init__(
    self,
    url: str,
    sample_rate: int = 16000,
    chunk_duration_ms: int = 20,
    input_device: int = None,
    output_device: int = None
):
    """
    Initialize microphone client.

    Args:
        url: WebSocket server URL
        sample_rate: Audio sample rate (Hz)
        chunk_duration_ms: Audio chunk duration (ms)
        input_device: Input device ID (None for default)
        output_device: Output device ID (None for default)
    """
    self.url = url
    self.sample_rate = sample_rate
    self.chunk_duration_ms = chunk_duration_ms
    # Samples per outgoing chunk, e.g. 16000 Hz * 20 ms -> 320 samples.
    self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
    self.input_device = input_device
    self.output_device = output_device
    # WebSocket connection
    self.ws = None
    self.running = False
    # Audio buffers: mic chunks queued for sending, received PCM concatenated
    # into one bytes buffer consumed by the playback thread.
    self.audio_input_queue = queue.Queue()
    self.audio_output_buffer = b""  # Continuous buffer for smooth playback
    self.audio_output_lock = threading.Lock()
    # Statistics
    self.bytes_sent = 0
    self.bytes_received = 0
    # State
    self.is_recording = True
    self.is_playing = True
    # TTFB tracking (Time to First Byte)
    self.request_start_time = None
    self.first_audio_received = False
    # Interrupt handling - discard audio until next trackStart
    self._discard_audio = False
    self._audio_sequence = 0  # Track audio sequence to detect stale chunks
    # Verbose mode for streaming LLM responses
    self.verbose = False
async def connect(self) -> None:
    """Open the WebSocket session and send the initial invite command."""
    print(f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    print("Connected!")
    # The invite negotiates codec and sample rate with the server.
    invite = {
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    }
    await self.send_command(invite)
async def send_command(self, cmd: dict) -> None:
    """JSON-serialize *cmd* and send it over the socket, logging the name."""
    if not self.ws:
        return
    await self.ws.send(json.dumps(cmd))
    print(f"→ Command: {cmd.get('command', 'unknown')}")
async def send_chat(self, text: str) -> None:
    """Send a text chat message and arm TTFB (time-to-first-audio) tracking."""
    # New request: restart the time-to-first-audio clock.
    self.first_audio_received = False
    self.request_start_time = time.time()
    await self.send_command({"command": "chat", "text": text})
    print(f"→ Chat: {text}")
async def send_interrupt(self) -> None:
    """Ask the server to cut off the assistant's current speech."""
    await self.send_command({"command": "interrupt"})
async def send_hangup(self, reason: str = "User quit") -> None:
    """Terminate the session, passing *reason* to the server."""
    await self.send_command({"command": "hangup", "reason": reason})
def _audio_input_callback(self, indata, frames, time_info, status):
    """sounddevice InputStream callback: convert float capture to s16 PCM.

    Args:
        indata: float ndarray of shape (frames, channels) from sounddevice.
        frames: number of frames in this block.
        time_info: sounddevice timing struct. Was named ``time``, which
            shadowed the ``time`` module inside the callback - renamed.
        status: CallbackFlags reporting over-/underruns, if any.
    """
    if status:
        print(f"Input status: {status}")
    if self.is_recording and self.running:
        # Clip before scaling so float peaks above 1.0 cannot wrap around
        # when cast to int16; then convert channel 0 to 16-bit PCM bytes.
        mono = np.clip(indata[:, 0], -1.0, 1.0)
        audio_data = (mono * 32767).astype(np.int16).tobytes()
        self.audio_input_queue.put(audio_data)
def _add_audio_to_buffer(self, audio_data: bytes):
    """Append received PCM bytes to the playback buffer (thread-safe)."""
    with self.audio_output_lock:
        self.audio_output_buffer = self.audio_output_buffer + audio_data
def _playback_thread_func(self):
    """Thread target: drain the shared PCM buffer into the output device.

    Runs until self.running goes False. When the buffer holds less than one
    50 ms chunk, silence is emitted so the stream stays fed and timing stays
    smooth. (Removed an unused ``import time`` that shadowed the module.)
    """
    # Chunk size: 50ms of audio (2 bytes per int16 sample).
    chunk_samples = int(self.sample_rate * 0.05)
    chunk_bytes = chunk_samples * 2
    print(f"Audio playback thread started (device: {self.output_device or 'default'})")
    try:
        # Blocking-write output stream; stream.write() paces the loop.
        with sd.OutputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype='int16',
            blocksize=chunk_samples,
            device=self.output_device,
            latency='low'
        ) as stream:
            while self.running:
                # Take at most one chunk from the buffer under the lock.
                with self.audio_output_lock:
                    if len(self.audio_output_buffer) >= chunk_bytes:
                        audio_data = self.audio_output_buffer[:chunk_bytes]
                        self.audio_output_buffer = self.audio_output_buffer[chunk_bytes:]
                    else:
                        # Not enough audio - output silence
                        audio_data = b'\x00' * chunk_bytes
                # Convert to a (samples, 1) int16 array and write to stream.
                samples = np.frombuffer(audio_data, dtype=np.int16).reshape(-1, 1)
                stream.write(samples)
    except Exception as e:
        print(f"Playback thread error: {e}")
        import traceback
        traceback.print_exc()
async def _playback_task(self):
    """Spawn the playback thread, then poll until it dies or we stop."""
    worker = threading.Thread(target=self._playback_thread_func, daemon=True)
    worker.start()
    # Stay cooperative with the event loop while the thread runs.
    while self.running and worker.is_alive():
        await asyncio.sleep(0.1)
    print("Audio playback stopped")
async def audio_sender(self) -> None:
    """Forward captured microphone chunks from the queue to the server."""
    loop = asyncio.get_event_loop()
    while self.running:
        try:
            try:
                # Block in a worker thread so the event loop stays responsive.
                audio_data = await loop.run_in_executor(
                    None, lambda: self.audio_input_queue.get(timeout=0.1)
                )
            except queue.Empty:
                # Nothing captured within the timeout - poll again.
                continue
            if self.ws and self.is_recording:
                await self.ws.send(audio_data)
                self.bytes_sent += len(audio_data)
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f"Audio sender error: {e}")
            break
async def receiver(self) -> None:
    """Receive messages from server.

    Binary frames are PCM audio queued for playback (unless a recent
    interrupt told us to discard until the next trackStart); text frames
    are JSON events dispatched to _handle_event().
    """
    try:
        while self.running:
            try:
                # Short timeout so the loop re-checks self.running regularly.
                message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                if isinstance(message, bytes):
                    # Audio data received
                    self.bytes_received += len(message)
                    # Check if we should discard this audio (after interrupt)
                    if self._discard_audio:
                        # 2 bytes per int16 sample -> milliseconds of audio.
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        print(f"← Audio: {duration_ms:.0f}ms (DISCARDED - waiting for new track)")
                        continue
                    if self.is_playing:
                        self._add_audio_to_buffer(message)
                        # Calculate and display TTFB for first audio packet
                        if not self.first_audio_received and self.request_start_time:
                            client_ttfb_ms = (time.time() - self.request_start_time) * 1000
                            self.first_audio_received = True
                            print(f"← [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
                        # Show progress (less verbose)
                        with self.audio_output_lock:
                            buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
                else:
                    # JSON event
                    event = json.loads(message)
                    await self._handle_event(event)
            except asyncio.TimeoutError:
                continue
            except websockets.ConnectionClosed:
                print("Connection closed")
                self.running = False
                break
    except asyncio.CancelledError:
        pass
    except Exception as e:
        print(f"Receiver error: {e}")
        self.running = False
async def _handle_event(self, event: dict) -> None:
    """Handle incoming event.

    Dispatches on the "event" field of the server's JSON message: session
    lifecycle (answer/hangup), VAD markers (speaking/silence), transcripts,
    LLM text, TTFB reports, and track control (trackStart/trackEnd/interrupt)
    which drives the discard-until-next-track logic for stale audio.
    """
    event_type = event.get("event", "unknown")
    if event_type == "answer":
        print("← Session ready!")
    elif event_type == "speaking":
        print("← User speech detected")
    elif event_type == "silence":
        print("← User silence detected")
    elif event_type == "transcript":
        # Display user speech transcription
        text = event.get("text", "")
        is_final = event.get("isFinal", False)
        if is_final:
            # Clear the interim line and print final
            print(" " * 80, end="\r")  # Clear previous interim text
            print(f"→ You: {text}")
        else:
            # Interim result - show with indicator (overwrite same line)
            display_text = text[:60] + "..." if len(text) > 60 else text
            print(f" [listening] {display_text}".ljust(80), end="\r")
    elif event_type == "ttfb":
        # Server-side TTFB event
        latency_ms = event.get("latencyMs", 0)
        print(f"← [TTFB] Server reported latency: {latency_ms}ms")
    elif event_type == "llmResponse":
        # LLM text response
        text = event.get("text", "")
        is_final = event.get("isFinal", False)
        if is_final:
            # Print final LLM response
            print(f"← AI: {text}")
        elif self.verbose:
            # Show streaming chunks only in verbose mode
            display_text = text[:60] + "..." if len(text) > 60 else text
            print(f" [streaming] {display_text}")
    elif event_type == "trackStart":
        print("← Bot started speaking")
        # IMPORTANT: Accept audio again after trackStart
        self._discard_audio = False
        self._audio_sequence += 1
        # Reset TTFB tracking for voice responses (when no chat was sent)
        if self.request_start_time is None:
            self.request_start_time = time.time()
            self.first_audio_received = False
        # Clear any old audio in buffer
        with self.audio_output_lock:
            self.audio_output_buffer = b""
    elif event_type == "trackEnd":
        print("← Bot finished speaking")
        # Reset TTFB tracking after response completes
        self.request_start_time = None
        self.first_audio_received = False
    elif event_type == "interrupt":
        print("← Bot interrupted!")
        # IMPORTANT: Discard all audio until next trackStart
        self._discard_audio = True
        # Clear audio buffer immediately
        with self.audio_output_lock:
            buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
            self.audio_output_buffer = b""
        print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
    elif event_type == "error":
        print(f"← Error: {event.get('error')}")
    elif event_type == "hangup":
        print(f"← Hangup: {event.get('reason')}")
        self.running = False
    else:
        print(f"← Event: {event_type}")
async def interactive_mode(self) -> None:
    """Run interactive mode for text chat.

    Reads stdin in an executor (so the event loop keeps servicing audio)
    and either executes a /command or forwards the line as a chat message.
    """
    print("\n" + "=" * 50)
    print("Voice Conversation Client")
    print("=" * 50)
    print("Speak into your microphone to talk to the AI.")
    print("Or type messages to send text.")
    print("")
    print("Commands:")
    print(" /quit - End conversation")
    print(" /mute - Mute microphone")
    print(" /unmute - Unmute microphone")
    print(" /interrupt - Interrupt AI speech")
    print(" /stats - Show statistics")
    print("=" * 50 + "\n")
    while self.running:
        try:
            # input() would block the loop; run it in a worker thread.
            user_input = await asyncio.get_event_loop().run_in_executor(
                None, input, ""
            )
            if not user_input:
                continue
            # Handle commands
            if user_input.startswith("/"):
                cmd = user_input.lower().strip()
                if cmd == "/quit":
                    await self.send_hangup("User quit")
                    break
                elif cmd == "/mute":
                    self.is_recording = False
                    print("Microphone muted")
                elif cmd == "/unmute":
                    self.is_recording = True
                    print("Microphone unmuted")
                elif cmd == "/interrupt":
                    await self.send_interrupt()
                elif cmd == "/stats":
                    print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
                    print(f"Received: {self.bytes_received / 1024:.1f} KB")
                else:
                    print(f"Unknown command: {cmd}")
            else:
                # Send as chat message
                await self.send_chat(user_input)
        except EOFError:
            break
        except Exception as e:
            print(f"Input error: {e}")
async def run(self, chat_message: str = None, interactive: bool = True) -> None:
    """
    Run the client.

    Connects, starts the mic input stream plus the sender/receiver/playback
    tasks, then either sends one chat message, runs the interactive loop, or
    idles until stopped. Always cleans up tasks and closes on exit.

    Args:
        chat_message: Optional single chat message to send
        interactive: Whether to run in interactive mode
    """
    try:
        await self.connect()
        # Wait for answer
        await asyncio.sleep(0.5)
        # Start audio input stream
        print("Starting audio streams...")
        input_stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype=np.float32,
            blocksize=self.chunk_samples,
            device=self.input_device,
            callback=self._audio_input_callback
        )
        input_stream.start()
        print("Audio streams started")
        # Start background tasks
        sender_task = asyncio.create_task(self.audio_sender())
        receiver_task = asyncio.create_task(self.receiver())
        playback_task = asyncio.create_task(self._playback_task())
        if chat_message:
            # Send single message and wait (fixed 15s) for the reply to play.
            await self.send_chat(chat_message)
            await asyncio.sleep(15)
        elif interactive:
            # Run interactive mode
            await self.interactive_mode()
        else:
            # Just wait
            while self.running:
                await asyncio.sleep(0.1)
        # Cleanup: stop loops, cancel tasks, swallow the cancellations.
        self.running = False
        sender_task.cancel()
        receiver_task.cancel()
        playback_task.cancel()
        try:
            await sender_task
        except asyncio.CancelledError:
            pass
        try:
            await receiver_task
        except asyncio.CancelledError:
            pass
        try:
            await playback_task
        except asyncio.CancelledError:
            pass
        input_stream.stop()
    except ConnectionRefusedError:
        print(f"Error: Could not connect to {self.url}")
        print("Make sure the server is running.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        await self.close()
async def close(self) -> None:
    """Stop the client, close the socket, and print transfer totals."""
    self.running = False
    ws = self.ws
    if ws:
        await ws.close()
    print("\nSession ended")
    print(f" Total sent: {self.bytes_sent / 1024:.1f} KB")
    print(f" Total received: {self.bytes_received / 1024:.1f} KB")
def list_devices():
    """Print an indexed table of audio devices with their IO capabilities."""
    print("\nAvailable audio devices:")
    print("-" * 60)
    default_in = sd.default.device[0]
    default_out = sd.default.device[1]
    for idx, dev in enumerate(sd.query_devices()):
        caps = []
        if dev['max_input_channels'] > 0:
            caps.append("IN")
        if dev['max_output_channels'] > 0:
            caps.append("OUT")
        caps_str = "/".join(caps) if caps else "N/A"
        tags = ""
        if idx == default_in:
            tags += " [DEFAULT INPUT]"
        if idx == default_out:
            tags += " [DEFAULT OUTPUT]"
        print(f" {idx:2d}: {dev['name'][:40]:40s} ({caps_str}){tags}")
    print("-" * 60)
async def main():
    """Parse CLI options, optionally list devices, then run the client."""
    parser = argparse.ArgumentParser(
        description="Microphone client for duplex voice conversation"
    )
    parser.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL"
    )
    parser.add_argument(
        "--chat",
        help="Send a single chat message instead of using microphone"
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Audio sample rate (default: 16000)"
    )
    parser.add_argument(
        "--input-device",
        type=int,
        help="Input device ID"
    )
    parser.add_argument(
        "--output-device",
        type=int,
        help="Output device ID"
    )
    parser.add_argument(
        "--list-devices",
        action="store_true",
        help="List available audio devices and exit"
    )
    parser.add_argument(
        "--no-interactive",
        action="store_true",
        help="Disable interactive mode"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show streaming LLM response chunks"
    )
    args = parser.parse_args()
    if args.list_devices:
        list_devices()
        return
    client = MicrophoneClient(
        url=args.url,
        sample_rate=args.sample_rate,
        input_device=args.input_device,
        output_device=args.output_device
    )
    client.verbose = args.verbose
    await client.run(
        chat_message=args.chat,
        interactive=not args.no_interactive
    )
# Script entry point: run the async main() and exit quietly on Ctrl+C.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")

285
examples/simple_client.py Normal file
View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
Simple WebSocket client for testing voice conversation.
Uses PyAudio for more reliable audio playback on Windows.
Usage:
python examples/simple_client.py
python examples/simple_client.py --text "Hello"
"""
import argparse
import asyncio
import json
import sys
import time
import wave
import io
try:
import numpy as np
except ImportError:
print("pip install numpy")
sys.exit(1)
try:
import websockets
except ImportError:
print("pip install websockets")
sys.exit(1)
# Try PyAudio first (more reliable on Windows)
try:
import pyaudio
PYAUDIO_AVAILABLE = True
except ImportError:
PYAUDIO_AVAILABLE = False
print("PyAudio not available, trying sounddevice...")
try:
import sounddevice as sd
SD_AVAILABLE = True
except ImportError:
SD_AVAILABLE = False
if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
print("Please install pyaudio or sounddevice:")
print(" pip install pyaudio")
print(" or: pip install sounddevice")
sys.exit(1)
class SimpleVoiceClient:
    """Simple voice client with reliable audio playback."""
def __init__(self, url: str, sample_rate: int = 16000):
    """Set up client state; no I/O happens until connect().

    Args:
        url: WebSocket server URL.
        sample_rate: PCM sample rate in Hz (16-bit mono assumed).
    """
    self.url = url
    self.sample_rate = sample_rate
    self.ws = None
    self.running = False
    # Audio buffer
    self.audio_buffer = b""
    # PyAudio setup (output stream is opened lazily on first playback)
    if PYAUDIO_AVAILABLE:
        self.pa = pyaudio.PyAudio()
        self.stream = None
    # Stats
    self.bytes_received = 0
    # TTFB tracking (Time to First Byte)
    self.request_start_time = None
    self.first_audio_received = False
    # Interrupt handling - discard audio until next trackStart
    self._discard_audio = False
async def connect(self):
    """Open the WebSocket and send the initial invite command."""
    print(f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    print("Connected!")
    # Negotiate codec and sample rate.
    invite = {
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    }
    await self.ws.send(json.dumps(invite))
    print("-> invite")
async def send_chat(self, text: str):
    """Send a text chat message and arm TTFB measurement."""
    # New request: restart the time-to-first-audio clock.
    self.first_audio_received = False
    self.request_start_time = time.time()
    payload = json.dumps({"command": "chat", "text": text})
    await self.ws.send(payload)
    print(f"-> chat: {text}")
def play_audio(self, audio_data: bytes):
    """Play audio data immediately.

    Blocks until this chunk has been written/played. Prefers PyAudio when
    available; falls back to sounddevice otherwise.
    """
    if len(audio_data) == 0:
        return
    if PYAUDIO_AVAILABLE:
        # Use PyAudio - more reliable on Windows
        if self.stream is None:
            # Lazily open a mono 16-bit output stream on first use.
            self.stream = self.pa.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                output=True,
                frames_per_buffer=1024
            )
        self.stream.write(audio_data)
    elif SD_AVAILABLE:
        # Use sounddevice: convert s16 PCM to float32 in [-1, 1].
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
        sd.play(samples, self.sample_rate, blocking=True)
async def receive_loop(self):
    """Receive and play audio.

    Binary frames are PCM audio (played synchronously in an executor);
    text frames are JSON events. Honors the interrupt/trackStart protocol
    so stale audio is dropped after an interrupt.
    """
    print("\nWaiting for response...")
    while self.running:
        try:
            msg = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
            if isinstance(msg, bytes):
                # Audio data
                self.bytes_received += len(msg)
                # 2 bytes per int16 sample -> milliseconds of audio.
                duration_ms = len(msg) / (self.sample_rate * 2) * 1000
                # Check if we should discard this audio (after interrupt)
                if self._discard_audio:
                    print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms) [DISCARDED]")
                    continue
                # Calculate and display TTFB for first audio packet
                if not self.first_audio_received and self.request_start_time:
                    client_ttfb_ms = (time.time() - self.request_start_time) * 1000
                    self.first_audio_received = True
                    print(f"<- [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
                print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)")
                # Play immediately in executor to not block
                loop = asyncio.get_event_loop()
                await loop.run_in_executor(None, self.play_audio, msg)
            else:
                # JSON event
                event = json.loads(msg)
                etype = event.get("event", "?")
                if etype == "transcript":
                    # User speech transcription
                    text = event.get("text", "")
                    is_final = event.get("isFinal", False)
                    if is_final:
                        print(f"<- You said: {text}")
                    else:
                        print(f"<- [listening] {text}", end="\r")
                elif etype == "ttfb":
                    # Server-side TTFB event
                    latency_ms = event.get("latencyMs", 0)
                    print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
                elif etype == "trackStart":
                    # New track starting - accept audio again
                    self._discard_audio = False
                    print(f"<- {etype}")
                elif etype == "interrupt":
                    # Interrupt - discard audio until next trackStart
                    self._discard_audio = True
                    print(f"<- {etype} (discarding audio until new track)")
                elif etype == "hangup":
                    print(f"<- {etype}")
                    self.running = False
                    break
                else:
                    print(f"<- {etype}")
        except asyncio.TimeoutError:
            continue
        except websockets.ConnectionClosed:
            print("Connection closed")
            self.running = False
            break
async def run(self, text: str = None):
    """Run the client.

    With *text*: send one chat message and wait a fixed 30s for the spoken
    reply. Without: interactive loop reading messages from stdin.
    """
    try:
        await self.connect()
        await asyncio.sleep(0.5)
        # Start receiver
        recv_task = asyncio.create_task(self.receive_loop())
        if text:
            await self.send_chat(text)
            # Wait for response
            await asyncio.sleep(30)
        else:
            # Interactive mode
            print("\nType a message and press Enter (or 'quit' to exit):")
            while self.running:
                try:
                    # input() would block the loop; run in a worker thread.
                    user_input = await asyncio.get_event_loop().run_in_executor(
                        None, input, "> "
                    )
                    if user_input.lower() == 'quit':
                        break
                    if user_input.strip():
                        await self.send_chat(user_input)
                except EOFError:
                    break
        self.running = False
        recv_task.cancel()
        try:
            await recv_task
        except asyncio.CancelledError:
            pass
    finally:
        await self.close()
async def close(self):
    """Tear down audio resources and the WebSocket, then print stats."""
    self.running = False
    if PYAUDIO_AVAILABLE:
        stream = self.stream
        if stream:
            stream.stop_stream()
            stream.close()
        self.pa.terminate()
    if self.ws:
        await self.ws.close()
    print(f"\nTotal audio received: {self.bytes_received / 1024:.1f} KB")
def list_audio_devices():
    """List available audio output devices for whichever backends loaded."""
    print("\n=== Audio Devices ===")
    if PYAUDIO_AVAILABLE:
        pa = pyaudio.PyAudio()
        print("\nPyAudio devices:")
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            if info['maxOutputChannels'] > 0:
                default = " [DEFAULT]" if i == pa.get_default_output_device_info()['index'] else ""
                print(f" {i}: {info['name']}{default}")
        pa.terminate()
    if SD_AVAILABLE:
        print("\nSounddevice devices:")
        for i, d in enumerate(sd.query_devices()):
            if d['max_output_channels'] > 0:
                # sd.default.device[1] is the default output device index.
                default = " [DEFAULT]" if i == sd.default.device[1] else ""
                print(f" {i}: {d['name']}{default}")
async def main():
    """Parse CLI arguments and run the simple voice client."""
    parser = argparse.ArgumentParser(description="Simple voice client")
    parser.add_argument("--url", default="ws://localhost:8000/ws")
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
    args = parser.parse_args()
    if args.list_devices:
        list_audio_devices()
        return
    await SimpleVoiceClient(args.url, args.sample_rate).run(args.text)
# Script entry point.
if __name__ == "__main__":
    asyncio.run(main())

176
examples/test_websocket.py Normal file
View File

@@ -0,0 +1,176 @@
"""WebSocket endpoint test client.
Tests the /ws endpoint with sine wave or file audio streaming.
Based on reference/py-active-call/exec/test_ws_endpoint/test_ws.py
"""
import asyncio
import aiohttp
import json
import struct
import math
import argparse
import os
from datetime import datetime
# Configuration (defaults; URL and input are overridable via CLI flags)
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
FREQUENCY = 440  # 440Hz Sine Wave
CHUNK_DURATION_MS = 20
# 16kHz * 16-bit (2 bytes) * 20ms = 640 bytes per chunk
CHUNK_SIZE_BYTES = int(SAMPLE_RATE * 2 * (CHUNK_DURATION_MS / 1000.0))
def generate_sine_wave(duration_ms=1000, sample_rate=None, frequency=None):
    """Generate mono 16-bit little-endian PCM containing a pure sine tone.

    Generalized: rate and frequency are now parameters; the defaults fall
    back to the module-level SAMPLE_RATE / FREQUENCY, so existing callers
    are unaffected.

    Args:
        duration_ms: Length of audio to generate, in milliseconds.
        sample_rate: Samples per second (None -> module SAMPLE_RATE).
        frequency: Tone frequency in Hz (None -> module FREQUENCY).

    Returns:
        bytearray of '<h'-packed samples (2 bytes per sample).
    """
    rate = SAMPLE_RATE if sample_rate is None else sample_rate
    freq = FREQUENCY if frequency is None else frequency
    num_samples = int(rate * (duration_ms / 1000.0))
    audio_data = bytearray()
    for x in range(num_samples):
        # Full-scale (32767) sine sample at time x / rate seconds.
        value = int(32767.0 * math.sin(2 * math.pi * freq * x / rate))
        # Pack as little-endian 16-bit integer
        audio_data.extend(struct.pack('<h', value))
    return audio_data
async def receive_loop(ws, ready_event: asyncio.Event):
    """Listen for incoming messages from the server.

    Sets *ready_event* when a "session.started" event arrives so the
    sender side knows the handshake completed.
    """
    print("👂 Listening for server responses...")
    async for msg in ws:
        timestamp = datetime.now().strftime("%H:%M:%S")
        if msg.type == aiohttp.WSMsgType.TEXT:
            try:
                data = json.loads(msg.data)
                event_type = data.get('type', 'Unknown')
                print(f"[{timestamp}] 📨 Event: {event_type} | {msg.data[:150]}...")
                if event_type == "session.started":
                    ready_event.set()
            except json.JSONDecodeError:
                print(f"[{timestamp}] 📨 Text: {msg.data[:100]}...")
        elif msg.type == aiohttp.WSMsgType.BINARY:
            # Received audio chunk back (e.g., TTS or echo)
            print(f"[{timestamp}] 🔊 Audio: {len(msg.data)} bytes", end="\r")
        elif msg.type == aiohttp.WSMsgType.CLOSED:
            print(f"\n[{timestamp}] ❌ Socket Closed")
            break
        elif msg.type == aiohttp.WSMsgType.ERROR:
            print(f"\n[{timestamp}] ⚠️ Socket Error")
            break
async def send_file_loop(ws, file_path):
    """Stream a raw PCM/WAV file to the server.

    NOTE(review): assumes a canonical 44-byte WAV header; files with extra
    chunks (LIST/fact) would need real header parsing - confirm inputs.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: File '{file_path}' not found.")
        return
    print(f"📂 Streaming file: {file_path} ...")
    with open(file_path, "rb") as f:
        # Skip WAV header if present (first 44 bytes)
        if file_path.endswith('.wav'):
            f.read(44)
        while True:
            chunk = f.read(CHUNK_SIZE_BYTES)
            if not chunk:
                break
            # Send binary frame
            await ws.send_bytes(chunk)
            # Sleep to simulate real-time playback
            await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
    print(f"\n✅ Finished streaming {file_path}")
async def send_sine_loop(ws):
    """Stream generated sine wave to the server."""
    print("🎙️ Starting Audio Stream (Sine Wave)...")
    # Generate 5 seconds of audio up front, then stream it in real time.
    audio_buffer = generate_sine_wave(5000)
    cursor = 0
    while cursor < len(audio_buffer):
        chunk = audio_buffer[cursor:cursor + CHUNK_SIZE_BYTES]
        if not chunk:
            break
        await ws.send_bytes(chunk)
        cursor += len(chunk)
        # Pace at one chunk per CHUNK_DURATION_MS to mimic live capture.
        await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
    print("\n✅ Finished streaming test audio.")
async def run_client(url, file_path=None, use_sine=False):
    """Run the WebSocket test client.

    Connects, performs the v1 hello/session.start handshake, waits for
    session.started (signalled by receive_loop), streams audio (sine wave
    by default, or *file_path*), then sends session.stop and cleans up.
    """
    session = aiohttp.ClientSession()
    try:
        print(f"🔌 Connecting to {url}...")
        async with session.ws_connect(url) as ws:
            print("✅ Connected!")
            session_ready = asyncio.Event()
            recv_task = asyncio.create_task(receive_loop(ws, session_ready))
            # Send v1 hello + session.start handshake
            await ws.send_json({"type": "hello", "version": "v1"})
            await ws.send_json({
                "type": "session.start",
                "audio": {
                    "encoding": "pcm_s16le",
                    "sample_rate_hz": SAMPLE_RATE,
                    "channels": 1
                }
            })
            print("📤 Sent v1 hello/session.start")
            # Give the server up to 8s to acknowledge the session.
            await asyncio.wait_for(session_ready.wait(), timeout=8)
            # Select sender based on args
            if use_sine:
                await send_sine_loop(ws)
            elif file_path:
                await send_file_loop(ws, file_path)
            else:
                # Default to sine wave
                await send_sine_loop(ws)
            await ws.send_json({"type": "session.stop", "reason": "test_complete"})
            # Linger briefly so trailing server messages are received.
            await asyncio.sleep(1)
            recv_task.cancel()
            try:
                await recv_task
            except asyncio.CancelledError:
                pass
    except aiohttp.ClientConnectorError:
        print(f"❌ Connection Failed. Is the server running at {url}?")
    except asyncio.TimeoutError:
        print("❌ Timeout waiting for session.started")
    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        await session.close()
# Script entry point: parse args and run the client until done or Ctrl+C.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WebSocket Audio Test Client")
    parser.add_argument("--url", default=SERVER_URL, help="WebSocket endpoint URL")
    parser.add_argument("--file", help="Path to PCM/WAV file to stream")
    parser.add_argument("--sine", action="store_true", help="Use sine wave generation (default)")
    args = parser.parse_args()
    try:
        asyncio.run(run_client(args.url, args.file, args.sine))
    except KeyboardInterrupt:
        print("\n👋 Client stopped.")

504
examples/wav_client.py Normal file
View File

@@ -0,0 +1,504 @@
#!/usr/bin/env python3
"""
WAV file client for testing duplex voice conversation.
This client reads audio from a WAV file, sends it to the server,
and saves the AI's voice response to an output WAV file.
Usage:
python examples/wav_client.py --input input.wav --output response.wav
python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
Requirements:
pip install soundfile websockets numpy
"""
import argparse
import asyncio
import json
import sys
import time
import wave
from pathlib import Path
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
try:
import soundfile as sf
except ImportError:
print("Please install soundfile: pip install soundfile")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """
def __init__(
    self,
    url: str,
    input_file: str,
    output_file: str,
    sample_rate: int = 16000,
    chunk_duration_ms: int = 20,
    wait_time: float = 15.0,
    verbose: bool = False
):
    """
    Initialize WAV file client.

    Args:
        url: WebSocket server URL
        input_file: Input WAV file path
        output_file: Output WAV file path
        sample_rate: Audio sample rate (Hz)
        chunk_duration_ms: Audio chunk duration (ms) for sending
        wait_time: Time to wait for response after sending (seconds)
        verbose: Enable verbose output
    """
    self.url = url
    self.input_file = Path(input_file)
    self.output_file = Path(output_file)
    self.sample_rate = sample_rate
    self.chunk_duration_ms = chunk_duration_ms
    # Samples per outgoing chunk at the target rate.
    self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
    self.wait_time = wait_time
    self.verbose = verbose
    # WebSocket connection
    self.ws = None
    self.running = False
    # Audio buffers
    self.received_audio = bytearray()
    # Statistics
    self.bytes_sent = 0
    self.bytes_received = 0
    # TTFB tracking (per response)
    self.send_start_time = None
    self.response_start_time = None  # set on each trackStart
    self.waiting_for_first_audio = False
    self.ttfb_ms = None  # last TTFB for summary
    self.ttfb_list = []  # TTFB for each response
    # State tracking
    self.track_started = False
    self.track_ended = False
    self.send_completed = False
    # Events log
    self.events_log = []
def log_event(self, direction: str, message: str):
    """Record an event with a wall-clock timestamp and echo it to stdout."""
    self.events_log.append({
        "timestamp": time.time(),
        "direction": direction,
        "message": message,
    })
    # Handle encoding errors on Windows
    try:
        print(f"{direction} {message}")
    except UnicodeEncodeError:
        # Console codepage cannot render the text: degrade to ASCII with
        # replacement characters.
        fallback = message.encode('ascii', errors='replace').decode('ascii')
        print(f"{direction} {fallback}")
async def connect(self) -> None:
    """Connect to the WebSocket server and send the invite command."""
    self.log_event("", f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    self.log_event("", "Connected!")
    # Negotiate codec and sample rate.
    await self.send_command({
        "command": "invite",
        "option": {"codec": "pcm", "sampleRate": self.sample_rate},
    })
async def send_command(self, cmd: dict) -> None:
    """JSON-encode *cmd*, send it if connected, and log the command name."""
    if not self.ws:
        return
    await self.ws.send(json.dumps(cmd))
    self.log_event("", f"Command: {cmd.get('command', 'unknown')}")
async def send_hangup(self, reason: str = "Session complete") -> None:
    """Ask the server to terminate the session with the given reason."""
    await self.send_command({"command": "hangup", "reason": reason})
def load_wav_file(self) -> tuple[np.ndarray, int]:
    """
    Load and prepare WAV file for sending.

    Converts stereo to mono, resamples to self.sample_rate (naive linear
    interpolation - no anti-alias filter; presumably fine for speech test
    clips), and normalizes/casts to int16.

    Returns:
        Tuple of (audio_data as int16 numpy array, original sample rate)

    Raises:
        FileNotFoundError: if the input file does not exist.
    """
    if not self.input_file.exists():
        raise FileNotFoundError(f"Input file not found: {self.input_file}")
    # Load audio file
    audio_data, file_sample_rate = sf.read(self.input_file)
    self.log_event("", f"Loaded: {self.input_file}")
    self.log_event("", f" Original sample rate: {file_sample_rate} Hz")
    self.log_event("", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")
    # Convert stereo to mono if needed
    if len(audio_data.shape) > 1:
        audio_data = audio_data.mean(axis=1)
        self.log_event("", " Converted stereo to mono")
    # Resample if needed
    if file_sample_rate != self.sample_rate:
        # Simple resampling using numpy (linear interpolation)
        duration = len(audio_data) / file_sample_rate
        num_samples = int(duration * self.sample_rate)
        indices = np.linspace(0, len(audio_data) - 1, num_samples)
        audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
        self.log_event("", f" Resampled to {self.sample_rate} Hz")
    # Convert to int16
    if audio_data.dtype != np.int16:
        # Normalize to [-1, 1] if needed
        max_val = np.max(np.abs(audio_data))
        if max_val > 1.0:
            audio_data = audio_data / max_val
        audio_data = (audio_data * 32767).astype(np.int16)
    self.log_event("", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
    return audio_data, file_sample_rate
async def audio_sender(self, audio_data: np.ndarray) -> None:
"""Send audio data to server in chunks."""
total_samples = len(audio_data)
chunk_size = self.chunk_samples
sent_samples = 0
self.send_start_time = time.time()
self.log_event("", f"Starting audio transmission ({total_samples} samples)...")
while sent_samples < total_samples and self.running:
# Get next chunk
end_sample = min(sent_samples + chunk_size, total_samples)
chunk = audio_data[sent_samples:end_sample]
chunk_bytes = chunk.tobytes()
# Send to server
if self.ws:
await self.ws.send(chunk_bytes)
self.bytes_sent += len(chunk_bytes)
sent_samples = end_sample
# Progress logging (every 500ms worth of audio)
if self.verbose and sent_samples % (self.sample_rate // 2) == 0:
progress = (sent_samples / total_samples) * 100
print(f" Sending: {progress:.0f}%", end="\r")
# Delay to simulate real-time streaming
# Server expects audio at real-time pace for VAD/ASR to work properly
await asyncio.sleep(self.chunk_duration_ms / 1000)
self.send_completed = True
elapsed = time.time() - self.send_start_time
self.log_event("", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")
    async def receiver(self) -> None:
        """Receive messages from server.

        Binary frames are 16-bit PCM audio and are appended to
        ``self.received_audio``; text frames are JSON events dispatched to
        ``_handle_event``. Runs until ``self.running`` goes false, the
        connection closes, or the task is cancelled.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop can notice self.running changes.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        self.received_audio.extend(message)
                        # Calculate TTFB on first audio of each response
                        if self.waiting_for_first_audio and self.response_start_time is not None:
                            ttfb_ms = (time.time() - self.response_start_time) * 1000
                            self.ttfb_ms = ttfb_ms
                            self.ttfb_list.append(ttfb_ms)
                            self.waiting_for_first_audio = False
                            self.log_event("", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")
                        # Log progress
                        # 2 bytes per sample: 16-bit mono PCM.
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000
                        if self.verbose:
                            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
                    else:
                        # JSON event
                        event = json.loads(message)
                        await self._handle_event(event)
                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("", "Connection closed")
                    self.running = False
                    break
        except asyncio.CancelledError:
            # Normal shutdown path: run() cancels this task during cleanup.
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False
async def _handle_event(self, event: dict) -> None:
"""Handle incoming event."""
event_type = event.get("event", "unknown")
if event_type == "answer":
self.log_event("", "Session ready!")
elif event_type == "speaking":
self.log_event("", "Speech detected")
elif event_type == "silence":
self.log_event("", "Silence detected")
elif event_type == "transcript":
# ASR transcript (interim = asrDelta-style, final = asrFinal-style)
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
# Clear interim line and print final
print(" " * 80, end="\r")
self.log_event("", f"→ You: {text}")
else:
# Interim result - show with indicator (overwrite same line, as in mic_client)
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [listening] {display_text}".ljust(80), end="\r")
elif event_type == "ttfb":
latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "llmResponse":
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif self.verbose:
# Show streaming chunks only in verbose mode
self.log_event("", f"LLM: {text}")
elif event_type == "trackStart":
self.track_started = True
self.response_start_time = time.time()
self.waiting_for_first_audio = True
self.log_event("", "Bot started speaking")
elif event_type == "trackEnd":
self.track_ended = True
self.log_event("", "Bot finished speaking")
elif event_type == "interrupt":
self.log_event("", "Bot interrupted!")
elif event_type == "error":
self.log_event("!", f"Error: {event.get('error')}")
elif event_type == "hangup":
self.log_event("", f"Hangup: {event.get('reason')}")
self.running = False
else:
self.log_event("", f"Event: {event_type}")
def save_output_wav(self) -> None:
"""Save received audio to output WAV file."""
if not self.received_audio:
self.log_event("!", "No audio received to save")
return
# Convert bytes to numpy array
audio_data = np.frombuffer(bytes(self.received_audio), dtype=np.int16)
# Ensure output directory exists
self.output_file.parent.mkdir(parents=True, exist_ok=True)
# Save using wave module for compatibility
with wave.open(str(self.output_file), 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # 16-bit
wav_file.setframerate(self.sample_rate)
wav_file.writeframes(audio_data.tobytes())
duration = len(audio_data) / self.sample_rate
self.log_event("", f"Saved output: {self.output_file}")
self.log_event("", f" Duration: {duration:.2f}s ({len(audio_data)} samples)")
self.log_event("", f" Size: {len(self.received_audio)/1024:.1f} KB")
    async def run(self) -> None:
        """Run the WAV file test.

        Full session lifecycle: load the input WAV, connect, stream the
        audio, wait for the bot's spoken response, save it to the output
        file, and print a summary. Exits the process on fatal errors.
        """
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()
            # Connect to server
            await self.connect()
            # Wait for answer
            await asyncio.sleep(0.5)
            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())
            # Send audio
            await self.audio_sender(audio_data)
            # Wait for response
            self.log_event("", f"Waiting {self.wait_time}s for response...")
            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)
            # Cleanup
            self.running = False
            receiver_task.cancel()
            try:
                await receiver_task
            except asyncio.CancelledError:
                pass
            # Save output
            self.save_output_wav()
            # Print summary
            self._print_summary()
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            # Always close the WebSocket, even on error/exit paths.
            await self.close()
def _print_summary(self):
"""Print session summary."""
print("\n" + "=" * 50)
print("Session Summary")
print("=" * 50)
print(f" Input file: {self.input_file}")
print(f" Output file: {self.output_file}")
print(f" Bytes sent: {self.bytes_sent / 1024:.1f} KB")
print(f" Bytes received: {self.bytes_received / 1024:.1f} KB")
if self.ttfb_list:
if len(self.ttfb_list) == 1:
print(f" TTFB: {self.ttfb_list[0]:.0f} ms")
else:
print(f" TTFB (per response): {', '.join(f'{t:.0f}ms' for t in self.ttfb_list)}")
if self.received_audio:
duration = len(self.received_audio) / (self.sample_rate * 2)
print(f" Response duration: {duration:.2f}s")
print("=" * 50)
async def close(self) -> None:
"""Close the connection."""
self.running = False
if self.ws:
try:
await self.ws.close()
except:
pass
async def main():
    """Parse CLI arguments, build a WavFileClient, and run one session."""
    parser = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    parser.add_argument("--input", "-i", required=True, help="Input WAV file path")
    parser.add_argument("--output", "-o", required=True, help="Output WAV file path for response")
    parser.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL (default: ws://localhost:8000/ws)",
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Target sample rate for audio (default: 16000)",
    )
    parser.add_argument(
        "--chunk-duration",
        type=int,
        default=20,
        help="Chunk duration in ms for sending (default: 20)",
    )
    parser.add_argument(
        "--wait-time", "-w",
        type=float,
        default=15.0,
        help="Time to wait for response after sending (default: 15.0)",
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    args = parser.parse_args()

    client = WavFileClient(
        url=args.url,
        input_file=args.input,
        output_file=args.output,
        sample_rate=args.sample_rate,
        chunk_duration_ms=args.chunk_duration,
        wait_time=args.wait_time,
        verbose=args.verbose,
    )
    await client.run()
# Script entry point: run the async client; Ctrl-C exits with a short notice.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")

766
examples/web_client.html Normal file
View File

@@ -0,0 +1,766 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Duplex Voice Web Client</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");
:root {
--bg: #0b0b0f;
--panel: #14141c;
--panel-2: #101018;
--ink: #f2f3f7;
--muted: #a7acba;
--accent: #ff6b6b;
--accent-2: #ffd166;
--good: #2dd4bf;
--bad: #f87171;
--grid: rgba(255, 255, 255, 0.06);
--shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
}
* {
box-sizing: border-box;
}
html,
body {
height: 100%;
margin: 0;
color: var(--ink);
background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
var(--bg);
font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
}
.noise {
position: fixed;
inset: 0;
background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
pointer-events: none;
mix-blend-mode: soft-light;
}
header {
padding: 32px 28px 18px;
border-bottom: 1px solid var(--grid);
}
h1 {
font-family: "Fraunces", serif;
font-weight: 600;
margin: 0 0 6px;
letter-spacing: 0.4px;
}
.subtitle {
color: var(--muted);
font-size: 0.95rem;
}
main {
display: grid;
grid-template-columns: 1.1fr 1.4fr;
gap: 24px;
padding: 24px 28px 40px;
}
.panel {
background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
var(--panel);
border: 1px solid var(--grid);
border-radius: 16px;
padding: 20px;
box-shadow: var(--shadow);
}
.panel h2 {
margin: 0 0 12px;
font-size: 1.05rem;
font-weight: 600;
}
.stack {
display: grid;
gap: 12px;
}
label {
display: block;
font-size: 0.85rem;
color: var(--muted);
margin-bottom: 6px;
}
input,
select,
button,
textarea {
font-family: inherit;
}
input,
select,
textarea {
width: 100%;
padding: 10px 12px;
border-radius: 10px;
border: 1px solid var(--grid);
background: var(--panel-2);
color: var(--ink);
outline: none;
}
textarea {
min-height: 80px;
resize: vertical;
}
.row {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.btn-row {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
button {
border: none;
border-radius: 999px;
padding: 10px 16px;
font-weight: 600;
background: var(--ink);
color: #111;
cursor: pointer;
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
button.secondary {
background: transparent;
color: var(--ink);
border: 1px solid var(--grid);
}
button.accent {
background: linear-gradient(120deg, var(--accent), #f97316);
color: #0b0b0f;
}
button.good {
background: linear-gradient(120deg, var(--good), #22c55e);
color: #07261f;
}
button.bad {
background: linear-gradient(120deg, var(--bad), #f97316);
color: #2a0b0b;
}
button:active {
transform: translateY(1px) scale(0.99);
}
.status {
display: flex;
align-items: center;
gap: 12px;
padding: 12px;
background: rgba(255, 255, 255, 0.03);
border-radius: 12px;
border: 1px dashed var(--grid);
font-size: 0.9rem;
}
.dot {
width: 10px;
height: 10px;
border-radius: 999px;
background: var(--bad);
box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
}
.dot.on {
background: var(--good);
box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
}
.log {
height: 320px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.85rem;
line-height: 1.4;
}
.chat {
height: 260px;
overflow: auto;
padding: 12px;
background: #0d0d14;
border-radius: 12px;
border: 1px solid var(--grid);
font-size: 0.9rem;
line-height: 1.45;
}
.chat-entry {
padding: 8px 10px;
margin-bottom: 8px;
border-radius: 10px;
background: rgba(255, 255, 255, 0.04);
border: 1px solid rgba(255, 255, 255, 0.06);
}
.chat-entry.user {
border-left: 3px solid var(--accent-2);
}
.chat-entry.ai {
border-left: 3px solid var(--good);
}
.chat-entry.interim {
opacity: 0.7;
font-style: italic;
}
.log-entry {
padding: 6px 8px;
border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
}
.log-entry:last-child {
border-bottom: none;
}
.tag {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 2px 8px;
border-radius: 999px;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.6px;
background: rgba(255, 255, 255, 0.08);
color: var(--muted);
}
.tag.event {
background: rgba(255, 107, 107, 0.18);
color: #ffc1c1;
}
.tag.audio {
background: rgba(45, 212, 191, 0.2);
color: #c5f9f0;
}
.tag.sys {
background: rgba(255, 209, 102, 0.2);
color: #ffefb0;
}
.muted {
color: var(--muted);
}
footer {
padding: 0 28px 28px;
color: var(--muted);
font-size: 0.8rem;
}
@media (max-width: 1100px) {
main {
grid-template-columns: 1fr;
}
.log {
height: 360px;
}
.chat {
height: 260px;
}
}
</style>
</head>
<body>
<div class="noise"></div>
<header>
<h1>Duplex Voice Client</h1>
<div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
</header>
<main>
<section class="panel stack">
<h2>Connection</h2>
<div>
<label for="wsUrl">WebSocket URL</label>
<input id="wsUrl" value="ws://localhost:8000/ws" />
</div>
<div class="btn-row">
<button class="accent" id="connectBtn">Connect</button>
<button class="secondary" id="disconnectBtn">Disconnect</button>
</div>
<div class="status">
<div id="statusDot" class="dot"></div>
<div>
<div id="statusText">Disconnected</div>
<div class="muted" id="statusSub">Waiting for connection</div>
</div>
</div>
<h2>Devices</h2>
<div class="row">
<div>
<label for="inputSelect">Input (Mic)</label>
<select id="inputSelect"></select>
</div>
<div>
<label for="outputSelect">Output (Speaker)</label>
<select id="outputSelect"></select>
</div>
</div>
<div class="btn-row">
<button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
<button class="good" id="startMicBtn">Start Mic</button>
<button class="secondary" id="stopMicBtn">Stop Mic</button>
</div>
<h2>Chat</h2>
<div class="stack">
<textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
<div class="btn-row">
<button class="accent" id="sendChatBtn">Send Chat</button>
<button class="secondary" id="clearLogBtn">Clear Log</button>
</div>
</div>
</section>
<section class="stack">
<div class="panel stack">
<h2>Chat History</h2>
<div class="chat" id="chatHistory"></div>
</div>
<div class="panel stack">
<h2>Event Log</h2>
<div class="log" id="log"></div>
</div>
</section>
</main>
<footer>
Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
</footer>
<audio id="audioOut" autoplay></audio>
<script>
const wsUrl = document.getElementById("wsUrl");
const connectBtn = document.getElementById("connectBtn");
const disconnectBtn = document.getElementById("disconnectBtn");
const inputSelect = document.getElementById("inputSelect");
const outputSelect = document.getElementById("outputSelect");
const startMicBtn = document.getElementById("startMicBtn");
const stopMicBtn = document.getElementById("stopMicBtn");
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
const sendChatBtn = document.getElementById("sendChatBtn");
const clearLogBtn = document.getElementById("clearLogBtn");
const chatInput = document.getElementById("chatInput");
const logEl = document.getElementById("log");
const chatHistory = document.getElementById("chatHistory");
const statusDot = document.getElementById("statusDot");
const statusText = document.getElementById("statusText");
const statusSub = document.getElementById("statusSub");
const audioOut = document.getElementById("audioOut");
let ws = null;
let audioCtx = null;
let micStream = null;
let processor = null;
let micSource = null;
let playbackDest = null;
let playbackTime = 0;
let discardAudio = false;
let playbackSources = [];
let interimUserEl = null;
let interimAiEl = null;
let interimUserText = "";
let interimAiText = "";
const targetSampleRate = 16000;
const playbackStopRampSec = 0.008;
function logLine(type, text, data) {
const time = new Date().toLocaleTimeString();
const entry = document.createElement("div");
entry.className = "log-entry";
const tag = document.createElement("span");
tag.className = `tag ${type}`;
tag.textContent = type.toUpperCase();
const msg = document.createElement("span");
msg.style.marginLeft = "10px";
msg.textContent = `[${time}] ${text}`;
entry.appendChild(tag);
entry.appendChild(msg);
if (data) {
const pre = document.createElement("div");
pre.className = "muted";
pre.textContent = JSON.stringify(data);
pre.style.marginTop = "4px";
entry.appendChild(pre);
}
logEl.appendChild(entry);
logEl.scrollTop = logEl.scrollHeight;
}
function addChat(role, text) {
const entry = document.createElement("div");
entry.className = `chat-entry ${role === "AI" ? "ai" : "user"}`;
entry.textContent = `${role}: ${text}`;
chatHistory.appendChild(entry);
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function setInterim(role, text) {
const isAi = role === "AI";
let el = isAi ? interimAiEl : interimUserEl;
if (!text) {
if (el) el.remove();
if (isAi) interimAiEl = null;
else interimUserEl = null;
if (isAi) interimAiText = "";
else interimUserText = "";
return;
}
if (!el) {
el = document.createElement("div");
el.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
chatHistory.appendChild(el);
if (isAi) interimAiEl = el;
else interimUserEl = el;
}
el.textContent = `${role} (interim): ${text}`;
chatHistory.scrollTop = chatHistory.scrollHeight;
}
function stopPlayback() {
discardAudio = true;
const now = audioCtx ? audioCtx.currentTime : 0;
playbackTime = now;
playbackSources.forEach((node) => {
try {
if (audioCtx && node.gainNode && node.source) {
node.gainNode.gain.cancelScheduledValues(now);
node.gainNode.gain.setValueAtTime(node.gainNode.gain.value || 1, now);
node.gainNode.gain.linearRampToValueAtTime(0, now + playbackStopRampSec);
node.source.stop(now + playbackStopRampSec + 0.002);
} else if (node.source) {
node.source.stop();
}
} catch (err) {}
});
playbackSources = [];
}
function setStatus(connected, detail) {
statusDot.classList.toggle("on", connected);
statusText.textContent = connected ? "Connected" : "Disconnected";
statusSub.textContent = detail || "";
}
async function ensureAudioContext() {
if (audioCtx) return;
audioCtx = new (window.AudioContext || window.webkitAudioContext)();
playbackDest = audioCtx.createMediaStreamDestination();
audioOut.srcObject = playbackDest.stream;
try {
await audioOut.play();
} catch (err) {
logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
}
if (outputSelect.value) {
await setOutputDevice(outputSelect.value);
}
}
function downsampleBuffer(buffer, inRate, outRate) {
if (outRate === inRate) return buffer;
const ratio = inRate / outRate;
const newLength = Math.round(buffer.length / ratio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = accum / count;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
function floatTo16BitPCM(float32) {
const out = new Int16Array(float32.length);
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
out[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
}
return out;
}
function schedulePlayback(int16Data) {
if (!audioCtx || !playbackDest) return;
if (discardAudio) return;
const float32 = new Float32Array(int16Data.length);
for (let i = 0; i < int16Data.length; i++) {
float32[i] = int16Data[i] / 32768;
}
const buffer = audioCtx.createBuffer(1, float32.length, targetSampleRate);
buffer.copyToChannel(float32, 0);
const source = audioCtx.createBufferSource();
const gainNode = audioCtx.createGain();
source.buffer = buffer;
source.connect(gainNode);
gainNode.connect(playbackDest);
const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
gainNode.gain.setValueAtTime(1, startTime);
source.start(startTime);
playbackTime = startTime + buffer.duration;
const playbackNode = { source, gainNode };
playbackSources.push(playbackNode);
source.onended = () => {
playbackSources = playbackSources.filter((s) => s !== playbackNode);
};
}
async function connect() {
if (ws && ws.readyState === WebSocket.OPEN) return;
ws = new WebSocket(wsUrl.value.trim());
ws.binaryType = "arraybuffer";
ws.onopen = () => {
setStatus(true, "Session open");
logLine("sys", "WebSocket connected");
ensureAudioContext();
sendCommand({ type: "hello", version: "v1" });
};
ws.onclose = () => {
setStatus(false, "Connection closed");
logLine("sys", "WebSocket closed");
ws = null;
};
ws.onerror = (err) => {
logLine("sys", "WebSocket error", { err: String(err) });
};
ws.onmessage = (msg) => {
if (typeof msg.data === "string") {
const event = JSON.parse(msg.data);
handleEvent(event);
} else {
const audioBuf = msg.data;
const int16 = new Int16Array(audioBuf);
schedulePlayback(int16);
logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
}
};
}
function disconnect() {
if (ws && ws.readyState === WebSocket.OPEN) {
sendCommand({ type: "session.stop", reason: "client_disconnect" });
ws.close();
}
ws = null;
setStatus(false, "Disconnected");
}
function sendCommand(cmd) {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Not connected");
return;
}
ws.send(JSON.stringify(cmd));
logLine("sys", `${cmd.type}`, cmd);
}
function handleEvent(event) {
const type = event.type || "unknown";
logLine("event", type, event);
if (type === "hello.ack") {
sendCommand({
type: "session.start",
audio: { encoding: "pcm_s16le", sample_rate_hz: targetSampleRate, channels: 1 },
});
}
if (type === "transcript.final") {
if (event.text) {
setInterim("You", "");
addChat("You", event.text);
}
}
if (type === "transcript.delta" && event.text) {
setInterim("You", event.text);
}
if (type === "assistant.response.final") {
if (event.text) {
setInterim("AI", "");
addChat("AI", event.text);
}
}
if (type === "assistant.response.delta" && event.text) {
interimAiText += event.text;
setInterim("AI", interimAiText);
}
if (type === "output.audio.start") {
// New bot audio: stop any previous playback to avoid overlap
stopPlayback();
discardAudio = false;
interimAiText = "";
}
if (type === "input.speech_started") {
// User started speaking: clear any in-flight audio to avoid overlap
stopPlayback();
}
if (type === "response.interrupted") {
stopPlayback();
}
}
async function startMic() {
if (!ws || ws.readyState !== WebSocket.OPEN) {
logLine("sys", "Connect before starting mic");
return;
}
await ensureAudioContext();
const deviceId = inputSelect.value || undefined;
micStream = await navigator.mediaDevices.getUserMedia({
audio: deviceId ? { deviceId: { exact: deviceId } } : true,
});
micSource = audioCtx.createMediaStreamSource(micStream);
processor = audioCtx.createScriptProcessor(2048, 1, 1);
processor.onaudioprocess = (e) => {
if (!ws || ws.readyState !== WebSocket.OPEN) return;
const input = e.inputBuffer.getChannelData(0);
const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
const pcm16 = floatTo16BitPCM(downsampled);
ws.send(pcm16.buffer);
};
micSource.connect(processor);
processor.connect(audioCtx.destination);
logLine("sys", "Microphone started");
}
function stopMic() {
if (processor) {
processor.disconnect();
processor = null;
}
if (micSource) {
micSource.disconnect();
micSource = null;
}
if (micStream) {
micStream.getTracks().forEach((t) => t.stop());
micStream = null;
}
logLine("sys", "Microphone stopped");
}
async function refreshDevices() {
const devices = await navigator.mediaDevices.enumerateDevices();
inputSelect.innerHTML = "";
outputSelect.innerHTML = "";
devices.forEach((d) => {
if (d.kind === "audioinput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Mic ${inputSelect.length + 1}`;
inputSelect.appendChild(opt);
}
if (d.kind === "audiooutput") {
const opt = document.createElement("option");
opt.value = d.deviceId;
opt.textContent = d.label || `Output ${outputSelect.length + 1}`;
outputSelect.appendChild(opt);
}
});
}
async function requestDeviceAccess() {
// Needed to reveal device labels in most browsers
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
stream.getTracks().forEach((t) => t.stop());
logLine("sys", "Microphone permission granted");
} catch (err) {
logLine("sys", "Microphone permission denied", { err: String(err) });
}
}
async function setOutputDevice(deviceId) {
if (!audioOut.setSinkId) {
logLine("sys", "setSinkId not supported in this browser");
return;
}
await audioOut.setSinkId(deviceId);
logLine("sys", `Output device set`, { deviceId });
}
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);
refreshDevicesBtn.addEventListener("click", async () => {
await requestDeviceAccess();
await refreshDevices();
});
startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);
sendChatBtn.addEventListener("click", () => {
const text = chatInput.value.trim();
if (!text) return;
ensureAudioContext();
addChat("You", text);
sendCommand({ type: "input.text", text });
chatInput.value = "";
});
clearLogBtn.addEventListener("click", () => {
logEl.innerHTML = "";
chatHistory.innerHTML = "";
setInterim("You", "");
setInterim("AI", "");
interimUserText = "";
interimAiText = "";
});
inputSelect.addEventListener("change", () => {
if (micStream) {
stopMic();
startMic();
}
});
outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
refreshDevices().catch(() => {});
</script>
</body>
</html>