Init commit
This commit is contained in:
601
examples/mic_client.py
Normal file
601
examples/mic_client.py
Normal file
@@ -0,0 +1,601 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Microphone client for testing duplex voice conversation.
|
||||
|
||||
This client captures audio from the microphone, sends it to the server,
|
||||
and plays back the AI's voice response through the speakers.
|
||||
It also displays the LLM's text responses in the console.
|
||||
|
||||
Usage:
|
||||
python examples/mic_client.py --url ws://localhost:8000/ws
|
||||
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
|
||||
python examples/mic_client.py --url ws://localhost:8000/ws --verbose
|
||||
|
||||
Requirements:
|
||||
pip install sounddevice soundfile websockets numpy
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
import queue
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
print("Please install numpy: pip install numpy")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import sounddevice as sd
|
||||
except ImportError:
|
||||
print("Please install sounddevice: pip install sounddevice")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import websockets
|
||||
except ImportError:
|
||||
print("Please install websockets: pip install websockets")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class MicrophoneClient:
|
||||
"""
|
||||
Full-duplex microphone client for voice conversation.
|
||||
|
||||
Features:
|
||||
- Real-time microphone capture
|
||||
- Real-time speaker playback
|
||||
- WebSocket communication
|
||||
- Text chat support
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
sample_rate: int = 16000,
|
||||
chunk_duration_ms: int = 20,
|
||||
input_device: int = None,
|
||||
output_device: int = None
|
||||
):
|
||||
"""
|
||||
Initialize microphone client.
|
||||
|
||||
Args:
|
||||
url: WebSocket server URL
|
||||
sample_rate: Audio sample rate (Hz)
|
||||
chunk_duration_ms: Audio chunk duration (ms)
|
||||
input_device: Input device ID (None for default)
|
||||
output_device: Output device ID (None for default)
|
||||
"""
|
||||
self.url = url
|
||||
self.sample_rate = sample_rate
|
||||
self.chunk_duration_ms = chunk_duration_ms
|
||||
self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
|
||||
self.input_device = input_device
|
||||
self.output_device = output_device
|
||||
|
||||
# WebSocket connection
|
||||
self.ws = None
|
||||
self.running = False
|
||||
|
||||
# Audio buffers
|
||||
self.audio_input_queue = queue.Queue()
|
||||
self.audio_output_buffer = b"" # Continuous buffer for smooth playback
|
||||
self.audio_output_lock = threading.Lock()
|
||||
|
||||
# Statistics
|
||||
self.bytes_sent = 0
|
||||
self.bytes_received = 0
|
||||
|
||||
# State
|
||||
self.is_recording = True
|
||||
self.is_playing = True
|
||||
|
||||
# TTFB tracking (Time to First Byte)
|
||||
self.request_start_time = None
|
||||
self.first_audio_received = False
|
||||
|
||||
# Interrupt handling - discard audio until next trackStart
|
||||
self._discard_audio = False
|
||||
self._audio_sequence = 0 # Track audio sequence to detect stale chunks
|
||||
|
||||
# Verbose mode for streaming LLM responses
|
||||
self.verbose = False
|
||||
|
||||
async def connect(self) -> None:
|
||||
"""Connect to WebSocket server."""
|
||||
print(f"Connecting to {self.url}...")
|
||||
self.ws = await websockets.connect(self.url)
|
||||
self.running = True
|
||||
print("Connected!")
|
||||
|
||||
# Send invite command
|
||||
await self.send_command({
|
||||
"command": "invite",
|
||||
"option": {
|
||||
"codec": "pcm",
|
||||
"sampleRate": self.sample_rate
|
||||
}
|
||||
})
|
||||
|
||||
async def send_command(self, cmd: dict) -> None:
|
||||
"""Send JSON command to server."""
|
||||
if self.ws:
|
||||
await self.ws.send(json.dumps(cmd))
|
||||
print(f"→ Command: {cmd.get('command', 'unknown')}")
|
||||
|
||||
async def send_chat(self, text: str) -> None:
|
||||
"""Send chat message (text input)."""
|
||||
# Reset TTFB tracking for new request
|
||||
self.request_start_time = time.time()
|
||||
self.first_audio_received = False
|
||||
|
||||
await self.send_command({
|
||||
"command": "chat",
|
||||
"text": text
|
||||
})
|
||||
print(f"→ Chat: {text}")
|
||||
|
||||
async def send_interrupt(self) -> None:
|
||||
"""Send interrupt command."""
|
||||
await self.send_command({
|
||||
"command": "interrupt"
|
||||
})
|
||||
|
||||
async def send_hangup(self, reason: str = "User quit") -> None:
|
||||
"""Send hangup command."""
|
||||
await self.send_command({
|
||||
"command": "hangup",
|
||||
"reason": reason
|
||||
})
|
||||
|
||||
def _audio_input_callback(self, indata, frames, time, status):
|
||||
"""Callback for audio input (microphone)."""
|
||||
if status:
|
||||
print(f"Input status: {status}")
|
||||
|
||||
if self.is_recording and self.running:
|
||||
# Convert to 16-bit PCM
|
||||
audio_data = (indata[:, 0] * 32767).astype(np.int16).tobytes()
|
||||
self.audio_input_queue.put(audio_data)
|
||||
|
||||
def _add_audio_to_buffer(self, audio_data: bytes):
|
||||
"""Add audio data to playback buffer."""
|
||||
with self.audio_output_lock:
|
||||
self.audio_output_buffer += audio_data
|
||||
|
||||
def _playback_thread_func(self):
|
||||
"""Thread function for continuous audio playback."""
|
||||
import time
|
||||
|
||||
# Chunk size: 50ms of audio
|
||||
chunk_samples = int(self.sample_rate * 0.05)
|
||||
chunk_bytes = chunk_samples * 2
|
||||
|
||||
print(f"Audio playback thread started (device: {self.output_device or 'default'})")
|
||||
|
||||
try:
|
||||
# Create output stream with callback
|
||||
with sd.OutputStream(
|
||||
samplerate=self.sample_rate,
|
||||
channels=1,
|
||||
dtype='int16',
|
||||
blocksize=chunk_samples,
|
||||
device=self.output_device,
|
||||
latency='low'
|
||||
) as stream:
|
||||
while self.running:
|
||||
# Get audio from buffer
|
||||
with self.audio_output_lock:
|
||||
if len(self.audio_output_buffer) >= chunk_bytes:
|
||||
audio_data = self.audio_output_buffer[:chunk_bytes]
|
||||
self.audio_output_buffer = self.audio_output_buffer[chunk_bytes:]
|
||||
else:
|
||||
# Not enough audio - output silence
|
||||
audio_data = b'\x00' * chunk_bytes
|
||||
|
||||
# Convert to numpy array and write to stream
|
||||
samples = np.frombuffer(audio_data, dtype=np.int16).reshape(-1, 1)
|
||||
stream.write(samples)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Playback thread error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
async def _playback_task(self):
|
||||
"""Start playback thread and monitor it."""
|
||||
# Run playback in a dedicated thread for reliable timing
|
||||
playback_thread = threading.Thread(target=self._playback_thread_func, daemon=True)
|
||||
playback_thread.start()
|
||||
|
||||
# Wait for client to stop
|
||||
while self.running and playback_thread.is_alive():
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
print("Audio playback stopped")
|
||||
|
||||
async def audio_sender(self) -> None:
|
||||
"""Send audio from microphone to server."""
|
||||
while self.running:
|
||||
try:
|
||||
# Get audio from queue with timeout
|
||||
try:
|
||||
audio_data = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: self.audio_input_queue.get(timeout=0.1)
|
||||
)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
# Send to server
|
||||
if self.ws and self.is_recording:
|
||||
await self.ws.send(audio_data)
|
||||
self.bytes_sent += len(audio_data)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Audio sender error: {e}")
|
||||
break
|
||||
|
||||
async def receiver(self) -> None:
|
||||
"""Receive messages from server."""
|
||||
try:
|
||||
while self.running:
|
||||
try:
|
||||
message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
|
||||
|
||||
if isinstance(message, bytes):
|
||||
# Audio data received
|
||||
self.bytes_received += len(message)
|
||||
|
||||
# Check if we should discard this audio (after interrupt)
|
||||
if self._discard_audio:
|
||||
duration_ms = len(message) / (self.sample_rate * 2) * 1000
|
||||
print(f"← Audio: {duration_ms:.0f}ms (DISCARDED - waiting for new track)")
|
||||
continue
|
||||
|
||||
if self.is_playing:
|
||||
self._add_audio_to_buffer(message)
|
||||
|
||||
# Calculate and display TTFB for first audio packet
|
||||
if not self.first_audio_received and self.request_start_time:
|
||||
client_ttfb_ms = (time.time() - self.request_start_time) * 1000
|
||||
self.first_audio_received = True
|
||||
print(f"← [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
|
||||
|
||||
# Show progress (less verbose)
|
||||
with self.audio_output_lock:
|
||||
buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
|
||||
duration_ms = len(message) / (self.sample_rate * 2) * 1000
|
||||
print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
|
||||
|
||||
else:
|
||||
# JSON event
|
||||
event = json.loads(message)
|
||||
await self._handle_event(event)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except websockets.ConnectionClosed:
|
||||
print("Connection closed")
|
||||
self.running = False
|
||||
break
|
||||
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"Receiver error: {e}")
|
||||
self.running = False
|
||||
|
||||
async def _handle_event(self, event: dict) -> None:
|
||||
"""Handle incoming event."""
|
||||
event_type = event.get("event", "unknown")
|
||||
|
||||
if event_type == "answer":
|
||||
print("← Session ready!")
|
||||
elif event_type == "speaking":
|
||||
print("← User speech detected")
|
||||
elif event_type == "silence":
|
||||
print("← User silence detected")
|
||||
elif event_type == "transcript":
|
||||
# Display user speech transcription
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
if is_final:
|
||||
# Clear the interim line and print final
|
||||
print(" " * 80, end="\r") # Clear previous interim text
|
||||
print(f"→ You: {text}")
|
||||
else:
|
||||
# Interim result - show with indicator (overwrite same line)
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [listening] {display_text}".ljust(80), end="\r")
|
||||
elif event_type == "ttfb":
|
||||
# Server-side TTFB event
|
||||
latency_ms = event.get("latencyMs", 0)
|
||||
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
|
||||
elif event_type == "llmResponse":
|
||||
# LLM text response
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
if is_final:
|
||||
# Print final LLM response
|
||||
print(f"← AI: {text}")
|
||||
elif self.verbose:
|
||||
# Show streaming chunks only in verbose mode
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [streaming] {display_text}")
|
||||
elif event_type == "trackStart":
|
||||
print("← Bot started speaking")
|
||||
# IMPORTANT: Accept audio again after trackStart
|
||||
self._discard_audio = False
|
||||
self._audio_sequence += 1
|
||||
# Reset TTFB tracking for voice responses (when no chat was sent)
|
||||
if self.request_start_time is None:
|
||||
self.request_start_time = time.time()
|
||||
self.first_audio_received = False
|
||||
# Clear any old audio in buffer
|
||||
with self.audio_output_lock:
|
||||
self.audio_output_buffer = b""
|
||||
elif event_type == "trackEnd":
|
||||
print("← Bot finished speaking")
|
||||
# Reset TTFB tracking after response completes
|
||||
self.request_start_time = None
|
||||
self.first_audio_received = False
|
||||
elif event_type == "interrupt":
|
||||
print("← Bot interrupted!")
|
||||
# IMPORTANT: Discard all audio until next trackStart
|
||||
self._discard_audio = True
|
||||
# Clear audio buffer immediately
|
||||
with self.audio_output_lock:
|
||||
buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
|
||||
self.audio_output_buffer = b""
|
||||
print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
|
||||
elif event_type == "error":
|
||||
print(f"← Error: {event.get('error')}")
|
||||
elif event_type == "hangup":
|
||||
print(f"← Hangup: {event.get('reason')}")
|
||||
self.running = False
|
||||
else:
|
||||
print(f"← Event: {event_type}")
|
||||
|
||||
async def interactive_mode(self) -> None:
|
||||
"""Run interactive mode for text chat."""
|
||||
print("\n" + "=" * 50)
|
||||
print("Voice Conversation Client")
|
||||
print("=" * 50)
|
||||
print("Speak into your microphone to talk to the AI.")
|
||||
print("Or type messages to send text.")
|
||||
print("")
|
||||
print("Commands:")
|
||||
print(" /quit - End conversation")
|
||||
print(" /mute - Mute microphone")
|
||||
print(" /unmute - Unmute microphone")
|
||||
print(" /interrupt - Interrupt AI speech")
|
||||
print(" /stats - Show statistics")
|
||||
print("=" * 50 + "\n")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
user_input = await asyncio.get_event_loop().run_in_executor(
|
||||
None, input, ""
|
||||
)
|
||||
|
||||
if not user_input:
|
||||
continue
|
||||
|
||||
# Handle commands
|
||||
if user_input.startswith("/"):
|
||||
cmd = user_input.lower().strip()
|
||||
|
||||
if cmd == "/quit":
|
||||
await self.send_hangup("User quit")
|
||||
break
|
||||
elif cmd == "/mute":
|
||||
self.is_recording = False
|
||||
print("Microphone muted")
|
||||
elif cmd == "/unmute":
|
||||
self.is_recording = True
|
||||
print("Microphone unmuted")
|
||||
elif cmd == "/interrupt":
|
||||
await self.send_interrupt()
|
||||
elif cmd == "/stats":
|
||||
print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
|
||||
print(f"Received: {self.bytes_received / 1024:.1f} KB")
|
||||
else:
|
||||
print(f"Unknown command: {cmd}")
|
||||
else:
|
||||
# Send as chat message
|
||||
await self.send_chat(user_input)
|
||||
|
||||
except EOFError:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Input error: {e}")
|
||||
|
||||
async def run(self, chat_message: str = None, interactive: bool = True) -> None:
|
||||
"""
|
||||
Run the client.
|
||||
|
||||
Args:
|
||||
chat_message: Optional single chat message to send
|
||||
interactive: Whether to run in interactive mode
|
||||
"""
|
||||
try:
|
||||
await self.connect()
|
||||
|
||||
# Wait for answer
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Start audio input stream
|
||||
print("Starting audio streams...")
|
||||
|
||||
input_stream = sd.InputStream(
|
||||
samplerate=self.sample_rate,
|
||||
channels=1,
|
||||
dtype=np.float32,
|
||||
blocksize=self.chunk_samples,
|
||||
device=self.input_device,
|
||||
callback=self._audio_input_callback
|
||||
)
|
||||
|
||||
input_stream.start()
|
||||
print("Audio streams started")
|
||||
|
||||
# Start background tasks
|
||||
sender_task = asyncio.create_task(self.audio_sender())
|
||||
receiver_task = asyncio.create_task(self.receiver())
|
||||
playback_task = asyncio.create_task(self._playback_task())
|
||||
|
||||
if chat_message:
|
||||
# Send single message and wait
|
||||
await self.send_chat(chat_message)
|
||||
await asyncio.sleep(15)
|
||||
elif interactive:
|
||||
# Run interactive mode
|
||||
await self.interactive_mode()
|
||||
else:
|
||||
# Just wait
|
||||
while self.running:
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Cleanup
|
||||
self.running = False
|
||||
sender_task.cancel()
|
||||
receiver_task.cancel()
|
||||
playback_task.cancel()
|
||||
|
||||
try:
|
||||
await sender_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
try:
|
||||
await receiver_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
try:
|
||||
await playback_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
input_stream.stop()
|
||||
|
||||
except ConnectionRefusedError:
|
||||
print(f"Error: Could not connect to {self.url}")
|
||||
print("Make sure the server is running.")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
finally:
|
||||
await self.close()
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close the connection."""
|
||||
self.running = False
|
||||
if self.ws:
|
||||
await self.ws.close()
|
||||
|
||||
print(f"\nSession ended")
|
||||
print(f" Total sent: {self.bytes_sent / 1024:.1f} KB")
|
||||
print(f" Total received: {self.bytes_received / 1024:.1f} KB")
|
||||
|
||||
|
||||
def list_devices():
    """Print every audio device with its direction and default markers."""
    print("\nAvailable audio devices:")
    print("-" * 60)
    default_in = sd.default.device[0]
    default_out = sd.default.device[1]
    for idx, dev in enumerate(sd.query_devices()):
        # Direction: IN, OUT, IN/OUT, or N/A when the device has no channels.
        caps = []
        if dev['max_input_channels'] > 0:
            caps.append("IN")
        if dev['max_output_channels'] > 0:
            caps.append("OUT")
        direction_str = "/".join(caps) if caps else "N/A"

        default = ""
        if idx == default_in:
            default += " [DEFAULT INPUT]"
        if idx == default_out:
            default += " [DEFAULT OUTPUT]"

        print(f" {idx:2d}: {dev['name'][:40]:40s} ({direction_str}){default}")
    print("-" * 60)
||||
|
||||
async def main():
    """Parse CLI arguments and run the microphone client."""
    parser = argparse.ArgumentParser(
        description="Microphone client for duplex voice conversation"
    )
    parser.add_argument("--url", default="ws://localhost:8000/ws",
                        help="WebSocket server URL")
    parser.add_argument("--chat",
                        help="Send a single chat message instead of using microphone")
    parser.add_argument("--sample-rate", type=int, default=16000,
                        help="Audio sample rate (default: 16000)")
    parser.add_argument("--input-device", type=int, help="Input device ID")
    parser.add_argument("--output-device", type=int, help="Output device ID")
    parser.add_argument("--list-devices", action="store_true",
                        help="List available audio devices and exit")
    parser.add_argument("--no-interactive", action="store_true",
                        help="Disable interactive mode")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show streaming LLM response chunks")

    args = parser.parse_args()

    if args.list_devices:
        list_devices()
        return

    client = MicrophoneClient(
        url=args.url,
        sample_rate=args.sample_rate,
        input_device=args.input_device,
        output_device=args.output_device,
    )
    client.verbose = args.verbose

    await client.run(chat_message=args.chat, interactive=not args.no_interactive)
||||
|
||||
if __name__ == "__main__":
    # Ctrl-C exits quietly instead of dumping a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")
||||
285
examples/simple_client.py
Normal file
285
examples/simple_client.py
Normal file
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple WebSocket client for testing voice conversation.
|
||||
Uses PyAudio for more reliable audio playback on Windows.
|
||||
|
||||
Usage:
|
||||
python examples/simple_client.py
|
||||
python examples/simple_client.py --text "Hello"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import wave
|
||||
import io
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
print("pip install numpy")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import websockets
|
||||
except ImportError:
|
||||
print("pip install websockets")
|
||||
sys.exit(1)
|
||||
|
||||
# Try PyAudio first (more reliable on Windows)
|
||||
try:
|
||||
import pyaudio
|
||||
PYAUDIO_AVAILABLE = True
|
||||
except ImportError:
|
||||
PYAUDIO_AVAILABLE = False
|
||||
print("PyAudio not available, trying sounddevice...")
|
||||
|
||||
try:
|
||||
import sounddevice as sd
|
||||
SD_AVAILABLE = True
|
||||
except ImportError:
|
||||
SD_AVAILABLE = False
|
||||
|
||||
if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
|
||||
print("Please install pyaudio or sounddevice:")
|
||||
print(" pip install pyaudio")
|
||||
print(" or: pip install sounddevice")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class SimpleVoiceClient:
|
||||
"""Simple voice client with reliable audio playback."""
|
||||
|
||||
def __init__(self, url: str, sample_rate: int = 16000):
|
||||
self.url = url
|
||||
self.sample_rate = sample_rate
|
||||
self.ws = None
|
||||
self.running = False
|
||||
|
||||
# Audio buffer
|
||||
self.audio_buffer = b""
|
||||
|
||||
# PyAudio setup
|
||||
if PYAUDIO_AVAILABLE:
|
||||
self.pa = pyaudio.PyAudio()
|
||||
self.stream = None
|
||||
|
||||
# Stats
|
||||
self.bytes_received = 0
|
||||
|
||||
# TTFB tracking (Time to First Byte)
|
||||
self.request_start_time = None
|
||||
self.first_audio_received = False
|
||||
|
||||
# Interrupt handling - discard audio until next trackStart
|
||||
self._discard_audio = False
|
||||
|
||||
async def connect(self):
|
||||
"""Connect to server."""
|
||||
print(f"Connecting to {self.url}...")
|
||||
self.ws = await websockets.connect(self.url)
|
||||
self.running = True
|
||||
print("Connected!")
|
||||
|
||||
# Send invite
|
||||
await self.ws.send(json.dumps({
|
||||
"command": "invite",
|
||||
"option": {"codec": "pcm", "sampleRate": self.sample_rate}
|
||||
}))
|
||||
print("-> invite")
|
||||
|
||||
async def send_chat(self, text: str):
|
||||
"""Send chat message."""
|
||||
# Reset TTFB tracking for new request
|
||||
self.request_start_time = time.time()
|
||||
self.first_audio_received = False
|
||||
|
||||
await self.ws.send(json.dumps({"command": "chat", "text": text}))
|
||||
print(f"-> chat: {text}")
|
||||
|
||||
def play_audio(self, audio_data: bytes):
|
||||
"""Play audio data immediately."""
|
||||
if len(audio_data) == 0:
|
||||
return
|
||||
|
||||
if PYAUDIO_AVAILABLE:
|
||||
# Use PyAudio - more reliable on Windows
|
||||
if self.stream is None:
|
||||
self.stream = self.pa.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=self.sample_rate,
|
||||
output=True,
|
||||
frames_per_buffer=1024
|
||||
)
|
||||
self.stream.write(audio_data)
|
||||
elif SD_AVAILABLE:
|
||||
# Use sounddevice
|
||||
samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
|
||||
sd.play(samples, self.sample_rate, blocking=True)
|
||||
|
||||
async def receive_loop(self):
|
||||
"""Receive and play audio."""
|
||||
print("\nWaiting for response...")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
msg = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
|
||||
|
||||
if isinstance(msg, bytes):
|
||||
# Audio data
|
||||
self.bytes_received += len(msg)
|
||||
duration_ms = len(msg) / (self.sample_rate * 2) * 1000
|
||||
|
||||
# Check if we should discard this audio (after interrupt)
|
||||
if self._discard_audio:
|
||||
print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms) [DISCARDED]")
|
||||
continue
|
||||
|
||||
# Calculate and display TTFB for first audio packet
|
||||
if not self.first_audio_received and self.request_start_time:
|
||||
client_ttfb_ms = (time.time() - self.request_start_time) * 1000
|
||||
self.first_audio_received = True
|
||||
print(f"<- [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
|
||||
|
||||
print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)")
|
||||
|
||||
# Play immediately in executor to not block
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, self.play_audio, msg)
|
||||
else:
|
||||
# JSON event
|
||||
event = json.loads(msg)
|
||||
etype = event.get("event", "?")
|
||||
|
||||
if etype == "transcript":
|
||||
# User speech transcription
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
if is_final:
|
||||
print(f"<- You said: {text}")
|
||||
else:
|
||||
print(f"<- [listening] {text}", end="\r")
|
||||
elif etype == "ttfb":
|
||||
# Server-side TTFB event
|
||||
latency_ms = event.get("latencyMs", 0)
|
||||
print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
|
||||
elif etype == "trackStart":
|
||||
# New track starting - accept audio again
|
||||
self._discard_audio = False
|
||||
print(f"<- {etype}")
|
||||
elif etype == "interrupt":
|
||||
# Interrupt - discard audio until next trackStart
|
||||
self._discard_audio = True
|
||||
print(f"<- {etype} (discarding audio until new track)")
|
||||
elif etype == "hangup":
|
||||
print(f"<- {etype}")
|
||||
self.running = False
|
||||
break
|
||||
else:
|
||||
print(f"<- {etype}")
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except websockets.ConnectionClosed:
|
||||
print("Connection closed")
|
||||
self.running = False
|
||||
break
|
||||
|
||||
async def run(self, text: str = None):
|
||||
"""Run the client."""
|
||||
try:
|
||||
await self.connect()
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Start receiver
|
||||
recv_task = asyncio.create_task(self.receive_loop())
|
||||
|
||||
if text:
|
||||
await self.send_chat(text)
|
||||
# Wait for response
|
||||
await asyncio.sleep(30)
|
||||
else:
|
||||
# Interactive mode
|
||||
print("\nType a message and press Enter (or 'quit' to exit):")
|
||||
while self.running:
|
||||
try:
|
||||
user_input = await asyncio.get_event_loop().run_in_executor(
|
||||
None, input, "> "
|
||||
)
|
||||
if user_input.lower() == 'quit':
|
||||
break
|
||||
if user_input.strip():
|
||||
await self.send_chat(user_input)
|
||||
except EOFError:
|
||||
break
|
||||
|
||||
self.running = False
|
||||
recv_task.cancel()
|
||||
try:
|
||||
await recv_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
finally:
|
||||
await self.close()
|
||||
|
||||
async def close(self):
|
||||
"""Close connections."""
|
||||
self.running = False
|
||||
|
||||
if PYAUDIO_AVAILABLE:
|
||||
if self.stream:
|
||||
self.stream.stop_stream()
|
||||
self.stream.close()
|
||||
self.pa.terminate()
|
||||
|
||||
if self.ws:
|
||||
await self.ws.close()
|
||||
|
||||
print(f"\nTotal audio received: {self.bytes_received / 1024:.1f} KB")
|
||||
|
||||
|
||||
def list_audio_devices():
    """Print output-capable audio devices for each available backend."""
    print("\n=== Audio Devices ===")

    if PYAUDIO_AVAILABLE:
        pa = pyaudio.PyAudio()
        print("\nPyAudio devices:")
        # Query the default output index once, not per device.
        default_idx = pa.get_default_output_device_info()['index']
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            if info['maxOutputChannels'] > 0:
                default = " [DEFAULT]" if i == default_idx else ""
                print(f" {i}: {info['name']}{default}")
        pa.terminate()

    if SD_AVAILABLE:
        print("\nSounddevice devices:")
        default_out = sd.default.device[1]
        for i, d in enumerate(sd.query_devices()):
            if d['max_output_channels'] > 0:
                default = " [DEFAULT]" if i == default_out else ""
                print(f" {i}: {d['name']}{default}")
|
||||
async def main():
    """Parse arguments and run the simple voice client."""
    parser = argparse.ArgumentParser(description="Simple voice client")
    parser.add_argument("--url", default="ws://localhost:8000/ws")
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
    args = parser.parse_args()

    if args.list_devices:
        list_audio_devices()
        return

    client = SimpleVoiceClient(args.url, args.sample_rate)
    await client.run(args.text)
|
||||
if __name__ == "__main__":
    # Script entry point.
    asyncio.run(main())
||||
176
examples/test_websocket.py
Normal file
176
examples/test_websocket.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""WebSocket endpoint test client.
|
||||
|
||||
Tests the /ws endpoint with sine wave or file audio streaming.
|
||||
Based on reference/py-active-call/exec/test_ws_endpoint/test_ws.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import json
|
||||
import struct
|
||||
import math
|
||||
import argparse
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
# --- Configuration -----------------------------------------------------------
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
FREQUENCY = 440  # 440Hz Sine Wave
CHUNK_DURATION_MS = 20
# 16 kHz * 2 bytes/sample (16-bit) * 20 ms = 640 bytes per chunk.
CHUNK_SIZE_BYTES = int(SAMPLE_RATE * 2 * (CHUNK_DURATION_MS / 1000.0))
||||
|
||||
def generate_sine_wave(duration_ms=1000):
    """Return *duration_ms* of FREQUENCY-Hz sine audio as 16-bit mono PCM."""
    num_samples = int(SAMPLE_RATE * (duration_ms / 1000.0))
    audio_data = bytearray()

    for n in range(num_samples):
        # Sample the sine at this position, scaled to the int16 range,
        # and append it as a little-endian 16-bit value.
        value = int(32767.0 * math.sin(2 * math.pi * FREQUENCY * n / SAMPLE_RATE))
        audio_data.extend(struct.pack('<h', value))

    return audio_data
||||
|
||||
async def receive_loop(ws, ready_event: asyncio.Event):
    """Print incoming frames; set *ready_event* on session.started."""
    print("👂 Listening for server responses...")
    async for msg in ws:
        timestamp = datetime.now().strftime("%H:%M:%S")

        if msg.type == aiohttp.WSMsgType.TEXT:
            # JSON event (or plain text if decoding fails).
            try:
                data = json.loads(msg.data)
                event_type = data.get('type', 'Unknown')
                print(f"[{timestamp}] 📨 Event: {event_type} | {msg.data[:150]}...")
                if event_type == "session.started":
                    ready_event.set()
            except json.JSONDecodeError:
                print(f"[{timestamp}] 📨 Text: {msg.data[:100]}...")

        elif msg.type == aiohttp.WSMsgType.BINARY:
            # Audio coming back (e.g. TTS/echo); overwrite one console line.
            print(f"[{timestamp}] 🔊 Audio: {len(msg.data)} bytes", end="\r")

        elif msg.type == aiohttp.WSMsgType.CLOSED:
            print(f"\n[{timestamp}] ❌ Socket Closed")
            break

        elif msg.type == aiohttp.WSMsgType.ERROR:
            print(f"\n[{timestamp}] ⚠️ Socket Error")
            break
|
||||
async def send_file_loop(ws, file_path):
    """Stream *file_path* to the server in real-time paced chunks."""
    if not os.path.exists(file_path):
        print(f"❌ Error: File '{file_path}' not found.")
        return

    print(f"📂 Streaming file: {file_path} ...")

    with open(file_path, "rb") as f:
        # NOTE(review): assumes a canonical 44-byte WAV header; files with
        # extra RIFF chunks would need real parsing — confirm inputs.
        if file_path.endswith('.wav'):
            f.read(44)

        while chunk := f.read(CHUNK_SIZE_BYTES):
            # Binary frame, then sleep to simulate real-time playback.
            await ws.send_bytes(chunk)
            await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)

    print(f"\n✅ Finished streaming {file_path}")
|
||||
async def send_sine_loop(ws):
    """Stream a generated sine wave to the server in real-time chunks.

    Fix: the original comment claimed a 10-second buffer, but
    ``generate_sine_wave(5000)`` produces 5000 ms of audio.
    """
    print("🎙️ Starting Audio Stream (Sine Wave)...")

    # Generate 5 seconds of audio buffer.
    audio_buffer = generate_sine_wave(5000)
    cursor = 0

    while cursor < len(audio_buffer):
        chunk = audio_buffer[cursor:cursor + CHUNK_SIZE_BYTES]
        if not chunk:
            break

        await ws.send_bytes(chunk)
        cursor += len(chunk)

        # Pace to real time: one chunk per CHUNK_DURATION_MS.
        await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)

    print("\n✅ Finished streaming test audio.")
|
||||
async def run_client(url, file_path=None, use_sine=False):
    """Connect to *url*, perform the v1 handshake, stream audio, shut down."""
    session = aiohttp.ClientSession()
    try:
        print(f"🔌 Connecting to {url}...")
        async with session.ws_connect(url) as ws:
            print("✅ Connected!")
            session_ready = asyncio.Event()
            recv_task = asyncio.create_task(receive_loop(ws, session_ready))

            # v1 handshake: hello, then session.start with audio parameters.
            await ws.send_json({"type": "hello", "version": "v1"})
            start_msg = {
                "type": "session.start",
                "audio": {
                    "encoding": "pcm_s16le",
                    "sample_rate_hz": SAMPLE_RATE,
                    "channels": 1
                }
            }
            await ws.send_json(start_msg)
            print("📤 Sent v1 hello/session.start")
            await asyncio.wait_for(session_ready.wait(), timeout=8)

            # Pick the audio source (sine wave is the default).
            if use_sine:
                await send_sine_loop(ws)
            elif file_path:
                await send_file_loop(ws, file_path)
            else:
                await send_sine_loop(ws)

            # Stop the session and drain any trailing events.
            await ws.send_json({"type": "session.stop", "reason": "test_complete"})
            await asyncio.sleep(1)
            recv_task.cancel()
            try:
                await recv_task
            except asyncio.CancelledError:
                pass

    except aiohttp.ClientConnectorError:
        print(f"❌ Connection Failed. Is the server running at {url}?")
    except asyncio.TimeoutError:
        print("❌ Timeout waiting for session.started")
    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        await session.close()
|
||||
if __name__ == "__main__":
    # Command-line entry point for the test client.
    cli = argparse.ArgumentParser(description="WebSocket Audio Test Client")
    cli.add_argument("--url", default=SERVER_URL, help="WebSocket endpoint URL")
    cli.add_argument("--file", help="Path to PCM/WAV file to stream")
    cli.add_argument("--sine", action="store_true", help="Use sine wave generation (default)")
    options = cli.parse_args()

    try:
        asyncio.run(run_client(options.url, options.file, options.sine))
    except KeyboardInterrupt:
        # Graceful Ctrl+C shutdown.
        print("\n👋 Client stopped.")
|
||||
504
examples/wav_client.py
Normal file
504
examples/wav_client.py
Normal file
@@ -0,0 +1,504 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
WAV file client for testing duplex voice conversation.
|
||||
|
||||
This client reads audio from a WAV file, sends it to the server,
|
||||
and saves the AI's voice response to an output WAV file.
|
||||
|
||||
Usage:
|
||||
python examples/wav_client.py --input input.wav --output response.wav
|
||||
python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
|
||||
python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
|
||||
python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
|
||||
Requirements:
|
||||
pip install soundfile websockets numpy
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
print("Please install numpy: pip install numpy")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import soundfile as sf
|
||||
except ImportError:
|
||||
print("Please install soundfile: pip install soundfile")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import websockets
|
||||
except ImportError:
|
||||
print("Please install websockets: pip install websockets")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """

    def __init__(
        self,
        url: str,
        input_file: str,
        output_file: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
        verbose: bool = False
    ):
        """
        Initialize WAV file client.

        Args:
            url: WebSocket server URL
            input_file: Input WAV file path
            output_file: Output WAV file path
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms) for sending
            wait_time: Time to wait for response after sending (seconds)
            verbose: Enable verbose output
        """
        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        # Samples per outgoing chunk (e.g. 16000 Hz * 20 ms = 320).
        self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
        self.wait_time = wait_time
        self.verbose = verbose

        # WebSocket connection
        self.ws = None
        self.running = False

        # Audio buffers
        self.received_audio = bytearray()

        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0

        # TTFB tracking (per response)
        self.send_start_time = None
        self.response_start_time = None  # set on each trackStart
        self.waiting_for_first_audio = False
        self.ttfb_ms = None  # last TTFB for summary
        self.ttfb_list = []  # TTFB for each response

        # State tracking
        self.track_started = False
        self.track_ended = False
        self.send_completed = False

        # Events log
        self.events_log = []

    def log_event(self, direction: str, message: str):
        """Log an event with timestamp and echo it to the console."""
        timestamp = time.time()
        self.events_log.append({
            "timestamp": timestamp,
            "direction": direction,
            "message": message
        })
        # Handle encoding errors on Windows
        try:
            print(f"{direction} {message}")
        except UnicodeEncodeError:
            # Replace problematic characters for console output
            safe_message = message.encode('ascii', errors='replace').decode('ascii')
            print(f"{direction} {safe_message}")

    async def connect(self) -> None:
        """Connect to WebSocket server and send the initial invite."""
        self.log_event("→", f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        self.log_event("←", "Connected!")

        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send JSON command to server."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")

    async def send_hangup(self, reason: str = "Session complete") -> None:
        """Send hangup command."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def load_wav_file(self) -> tuple[np.ndarray, int]:
        """
        Load and prepare WAV file for sending.

        Converts to mono, resamples to self.sample_rate (linear
        interpolation), and quantizes to int16 PCM.

        Returns:
            Tuple of (audio_data as int16 numpy array, original sample rate)

        Raises:
            FileNotFoundError: If the input file does not exist.
        """
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Load audio file
        audio_data, file_sample_rate = sf.read(self.input_file)
        self.log_event("→", f"Loaded: {self.input_file}")
        self.log_event("→", f"  Original sample rate: {file_sample_rate} Hz")
        self.log_event("→", f"  Duration: {len(audio_data) / file_sample_rate:.2f}s")

        # Convert stereo to mono if needed
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)
            self.log_event("→", "  Converted stereo to mono")

        # Resample if needed
        if file_sample_rate != self.sample_rate:
            # Simple resampling using numpy
            duration = len(audio_data) / file_sample_rate
            num_samples = int(duration * self.sample_rate)
            indices = np.linspace(0, len(audio_data) - 1, num_samples)
            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
            self.log_event("→", f"  Resampled to {self.sample_rate} Hz")

        # Convert to int16
        if audio_data.dtype != np.int16:
            # Normalize to [-1, 1] if needed
            max_val = np.max(np.abs(audio_data))
            if max_val > 1.0:
                audio_data = audio_data / max_val
            audio_data = (audio_data * 32767).astype(np.int16)

        self.log_event("→", f"  Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")

        return audio_data, file_sample_rate

    async def audio_sender(self, audio_data: np.ndarray) -> None:
        """Send audio data to server in real-time-paced chunks."""
        total_samples = len(audio_data)
        chunk_size = self.chunk_samples
        sent_samples = 0

        self.send_start_time = time.time()
        self.log_event("→", f"Starting audio transmission ({total_samples} samples)...")

        while sent_samples < total_samples and self.running:
            # Get next chunk
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk = audio_data[sent_samples:end_sample]
            chunk_bytes = chunk.tobytes()

            # Send to server
            if self.ws:
                await self.ws.send(chunk_bytes)
                self.bytes_sent += len(chunk_bytes)

            sent_samples = end_sample

            # Progress logging (every 500ms worth of audio)
            if self.verbose and sent_samples % (self.sample_rate // 2) == 0:
                progress = (sent_samples / total_samples) * 100
                print(f"  Sending: {progress:.0f}%", end="\r")

            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            await asyncio.sleep(self.chunk_duration_ms / 1000)

        self.send_completed = True
        elapsed = time.time() - self.send_start_time
        self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")

    async def receiver(self) -> None:
        """Receive messages from server (binary audio + JSON events)."""
        try:
            while self.running:
                try:
                    # Short timeout so the loop can notice self.running flips.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)

                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        self.received_audio.extend(message)

                        # Calculate TTFB on first audio of each response
                        if self.waiting_for_first_audio and self.response_start_time is not None:
                            ttfb_ms = (time.time() - self.response_start_time) * 1000
                            self.ttfb_ms = ttfb_ms
                            self.ttfb_list.append(ttfb_ms)
                            self.waiting_for_first_audio = False
                            self.log_event("←", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")

                        # Log progress
                        duration_ms = len(message) / (self.sample_rate * 2) * 1000
                        total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000
                        if self.verbose:
                            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")

                    else:
                        # JSON event
                        event = json.loads(message)
                        await self._handle_event(event)

                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("←", "Connection closed")
                    self.running = False
                    break

        except asyncio.CancelledError:
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False

    async def _handle_event(self, event: dict) -> None:
        """Handle one incoming JSON event and update client state."""
        event_type = event.get("event", "unknown")

        if event_type == "answer":
            self.log_event("←", "Session ready!")
        elif event_type == "speaking":
            self.log_event("←", "Speech detected")
        elif event_type == "silence":
            self.log_event("←", "Silence detected")
        elif event_type == "transcript":
            # ASR transcript (interim = asrDelta-style, final = asrFinal-style)
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                # Clear interim line and print final
                print(" " * 80, end="\r")
                self.log_event("←", f"→ You: {text}")
            else:
                # Interim result - show with indicator (overwrite same line, as in mic_client)
                display_text = text[:60] + "..." if len(text) > 60 else text
                print(f"  [listening] {display_text}".ljust(80), end="\r")
        elif event_type == "ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "llmResponse":
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
            elif self.verbose:
                # Show streaming chunks only in verbose mode
                self.log_event("←", f"LLM: {text}")
        elif event_type == "trackStart":
            self.track_started = True
            self.response_start_time = time.time()
            self.waiting_for_first_audio = True
            self.log_event("←", "Bot started speaking")
        elif event_type == "trackEnd":
            self.track_ended = True
            self.log_event("←", "Bot finished speaking")
        elif event_type == "interrupt":
            self.log_event("←", "Bot interrupted!")
        elif event_type == "error":
            self.log_event("!", f"Error: {event.get('error')}")
        elif event_type == "hangup":
            self.log_event("←", f"Hangup: {event.get('reason')}")
            self.running = False
        else:
            self.log_event("←", f"Event: {event_type}")

    def save_output_wav(self) -> None:
        """Save received audio to output WAV file (16-bit mono PCM)."""
        if not self.received_audio:
            self.log_event("!", "No audio received to save")
            return

        # Convert bytes to numpy array
        audio_data = np.frombuffer(bytes(self.received_audio), dtype=np.int16)

        # Ensure output directory exists
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save using wave module for compatibility
        with wave.open(str(self.output_file), 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(audio_data.tobytes())

        duration = len(audio_data) / self.sample_rate
        self.log_event("→", f"Saved output: {self.output_file}")
        self.log_event("→", f"  Duration: {duration:.2f}s ({len(audio_data)} samples)")
        self.log_event("→", f"  Size: {len(self.received_audio)/1024:.1f} KB")

    async def run(self) -> None:
        """Run the WAV file test end-to-end: load, send, receive, save."""
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()

            # Connect to server
            await self.connect()

            # Wait for answer
            await asyncio.sleep(0.5)

            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())

            # Send audio
            await self.audio_sender(audio_data)

            # Wait for response
            self.log_event("→", f"Waiting {self.wait_time}s for response...")

            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)

            # Cleanup
            self.running = False
            receiver_task.cancel()

            try:
                await receiver_task
            except asyncio.CancelledError:
                pass

            # Save output
            self.save_output_wav()

            # Print summary
            self._print_summary()

        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            await self.close()

    def _print_summary(self):
        """Print session summary."""
        print("\n" + "=" * 50)
        print("Session Summary")
        print("=" * 50)
        print(f"  Input file:  {self.input_file}")
        print(f"  Output file: {self.output_file}")
        print(f"  Bytes sent:     {self.bytes_sent / 1024:.1f} KB")
        print(f"  Bytes received: {self.bytes_received / 1024:.1f} KB")
        if self.ttfb_list:
            if len(self.ttfb_list) == 1:
                print(f"  TTFB: {self.ttfb_list[0]:.0f} ms")
            else:
                print(f"  TTFB (per response): {', '.join(f'{t:.0f}ms' for t in self.ttfb_list)}")
        if self.received_audio:
            duration = len(self.received_audio) / (self.sample_rate * 2)
            print(f"  Response duration: {duration:.2f}s")
        print("=" * 50)

    async def close(self) -> None:
        """Close the connection."""
        self.running = False
        if self.ws:
            try:
                await self.ws.close()
            except Exception:
                # Best-effort close. Narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit are not silently swallowed.
                pass
|
||||
|
||||
async def main():
    """Parse CLI arguments and run a single WAV-file test session."""
    parser = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    parser.add_argument("--input", "-i", required=True,
                        help="Input WAV file path")
    parser.add_argument("--output", "-o", required=True,
                        help="Output WAV file path for response")
    parser.add_argument("--url", default="ws://localhost:8000/ws",
                        help="WebSocket server URL (default: ws://localhost:8000/ws)")
    parser.add_argument("--sample-rate", type=int, default=16000,
                        help="Target sample rate for audio (default: 16000)")
    parser.add_argument("--chunk-duration", type=int, default=20,
                        help="Chunk duration in ms for sending (default: 20)")
    parser.add_argument("--wait-time", "-w", type=float, default=15.0,
                        help="Time to wait for response after sending (default: 15.0)")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose output")

    args = parser.parse_args()

    # Build the client from the parsed flags and drive one full session.
    client = WavFileClient(
        url=args.url,
        input_file=args.input,
        output_file=args.output,
        sample_rate=args.sample_rate,
        chunk_duration_ms=args.chunk_duration,
        wait_time=args.wait_time,
        verbose=args.verbose
    )

    await client.run()
||||
|
||||
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Allow a clean Ctrl+C exit without a traceback.
        print("\nInterrupted by user")
|
||||
766
examples/web_client.html
Normal file
766
examples/web_client.html
Normal file
@@ -0,0 +1,766 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>Duplex Voice Web Client</title>
|
||||
<style>
|
||||
@import url("https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,300;9..144,500;9..144,700&family=Recursive:wght@300;400;600;700&display=swap");
|
||||
|
||||
:root {
|
||||
--bg: #0b0b0f;
|
||||
--panel: #14141c;
|
||||
--panel-2: #101018;
|
||||
--ink: #f2f3f7;
|
||||
--muted: #a7acba;
|
||||
--accent: #ff6b6b;
|
||||
--accent-2: #ffd166;
|
||||
--good: #2dd4bf;
|
||||
--bad: #f87171;
|
||||
--grid: rgba(255, 255, 255, 0.06);
|
||||
--shadow: 0 20px 60px rgba(0, 0, 0, 0.45);
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
html,
|
||||
body {
|
||||
height: 100%;
|
||||
margin: 0;
|
||||
color: var(--ink);
|
||||
background: radial-gradient(1200px 600px at 20% -10%, #1d1d2a 0%, transparent 60%),
|
||||
radial-gradient(800px 800px at 110% 10%, #20203a 0%, transparent 50%),
|
||||
var(--bg);
|
||||
font-family: "Recursive", ui-sans-serif, system-ui, -apple-system, "Segoe UI", sans-serif;
|
||||
}
|
||||
|
||||
.noise {
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
background-image: url("data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' width='120' height='120' viewBox='0 0 120 120'><filter id='n'><feTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='2' stitchTiles='stitch'/></filter><rect width='120' height='120' filter='url(%23n)' opacity='0.06'/></svg>");
|
||||
pointer-events: none;
|
||||
mix-blend-mode: soft-light;
|
||||
}
|
||||
|
||||
header {
|
||||
padding: 32px 28px 18px;
|
||||
border-bottom: 1px solid var(--grid);
|
||||
}
|
||||
|
||||
h1 {
|
||||
font-family: "Fraunces", serif;
|
||||
font-weight: 600;
|
||||
margin: 0 0 6px;
|
||||
letter-spacing: 0.4px;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: var(--muted);
|
||||
font-size: 0.95rem;
|
||||
}
|
||||
|
||||
main {
|
||||
display: grid;
|
||||
grid-template-columns: 1.1fr 1.4fr;
|
||||
gap: 24px;
|
||||
padding: 24px 28px 40px;
|
||||
}
|
||||
|
||||
.panel {
|
||||
background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent),
|
||||
var(--panel);
|
||||
border: 1px solid var(--grid);
|
||||
border-radius: 16px;
|
||||
padding: 20px;
|
||||
box-shadow: var(--shadow);
|
||||
}
|
||||
|
||||
.panel h2 {
|
||||
margin: 0 0 12px;
|
||||
font-size: 1.05rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.stack {
|
||||
display: grid;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
label {
|
||||
display: block;
|
||||
font-size: 0.85rem;
|
||||
color: var(--muted);
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
|
||||
input,
|
||||
select,
|
||||
button,
|
||||
textarea {
|
||||
font-family: inherit;
|
||||
}
|
||||
|
||||
input,
|
||||
select,
|
||||
textarea {
|
||||
width: 100%;
|
||||
padding: 10px 12px;
|
||||
border-radius: 10px;
|
||||
border: 1px solid var(--grid);
|
||||
background: var(--panel-2);
|
||||
color: var(--ink);
|
||||
outline: none;
|
||||
}
|
||||
|
||||
textarea {
|
||||
min-height: 80px;
|
||||
resize: vertical;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.btn-row {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
button {
|
||||
border: none;
|
||||
border-radius: 999px;
|
||||
padding: 10px 16px;
|
||||
font-weight: 600;
|
||||
background: var(--ink);
|
||||
color: #111;
|
||||
cursor: pointer;
|
||||
transition: transform 0.2s ease, box-shadow 0.2s ease;
|
||||
}
|
||||
|
||||
button.secondary {
|
||||
background: transparent;
|
||||
color: var(--ink);
|
||||
border: 1px solid var(--grid);
|
||||
}
|
||||
|
||||
button.accent {
|
||||
background: linear-gradient(120deg, var(--accent), #f97316);
|
||||
color: #0b0b0f;
|
||||
}
|
||||
|
||||
button.good {
|
||||
background: linear-gradient(120deg, var(--good), #22c55e);
|
||||
color: #07261f;
|
||||
}
|
||||
|
||||
button.bad {
|
||||
background: linear-gradient(120deg, var(--bad), #f97316);
|
||||
color: #2a0b0b;
|
||||
}
|
||||
|
||||
button:active {
|
||||
transform: translateY(1px) scale(0.99);
|
||||
}
|
||||
|
||||
.status {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
padding: 12px;
|
||||
background: rgba(255, 255, 255, 0.03);
|
||||
border-radius: 12px;
|
||||
border: 1px dashed var(--grid);
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.dot {
|
||||
width: 10px;
|
||||
height: 10px;
|
||||
border-radius: 999px;
|
||||
background: var(--bad);
|
||||
box-shadow: 0 0 12px rgba(248, 113, 113, 0.5);
|
||||
}
|
||||
|
||||
.dot.on {
|
||||
background: var(--good);
|
||||
box-shadow: 0 0 12px rgba(45, 212, 191, 0.7);
|
||||
}
|
||||
|
||||
.log {
|
||||
height: 320px;
|
||||
overflow: auto;
|
||||
padding: 12px;
|
||||
background: #0d0d14;
|
||||
border-radius: 12px;
|
||||
border: 1px solid var(--grid);
|
||||
font-size: 0.85rem;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.chat {
|
||||
height: 260px;
|
||||
overflow: auto;
|
||||
padding: 12px;
|
||||
background: #0d0d14;
|
||||
border-radius: 12px;
|
||||
border: 1px solid var(--grid);
|
||||
font-size: 0.9rem;
|
||||
line-height: 1.45;
|
||||
}
|
||||
|
||||
.chat-entry {
|
||||
padding: 8px 10px;
|
||||
margin-bottom: 8px;
|
||||
border-radius: 10px;
|
||||
background: rgba(255, 255, 255, 0.04);
|
||||
border: 1px solid rgba(255, 255, 255, 0.06);
|
||||
}
|
||||
|
||||
.chat-entry.user {
|
||||
border-left: 3px solid var(--accent-2);
|
||||
}
|
||||
|
||||
.chat-entry.ai {
|
||||
border-left: 3px solid var(--good);
|
||||
}
|
||||
|
||||
.chat-entry.interim {
|
||||
opacity: 0.7;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
.log-entry {
|
||||
padding: 6px 8px;
|
||||
border-bottom: 1px dashed rgba(255, 255, 255, 0.06);
|
||||
}
|
||||
|
||||
.log-entry:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.tag {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 2px 8px;
|
||||
border-radius: 999px;
|
||||
font-size: 0.7rem;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.6px;
|
||||
background: rgba(255, 255, 255, 0.08);
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
.tag.event {
|
||||
background: rgba(255, 107, 107, 0.18);
|
||||
color: #ffc1c1;
|
||||
}
|
||||
|
||||
.tag.audio {
|
||||
background: rgba(45, 212, 191, 0.2);
|
||||
color: #c5f9f0;
|
||||
}
|
||||
|
||||
.tag.sys {
|
||||
background: rgba(255, 209, 102, 0.2);
|
||||
color: #ffefb0;
|
||||
}
|
||||
|
||||
.muted {
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
footer {
|
||||
padding: 0 28px 28px;
|
||||
color: var(--muted);
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
@media (max-width: 1100px) {
|
||||
main {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
.log {
|
||||
height: 360px;
|
||||
}
|
||||
.chat {
|
||||
height: 260px;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="noise"></div>
|
||||
<header>
|
||||
<h1>Duplex Voice Client</h1>
|
||||
<div class="subtitle">Browser client for the WebSocket duplex pipeline. Device selection + event logging.</div>
|
||||
</header>
|
||||
|
||||
<main>
|
||||
<section class="panel stack">
|
||||
<h2>Connection</h2>
|
||||
<div>
|
||||
<label for="wsUrl">WebSocket URL</label>
|
||||
<input id="wsUrl" value="ws://localhost:8000/ws" />
|
||||
</div>
|
||||
<div class="btn-row">
|
||||
<button class="accent" id="connectBtn">Connect</button>
|
||||
<button class="secondary" id="disconnectBtn">Disconnect</button>
|
||||
</div>
|
||||
<div class="status">
|
||||
<div id="statusDot" class="dot"></div>
|
||||
<div>
|
||||
<div id="statusText">Disconnected</div>
|
||||
<div class="muted" id="statusSub">Waiting for connection</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2>Devices</h2>
|
||||
<div class="row">
|
||||
<div>
|
||||
<label for="inputSelect">Input (Mic)</label>
|
||||
<select id="inputSelect"></select>
|
||||
</div>
|
||||
<div>
|
||||
<label for="outputSelect">Output (Speaker)</label>
|
||||
<select id="outputSelect"></select>
|
||||
</div>
|
||||
</div>
|
||||
<div class="btn-row">
|
||||
<button class="secondary" id="refreshDevicesBtn">Refresh Devices</button>
|
||||
<button class="good" id="startMicBtn">Start Mic</button>
|
||||
<button class="secondary" id="stopMicBtn">Stop Mic</button>
|
||||
</div>
|
||||
|
||||
<h2>Chat</h2>
|
||||
<div class="stack">
|
||||
<textarea id="chatInput" placeholder="Type a message, press Send"></textarea>
|
||||
<div class="btn-row">
|
||||
<button class="accent" id="sendChatBtn">Send Chat</button>
|
||||
<button class="secondary" id="clearLogBtn">Clear Log</button>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="stack">
|
||||
<div class="panel stack">
|
||||
<h2>Chat History</h2>
|
||||
<div class="chat" id="chatHistory"></div>
|
||||
</div>
|
||||
<div class="panel stack">
|
||||
<h2>Event Log</h2>
|
||||
<div class="log" id="log"></div>
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
|
||||
<footer>
|
||||
Output device selection requires HTTPS + a browser that supports <code>setSinkId</code>.
|
||||
Audio is sent as 16-bit PCM @ 16 kHz, matching <code>examples/mic_client.py</code>.
|
||||
</footer>
|
||||
|
||||
<audio id="audioOut" autoplay></audio>
|
||||
|
||||
<script>
|
||||
const wsUrl = document.getElementById("wsUrl");
|
||||
const connectBtn = document.getElementById("connectBtn");
|
||||
const disconnectBtn = document.getElementById("disconnectBtn");
|
||||
const inputSelect = document.getElementById("inputSelect");
|
||||
const outputSelect = document.getElementById("outputSelect");
|
||||
const startMicBtn = document.getElementById("startMicBtn");
|
||||
const stopMicBtn = document.getElementById("stopMicBtn");
|
||||
const refreshDevicesBtn = document.getElementById("refreshDevicesBtn");
|
||||
const sendChatBtn = document.getElementById("sendChatBtn");
|
||||
const clearLogBtn = document.getElementById("clearLogBtn");
|
||||
const chatInput = document.getElementById("chatInput");
|
||||
const logEl = document.getElementById("log");
|
||||
const chatHistory = document.getElementById("chatHistory");
|
||||
const statusDot = document.getElementById("statusDot");
|
||||
const statusText = document.getElementById("statusText");
|
||||
const statusSub = document.getElementById("statusSub");
|
||||
const audioOut = document.getElementById("audioOut");
|
||||
|
||||
let ws = null;
|
||||
let audioCtx = null;
|
||||
let micStream = null;
|
||||
let processor = null;
|
||||
let micSource = null;
|
||||
let playbackDest = null;
|
||||
let playbackTime = 0;
|
||||
let discardAudio = false;
|
||||
let playbackSources = [];
|
||||
let interimUserEl = null;
|
||||
let interimAiEl = null;
|
||||
let interimUserText = "";
|
||||
let interimAiText = "";
|
||||
|
||||
const targetSampleRate = 16000;
|
||||
const playbackStopRampSec = 0.008;
|
||||
|
||||
function logLine(type, text, data) {
  // Append one timestamped row to the event log and keep it scrolled down.
  const stamp = new Date().toLocaleTimeString();

  const badge = document.createElement("span");
  badge.className = `tag ${type}`;
  badge.textContent = type.toUpperCase();

  const body = document.createElement("span");
  body.style.marginLeft = "10px";
  body.textContent = `[${stamp}] ${text}`;

  const row = document.createElement("div");
  row.className = "log-entry";
  row.appendChild(badge);
  row.appendChild(body);

  if (data) {
    // Optional structured payload, rendered as one-line JSON below the text.
    const detail = document.createElement("div");
    detail.className = "muted";
    detail.textContent = JSON.stringify(data);
    detail.style.marginTop = "4px";
    row.appendChild(detail);
  }

  logEl.appendChild(row);
  logEl.scrollTop = logEl.scrollHeight;
}
|
||||
|
||||
function addChat(role, text) {
  // Render a finalized chat line, colour-coded by speaker.
  const roleClass = role === "AI" ? "ai" : "user";
  const bubble = document.createElement("div");
  bubble.className = `chat-entry ${roleClass}`;
  bubble.textContent = `${role}: ${text}`;
  chatHistory.appendChild(bubble);
  chatHistory.scrollTop = chatHistory.scrollHeight;
}
|
||||
|
||||
function setInterim(role, text) {
  // Show (or clear, when text is empty) the in-progress transcript line
  // for one speaker. A single interim element is kept per role.
  const isAi = role === "AI";
  let node = isAi ? interimAiEl : interimUserEl;

  if (!text) {
    // Empty text means "finalized": drop the interim element and its state.
    if (node) node.remove();
    if (isAi) {
      interimAiEl = null;
      interimAiText = "";
    } else {
      interimUserEl = null;
      interimUserText = "";
    }
    return;
  }

  if (!node) {
    // First interim for this role: create its placeholder bubble.
    node = document.createElement("div");
    node.className = `chat-entry ${isAi ? "ai" : "user"} interim`;
    chatHistory.appendChild(node);
    if (isAi) interimAiEl = node;
    else interimUserEl = node;
  }

  node.textContent = `${role} (interim): ${text}`;
  chatHistory.scrollTop = chatHistory.scrollHeight;
}
|
||||
|
||||
function stopPlayback() {
  // Halt all scheduled bot audio: fade each active source out over a short
  // gain ramp (to avoid clicks), then drop the tracking list and reset the
  // playback clock. Also flags incoming audio chunks for discard.
  discardAudio = true;
  const now = audioCtx ? audioCtx.currentTime : 0;
  playbackTime = now;

  for (const node of playbackSources) {
    try {
      if (audioCtx && node.gainNode && node.source) {
        const gain = node.gainNode.gain;
        gain.cancelScheduledValues(now);
        gain.setValueAtTime(gain.value || 1, now);
        gain.linearRampToValueAtTime(0, now + playbackStopRampSec);
        node.source.stop(now + playbackStopRampSec + 0.002);
      } else if (node.source) {
        node.source.stop();
      }
    } catch (err) {
      // Source may already have ended or been stopped; ignore.
    }
  }

  playbackSources = [];
}
|
||||
|
||||
function setStatus(connected, detail) {
  // Reflect connection state in the status indicator (dot, label, subtext).
  statusDot.classList.toggle("on", connected);
  if (connected) {
    statusText.textContent = "Connected";
  } else {
    statusText.textContent = "Disconnected";
  }
  statusSub.textContent = detail || "";
}
|
||||
|
||||
async function ensureAudioContext() {
  // Lazily create the shared AudioContext and route playback through a
  // MediaStreamDestination into an <audio> element, so the output device
  // can later be switched with setSinkId. Idempotent.
  if (audioCtx) return;

  const Ctx = window.AudioContext || window.webkitAudioContext;
  audioCtx = new Ctx();
  playbackDest = audioCtx.createMediaStreamDestination();
  audioOut.srcObject = playbackDest.stream;

  try {
    await audioOut.play();
  } catch (err) {
    // Autoplay policy: playback may require a prior user gesture.
    logLine("sys", "Audio playback blocked (user gesture needed)", { err: String(err) });
  }

  if (outputSelect.value) {
    await setOutputDevice(outputSelect.value);
  }
}
|
||||
|
||||
function downsampleBuffer(buffer, inRate, outRate) {
  // Resample `buffer` (Float32Array) from inRate to outRate by averaging
  // each output sample's covering window of input samples (simple
  // box-filter decimation). Returns the input unchanged when rates match.
  //
  // Fix: the original divided `accum / count` unconditionally; when the
  // window is empty (count === 0, which happens whenever outRate > inRate,
  // i.e. ratio < 1) every such output sample became NaN. We now fall back
  // to the nearest input sample for empty windows.
  if (outRate === inRate) return buffer;
  const ratio = inRate / outRate;
  const newLength = Math.round(buffer.length / ratio);
  const result = new Float32Array(newLength);
  let offsetResult = 0;
  let offsetBuffer = 0;
  while (offsetResult < result.length) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * ratio);
    let accum = 0;
    let count = 0;
    for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
      accum += buffer[i];
      count++;
    }
    if (count > 0) {
      result[offsetResult] = accum / count;
    } else {
      // Empty window: clamp to a valid index and reuse the nearest sample.
      const idx = Math.min(buffer.length - 1, Math.max(0, offsetBuffer));
      result[offsetResult] = buffer[idx] || 0;
    }
    offsetResult++;
    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
|
||||
|
||||
function floatTo16BitPCM(float32) {
  // Convert normalized [-1, 1] float samples to signed 16-bit PCM,
  // clamping out-of-range values first to avoid integer wraparound.
  const pcm = new Int16Array(float32.length);
  let idx = 0;
  for (const sample of float32) {
    const clamped = Math.min(1, Math.max(-1, sample));
    pcm[idx++] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  }
  return pcm;
}
|
||||
|
||||
function schedulePlayback(int16Data) {
  // Queue one PCM chunk for gapless playback: convert s16 → float, wrap it
  // in an AudioBuffer, and schedule it to start right where the previously
  // queued chunk ends. No-op before the audio graph exists or while audio
  // is being discarded (e.g. after an interruption).
  if (!audioCtx || !playbackDest) return;
  if (discardAudio) return;

  const samples = new Float32Array(int16Data.length);
  for (let i = 0; i < samples.length; i++) {
    samples[i] = int16Data[i] / 32768;
  }

  const buffer = audioCtx.createBuffer(1, samples.length, targetSampleRate);
  buffer.copyToChannel(samples, 0);

  const source = audioCtx.createBufferSource();
  const gainNode = audioCtx.createGain();
  source.buffer = buffer;
  source.connect(gainNode);
  gainNode.connect(playbackDest);

  // Small lead-in so the very first chunk isn't scheduled in the past.
  const startTime = Math.max(audioCtx.currentTime + 0.02, playbackTime);
  gainNode.gain.setValueAtTime(1, startTime);
  source.start(startTime);
  playbackTime = startTime + buffer.duration;

  const playbackNode = { source, gainNode };
  playbackSources.push(playbackNode);
  source.onended = () => {
    playbackSources = playbackSources.filter((s) => s !== playbackNode);
  };
}
|
||||
|
||||
async function connect() {
  // Open the WebSocket session and wire up its lifecycle handlers.
  // Text frames carry JSON control events; binary frames carry raw
  // 16 kHz mono s16le PCM for playback.
  //
  // Fix: the original called JSON.parse on every text frame unguarded; one
  // malformed frame would throw inside onmessage and abort event handling.
  if (ws && ws.readyState === WebSocket.OPEN) return;
  ws = new WebSocket(wsUrl.value.trim());
  ws.binaryType = "arraybuffer";

  ws.onopen = () => {
    setStatus(true, "Session open");
    logLine("sys", "WebSocket connected");
    ensureAudioContext();
    sendCommand({ type: "hello", version: "v1" });
  };

  ws.onclose = () => {
    setStatus(false, "Connection closed");
    logLine("sys", "WebSocket closed");
    ws = null;
  };

  ws.onerror = (err) => {
    logLine("sys", "WebSocket error", { err: String(err) });
  };

  ws.onmessage = (msg) => {
    if (typeof msg.data === "string") {
      let event;
      try {
        event = JSON.parse(msg.data);
      } catch (err) {
        // Don't let one bad frame kill the message handler.
        logLine("sys", "Malformed JSON frame from server", { err: String(err) });
        return;
      }
      handleEvent(event);
    } else {
      const audioBuf = msg.data;
      const int16 = new Int16Array(audioBuf);
      schedulePlayback(int16);
      logLine("audio", `Audio ${Math.round((int16.length / targetSampleRate) * 1000)}ms`);
    }
  };
}
|
||||
|
||||
function disconnect() {
  // Politely end the session (when still open) and reset connection state.
  const open = ws && ws.readyState === WebSocket.OPEN;
  if (open) {
    sendCommand({ type: "session.stop", reason: "client_disconnect" });
    ws.close();
  }
  ws = null;
  setStatus(false, "Disconnected");
}
|
||||
|
||||
function sendCommand(cmd) {
  // Serialize and send one JSON control command; every outbound command is
  // also echoed to the activity log for debugging.
  const connected = ws && ws.readyState === WebSocket.OPEN;
  if (!connected) {
    logLine("sys", "Not connected");
    return;
  }
  ws.send(JSON.stringify(cmd));
  logLine("sys", `→ ${cmd.type}`, cmd);
}
|
||||
|
||||
function handleEvent(event) {
  // Dispatch one server event. The original used a chain of exact-match
  // `if (type === ...)` tests, which are mutually exclusive, so a switch
  // is behaviorally identical. Unknown types are only logged.
  const type = event.type || "unknown";
  logLine("event", type, event);

  switch (type) {
    case "hello.ack":
      // Handshake complete: declare the audio format and start the session.
      sendCommand({
        type: "session.start",
        audio: { encoding: "pcm_s16le", sample_rate_hz: targetSampleRate, channels: 1 },
      });
      break;

    case "transcript.final":
      if (event.text) {
        setInterim("You", "");
        addChat("You", event.text);
      }
      break;

    case "transcript.delta":
      if (event.text) setInterim("You", event.text);
      break;

    case "assistant.response.final":
      if (event.text) {
        setInterim("AI", "");
        addChat("AI", event.text);
      }
      break;

    case "assistant.response.delta":
      if (event.text) {
        interimAiText += event.text;
        setInterim("AI", interimAiText);
      }
      break;

    case "output.audio.start":
      // New bot audio: stop any previous playback to avoid overlap.
      stopPlayback();
      discardAudio = false;
      interimAiText = "";
      break;

    case "input.speech_started":
      // User started speaking: clear any in-flight audio to avoid overlap.
      stopPlayback();
      break;

    case "response.interrupted":
      stopPlayback();
      break;
  }
}
|
||||
|
||||
async function startMic() {
  // Begin streaming microphone audio: capture via getUserMedia, downsample
  // to 16 kHz mono, convert to s16le PCM, and send each chunk over the
  // WebSocket.
  //
  // Fix: the original had no re-entry guard, so clicking "start" twice
  // stacked a second ScriptProcessor/stream (doubling the audio sent) and
  // leaked the first stream's tracks.
  if (!ws || ws.readyState !== WebSocket.OPEN) {
    logLine("sys", "Connect before starting mic");
    return;
  }
  if (micStream) {
    logLine("sys", "Microphone already running");
    return;
  }
  await ensureAudioContext();
  const deviceId = inputSelect.value || undefined;
  micStream = await navigator.mediaDevices.getUserMedia({
    audio: deviceId ? { deviceId: { exact: deviceId } } : true,
  });
  micSource = audioCtx.createMediaStreamSource(micStream);
  // NOTE: ScriptProcessorNode is deprecated (AudioWorklet is the modern
  // replacement) but keeps this client dependency-free and simple.
  processor = audioCtx.createScriptProcessor(2048, 1, 1);
  processor.onaudioprocess = (e) => {
    if (!ws || ws.readyState !== WebSocket.OPEN) return;
    const input = e.inputBuffer.getChannelData(0);
    const downsampled = downsampleBuffer(input, audioCtx.sampleRate, targetSampleRate);
    const pcm16 = floatTo16BitPCM(downsampled);
    ws.send(pcm16.buffer);
  };
  micSource.connect(processor);
  // ScriptProcessor must be connected to a destination to fire callbacks.
  processor.connect(audioCtx.destination);
  logLine("sys", "Microphone started");
}
|
||||
|
||||
function stopMic() {
  // Tear down the capture pipeline and release the microphone hardware.
  if (processor) {
    processor.disconnect();
    processor = null;
  }
  if (micSource) {
    micSource.disconnect();
    micSource = null;
  }
  if (micStream) {
    for (const track of micStream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }
  logLine("sys", "Microphone stopped");
}
|
||||
|
||||
async function refreshDevices() {
  // Repopulate the input/output device dropdowns from enumerateDevices.
  // Device labels are typically blank until mic permission is granted.
  const devices = await navigator.mediaDevices.enumerateDevices();
  inputSelect.innerHTML = "";
  outputSelect.innerHTML = "";
  for (const device of devices) {
    let select;
    let fallbackLabel;
    if (device.kind === "audioinput") {
      select = inputSelect;
      fallbackLabel = `Mic ${inputSelect.length + 1}`;
    } else if (device.kind === "audiooutput") {
      select = outputSelect;
      fallbackLabel = `Output ${outputSelect.length + 1}`;
    } else {
      continue;
    }
    const opt = document.createElement("option");
    opt.value = device.deviceId;
    opt.textContent = device.label || fallbackLabel;
    select.appendChild(opt);
  }
}
|
||||
|
||||
async function requestDeviceAccess() {
  // Prompt for microphone permission once so enumerateDevices can reveal
  // device labels, then immediately release the temporary stream.
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    for (const track of stream.getTracks()) {
      track.stop();
    }
    logLine("sys", "Microphone permission granted");
  } catch (err) {
    logLine("sys", "Microphone permission denied", { err: String(err) });
  }
}
|
||||
|
||||
async function setOutputDevice(deviceId) {
  // Route playback to the chosen output via HTMLMediaElement.setSinkId.
  // No-op (with a log line) on browsers without setSinkId support.
  //
  // Fix: setSinkId returns a promise that can reject (NotAllowedError,
  // unknown device). The <select> change listener calls this without a
  // .catch, so the original produced an unhandled promise rejection;
  // surface the failure in the log instead.
  if (!audioOut.setSinkId) {
    logLine("sys", "setSinkId not supported in this browser");
    return;
  }
  try {
    await audioOut.setSinkId(deviceId);
    logLine("sys", `Output device set`, { deviceId });
  } catch (err) {
    logLine("sys", "Failed to set output device", { deviceId, err: String(err) });
  }
}
|
||||
|
||||
// --- UI wiring: connect DOM controls to the session/audio functions -----
connectBtn.addEventListener("click", connect);
disconnectBtn.addEventListener("click", disconnect);

// Request permission first so the refreshed device lists include labels.
refreshDevicesBtn.addEventListener("click", async () => {
  await requestDeviceAccess();
  await refreshDevices();
});

startMicBtn.addEventListener("click", startMic);
stopMicBtn.addEventListener("click", stopMic);

// Send a typed chat message; also primes the AudioContext (user gesture).
sendChatBtn.addEventListener("click", () => {
  const text = chatInput.value.trim();
  if (!text) return;
  ensureAudioContext();
  addChat("You", text);
  sendCommand({ type: "input.text", text });
  chatInput.value = "";
});

// Clear both the activity log and chat history, including interim bubbles.
clearLogBtn.addEventListener("click", () => {
  logEl.innerHTML = "";
  chatHistory.innerHTML = "";
  setInterim("You", "");
  setInterim("AI", "");
  interimUserText = "";
  interimAiText = "";
});

// Hot-swap the capture device while the microphone is live.
inputSelect.addEventListener("change", () => {
  if (micStream) {
    stopMic();
    startMic();
  }
});

outputSelect.addEventListener("change", () => setOutputDevice(outputSelect.value));

// Keep device lists current on hardware changes, and populate them once on
// load (enumeration failures are ignored, e.g. without media permissions).
navigator.mediaDevices.addEventListener("devicechange", refreshDevices);
refreshDevices().catch(() => {});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user