#!/usr/bin/env python3
"""
Simple WebSocket client for testing voice conversation.
Uses PyAudio for more reliable audio playback on Windows.

Usage:
    python examples/simple_client.py
    python examples/simple_client.py --text "Hello"
"""

import argparse
import asyncio
import json
import sys
import time
import wave
import io
from typing import Optional

try:
    import numpy as np
except ImportError:
    print("pip install numpy")
    sys.exit(1)

try:
    import websockets
except ImportError:
    print("pip install websockets")
    sys.exit(1)

# Try PyAudio first (more reliable on Windows)
try:
    import pyaudio
    PYAUDIO_AVAILABLE = True
except ImportError:
    PYAUDIO_AVAILABLE = False
    print("PyAudio not available, trying sounddevice...")

try:
    import sounddevice as sd
    SD_AVAILABLE = True
except ImportError:
    SD_AVAILABLE = False

if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
    print("Please install pyaudio or sounddevice:")
    print(" pip install pyaudio")
    print(" or: pip install sounddevice")
    sys.exit(1)


class SimpleVoiceClient:
    """Simple voice client with reliable audio playback.

    Connects to a WebSocket server, sends an ``invite`` followed by ``chat``
    commands, plays every binary frame it receives as 16-bit mono PCM and
    prints every JSON event it receives.
    """

    def __init__(self, url: str, sample_rate: int = 16000):
        """Store connection settings and prepare the audio backend.

        Args:
            url: WebSocket URL of the server.
            sample_rate: Sample rate in Hz, sent in the invite and used for
                playback.
        """
        self.url = url
        self.sample_rate = sample_rate
        self.ws = None
        self.running = False

        # Audio buffer
        self.audio_buffer = b""

        # PyAudio setup. Both attributes always exist so that close() never
        # has to guess which backend was selected.
        self.pa = pyaudio.PyAudio() if PYAUDIO_AVAILABLE else None
        self.stream = None  # Opened lazily on the first audio frame.

        # Stats
        self.bytes_received = 0

        # TTFB tracking (Time to First Byte)
        self.request_start_time = None
        self.first_audio_received = False

        # Interrupt handling - discard audio until next trackStart
        self._discard_audio = False

    async def connect(self):
        """Connect to the server and send the initial ``invite`` command."""
        print(f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        print("Connected!")

        # Send invite
        await self.ws.send(json.dumps({
            "command": "invite",
            "option": {"codec": "pcm", "sampleRate": self.sample_rate}
        }))
        print("-> invite")

    async def send_chat(self, text: str):
        """Send a ``chat`` command and restart the TTFB measurement.

        Args:
            text: Message text to send.
        """
        # Reset TTFB tracking for new request. A monotonic clock is used
        # because only the elapsed interval matters and wall-clock time can
        # jump (NTP, manual adjustment).
        self.request_start_time = time.monotonic()
        self.first_audio_received = False

        await self.ws.send(json.dumps({"command": "chat", "text": text}))
        print(f"-> chat: {text}")

    def play_audio(self, audio_data: bytes):
        """Play audio data immediately (blocking).

        Args:
            audio_data: Raw 16-bit mono PCM samples. Empty input is ignored.
        """
        if not audio_data:
            return

        if PYAUDIO_AVAILABLE:
            # Use PyAudio - more reliable on Windows
            if self.stream is None:
                self.stream = self.pa.open(
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=self.sample_rate,
                    output=True,
                    frames_per_buffer=1024
                )
            self.stream.write(audio_data)
        elif SD_AVAILABLE:
            # Use sounddevice, which expects float samples.
            samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
            sd.play(samples, self.sample_rate, blocking=True)

    async def _handle_audio(self, msg: bytes):
        """Account for, report and play one binary audio frame."""
        self.bytes_received += len(msg)
        # 2 bytes per sample (16-bit mono).
        duration_ms = len(msg) / (self.sample_rate * 2) * 1000

        # Check if we should discard this audio (after interrupt)
        if self._discard_audio:
            print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms) [DISCARDED]")
            return

        # Calculate and display TTFB for first audio packet
        if not self.first_audio_received and self.request_start_time is not None:
            client_ttfb_ms = (time.monotonic() - self.request_start_time) * 1000
            self.first_audio_received = True
            print(f"<- [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")

        print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)")

        # Play in an executor thread so the blocking write does not stall
        # the event loop.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self.play_audio, msg)

    def _handle_event(self, msg):
        """Decode one text frame as a JSON event and react to it.

        Frames that are not valid JSON objects are reported and skipped so a
        single bad message cannot terminate the receive loop.
        """
        try:
            event = json.loads(msg)
        except json.JSONDecodeError:
            print(f"<- [ignored] non-JSON message: {msg!r}")
            return
        if not isinstance(event, dict):
            print(f"<- [ignored] unexpected JSON payload: {event!r}")
            return

        etype = event.get("event", "?")

        if etype == "transcript":
            # User speech transcription
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                print(f"<- You said: {text}")
            else:
                print(f"<- [listening] {text}", end="\r")
        elif etype == "ttfb":
            # Server-side TTFB event
            latency_ms = event.get("latencyMs", 0)
            print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
        elif etype == "trackStart":
            # New track starting - accept audio again
            self._discard_audio = False
            print(f"<- {etype}")
        elif etype == "interrupt":
            # Interrupt - discard audio until next trackStart
            self._discard_audio = True
            print(f"<- {etype} (discarding audio until new track)")
        elif etype == "hangup":
            print(f"<- {etype}")
            # Clearing the flag ends the loop in receive_loop().
            self.running = False
        else:
            print(f"<- {etype}")

    async def receive_loop(self):
        """Receive frames until stopped, playing audio and printing events.

        The loop ends when ``self.running`` becomes false (for example after
        a ``hangup`` event) or when the connection is closed.
        """
        print("\nWaiting for response...")

        while self.running:
            try:
                # Short timeout so that self.running is re-checked regularly.
                msg = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            except websockets.ConnectionClosed:
                print("Connection closed")
                self.running = False
                break

            if isinstance(msg, bytes):
                # Audio data
                await self._handle_audio(msg)
            else:
                # JSON event
                self._handle_event(msg)

    async def run(self, text: Optional[str] = None):
        """Run the client.

        Args:
            text: When given, send this single message and wait up to 30
                seconds for the response; otherwise read messages
                interactively from stdin until ``quit`` or EOF.
        """
        try:
            await self.connect()
            await asyncio.sleep(0.5)

            # Start receiver
            recv_task = asyncio.create_task(self.receive_loop())

            if text:
                await self.send_chat(text)
                # Wait for the response, but return as soon as the receiver
                # finishes (hangup / connection closed) instead of always
                # sleeping for the full period.
                await asyncio.wait([recv_task], timeout=30)
            else:
                # Interactive mode
                print("\nType a message and press Enter (or 'quit' to exit):")
                loop = asyncio.get_running_loop()
                while self.running:
                    try:
                        # input() blocks, so it runs in an executor thread.
                        user_input = await loop.run_in_executor(
                            None, input, "> "
                        )
                        if user_input.lower() == 'quit':
                            break
                        if user_input.strip():
                            await self.send_chat(user_input)
                    except EOFError:
                        break

            self.running = False
            recv_task.cancel()
            try:
                await recv_task
            except asyncio.CancelledError:
                pass

        finally:
            await self.close()

    async def close(self):
        """Close the audio backend and the WebSocket, then print statistics.

        Safe to call more than once; the WebSocket is closed even if tearing
        down the audio backend raises.
        """
        self.running = False

        try:
            if self.stream is not None:
                self.stream.stop_stream()
                self.stream.close()
                self.stream = None
            if self.pa is not None:
                self.pa.terminate()
                self.pa = None
        finally:
            if self.ws is not None:
                await self.ws.close()

            print(f"\nTotal audio received: {self.bytes_received / 1024:.1f} KB")


def list_audio_devices():
    """List available audio output devices for every installed backend."""
    print("\n=== Audio Devices ===")

    if PYAUDIO_AVAILABLE:
        pa = pyaudio.PyAudio()
        try:
            # Look the default up once; PyAudio raises IOError (OSError) when
            # the host has no default output device.
            try:
                default_index = pa.get_default_output_device_info()['index']
            except OSError:
                default_index = None

            print("\nPyAudio devices:")
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info['maxOutputChannels'] > 0:
                    default = " [DEFAULT]" if i == default_index else ""
                    print(f" {i}: {info['name']}{default}")
        finally:
            pa.terminate()

    if SD_AVAILABLE:
        print("\nSounddevice devices:")
        # sd.default.device is an (input, output) pair; index 1 is output.
        default_output = sd.default.device[1]
        for i, d in enumerate(sd.query_devices()):
            if d['max_output_channels'] > 0:
                default = " [DEFAULT]" if i == default_output else ""
                print(f" {i}: {d['name']}{default}")


async def main():
    """Parse command-line arguments and run the requested action."""
    parser = argparse.ArgumentParser(description="Simple voice client")
    parser.add_argument("--url", default="ws://localhost:8000/ws")
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
    args = parser.parse_args()

    if args.list_devices:
        list_audio_devices()
        return

    client = SimpleVoiceClient(args.url, args.sample_rate)
    await client.run(args.text)


if __name__ == "__main__":
    asyncio.run(main())