#!/usr/bin/env python3
"""
Microphone client for testing duplex voice conversation.

This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.

Usage:
    python examples/mic_client.py --url ws://localhost:8000/ws
    python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"

Requirements:
    pip install sounddevice soundfile websockets numpy
"""

import argparse
import asyncio
import json
import sys
import threading
import queue
from pathlib import Path

try:
    import numpy as np
except ImportError:
    print("Please install numpy: pip install numpy")
    sys.exit(1)

try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice: pip install sounddevice")
    sys.exit(1)

try:
    import websockets
except ImportError:
    print("Please install websockets: pip install websockets")
    sys.exit(1)


class MicrophoneClient:
    """
    Full-duplex microphone client for voice conversation.

    Features:
    - Real-time microphone capture
    - Real-time speaker playback
    - WebSocket communication
    - Text chat support
    """

    def __init__(
        self,
        url: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        input_device: int = None,
        output_device: int = None
    ):
        """
        Initialize microphone client.

        Args:
            url: WebSocket server URL
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms)
            input_device: Input device ID (None for default)
            output_device: Output device ID (None for default)
        """
        self.url = url
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
        self.input_device = input_device
        self.output_device = output_device

        # WebSocket connection
        self.ws = None
        self.running = False

        # Audio buffers
        self.audio_input_queue = queue.Queue()
        self.audio_output_buffer = b""  # Continuous buffer for smooth playback
        self.audio_output_lock = threading.Lock()

        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0

        # State
        self.is_recording = True
        self.is_playing = True

    async def connect(self) -> None:
        """Connect to WebSocket server."""
        print(f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        print("Connected!")

        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send JSON command to server."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            print(f"→ Command: {cmd.get('command', 'unknown')}")

    async def send_chat(self, text: str) -> None:
        """Send chat message (text input)."""
        await self.send_command({
            "command": "chat",
            "text": text
        })
        print(f"→ Chat: {text}")

    async def send_interrupt(self) -> None:
        """Send interrupt command."""
        await self.send_command({
            "command": "interrupt"
        })

    async def send_hangup(self, reason: str = "User quit") -> None:
        """Send hangup command."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def _audio_input_callback(self, indata, frames, time, status):
        """Callback for audio input (microphone)."""
        if status:
            print(f"Input status: {status}")
        if self.is_recording and self.running:
            # Convert to 16-bit PCM
            audio_data = (indata[:, 0] * 32767).astype(np.int16).tobytes()
            self.audio_input_queue.put(audio_data)

    def _add_audio_to_buffer(self, audio_data: bytes):
        """Add audio data to playback buffer."""
        with self.audio_output_lock:
            self.audio_output_buffer += audio_data

    async def _playback_task(self):
        """Background task to play buffered audio smoothly using an output stream."""
        # Use a continuous output stream for smooth playback
        chunk_samples = int(self.sample_rate * 0.05)  # 50ms chunks
        chunk_bytes = chunk_samples * 2  # 16-bit = 2 bytes per sample

        def output_callback(outdata, frames, time_info, status):
            """Audio output callback."""
            if status:
                print(f"Output status: {status}")

            bytes_needed = frames * 2
            with self.audio_output_lock:
                if len(self.audio_output_buffer) >= bytes_needed:
                    audio_data = self.audio_output_buffer[:bytes_needed]
                    self.audio_output_buffer = self.audio_output_buffer[bytes_needed:]
                    samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
                    outdata[:, 0] = samples
                else:
                    outdata.fill(0)

        # Create and start output stream
        try:
            output_stream = sd.OutputStream(
                samplerate=self.sample_rate,
                channels=1,
                dtype=np.float32,
                blocksize=chunk_samples,
                device=self.output_device,
                callback=output_callback,
                latency='low'
            )
            output_stream.start()
            print(f"Audio output stream started (device: {self.output_device or 'default'})")

            # Keep stream running while client is active
            while self.running:
                await asyncio.sleep(0.1)

            output_stream.stop()
            output_stream.close()
        except Exception as e:
            print(f"Playback error: {e}")
            import traceback
            traceback.print_exc()

    async def audio_sender(self) -> None:
        """Send audio from microphone to server."""
        while self.running:
            try:
                # Get audio from queue with timeout
                try:
                    audio_data = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: self.audio_input_queue.get(timeout=0.1)
                    )
                except queue.Empty:
                    continue

                # Send to server
                if self.ws and self.is_recording:
                    await self.ws.send(audio_data)
                    self.bytes_sent += len(audio_data)

            except asyncio.CancelledError:
                break
            except Exception as e:
                print(f"Audio sender error: {e}")
                break

    async def receiver(self) -> None:
        """Receive messages from server."""
        try:
            while self.running:
                try:
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)

                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        if self.is_playing:
                            self._add_audio_to_buffer(message)

                            # Show progress (less verbose)
                            with self.audio_output_lock:
                                buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
                            duration_ms = len(message) / (self.sample_rate * 2) * 1000
                            print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
                    else:
                        # JSON event
                        event = json.loads(message)
                        await self._handle_event(event)

                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    print("Connection closed")
                    self.running = False
                    break
        except asyncio.CancelledError:
            pass
        except Exception as e:
            print(f"Receiver error: {e}")
            self.running = False

    async def _handle_event(self, event: dict) -> None:
        """Handle incoming event."""
        event_type = event.get("event", "unknown")

        if event_type == "answer":
            print("← Session ready!")
        elif event_type == "speaking":
            print("← User speech detected")
        elif event_type == "silence":
            print("← User silence detected")
        elif event_type == "trackStart":
            print("← Bot started speaking")
            # Clear any old audio in buffer
            with self.audio_output_lock:
                self.audio_output_buffer = b""
        elif event_type == "trackEnd":
            print("← Bot finished speaking")
        elif event_type == "interrupt":
            print("← Bot interrupted!")
        elif event_type == "error":
            print(f"← Error: {event.get('error')}")
        elif event_type == "hangup":
            print(f"← Hangup: {event.get('reason')}")
            self.running = False
        else:
            print(f"← Event: {event_type}")

    async def interactive_mode(self) -> None:
        """Run interactive mode for text chat."""
        print("\n" + "=" * 50)
        print("Voice Conversation Client")
print("=" * 50) print("Speak into your microphone to talk to the AI.") print("Or type messages to send text.") print("") print("Commands:") print(" /quit - End conversation") print(" /mute - Mute microphone") print(" /unmute - Unmute microphone") print(" /interrupt - Interrupt AI speech") print(" /stats - Show statistics") print("=" * 50 + "\n") while self.running: try: user_input = await asyncio.get_event_loop().run_in_executor( None, input, "" ) if not user_input: continue # Handle commands if user_input.startswith("/"): cmd = user_input.lower().strip() if cmd == "/quit": await self.send_hangup("User quit") break elif cmd == "/mute": self.is_recording = False print("Microphone muted") elif cmd == "/unmute": self.is_recording = True print("Microphone unmuted") elif cmd == "/interrupt": await self.send_interrupt() elif cmd == "/stats": print(f"Sent: {self.bytes_sent / 1024:.1f} KB") print(f"Received: {self.bytes_received / 1024:.1f} KB") else: print(f"Unknown command: {cmd}") else: # Send as chat message await self.send_chat(user_input) except EOFError: break except Exception as e: print(f"Input error: {e}") async def run(self, chat_message: str = None, interactive: bool = True) -> None: """ Run the client. Args: chat_message: Optional single chat message to send interactive: Whether to run in interactive mode """ try: await self.connect() # Wait for answer await asyncio.sleep(0.5) # Start audio input stream print("Starting audio streams...") input_stream = sd.InputStream( samplerate=self.sample_rate, channels=1, dtype=np.float32, blocksize=self.chunk_samples, device=self.input_device, callback=self._audio_input_callback ) input_stream.start() print("Audio streams started") # Start background tasks sender_task = asyncio.create_task(self.audio_sender()) receiver_task = asyncio.create_task(self.receiver()) playback_task = asyncio.create_task(self._playback_task()) if chat_message: # Send single message and wait await self.send_chat(chat_message) await asyncio.sleep(15) elif interactive: # Run interactive mode await self.interactive_mode() else: # Just wait while self.running: await asyncio.sleep(0.1) # Cleanup self.running = False sender_task.cancel() receiver_task.cancel() playback_task.cancel() try: await sender_task except asyncio.CancelledError: pass try: await receiver_task except asyncio.CancelledError: pass try: await playback_task except asyncio.CancelledError: pass input_stream.stop() except ConnectionRefusedError: print(f"Error: Could not connect to {self.url}") print("Make sure the server is running.") except Exception as e: print(f"Error: {e}") finally: await self.close() async def close(self) -> None: """Close the connection.""" self.running = False if self.ws: await self.ws.close() print(f"\nSession ended") print(f" Total sent: {self.bytes_sent / 1024:.1f} KB") print(f" Total received: {self.bytes_received / 1024:.1f} KB") def list_devices(): """List available audio devices.""" print("\nAvailable audio devices:") print("-" * 60) devices = sd.query_devices() for i, device in enumerate(devices): direction = [] if device['max_input_channels'] > 0: direction.append("IN") if device['max_output_channels'] > 0: direction.append("OUT") direction_str = "/".join(direction) if direction else "N/A" default = "" if i == sd.default.device[0]: default += " [DEFAULT INPUT]" if i == sd.default.device[1]: default += " [DEFAULT OUTPUT]" print(f" {i:2d}: {device['name'][:40]:40s} ({direction_str}){default}") print("-" * 60) async def main(): parser = argparse.ArgumentParser( 
description="Microphone client for duplex voice conversation" ) parser.add_argument( "--url", default="ws://localhost:8000/ws", help="WebSocket server URL" ) parser.add_argument( "--chat", help="Send a single chat message instead of using microphone" ) parser.add_argument( "--sample-rate", type=int, default=16000, help="Audio sample rate (default: 16000)" ) parser.add_argument( "--input-device", type=int, help="Input device ID" ) parser.add_argument( "--output-device", type=int, help="Output device ID" ) parser.add_argument( "--list-devices", action="store_true", help="List available audio devices and exit" ) parser.add_argument( "--no-interactive", action="store_true", help="Disable interactive mode" ) args = parser.parse_args() if args.list_devices: list_devices() return client = MicrophoneClient( url=args.url, sample_rate=args.sample_rate, input_device=args.input_device, output_device=args.output_device ) await client.run( chat_message=args.chat, interactive=not args.no_interactive ) if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: print("\nInterrupted by user")