I can use text to get audio response and barge in

This commit is contained in:
Xin Wang
2026-01-29 16:25:53 +08:00
parent cd90b4fb37
commit ac0c76e6e8
16 changed files with 3394 additions and 119 deletions

View File

@@ -1,137 +1,517 @@
#!/usr/bin/env python3
"""
Microphone WebSocket Client
Microphone client for testing duplex voice conversation.
Connects to the backend WebSocket endpoint and streams audio from the microphone.
Used to test VAD and EOU detection.
This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.
Dependencies:
pip install pyaudio aiohttp
Usage:
python examples/mic_client.py --url ws://localhost:8000/ws
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
Requirements:
pip install sounddevice soundfile websockets numpy
"""
import argparse
import asyncio
import aiohttp
import pyaudio
import json
import sys
from datetime import datetime
import threading
import queue
from pathlib import Path
# Configuration
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_DURATION_MS = 20
CHUNK_SIZE = int(SAMPLE_RATE * (CHUNK_DURATION_MS / 1000.0)) # 320 samples for 20ms
FORMAT = pyaudio.paInt16
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
async def send_audio_loop(ws, stream):
"""Read from microphone and send to WebSocket."""
print("🎙️ Microphone streaming started...")
try:
while True:
# Read non-blocking? PyAudio read is blocking, so run in executor or use specialized async lib.
# For simplicity in this script, we'll just read. It might block the event loop slightly
# but for 20ms chunks it's usually acceptable for a test script.
# To be proper async, we should run_in_executor.
data = await asyncio.get_event_loop().run_in_executor(
None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False)
try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice: pip install sounddevice")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class MicrophoneClient:
"""
Full-duplex microphone client for voice conversation.
Features:
- Real-time microphone capture
- Real-time speaker playback
- WebSocket communication
- Text chat support
"""
def __init__(
    self,
    url: str,
    sample_rate: int = 16000,
    chunk_duration_ms: int = 20,
    input_device: int = None,
    output_device: int = None
):
    """
    Store the client configuration and create empty buffers and counters.

    Only attributes are initialised here; nothing is connected or opened.

    Args:
        url: WebSocket server URL
        sample_rate: Audio sample rate (Hz)
        chunk_duration_ms: Audio chunk duration (ms)
        input_device: Input device ID (None for default)
        output_device: Output device ID (None for default)
    """
    # Endpoint and connection state
    self.url = url
    self.ws = None
    self.running = False

    # Audio format; chunk_samples is the number of samples per chunk
    self.sample_rate = sample_rate
    self.chunk_duration_ms = chunk_duration_ms
    self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)

    # Device selection
    self.input_device = input_device
    self.output_device = output_device

    # Captured chunks wait in a thread-safe queue; received audio is kept
    # as one contiguous bytes object guarded by a lock.
    self.audio_input_queue = queue.Queue()
    self.audio_output_lock = threading.Lock()
    self.audio_output_buffer = b""

    # Traffic counters (bytes)
    self.bytes_sent = 0
    self.bytes_received = 0

    # Capture and playback switches
    self.is_recording = True
    self.is_playing = True
async def connect(self) -> None:
    """Open the WebSocket, mark the client as running and send ``invite``."""
    print(f"Connecting to {self.url}...")
    self.ws = await websockets.connect(self.url)
    self.running = True
    print("Connected!")

    # The invite carries the codec name and the configured sample rate.
    invite_options = {"codec": "pcm", "sampleRate": self.sample_rate}
    await self.send_command({"command": "invite", "option": invite_options})
async def send_command(self, cmd: dict) -> None:
    """Serialise *cmd* to JSON and send it; do nothing when there is no socket."""
    if not self.ws:
        return
    payload = json.dumps(cmd)
    await self.ws.send(payload)
    print(f"→ Command: {cmd.get('command', 'unknown')}")
async def send_chat(self, text: str) -> None:
    """Send *text* to the server as a ``chat`` command and echo it locally."""
    chat_command = {"command": "chat", "text": text}
    await self.send_command(chat_command)
    print(f"→ Chat: {text}")
async def send_interrupt(self) -> None:
    """Send an ``interrupt`` command to the server."""
    interrupt_command = {"command": "interrupt"}
    await self.send_command(interrupt_command)
async def send_hangup(self, reason: str = "User quit") -> None:
    """Send a ``hangup`` command carrying *reason*."""
    hangup_command = {"command": "hangup", "reason": reason}
    await self.send_command(hangup_command)
def _audio_input_callback(self, indata, frames, time, status):
    """
    Convert one captured block to 16-bit PCM and queue it for sending.

    The block is dropped unless the client is both running and recording.

    Args:
        indata: Captured samples; column 0 is read as float audio.
        frames: Number of frames in ``indata`` (unused).
        time: Timing information supplied by the audio backend (unused).
        status: Backend status flags; printed when set.
    """
    if status:
        print(f"Input status: {status}")
    if not (self.is_recording and self.running):
        return

    # Clip before scaling: a sample outside [-1.0, 1.0] would exceed the
    # int16 range after multiplication and wrap around on the cast,
    # turning a loud peak into a sign-flipped click.
    mono = np.clip(indata[:, 0], -1.0, 1.0)
    audio_data = (mono * 32767).astype(np.int16).tobytes()
    self.audio_input_queue.put(audio_data)
def _add_audio_to_buffer(self, audio_data: bytes):
    """Append *audio_data* to the playback buffer while holding its lock."""
    with self.audio_output_lock:
        self.audio_output_buffer = b"".join((self.audio_output_buffer, audio_data))
async def _playback_task(self):
    """
    Keep an output stream open and feed it from the playback buffer.

    The stream pulls audio through a callback: when the buffer holds at
    least one full block it is converted from 16-bit PCM to float and
    played, otherwise the block is filled with silence. The stream stays
    open while ``self.running`` is true and is always stopped and closed
    on the way out, including when this task is cancelled.
    """
    chunk_samples = int(self.sample_rate * 0.05)  # 50ms chunks

    def output_callback(outdata, frames, time_info, status):
        """Fill one output block from the buffer, or with silence."""
        if status:
            print(f"Output status: {status}")
        bytes_needed = frames * 2  # 16-bit = 2 bytes per sample
        with self.audio_output_lock:
            if len(self.audio_output_buffer) >= bytes_needed:
                audio_data = self.audio_output_buffer[:bytes_needed]
                self.audio_output_buffer = self.audio_output_buffer[bytes_needed:]
                samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
                outdata[:, 0] = samples
            else:
                outdata.fill(0)

    output_stream = None
    try:
        output_stream = sd.OutputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype=np.float32,
            blocksize=chunk_samples,
            device=self.output_device,
            callback=output_callback,
            latency='low'
        )
        output_stream.start()
        # Compare against None so that device index 0 is not reported
        # as 'default'.
        device_label = self.output_device if self.output_device is not None else 'default'
        print(f"Audio output stream started (device: {device_label})")
        # Keep stream running while client is active
        while self.running:
            await asyncio.sleep(0.1)
    except Exception as e:
        print(f"Playback error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Task cancellation raises CancelledError, which `except Exception`
        # does not catch; releasing the stream here covers that path too.
        if output_stream is not None:
            try:
                output_stream.stop()
            finally:
                output_stream.close()
async def audio_sender(self) -> None:
    """
    Forward queued microphone chunks to the server while the client runs.

    Each chunk is fetched with a 0.1 s blocking ``get`` executed in the
    default executor. A chunk is sent, and counted in ``bytes_sent``, only
    when a socket exists and recording is enabled. The loop stops on
    cancellation or on the first unexpected error.
    """
    loop = asyncio.get_event_loop()
    while self.running:
        try:
            try:
                # Blocking get with timeout, run off the event loop thread
                audio_data = await loop.run_in_executor(
                    None, self.audio_input_queue.get, True, 0.1
                )
            except queue.Empty:
                continue

            if not (self.ws and self.is_recording):
                continue
            await self.ws.send(audio_data)
            self.bytes_sent += len(audio_data)
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f"Audio sender error: {e}")
            break
async def receiver(self) -> None:
    """
    Receive frames from the server until the client stops.

    Binary frames are counted in ``bytes_received`` and, when playback is
    enabled, appended to the playback buffer. Text frames are decoded as
    JSON and passed to ``_handle_event``; a text frame that is not valid
    JSON, or does not decode to an object, is logged and skipped instead
    of ending the session. A closed connection or any other unexpected
    error clears ``self.running``.
    """
    try:
        while self.running:
            try:
                # Short timeout so that self.running is re-checked regularly
                message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            except websockets.ConnectionClosed:
                print("Connection closed")
                self.running = False
                break

            if isinstance(message, bytes):
                # Audio data received
                self.bytes_received += len(message)
                if self.is_playing:
                    self._add_audio_to_buffer(message)
                # Show progress (less verbose)
                with self.audio_output_lock:
                    buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
                duration_ms = len(message) / (self.sample_rate * 2) * 1000
                print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
                continue

            # JSON event
            try:
                event = json.loads(message)
            except json.JSONDecodeError:
                # One malformed frame should not tear down the conversation
                print(f"← Text: {message}")
                continue
            if not isinstance(event, dict):
                # _handle_event looks keys up with .get(), so it needs an object
                print(f"← Ignored non-object message: {message}")
                continue
            await self._handle_event(event)
    except asyncio.CancelledError:
        pass
    except Exception as e:
        print(f"Receiver error: {e}")
        self.running = False
async def _handle_event(self, event: dict) -> None:
    """
    React to one decoded JSON event from the server.

    Most events are only logged. ``trackStart`` and ``interrupt`` also
    discard any audio still waiting in the playback buffer, and ``hangup``
    clears ``self.running``.

    Args:
        event: Decoded event object; its ``"event"`` key selects the action.
    """
    event_type = event.get("event", "unknown")

    # Events that print a fixed line
    log_lines = {
        "answer": "← Session ready!",
        "speaking": "← User speech detected",
        "silence": "← User silence detected",
        "trackStart": "← Bot started speaking",
        "trackEnd": "← Bot finished speaking",
        "interrupt": "← Bot interrupted!",
    }
    # Events after which already-buffered audio must not be played: a new
    # track should not start with leftovers, and an interrupted reply
    # should fall silent at once rather than drain the buffer.
    flush_playback = ("trackStart", "interrupt")

    if event_type == "error":
        print(f"← Error: {event.get('error')}")
    elif event_type == "hangup":
        print(f"← Hangup: {event.get('reason')}")
        self.running = False
    elif isinstance(event_type, str) and event_type in log_lines:
        print(log_lines[event_type])
        if event_type in flush_playback:
            with self.audio_output_lock:
                self.audio_output_buffer = b""
    else:
        print(f"← Event: {event_type}")
async def interactive_mode(self) -> None:
    """
    Read lines from stdin and dispatch them as commands or chat messages.

    Lines starting with ``/`` are client commands (``/quit``, ``/mute``,
    ``/unmute``, ``/interrupt``, ``/stats``); any other line is sent to the
    server as a chat message. Surrounding whitespace is ignored, so blank
    lines are skipped and a command typed with stray spaces still works.
    The loop ends on ``/quit`` or EOF; ``self.running`` is checked between
    lines.
    """
    print("\n" + "=" * 50)
    print("Voice Conversation Client")
    print("=" * 50)
    print("Speak into your microphone to talk to the AI.")
    print("Or type messages to send text.")
    print("")
    print("Commands:")
    print(" /quit - End conversation")
    print(" /mute - Mute microphone")
    print(" /unmute - Unmute microphone")
    print(" /interrupt - Interrupt AI speech")
    print(" /stats - Show statistics")
    print("=" * 50 + "\n")

    loop = asyncio.get_event_loop()
    while self.running:
        try:
            # input() blocks, so it runs in the default executor to keep
            # the event loop responsive.
            raw_line = await loop.run_in_executor(None, input, "")

            # Strip once up front: whitespace-only lines must not be sent
            # as chat, and " /quit" must still be treated as a command.
            user_input = raw_line.strip()
            if not user_input:
                continue

            if not user_input.startswith("/"):
                # Send as chat message
                await self.send_chat(user_input)
                continue

            # Handle commands
            cmd = user_input.lower()
            if cmd == "/quit":
                await self.send_hangup("User quit")
                break
            elif cmd == "/mute":
                self.is_recording = False
                print("Microphone muted")
            elif cmd == "/unmute":
                self.is_recording = True
                print("Microphone unmuted")
            elif cmd == "/interrupt":
                await self.send_interrupt()
            elif cmd == "/stats":
                print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
                print(f"Received: {self.bytes_received / 1024:.1f} KB")
            else:
                print(f"Unknown command: {cmd}")
        except EOFError:
            break
        except Exception as e:
            print(f"Input error: {e}")
async def run(self, chat_message: str = None, interactive: bool = True) -> None:
"""
Run the client.
Connects, starts the microphone input stream and the sender, receiver and
playback tasks, then either sends one chat message, runs interactive mode,
or idles until ``self.running`` is cleared. ``close()`` is always awaited
at the end.
Args:
chat_message: Optional single chat message to send
interactive: Whether to run in interactive mode
"""
# NOTE(review): indentation is missing throughout this listing and some
# statements below reference names this method never defines; those spans
# are flagged individually. Confirm against the real file before editing.
try:
await self.connect()
# Wait for answer
# Fixed 0.5 s pause; no event is actually awaited here.
await asyncio.sleep(0.5)
# Start audio input stream
print("Starting audio streams...")
input_stream = sd.InputStream(
samplerate=self.sample_rate,
channels=1,
dtype=np.float32,
blocksize=self.chunk_samples,
device=self.input_device,
callback=self._audio_input_callback
)
# NOTE(review): `ws` and `data` are not defined in this method. This
# statement and the comment after it look like leftovers from another
# sender implementation - confirm and remove.
await ws.send_bytes(data)
# No sleep needed here as microphone dictates the timing
input_stream.start()
print("Audio streams started")
# Start background tasks
sender_task = asyncio.create_task(self.audio_sender())
receiver_task = asyncio.create_task(self.receiver())
playback_task = asyncio.create_task(self._playback_task())
if chat_message:
# Send single message and wait
await self.send_chat(chat_message)
# Fixed 15 s wait; presumably long enough for the spoken reply - TODO confirm.
await asyncio.sleep(15)
elif interactive:
# Run interactive mode
await self.interactive_mode()
else:
# Just wait
while self.running:
await asyncio.sleep(0.1)
# Cleanup
# Clear the flag first so task loops that poll it exit, then cancel.
self.running = False
sender_task.cancel()
receiver_task.cancel()
playback_task.cancel()
# NOTE(review): the handler below and the whole `receive_loop(ws)` function
# that follows use aiohttp message types and a "send loop" error message,
# neither of which belongs to this method's flow. They look like a separate
# definition spliced into the middle of run() - confirm and remove.
except Exception as e:
print(f"❌ Error in send loop: {e}")
async def receive_loop(ws):
"""Listen for VAD/EOU events."""
print("👂 Listening for server events...")
async for msg in ws:
timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
if msg.type == aiohttp.WSMsgType.TEXT:
try:
data = json.loads(msg.data)
event = data.get('event')
# Highlight VAD/EOU events
if event == 'speaking':
print(f"[{timestamp}] 🗣️ SPEAKING STARTED")
elif event == 'silence':
print(f"[{timestamp}] 🤫 SILENCE DETECTED")
elif event == 'eou':
print(f"[{timestamp}] ✅ END OF UTTERANCE (EOU)")
elif event == 'error':
print(f"[{timestamp}] ❌ ERROR: {data.get('error')}")
else:
print(f"[{timestamp}] 📩 {event}: {str(data)[:100]}")
except json.JSONDecodeError:
print(f"[{timestamp}] 📄 Text: {msg.data}")
elif msg.type == aiohttp.WSMsgType.CLOSED:
print("❌ Connection closed")
break
elif msg.type == aiohttp.WSMsgType.ERROR:
print("❌ Connection error")
break
# Each cancelled task is awaited so its cancellation completes before the
# input stream is stopped; CancelledError from the await is expected.
# NOTE(review): no `try:` is visible directly before the first await,
# unlike the two that follow.
await sender_task
except asyncio.CancelledError:
pass
try:
await receiver_task
except asyncio.CancelledError:
pass
try:
await playback_task
except asyncio.CancelledError:
pass
# NOTE(review): stop() sits inside the try body, so it is skipped when an
# earlier statement raises, and the stream is never close()d here.
input_stream.stop()
except ConnectionRefusedError:
print(f"Error: Could not connect to {self.url}")
print("Make sure the server is running.")
except Exception as e:
print(f"Error: {e}")
finally:
await self.close()
async def close(self) -> None:
    """Stop the client, close the socket if one exists, and print totals."""
    self.running = False
    socket = self.ws
    if socket:
        await socket.close()

    sent_kb = self.bytes_sent / 1024
    received_kb = self.bytes_received / 1024
    print("\nSession ended")
    print(f" Total sent: {sent_kb:.1f} KB")
    print(f" Total received: {received_kb:.1f} KB")
def list_devices():
    """Print each audio device with its index, direction and default markers."""
    rule = "-" * 60
    print("\nAvailable audio devices:")
    print(rule)

    # (label, capability key) pairs; a device gets the label when the
    # corresponding channel count is positive.
    capabilities = (("IN", "max_input_channels"), ("OUT", "max_output_channels"))

    for index, info in enumerate(sd.query_devices()):
        labels = [label for label, key in capabilities if info[key] > 0]
        direction_str = "/".join(labels) or "N/A"

        markers = ""
        if index == sd.default.device[0]:
            markers += " [DEFAULT INPUT]"
        if index == sd.default.device[1]:
            markers += " [DEFAULT OUTPUT]"

        # Name is cut to 40 characters and padded to the same width
        print(f" {index:2d}: {info['name'][:40]:40s} ({direction_str}){markers}")

    print(rule)
async def main():
# Entry point: builds the command-line parser, then either lists audio
# devices or creates a MicrophoneClient and runs it.
# NOTE(review): indentation is missing in this listing, and statements from a
# pyaudio/aiohttp flow (`p`, `stream`, `session`, `ws`, SERVER_URL,
# receive_loop, send_audio_loop) are mixed line by line with the
# argparse/MicrophoneClient flow. The pyaudio/aiohttp spans are flagged
# below; confirm against the real file which ones are live.
# NOTE(review): pyaudio flow.
p = pyaudio.PyAudio()
parser = argparse.ArgumentParser(
description="Microphone client for duplex voice conversation"
)
parser.add_argument(
"--url",
default="ws://localhost:8000/ws",
help="WebSocket server URL"
)
# NOTE(review): the help text says "instead of using microphone", but
# MicrophoneClient.run() starts the microphone stream before it looks at
# chat_message - confirm which is intended.
parser.add_argument(
"--chat",
help="Send a single chat message instead of using microphone"
)
parser.add_argument(
"--sample-rate",
type=int,
default=16000,
help="Audio sample rate (default: 16000)"
)
parser.add_argument(
"--input-device",
type=int,
help="Input device ID"
)
parser.add_argument(
"--output-device",
type=int,
help="Output device ID"
)
parser.add_argument(
"--list-devices",
action="store_true",
help="List available audio devices and exit"
)
parser.add_argument(
"--no-interactive",
action="store_true",
help="Disable interactive mode"
)
# NOTE(review): pyaudio flow from here to the ClientSession line.
# Check for input devices
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
if numdevices == 0:
print("❌ No audio input devices found")
return
# Open microphone stream
try:
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=CHUNK_SIZE)
except Exception as e:
print(f"❌ Failed to open microphone: {e}")
return
session = aiohttp.ClientSession()
args = parser.parse_args()
# NOTE(review): aiohttp flow; it connects to the SERVER_URL constant and
# does not read args.url.
try:
print(f"🔌 Connecting to {SERVER_URL}...")
async with session.ws_connect(SERVER_URL) as ws:
print("✅ Connected!")
# --list-devices prints the device table and returns without creating a client.
if args.list_devices:
list_devices()
return
# chunk_duration_ms is not passed, so the client uses its own default.
client = MicrophoneClient(
url=args.url,
sample_rate=args.sample_rate,
input_device=args.input_device,
output_device=args.output_device
)
await client.run(
chat_message=args.chat,
interactive=not args.no_interactive
)
# NOTE(review): aiohttp flow from here to the end of the function. The
# invite built here uses the key "samplerate", while
# MicrophoneClient.connect() sends "sampleRate" - confirm which spelling the
# server expects.
# 1. Send Invite
invite_msg = {
"command": "invite",
"option": {
"codec": "pcm",
"samplerate": SAMPLE_RATE
}
}
await ws.send_json(invite_msg)
print("📤 Sent Invite")
# 2. Run loops
await asyncio.gather(
receive_loop(ws),
send_audio_loop(ws, stream)
)
except aiohttp.ClientConnectorError:
print(f"❌ Failed to connect to {SERVER_URL}. Is the server running?")
except KeyboardInterrupt:
print("\n👋 Stopping...")
finally:
stream.stop_stream()
stream.close()
p.terminate()
await session.close()
if __name__ == "__main__":
    # Run the async entry point; Ctrl+C ends the program with a short
    # message instead of a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")