I can use text to get audio response and barge in
This commit is contained in:
@@ -1,137 +1,517 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Microphone WebSocket Client
|
||||
Microphone client for testing duplex voice conversation.
|
||||
|
||||
Connects to the backend WebSocket endpoint and streams audio from the microphone.
|
||||
Used to test VAD and EOU detection.
|
||||
This client captures audio from the microphone, sends it to the server,
|
||||
and plays back the AI's voice response through the speakers.
|
||||
|
||||
Dependencies:
|
||||
pip install pyaudio aiohttp
|
||||
Usage:
|
||||
python examples/mic_client.py --url ws://localhost:8000/ws
|
||||
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
|
||||
|
||||
Requirements:
|
||||
pip install sounddevice soundfile websockets numpy
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import pyaudio
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import threading
|
||||
import queue
|
||||
from pathlib import Path
|
||||
|
||||
# Configuration
|
||||
SERVER_URL = "ws://localhost:8000/ws"
|
||||
SAMPLE_RATE = 16000
|
||||
CHANNELS = 1
|
||||
CHUNK_DURATION_MS = 20
|
||||
CHUNK_SIZE = int(SAMPLE_RATE * (CHUNK_DURATION_MS / 1000.0)) # 320 samples for 20ms
|
||||
FORMAT = pyaudio.paInt16
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
print("Please install numpy: pip install numpy")
|
||||
sys.exit(1)
|
||||
|
||||
async def send_audio_loop(ws, stream):
|
||||
"""Read from microphone and send to WebSocket."""
|
||||
print("🎙️ Microphone streaming started...")
|
||||
try:
|
||||
while True:
|
||||
# Read non-blocking? PyAudio read is blocking, so run in executor or use specialized async lib.
|
||||
# For simplicity in this script, we'll just read. It might block the event loop slightly
|
||||
# but for 20ms chunks it's usually acceptable for a test script.
|
||||
# To be proper async, we should run_in_executor.
|
||||
data = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False)
|
||||
try:
|
||||
import sounddevice as sd
|
||||
except ImportError:
|
||||
print("Please install sounddevice: pip install sounddevice")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import websockets
|
||||
except ImportError:
|
||||
print("Please install websockets: pip install websockets")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class MicrophoneClient:
|
||||
"""
|
||||
Full-duplex microphone client for voice conversation.
|
||||
|
||||
Features:
|
||||
- Real-time microphone capture
|
||||
- Real-time speaker playback
|
||||
- WebSocket communication
|
||||
- Text chat support
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
sample_rate: int = 16000,
|
||||
chunk_duration_ms: int = 20,
|
||||
input_device: int = None,
|
||||
output_device: int = None
|
||||
):
|
||||
"""
|
||||
Initialize microphone client.
|
||||
|
||||
Args:
|
||||
url: WebSocket server URL
|
||||
sample_rate: Audio sample rate (Hz)
|
||||
chunk_duration_ms: Audio chunk duration (ms)
|
||||
input_device: Input device ID (None for default)
|
||||
output_device: Output device ID (None for default)
|
||||
"""
|
||||
self.url = url
|
||||
self.sample_rate = sample_rate
|
||||
self.chunk_duration_ms = chunk_duration_ms
|
||||
self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
|
||||
self.input_device = input_device
|
||||
self.output_device = output_device
|
||||
|
||||
# WebSocket connection
|
||||
self.ws = None
|
||||
self.running = False
|
||||
|
||||
# Audio buffers
|
||||
self.audio_input_queue = queue.Queue()
|
||||
self.audio_output_buffer = b"" # Continuous buffer for smooth playback
|
||||
self.audio_output_lock = threading.Lock()
|
||||
|
||||
# Statistics
|
||||
self.bytes_sent = 0
|
||||
self.bytes_received = 0
|
||||
|
||||
# State
|
||||
self.is_recording = True
|
||||
self.is_playing = True
|
||||
|
||||
async def connect(self) -> None:
|
||||
"""Connect to WebSocket server."""
|
||||
print(f"Connecting to {self.url}...")
|
||||
self.ws = await websockets.connect(self.url)
|
||||
self.running = True
|
||||
print("Connected!")
|
||||
|
||||
# Send invite command
|
||||
await self.send_command({
|
||||
"command": "invite",
|
||||
"option": {
|
||||
"codec": "pcm",
|
||||
"sampleRate": self.sample_rate
|
||||
}
|
||||
})
|
||||
|
||||
async def send_command(self, cmd: dict) -> None:
|
||||
"""Send JSON command to server."""
|
||||
if self.ws:
|
||||
await self.ws.send(json.dumps(cmd))
|
||||
print(f"→ Command: {cmd.get('command', 'unknown')}")
|
||||
|
||||
async def send_chat(self, text: str) -> None:
|
||||
"""Send chat message (text input)."""
|
||||
await self.send_command({
|
||||
"command": "chat",
|
||||
"text": text
|
||||
})
|
||||
print(f"→ Chat: {text}")
|
||||
|
||||
async def send_interrupt(self) -> None:
|
||||
"""Send interrupt command."""
|
||||
await self.send_command({
|
||||
"command": "interrupt"
|
||||
})
|
||||
|
||||
async def send_hangup(self, reason: str = "User quit") -> None:
|
||||
"""Send hangup command."""
|
||||
await self.send_command({
|
||||
"command": "hangup",
|
||||
"reason": reason
|
||||
})
|
||||
|
||||
def _audio_input_callback(self, indata, frames, time, status):
|
||||
"""Callback for audio input (microphone)."""
|
||||
if status:
|
||||
print(f"Input status: {status}")
|
||||
|
||||
if self.is_recording and self.running:
|
||||
# Convert to 16-bit PCM
|
||||
audio_data = (indata[:, 0] * 32767).astype(np.int16).tobytes()
|
||||
self.audio_input_queue.put(audio_data)
|
||||
|
||||
def _add_audio_to_buffer(self, audio_data: bytes):
|
||||
"""Add audio data to playback buffer."""
|
||||
with self.audio_output_lock:
|
||||
self.audio_output_buffer += audio_data
|
||||
|
||||
async def _playback_task(self):
|
||||
"""Background task to play buffered audio smoothly using output stream."""
|
||||
# Use a continuous output stream for smooth playback
|
||||
chunk_samples = int(self.sample_rate * 0.05) # 50ms chunks
|
||||
chunk_bytes = chunk_samples * 2 # 16-bit = 2 bytes per sample
|
||||
|
||||
def output_callback(outdata, frames, time_info, status):
|
||||
"""Audio output callback."""
|
||||
if status:
|
||||
print(f"Output status: {status}")
|
||||
|
||||
bytes_needed = frames * 2
|
||||
with self.audio_output_lock:
|
||||
if len(self.audio_output_buffer) >= bytes_needed:
|
||||
audio_data = self.audio_output_buffer[:bytes_needed]
|
||||
self.audio_output_buffer = self.audio_output_buffer[bytes_needed:]
|
||||
samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
|
||||
outdata[:, 0] = samples
|
||||
else:
|
||||
outdata.fill(0)
|
||||
|
||||
# Create and start output stream
|
||||
try:
|
||||
output_stream = sd.OutputStream(
|
||||
samplerate=self.sample_rate,
|
||||
channels=1,
|
||||
dtype=np.float32,
|
||||
blocksize=chunk_samples,
|
||||
device=self.output_device,
|
||||
callback=output_callback,
|
||||
latency='low'
|
||||
)
|
||||
output_stream.start()
|
||||
print(f"Audio output stream started (device: {self.output_device or 'default'})")
|
||||
|
||||
# Keep stream running while client is active
|
||||
while self.running:
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
output_stream.stop()
|
||||
output_stream.close()
|
||||
|
||||
except Exception as e:
|
||||
print(f"Playback error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
async def audio_sender(self) -> None:
|
||||
"""Send audio from microphone to server."""
|
||||
while self.running:
|
||||
try:
|
||||
# Get audio from queue with timeout
|
||||
try:
|
||||
audio_data = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: self.audio_input_queue.get(timeout=0.1)
|
||||
)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
# Send to server
|
||||
if self.ws and self.is_recording:
|
||||
await self.ws.send(audio_data)
|
||||
self.bytes_sent += len(audio_data)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Audio sender error: {e}")
|
||||
break
|
||||
|
||||
async def receiver(self) -> None:
|
||||
"""Receive messages from server."""
|
||||
try:
|
||||
while self.running:
|
||||
try:
|
||||
message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
|
||||
|
||||
if isinstance(message, bytes):
|
||||
# Audio data received
|
||||
self.bytes_received += len(message)
|
||||
|
||||
if self.is_playing:
|
||||
self._add_audio_to_buffer(message)
|
||||
|
||||
# Show progress (less verbose)
|
||||
with self.audio_output_lock:
|
||||
buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
|
||||
duration_ms = len(message) / (self.sample_rate * 2) * 1000
|
||||
print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
|
||||
|
||||
else:
|
||||
# JSON event
|
||||
event = json.loads(message)
|
||||
await self._handle_event(event)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except websockets.ConnectionClosed:
|
||||
print("Connection closed")
|
||||
self.running = False
|
||||
break
|
||||
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"Receiver error: {e}")
|
||||
self.running = False
|
||||
|
||||
async def _handle_event(self, event: dict) -> None:
|
||||
"""Handle incoming event."""
|
||||
event_type = event.get("event", "unknown")
|
||||
|
||||
if event_type == "answer":
|
||||
print("← Session ready!")
|
||||
elif event_type == "speaking":
|
||||
print("← User speech detected")
|
||||
elif event_type == "silence":
|
||||
print("← User silence detected")
|
||||
elif event_type == "trackStart":
|
||||
print("← Bot started speaking")
|
||||
# Clear any old audio in buffer
|
||||
with self.audio_output_lock:
|
||||
self.audio_output_buffer = b""
|
||||
elif event_type == "trackEnd":
|
||||
print("← Bot finished speaking")
|
||||
elif event_type == "interrupt":
|
||||
print("← Bot interrupted!")
|
||||
elif event_type == "error":
|
||||
print(f"← Error: {event.get('error')}")
|
||||
elif event_type == "hangup":
|
||||
print(f"← Hangup: {event.get('reason')}")
|
||||
self.running = False
|
||||
else:
|
||||
print(f"← Event: {event_type}")
|
||||
|
||||
async def interactive_mode(self) -> None:
|
||||
"""Run interactive mode for text chat."""
|
||||
print("\n" + "=" * 50)
|
||||
print("Voice Conversation Client")
|
||||
print("=" * 50)
|
||||
print("Speak into your microphone to talk to the AI.")
|
||||
print("Or type messages to send text.")
|
||||
print("")
|
||||
print("Commands:")
|
||||
print(" /quit - End conversation")
|
||||
print(" /mute - Mute microphone")
|
||||
print(" /unmute - Unmute microphone")
|
||||
print(" /interrupt - Interrupt AI speech")
|
||||
print(" /stats - Show statistics")
|
||||
print("=" * 50 + "\n")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
user_input = await asyncio.get_event_loop().run_in_executor(
|
||||
None, input, ""
|
||||
)
|
||||
|
||||
if not user_input:
|
||||
continue
|
||||
|
||||
# Handle commands
|
||||
if user_input.startswith("/"):
|
||||
cmd = user_input.lower().strip()
|
||||
|
||||
if cmd == "/quit":
|
||||
await self.send_hangup("User quit")
|
||||
break
|
||||
elif cmd == "/mute":
|
||||
self.is_recording = False
|
||||
print("Microphone muted")
|
||||
elif cmd == "/unmute":
|
||||
self.is_recording = True
|
||||
print("Microphone unmuted")
|
||||
elif cmd == "/interrupt":
|
||||
await self.send_interrupt()
|
||||
elif cmd == "/stats":
|
||||
print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
|
||||
print(f"Received: {self.bytes_received / 1024:.1f} KB")
|
||||
else:
|
||||
print(f"Unknown command: {cmd}")
|
||||
else:
|
||||
# Send as chat message
|
||||
await self.send_chat(user_input)
|
||||
|
||||
except EOFError:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Input error: {e}")
|
||||
|
||||
async def run(self, chat_message: str = None, interactive: bool = True) -> None:
|
||||
"""
|
||||
Run the client.
|
||||
|
||||
Args:
|
||||
chat_message: Optional single chat message to send
|
||||
interactive: Whether to run in interactive mode
|
||||
"""
|
||||
try:
|
||||
await self.connect()
|
||||
|
||||
# Wait for answer
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Start audio input stream
|
||||
print("Starting audio streams...")
|
||||
|
||||
input_stream = sd.InputStream(
|
||||
samplerate=self.sample_rate,
|
||||
channels=1,
|
||||
dtype=np.float32,
|
||||
blocksize=self.chunk_samples,
|
||||
device=self.input_device,
|
||||
callback=self._audio_input_callback
|
||||
)
|
||||
|
||||
await ws.send_bytes(data)
|
||||
# No sleep needed here as microphone dictates the timing
|
||||
input_stream.start()
|
||||
print("Audio streams started")
|
||||
|
||||
# Start background tasks
|
||||
sender_task = asyncio.create_task(self.audio_sender())
|
||||
receiver_task = asyncio.create_task(self.receiver())
|
||||
playback_task = asyncio.create_task(self._playback_task())
|
||||
|
||||
if chat_message:
|
||||
# Send single message and wait
|
||||
await self.send_chat(chat_message)
|
||||
await asyncio.sleep(15)
|
||||
elif interactive:
|
||||
# Run interactive mode
|
||||
await self.interactive_mode()
|
||||
else:
|
||||
# Just wait
|
||||
while self.running:
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Cleanup
|
||||
self.running = False
|
||||
sender_task.cancel()
|
||||
receiver_task.cancel()
|
||||
playback_task.cancel()
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error in send loop: {e}")
|
||||
|
||||
async def receive_loop(ws):
|
||||
"""Listen for VAD/EOU events."""
|
||||
print("👂 Listening for server events...")
|
||||
async for msg in ws:
|
||||
timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
|
||||
|
||||
if msg.type == aiohttp.WSMsgType.TEXT:
|
||||
try:
|
||||
data = json.loads(msg.data)
|
||||
event = data.get('event')
|
||||
|
||||
# Highlight VAD/EOU events
|
||||
if event == 'speaking':
|
||||
print(f"[{timestamp}] 🗣️ SPEAKING STARTED")
|
||||
elif event == 'silence':
|
||||
print(f"[{timestamp}] 🤫 SILENCE DETECTED")
|
||||
elif event == 'eou':
|
||||
print(f"[{timestamp}] ✅ END OF UTTERANCE (EOU)")
|
||||
elif event == 'error':
|
||||
print(f"[{timestamp}] ❌ ERROR: {data.get('error')}")
|
||||
else:
|
||||
print(f"[{timestamp}] 📩 {event}: {str(data)[:100]}")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(f"[{timestamp}] 📄 Text: {msg.data}")
|
||||
|
||||
elif msg.type == aiohttp.WSMsgType.CLOSED:
|
||||
print("❌ Connection closed")
|
||||
break
|
||||
elif msg.type == aiohttp.WSMsgType.ERROR:
|
||||
print("❌ Connection error")
|
||||
break
|
||||
await sender_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
try:
|
||||
await receiver_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
try:
|
||||
await playback_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
input_stream.stop()
|
||||
|
||||
except ConnectionRefusedError:
|
||||
print(f"Error: Could not connect to {self.url}")
|
||||
print("Make sure the server is running.")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
finally:
|
||||
await self.close()
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close the connection."""
|
||||
self.running = False
|
||||
if self.ws:
|
||||
await self.ws.close()
|
||||
|
||||
print(f"\nSession ended")
|
||||
print(f" Total sent: {self.bytes_sent / 1024:.1f} KB")
|
||||
print(f" Total received: {self.bytes_received / 1024:.1f} KB")
|
||||
|
||||
|
||||
def list_devices():
|
||||
"""List available audio devices."""
|
||||
print("\nAvailable audio devices:")
|
||||
print("-" * 60)
|
||||
devices = sd.query_devices()
|
||||
for i, device in enumerate(devices):
|
||||
direction = []
|
||||
if device['max_input_channels'] > 0:
|
||||
direction.append("IN")
|
||||
if device['max_output_channels'] > 0:
|
||||
direction.append("OUT")
|
||||
direction_str = "/".join(direction) if direction else "N/A"
|
||||
|
||||
default = ""
|
||||
if i == sd.default.device[0]:
|
||||
default += " [DEFAULT INPUT]"
|
||||
if i == sd.default.device[1]:
|
||||
default += " [DEFAULT OUTPUT]"
|
||||
|
||||
print(f" {i:2d}: {device['name'][:40]:40s} ({direction_str}){default}")
|
||||
print("-" * 60)
|
||||
|
||||
|
||||
async def main():
|
||||
p = pyaudio.PyAudio()
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Microphone client for duplex voice conversation"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
default="ws://localhost:8000/ws",
|
||||
help="WebSocket server URL"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat",
|
||||
help="Send a single chat message instead of using microphone"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sample-rate",
|
||||
type=int,
|
||||
default=16000,
|
||||
help="Audio sample rate (default: 16000)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-device",
|
||||
type=int,
|
||||
help="Input device ID"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-device",
|
||||
type=int,
|
||||
help="Output device ID"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-devices",
|
||||
action="store_true",
|
||||
help="List available audio devices and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-interactive",
|
||||
action="store_true",
|
||||
help="Disable interactive mode"
|
||||
)
|
||||
|
||||
# Check for input devices
|
||||
info = p.get_host_api_info_by_index(0)
|
||||
numdevices = info.get('deviceCount')
|
||||
if numdevices == 0:
|
||||
print("❌ No audio input devices found")
|
||||
return
|
||||
|
||||
# Open microphone stream
|
||||
try:
|
||||
stream = p.open(format=FORMAT,
|
||||
channels=CHANNELS,
|
||||
rate=SAMPLE_RATE,
|
||||
input=True,
|
||||
frames_per_buffer=CHUNK_SIZE)
|
||||
except Exception as e:
|
||||
print(f"❌ Failed to open microphone: {e}")
|
||||
return
|
||||
|
||||
session = aiohttp.ClientSession()
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
print(f"🔌 Connecting to {SERVER_URL}...")
|
||||
async with session.ws_connect(SERVER_URL) as ws:
|
||||
print("✅ Connected!")
|
||||
if args.list_devices:
|
||||
list_devices()
|
||||
return
|
||||
|
||||
client = MicrophoneClient(
|
||||
url=args.url,
|
||||
sample_rate=args.sample_rate,
|
||||
input_device=args.input_device,
|
||||
output_device=args.output_device
|
||||
)
|
||||
|
||||
await client.run(
|
||||
chat_message=args.chat,
|
||||
interactive=not args.no_interactive
|
||||
)
|
||||
|
||||
# 1. Send Invite
|
||||
invite_msg = {
|
||||
"command": "invite",
|
||||
"option": {
|
||||
"codec": "pcm",
|
||||
"samplerate": SAMPLE_RATE
|
||||
}
|
||||
}
|
||||
await ws.send_json(invite_msg)
|
||||
print("📤 Sent Invite")
|
||||
|
||||
# 2. Run loops
|
||||
await asyncio.gather(
|
||||
receive_loop(ws),
|
||||
send_audio_loop(ws, stream)
|
||||
)
|
||||
|
||||
except aiohttp.ClientConnectorError:
|
||||
print(f"❌ Failed to connect to {SERVER_URL}. Is the server running?")
|
||||
except KeyboardInterrupt:
|
||||
print("\n👋 Stopping...")
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
p.terminate()
|
||||
await session.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(main())
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
print("\nInterrupted by user")
|
||||
|
||||
Reference in New Issue
Block a user