#!/usr/bin/env python3
"""
WAV file client for testing duplex voice conversation.

This client reads audio from a WAV file, sends it to the server,
and saves the AI's voice response to an output WAV file.

Usage:
    python examples/wav_client.py --input input.wav --output response.wav
    python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
    python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
    python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav

Requirements:
    pip install soundfile websockets numpy
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
import time
|
|
import wave
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import numpy as np
|
|
except ImportError:
|
|
print("Please install numpy: pip install numpy")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
import soundfile as sf
|
|
except ImportError:
|
|
print("Please install soundfile: pip install soundfile")
|
|
sys.exit(1)
|
|
|
|
try:
|
|
import websockets
|
|
except ImportError:
|
|
print("Please install websockets: pip install websockets")
|
|
sys.exit(1)
|
|
|
|
|
|
class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging

    One instance drives a single session: ``run()`` loads the input file,
    connects, streams the audio in fixed-size chunks, collects any binary
    frames sent back by the server and writes them to the output WAV file.
    """

    def __init__(
        self,
        url: str,
        input_file: str,
        output_file: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
        verbose: bool = False
    ):
        """
        Initialize WAV file client.

        Args:
            url: WebSocket server URL
            input_file: Input WAV file path
            output_file: Output WAV file path
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms) for sending
            wait_time: Time to wait for response after sending (seconds)
            verbose: Enable verbose output

        Raises:
            ValueError: If ``sample_rate`` or ``chunk_duration_ms`` is not
                positive.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        if chunk_duration_ms <= 0:
            raise ValueError(
                f"chunk_duration_ms must be positive, got {chunk_duration_ms}"
            )

        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        # Never allow a zero-sample chunk: the send loop advances by this
        # amount and would otherwise spin forever on empty frames.
        self.chunk_samples = max(1, int(sample_rate * chunk_duration_ms / 1000))
        self.wait_time = wait_time
        self.verbose = verbose

        # WebSocket connection
        self.ws = None
        self.running = False

        # Audio buffers
        self.received_audio = bytearray()

        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0

        # TTFB tracking
        self.send_start_time = None
        self.first_audio_received = False
        self.ttfb_ms = None

        # State tracking
        self.track_started = False
        self.track_ended = False
        self.send_completed = False

        # Events log
        self.events_log = []

    def log_event(self, direction: str, message: str):
        """Record an event (with wall-clock timestamp) and echo it to stdout."""
        timestamp = time.time()
        self.events_log.append({
            "timestamp": timestamp,
            "direction": direction,
            "message": message
        })
        print(f"{direction} {message}")

    async def connect(self) -> None:
        """Open the WebSocket connection and send the initial invite command."""
        self.log_event("→", f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        self.log_event("←", "Connected!")

        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send a JSON command to the server (no-op when not connected)."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")

    async def send_hangup(self, reason: str = "Session complete") -> None:
        """Send a hangup command carrying the given reason."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def load_wav_file(self) -> tuple[np.ndarray, int]:
        """
        Load and prepare WAV file for sending.

        The audio is down-mixed to mono, linearly resampled to
        ``self.sample_rate`` when the file rate differs, and converted to
        int16.

        Returns:
            Tuple of (audio_data as int16 numpy array, original sample rate)

        Raises:
            FileNotFoundError: If the input file does not exist.
            ValueError: If the input file contains no audio samples.
        """
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Load audio file
        audio_data, file_sample_rate = sf.read(self.input_file)
        if len(audio_data) == 0:
            # Fail with a clear message instead of an opaque numpy reduction
            # error further down.
            raise ValueError(f"Input file contains no audio samples: {self.input_file}")

        self.log_event("→", f"Loaded: {self.input_file}")
        self.log_event("→", f" Original sample rate: {file_sample_rate} Hz")
        self.log_event("→", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")

        # Convert stereo to mono if needed
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)
            self.log_event("→", " Converted stereo to mono")

        # Resample if needed
        if file_sample_rate != self.sample_rate:
            # Simple resampling using numpy (linear interpolation, no
            # anti-aliasing filter)
            duration = len(audio_data) / file_sample_rate
            num_samples = int(duration * self.sample_rate)
            indices = np.linspace(0, len(audio_data) - 1, num_samples)
            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
            self.log_event("→", f" Resampled to {self.sample_rate} Hz")

        # Convert to int16
        if audio_data.dtype != np.int16:
            # Normalize to [-1, 1] if needed; a clip so short that resampling
            # produced zero samples has no peak to measure.
            max_val = np.max(np.abs(audio_data)) if audio_data.size else 0.0
            if max_val > 1.0:
                audio_data = audio_data / max_val
            audio_data = (audio_data * 32767).astype(np.int16)

        self.log_event("→", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")

        return audio_data, file_sample_rate

    async def audio_sender(self, audio_data: np.ndarray) -> None:
        """
        Send audio data to server in chunks.

        Each chunk holds ``self.chunk_samples`` samples (the last one may be
        shorter) and is followed by a sleep of one chunk duration. Sets
        ``send_start_time`` before the first chunk and ``send_completed``
        when the loop ends.

        Args:
            audio_data: Samples to transmit; sent as the array's raw bytes.
        """
        total_samples = len(audio_data)
        chunk_size = self.chunk_samples
        sent_samples = 0

        # Progress is reported each time another ~500ms of audio has gone out.
        # A threshold is used rather than an exact multiple so it works for
        # any chunk size and sample rate.
        progress_step = max(1, self.sample_rate // 2)
        next_progress_mark = progress_step

        self.send_start_time = time.time()
        self.log_event("→", f"Starting audio transmission ({total_samples} samples)...")

        while sent_samples < total_samples and self.running:
            # Get next chunk
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk_bytes = audio_data[sent_samples:end_sample].tobytes()

            # Send to server
            if self.ws:
                await self.ws.send(chunk_bytes)
                self.bytes_sent += len(chunk_bytes)

            sent_samples = end_sample

            # Progress logging (every 500ms worth of audio)
            if self.verbose and sent_samples >= next_progress_mark:
                progress = (sent_samples / total_samples) * 100
                print(f" Sending: {progress:.0f}%", end="\r")
                next_progress_mark = (sent_samples // progress_step + 1) * progress_step

            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            await asyncio.sleep(self.chunk_duration_ms / 1000)

        self.send_completed = True
        elapsed = time.time() - self.send_start_time
        self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")

    async def receiver(self) -> None:
        """
        Receive messages from server until the session stops.

        Binary frames are appended to ``received_audio``; text frames are
        parsed as JSON and dispatched to ``_handle_event``. A text frame that
        is not a JSON object is logged and skipped so that one bad message
        does not end the session.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop re-checks self.running often.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)

                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        self.received_audio.extend(message)

                        # Calculate TTFB on first audio (measured from the
                        # start of audio transmission)
                        if not self.first_audio_received and self.send_start_time:
                            self.ttfb_ms = (time.time() - self.send_start_time) * 1000
                            self.first_audio_received = True
                            self.log_event("←", f"[TTFB] First audio latency: {self.ttfb_ms:.0f}ms")

                        # Log progress (2 bytes per sample)
                        if self.verbose:
                            bytes_per_second = self.sample_rate * 2
                            duration_ms = len(message) / bytes_per_second * 1000
                            total_ms = len(self.received_audio) / bytes_per_second * 1000
                            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")

                    else:
                        # JSON event
                        try:
                            event = json.loads(message)
                        except json.JSONDecodeError as e:
                            self.log_event("!", f"Ignoring malformed JSON message: {e}")
                            continue
                        if not isinstance(event, dict):
                            self.log_event("!", "Ignoring non-object JSON message")
                            continue
                        await self._handle_event(event)

                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("←", "Connection closed")
                    self.running = False
                    break

        except asyncio.CancelledError:
            # Cancelled by run() during cleanup; end quietly.
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False

    async def _handle_event(self, event: dict) -> None:
        """
        Handle incoming event.

        Logs a message per event type, updates ``track_started`` /
        ``track_ended`` on track events and clears ``running`` on hangup.

        Args:
            event: Decoded JSON object; its ``"event"`` key selects the type.
        """
        event_type = event.get("event", "unknown")

        if event_type == "answer":
            self.log_event("←", "Session ready!")
        elif event_type == "speaking":
            self.log_event("←", "Speech detected")
        elif event_type == "silence":
            self.log_event("←", "Silence detected")
        elif event_type == "transcript":
            # Treat a missing or null text field as empty so slicing is safe.
            text = event.get("text") or ""
            is_final = event.get("isFinal", False)
            if is_final:
                self.log_event("←", f"Transcript (final): {text}")
            elif self.verbose:
                self.log_event("←", f"Transcript (interim): {text[:50]}...")
        elif event_type == "ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "trackStart":
            self.track_started = True
            self.log_event("←", "Bot started speaking")
        elif event_type == "trackEnd":
            self.track_ended = True
            self.log_event("←", "Bot finished speaking")
        elif event_type == "interrupt":
            self.log_event("←", "Bot interrupted!")
        elif event_type == "error":
            self.log_event("!", f"Error: {event.get('error')}")
        elif event_type == "hangup":
            self.log_event("←", f"Hangup: {event.get('reason')}")
            self.running = False
        else:
            self.log_event("←", f"Event: {event_type}")

    def save_output_wav(self) -> None:
        """
        Save received audio to output WAV file.

        The buffer is written as mono 16-bit PCM at ``self.sample_rate``.
        Nothing is written when no audio was received. A trailing odd byte
        (half a sample) is dropped so a truncated final frame cannot make
        the save fail.
        """
        if not self.received_audio:
            self.log_event("!", "No audio received to save")
            return

        # Keep only whole 16-bit samples
        usable_bytes = len(self.received_audio) - (len(self.received_audio) % 2)
        pcm_bytes = bytes(self.received_audio[:usable_bytes])
        sample_count = usable_bytes // 2

        # Ensure output directory exists
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save using wave module for compatibility
        with wave.open(str(self.output_file), 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm_bytes)

        duration = sample_count / self.sample_rate
        self.log_event("→", f"Saved output: {self.output_file}")
        self.log_event("→", f" Duration: {duration:.2f}s ({sample_count} samples)")
        self.log_event("→", f" Size: {len(self.received_audio)/1024:.1f} KB")

    async def run(self) -> None:
        """
        Run the WAV file test.

        Loads the input, connects, streams the audio while a background task
        receives, waits up to ``wait_time`` seconds for the response, then
        saves the output and prints a summary. On any error a message is
        printed and the process exits with status 1; the connection is
        closed in all cases.
        """
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()

            # Connect to server
            await self.connect()

            # Wait for answer
            await asyncio.sleep(0.5)

            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())

            try:
                # Send audio
                await self.audio_sender(audio_data)

                # Wait for response
                self.log_event("→", f"Waiting {self.wait_time}s for response...")

                deadline = time.monotonic() + self.wait_time
                while self.running and time.monotonic() < deadline:
                    # Check if track has ended (response complete)
                    if self.track_ended and self.send_completed:
                        # Give a little extra time for any remaining audio
                        await asyncio.sleep(1.0)
                        break
                    await asyncio.sleep(0.1)
            finally:
                # Cleanup: always stop the receiver, even when sending failed,
                # so no task is left pending behind the error path.
                self.running = False
                receiver_task.cancel()
                try:
                    await receiver_task
                except asyncio.CancelledError:
                    pass

            # Save output
            self.save_output_wav()

            # Print summary
            self._print_summary()

        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            await self.close()

    def _print_summary(self):
        """Print session summary (files, byte counts, TTFB, response length)."""
        print("\n" + "=" * 50)
        print("Session Summary")
        print("=" * 50)
        print(f" Input file: {self.input_file}")
        print(f" Output file: {self.output_file}")
        print(f" Bytes sent: {self.bytes_sent / 1024:.1f} KB")
        print(f" Bytes received: {self.bytes_received / 1024:.1f} KB")
        # Compare against None so a measured 0 ms is still reported.
        if self.ttfb_ms is not None:
            print(f" TTFB: {self.ttfb_ms:.0f} ms")
        if self.received_audio:
            duration = len(self.received_audio) / (self.sample_rate * 2)
            print(f" Response duration: {duration:.2f}s")
        print("=" * 50)

    async def close(self) -> None:
        """Close the connection (best effort; close errors never propagate)."""
        self.running = False
        if self.ws:
            try:
                await self.ws.close()
            except Exception as e:
                # Closing is best effort, but do not swallow cancellation or
                # KeyboardInterrupt the way a bare except would.
                if self.verbose:
                    self.log_event("!", f"Error while closing connection: {e}")
|
|
|
|
|
|
async def main():
    """Parse the command-line options, build a WavFileClient and run it once."""
    cli = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )

    # (flags, add_argument keyword settings), in the order they appear in --help.
    option_table = (
        (("--input", "-i"), {
            "required": True,
            "help": "Input WAV file path",
        }),
        (("--output", "-o"), {
            "required": True,
            "help": "Output WAV file path for response",
        }),
        (("--url",), {
            "default": "ws://localhost:8000/ws",
            "help": "WebSocket server URL (default: ws://localhost:8000/ws)",
        }),
        (("--sample-rate",), {
            "type": int,
            "default": 16000,
            "help": "Target sample rate for audio (default: 16000)",
        }),
        (("--chunk-duration",), {
            "type": int,
            "default": 20,
            "help": "Chunk duration in ms for sending (default: 20)",
        }),
        (("--wait-time", "-w"), {
            "type": float,
            "default": 15.0,
            "help": "Time to wait for response after sending (default: 15.0)",
        }),
        (("--verbose", "-v"), {
            "action": "store_true",
            "help": "Enable verbose output",
        }),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()

    session = WavFileClient(
        url=opts.url,
        input_file=opts.input,
        output_file=opts.output,
        sample_rate=opts.sample_rate,
        chunk_duration_ms=opts.chunk_duration,
        wait_time=opts.wait_time,
        verbose=opts.verbose,
    )
    await session.run()
|
|
|
|
|
|
def _run_from_command_line() -> None:
    """Run main() on a fresh event loop; report Ctrl-C without a traceback."""
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")


if __name__ == "__main__":
    _run_from_command_line()
|