Add generate test audio script

This commit is contained in:
Xin Wang
2026-02-04 10:32:54 +08:00
parent 8bc24ded59
commit 5aa9a12ca8
3 changed files with 789 additions and 0 deletions

476
examples/wav_client.py Normal file
View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""
WAV file client for testing duplex voice conversation.
This client reads audio from a WAV file, sends it to the server,
and saves the AI's voice response to an output WAV file.
Usage:
python examples/wav_client.py --input input.wav --output response.wav
python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
Requirements:
pip install soundfile websockets numpy
"""
import argparse
import asyncio
import json
import sys
import time
import wave
from pathlib import Path
try:
import numpy as np
except ImportError:
print("Please install numpy: pip install numpy")
sys.exit(1)
try:
import soundfile as sf
except ImportError:
print("Please install soundfile: pip install soundfile")
sys.exit(1)
try:
import websockets
except ImportError:
print("Please install websockets: pip install websockets")
sys.exit(1)
class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """

    # All audio handled by this client is 16-bit PCM (2 bytes per sample).
    _BYTES_PER_SAMPLE = 2

    def __init__(
        self,
        url: str,
        input_file: str,
        output_file: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
        verbose: bool = False
    ):
        """
        Initialize WAV file client.

        Args:
            url: WebSocket server URL
            input_file: Input WAV file path
            output_file: Output WAV file path
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms) for sending
            wait_time: Time to wait for response after sending (seconds)
            verbose: Enable verbose output

        Raises:
            ValueError: If sample_rate and chunk_duration_ms yield a chunk
                of zero (or fewer) samples.
        """
        chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
        if chunk_samples <= 0:
            # A zero-sized chunk would make audio_sender() spin forever
            # without ever advancing through the audio.
            raise ValueError(
                "sample_rate and chunk_duration_ms must yield at least one "
                f"sample per chunk (got {sample_rate} Hz, {chunk_duration_ms} ms)"
            )

        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = chunk_samples
        self.wait_time = wait_time
        self.verbose = verbose

        # WebSocket connection
        self.ws = None
        self.running = False

        # Audio buffers
        self.received_audio = bytearray()

        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0

        # TTFB tracking (measured from the start of audio transmission)
        self.send_start_time = None
        self.first_audio_received = False
        self.ttfb_ms = None

        # State tracking
        self.track_started = False
        self.track_ended = False
        self.send_completed = False

        # Events log
        self.events_log = []

    def log_event(self, direction: str, message: str):
        """Record an event with a wall-clock timestamp and print it."""
        timestamp = time.time()
        self.events_log.append({
            "timestamp": timestamp,
            "direction": direction,
            "message": message
        })
        print(f"{direction} {message}")

    async def connect(self) -> None:
        """Connect to the WebSocket server and send the invite command."""
        self.log_event("", f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        self.log_event("", "Connected!")

        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send a JSON command to the server (no-op when not connected)."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            self.log_event("", f"Command: {cmd.get('command', 'unknown')}")

    async def send_hangup(self, reason: str = "Session complete") -> None:
        """Send hangup command."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def load_wav_file(self) -> tuple[np.ndarray, int]:
        """
        Load and prepare WAV file for sending.

        The audio is down-mixed to mono, linearly resampled to
        ``self.sample_rate`` when needed, and converted to int16.

        Returns:
            Tuple of (audio_data as int16 numpy array, original sample rate)

        Raises:
            FileNotFoundError: If the input file does not exist.
            ValueError: If the input file contains no audio samples.
        """
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Load audio file
        audio_data, file_sample_rate = sf.read(self.input_file)
        if len(audio_data) == 0:
            # Without this guard the peak computation below fails with an
            # opaque "zero-size array" error.
            raise ValueError(f"Input file contains no audio: {self.input_file}")

        self.log_event("", f"Loaded: {self.input_file}")
        self.log_event("", f" Original sample rate: {file_sample_rate} Hz")
        self.log_event("", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")

        # Convert stereo to mono if needed
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)
            self.log_event("", " Converted stereo to mono")

        # Resample if needed
        if file_sample_rate != self.sample_rate:
            # Simple linear-interpolation resampling using numpy
            duration = len(audio_data) / file_sample_rate
            # Keep at least one sample so very short inputs stay non-empty.
            num_samples = max(1, int(duration * self.sample_rate))
            indices = np.linspace(0, len(audio_data) - 1, num_samples)
            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
            self.log_event("", f" Resampled to {self.sample_rate} Hz")

        # Convert to int16
        if audio_data.dtype != np.int16:
            # Normalize to [-1, 1] if needed
            max_val = np.max(np.abs(audio_data))
            if max_val > 1.0:
                audio_data = audio_data / max_val
            audio_data = (audio_data * 32767).astype(np.int16)

        self.log_event("", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
        return audio_data, file_sample_rate

    async def audio_sender(self, audio_data: np.ndarray) -> None:
        """
        Send audio data to the server in chunks at a real-time pace.

        Args:
            audio_data: Samples to send; each chunk is serialized with
                ``tobytes()``.
        """
        total_samples = len(audio_data)
        chunk_size = self.chunk_samples
        sent_samples = 0

        # Progress is reported roughly every 500ms worth of audio.
        progress_step = max(1, self.sample_rate // 2)
        next_progress_at = progress_step

        chunk_period = self.chunk_duration_ms / 1000
        self.send_start_time = time.time()
        # Pace against absolute deadlines so the time spent inside send()
        # does not accumulate as drift over a long file.
        next_send_at = time.monotonic()

        self.log_event("", f"Starting audio transmission ({total_samples} samples)...")

        while sent_samples < total_samples and self.running:
            # Get next chunk
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk_bytes = audio_data[sent_samples:end_sample].tobytes()

            # Send to server
            if self.ws:
                await self.ws.send(chunk_bytes)
                self.bytes_sent += len(chunk_bytes)
            sent_samples = end_sample

            # Threshold-based check: works even when the chunk size does
            # not divide the progress step evenly.
            if self.verbose and sent_samples >= next_progress_at:
                progress = (sent_samples / total_samples) * 100
                print(f" Sending: {progress:.0f}%", end="\r")
                next_progress_at = (sent_samples // progress_step + 1) * progress_step

            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            next_send_at += chunk_period
            await asyncio.sleep(max(0.0, next_send_at - time.monotonic()))

        self.send_completed = True
        elapsed = time.time() - self.send_start_time
        self.log_event("", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")

    def _handle_audio(self, message: bytes) -> None:
        """Buffer one binary audio message and update TTFB/progress."""
        self.bytes_received += len(message)
        self.received_audio.extend(message)

        # Calculate TTFB on first audio
        if not self.first_audio_received and self.send_start_time:
            self.ttfb_ms = (time.time() - self.send_start_time) * 1000
            self.first_audio_received = True
            self.log_event("", f"[TTFB] First audio latency: {self.ttfb_ms:.0f}ms")

        # Log progress
        if self.verbose:
            bytes_per_second = self.sample_rate * self._BYTES_PER_SAMPLE
            duration_ms = len(message) / bytes_per_second * 1000
            total_ms = len(self.received_audio) / bytes_per_second * 1000
            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")

    async def receiver(self) -> None:
        """
        Receive messages from the server until the session stops.

        Binary messages are buffered as response audio; text messages are
        parsed as JSON events. A malformed text message is logged and
        skipped instead of terminating the receive loop.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop re-checks self.running often.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("", "Connection closed")
                    self.running = False
                    break

                if isinstance(message, bytes):
                    # Audio data received
                    self._handle_audio(message)
                    continue

                # JSON event
                try:
                    event = json.loads(message)
                except json.JSONDecodeError as e:
                    self.log_event("!", f"Ignoring malformed message: {e}")
                    continue
                if not isinstance(event, dict):
                    self.log_event("!", f"Ignoring non-object message: {type(event).__name__}")
                    continue
                await self._handle_event(event)
        except asyncio.CancelledError:
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False

    async def _handle_event(self, event: dict) -> None:
        """Handle one incoming JSON event, updating session state."""
        event_type = event.get("event", "unknown")

        if event_type == "answer":
            self.log_event("", "Session ready!")
        elif event_type == "speaking":
            self.log_event("", "Speech detected")
        elif event_type == "silence":
            self.log_event("", "Silence detected")
        elif event_type == "transcript":
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                self.log_event("", f"Transcript (final): {text}")
            elif self.verbose:
                self.log_event("", f"Transcript (interim): {text[:50]}...")
        elif event_type == "ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "trackStart":
            self.track_started = True
            # A new track is playing: forget any earlier trackEnd so run()
            # does not stop waiting while this response is still arriving.
            self.track_ended = False
            self.log_event("", "Bot started speaking")
        elif event_type == "trackEnd":
            self.track_ended = True
            self.log_event("", "Bot finished speaking")
        elif event_type == "interrupt":
            self.log_event("", "Bot interrupted!")
        elif event_type == "error":
            self.log_event("!", f"Error: {event.get('error')}")
        elif event_type == "hangup":
            self.log_event("", f"Hangup: {event.get('reason')}")
            self.running = False
        else:
            self.log_event("", f"Event: {event_type}")

    def save_output_wav(self) -> None:
        """
        Save received audio to the output WAV file (16-bit mono).

        A dangling odd byte at the end of the buffer is dropped so the
        data always forms whole 16-bit samples. Nothing is written when no
        complete sample was received.
        """
        pcm = bytes(self.received_audio)
        dangling = len(pcm) % self._BYTES_PER_SAMPLE
        if dangling:
            self.log_event("!", f"Dropping {dangling} trailing byte(s) of incomplete sample")
            pcm = pcm[:-dangling]

        if not pcm:
            self.log_event("!", "No audio received to save")
            return

        # Ensure output directory exists
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save using wave module for compatibility
        with wave.open(str(self.output_file), 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(self._BYTES_PER_SAMPLE)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm)

        num_samples = len(pcm) // self._BYTES_PER_SAMPLE
        duration = num_samples / self.sample_rate
        self.log_event("", f"Saved output: {self.output_file}")
        self.log_event("", f" Duration: {duration:.2f}s ({num_samples} samples)")
        self.log_event("", f" Size: {len(pcm)/1024:.1f} KB")

    async def _stop_receiver(self, task) -> None:
        """Cancel the receiver task and wait for it to finish."""
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def run(self) -> None:
        """
        Run the WAV file test.

        Loads the input, connects, streams the audio, waits up to
        ``wait_time`` seconds for the response, saves it and prints a
        summary. On failure an error is printed and the process exits
        with status 1. The receiver task and connection are always
        cleaned up.
        """
        receiver_task = None
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()

            # Connect to server
            await self.connect()

            # Wait for answer
            await asyncio.sleep(0.5)

            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())

            # Send audio
            await self.audio_sender(audio_data)

            # Wait for response
            self.log_event("", f"Waiting {self.wait_time}s for response...")
            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)

            # Cleanup
            self.running = False
            await self._stop_receiver(receiver_task)
            receiver_task = None

            # Save output
            self.save_output_wav()

            # Print summary
            self._print_summary()
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            # On an error path the receiver may still be running; stop it
            # so no task is left pending when the event loop shuts down.
            if receiver_task is not None:
                self.running = False
                await self._stop_receiver(receiver_task)
            await self.close()

    def _print_summary(self):
        """Print session summary."""
        print("\n" + "=" * 50)
        print("Session Summary")
        print("=" * 50)
        print(f" Input file: {self.input_file}")
        print(f" Output file: {self.output_file}")
        print(f" Bytes sent: {self.bytes_sent / 1024:.1f} KB")
        print(f" Bytes received: {self.bytes_received / 1024:.1f} KB")
        # Compare against None so a measured latency of 0 ms is still shown.
        if self.ttfb_ms is not None:
            print(f" TTFB: {self.ttfb_ms:.0f} ms")
        if self.received_audio:
            duration = len(self.received_audio) / (self.sample_rate * self._BYTES_PER_SAMPLE)
            print(f" Response duration: {duration:.2f}s")
        print("=" * 50)

    async def close(self) -> None:
        """Close the connection (best effort; close errors are ignored)."""
        self.running = False
        if self.ws:
            try:
                await self.ws.close()
            except Exception:
                # Best-effort shutdown: a failed close must not mask the
                # real outcome. Unlike a bare except, this lets
                # cancellation and KeyboardInterrupt propagate.
                pass
async def main():
    """Parse the command-line options and run one WAV file test session."""
    cli = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    cli.add_argument("--input", "-i", required=True, help="Input WAV file path")
    cli.add_argument("--output", "-o", required=True, help="Output WAV file path for response")
    cli.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL (default: ws://localhost:8000/ws)",
    )
    cli.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Target sample rate for audio (default: 16000)",
    )
    cli.add_argument(
        "--chunk-duration",
        type=int,
        default=20,
        help="Chunk duration in ms for sending (default: 20)",
    )
    cli.add_argument(
        "--wait-time", "-w",
        type=float,
        default=15.0,
        help="Time to wait for response after sending (default: 15.0)",
    )
    cli.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    opts = cli.parse_args()

    # Map the parsed options onto the client's constructor parameters.
    session = WavFileClient(
        url=opts.url,
        input_file=opts.input,
        output_file=opts.output,
        sample_rate=opts.sample_rate,
        chunk_duration_ms=opts.chunk_duration,
        wait_time=opts.wait_time,
        verbose=opts.verbose,
    )
    await session.run()


if __name__ == "__main__":
    # Ctrl-C ends the session with a short notice instead of a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")

1
scripts/README.md Normal file
View File

@@ -0,0 +1 @@
# Development Script

View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""
Generate test audio file with utterances using SiliconFlow TTS API.
Creates a 16kHz mono WAV file with real speech segments separated by
configurable silence (for VAD/testing).
Usage:
python scripts/generate_test_audio.py [OPTIONS]
Options:
-o, --output PATH Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
-u, --utterance TEXT Utterance text; repeat for multiple (ignored if -j is set)
-j, --json PATH JSON file: array of strings or {"utterances": [...]}
--silence-ms MS Silence in ms between utterances (default: 500)
--lead-silence-ms MS Silence in ms at start (default: 200)
--trail-silence-ms MS Silence in ms at end (default: 300)
Examples:
# Default utterances and output
python scripts/generate_test_audio.py
# Custom output path
python scripts/generate_test_audio.py -o out.wav
# Utterances from command line
python scripts/generate_test_audio.py -u "Hello" -u "World" -o test.wav
# Utterances from JSON file
python scripts/generate_test_audio.py -j utterances.json -o test.wav
# Custom silence (1s between utterances)
python scripts/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
Requires SILICONFLOW_API_KEY in .env.
"""
import wave
import struct
import argparse
import asyncio
import aiohttp
import json
import os
from pathlib import Path
from dotenv import load_dotenv
# Load the .env file from the project root.
# The root is taken to be the parent of the directory containing this script
# (two levels up from this file). Loading happens at import time, before any
# function reads SILICONFLOW_API_KEY from the environment.
project_root = Path(__file__).parent.parent
load_dotenv(project_root / ".env")
# SiliconFlow TTS endpoint and the model every request is sent to
SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech"
SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
# Available voices: short name -> fully-qualified "<model>:<name>" identifier
VOICES = {
    short_name: f"{SILICONFLOW_MODEL}:{short_name}"
    for short_name in (
        "alex",
        "anna",
        "bella",
        "benjamin",
        "charles",
        "claire",
        "david",
        "diana",
    )
}
def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes:
    """
    Return all-zero 16-bit PCM bytes lasting duration_ms.

    Args:
        duration_ms: Length of the silence in milliseconds
        sample_rate: Samples per second of the target audio
    Returns:
        Two zero bytes per sample; empty when the sample count is not positive
    """
    sample_count = int(sample_rate * (duration_ms / 1000.0))
    if sample_count <= 0:
        return b''
    # bytes(n) yields n zero bytes; each 16-bit sample occupies two of them.
    return bytes(2 * sample_count)
async def synthesize_speech(
    text: str,
    api_key: str,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
) -> bytes:
    """
    Request one utterance from the SiliconFlow TTS API.

    Args:
        text: Text to synthesize
        api_key: SiliconFlow API key, sent as a Bearer token
        voice: Short voice name (alex, anna, bella, benjamin, charles, claire,
            david, diana); any other value is passed to the API unchanged
        sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
        speed: Speech speed (0.25 to 4.0)
    Returns:
        The raw response body; "pcm" is requested as the response format
    Raises:
        RuntimeError: If the API answers with any status other than 200
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    request_body = {
        "model": SILICONFLOW_MODEL,
        "input": text,
        # Known short names map to full identifiers; others pass through.
        "voice": VOICES.get(voice, voice),
        "response_format": "pcm",
        "sample_rate": sample_rate,
        "stream": False,
        "speed": speed
    }
    async with aiohttp.ClientSession() as http:
        async with http.post(
            SILICONFLOW_API_URL, json=request_body, headers=request_headers
        ) as reply:
            if reply.status == 200:
                return await reply.read()
            detail = await reply.text()
            raise RuntimeError(f"SiliconFlow TTS error: {reply.status} - {detail}")
async def generate_test_audio(
    output_path: str,
    utterances: list[str],
    silence_ms: int = 500,
    lead_silence_ms: int = 200,
    trail_silence_ms: int = 300,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
):
    """
    Generate test audio with multiple utterances separated by silence.

    Utterances are synthesized one after another and written, together with
    the requested silences, to a 16-bit mono WAV file. Missing parent
    directories of the output file are created.

    Args:
        output_path: Path to save the WAV file (str or path-like)
        utterances: List of text strings for each utterance
        silence_ms: Silence duration between utterances (milliseconds)
        lead_silence_ms: Silence at the beginning (milliseconds)
        trail_silence_ms: Silence at the end (milliseconds)
        voice: TTS voice to use
        sample_rate: Audio sample rate
        speed: TTS speech speed
    Raises:
        ValueError: If SILICONFLOW_API_KEY is not set in the environment
    """
    api_key = os.getenv("SILICONFLOW_API_KEY")
    if not api_key:
        raise ValueError(
            "SILICONFLOW_API_KEY not found in environment.\n"
            "Please set it in your .env file:\n"
            " SILICONFLOW_API_KEY=your-api-key-here"
        )

    print("Using SiliconFlow TTS API")
    print(f" Voice: {voice}")
    print(f" Sample rate: {sample_rate}Hz")
    print(f" Speed: {speed}x")
    print()

    segments = []

    # Lead-in silence
    if lead_silence_ms > 0:
        segments.append(generate_silence(lead_silence_ms, sample_rate))
        print(f" [silence: {lead_silence_ms}ms]")

    # Generate each utterance with silence between
    last_index = len(utterances) - 1
    for i, text in enumerate(utterances):
        print(f" Synthesizing utterance {i + 1}: \"{text}\"")
        audio = await synthesize_speech(
            text=text,
            api_key=api_key,
            voice=voice,
            sample_rate=sample_rate,
            speed=speed
        )
        if len(audio) % 2:
            # An odd byte count would shift every following 16-bit sample
            # by one byte and turn the rest of the file into noise.
            audio = audio[:-1]
        segments.append(audio)

        # Add silence between utterances (not after the last one)
        if i < last_index:
            segments.append(generate_silence(silence_ms, sample_rate))
            print(f" [silence: {silence_ms}ms]")

    # Trail silence
    if trail_silence_ms > 0:
        segments.append(generate_silence(trail_silence_ms, sample_rate))
        print(f" [silence: {trail_silence_ms}ms]")

    # Concatenate all segments
    audio_data = b''.join(segments)

    # Write WAV file. The path is converted to str because wave.open() does
    # not accept path-like objects on every Python version.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(destination), 'wb') as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)

    duration_sec = len(audio_data) / (sample_rate * 2)
    print()
    print(f"Generated: {output_path}")
    print(f" Duration: {duration_sec:.2f}s")
    print(f" Sample rate: {sample_rate}Hz")
    print(" Format: 16-bit mono PCM WAV")
    print(f" Size: {len(audio_data):,} bytes")
def load_utterances_from_json(path: Path) -> list[str]:
    """
    Load utterances from a JSON file.

    Accepts either:
    - A JSON array: ["utterance 1", "utterance 2"]
    - A JSON object with "utterances" key: {"utterances": ["a", "b"]}

    Args:
        path: Location of the UTF-8 encoded JSON file
    Returns:
        The utterances, each converted to str
    Raises:
        ValueError: If the document is neither an array nor an object with
            an "utterances" key, or if "utterances" is not an array
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        entries = data
    elif isinstance(data, dict) and "utterances" in data:
        entries = data["utterances"]
        if not isinstance(entries, list):
            # Iterating e.g. a string here would silently yield one
            # "utterance" per character.
            raise ValueError(
                f"'utterances' must be an array of strings. "
                f"Got: {type(entries).__name__}"
            )
    else:
        raise ValueError(
            f"JSON file must be an array of strings or an object with 'utterances' key. "
            f"Got: {type(data).__name__}"
        )
    return [str(s) for s in entries]
def parse_args():
    """Build the command-line parser and return the parsed arguments."""
    # Default output sits under data/audio_examples, next to the directory
    # that contains this script.
    default_output = Path(__file__).parent.parent.joinpath(
        "data", "audio_examples", "two_utterances_16k.wav"
    )

    cli = argparse.ArgumentParser(
        description="Generate test audio with SiliconFlow TTS (utterances + silence)."
    )
    cli.add_argument(
        "-o", "--output",
        type=Path,
        default=default_output,
        help=f"Output WAV file path (default: {default_output})",
    )
    cli.add_argument(
        "-u", "--utterance",
        action="append",
        dest="utterances",
        metavar="TEXT",
        help="Utterance text (repeat for multiple). Ignored if --json is set.",
    )
    cli.add_argument(
        "-j", "--json",
        type=Path,
        metavar="PATH",
        help="JSON file with utterances: array of strings or object with 'utterances' key",
    )

    # The three silence options differ only in flag, default and wording.
    silence_options = (
        ("--silence-ms", 500, "between utterances"),
        ("--lead-silence-ms", 200, "at start of file"),
        ("--trail-silence-ms", 300, "at end of file"),
    )
    for flag, default_ms, position in silence_options:
        cli.add_argument(
            flag,
            type=int,
            default=default_ms,
            metavar="MS",
            help=f"Silence in ms {position} (default: {default_ms})",
        )

    return cli.parse_args()
async def main():
    """
    Main entry point.

    Resolves the utterances (JSON file first, then -u arguments, then the
    built-in defaults) and generates the test audio file.

    Raises:
        FileNotFoundError: If the given JSON path is not an existing file
        ValueError: If the JSON file yields no utterances
    """
    opts = parse_args()

    destination = opts.output
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Resolve utterances: JSON file > -u args > defaults
    if opts.json is None:
        texts = opts.utterances or [
            "Hello, how are you doing today?",
            "I'm doing great, thank you for asking!"
        ]
    else:
        if not opts.json.is_file():
            raise FileNotFoundError(f"Utterances JSON file not found: {opts.json}")
        texts = load_utterances_from_json(opts.json)
        if not texts:
            raise ValueError(f"JSON file has no utterances: {opts.json}")

    await generate_test_audio(
        output_path=str(destination),
        utterances=texts,
        silence_ms=opts.silence_ms,
        lead_silence_ms=opts.lead_silence_ms,
        trail_silence_ms=opts.trail_silence_ms,
        voice="anna",
        sample_rate=16000,
        speed=1.0
    )


if __name__ == "__main__":
    asyncio.run(main())