class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Streams the samples of an input WAV file to a WebSocket server in
    fixed-size chunks, collects every binary frame the server sends back
    and writes the collected audio to an output WAV file.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """

    # Audio is exchanged as 16-bit PCM (see the "invite" command and the
    # 2-byte sample width used when writing the output file).
    BYTES_PER_SAMPLE = 2

    def __init__(
        self,
        url: str,
        input_file: str,
        output_file: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
        verbose: bool = False
    ):
        """
        Initialize WAV file client.

        Args:
            url: WebSocket server URL
            input_file: Input WAV file path
            output_file: Output WAV file path
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms) for sending
            wait_time: Time to wait for response after sending (seconds)
            verbose: Enable verbose output

        Raises:
            ValueError: If sample_rate or chunk_duration_ms is not positive.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        if chunk_duration_ms <= 0:
            raise ValueError(
                f"chunk_duration_ms must be positive, got {chunk_duration_ms}"
            )

        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        # Never let a chunk shrink to zero samples: audio_sender advances by
        # chunk_samples per iteration and would otherwise loop forever.
        self.chunk_samples = max(1, int(sample_rate * chunk_duration_ms / 1000))
        self.wait_time = wait_time
        self.verbose = verbose

        # WebSocket connection
        self.ws = None
        self.running = False

        # Audio buffers
        self.received_audio = bytearray()

        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0

        # TTFB tracking
        self.send_start_time = None
        self.first_audio_received = False
        self.ttfb_ms = None

        # State tracking
        self.track_started = False
        self.track_ended = False
        self.send_completed = False

        # Events log
        self.events_log = []

    def log_event(self, direction: str, message: str):
        """Append an event to ``events_log`` and echo it to stdout."""
        self.events_log.append({
            "timestamp": time.time(),
            "direction": direction,
            "message": message
        })
        print(f"{direction} {message}")

    async def connect(self) -> None:
        """Connect to the WebSocket server and send the invite command."""
        self.log_event("→", f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        self.log_event("←", "Connected!")

        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send a JSON command to the server (no-op when not connected)."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")

    async def send_hangup(self, reason: str = "Session complete") -> None:
        """Send hangup command."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def load_wav_file(self) -> tuple[np.ndarray, int]:
        """
        Load and prepare WAV file for sending.

        Multi-channel input is averaged down to one channel, audio at a
        different rate is linearly interpolated to ``self.sample_rate`` and
        the result is converted to int16.

        Returns:
            Tuple of (audio_data as int16 numpy array, original sample rate)

        Raises:
            FileNotFoundError: If the input file does not exist.
            ValueError: If the input file contains no samples.
        """
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Load audio file
        audio_data, file_sample_rate = sf.read(self.input_file)
        if len(audio_data) == 0:
            # Fail with a clear message instead of crashing inside np.max below.
            raise ValueError(f"Input file contains no audio: {self.input_file}")

        self.log_event("→", f"Loaded: {self.input_file}")
        self.log_event("→", f" Original sample rate: {file_sample_rate} Hz")
        self.log_event("→", f" Duration: {len(audio_data) / file_sample_rate:.2f}s")

        # Convert stereo to mono if needed
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
            self.log_event("→", " Converted stereo to mono")

        # Resample if needed
        if file_sample_rate != self.sample_rate:
            # Simple resampling using numpy (linear interpolation)
            duration = len(audio_data) / file_sample_rate
            num_samples = int(duration * self.sample_rate)
            indices = np.linspace(0, len(audio_data) - 1, num_samples)
            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
            self.log_event("→", f" Resampled to {self.sample_rate} Hz")

        # Convert to int16
        if audio_data.dtype != np.int16:
            # Normalize to [-1, 1] if needed; a very short file may have been
            # resampled down to zero samples, which np.max cannot reduce.
            max_val = np.max(np.abs(audio_data)) if audio_data.size else 0.0
            if max_val > 1.0:
                audio_data = audio_data / max_val
            audio_data = (audio_data * 32767).astype(np.int16)

        self.log_event("→", f" Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")

        return audio_data, file_sample_rate

    async def audio_sender(self, audio_data: np.ndarray) -> None:
        """
        Send audio data to the server in chunks at real-time pace.

        Args:
            audio_data: int16 samples to stream, ``chunk_samples`` at a time.
        """
        total_samples = len(audio_data)
        chunk_size = self.chunk_samples
        chunk_period = self.chunk_duration_ms / 1000
        sent_samples = 0

        # Progress is reported every 500ms worth of audio. Tracking the next
        # threshold (instead of testing divisibility) works for any chunk size.
        progress_step = max(1, self.sample_rate // 2)
        next_progress = progress_step

        self.send_start_time = time.time()
        self.log_event("→", f"Starting audio transmission ({total_samples} samples)...")

        # Pace against a schedule rather than sleeping a fixed amount after
        # each send, so time spent inside ws.send() does not accumulate and
        # slow the stream below real time.
        next_deadline = time.monotonic()

        while sent_samples < total_samples and self.running:
            # Get next chunk
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk_bytes = audio_data[sent_samples:end_sample].tobytes()

            # Send to server
            if self.ws:
                await self.ws.send(chunk_bytes)
                self.bytes_sent += len(chunk_bytes)

            sent_samples = end_sample

            # Progress logging (every 500ms worth of audio)
            if self.verbose and sent_samples >= next_progress:
                progress = (sent_samples / total_samples) * 100
                print(f" Sending: {progress:.0f}%", end="\r")
                while next_progress <= sent_samples:
                    next_progress += progress_step

            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            next_deadline += chunk_period
            # sleep(0) still yields to the event loop so the receiver can run.
            await asyncio.sleep(max(0.0, next_deadline - time.monotonic()))

        self.send_completed = True
        elapsed = time.time() - self.send_start_time
        self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")

    async def receiver(self) -> None:
        """
        Receive messages from the server until ``running`` is cleared.

        Binary frames are appended to ``received_audio``; text frames are
        parsed as JSON events and dispatched to ``_handle_event``.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop re-checks self.running often.
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("←", "Connection closed")
                    self.running = False
                    break

                if isinstance(message, bytes):
                    self._handle_audio(message)
                    continue

                # JSON event. A single malformed frame must not end the
                # session, so it is logged and skipped.
                try:
                    event = json.loads(message)
                except json.JSONDecodeError as e:
                    self.log_event("!", f"Ignoring non-JSON message: {e}")
                    continue
                if not isinstance(event, dict):
                    self.log_event("!", "Ignoring JSON message that is not an object")
                    continue
                await self._handle_event(event)

        except asyncio.CancelledError:
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False

    def _handle_audio(self, message: bytes) -> None:
        """Store one binary audio frame and update TTFB/progress tracking."""
        self.bytes_received += len(message)
        self.received_audio.extend(message)

        # Calculate TTFB on first audio (measured from the start of sending)
        if not self.first_audio_received and self.send_start_time:
            self.ttfb_ms = (time.time() - self.send_start_time) * 1000
            self.first_audio_received = True
            self.log_event("←", f"[TTFB] First audio latency: {self.ttfb_ms:.0f}ms")

        # Log progress
        if self.verbose:
            bytes_per_second = self.sample_rate * self.BYTES_PER_SAMPLE
            duration_ms = len(message) / bytes_per_second * 1000
            total_ms = len(self.received_audio) / bytes_per_second * 1000
            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")

    async def _handle_event(self, event: dict) -> None:
        """Log an incoming server event and update the session state flags."""
        event_type = event.get("event", "unknown")

        if event_type == "answer":
            self.log_event("←", "Session ready!")
        elif event_type == "speaking":
            self.log_event("←", "Speech detected")
        elif event_type == "silence":
            self.log_event("←", "Silence detected")
        elif event_type == "transcript":
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                self.log_event("←", f"Transcript (final): {text}")
            elif self.verbose:
                self.log_event("←", f"Transcript (interim): {text[:50]}...")
        elif event_type == "ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "trackStart":
            self.track_started = True
            self.log_event("←", "Bot started speaking")
        elif event_type == "trackEnd":
            self.track_ended = True
            self.log_event("←", "Bot finished speaking")
        elif event_type == "interrupt":
            self.log_event("←", "Bot interrupted!")
        elif event_type == "error":
            self.log_event("!", f"Error: {event.get('error')}")
        elif event_type == "hangup":
            self.log_event("←", f"Hangup: {event.get('reason')}")
            self.running = False
        else:
            self.log_event("←", f"Event: {event_type}")

    def save_output_wav(self) -> None:
        """Save received audio to the output WAV file (16-bit mono)."""
        # Binary frames are not guaranteed to end on a sample boundary; keep
        # only whole 16-bit samples so a stray byte cannot lose the response.
        stray = len(self.received_audio) % self.BYTES_PER_SAMPLE
        usable = len(self.received_audio) - stray
        if stray:
            self.log_event("!", f"Dropping {stray} trailing byte(s) of incomplete sample")

        if usable == 0:
            self.log_event("!", "No audio received to save")
            return

        pcm = bytes(self.received_audio[:usable])
        num_samples = usable // self.BYTES_PER_SAMPLE

        # Ensure output directory exists
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save using wave module for compatibility
        with wave.open(str(self.output_file), 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(self.BYTES_PER_SAMPLE)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm)

        duration = num_samples / self.sample_rate
        self.log_event("→", f"Saved output: {self.output_file}")
        self.log_event("→", f" Duration: {duration:.2f}s ({num_samples} samples)")
        self.log_event("→", f" Size: {len(self.received_audio)/1024:.1f} KB")

    async def run(self) -> None:
        """
        Run the WAV file test end to end.

        Loads the input, connects, streams the audio, waits up to
        ``wait_time`` seconds for the response, then saves the output and
        prints a summary. Exits the process with status 1 on failure.
        """
        receiver_task = None
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()

            # Connect to server
            await self.connect()

            # Wait for answer
            await asyncio.sleep(0.5)

            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())

            # Send audio
            await self.audio_sender(audio_data)

            # Wait for response
            self.log_event("→", f"Waiting {self.wait_time}s for response...")

            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)

            # Stop the receiver before touching the buffer it writes to
            self.running = False
            await self._stop_receiver(receiver_task)
            receiver_task = None

            # Save output
            self.save_output_wav()

            # Print summary
            self._print_summary()

        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            # Also reached on the error paths above, so the receiver task is
            # never left pending when the event loop shuts down.
            self.running = False
            if receiver_task is not None:
                await self._stop_receiver(receiver_task)
            await self.close()

    async def _stop_receiver(self, task) -> None:
        """Cancel the receiver task and wait for it to finish."""
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    def _print_summary(self):
        """Print session summary."""
        print("\n" + "=" * 50)
        print("Session Summary")
        print("=" * 50)
        print(f" Input file: {self.input_file}")
        print(f" Output file: {self.output_file}")
        print(f" Bytes sent: {self.bytes_sent / 1024:.1f} KB")
        print(f" Bytes received: {self.bytes_received / 1024:.1f} KB")
        # Compare against None: a measured latency of 0 is still a result.
        if self.ttfb_ms is not None:
            print(f" TTFB: {self.ttfb_ms:.0f} ms")
        if self.received_audio:
            duration = len(self.received_audio) / (self.sample_rate * self.BYTES_PER_SAMPLE)
            print(f" Response duration: {duration:.2f}s")
        print("=" * 50)

    async def close(self) -> None:
        """Close the connection (best effort)."""
        self.running = False
        if self.ws:
            try:
                await self.ws.close()
            except Exception as e:
                # Closing is best effort, but report the failure instead of
                # hiding it; unlike a bare except this lets KeyboardInterrupt
                # and SystemExit propagate.
                self.log_event("!", f"Error while closing connection: {e}")


async def main():
    """Parse command-line arguments and run one WAV file session."""
    parser = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    parser.add_argument(
        "--input", "-i",
        required=True,
        help="Input WAV file path"
    )
    parser.add_argument(
        "--output", "-o",
        required=True,
        help="Output WAV file path for response"
    )
    parser.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL (default: ws://localhost:8000/ws)"
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Target sample rate for audio (default: 16000)"
    )
    parser.add_argument(
        "--chunk-duration",
        type=int,
        default=20,
        help="Chunk duration in ms for sending (default: 20)"
    )
    parser.add_argument(
        "--wait-time", "-w",
        type=float,
        default=15.0,
        help="Time to wait for response after sending (default: 15.0)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output"
    )

    args = parser.parse_args()

    client = WavFileClient(
        url=args.url,
        input_file=args.input,
        output_file=args.output,
        sample_rate=args.sample_rate,
        chunk_duration_ms=args.chunk_duration,
        wait_time=args.wait_time,
        verbose=args.verbose
    )

    await client.run()
KeyboardInterrupt: + print("\nInterrupted by user") diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..8b6f7a0 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1 @@ +# Development Script \ No newline at end of file diff --git a/scripts/generate_test_audio/generate_test_audio.py b/scripts/generate_test_audio/generate_test_audio.py new file mode 100644 index 0000000..66c1908 --- /dev/null +++ b/scripts/generate_test_audio/generate_test_audio.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +""" +Generate test audio file with utterances using SiliconFlow TTS API. + +Creates a 16kHz mono WAV file with real speech segments separated by +configurable silence (for VAD/testing). + +Usage: + python scripts/generate_test_audio/generate_test_audio.py [OPTIONS] + +Options: + -o, --output PATH Output WAV path (default: data/audio_examples/two_utterances_16k.wav) + -u, --utterance TEXT Utterance text; repeat for multiple (ignored if -j is set) + -j, --json PATH JSON file: array of strings or {"utterances": [...]} + --silence-ms MS Silence in ms between utterances (default: 500) + --lead-silence-ms MS Silence in ms at start (default: 200) + --trail-silence-ms MS Silence in ms at end (default: 300) + +Examples: + # Default utterances and output + python scripts/generate_test_audio/generate_test_audio.py + + # Custom output path + python scripts/generate_test_audio/generate_test_audio.py -o out.wav + + # Utterances from command line + python scripts/generate_test_audio/generate_test_audio.py -u "Hello" -u "World" -o test.wav + + # Utterances from JSON file + python scripts/generate_test_audio/generate_test_audio.py -j utterances.json -o test.wav + + # Custom silence (1s between utterances) + python scripts/generate_test_audio/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav + +Requires SILICONFLOW_API_KEY in .env.
# Load .env file from project root.
# This script lives at <root>/scripts/generate_test_audio/generate_test_audio.py,
# so the project root is three directory levels above this file (two levels
# up is only the scripts/ directory). Chained .parent never raises, even for
# a shallow path.
project_root = Path(__file__).resolve().parent.parent.parent
load_dotenv(project_root / ".env")


# SiliconFlow TTS Configuration
SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech"
SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B"

# Available voices: short name -> full "<model>:<name>" identifier sent to the API
VOICES = {
    name: f"{SILICONFLOW_MODEL}:{name}"
    for name in (
        "alex",
        "anna",
        "bella",
        "benjamin",
        "charles",
        "claire",
        "david",
        "diana",
    )
}


def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes:
    """
    Generate silence as PCM bytes.

    Args:
        duration_ms: Length of the silence in milliseconds; zero or a
            negative value yields no bytes.
        sample_rate: Sample rate in Hz.

    Returns:
        Zero-valued 16-bit mono PCM, two bytes per sample.
    """
    # Multiply before dividing: scaling by (duration_ms / 1000.0) first can
    # land just below a whole number (570 ms at 100 Hz gave 56.999...) and
    # lose a sample when truncated.
    num_samples = int(sample_rate * duration_ms / 1000)
    return b'\x00\x00' * num_samples
async def synthesize_speech(
    text: str,
    api_key: str,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
) -> bytes:
    """
    Synthesize speech using SiliconFlow TTS API.

    Args:
        text: Text to synthesize
        api_key: SiliconFlow API key
        voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana);
            a name not present in VOICES is sent to the API unchanged
        sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        PCM audio bytes (16-bit signed, little-endian)

    Raises:
        RuntimeError: If the API answers with a status other than 200.
    """
    # Resolve voice name
    full_voice = VOICES.get(voice, voice)

    payload = {
        "model": SILICONFLOW_MODEL,
        "input": text,
        "voice": full_voice,
        "response_format": "pcm",
        "sample_rate": sample_rate,
        "stream": False,
        "speed": speed
    }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(SILICONFLOW_API_URL, json=payload, headers=headers) as response:
            if response.status != 200:
                error_text = await response.text()
                raise RuntimeError(f"SiliconFlow TTS error: {response.status} - {error_text}")

            return await response.read()


def _whole_samples(pcm: bytes, sample_width: int = 2) -> bytes:
    """Return pcm without trailing bytes that do not form a full sample."""
    remainder = len(pcm) % sample_width
    return pcm[:-remainder] if remainder else pcm


async def generate_test_audio(
    output_path: str,
    utterances: list[str],
    silence_ms: int = 500,
    lead_silence_ms: int = 200,
    trail_silence_ms: int = 300,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
):
    """
    Generate test audio with multiple utterances separated by silence.

    Args:
        output_path: Path to save the WAV file; missing parent directories
            are created
        utterances: List of text strings for each utterance
        silence_ms: Silence duration between utterances (milliseconds)
        lead_silence_ms: Silence at the beginning (milliseconds)
        trail_silence_ms: Silence at the end (milliseconds)
        voice: TTS voice to use
        sample_rate: Audio sample rate
        speed: TTS speech speed

    Raises:
        ValueError: If SILICONFLOW_API_KEY is not set in the environment.
    """
    api_key = os.getenv("SILICONFLOW_API_KEY")
    if not api_key:
        raise ValueError(
            "SILICONFLOW_API_KEY not found in environment.\n"
            "Please set it in your .env file:\n"
            " SILICONFLOW_API_KEY=your-api-key-here"
        )

    print(f"Using SiliconFlow TTS API")
    print(f" Voice: {voice}")
    print(f" Sample rate: {sample_rate}Hz")
    print(f" Speed: {speed}x")
    print()

    segments = []

    # Lead-in silence
    if lead_silence_ms > 0:
        segments.append(generate_silence(lead_silence_ms, sample_rate))
        print(f" [silence: {lead_silence_ms}ms]")

    # Generate each utterance with silence between
    last_index = len(utterances) - 1
    for i, text in enumerate(utterances):
        print(f" Synthesizing utterance {i + 1}: \"{text}\"")
        audio = await synthesize_speech(
            text=text,
            api_key=api_key,
            voice=voice,
            sample_rate=sample_rate,
            speed=speed
        )
        # A segment with an odd byte count would shift every later sample by
        # one byte and turn the rest of the file into noise.
        segments.append(_whole_samples(audio))

        # Add silence between utterances (not after the last one)
        if i < last_index:
            segments.append(generate_silence(silence_ms, sample_rate))
            print(f" [silence: {silence_ms}ms]")

    # Trail silence
    if trail_silence_ms > 0:
        segments.append(generate_silence(trail_silence_ms, sample_rate))
        print(f" [silence: {trail_silence_ms}ms]")

    # Concatenate all segments
    audio_data = b''.join(segments)

    # Callers other than main() may pass a path whose directory does not
    # exist yet; wave.open would fail on it.
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)

    # Write WAV file
    with wave.open(str(out_file), 'wb') as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)

    duration_sec = len(audio_data) / (sample_rate * 2)
    print()
    print(f"Generated: {output_path}")
    print(f" Duration: {duration_sec:.2f}s")
    print(f" Sample rate: {sample_rate}Hz")
    print(f" Format: 16-bit mono PCM WAV")
    print(f" Size: {len(audio_data):,} bytes")


def load_utterances_from_json(path: Path) -> list[str]:
    """
    Load utterances from a JSON file.

    Accepts either:
    - A JSON array: ["utterance 1", "utterance 2"]
    - A JSON object with "utterances" key: {"utterances": ["a", "b"]}

    Raises:
        ValueError: If the document has neither shape, or if "utterances"
            is not an array.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict) and "utterances" in data:
        items = data["utterances"]
        # Without this check a plain string value would be iterated
        # character by character and synthesized one letter at a time.
        if not isinstance(items, list):
            raise ValueError(
                f"'utterances' must be an array of strings. "
                f"Got: {type(items).__name__}"
            )
    elif isinstance(data, list):
        items = data
    else:
        raise ValueError(
            f"JSON file must be an array of strings or an object with 'utterances' key. "
            f"Got: {type(data).__name__}"
        )

    return [str(s) for s in items]


def parse_args(argv=None):
    """
    Parse command-line arguments.

    Args:
        argv: Argument list to parse; None (the default) parses sys.argv.

    Returns:
        The parsed argparse namespace.
    """
    # The script lives in <root>/scripts/generate_test_audio/, so the
    # project-level data/ directory is three levels above this file.
    repo_root = Path(__file__).resolve().parent.parent.parent
    default_output = repo_root / "data" / "audio_examples" / "two_utterances_16k.wav"

    parser = argparse.ArgumentParser(description="Generate test audio with SiliconFlow TTS (utterances + silence).")
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=default_output,
        help=f"Output WAV file path (default: {default_output})"
    )
    parser.add_argument(
        "-u", "--utterance",
        action="append",
        dest="utterances",
        metavar="TEXT",
        help="Utterance text (repeat for multiple). Ignored if --json is set."
    )
    parser.add_argument(
        "-j", "--json",
        type=Path,
        metavar="PATH",
        help="JSON file with utterances: array of strings or object with 'utterances' key"
    )
    parser.add_argument(
        "--silence-ms",
        type=int,
        default=500,
        metavar="MS",
        help="Silence in ms between utterances (default: 500)"
    )
    parser.add_argument(
        "--lead-silence-ms",
        type=int,
        default=200,
        metavar="MS",
        help="Silence in ms at start of file (default: 200)"
    )
    parser.add_argument(
        "--trail-silence-ms",
        type=int,
        default=300,
        metavar="MS",
        help="Silence in ms at end of file (default: 300)"
    )
    # The options below expose values that used to be fixed in main(); their
    # defaults reproduce the previous behavior exactly.
    parser.add_argument(
        "--voice",
        default="anna",
        metavar="NAME",
        help="TTS voice name (default: anna)"
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        metavar="HZ",
        help="Output sample rate in Hz (default: 16000)"
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        metavar="FACTOR",
        help="Speech speed factor (default: 1.0)"
    )
    return parser.parse_args(argv)


async def main():
    """Main entry point."""
    args = parse_args()

    # Resolve utterances: JSON file > -u args > defaults
    if args.json is not None:
        if not args.json.is_file():
            raise FileNotFoundError(f"Utterances JSON file not found: {args.json}")
        utterances = load_utterances_from_json(args.json)
        if not utterances:
            raise ValueError(f"JSON file has no utterances: {args.json}")
    elif args.utterances:
        utterances = args.utterances
    else:
        utterances = [
            "Hello, how are you doing today?",
            "I'm doing great, thank you for asking!"
        ]

    # Create the output directory only once the input has been validated.
    output_path = args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    await generate_test_audio(
        output_path=str(output_path),
        utterances=utterances,
        silence_ms=args.silence_ms,
        lead_silence_ms=args.lead_silence_ms,
        trail_silence_ms=args.trail_silence_ms,
        voice=args.voice,
        sample_rate=args.sample_rate,
        speed=args.speed
    )


if __name__ == "__main__":
    asyncio.run(main())