#!/usr/bin/env python3
"""
Generate test audio file with utterances using SiliconFlow TTS API.

Creates a 16kHz mono WAV file with real speech segments separated by
configurable silence (for VAD/testing).

Usage:
    python generate_test_audio.py [OPTIONS]

Options:
    -o, --output PATH       Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
    -u, --utterance TEXT    Utterance text; repeat for multiple (ignored if -j is set)
    -j, --json PATH         JSON file: array of strings or {"utterances": [...]}
    --silence-ms MS         Silence in ms between utterances (default: 500)
    --lead-silence-ms MS    Silence in ms at start (default: 200)
    --trail-silence-ms MS   Silence in ms at end (default: 300)

Examples:
    # Default utterances and output
    python generate_test_audio.py

    # Custom output path
    python generate_test_audio.py -o out.wav

    # Utterances from command line
    python generate_test_audio.py -u "Hello" -u "World" -o test.wav

    # Utterances from a JSON file
    python generate_test_audio.py -j utterances.json -o test.wav

    # Custom silence (1s between utterances)
    python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav

Requires SILICONFLOW_API_KEY in .env.
""" import wave import struct import argparse import asyncio import aiohttp import json import os from pathlib import Path from dotenv import load_dotenv # Load .env file from project root project_root = Path(__file__).parent.parent.parent load_dotenv(project_root / ".env") # SiliconFlow TTS Configuration SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech" SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B" # Available voices VOICES = { "alex": "FunAudioLLM/CosyVoice2-0.5B:alex", "anna": "FunAudioLLM/CosyVoice2-0.5B:anna", "bella": "FunAudioLLM/CosyVoice2-0.5B:bella", "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin", "charles": "FunAudioLLM/CosyVoice2-0.5B:charles", "claire": "FunAudioLLM/CosyVoice2-0.5B:claire", "david": "FunAudioLLM/CosyVoice2-0.5B:david", "diana": "FunAudioLLM/CosyVoice2-0.5B:diana", } def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes: """Generate silence as PCM bytes.""" num_samples = int(sample_rate * (duration_ms / 1000.0)) return b'\x00\x00' * num_samples async def synthesize_speech( text: str, api_key: str, voice: str = "anna", sample_rate: int = 16000, speed: float = 1.0 ) -> bytes: """ Synthesize speech using SiliconFlow TTS API. 
Args: text: Text to synthesize api_key: SiliconFlow API key voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana) sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100) speed: Speech speed (0.25 to 4.0) Returns: PCM audio bytes (16-bit signed, little-endian) """ # Resolve voice name full_voice = VOICES.get(voice, voice) payload = { "model": SILICONFLOW_MODEL, "input": text, "voice": full_voice, "response_format": "pcm", "sample_rate": sample_rate, "stream": False, "speed": speed } headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json" } async with aiohttp.ClientSession() as session: async with session.post(SILICONFLOW_API_URL, json=payload, headers=headers) as response: if response.status != 200: error_text = await response.text() raise RuntimeError(f"SiliconFlow TTS error: {response.status} - {error_text}") return await response.read() async def generate_test_audio( output_path: str, utterances: list[str], silence_ms: int = 500, lead_silence_ms: int = 200, trail_silence_ms: int = 300, voice: str = "anna", sample_rate: int = 16000, speed: float = 1.0 ): """ Generate test audio with multiple utterances separated by silence. 
Args: output_path: Path to save the WAV file utterances: List of text strings for each utterance silence_ms: Silence duration between utterances (milliseconds) lead_silence_ms: Silence at the beginning (milliseconds) trail_silence_ms: Silence at the end (milliseconds) voice: TTS voice to use sample_rate: Audio sample rate speed: TTS speech speed """ api_key = os.getenv("SILICONFLOW_API_KEY") if not api_key: raise ValueError( "SILICONFLOW_API_KEY not found in environment.\n" "Please set it in your .env file:\n" " SILICONFLOW_API_KEY=your-api-key-here" ) print(f"Using SiliconFlow TTS API") print(f" Voice: {voice}") print(f" Sample rate: {sample_rate}Hz") print(f" Speed: {speed}x") print() segments = [] # Lead-in silence if lead_silence_ms > 0: segments.append(generate_silence(lead_silence_ms, sample_rate)) print(f" [silence: {lead_silence_ms}ms]") # Generate each utterance with silence between for i, text in enumerate(utterances): print(f" Synthesizing utterance {i + 1}: \"{text}\"") audio = await synthesize_speech( text=text, api_key=api_key, voice=voice, sample_rate=sample_rate, speed=speed ) segments.append(audio) # Add silence between utterances (not after the last one) if i < len(utterances) - 1: segments.append(generate_silence(silence_ms, sample_rate)) print(f" [silence: {silence_ms}ms]") # Trail silence if trail_silence_ms > 0: segments.append(generate_silence(trail_silence_ms, sample_rate)) print(f" [silence: {trail_silence_ms}ms]") # Concatenate all segments audio_data = b''.join(segments) # Write WAV file with wave.open(output_path, 'wb') as wf: wf.setnchannels(1) # Mono wf.setsampwidth(2) # 16-bit wf.setframerate(sample_rate) wf.writeframes(audio_data) duration_sec = len(audio_data) / (sample_rate * 2) print() print(f"Generated: {output_path}") print(f" Duration: {duration_sec:.2f}s") print(f" Sample rate: {sample_rate}Hz") print(f" Format: 16-bit mono PCM WAV") print(f" Size: {len(audio_data):,} bytes") def load_utterances_from_json(path: Path) -> 
list[str]: """ Load utterances from a JSON file. Accepts either: - A JSON array: ["utterance 1", "utterance 2"] - A JSON object with "utterances" key: {"utterances": ["a", "b"]} """ with open(path, encoding="utf-8") as f: data = json.load(f) if isinstance(data, list): return [str(s) for s in data] if isinstance(data, dict) and "utterances" in data: return [str(s) for s in data["utterances"]] raise ValueError( f"JSON file must be an array of strings or an object with 'utterances' key. " f"Got: {type(data).__name__}" ) def parse_args(): """Parse command-line arguments.""" script_dir = Path(__file__).parent default_output = script_dir.parent / "data" / "audio_examples" / "two_utterances_16k.wav" parser = argparse.ArgumentParser(description="Generate test audio with SiliconFlow TTS (utterances + silence).") parser.add_argument( "-o", "--output", type=Path, default=default_output, help=f"Output WAV file path (default: {default_output})" ) parser.add_argument( "-u", "--utterance", action="append", dest="utterances", metavar="TEXT", help="Utterance text (repeat for multiple). Ignored if --json is set." 
) parser.add_argument( "-j", "--json", type=Path, metavar="PATH", help="JSON file with utterances: array of strings or object with 'utterances' key" ) parser.add_argument( "--silence-ms", type=int, default=500, metavar="MS", help="Silence in ms between utterances (default: 500)" ) parser.add_argument( "--lead-silence-ms", type=int, default=200, metavar="MS", help="Silence in ms at start of file (default: 200)" ) parser.add_argument( "--trail-silence-ms", type=int, default=300, metavar="MS", help="Silence in ms at end of file (default: 300)" ) return parser.parse_args() async def main(): """Main entry point.""" args = parse_args() output_path = args.output output_path.parent.mkdir(parents=True, exist_ok=True) # Resolve utterances: JSON file > -u args > defaults if args.json is not None: if not args.json.is_file(): raise FileNotFoundError(f"Utterances JSON file not found: {args.json}") utterances = load_utterances_from_json(args.json) if not utterances: raise ValueError(f"JSON file has no utterances: {args.json}") elif args.utterances: utterances = args.utterances else: utterances = [ "Hello, how are you doing today?", "I'm doing great, thank you for asking!" ] await generate_test_audio( output_path=str(output_path), utterances=utterances, silence_ms=args.silence_ms, lead_silence_ms=args.lead_silence_ms, trail_silence_ms=args.trail_silence_ms, voice="anna", sample_rate=16000, speed=1.0 ) if __name__ == "__main__": asyncio.run(main())