Add backend API and engine
engine/scripts/generate_test_audio/generate_test_audio.py (normal file, 311 lines added)
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Generate test audio file with utterances using SiliconFlow TTS API.

Creates a 16kHz mono WAV file with real speech segments separated by
configurable silence (for VAD/testing).

Usage:
    python generate_test_audio.py [OPTIONS]

Options:
    -o, --output PATH       Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
    -u, --utterance TEXT    Utterance text; repeat for multiple (ignored if -j is set)
    -j, --json PATH         JSON file: array of strings or {"utterances": [...]}
    --silence-ms MS         Silence in ms between utterances (default: 500)
    --lead-silence-ms MS    Silence in ms at start (default: 200)
    --trail-silence-ms MS   Silence in ms at end (default: 300)

Examples:
    # Default utterances and output
    python generate_test_audio.py

    # Custom output path
    python generate_test_audio.py -o out.wav

    # Utterances from command line
    python generate_test_audio.py -u "Hello" -u "World" -o test.wav

    # Utterances from JSON file
    python generate_test_audio.py -j utterances.json -o test.wav

    # Custom silence (1s between utterances)
    python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav

Requires SILICONFLOW_API_KEY in .env.
"""

import wave
import struct
import argparse
import asyncio
import aiohttp
import json
import os
from pathlib import Path
from dotenv import load_dotenv


# Load .env file from project root
project_root = Path(__file__).parent.parent.parent
load_dotenv(project_root / ".env")


# SiliconFlow TTS Configuration
SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech"
SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B"

# Available voices
VOICES = {
    "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
    "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
    "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
    "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
    "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
    "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
    "david": "FunAudioLLM/CosyVoice2-0.5B:david",
    "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
}


def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes:
    """Generate silence as PCM bytes."""
    num_samples = int(sample_rate * (duration_ms / 1000.0))
    return b'\x00\x00' * num_samples


async def synthesize_speech(
    text: str,
    api_key: str,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
) -> bytes:
    """
    Synthesize speech using SiliconFlow TTS API.

    Args:
        text: Text to synthesize
        api_key: SiliconFlow API key
        voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
        sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        PCM audio bytes (16-bit signed, little-endian)
    """
    # Resolve voice name
    full_voice = VOICES.get(voice, voice)

    payload = {
        "model": SILICONFLOW_MODEL,
        "input": text,
        "voice": full_voice,
        "response_format": "pcm",
        "sample_rate": sample_rate,
        "stream": False,
        "speed": speed
    }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(SILICONFLOW_API_URL, json=payload, headers=headers) as response:
            if response.status != 200:
                error_text = await response.text()
                raise RuntimeError(f"SiliconFlow TTS error: {response.status} - {error_text}")

            return await response.read()


async def generate_test_audio(
    output_path: str,
    utterances: list[str],
    silence_ms: int = 500,
    lead_silence_ms: int = 200,
    trail_silence_ms: int = 300,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
):
    """
    Generate test audio with multiple utterances separated by silence.

    Args:
        output_path: Path to save the WAV file
        utterances: List of text strings for each utterance
        silence_ms: Silence duration between utterances (milliseconds)
        lead_silence_ms: Silence at the beginning (milliseconds)
        trail_silence_ms: Silence at the end (milliseconds)
        voice: TTS voice to use
        sample_rate: Audio sample rate
        speed: TTS speech speed
    """
    api_key = os.getenv("SILICONFLOW_API_KEY")
    if not api_key:
        raise ValueError(
            "SILICONFLOW_API_KEY not found in environment.\n"
            "Please set it in your .env file:\n"
            " SILICONFLOW_API_KEY=your-api-key-here"
        )

    print(f"Using SiliconFlow TTS API")
    print(f" Voice: {voice}")
    print(f" Sample rate: {sample_rate}Hz")
    print(f" Speed: {speed}x")
    print()

    segments = []

    # Lead-in silence
    if lead_silence_ms > 0:
        segments.append(generate_silence(lead_silence_ms, sample_rate))
        print(f" [silence: {lead_silence_ms}ms]")

    # Generate each utterance with silence between
    for i, text in enumerate(utterances):
        print(f" Synthesizing utterance {i + 1}: \"{text}\"")
        audio = await synthesize_speech(
            text=text,
            api_key=api_key,
            voice=voice,
            sample_rate=sample_rate,
            speed=speed
        )
        segments.append(audio)

        # Add silence between utterances (not after the last one)
        if i < len(utterances) - 1:
            segments.append(generate_silence(silence_ms, sample_rate))
            print(f" [silence: {silence_ms}ms]")

    # Trail silence
    if trail_silence_ms > 0:
        segments.append(generate_silence(trail_silence_ms, sample_rate))
        print(f" [silence: {trail_silence_ms}ms]")

    # Concatenate all segments
    audio_data = b''.join(segments)

    # Write WAV file
    with wave.open(output_path, 'wb') as wf:
        wf.setnchannels(1)  # Mono
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)

    duration_sec = len(audio_data) / (sample_rate * 2)
    print()
    print(f"Generated: {output_path}")
    print(f" Duration: {duration_sec:.2f}s")
    print(f" Sample rate: {sample_rate}Hz")
    print(f" Format: 16-bit mono PCM WAV")
    print(f" Size: {len(audio_data):,} bytes")


def load_utterances_from_json(path: Path) -> list[str]:
    """
    Load utterances from a JSON file.

    Accepts either:
      - A JSON array: ["utterance 1", "utterance 2"]
      - A JSON object with "utterances" key: {"utterances": ["a", "b"]}
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return [str(s) for s in data]
    if isinstance(data, dict) and "utterances" in data:
        return [str(s) for s in data["utterances"]]
    raise ValueError(
        f"JSON file must be an array of strings or an object with 'utterances' key. "
        f"Got: {type(data).__name__}"
    )


def parse_args():
    """Parse command-line arguments."""
    script_dir = Path(__file__).parent
    default_output = script_dir.parent / "data" / "audio_examples" / "two_utterances_16k.wav"

    parser = argparse.ArgumentParser(description="Generate test audio with SiliconFlow TTS (utterances + silence).")
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=default_output,
        help=f"Output WAV file path (default: {default_output})"
    )
    parser.add_argument(
        "-u", "--utterance",
        action="append",
        dest="utterances",
        metavar="TEXT",
        help="Utterance text (repeat for multiple). Ignored if --json is set."
    )
    parser.add_argument(
        "-j", "--json",
        type=Path,
        metavar="PATH",
        help="JSON file with utterances: array of strings or object with 'utterances' key"
    )
    parser.add_argument(
        "--silence-ms",
        type=int,
        default=500,
        metavar="MS",
        help="Silence in ms between utterances (default: 500)"
    )
    parser.add_argument(
        "--lead-silence-ms",
        type=int,
        default=200,
        metavar="MS",
        help="Silence in ms at start of file (default: 200)"
    )
    parser.add_argument(
        "--trail-silence-ms",
        type=int,
        default=300,
        metavar="MS",
        help="Silence in ms at end of file (default: 300)"
    )
    return parser.parse_args()


async def main():
    """Main entry point."""
    args = parse_args()
    output_path = args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Resolve utterances: JSON file > -u args > defaults
    if args.json is not None:
        if not args.json.is_file():
            raise FileNotFoundError(f"Utterances JSON file not found: {args.json}")
        utterances = load_utterances_from_json(args.json)
        if not utterances:
            raise ValueError(f"JSON file has no utterances: {args.json}")
    elif args.utterances:
        utterances = args.utterances
    else:
        utterances = [
            "Hello, how are you doing today?",
            "I'm doing great, thank you for asking!"
        ]

    await generate_test_audio(
        output_path=str(output_path),
        utterances=utterances,
        silence_ms=args.silence_ms,
        lead_silence_ms=args.lead_silence_ms,
        trail_silence_ms=args.trail_silence_ms,
        voice="anna",
        sample_rate=16000,
        speed=1.0
    )


if __name__ == "__main__":
    asyncio.run(main())
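For reference, a minimal sketch of driving the new script end to end through its -j option. The file name utterances.json, the output path, and the silence value are illustrative; the two JSON shapes noted in the comments are the ones load_utterances_from_json accepts, and SILICONFLOW_API_KEY must be present in the project's .env as the module docstring states.

# Sketch: write an utterances file in one of the accepted shapes, then run the script.
import json
import subprocess

# Shape 1: a bare JSON array of strings.
with open("utterances.json", "w", encoding="utf-8") as f:
    json.dump(["Hello, how are you doing today?", "I'm doing great, thank you!"], f)

# Shape 2 would be {"utterances": ["...", "..."]}; both load identically.

# Paths are illustrative; adjust to where the repo is checked out.
subprocess.run(
    [
        "python", "engine/scripts/generate_test_audio/generate_test_audio.py",
        "-j", "utterances.json",
        "-o", "test.wav",
        "--silence-ms", "1000",
    ],
    check=True,
)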
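A second sketch calls the generate_test_audio coroutine directly from test code instead of through the CLI. The import assumes the script's directory is importable (e.g. on sys.path), which is an assumption rather than something this commit sets up; the output name and utterance texts are likewise just examples. The checks mirror what the writer produces: 16-bit mono PCM at the requested sample rate.

# Sketch: build a fixture programmatically and sanity-check the resulting WAV.
import asyncio
import wave

# Hypothetical import path; assumes the script directory is on sys.path.
from generate_test_audio import generate_test_audio


async def build_fixture() -> None:
    await generate_test_audio(
        output_path="three_utterances_16k.wav",  # illustrative output name
        utterances=["One.", "Two.", "Three."],
        silence_ms=800,
        lead_silence_ms=200,
        trail_silence_ms=300,
        voice="anna",
        sample_rate=16000,
    )
    # The script writes 16-bit mono PCM, so these header checks should hold.
    with wave.open("three_utterances_16k.wav", "rb") as wf:
        assert wf.getnchannels() == 1
        assert wf.getsampwidth() == 2
        assert wf.getframerate() == 16000
        print(f"frames: {wf.getnframes()}, "
              f"duration: {wf.getnframes() / wf.getframerate():.2f}s")


asyncio.run(build_fixture())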