Add backend api and engine

2026-02-06 14:01:34 +08:00
parent 590014e821
commit d5c1ab34b3
61 changed files with 10351 additions and 1 deletions
--- a/engine/scripts/generate_test_audio/generate_test_audio.py
+++ b/engine/scripts/generate_test_audio/generate_test_audio.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+Generate test audio file with utterances using SiliconFlow TTS API.
+
+Creates a 16kHz mono WAV file with real speech segments separated by
+configurable silence (for VAD/testing).
+
+Usage:
+  python generate_test_audio.py [OPTIONS]
+
+Options:
+  -o, --output PATH       Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
+  -u, --utterance TEXT    Utterance text; repeat for multiple (ignored if -j is set)
+  -j, --json PATH         JSON file: array of strings or {"utterances": [...]}
+  --silence-ms MS         Silence in ms between utterances (default: 500)
+  --lead-silence-ms MS    Silence in ms at start (default: 200)
+  --trail-silence-ms MS   Silence in ms at end (default: 300)
+
+Examples:
+  # Default utterances and output
+  python generate_test_audio.py
+
+  # Custom output path
+  python generate_test_audio.py -o out.wav
+
+  # Utterances from command line
+  python generate_test_audio.py -u "Hello" -u "World" -o test.wav
+
+  # Utterances from a JSON file
+  python generate_test_audio.py -j utterances.json -o test.wav
+
+  # Custom silence (1s between utterances)
+  python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
+
+Requires SILICONFLOW_API_KEY in .env.
+"""
+
+import wave
+import struct
+import argparse
+import asyncio
+import aiohttp
+import json
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+
+
+# Load .env file from project root
+# The root is taken to be two levels above the directory containing this
+# script (Path(__file__).parent.parent.parent); SILICONFLOW_API_KEY is later
+# read from the environment with os.getenv().
+# NOTE(review): presumably that directory is where .env lives — confirm
+# against the repository layout.
+project_root = Path(__file__).parent.parent.parent
+load_dotenv(project_root / ".env")
+
+
+# SiliconFlow TTS endpoint and model identifier
+SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech"
+SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
+
+# Short voice name -> full voice identifier ("<model>:<name>")
+VOICES = {
+    name: f"{SILICONFLOW_MODEL}:{name}"
+    for name in (
+        "alex",
+        "anna",
+        "bella",
+        "benjamin",
+        "charles",
+        "claire",
+        "david",
+        "diana",
+    )
+}
+
+
+def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes:
+    """
+    Generate silence as 16-bit mono PCM bytes.
+
+    Args:
+        duration_ms: Length of the silence in milliseconds. Zero or negative
+            values produce an empty byte string.
+        sample_rate: Samples per second of the target audio stream.
+
+    Returns:
+        Two zero bytes per sample, i.e. ``2 * num_samples`` bytes in total.
+    """
+    # Multiply before dividing so the sample count is exact. Scaling by a
+    # float fraction first can land just below the true integer (for example
+    # 100 * 0.57 evaluates to 56.99999999999999) and drop a sample on
+    # truncation.
+    num_samples = max(0, int(sample_rate * duration_ms // 1000))
+    return b'\x00\x00' * num_samples
+
+
+async def synthesize_speech(
+    text: str,
+    api_key: str,
+    voice: str = "anna",
+    sample_rate: int = 16000,
+    speed: float = 1.0
+) -> bytes:
+    """
+    Request synthesized speech for one piece of text from the SiliconFlow TTS API.
+
+    Args:
+        text: Text to synthesize
+        api_key: SiliconFlow API key, sent as a Bearer token
+        voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana);
+            a name not found in VOICES is sent to the API unchanged
+        sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
+        speed: Speech speed (0.25 to 4.0)
+
+    Returns:
+        PCM audio bytes (16-bit signed, little-endian)
+
+    Raises:
+        RuntimeError: If the API answers with a status other than 200
+    """
+    request_headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    request_body = dict(
+        model=SILICONFLOW_MODEL,
+        input=text,
+        voice=VOICES.get(voice, voice),  # short name -> full id, else pass through
+        response_format="pcm",
+        sample_rate=sample_rate,
+        stream=False,
+        speed=speed,
+    )
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(
+            SILICONFLOW_API_URL, json=request_body, headers=request_headers
+        ) as response:
+            if response.status == 200:
+                return await response.read()
+
+            # Any other status: surface the code and the server's message
+            error_text = await response.text()
+            raise RuntimeError(f"SiliconFlow TTS error: {response.status} - {error_text}")
+
+
+async def generate_test_audio(
+    output_path: str,
+    utterances: list[str],
+    silence_ms: int = 500,
+    lead_silence_ms: int = 200,
+    trail_silence_ms: int = 300,
+    voice: str = "anna",
+    sample_rate: int = 16000,
+    speed: float = 1.0
+) -> None:
+    """
+    Generate test audio with multiple utterances separated by silence.
+
+    The utterances are synthesized one after another, joined with silence
+    segments, and written as a 16-bit mono WAV file. Progress and a summary
+    are printed to stdout.
+
+    Args:
+        output_path: Path to save the WAV file (str or path-like object)
+        utterances: List of text strings for each utterance
+        silence_ms: Silence duration between utterances (milliseconds);
+            values <= 0 add no silence
+        lead_silence_ms: Silence at the beginning (milliseconds);
+            values <= 0 add no silence
+        trail_silence_ms: Silence at the end (milliseconds);
+            values <= 0 add no silence
+        voice: TTS voice to use
+        sample_rate: Audio sample rate
+        speed: TTS speech speed
+
+    Raises:
+        ValueError: If SILICONFLOW_API_KEY is not set in the environment
+        RuntimeError: Propagated from synthesize_speech when the API call fails
+    """
+    api_key = os.getenv("SILICONFLOW_API_KEY")
+    if not api_key:
+        raise ValueError(
+            "SILICONFLOW_API_KEY not found in environment.\n"
+            "Please set it in your .env file:\n"
+            "  SILICONFLOW_API_KEY=your-api-key-here"
+        )
+
+    # wave.open() takes a string filename (anything else is treated as a
+    # file-like object), so normalize path-like arguments up front.
+    output_path = os.fspath(output_path)
+
+    print("Using SiliconFlow TTS API")
+    print(f"  Voice: {voice}")
+    print(f"  Sample rate: {sample_rate}Hz")
+    print(f"  Speed: {speed}x")
+    print()
+
+    segments = []
+
+    # Lead-in silence
+    if lead_silence_ms > 0:
+        segments.append(generate_silence(lead_silence_ms, sample_rate))
+        print(f"  [silence: {lead_silence_ms}ms]")
+
+    # Generate each utterance with silence between
+    last_index = len(utterances) - 1
+    for i, text in enumerate(utterances):
+        print(f"  Synthesizing utterance {i + 1}: \"{text}\"")
+        audio = await synthesize_speech(
+            text=text,
+            api_key=api_key,
+            voice=voice,
+            sample_rate=sample_rate,
+            speed=speed
+        )
+        if len(audio) % 2:
+            # 16-bit samples need an even byte count; drop a stray trailing
+            # byte so every following segment stays sample-aligned.
+            audio = audio[:-1]
+        segments.append(audio)
+
+        # Add silence between utterances (not after the last one), using the
+        # same "> 0" guard as the lead and trail silence.
+        if i < last_index and silence_ms > 0:
+            segments.append(generate_silence(silence_ms, sample_rate))
+            print(f"  [silence: {silence_ms}ms]")
+
+    # Trail silence
+    if trail_silence_ms > 0:
+        segments.append(generate_silence(trail_silence_ms, sample_rate))
+        print(f"  [silence: {trail_silence_ms}ms]")
+
+    # Concatenate all segments
+    audio_data = b''.join(segments)
+
+    # Write WAV file
+    with wave.open(output_path, 'wb') as wf:
+        wf.setnchannels(1)          # Mono
+        wf.setsampwidth(2)          # 16-bit
+        wf.setframerate(sample_rate)
+        wf.writeframes(audio_data)
+
+    # Two bytes per sample, one channel
+    duration_sec = len(audio_data) / (sample_rate * 2)
+    print()
+    print(f"Generated: {output_path}")
+    print(f"  Duration: {duration_sec:.2f}s")
+    print(f"  Sample rate: {sample_rate}Hz")
+    print("  Format: 16-bit mono PCM WAV")
+    print(f"  Size: {len(audio_data):,} bytes")
+
+
+def load_utterances_from_json(path: Path) -> list[str]:
+    """
+    Load utterances from a JSON file.
+
+    Accepts either:
+    - A JSON array: ["utterance 1", "utterance 2"]
+    - A JSON object with "utterances" key: {"utterances": ["a", "b"]}
+
+    Args:
+        path: Location of the JSON file (UTF-8, with or without a BOM)
+
+    Returns:
+        The utterances in file order; non-string items are converted with str()
+
+    Raises:
+        ValueError: If the document has neither accepted shape, or the
+            "utterances" value is not an array
+    """
+    # "utf-8-sig" also reads plain UTF-8, and strips a leading BOM that
+    # json.load would otherwise reject.
+    with open(path, encoding="utf-8-sig") as f:
+        data = json.load(f)
+
+    if isinstance(data, list):
+        items = data
+    elif isinstance(data, dict) and "utterances" in data:
+        items = data["utterances"]
+        if not isinstance(items, list):
+            # Iterating e.g. a string here would silently produce one
+            # utterance per character.
+            raise ValueError(
+                f"'utterances' must be an array of strings. "
+                f"Got: {type(items).__name__}"
+            )
+    elif isinstance(data, dict):
+        raise ValueError(
+            "JSON file must be an array of strings or an object with 'utterances' key. "
+            "Got: object without 'utterances' key"
+        )
+    else:
+        raise ValueError(
+            f"JSON file must be an array of strings or an object with 'utterances' key. "
+            f"Got: {type(data).__name__}"
+        )
+
+    return [str(s) for s in items]
+
+
+def _non_negative_int(value: str) -> int:
+    """argparse type: parse an integer and reject negative values."""
+    try:
+        number = int(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
+    if number < 0:
+        raise argparse.ArgumentTypeError(f"must be >= 0, got {number}")
+    return number
+
+
+def parse_args(argv=None):
+    """
+    Parse command-line arguments.
+
+    Args:
+        argv: Argument list to parse; None (the default) parses sys.argv[1:]
+
+    Returns:
+        argparse.Namespace with: output, utterances, json, silence_ms,
+        lead_silence_ms, trail_silence_ms, voice, sample_rate, speed
+    """
+    script_dir = Path(__file__).parent
+    default_output = script_dir.parent / "data" / "audio_examples" / "two_utterances_16k.wav"
+
+    parser = argparse.ArgumentParser(description="Generate test audio with SiliconFlow TTS (utterances + silence).")
+    parser.add_argument(
+        "-o", "--output",
+        type=Path,
+        default=default_output,
+        help=f"Output WAV file path (default: {default_output})"
+    )
+    parser.add_argument(
+        "-u", "--utterance",
+        action="append",
+        dest="utterances",
+        metavar="TEXT",
+        help="Utterance text (repeat for multiple). Ignored if --json is set."
+    )
+    parser.add_argument(
+        "-j", "--json",
+        type=Path,
+        metavar="PATH",
+        help="JSON file with utterances: array of strings or object with 'utterances' key"
+    )
+    # Silence durations are validated here so a negative value is reported
+    # as a usage error instead of silently producing no silence.
+    parser.add_argument(
+        "--silence-ms",
+        type=_non_negative_int,
+        default=500,
+        metavar="MS",
+        help="Silence in ms between utterances (default: 500)"
+    )
+    parser.add_argument(
+        "--lead-silence-ms",
+        type=_non_negative_int,
+        default=200,
+        metavar="MS",
+        help="Silence in ms at start of file (default: 200)"
+    )
+    parser.add_argument(
+        "--trail-silence-ms",
+        type=_non_negative_int,
+        default=300,
+        metavar="MS",
+        help="Silence in ms at end of file (default: 300)"
+    )
+    # TTS settings; the defaults match the values previously fixed in main().
+    parser.add_argument(
+        "--voice",
+        default="anna",
+        metavar="NAME",
+        help="TTS voice name or full voice identifier (default: anna)"
+    )
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        choices=(8000, 16000, 24000, 32000, 44100),
+        default=16000,
+        help="Output sample rate in Hz (default: 16000)"
+    )
+    parser.add_argument(
+        "--speed",
+        type=float,
+        default=1.0,
+        metavar="FACTOR",
+        help="Speech speed, 0.25 to 4.0 (default: 1.0)"
+    )
+    return parser.parse_args(argv)
+
+
+async def main():
+    """
+    Main entry point: parse the CLI, resolve the utterances, generate the WAV.
+
+    Raises:
+        FileNotFoundError: If --json names a file that does not exist
+        ValueError: If the JSON file yields no utterances
+    """
+    args = parse_args()
+    output_path = args.output
+
+    # Resolve utterances: JSON file > -u args > defaults
+    if args.json is not None:
+        if not args.json.is_file():
+            raise FileNotFoundError(f"Utterances JSON file not found: {args.json}")
+        utterances = load_utterances_from_json(args.json)
+        if not utterances:
+            raise ValueError(f"JSON file has no utterances: {args.json}")
+    elif args.utterances:
+        utterances = args.utterances
+    else:
+        utterances = [
+            "Hello, how are you doing today?",
+            "I'm doing great, thank you for asking!"
+        ]
+
+    # Create the output directory only once the inputs are known to be valid
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    await generate_test_audio(
+        output_path=str(output_path),
+        utterances=utterances,
+        silence_ms=args.silence_ms,
+        lead_silence_ms=args.lead_silence_ms,
+        trail_silence_ms=args.trail_silence_ms,
+        voice=args.voice,
+        sample_rate=args.sample_rate,
+        speed=args.speed
+    )
+
+
+# Script entry point: only when executed directly (not on import), run the
+# async main() coroutine to completion.
+if __name__ == "__main__":
+    asyncio.run(main())