Add generate test audio script

2026-02-04 10:32:54 +08:00
parent 8bc24ded59
commit 5aa9a12ca8
3 changed files with 789 additions and 0 deletions
--- a/examples/wav_client.py
+++ b/examples/wav_client.py
@@ -0,0 +1,476 @@
 #!/usr/bin/env python3
 """
 WAV file client for testing duplex voice conversation.
 This client reads audio from a WAV file, sends it to the server,
 and saves the AI's voice response to an output WAV file.
 Usage:
    python examples/wav_client.py --input input.wav --output response.wav
    python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
    python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
    python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
 Requirements:
    pip install soundfile websockets numpy
 """
 import argparse
 import asyncio
 import json
 import sys
 import time
 import wave
 from pathlib import Path
 try:
    import numpy as np
 except ImportError:
    print("Please install numpy: pip install numpy")
    sys.exit(1)
 try:
    import soundfile as sf
 except ImportError:
    print("Please install soundfile: pip install soundfile")
    sys.exit(1)
 try:
    import websockets
 except ImportError:
    print("Please install websockets: pip install websockets")
    sys.exit(1)
 class WavFileClient:
    """
    WAV file client for voice conversation testing.

    Streams a WAV file to a WebSocket server in fixed-duration chunks,
    collects the binary audio frames the server sends back and writes
    them to an output WAV file.

    Features:
    - Read audio from WAV file
    - Send audio to WebSocket server
    - Receive and save response audio
    - Event logging
    """

    def __init__(
        self,
        url: str,
        input_file: str,
        output_file: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
        verbose: bool = False
    ):
        """
        Initialize WAV file client.

        Args:
            url: WebSocket server URL
            input_file: Input WAV file path
            output_file: Output WAV file path
            sample_rate: Audio sample rate (Hz)
            chunk_duration_ms: Audio chunk duration (ms) for sending
            wait_time: Time to wait for response after sending (seconds)
            verbose: Enable verbose output

        Raises:
            ValueError: If sample_rate is not positive, or if the chunk
                duration is too short to contain at least one sample.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
        if chunk_samples < 1:
            # A zero-sized chunk would never advance the send position,
            # so audio_sender() would loop forever.
            raise ValueError(
                f"chunk_duration_ms={chunk_duration_ms} yields no samples "
                f"at {sample_rate} Hz"
            )
        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = chunk_samples
        self.wait_time = wait_time
        self.verbose = verbose
        # WebSocket connection
        self.ws = None
        self.running = False
        # Audio buffers
        self.received_audio = bytearray()
        # Statistics
        self.bytes_sent = 0
        self.bytes_received = 0
        # TTFB tracking
        self.send_start_time = None
        self.first_audio_received = False
        self.ttfb_ms = None
        # State tracking
        self.track_started = False
        self.track_ended = False
        self.send_completed = False
        # Events log
        self.events_log = []

    def log_event(self, direction: str, message: str):
        """Record an event with a timestamp in events_log and print it."""
        timestamp = time.time()
        self.events_log.append({
            "timestamp": timestamp,
            "direction": direction,
            "message": message
        })
        print(f"{direction} {message}")

    async def connect(self) -> None:
        """Connect to the WebSocket server and send the invite command."""
        self.log_event("→", f"Connecting to {self.url}...")
        self.ws = await websockets.connect(self.url)
        self.running = True
        self.log_event("←", "Connected!")
        # Send invite command
        await self.send_command({
            "command": "invite",
            "option": {
                "codec": "pcm",
                "sampleRate": self.sample_rate
            }
        })

    async def send_command(self, cmd: dict) -> None:
        """Send a JSON command to the server (no-op when not connected)."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
            self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")

    async def send_hangup(self, reason: str = "Session complete") -> None:
        """Send hangup command."""
        await self.send_command({
            "command": "hangup",
            "reason": reason
        })

    def load_wav_file(self) -> tuple[np.ndarray, int]:
        """
        Load and prepare WAV file for sending.

        The audio is down-mixed to mono, linearly resampled to
        self.sample_rate when the file rate differs, and converted to int16.

        Returns:
            Tuple of (audio_data as int16 numpy array, original sample rate)

        Raises:
            FileNotFoundError: If the input file does not exist.
            ValueError: If the input file contains no audio samples.
        """
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")
        # Load audio file
        audio_data, file_sample_rate = sf.read(self.input_file)
        if len(audio_data) == 0:
            # Fail with a clear message instead of an opaque numpy error
            # from the resampling / normalization steps below.
            raise ValueError(f"Input file contains no audio samples: {self.input_file}")
        self.log_event("→", f"Loaded: {self.input_file}")
        self.log_event("→", f"  Original sample rate: {file_sample_rate} Hz")
        self.log_event("→", f"  Duration: {len(audio_data) / file_sample_rate:.2f}s")
        # Convert stereo to mono if needed
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)
            self.log_event("→", "  Converted stereo to mono")
        # Resample if needed
        if file_sample_rate != self.sample_rate:
            # Simple resampling using numpy (linear interpolation)
            duration = len(audio_data) / file_sample_rate
            num_samples = int(duration * self.sample_rate)
            indices = np.linspace(0, len(audio_data) - 1, num_samples)
            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
            self.log_event("→", f"  Resampled to {self.sample_rate} Hz")
        # Convert to int16
        if audio_data.dtype != np.int16:
            # Normalize to [-1, 1] if needed
            max_val = np.max(np.abs(audio_data))
            if max_val > 1.0:
                audio_data = audio_data / max_val
            audio_data = (audio_data * 32767).astype(np.int16)
        self.log_event("→", f"  Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
        return audio_data, file_sample_rate

    async def audio_sender(self, audio_data: np.ndarray) -> None:
        """
        Send audio data to the server in chunks of self.chunk_samples.

        Sleeps chunk_duration_ms between chunks so the stream is paced
        roughly like live audio. Sets send_completed when the loop ends.
        """
        total_samples = len(audio_data)
        chunk_size = self.chunk_samples
        chunk_delay = self.chunk_duration_ms / 1000
        # Report progress about every 500ms worth of audio. A threshold is
        # used rather than an exact modulo match so it also fires when the
        # chunk size does not divide that interval evenly.
        progress_step = max(self.sample_rate // 2, 1)
        next_progress = progress_step
        sent_samples = 0
        self.send_start_time = time.time()
        self.log_event("→", f"Starting audio transmission ({total_samples} samples)...")
        while sent_samples < total_samples and self.running:
            # Get next chunk
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk_bytes = audio_data[sent_samples:end_sample].tobytes()
            # Send to server
            if self.ws:
                await self.ws.send(chunk_bytes)
                self.bytes_sent += len(chunk_bytes)
            sent_samples = end_sample
            if self.verbose and sent_samples >= next_progress:
                progress = (sent_samples / total_samples) * 100
                print(f"  Sending: {progress:.0f}%", end="\r")
                next_progress = sent_samples + progress_step
            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            await asyncio.sleep(chunk_delay)
        self.send_completed = True
        elapsed = time.time() - self.send_start_time
        self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")

    async def receiver(self) -> None:
        """
        Receive messages from the server until self.running is cleared.

        Binary messages are appended to received_audio; other messages are
        parsed as JSON events and passed to _handle_event.
        """
        try:
            while self.running:
                try:
                    # Short timeout so the loop re-checks self.running regularly
                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
                    if isinstance(message, bytes):
                        # Audio data received
                        self.bytes_received += len(message)
                        self.received_audio.extend(message)
                        # Calculate TTFB on first audio
                        if not self.first_audio_received and self.send_start_time:
                            self.ttfb_ms = (time.time() - self.send_start_time) * 1000
                            self.first_audio_received = True
                            self.log_event("←", f"[TTFB] First audio latency: {self.ttfb_ms:.0f}ms")
                        # Log progress (2 bytes per 16-bit sample)
                        if self.verbose:
                            duration_ms = len(message) / (self.sample_rate * 2) * 1000
                            total_ms = len(self.received_audio) / (self.sample_rate * 2) * 1000
                            print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
                    else:
                        # JSON event; one malformed message must not end the session
                        try:
                            event = json.loads(message)
                        except json.JSONDecodeError:
                            self.log_event("!", f"Ignoring non-JSON message: {message[:80]!r}")
                            continue
                        if isinstance(event, dict):
                            await self._handle_event(event)
                        else:
                            self.log_event("!", f"Ignoring unexpected JSON message: {message[:80]!r}")
                except asyncio.TimeoutError:
                    continue
                except websockets.ConnectionClosed:
                    self.log_event("←", "Connection closed")
                    self.running = False
                    break
        except asyncio.CancelledError:
            pass
        except Exception as e:
            self.log_event("!", f"Receiver error: {e}")
            self.running = False

    async def _handle_event(self, event: dict) -> None:
        """Log an incoming JSON event and update the session state flags."""
        event_type = event.get("event", "unknown")
        if event_type == "answer":
            self.log_event("←", "Session ready!")
        elif event_type == "speaking":
            self.log_event("←", "Speech detected")
        elif event_type == "silence":
            self.log_event("←", "Silence detected")
        elif event_type == "transcript":
            text = event.get("text", "")
            is_final = event.get("isFinal", False)
            if is_final:
                self.log_event("←", f"Transcript (final): {text}")
            elif self.verbose:
                self.log_event("←", f"Transcript (interim): {text[:50]}...")
        elif event_type == "ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "trackStart":
            self.track_started = True
            self.log_event("←", "Bot started speaking")
        elif event_type == "trackEnd":
            self.track_ended = True
            self.log_event("←", "Bot finished speaking")
        elif event_type == "interrupt":
            self.log_event("←", "Bot interrupted!")
        elif event_type == "error":
            self.log_event("!", f"Error: {event.get('error')}")
        elif event_type == "hangup":
            self.log_event("←", f"Hangup: {event.get('reason')}")
            self.running = False
        else:
            self.log_event("←", f"Event: {event_type}")

    def save_output_wav(self) -> None:
        """
        Save received audio to the output WAV file.

        The received bytes are written as 16-bit mono frames at
        self.sample_rate. A trailing odd byte (an incomplete sample) is
        dropped so the file always holds whole samples.
        """
        pcm = bytes(self.received_audio)
        if len(pcm) % 2:
            self.log_event("!", "Dropping trailing odd byte from received audio")
            pcm = pcm[:-1]
        if not pcm:
            self.log_event("!", "No audio received to save")
            return
        num_samples = len(pcm) // 2
        # Ensure output directory exists
        self.output_file.parent.mkdir(parents=True, exist_ok=True)
        # Save using wave module for compatibility
        with wave.open(str(self.output_file), 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm)
        duration = num_samples / self.sample_rate
        self.log_event("→", f"Saved output: {self.output_file}")
        self.log_event("→", f"  Duration: {duration:.2f}s ({num_samples} samples)")
        self.log_event("→", f"  Size: {len(self.received_audio)/1024:.1f} KB")

    async def run(self) -> None:
        """
        Run the WAV file test.

        Loads the input, connects, streams the audio, waits up to
        wait_time seconds for the response, then saves the output and
        prints a summary. On any error a message is printed and the
        process exits with status 1; the connection is always closed.
        """
        try:
            # Load input WAV file
            audio_data, _ = self.load_wav_file()
            # Connect to server
            await self.connect()
            # Wait for answer
            await asyncio.sleep(0.5)
            # Start receiver task
            receiver_task = asyncio.create_task(self.receiver())
            # Send audio
            await self.audio_sender(audio_data)
            # Wait for response
            self.log_event("→", f"Waiting {self.wait_time}s for response...")
            wait_start = time.time()
            while self.running and (time.time() - wait_start) < self.wait_time:
                # Check if track has ended (response complete)
                if self.track_ended and self.send_completed:
                    # Give a little extra time for any remaining audio
                    await asyncio.sleep(1.0)
                    break
                await asyncio.sleep(0.1)
            # Cleanup
            self.running = False
            receiver_task.cancel()
            try:
                await receiver_task
            except asyncio.CancelledError:
                pass
            # Save output
            self.save_output_wav()
            # Print summary
            self._print_summary()
        except FileNotFoundError as e:
            print(f"Error: {e}")
            sys.exit(1)
        except ConnectionRefusedError:
            print(f"Error: Could not connect to {self.url}")
            print("Make sure the server is running.")
            sys.exit(1)
        except Exception as e:
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
        finally:
            await self.close()

    def _print_summary(self):
        """Print session summary."""
        print("\n" + "=" * 50)
        print("Session Summary")
        print("=" * 50)
        print(f"  Input file:  {self.input_file}")
        print(f"  Output file: {self.output_file}")
        print(f"  Bytes sent:     {self.bytes_sent / 1024:.1f} KB")
        print(f"  Bytes received: {self.bytes_received / 1024:.1f} KB")
        # Compare against None so a measured latency of 0 ms is still shown
        if self.ttfb_ms is not None:
            print(f"  TTFB:           {self.ttfb_ms:.0f} ms")
        if self.received_audio:
            duration = len(self.received_audio) / (self.sample_rate * 2)
            print(f"  Response duration: {duration:.2f}s")
        print("=" * 50)

    async def close(self) -> None:
        """Stop the session and close the connection (best effort)."""
        self.running = False
        if self.ws:
            try:
                await self.ws.close()
            except Exception:
                # Closing is best effort; the socket may already be gone.
                # Only Exception is caught so that task cancellation and
                # KeyboardInterrupt still propagate.
                pass
 async def main():
    """Parse the command-line options, build a WavFileClient and run it."""
    cli = argparse.ArgumentParser(
        description="WAV file client for testing duplex voice conversation"
    )
    # Required file paths
    cli.add_argument("--input", "-i", required=True, help="Input WAV file path")
    cli.add_argument("--output", "-o", required=True, help="Output WAV file path for response")
    # Connection and streaming settings
    cli.add_argument(
        "--url",
        default="ws://localhost:8000/ws",
        help="WebSocket server URL (default: ws://localhost:8000/ws)",
    )
    cli.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="Target sample rate for audio (default: 16000)",
    )
    cli.add_argument(
        "--chunk-duration",
        type=int,
        default=20,
        help="Chunk duration in ms for sending (default: 20)",
    )
    cli.add_argument(
        "--wait-time", "-w",
        type=float,
        default=15.0,
        help="Time to wait for response after sending (default: 15.0)",
    )
    cli.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    opts = cli.parse_args()
    session = WavFileClient(
        url=opts.url,
        input_file=opts.input,
        output_file=opts.output,
        sample_rate=opts.sample_rate,
        chunk_duration_ms=opts.chunk_duration,
        wait_time=opts.wait_time,
        verbose=opts.verbose,
    )
    await session.run()
 # Script entry point: Ctrl-C ends the run with a short notice
 if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nInterrupted by user")
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -0,0 +1 @@
 # Development Script
--- a/scripts/generate_test_audio/generate_test_audio.py
+++ b/scripts/generate_test_audio/generate_test_audio.py
@@ -0,0 +1,312 @@
 #!/usr/bin/env python3
 """
 Generate test audio file with utterances using SiliconFlow TTS API.
 Creates a 16kHz mono WAV file with real speech segments separated by
 configurable silence (for VAD/testing).
 Usage:
  python scripts/generate_test_audio/generate_test_audio.py [OPTIONS]
 Options:
  -o, --output PATH       Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
  -u, --utterance TEXT    Utterance text; repeat for multiple (ignored if -j is set)
  -j, --json PATH         JSON file: array of strings or {"utterances": [...]}
  --silence-ms MS         Silence in ms between utterances (default: 500)
  --lead-silence-ms MS    Silence in ms at start (default: 200)
  --trail-silence-ms MS   Silence in ms at end (default: 300)
 Examples:
  # Default utterances and output
  python scripts/generate_test_audio/generate_test_audio.py
  # Custom output path
  python scripts/generate_test_audio/generate_test_audio.py -o out.wav
  # Utterances from command line
  python scripts/generate_test_audio/generate_test_audio.py -u "Hello" -u "World" -o test.wav
  # Utterances from JSON file
  python scripts/generate_test_audio/generate_test_audio.py -j utterances.json -o test.wav
  # Custom silence (1s between utterances)
  python scripts/generate_test_audio/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
 Requires SILICONFLOW_API_KEY in .env.
 """
 import wave
 import struct
 import argparse
 import asyncio
 import aiohttp
 import json
 import os
 from pathlib import Path
 from dotenv import load_dotenv
 # Load the .env file. The directory two levels above this file is tried
 # first (the original location); if no .env is there, one level further up
 # and then the script's own directory are tried, so the file is still found
 # when the script lives in a sub-directory of scripts/. When no candidate
 # holds a .env, project_root keeps the original location and load_dotenv
 # is pointed at a missing file.
 _script_dir = Path(__file__).resolve().parent
 _env_candidates = (_script_dir.parent, _script_dir.parent.parent, _script_dir)
 project_root = next(
    (candidate for candidate in _env_candidates if (candidate / ".env").is_file()),
    _script_dir.parent,
 )
 load_dotenv(project_root / ".env")
 # SiliconFlow TTS endpoint and model identifier
 SILICONFLOW_API_URL = "https://api.siliconflow.cn/v1/audio/speech"
 SILICONFLOW_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
 # Short voice name -> fully qualified "<model>:<voice>" identifier
 VOICES = {
    voice_name: f"{SILICONFLOW_MODEL}:{voice_name}"
    for voice_name in (
        "alex",
        "anna",
        "bella",
        "benjamin",
        "charles",
        "claire",
        "david",
        "diana",
    )
 }
 def generate_silence(duration_ms: int, sample_rate: int = 16000) -> bytes:
    """Return duration_ms of silence as zero-valued 16-bit PCM bytes.

    A zero or negative duration yields an empty bytes object.
    """
    frame_count = int(sample_rate * (duration_ms / 1000.0))
    if frame_count <= 0:
        return b''
    # Two zero bytes per 16-bit sample
    return bytes(2 * frame_count)
 async def synthesize_speech(
    text: str,
    api_key: str,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
 ) -> bytes:
    """
    Request speech for a text from the SiliconFlow TTS API.

    Args:
        text: Text to synthesize
        api_key: SiliconFlow API key, sent as a Bearer token
        voice: Short voice name looked up in VOICES (alex, anna, bella,
            benjamin, charles, claire, david, diana); any other value is
            passed to the API unchanged
        sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        The raw response body; "pcm" is requested as the response format
        (16-bit signed, little-endian per the original documentation).

    Raises:
        RuntimeError: If the API answers with a status other than 200.
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    request_body = {
        "model": SILICONFLOW_MODEL,
        "input": text,
        # Unknown names fall through as-is so a full voice id also works
        "voice": VOICES.get(voice, voice),
        "response_format": "pcm",
        "sample_rate": sample_rate,
        "stream": False,
        "speed": speed
    }
    async with aiohttp.ClientSession() as http:
        async with http.post(SILICONFLOW_API_URL, json=request_body, headers=request_headers) as reply:
            if reply.status == 200:
                return await reply.read()
            detail = await reply.text()
            raise RuntimeError(f"SiliconFlow TTS error: {reply.status} - {detail}")
 async def generate_test_audio(
    output_path: str,
    utterances: list[str],
    silence_ms: int = 500,
    lead_silence_ms: int = 200,
    trail_silence_ms: int = 300,
    voice: str = "anna",
    sample_rate: int = 16000,
    speed: float = 1.0
 ):
    """
    Generate test audio with multiple utterances separated by silence.

    Each utterance is synthesized in order with synthesize_speech, the
    segments are joined with silence and written as a 16-bit mono WAV file.

    Args:
        output_path: Path to save the WAV file (str or path-like)
        utterances: List of text strings for each utterance
        silence_ms: Silence duration between utterances (milliseconds)
        lead_silence_ms: Silence at the beginning (milliseconds)
        trail_silence_ms: Silence at the end (milliseconds)
        voice: TTS voice to use
        sample_rate: Audio sample rate
        speed: TTS speech speed

    Raises:
        ValueError: If SILICONFLOW_API_KEY is not set in the environment.
    """
    api_key = os.getenv("SILICONFLOW_API_KEY")
    if not api_key:
        raise ValueError(
            "SILICONFLOW_API_KEY not found in environment.\n"
            "Please set it in your .env file:\n"
            "  SILICONFLOW_API_KEY=your-api-key-here"
        )
    print("Using SiliconFlow TTS API")
    print(f"  Voice: {voice}")
    print(f"  Sample rate: {sample_rate}Hz")
    print(f"  Speed: {speed}x")
    print()
    segments = []
    # Lead-in silence
    if lead_silence_ms > 0:
        segments.append(generate_silence(lead_silence_ms, sample_rate))
        print(f"  [silence: {lead_silence_ms}ms]")
    # Generate each utterance with silence between
    last_index = len(utterances) - 1
    for i, text in enumerate(utterances):
        print(f"  Synthesizing utterance {i + 1}: \"{text}\"")
        audio = await synthesize_speech(
            text=text,
            api_key=api_key,
            voice=voice,
            sample_rate=sample_rate,
            speed=speed
        )
        if len(audio) % 2:
            # An odd byte count would shift every following 16-bit sample
            # by one byte; drop the incomplete trailing sample.
            audio = audio[:-1]
        segments.append(audio)
        # Add silence between utterances (not after the last one)
        if i < last_index:
            segments.append(generate_silence(silence_ms, sample_rate))
            print(f"  [silence: {silence_ms}ms]")
    # Trail silence
    if trail_silence_ms > 0:
        segments.append(generate_silence(trail_silence_ms, sample_rate))
        print(f"  [silence: {trail_silence_ms}ms]")
    # Concatenate all segments
    audio_data = b''.join(segments)
    # Write WAV file. os.fspath turns a path-like object into a plain
    # string, which is the form wave.open opens as a file name.
    with wave.open(os.fspath(output_path), 'wb') as wf:
        wf.setnchannels(1)          # Mono
        wf.setsampwidth(2)          # 16-bit
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)
    duration_sec = len(audio_data) / (sample_rate * 2)
    print()
    print(f"Generated: {output_path}")
    print(f"  Duration: {duration_sec:.2f}s")
    print(f"  Sample rate: {sample_rate}Hz")
    print("  Format: 16-bit mono PCM WAV")
    print(f"  Size: {len(audio_data):,} bytes")
 def load_utterances_from_json(path: Path) -> list[str]:
    """
    Load utterances from a JSON file.

    Accepts either:
    - A JSON array: ["utterance 1", "utterance 2"]
    - A JSON object with "utterances" key: {"utterances": ["a", "b"]}

    Every array element is converted to a string with str().

    Raises:
        ValueError: If the document is neither an array nor an object with
            an "utterances" key, or if "utterances" is not an array.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict) and "utterances" in data:
        items = data["utterances"]
        if not isinstance(items, list):
            # Without this check a string value would be split into single
            # characters and an object into its keys.
            raise ValueError(
                f"'utterances' must be an array of strings. "
                f"Got: {type(items).__name__}"
            )
    elif isinstance(data, list):
        items = data
    else:
        raise ValueError(
            f"JSON file must be an array of strings or an object with 'utterances' key. "
            f"Got: {type(data).__name__}"
        )
    return [str(s) for s in items]
 def parse_args():
    """Build the argument parser and parse the process command line.

    Returns the argparse namespace with output, utterances, json,
    silence_ms, lead_silence_ms and trail_silence_ms.
    """
    default_output = (
        Path(__file__).parent.parent / "data" / "audio_examples" / "two_utterances_16k.wav"
    )
    parser = argparse.ArgumentParser(description="Generate test audio with SiliconFlow TTS (utterances + silence).")
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=default_output,
        help=f"Output WAV file path (default: {default_output})",
    )
    parser.add_argument(
        "-u", "--utterance",
        action="append",
        dest="utterances",
        metavar="TEXT",
        help="Utterance text (repeat for multiple). Ignored if --json is set.",
    )
    parser.add_argument(
        "-j", "--json",
        type=Path,
        metavar="PATH",
        help="JSON file with utterances: array of strings or object with 'utterances' key",
    )
    # The three silence options differ only in flag, default and help text
    silence_options = (
        ("--silence-ms", 500, "Silence in ms between utterances (default: 500)"),
        ("--lead-silence-ms", 200, "Silence in ms at start of file (default: 200)"),
        ("--trail-silence-ms", 300, "Silence in ms at end of file (default: 300)"),
    )
    for flag, default_ms, help_text in silence_options:
        parser.add_argument(flag, type=int, default=default_ms, metavar="MS", help=help_text)
    return parser.parse_args()
 async def main():
    """Resolve the utterance list from the CLI options and generate the WAV file."""
    args = parse_args()
    destination = args.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Precedence: JSON file, then -u arguments, then the built-in defaults
    json_path = args.json
    if json_path is None:
        utterances = args.utterances or [
            "Hello, how are you doing today?",
            "I'm doing great, thank you for asking!"
        ]
    else:
        if not json_path.is_file():
            raise FileNotFoundError(f"Utterances JSON file not found: {json_path}")
        utterances = load_utterances_from_json(json_path)
        if not utterances:
            raise ValueError(f"JSON file has no utterances: {json_path}")
    await generate_test_audio(
        output_path=str(destination),
        utterances=utterances,
        silence_ms=args.silence_ms,
        lead_silence_ms=args.lead_silence_ms,
        trail_silence_ms=args.trail_silence_ms,
        voice="anna",
        sample_rate=16000,
        speed=1.0,
    )
 # Script entry point
 if __name__ == "__main__":
    asyncio.run(main())