Add backend api and engine

2026-02-06 14:01:34 +08:00
parent 590014e821
commit d5c1ab34b3
61 changed files with 10351 additions and 1 deletions
--- a/engine/examples/wav_client.py
+++ b/engine/examples/wav_client.py
@@ -0,0 +1,504 @@
+#!/usr/bin/env python3
+"""
+WAV file client for testing duplex voice conversation.
+
+This client reads audio from a WAV file, sends it to the server,
+and saves the AI's voice response to an output WAV file.
+
+Usage:
+    python examples/wav_client.py --input input.wav --output response.wav
+    python examples/wav_client.py --input input.wav --output response.wav --url ws://localhost:8000/ws
+    python examples/wav_client.py --input input.wav --output response.wav --wait-time 10
+    python wav_client.py --input ../data/audio_examples/two_utterances.wav -o response.wav
+Requirements:
+    pip install soundfile websockets numpy
+"""
+
+import argparse
+import asyncio
+import json
+import sys
+import time
+import wave
+from pathlib import Path
+
+try:
+    import numpy as np
+except ImportError:
+    print("Please install numpy: pip install numpy")
+    sys.exit(1)
+
+try:
+    import soundfile as sf
+except ImportError:
+    print("Please install soundfile: pip install soundfile")
+    sys.exit(1)
+
+try:
+    import websockets
+except ImportError:
+    print("Please install websockets: pip install websockets")
+    sys.exit(1)
+
+
+class WavFileClient:
+    """
+    WAV file client for voice conversation testing.
+    
+    Features:
+    - Read audio from WAV file
+    - Send audio to WebSocket server
+    - Receive and save response audio
+    - Event logging
+    """
+    
+    def __init__(
+        self,
+        url: str,
+        input_file: str,
+        output_file: str,
+        sample_rate: int = 16000,
+        chunk_duration_ms: int = 20,
+        wait_time: float = 15.0,
+        verbose: bool = False
+    ):
+        """
+        Initialize WAV file client.
+
+        Args:
+            url: WebSocket server URL
+            input_file: Input WAV file path
+            output_file: Output WAV file path
+            sample_rate: Audio sample rate (Hz); must be positive
+            chunk_duration_ms: Audio chunk duration (ms) for sending; must be positive
+            wait_time: Time to wait for response after sending (seconds)
+            verbose: Enable verbose output
+
+        Raises:
+            ValueError: If sample_rate or chunk_duration_ms is not positive.
+        """
+        if sample_rate <= 0:
+            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
+        if chunk_duration_ms <= 0:
+            raise ValueError(f"chunk_duration_ms must be positive, got {chunk_duration_ms}")
+
+        self.url = url
+        self.input_file = Path(input_file)
+        self.output_file = Path(output_file)
+        self.sample_rate = sample_rate
+        self.chunk_duration_ms = chunk_duration_ms
+        # Always at least one sample per chunk: a zero-sized chunk would keep
+        # the chunked send loop from ever advancing through the audio.
+        self.chunk_samples = max(1, int(sample_rate * chunk_duration_ms / 1000))
+        self.wait_time = wait_time
+        self.verbose = verbose
+
+        # WebSocket connection
+        self.ws = None
+        self.running = False
+
+        # Audio buffers
+        self.received_audio = bytearray()
+
+        # Statistics
+        self.bytes_sent = 0
+        self.bytes_received = 0
+
+        # TTFB tracking (per response)
+        self.send_start_time = None
+        self.response_start_time = None  # set on each trackStart
+        self.waiting_for_first_audio = False
+        self.ttfb_ms = None  # last TTFB for summary
+        self.ttfb_list = []  # TTFB for each response
+
+        # State tracking
+        self.track_started = False
+        self.track_ended = False
+        self.send_completed = False
+
+        # Events log
+        self.events_log = []
+    
+    def log_event(self, direction: str, message: str):
+        """
+        Log an event with timestamp.
+
+        The event is appended to ``self.events_log`` and echoed to stdout as
+        ``"<direction> <message>"``.
+
+        Args:
+            direction: Short marker printed before the message (e.g. an arrow)
+            message: Human-readable event text
+        """
+        self.events_log.append({
+            "timestamp": time.time(),
+            "direction": direction,
+            "message": message
+        })
+        line = f"{direction} {message}"
+        try:
+            print(line)
+        except UnicodeEncodeError:
+            # Some consoles (e.g. legacy Windows code pages) cannot encode every
+            # character. Re-encode the WHOLE line, including the direction
+            # marker, with the console's own encoding so the fallback print
+            # cannot fail the same way again.
+            encoding = getattr(sys.stdout, "encoding", None) or "ascii"
+            print(line.encode(encoding, errors="replace").decode(encoding))
+    
+    async def connect(self) -> None:
+        """Open the WebSocket connection, then send the initial invite command."""
+        self.log_event("→", f"Connecting to {self.url}...")
+        self.ws = await websockets.connect(self.url)
+        self.running = True
+        self.log_event("←", "Connected!")
+
+        # The invite carries the codec and sample rate this client will stream
+        audio_options = {"codec": "pcm", "sampleRate": self.sample_rate}
+        invite = {"command": "invite", "option": audio_options}
+        await self.send_command(invite)
+    
+    async def send_command(self, cmd: dict) -> None:
+        """Serialize *cmd* as JSON and send it; a no-op when not connected."""
+        if not self.ws:
+            return
+        payload = json.dumps(cmd)
+        await self.ws.send(payload)
+        command_name = cmd.get('command', 'unknown')
+        self.log_event("→", f"Command: {command_name}")
+    
+    async def send_hangup(self, reason: str = "Session complete") -> None:
+        """Send a hangup command carrying *reason* to the server."""
+        hangup = {"command": "hangup", "reason": reason}
+        await self.send_command(hangup)
+    
+    def load_wav_file(self) -> tuple[np.ndarray, int]:
+        """
+        Load and prepare WAV file for sending.
+
+        Multi-channel audio is averaged down to one channel, the signal is
+        resampled to ``self.sample_rate`` by linear interpolation when the
+        file's rate differs, and the result is converted to int16.
+
+        Returns:
+            Tuple of (audio_data as int16 numpy array, original sample rate)
+
+        Raises:
+            FileNotFoundError: If the input file does not exist.
+            ValueError: If the input file contains no audio samples.
+        """
+        if not self.input_file.exists():
+            raise FileNotFoundError(f"Input file not found: {self.input_file}")
+
+        # Load audio file
+        audio_data, file_sample_rate = sf.read(self.input_file)
+
+        # Fail early with a clear message: the interpolation and peak
+        # computations below raise an obscure ValueError on a zero-length array.
+        if len(audio_data) == 0:
+            raise ValueError(f"Input file contains no audio samples: {self.input_file}")
+
+        self.log_event("→", f"Loaded: {self.input_file}")
+        self.log_event("→", f"  Original sample rate: {file_sample_rate} Hz")
+        self.log_event("→", f"  Duration: {len(audio_data) / file_sample_rate:.2f}s")
+
+        # Convert stereo to mono if needed (channels are on axis 1)
+        if audio_data.ndim > 1:
+            audio_data = audio_data.mean(axis=1)
+            self.log_event("→", "  Converted stereo to mono")
+
+        # Resample if needed
+        if file_sample_rate != self.sample_rate:
+            # Simple linear-interpolation resampling using numpy
+            duration = len(audio_data) / file_sample_rate
+            num_samples = int(duration * self.sample_rate)
+            indices = np.linspace(0, len(audio_data) - 1, num_samples)
+            audio_data = np.interp(indices, np.arange(len(audio_data)), audio_data)
+            self.log_event("→", f"  Resampled to {self.sample_rate} Hz")
+
+        # Convert to int16
+        if audio_data.dtype != np.int16:
+            # Rescale only when the peak exceeds 1.0 so the int16 cast below
+            # cannot overflow; quieter signals keep their original level.
+            max_val = np.max(np.abs(audio_data))
+            if max_val > 1.0:
+                audio_data = audio_data / max_val
+            audio_data = (audio_data * 32767).astype(np.int16)
+
+        self.log_event("→", f"  Prepared: {len(audio_data)} samples ({len(audio_data)/self.sample_rate:.2f}s)")
+
+        return audio_data, file_sample_rate
+    
+    async def audio_sender(self, audio_data: np.ndarray) -> None:
+        """
+        Send audio data to server in chunks.
+
+        Chunks of ``self.chunk_samples`` samples are sent as raw bytes, with a
+        sleep of ``self.chunk_duration_ms`` after each chunk. Sending stops
+        early if ``self.running`` is cleared. Sets ``self.send_completed``
+        when the loop ends.
+
+        Args:
+            audio_data: Samples to send; each chunk is serialized with ``tobytes()``
+        """
+        total_samples = len(audio_data)
+        # A chunk size of zero would never advance sent_samples (infinite loop)
+        chunk_size = max(1, self.chunk_samples)
+        # Progress is reported every 500ms worth of audio; keep the divisor
+        # non-zero even for very low sample rates.
+        progress_interval = max(1, self.sample_rate // 2)
+        sent_samples = 0
+
+        self.send_start_time = time.time()
+        self.log_event("→", f"Starting audio transmission ({total_samples} samples)...")
+
+        while sent_samples < total_samples and self.running:
+            # Get next chunk
+            end_sample = min(sent_samples + chunk_size, total_samples)
+            chunk_bytes = audio_data[sent_samples:end_sample].tobytes()
+
+            # Send to server
+            if self.ws:
+                await self.ws.send(chunk_bytes)
+                self.bytes_sent += len(chunk_bytes)
+
+            sent_samples = end_sample
+
+            # Progress logging (every 500ms worth of audio)
+            if self.verbose and sent_samples % progress_interval == 0:
+                progress = (sent_samples / total_samples) * 100
+                print(f"  Sending: {progress:.0f}%", end="\r")
+
+            # Delay to simulate real-time streaming
+            # Server expects audio at real-time pace for VAD/ASR to work properly
+            await asyncio.sleep(self.chunk_duration_ms / 1000)
+
+        self.send_completed = True
+        elapsed = time.time() - self.send_start_time
+        self.log_event("→", f"Audio transmission complete ({elapsed:.2f}s, {self.bytes_sent/1024:.1f} KB)")
+    
+    async def receiver(self) -> None:
+        """
+        Receive messages from server.
+
+        Binary messages are appended to ``self.received_audio``; the first one
+        after a ``trackStart`` event records a TTFB measurement. Text messages
+        are parsed as JSON and passed to ``_handle_event``. The loop runs until
+        ``self.running`` is cleared, the connection closes, or the task is
+        cancelled. A malformed or non-object JSON message is logged and
+        skipped so that one bad message does not end the session.
+        """
+        # Bytes per second of received audio, assuming 2 bytes per sample
+        bytes_per_second = self.sample_rate * 2
+        try:
+            while self.running:
+                try:
+                    # Short timeout so the loop re-checks self.running regularly
+                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
+                except asyncio.TimeoutError:
+                    continue
+                except websockets.ConnectionClosed:
+                    self.log_event("←", "Connection closed")
+                    self.running = False
+                    break
+
+                if isinstance(message, bytes):
+                    # Audio data received
+                    self.bytes_received += len(message)
+                    self.received_audio.extend(message)
+
+                    # Calculate TTFB on first audio of each response
+                    if self.waiting_for_first_audio and self.response_start_time is not None:
+                        ttfb_ms = (time.time() - self.response_start_time) * 1000
+                        self.ttfb_ms = ttfb_ms
+                        self.ttfb_list.append(ttfb_ms)
+                        self.waiting_for_first_audio = False
+                        self.log_event("←", f"[TTFB] First audio latency: {ttfb_ms:.0f}ms")
+
+                    # Log progress (durations are only needed for this display)
+                    if self.verbose:
+                        duration_ms = len(message) / bytes_per_second * 1000
+                        total_ms = len(self.received_audio) / bytes_per_second * 1000
+                        print(f"← Audio: +{duration_ms:.0f}ms (total: {total_ms:.0f}ms)", end="\r")
+                    continue
+
+                # JSON event
+                try:
+                    event = json.loads(message)
+                except json.JSONDecodeError as e:
+                    self.log_event("!", f"Ignoring malformed JSON message: {e}")
+                    continue
+                if not isinstance(event, dict):
+                    # _handle_event looks fields up with .get(), so it needs an object
+                    self.log_event("!", f"Ignoring non-object JSON message: {message!r}")
+                    continue
+                await self._handle_event(event)
+
+        except asyncio.CancelledError:
+            # Cancellation is the normal way run() stops this task
+            pass
+        except Exception as e:
+            self.log_event("!", f"Receiver error: {e}")
+            self.running = False
+    
+    async def _handle_event(self, event: dict) -> None:
+        """Process one decoded server event: update session state and log it."""
+        kind = event.get("event", "unknown")
+
+        # Events that only produce a fixed log line
+        fixed_lines = (
+            ("answer", "Session ready!"),
+            ("speaking", "Speech detected"),
+            ("silence", "Silence detected"),
+            ("interrupt", "Bot interrupted!"),
+        )
+        for name, line in fixed_lines:
+            if kind == name:
+                self.log_event("←", line)
+                return
+
+        if kind == "transcript":
+            # ASR transcript: interim results overwrite one console line,
+            # the final result is logged as a regular event
+            transcript = event.get("text", "")
+            if event.get("isFinal", False):
+                # Clear interim line and print final
+                print(" " * 80, end="\r")
+                self.log_event("←", f"→ You: {transcript}")
+                return
+            shown = transcript
+            if len(transcript) > 60:
+                shown = transcript[:60] + "..."
+            print(f"  [listening] {shown}".ljust(80), end="\r")
+            return
+
+        if kind == "ttfb":
+            server_latency = event.get("latencyMs", 0)
+            self.log_event("←", f"[TTFB] Server latency: {server_latency}ms")
+            return
+
+        if kind == "llmResponse":
+            reply = event.get("text", "")
+            if event.get("isFinal", False):
+                head = reply[:100]
+                tail = '...' if len(reply) > 100 else ''
+                self.log_event("←", f"LLM Response (final): {head}{tail}")
+            elif self.verbose:
+                # Streaming chunks are shown only in verbose mode
+                self.log_event("←", f"LLM: {reply}")
+            return
+
+        if kind == "trackStart":
+            # Start the clock for the time-to-first-audio measurement
+            self.track_started = True
+            self.response_start_time = time.time()
+            self.waiting_for_first_audio = True
+            self.log_event("←", "Bot started speaking")
+            return
+
+        if kind == "trackEnd":
+            self.track_ended = True
+            self.log_event("←", "Bot finished speaking")
+            return
+
+        if kind == "error":
+            self.log_event("!", f"Error: {event.get('error')}")
+            return
+
+        if kind == "hangup":
+            self.log_event("←", f"Hangup: {event.get('reason')}")
+            self.running = False
+            return
+
+        self.log_event("←", f"Event: {kind}")
+    
+    def save_output_wav(self) -> None:
+        """
+        Save received audio to output WAV file.
+
+        The buffered bytes are written as mono 16-bit audio at
+        ``self.sample_rate``. Nothing is written when no complete sample was
+        received. A trailing partial sample (odd byte count) is dropped so an
+        incomplete last message cannot make the save fail.
+        """
+        bytes_per_sample = 2  # 16-bit
+        usable = len(self.received_audio) - (len(self.received_audio) % bytes_per_sample)
+        if usable == 0:
+            self.log_event("!", "No audio received to save")
+            return
+        if usable != len(self.received_audio):
+            self.log_event("!", "Dropping trailing partial sample from received audio")
+
+        pcm = bytes(self.received_audio[:usable])
+        num_samples = usable // bytes_per_sample
+
+        # Ensure output directory exists
+        self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save using wave module for compatibility
+        with wave.open(str(self.output_file), 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(bytes_per_sample)
+            wav_file.setframerate(self.sample_rate)
+            wav_file.writeframes(pcm)
+
+        duration = num_samples / self.sample_rate
+        self.log_event("→", f"Saved output: {self.output_file}")
+        self.log_event("→", f"  Duration: {duration:.2f}s ({num_samples} samples)")
+        self.log_event("→", f"  Size: {len(self.received_audio)/1024:.1f} KB")
+    
+    async def run(self) -> None:
+        """Run one complete session: load input, stream it, collect and save the reply."""
+        try:
+            samples, _ = self.load_wav_file()
+
+            await self.connect()
+
+            # Pause before streaming ("wait for answer")
+            await asyncio.sleep(0.5)
+
+            # Collect server messages in the background while audio is sent
+            listener = asyncio.create_task(self.receiver())
+
+            await self.audio_sender(samples)
+
+            self.log_event("→", f"Waiting {self.wait_time}s for response...")
+
+            waiting_since = time.time()
+            while self.running and (time.time() - waiting_since) < self.wait_time:
+                if not (self.track_ended and self.send_completed):
+                    await asyncio.sleep(0.1)
+                    continue
+                # Response complete: linger briefly for any remaining audio
+                await asyncio.sleep(1.0)
+                break
+
+            # Stop the background receiver before touching its buffers
+            self.running = False
+            listener.cancel()
+            try:
+                await listener
+            except asyncio.CancelledError:
+                pass
+
+            self.save_output_wav()
+            self._print_summary()
+
+        except FileNotFoundError as missing:
+            print(f"Error: {missing}")
+            sys.exit(1)
+        except ConnectionRefusedError:
+            print(f"Error: Could not connect to {self.url}")
+            print("Make sure the server is running.")
+            sys.exit(1)
+        except Exception as failure:
+            print(f"Error: {failure}")
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+        finally:
+            # Always release the connection, including on the error exits above
+            await self.close()
+    
+    def _print_summary(self):
+        """Print a short report of the session: files, byte counts, TTFB and duration."""
+        rule = "=" * 50
+        print("\n" + rule)
+        print("Session Summary")
+        print(rule)
+        print(f"  Input file:  {self.input_file}")
+        print(f"  Output file: {self.output_file}")
+        print(f"  Bytes sent:     {self.bytes_sent / 1024:.1f} KB")
+        print(f"  Bytes received: {self.bytes_received / 1024:.1f} KB")
+
+        latencies = self.ttfb_list
+        if len(latencies) == 1:
+            print(f"  TTFB:           {latencies[0]:.0f} ms")
+        elif latencies:
+            per_response = ', '.join(f'{t:.0f}ms' for t in latencies)
+            print(f"  TTFB (per response): {per_response}")
+
+        if self.received_audio:
+            # Two bytes per received sample
+            seconds = len(self.received_audio) / (self.sample_rate * 2)
+            print(f"  Response duration: {seconds:.2f}s")
+        print(rule)
+    
+    async def close(self) -> None:
+        """
+        Close the connection.
+
+        Clears ``self.running`` and closes the WebSocket if one is open.
+        Errors raised while closing are ignored (best-effort shutdown), but
+        task cancellation and interpreter-exit signals are allowed through.
+        """
+        self.running = False
+        if self.ws:
+            try:
+                await self.ws.close()
+            except Exception:
+                # Best-effort: a failed close must not mask the session result.
+                # Not a bare ``except``, so CancelledError / KeyboardInterrupt /
+                # SystemExit still propagate.
+                pass
+
+
+async def main():
+    """Parse the command-line options, build a WavFileClient from them and run it."""
+    # (flags, keyword settings) for each option, in the order shown by --help
+    option_specs = [
+        (("--input", "-i"), {
+            "required": True,
+            "help": "Input WAV file path",
+        }),
+        (("--output", "-o"), {
+            "required": True,
+            "help": "Output WAV file path for response",
+        }),
+        (("--url",), {
+            "default": "ws://localhost:8000/ws",
+            "help": "WebSocket server URL (default: ws://localhost:8000/ws)",
+        }),
+        (("--sample-rate",), {
+            "type": int,
+            "default": 16000,
+            "help": "Target sample rate for audio (default: 16000)",
+        }),
+        (("--chunk-duration",), {
+            "type": int,
+            "default": 20,
+            "help": "Chunk duration in ms for sending (default: 20)",
+        }),
+        (("--wait-time", "-w"), {
+            "type": float,
+            "default": 15.0,
+            "help": "Time to wait for response after sending (default: 15.0)",
+        }),
+        (("--verbose", "-v"), {
+            "action": "store_true",
+            "help": "Enable verbose output",
+        }),
+    ]
+
+    parser = argparse.ArgumentParser(
+        description="WAV file client for testing duplex voice conversation"
+    )
+    for flags, settings in option_specs:
+        parser.add_argument(*flags, **settings)
+
+    opts = parser.parse_args()
+
+    client = WavFileClient(
+        url=opts.url,
+        input_file=opts.input,
+        output_file=opts.output,
+        sample_rate=opts.sample_rate,
+        chunk_duration_ms=opts.chunk_duration,
+        wait_time=opts.wait_time,
+        verbose=opts.verbose
+    )
+
+    await client.run()
+
+
+if __name__ == "__main__":
+    # Script entry point: run the async CLI on a fresh event loop.
+    # Ctrl+C is reported with a short message.
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\nInterrupted by user")