I can use text to get audio response and barge in

2026-01-29 16:25:53 +08:00
parent cd90b4fb37
commit ac0c76e6e8
16 changed files with 3394 additions and 119 deletions
--- a/examples/mic_client.py
+++ b/examples/mic_client.py
@@ -1,137 +1,517 @@
+#!/usr/bin/env python3
 """
-Microphone WebSocket Client
+Microphone client for testing duplex voice conversation.

-Connects to the backend WebSocket endpoint and streams audio from the microphone.
-Used to test VAD and EOU detection.
+This client captures audio from the microphone, sends it to the server,
+and plays back the AI's voice response through the speakers.

-Dependencies:
-    pip install pyaudio aiohttp
+Usage:
+    python examples/mic_client.py --url ws://localhost:8000/ws
+    python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
+
+Requirements:
+    pip install sounddevice soundfile websockets numpy
 """

+import argparse
 import asyncio
-import aiohttp
-import pyaudio
 import json
 import sys
-from datetime import datetime
+import threading
+import queue
+from pathlib import Path

-# Configuration
-SERVER_URL = "ws://localhost:8000/ws"
-SAMPLE_RATE = 16000
-CHANNELS = 1
-CHUNK_DURATION_MS = 20
-CHUNK_SIZE = int(SAMPLE_RATE * (CHUNK_DURATION_MS / 1000.0))  # 320 samples for 20ms
-FORMAT = pyaudio.paInt16
+try:
+    import numpy as np
+except ImportError:
+    print("Please install numpy: pip install numpy")
+    sys.exit(1)

-async def send_audio_loop(ws, stream):
-    """Read from microphone and send to WebSocket."""
-    print("🎙️  Microphone streaming started...")
-    try:
-        while True:
-            # Read non-blocking? PyAudio read is blocking, so run in executor or use specialized async lib.
-            # For simplicity in this script, we'll just read. It might block the event loop slightly 
-            # but for 20ms chunks it's usually acceptable for a test script.
-            # To be proper async, we should run_in_executor.
-            data = await asyncio.get_event_loop().run_in_executor(
-                None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False)
+try:
+    import sounddevice as sd
+except ImportError:
+    print("Please install sounddevice: pip install sounddevice")
+    sys.exit(1)
+
+try:
+    import websockets
+except ImportError:
+    print("Please install websockets: pip install websockets")
+    sys.exit(1)
+
+
+class MicrophoneClient:
+    """
+    Full-duplex microphone client for voice conversation.
+    
+    Features:
+    - Real-time microphone capture
+    - Real-time speaker playback
+    - WebSocket communication
+    - Text chat support
+    """
+    
+    def __init__(
+        self,
+        url: str,
+        sample_rate: int = 16000,
+        chunk_duration_ms: int = 20,
+        input_device: int = None,
+        output_device: int = None
+    ):
+        """
+        Store the configuration and create the client's initial state.
+
+        Nothing is opened here: the WebSocket stays None until connect()
+        assigns it.
+
+        Args:
+            url: WebSocket server URL
+            sample_rate: Audio sample rate (Hz)
+            chunk_duration_ms: Duration of one audio chunk (ms)
+            input_device: Input device ID (None for default)
+            output_device: Output device ID (None for default)
+        """
+        # Connection
+        self.url = url
+        self.ws = None
+        self.running = False
+
+        # Audio format: one chunk holds sample_rate * chunk_duration_ms / 1000
+        # samples, truncated to a whole number
+        self.sample_rate = sample_rate
+        self.chunk_duration_ms = chunk_duration_ms
+        samples_per_chunk = sample_rate * chunk_duration_ms / 1000
+        self.chunk_samples = int(samples_per_chunk)
+
+        # Devices
+        self.input_device = input_device
+        self.output_device = output_device
+
+        # Captured chunks wait in a queue; received audio is kept in a single
+        # bytes buffer that is only touched while holding the lock
+        self.audio_input_queue = queue.Queue()
+        self.audio_output_lock = threading.Lock()
+        self.audio_output_buffer = b""
+
+        # Mute / playback switches
+        self.is_recording = True
+        self.is_playing = True
+
+        # Byte counters
+        self.bytes_sent = 0
+        self.bytes_received = 0
+    
+    async def connect(self) -> None:
+        """Open the WebSocket, mark the client as running, and send the invite."""
+        print(f"Connecting to {self.url}...")
+        connection = await websockets.connect(self.url)
+        self.ws = connection
+        self.running = True
+        print("Connected!")
+
+        # The invite carries the codec name and the configured sample rate
+        audio_options = {"codec": "pcm", "sampleRate": self.sample_rate}
+        invite = {"command": "invite", "option": audio_options}
+        await self.send_command(invite)
+    
+    async def send_command(self, cmd: dict) -> None:
+        """Serialize cmd to JSON and send it; do nothing when not connected."""
+        if not self.ws:
+            return
+
+        payload = json.dumps(cmd)
+        await self.ws.send(payload)
+        command_name = cmd.get('command', 'unknown')
+        print(f"→ Command: {command_name}")
+    
+    async def send_chat(self, text: str) -> None:
+        """Send text to the server as a chat command and echo it locally."""
+        message = {"command": "chat", "text": text}
+        await self.send_command(message)
+        print(f"→ Chat: {text}")
+    
+    async def send_interrupt(self) -> None:
+        """Send the interrupt command to the server."""
+        message = {"command": "interrupt"}
+        await self.send_command(message)
+    
+    async def send_hangup(self, reason: str = "User quit") -> None:
+        """Send the hangup command, including the given reason."""
+        message = {"command": "hangup", "reason": reason}
+        await self.send_command(message)
+    
+    def _audio_input_callback(self, indata, frames, time, status):
+        """
+        Callback for audio input (microphone).
+
+        Converts the first channel of the captured block to 16-bit PCM bytes
+        and puts it on the input queue. Blocks are dropped while the
+        microphone is muted or the client is not running.
+
+        Args:
+            indata: Captured samples; the first column is used as the mono signal
+            frames: Number of frames in indata (unused)
+            time: Timing information supplied by the audio stream (unused)
+            status: Stream status flags; printed when set
+        """
+        if status:
+            print(f"Input status: {status}")
+
+        if not (self.is_recording and self.running):
+            return
+
+        # Clamp before scaling: a float sample outside [-1.0, 1.0] would exceed
+        # the int16 range and wrap around instead of saturating
+        mono = np.clip(indata[:, 0], -1.0, 1.0)
+        audio_data = (mono * 32767).astype(np.int16).tobytes()
+        self.audio_input_queue.put(audio_data)
+    
+    def _add_audio_to_buffer(self, audio_data: bytes):
+        """Append audio_data to the end of the playback buffer under the lock."""
+        with self.audio_output_lock:
+            pending = self.audio_output_buffer
+            self.audio_output_buffer = pending + audio_data
+    
+    async def _playback_task(self):
+        """
+        Background task that plays buffered audio through one output stream.
+
+        The stream's callback pulls 16-bit mono PCM from the playback buffer
+        and pads with silence whenever the buffer cannot fill a whole block.
+        The task keeps the stream open while self.running is True and always
+        stops and closes it on the way out, including when it is cancelled.
+        """
+        chunk_samples = int(self.sample_rate * 0.05)  # 50ms blocks
+
+        def output_callback(outdata, frames, time_info, status):
+            """Fill outdata with buffered audio, then silence."""
+            if status:
+                print(f"Output status: {status}")
+
+            bytes_needed = frames * 2  # 16-bit = 2 bytes per sample
+            with self.audio_output_lock:
+                available = min(len(self.audio_output_buffer), bytes_needed)
+                # Take whole samples only; an odd trailing byte stays buffered
+                # until the rest of that sample arrives
+                available -= available % 2
+                audio_data = self.audio_output_buffer[:available]
+                self.audio_output_buffer = self.audio_output_buffer[available:]
+
+            # Play whatever is available instead of waiting for a full block,
+            # so the last few milliseconds of an utterance are not left behind
+            samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
+            outdata[:len(samples), 0] = samples
+            outdata[len(samples):].fill(0)
+
+        output_stream = None
+        try:
+            output_stream = sd.OutputStream(
+                samplerate=self.sample_rate,
+                channels=1,
+                dtype=np.float32,
+                blocksize=chunk_samples,
+                device=self.output_device,
+                callback=output_callback,
+                latency='low'
+            )
+            output_stream.start()
+            # Compare against None explicitly: device ID 0 is a valid choice
+            device_label = 'default' if self.output_device is None else self.output_device
+            print(f"Audio output stream started (device: {device_label})")
+
+            # Keep stream running while client is active
+            while self.running:
+                await asyncio.sleep(0.1)
+
+        except Exception as e:
+            print(f"Playback error: {e}")
+            import traceback
+            traceback.print_exc()
+        finally:
+            # Cancellation raises CancelledError, which the handler above does
+            # not catch, so the stream is released here instead
+            if output_stream is not None:
+                output_stream.stop()
+                output_stream.close()
+    
+    async def audio_sender(self) -> None:
+        """
+        Forward queued microphone chunks to the server while running.
+
+        The blocking queue read runs in the default executor with a 0.1s
+        timeout, so the loop re-checks self.running regularly. Chunks are
+        sent only when a connection exists and recording is enabled.
+        """
+        def next_chunk():
+            return self.audio_input_queue.get(timeout=0.1)
+
+        while self.running:
+            try:
+                loop = asyncio.get_event_loop()
+                audio_data = await loop.run_in_executor(None, next_chunk)
+
+                if self.ws and self.is_recording:
+                    await self.ws.send(audio_data)
+                    self.bytes_sent += len(audio_data)
+
+            except queue.Empty:
+                # Nothing captured within the timeout; poll again
+                continue
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                print(f"Audio sender error: {e}")
+                break
+    
+    async def receiver(self) -> None:
+        """
+        Receive messages from the server until the client stops.
+
+        Binary frames are counted and, while playback is enabled, appended to
+        the playback buffer. Text frames are parsed as JSON and passed to
+        _handle_event(); a text frame that is not a JSON object is printed
+        and skipped rather than ending the session.
+        """
+        try:
+            while self.running:
+                try:
+                    # Short timeout so the loop keeps re-checking self.running
+                    message = await asyncio.wait_for(self.ws.recv(), timeout=0.1)
+                except asyncio.TimeoutError:
+                    continue
+                except websockets.ConnectionClosed:
+                    print("Connection closed")
+                    self.running = False
+                    break
+
+                if isinstance(message, bytes):
+                    # Audio data received
+                    self.bytes_received += len(message)
+
+                    if self.is_playing:
+                        self._add_audio_to_buffer(message)
+
+                    # Show progress; durations assume 16-bit mono (2 bytes per sample)
+                    with self.audio_output_lock:
+                        buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
+                    duration_ms = len(message) / (self.sample_rate * 2) * 1000
+                    print(f"← Audio: {duration_ms:.0f}ms (buffer: {buffer_ms:.0f}ms)")
+                    continue
+
+                # Text frame: expected to be a JSON event object
+                try:
+                    event = json.loads(message)
+                except json.JSONDecodeError:
+                    event = None
+
+                if isinstance(event, dict):
+                    await self._handle_event(event)
+                else:
+                    # One malformed frame should not tear down the session
+                    print(f"← Text: {message}")
+
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            print(f"Receiver error: {e}")
+            self.running = False
+    
+    async def _handle_event(self, event: dict) -> None:
+        """
+        React to one JSON event from the server.
+
+        Every event is logged. In addition, "trackStart" and "interrupt"
+        discard any audio still waiting in the playback buffer, and "hangup"
+        stops the client.
+
+        Args:
+            event: Decoded event; its "event" key selects the handling
+        """
+        event_type = event.get("event", "unknown")
+
+        if event_type == "answer":
+            print("← Session ready!")
+        elif event_type == "speaking":
+            print("← User speech detected")
+        elif event_type == "silence":
+            print("← User silence detected")
+        elif event_type == "trackStart":
+            print("← Bot started speaking")
+            # Clear any old audio in buffer
+            with self.audio_output_lock:
+                self.audio_output_buffer = b""
+        elif event_type == "trackEnd":
+            print("← Bot finished speaking")
+        elif event_type == "interrupt":
+            print("← Bot interrupted!")
+            # Barge-in: drop the audio already buffered locally, otherwise the
+            # bot keeps talking until the buffer drains
+            with self.audio_output_lock:
+                self.audio_output_buffer = b""
+        elif event_type == "error":
+            print(f"← Error: {event.get('error')}")
+        elif event_type == "hangup":
+            print(f"← Hangup: {event.get('reason')}")
+            self.running = False
+        else:
+            print(f"← Event: {event_type}")
+    
+    async def interactive_mode(self) -> None:
+        """
+        Read lines from stdin and act on them until the client stops.
+
+        Lines starting with "/" are commands (see the banner printed below);
+        any other non-blank line is sent to the server as a chat message.
+        Surrounding whitespace is removed first, so blank lines are ignored
+        and a command typed with stray spaces is still recognized. The loop
+        ends on /quit, on end of input, or when self.running becomes False.
+        """
+        print("\n" + "=" * 50)
+        print("Voice Conversation Client")
+        print("=" * 50)
+        print("Speak into your microphone to talk to the AI.")
+        print("Or type messages to send text.")
+        print("")
+        print("Commands:")
+        print("  /quit      - End conversation")
+        print("  /mute      - Mute microphone")
+        print("  /unmute    - Unmute microphone")
+        print("  /interrupt - Interrupt AI speech")
+        print("  /stats     - Show statistics")
+        print("=" * 50 + "\n")
+
+        while self.running:
+            try:
+                # input() blocks, so it runs in the default executor to keep
+                # the event loop free for the audio tasks
+                user_input = await asyncio.get_event_loop().run_in_executor(
+                    None, input, ""
+                )
+
+                # Strip before classifying the line; checking for "/" on the
+                # raw text would treat " /quit" as a chat message
+                text = user_input.strip()
+                if not text:
+                    continue
+
+                if not text.startswith("/"):
+                    # Send as chat message
+                    await self.send_chat(text)
+                    continue
+
+                # Handle commands
+                cmd = text.lower()
+
+                if cmd == "/quit":
+                    await self.send_hangup("User quit")
+                    break
+                elif cmd == "/mute":
+                    self.is_recording = False
+                    print("Microphone muted")
+                elif cmd == "/unmute":
+                    self.is_recording = True
+                    print("Microphone unmuted")
+                elif cmd == "/interrupt":
+                    await self.send_interrupt()
+                elif cmd == "/stats":
+                    print(f"Sent: {self.bytes_sent / 1024:.1f} KB")
+                    print(f"Received: {self.bytes_received / 1024:.1f} KB")
+                else:
+                    print(f"Unknown command: {cmd}")
+
+            except EOFError:
+                break
+            except Exception as e:
+                print(f"Input error: {e}")
+    
+    async def run(self, chat_message: str = None, interactive: bool = True) -> None:
+        """
+        Run the client.
+        
+        Connects to the server, starts the microphone input stream and the
+        sender, receiver and playback tasks, then stays active in one of
+        three modes before shutting everything down:
+        
+        - chat_message given: send it once and wait a fixed 15 seconds
+        - interactive: read typed commands and chat text from stdin
+        - otherwise: idle until self.running becomes False
+        
+        Connection and other errors are printed rather than raised, and
+        close() is always called at the end.
+        
+        Args:
+            chat_message: Optional single chat message to send
+            interactive: Whether to run in interactive mode
+        """
+        try:
+            await self.connect()
+            
+            # Wait for answer
+            # (a fixed delay; the "answer" event itself is not checked here)
+            await asyncio.sleep(0.5)
+            
+            # Start audio input stream
+            print("Starting audio streams...")
+            
+            # Captured blocks are delivered to _audio_input_callback
+            input_stream = sd.InputStream(
+                samplerate=self.sample_rate,
+                channels=1,
+                dtype=np.float32,
+                blocksize=self.chunk_samples,
+                device=self.input_device,
+                callback=self._audio_input_callback
             )
             
-            await ws.send_bytes(data)
-            # No sleep needed here as microphone dictates the timing
+            input_stream.start()
+            print("Audio streams started")
+            
+            # Start background tasks
+            sender_task = asyncio.create_task(self.audio_sender())
+            receiver_task = asyncio.create_task(self.receiver())
+            playback_task = asyncio.create_task(self._playback_task())
+            
+            if chat_message:
+                # Send single message and wait
+                # (the 15 second wait is fixed; it does not track the reply)
+                await self.send_chat(chat_message)
+                await asyncio.sleep(15)
+            elif interactive:
+                # Run interactive mode
+                await self.interactive_mode()
+            else:
+                # Just wait
+                while self.running:
+                    await asyncio.sleep(0.1)
+            
+            # Cleanup
+            # Clearing self.running lets the task loops finish; cancel() then
+            # interrupts whatever each task is currently awaiting
+            self.running = False
+            sender_task.cancel()
+            receiver_task.cancel()
+            playback_task.cancel()
             
-    except Exception as e:
-        print(f"❌ Error in send loop: {e}")
-
-async def receive_loop(ws):
-    """Listen for VAD/EOU events."""
-    print("👂 Listening for server events...")
-    async for msg in ws:
-        timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
-        
-        if msg.type == aiohttp.WSMsgType.TEXT:
             try:
-                data = json.loads(msg.data)
-                event = data.get('event')
-                
-                # Highlight VAD/EOU events
-                if event == 'speaking':
-                    print(f"[{timestamp}] 🗣️  SPEAKING STARTED")
-                elif event == 'silence':
-                    print(f"[{timestamp}] 🤫 SILENCE DETECTED")
-                elif event == 'eou':
-                    print(f"[{timestamp}] ✅ END OF UTTERANCE (EOU)")
-                elif event == 'error':
-                    print(f"[{timestamp}] ❌ ERROR: {data.get('error')}")
-                else:
-                    print(f"[{timestamp}] 📩 {event}: {str(data)[:100]}")
-                    
-            except json.JSONDecodeError:
-                print(f"[{timestamp}] 📄 Text: {msg.data}")
-                
-        elif msg.type == aiohttp.WSMsgType.CLOSED:
-            print("❌ Connection closed")
-            break
-        elif msg.type == aiohttp.WSMsgType.ERROR:
-            print("❌ Connection error")
-            break
+                await sender_task
+            except asyncio.CancelledError:
+                pass
+            
+            try:
+                await receiver_task
+            except asyncio.CancelledError:
+                pass
+            
+            try:
+                await playback_task
+            except asyncio.CancelledError:
+                pass
+            
+            # NOTE(review): only reached on the normal path; if an exception is
+            # raised after input_stream.start(), the stream is not stopped
+            # here, and close() is never called on it - confirm this is intended
+            input_stream.stop()
+            
+        except ConnectionRefusedError:
+            print(f"Error: Could not connect to {self.url}")
+            print("Make sure the server is running.")
+        except Exception as e:
+            print(f"Error: {e}")
+        finally:
+            # Runs on every path: closes the socket and prints the totals
+            await self.close()
+    
+    async def close(self) -> None:
+        """Stop the client, close the WebSocket if one exists, and print totals."""
+        self.running = False
+        connection = self.ws
+        if connection:
+            await connection.close()
+
+        sent_kb = self.bytes_sent / 1024
+        received_kb = self.bytes_received / 1024
+        print(f"\nSession ended")
+        print(f"  Total sent: {sent_kb:.1f} KB")
+        print(f"  Total received: {received_kb:.1f} KB")
+
+
+def list_devices():
+    """Print a table of audio devices with direction and default markers."""
+    rule = "-" * 60
+    print("\nAvailable audio devices:")
+    print(rule)
+    for index, info in enumerate(sd.query_devices()):
+        # A device may expose input channels, output channels, or both
+        labels = []
+        if info['max_input_channels'] > 0:
+            labels.append("IN")
+        if info['max_output_channels'] > 0:
+            labels.append("OUT")
+        direction_str = "/".join(labels) or "N/A"
+
+        # sd.default.device holds the default (input, output) device pair
+        tags = (" [DEFAULT INPUT]", " [DEFAULT OUTPUT]")
+        default = "".join(
+            tag for slot, tag in enumerate(tags) if index == sd.default.device[slot]
+        )
+
+        name = info['name'][:40]
+        print(f"  {index:2d}: {name:40s} ({direction_str}){default}")
+    print(rule)
+

 async def main():
-    p = pyaudio.PyAudio()
+    """Parse the command-line options, then list devices or run the client."""
+    parser = argparse.ArgumentParser(
+        description="Microphone client for duplex voice conversation"
+    )
+    parser.add_argument(
+        "--url",
+        default="ws://localhost:8000/ws",
+        help="WebSocket server URL"
+    )
+    parser.add_argument(
+        "--chat",
+        help="Send a single chat message instead of using microphone"
+    )
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="Audio sample rate (default: 16000)"
+    )
+    parser.add_argument(
+        "--input-device",
+        type=int,
+        help="Input device ID"
+    )
+    parser.add_argument(
+        "--output-device",
+        type=int,
+        help="Output device ID"
+    )
+    parser.add_argument(
+        "--list-devices",
+        action="store_true",
+        help="List available audio devices and exit"
+    )
+    parser.add_argument(
+        "--no-interactive",
+        action="store_true",
+        help="Disable interactive mode"
+    )
     
-    # Check for input devices
-    info = p.get_host_api_info_by_index(0)
-    numdevices = info.get('deviceCount')
-    if numdevices == 0:
-        print("❌ No audio input devices found")
-        return
-
-    # Open microphone stream
-    try:
-        stream = p.open(format=FORMAT,
-                        channels=CHANNELS,
-                        rate=SAMPLE_RATE,
-                        input=True,
-                        frames_per_buffer=CHUNK_SIZE)
-    except Exception as e:
-        print(f"❌ Failed to open microphone: {e}")
-        return
-
-    session = aiohttp.ClientSession()
+    args = parser.parse_args()
     
-    try:
-        print(f"🔌 Connecting to {SERVER_URL}...")
-        async with session.ws_connect(SERVER_URL) as ws:
-            print("✅ Connected!")
+    # --list-devices only prints the device table; no client is created
+    if args.list_devices:
+        list_devices()
+        return
+    
+    client = MicrophoneClient(
+        url=args.url,
+        sample_rate=args.sample_rate,
+        input_device=args.input_device,
+        output_device=args.output_device
+    )
+    
+    # --chat selects the single-message mode of run(); otherwise the client
+    # is interactive unless --no-interactive was given
+    await client.run(
+        chat_message=args.chat,
+        interactive=not args.no_interactive
+    )

-            # 1. Send Invite
-            invite_msg = {
-                "command": "invite",
-                "option": {
-                    "codec": "pcm",
-                    "samplerate": SAMPLE_RATE
-                }
-            }
-            await ws.send_json(invite_msg)
-            print("📤 Sent Invite")
-
-            # 2. Run loops
-            await asyncio.gather(
-                receive_loop(ws),
-                send_audio_loop(ws, stream)
-            )
-
-    except aiohttp.ClientConnectorError:
-        print(f"❌ Failed to connect to {SERVER_URL}. Is the server running?")
-    except KeyboardInterrupt:
-        print("\n👋 Stopping...")
-    finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-        await session.close()

 if __name__ == "__main__":
     try:
         asyncio.run(main())
     except KeyboardInterrupt:
-        pass
+        # Ctrl+C is reported with a short message instead of a traceback
+        print("\nInterrupted by user")