Update engine

2026-02-23 17:16:18 +08:00
parent 01c0de0a4d
commit c6c84b5af9
9 changed files with 991 additions and 186 deletions
--- a/examples/mic_client.py
+++ b/examples/mic_client.py
@@ -59,8 +59,12 @@ class MicrophoneClient:
        url: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
+        app_id: str = "assistant_demo",
+        channel: str = "mic_client",
+        config_version_id: str = "local-dev",
        input_device: int = None,
-        output_device: int = None
+        output_device: int = None,
+        track_debug: bool = False,
    ):
        """
        Initialize microphone client.
@@ -76,8 +80,12 @@ class MicrophoneClient:
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
+        self.app_id = app_id
+        self.channel = channel
+        self.config_version_id = config_version_id
        self.input_device = input_device
        self.output_device = output_device
+        self.track_debug = track_debug
        
        # WebSocket connection
        self.ws = None
@@ -106,6 +114,17 @@ class MicrophoneClient:
        
        # Verbose mode for streaming LLM responses
        self.verbose = False
+
+    @staticmethod
+    def _event_ids_suffix(event: dict) -> str:
+        """Return ' [key=value ...]' for the ids present on *event*, or '' when there are none."""
+        nested = event.get("data")
+        data = nested if isinstance(nested, dict) else {}
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
+        # `or` (not a .get default) so a null/empty id under "data" falls back to the top level.
+        found = ((key, data.get(key) or event.get(key)) for key in keys)
+        parts = [f"{key}={value}" for key, value in found if value]
+        return f" [{' '.join(parts)}]" if parts else ""
    
    async def connect(self) -> None:
        """Connect to WebSocket server."""
@@ -114,20 +133,30 @@ class MicrophoneClient:
        self.running = True
        print("Connected!")
        
-        # Send invite command
+        # WS v1 handshake: hello -> session.start
        await self.send_command({
-            "command": "invite",
-            "option": {
-                "codec": "pcm",
-                "sampleRate": self.sample_rate
-            }
+            "type": "hello",
+            "version": "v1",
+        })
+        await self.send_command({
+            "type": "session.start",
+            "audio": {
+                "encoding": "pcm_s16le",
+                "sample_rate_hz": self.sample_rate,
+                "channels": 1,
+            },
+            "metadata": {
+                "appId": self.app_id,
+                "channel": self.channel,
+                "configVersionId": self.config_version_id,
+            },
        })
    
    async def send_command(self, cmd: dict) -> None:
        """Send JSON command to server."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
-            print(f"→ Command: {cmd.get('command', 'unknown')}")
+            print(f"→ Command: {cmd.get('type', 'unknown')}")
    
    async def send_chat(self, text: str) -> None:
        """Send chat message (text input)."""
@@ -136,7 +165,7 @@ class MicrophoneClient:
        self.first_audio_received = False
        
        await self.send_command({
-            "command": "chat",
+            "type": "input.text",
            "text": text
        })
        print(f"→ Chat: {text}")
@@ -144,13 +173,14 @@ class MicrophoneClient:
    async def send_interrupt(self) -> None:
        """Send interrupt command."""
        await self.send_command({
-            "command": "interrupt"
+            "type": "response.cancel",
+            "graceful": False,
        })
    
    async def send_hangup(self, reason: str = "User quit") -> None:
        """Send hangup command."""
        await self.send_command({
-            "command": "hangup",
+            "type": "session.stop",
            "reason": reason
        })
    
@@ -295,43 +325,48 @@ class MicrophoneClient:
    
    async def _handle_event(self, event: dict) -> None:
        """Handle incoming event."""
-        event_type = event.get("event", "unknown")
+        event_type = event.get("type", event.get("event", "unknown"))
+        ids = self._event_ids_suffix(event)
+        if self.track_debug:
+            print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")
        
-        if event_type == "answer":
-            print("← Session ready!")
-        elif event_type == "speaking":
-            print("← User speech detected")
-        elif event_type == "silence":
-            print("← User silence detected")
-        elif event_type == "transcript":
+        if event_type in {"hello.ack", "session.started"}:
+            print(f"← Session ready!{ids}")
+        elif event_type == "config.resolved":
+            print(f"← Config resolved: {event.get('config', {}).get('output', {})}{ids}")
+        elif event_type == "input.speech_started":
+            print(f"← User speech detected{ids}")
+        elif event_type == "input.speech_stopped":
+            print(f"← User silence detected{ids}")
+        elif event_type in {"transcript", "transcript.delta", "transcript.final"}:
            # Display user speech transcription
            text = event.get("text", "")
-            is_final = event.get("isFinal", False)
+            is_final = event_type == "transcript.final" or bool(event.get("isFinal"))
            if is_final:
                # Clear the interim line and print final
                print(" " * 80, end="\r")  # Clear previous interim text
-                print(f"→ You: {text}")
+                print(f"→ You: {text}{ids}")
            else:
                # Interim result - show with indicator (overwrite same line)
                display_text = text[:60] + "..." if len(text) > 60 else text
                print(f"  [listening] {display_text}".ljust(80), end="\r")
-        elif event_type == "ttfb":
+        elif event_type in {"ttfb", "metrics.ttfb"}:
            # Server-side TTFB event
            latency_ms = event.get("latencyMs", 0)
            print(f"← [TTFB] Server reported latency: {latency_ms}ms")
-        elif event_type == "llmResponse":
+        elif event_type in {"llmResponse", "assistant.response.delta", "assistant.response.final"}:
            # LLM text response
            text = event.get("text", "")
-            is_final = event.get("isFinal", False)
+            is_final = event_type == "assistant.response.final" or bool(event.get("isFinal"))
            if is_final:
                # Print final LLM response
                print(f"← AI: {text}")
            elif self.verbose:
                # Show streaming chunks only in verbose mode
                display_text = text[:60] + "..." if len(text) > 60 else text
-                print(f"  [streaming] {display_text}")
-        elif event_type == "trackStart":
-            print("← Bot started speaking")
+                print(f"  [streaming] {display_text}{ids}")
+        elif event_type in {"trackStart", "output.audio.start"}:
+            print(f"← Bot started speaking{ids}")
            # IMPORTANT: Accept audio again after trackStart
            self._discard_audio = False
            self._audio_sequence += 1
@@ -342,13 +377,13 @@ class MicrophoneClient:
            # Clear any old audio in buffer
            with self.audio_output_lock:
                self.audio_output_buffer = b""
-        elif event_type == "trackEnd":
-            print("← Bot finished speaking")
+        elif event_type in {"trackEnd", "output.audio.end"}:
+            print(f"← Bot finished speaking{ids}")
            # Reset TTFB tracking after response completes
            self.request_start_time = None
            self.first_audio_received = False
-        elif event_type == "interrupt":
-            print("← Bot interrupted!")
+        elif event_type in {"interrupt", "response.interrupted"}:
+            print(f"← Bot interrupted!{ids}")
            # IMPORTANT: Discard all audio until next trackStart
            self._discard_audio = True
            # Clear audio buffer immediately
@@ -357,12 +392,12 @@ class MicrophoneClient:
                self.audio_output_buffer = b""
                print(f"   (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
        elif event_type == "error":
-            print(f"← Error: {event.get('error')}")
-        elif event_type == "hangup":
-            print(f"← Hangup: {event.get('reason')}")
+            print(f"← Error: {event.get('error')}{ids}")
+        elif event_type in {"hangup", "session.stopped"}:
+            print(f"← Hangup: {event.get('reason')}{ids}")
            self.running = False
        else:
-            print(f"← Event: {event_type}")
+            print(f"← Event: {event_type}{ids}")
    
    async def interactive_mode(self) -> None:
        """Run interactive mode for text chat."""
@@ -573,6 +608,26 @@ async def main():
        action="store_true",
        help="Show streaming LLM response chunks"
    )
+    parser.add_argument(
+        "--app-id",
+        default="assistant_demo",
+        help="Stable app/assistant identifier for server-side config lookup"
+    )
+    parser.add_argument(
+        "--channel",
+        default="mic_client",
+        help="Client channel name"
+    )
+    parser.add_argument(
+        "--config-version-id",
+        default="local-dev",
+        help="Optional config version identifier"
+    )
+    parser.add_argument(
+        "--track-debug",
+        action="store_true",
+        help="Print event trackId for protocol debugging"
+    )
    
    args = parser.parse_args()
    
@@ -583,8 +638,12 @@ async def main():
    client = MicrophoneClient(
        url=args.url,
        sample_rate=args.sample_rate,
+        app_id=args.app_id,
+        channel=args.channel,
+        config_version_id=args.config_version_id,
        input_device=args.input_device,
-        output_device=args.output_device
+        output_device=args.output_device,
+        track_debug=args.track_debug,
    )
    client.verbose = args.verbose
    
--- a/examples/simple_client.py
+++ b/examples/simple_client.py
@@ -52,9 +52,21 @@ if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
 class SimpleVoiceClient:
    """Simple voice client with reliable audio playback."""
    
-    def __init__(self, url: str, sample_rate: int = 16000):
+    def __init__(
+        self,
+        url: str,
+        sample_rate: int = 16000,
+        app_id: str = "assistant_demo",
+        channel: str = "simple_client",
+        config_version_id: str = "local-dev",
+        track_debug: bool = False,
+    ):
        self.url = url
        self.sample_rate = sample_rate
+        self.app_id = app_id
+        self.channel = channel
+        self.config_version_id = config_version_id
+        self.track_debug = track_debug
        self.ws = None
        self.running = False
        
@@ -75,6 +87,17 @@ class SimpleVoiceClient:
        
        # Interrupt handling - discard audio until next trackStart
        self._discard_audio = False
+
+    @staticmethod
+    def _event_ids_suffix(event: dict) -> str:
+        """Return ' [key=value ...]' for the ids present on *event*, or '' when there are none."""
+        nested = event.get("data")
+        data = nested if isinstance(nested, dict) else {}
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
+        # `or` (not a .get default) so a null/empty id under "data" falls back to the top level.
+        found = ((key, data.get(key) or event.get(key)) for key in keys)
+        parts = [f"{key}={value}" for key, value in found if value]
+        return f" [{' '.join(parts)}]" if parts else ""
    
    async def connect(self):
        """Connect to server."""
@@ -83,12 +106,25 @@ class SimpleVoiceClient:
        self.running = True
        print("Connected!")
        
-        # Send invite
+        # WS v1 handshake: hello -> session.start
        await self.ws.send(json.dumps({
-            "command": "invite",
-            "option": {"codec": "pcm", "sampleRate": self.sample_rate}
+            "type": "hello",
+            "version": "v1",
        }))
-        print("-> invite")
+        await self.ws.send(json.dumps({
+            "type": "session.start",
+            "audio": {
+                "encoding": "pcm_s16le",
+                "sample_rate_hz": self.sample_rate,
+                "channels": 1,
+            },
+            "metadata": {
+                "appId": self.app_id,
+                "channel": self.channel,
+                "configVersionId": self.config_version_id,
+            },
+        }))
+        print("-> hello/session.start")
    
    async def send_chat(self, text: str):
        """Send chat message."""
@@ -96,8 +132,8 @@ class SimpleVoiceClient:
        self.request_start_time = time.time()
        self.first_audio_received = False
        
-        await self.ws.send(json.dumps({"command": "chat", "text": text}))
-        print(f"-> chat: {text}")
+        await self.ws.send(json.dumps({"type": "input.text", "text": text}))
+        print(f"-> input.text: {text}")
    
    def play_audio(self, audio_data: bytes):
        """Play audio data immediately."""
@@ -152,34 +188,39 @@ class SimpleVoiceClient:
                else:
                    # JSON event
                    event = json.loads(msg)
-                    etype = event.get("event", "?")
+                    etype = event.get("type", event.get("event", "?"))
+                    ids = self._event_ids_suffix(event)
+                    if self.track_debug:
+                        print(f"[track-debug] event={etype} trackId={event.get('trackId')}{ids}")
                    
-                    if etype == "transcript":
+                    if etype in {"transcript", "transcript.delta", "transcript.final"}:
                        # User speech transcription
                        text = event.get("text", "")
-                        is_final = event.get("isFinal", False)
+                        is_final = etype == "transcript.final" or bool(event.get("isFinal"))
                        if is_final:
-                            print(f"<- You said: {text}")
+                            print(f"<- You said: {text}{ids}")
                        else:
                            print(f"<- [listening] {text}", end="\r")
-                    elif etype == "ttfb":
+                    elif etype in {"ttfb", "metrics.ttfb"}:
                        # Server-side TTFB event
                        latency_ms = event.get("latencyMs", 0)
                        print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
-                    elif etype == "trackStart":
+                    elif etype in {"trackStart", "output.audio.start"}:
                        # New track starting - accept audio again
                        self._discard_audio = False
-                        print(f"<- {etype}")
-                    elif etype == "interrupt":
+                        print(f"<- {etype}{ids}")
+                    elif etype in {"interrupt", "response.interrupted"}:
                        # Interrupt - discard audio until next trackStart
                        self._discard_audio = True
-                        print(f"<- {etype} (discarding audio until new track)")
-                    elif etype == "hangup":
-                        print(f"<- {etype}")
+                        print(f"<- {etype}{ids} (discarding audio until new track)")
+                    elif etype in {"hangup", "session.stopped"}:
+                        print(f"<- {etype}{ids}")
                        self.running = False
                        break
+                    elif etype == "config.resolved":
+                        print(f"<- config.resolved {event.get('config', {}).get('output', {})}{ids}")
                    else:
-                        print(f"<- {etype}")
+                        print(f"<- {etype}{ids}")
                        
            except asyncio.TimeoutError:
                continue
@@ -270,6 +311,10 @@ async def main():
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
+    parser.add_argument("--app-id", default="assistant_demo")
+    parser.add_argument("--channel", default="simple_client")
+    parser.add_argument("--config-version-id", default="local-dev")
+    parser.add_argument("--track-debug", action="store_true")
    
    args = parser.parse_args()
    
@@ -277,7 +322,14 @@ async def main():
        list_audio_devices()
        return
    
-    client = SimpleVoiceClient(args.url, args.sample_rate)
+    client = SimpleVoiceClient(
+        args.url,
+        args.sample_rate,
+        app_id=args.app_id,
+        channel=args.channel,
+        config_version_id=args.config_version_id,
+        track_debug=args.track_debug,
+    )
    await client.run(args.text)


--- a/examples/test_websocket.py
+++ b/examples/test_websocket.py
@@ -36,8 +36,18 @@ def generate_sine_wave(duration_ms=1000):
    return audio_data


-async def receive_loop(ws, ready_event: asyncio.Event):
+async def receive_loop(ws, ready_event: asyncio.Event, track_debug: bool = False):
    """Listen for incoming messages from the server."""
+    def event_ids_suffix(data):
+        """Return ' [key=value ...]' for the ids present in *data*, or '' when there are none."""
+        nested = data.get("data")
+        payload = nested if isinstance(nested, dict) else {}
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
+        # `or` (not a .get default) so a null/empty nested id falls back to the top-level field.
+        found = ((key, payload.get(key) or data.get(key)) for key in keys)
+        parts = [f"{key}={value}" for key, value in found if value]
+        return f" [{' '.join(parts)}]" if parts else ""
+
    print("👂 Listening for server responses...")
    async for msg in ws:
        timestamp = datetime.now().strftime("%H:%M:%S")
@@ -46,7 +56,10 @@ async def receive_loop(ws, ready_event: asyncio.Event):
            try:
                data = json.loads(msg.data)
                event_type = data.get('type', 'Unknown')
-                print(f"[{timestamp}] 📨 Event: {event_type} | {msg.data[:150]}...")
+                ids = event_ids_suffix(data)
+                print(f"[{timestamp}] 📨 Event: {event_type}{ids} | {msg.data[:150]}...")
+                if track_debug:
+                    print(f"[{timestamp}] [track-debug] event={event_type} trackId={data.get('trackId')}{ids}")
                if event_type == "session.started":
                    ready_event.set()
            except json.JSONDecodeError:
@@ -113,7 +126,7 @@ async def send_sine_loop(ws):
    print("\n✅ Finished streaming test audio.")


-async def run_client(url, file_path=None, use_sine=False):
+async def run_client(url, file_path=None, use_sine=False, track_debug: bool = False):
    """Run the WebSocket test client."""
    session = aiohttp.ClientSession()
    try:
@@ -121,7 +134,7 @@ async def run_client(url, file_path=None, use_sine=False):
        async with session.ws_connect(url) as ws:
            print("✅ Connected!")
            session_ready = asyncio.Event()
-            recv_task = asyncio.create_task(receive_loop(ws, session_ready))
+            recv_task = asyncio.create_task(receive_loop(ws, session_ready, track_debug=track_debug))

            # Send v1 hello + session.start handshake
            await ws.send_json({"type": "hello", "version": "v1"})
@@ -131,7 +144,12 @@ async def run_client(url, file_path=None, use_sine=False):
                    "encoding": "pcm_s16le",
                    "sample_rate_hz": SAMPLE_RATE,
                    "channels": 1
-                }
+                },
+                "metadata": {
+                    "appId": "assistant_demo",
+                    "channel": "test_websocket",
+                    "configVersionId": "local-dev",
+                },
            })
            print("📤 Sent v1 hello/session.start")
            await asyncio.wait_for(session_ready.wait(), timeout=8)
@@ -168,9 +186,10 @@ if __name__ == "__main__":
    parser.add_argument("--url", default=SERVER_URL, help="WebSocket endpoint URL")
    parser.add_argument("--file", help="Path to PCM/WAV file to stream")
    parser.add_argument("--sine", action="store_true", help="Use sine wave generation (default)")
+    parser.add_argument("--track-debug", action="store_true", help="Print event trackId for protocol debugging")
    args = parser.parse_args()

    try:
-        asyncio.run(run_client(args.url, args.file, args.sine))
+        asyncio.run(run_client(args.url, args.file, args.sine, args.track_debug))
    except KeyboardInterrupt:
        print("\n👋 Client stopped.")
--- a/examples/wav_client.py
+++ b/examples/wav_client.py
@@ -57,10 +57,15 @@ class WavFileClient:
        url: str,
        input_file: str,
        output_file: str,
+        app_id: str = "assistant_demo",
+        channel: str = "wav_client",
+        config_version_id: str = "local-dev",
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
        wait_time: float = 15.0,
-        verbose: bool = False
+        verbose: bool = False,
+        track_debug: bool = False,
+        tail_silence_ms: int = 800,
    ):
        """
        Initialize WAV file client.
@@ -77,11 +82,17 @@ class WavFileClient:
        self.url = url
        self.input_file = Path(input_file)
        self.output_file = Path(output_file)
+        self.app_id = app_id
+        self.channel = channel
+        self.config_version_id = config_version_id
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
        self.wait_time = wait_time
        self.verbose = verbose
+        self.track_debug = track_debug
+        self.tail_silence_ms = max(0, int(tail_silence_ms))
+        self.frame_bytes = 640  # 16k mono pcm_s16le, 20ms
        
        # WebSocket connection
        self.ws = None
@@ -125,6 +136,17 @@ class WavFileClient:
            # Replace problematic characters for console output
            safe_message = message.encode('ascii', errors='replace').decode('ascii')
            print(f"{direction} {safe_message}")
+
+    @staticmethod
+    def _event_ids_suffix(event: dict) -> str:
+        """Return ' [key=value ...]' for the ids present on *event*, or '' when there are none."""
+        nested = event.get("data")
+        data = nested if isinstance(nested, dict) else {}
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
+        # `or` (not a .get default) so a null/empty id under "data" falls back to the top level.
+        found = ((key, data.get(key) or event.get(key)) for key in keys)
+        parts = [f"{key}={value}" for key, value in found if value]
+        return f" [{' '.join(parts)}]" if parts else ""
    
    async def connect(self) -> None:
        """Connect to WebSocket server."""
@@ -144,7 +166,12 @@ class WavFileClient:
                "encoding": "pcm_s16le",
                "sample_rate_hz": self.sample_rate,
                "channels": 1
-            }
+            },
+            "metadata": {
+                "appId": self.app_id,
+                "channel": self.channel,
+                "configVersionId": self.config_version_id,
+            },
        })
    
    async def send_command(self, cmd: dict) -> None:
@@ -216,6 +243,10 @@ class WavFileClient:
            end_sample = min(sent_samples + chunk_size, total_samples)
            chunk = audio_data[sent_samples:end_sample]
            chunk_bytes = chunk.tobytes()
+            if len(chunk_bytes) % self.frame_bytes != 0:
+                # v1 audio framing requires 640-byte (20ms) PCM units.
+                pad = self.frame_bytes - (len(chunk_bytes) % self.frame_bytes)
+                chunk_bytes += b"\x00" * pad
            
            # Send to server
            if self.ws:
@@ -232,6 +263,16 @@ class WavFileClient:
            # Delay to simulate real-time streaming
            # Server expects audio at real-time pace for VAD/ASR to work properly
            await asyncio.sleep(self.chunk_duration_ms / 1000)
+
+        # Add a short silence tail to help VAD/EOU close the final utterance.
+        if self.tail_silence_ms > 0 and self.ws:
+            tail_frames = max(1, self.tail_silence_ms // 20)
+            silence = b"\x00" * self.frame_bytes
+            for _ in range(tail_frames):
+                await self.ws.send(silence)
+                self.bytes_sent += len(silence)
+                await asyncio.sleep(0.02)
+            self.log_event("→", f"Sent trailing silence: {self.tail_silence_ms}ms")
        
        self.send_completed = True
        elapsed = time.time() - self.send_start_time
@@ -284,16 +325,22 @@ class WavFileClient:
    async def _handle_event(self, event: dict) -> None:
        """Handle incoming event."""
        event_type = event.get("type", "unknown")
+        ids = self._event_ids_suffix(event)
+        if self.track_debug:
+            print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")

        if event_type == "hello.ack":
-            self.log_event("←", "Handshake acknowledged")
+            self.log_event("←", f"Handshake acknowledged{ids}")
        elif event_type == "session.started":
            self.session_ready = True
-            self.log_event("←", "Session ready!")
+            self.log_event("←", f"Session ready!{ids}")
+        elif event_type == "config.resolved":
+            config = event.get("config", {})
+            self.log_event("←", f"Config resolved (output={config.get('output', {})}){ids}")
        elif event_type == "input.speech_started":
-            self.log_event("←", "Speech detected")
+            self.log_event("←", f"Speech detected{ids}")
        elif event_type == "input.speech_stopped":
-            self.log_event("←", "Silence detected")
+            self.log_event("←", f"Silence detected{ids}")
        elif event_type == "transcript.delta":
            text = event.get("text", "")
            display_text = text[:60] + "..." if len(text) > 60 else text
@@ -301,35 +348,35 @@ class WavFileClient:
        elif event_type == "transcript.final":
            text = event.get("text", "")
            print(" " * 80, end="\r")
-            self.log_event("←", f"→ You: {text}")
+            self.log_event("←", f"→ You: {text}{ids}")
        elif event_type == "metrics.ttfb":
            latency_ms = event.get("latencyMs", 0)
            self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
        elif event_type == "assistant.response.delta":
            text = event.get("text", "")
            if self.verbose and text:
-                self.log_event("←", f"LLM: {text}")
+                self.log_event("←", f"LLM: {text}{ids}")
        elif event_type == "assistant.response.final":
            text = event.get("text", "")
            if text:
-                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
+                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}{ids}")
        elif event_type == "output.audio.start":
            self.track_started = True
            self.response_start_time = time.time()
            self.waiting_for_first_audio = True
-            self.log_event("←", "Bot started speaking")
+            self.log_event("←", f"Bot started speaking{ids}")
        elif event_type == "output.audio.end":
            self.track_ended = True
-            self.log_event("←", "Bot finished speaking")
+            self.log_event("←", f"Bot finished speaking{ids}")
        elif event_type == "response.interrupted":
-            self.log_event("←", "Bot interrupted!")
+            self.log_event("←", f"Bot interrupted!{ids}")
        elif event_type == "error":
-            self.log_event("!", f"Error: {event.get('message')}")
+            self.log_event("!", f"Error: {event.get('message')}{ids}")
        elif event_type == "session.stopped":
-            self.log_event("←", f"Session stopped: {event.get('reason')}")
+            self.log_event("←", f"Session stopped: {event.get('reason')}{ids}")
            self.running = False
        else:
-            self.log_event("←", f"Event: {event_type}")
+            self.log_event("←", f"Event: {event_type}{ids}")
    
    def save_output_wav(self) -> None:
        """Save received audio to output WAV file."""
@@ -473,6 +520,21 @@ async def main():
        default=16000,
        help="Target sample rate for audio (default: 16000)"
    )
+    parser.add_argument(
+        "--app-id",
+        default="assistant_demo",
+        help="Stable app/assistant identifier for server-side config lookup"
+    )
+    parser.add_argument(
+        "--channel",
+        default="wav_client",
+        help="Client channel name"
+    )
+    parser.add_argument(
+        "--config-version-id",
+        default="local-dev",
+        help="Optional config version identifier"
+    )
    parser.add_argument(
        "--chunk-duration",
        type=int,
@@ -490,6 +552,17 @@ async def main():
        action="store_true",
        help="Enable verbose output"
    )
+    parser.add_argument(
+        "--track-debug",
+        action="store_true",
+        help="Print event trackId for protocol debugging"
+    )
+    parser.add_argument(
+        "--tail-silence-ms",
+        type=int,
+        default=800,
+        help="Trailing silence to send after WAV playback for EOU detection (default: 800)"
+    )
    
    args = parser.parse_args()
    
@@ -497,10 +570,15 @@ async def main():
        url=args.url,
        input_file=args.input,
        output_file=args.output,
+        app_id=args.app_id,
+        channel=args.channel,
+        config_version_id=args.config_version_id,
        sample_rate=args.sample_rate,
        chunk_duration_ms=args.chunk_duration,
        wait_time=args.wait_time,
-        verbose=args.verbose
+        verbose=args.verbose,
+        track_debug=args.track_debug,
+        tail_silence_ms=args.tail_silence_ms,
    )
    
    await client.run()
--- a/examples/web_client.html
+++ b/examples/web_client.html
@@ -401,6 +401,9 @@

      const targetSampleRate = 16000;
      const playbackStopRampSec = 0.008;
+      const appId = "assistant_demo";
+      const channel = "web_client";
+      const configVersionId = "local-dev";

      function logLine(type, text, data) {
        const time = new Date().toLocaleTimeString();
@@ -604,15 +607,35 @@
        logLine("sys", `→ ${cmd.type}`, cmd);
      }

+      // Builds " [key=value ...]" from the correlation ids on an event; returns "" when none are set.
+      function eventIdsSuffix(event) {
+        // Normalise first so a null / non-object event cannot throw on the property reads below.
+        const top = event && typeof event === "object" ? event : {};
+        const data = top.data && typeof top.data === "object" ? top.data : {};
+        const keys = ["turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id"];
+        const idOf = (key) => data[key] || top[key];
+        const parts = keys.filter(idOf).map((key) => `${key}=${idOf(key)}`);
+        return parts.length ? ` [${parts.join(" ")}]` : "";
+      }
+
      function handleEvent(event) {
        const type = event.type || "unknown";
-        logLine("event", type, event);
+        const ids = eventIdsSuffix(event);
+        logLine("event", `${type}${ids}`, event);
        if (type === "hello.ack") {
          sendCommand({
            type: "session.start",
            audio: { encoding: "pcm_s16le", sample_rate_hz: targetSampleRate, channels: 1 },
+            metadata: {
+              appId,
+              channel,
+              configVersionId,
+            },
          });
        }
+        if (type === "config.resolved") {
+          logLine("sys", "config.resolved", event.config || {});
+        }
        if (type === "transcript.final") {
          if (event.text) {
            setInterim("You", "");