diff --git a/examples/wav_client.py b/examples/wav_client.py index db638b9..729e4d2 100644 --- a/examples/wav_client.py +++ b/examples/wav_client.py @@ -105,6 +105,7 @@ class WavFileClient: self.track_started = False self.track_ended = False self.send_completed = False + self.session_ready = False # Events log self.events_log = [] @@ -131,13 +132,18 @@ class WavFileClient: self.ws = await websockets.connect(self.url) self.running = True self.log_event("←", "Connected!") - - # Send invite command + + # WS v1 handshake: hello -> session.start await self.send_command({ - "command": "invite", - "option": { - "codec": "pcm", - "sampleRate": self.sample_rate + "type": "hello", + "version": "v1", + }) + await self.send_command({ + "type": "session.start", + "audio": { + "encoding": "pcm_s16le", + "sample_rate_hz": self.sample_rate, + "channels": 1 } }) @@ -145,12 +151,12 @@ class WavFileClient: """Send JSON command to server.""" if self.ws: await self.ws.send(json.dumps(cmd)) - self.log_event("→", f"Command: {cmd.get('command', 'unknown')}") + self.log_event("→", f"Command: {cmd.get('type', 'unknown')}") async def send_hangup(self, reason: str = "Session complete") -> None: """Send hangup command.""" await self.send_command({ - "command": "hangup", + "type": "session.stop", "reason": reason }) @@ -277,51 +283,50 @@ class WavFileClient: async def _handle_event(self, event: dict) -> None: """Handle incoming event.""" - event_type = event.get("event", "unknown") - - if event_type == "answer": + event_type = event.get("type", "unknown") + + if event_type == "hello.ack": + self.log_event("←", "Handshake acknowledged") + elif event_type == "session.started": + self.session_ready = True self.log_event("←", "Session ready!") - elif event_type == "speaking": + elif event_type == "input.speech_started": self.log_event("←", "Speech detected") - elif event_type == "silence": + elif event_type == "input.speech_stopped": self.log_event("←", "Silence detected") - elif event_type == "transcript": - # ASR transcript (interim = asrDelta-style, final = asrFinal-style) + elif event_type == "transcript.delta": text = event.get("text", "") - is_final = event.get("isFinal", False) - if is_final: - # Clear interim line and print final - print(" " * 80, end="\r") - self.log_event("←", f"→ You: {text}") - else: - # Interim result - show with indicator (overwrite same line, as in mic_client) - display_text = text[:60] + "..." if len(text) > 60 else text - print(f" [listening] {display_text}".ljust(80), end="\r") - elif event_type == "ttfb": + display_text = text[:60] + "..." if len(text) > 60 else text + print(f" [listening] {display_text}".ljust(80), end="\r") + elif event_type == "transcript.final": + text = event.get("text", "") + print(" " * 80, end="\r") + self.log_event("←", f"→ You: {text}") + elif event_type == "metrics.ttfb": latency_ms = event.get("latencyMs", 0) self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms") - elif event_type == "llmResponse": + elif event_type == "assistant.response.delta": text = event.get("text", "") - is_final = event.get("isFinal", False) - if is_final: - self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}") - elif self.verbose: - # Show streaming chunks only in verbose mode + if self.verbose and text: self.log_event("←", f"LLM: {text}") - elif event_type == "trackStart": + elif event_type == "assistant.response.final": + text = event.get("text", "") + if text: + self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}") + elif event_type == "output.audio.start": self.track_started = True self.response_start_time = time.time() self.waiting_for_first_audio = True self.log_event("←", "Bot started speaking") - elif event_type == "trackEnd": + elif event_type == "output.audio.end": self.track_ended = True self.log_event("←", "Bot finished speaking") - elif event_type == "interrupt": + elif event_type == "response.interrupted": self.log_event("←", "Bot interrupted!") elif event_type == "error": - self.log_event("!", f"Error: {event.get('error')}") - elif event_type == "hangup": - self.log_event("←", f"Hangup: {event.get('reason')}") + self.log_event("!", f"Error: {event.get('message')}") + elif event_type == "session.stopped": + self.log_event("←", f"Session stopped: {event.get('reason')}") self.running = False else: self.log_event("←", f"Event: {event_type}") @@ -359,11 +364,15 @@ class WavFileClient: # Connect to server await self.connect() - # Wait for answer - await asyncio.sleep(0.5) - # Start receiver task receiver_task = asyncio.create_task(self.receiver()) + + # Wait for session.started before streaming audio + ready_start = time.time() + while self.running and not self.session_ready: + if time.time() - ready_start > 8.0: + raise TimeoutError("Timeout waiting for session.started") + await asyncio.sleep(0.05) # Send audio await self.audio_sender(audio_data)