Fix WAV client to use the WS v1 protocol (hello -> session.start handshake) instead of the legacy invite/hangup protocol

This commit is contained in:
Xin Wang
2026-02-17 13:18:00 +08:00
parent 30eb4397c2
commit 01c0de0a4d

View File

@@ -105,6 +105,7 @@ class WavFileClient:
self.track_started = False self.track_started = False
self.track_ended = False self.track_ended = False
self.send_completed = False self.send_completed = False
self.session_ready = False
# Events log # Events log
self.events_log = [] self.events_log = []
@@ -131,13 +132,18 @@ class WavFileClient:
self.ws = await websockets.connect(self.url) self.ws = await websockets.connect(self.url)
self.running = True self.running = True
self.log_event("", "Connected!") self.log_event("", "Connected!")
# Send invite command # WS v1 handshake: hello -> session.start
await self.send_command({ await self.send_command({
"command": "invite", "type": "hello",
"option": { "version": "v1",
"codec": "pcm", })
"sampleRate": self.sample_rate await self.send_command({
"type": "session.start",
"audio": {
"encoding": "pcm_s16le",
"sample_rate_hz": self.sample_rate,
"channels": 1
} }
}) })
@@ -145,12 +151,12 @@ class WavFileClient:
"""Send JSON command to server.""" """Send JSON command to server."""
if self.ws: if self.ws:
await self.ws.send(json.dumps(cmd)) await self.ws.send(json.dumps(cmd))
self.log_event("", f"Command: {cmd.get('command', 'unknown')}") self.log_event("", f"Command: {cmd.get('type', 'unknown')}")
async def send_hangup(self, reason: str = "Session complete") -> None: async def send_hangup(self, reason: str = "Session complete") -> None:
"""Send hangup command.""" """Send hangup command."""
await self.send_command({ await self.send_command({
"command": "hangup", "type": "session.stop",
"reason": reason "reason": reason
}) })
@@ -277,51 +283,50 @@ class WavFileClient:
async def _handle_event(self, event: dict) -> None: async def _handle_event(self, event: dict) -> None:
"""Handle incoming event.""" """Handle incoming event."""
event_type = event.get("event", "unknown") event_type = event.get("type", "unknown")
if event_type == "answer": if event_type == "hello.ack":
self.log_event("", "Handshake acknowledged")
elif event_type == "session.started":
self.session_ready = True
self.log_event("", "Session ready!") self.log_event("", "Session ready!")
elif event_type == "speaking": elif event_type == "input.speech_started":
self.log_event("", "Speech detected") self.log_event("", "Speech detected")
elif event_type == "silence": elif event_type == "input.speech_stopped":
self.log_event("", "Silence detected") self.log_event("", "Silence detected")
elif event_type == "transcript": elif event_type == "transcript.delta":
# ASR transcript (interim = asrDelta-style, final = asrFinal-style)
text = event.get("text", "") text = event.get("text", "")
is_final = event.get("isFinal", False) display_text = text[:60] + "..." if len(text) > 60 else text
if is_final: print(f" [listening] {display_text}".ljust(80), end="\r")
# Clear interim line and print final elif event_type == "transcript.final":
print(" " * 80, end="\r") text = event.get("text", "")
self.log_event("", f"→ You: {text}") print(" " * 80, end="\r")
else: self.log_event("", f"→ You: {text}")
# Interim result - show with indicator (overwrite same line, as in mic_client) elif event_type == "metrics.ttfb":
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [listening] {display_text}".ljust(80), end="\r")
elif event_type == "ttfb":
latency_ms = event.get("latencyMs", 0) latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms") self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "llmResponse": elif event_type == "assistant.response.delta":
text = event.get("text", "") text = event.get("text", "")
is_final = event.get("isFinal", False) if self.verbose and text:
if is_final:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif self.verbose:
# Show streaming chunks only in verbose mode
self.log_event("", f"LLM: {text}") self.log_event("", f"LLM: {text}")
elif event_type == "trackStart": elif event_type == "assistant.response.final":
text = event.get("text", "")
if text:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif event_type == "output.audio.start":
self.track_started = True self.track_started = True
self.response_start_time = time.time() self.response_start_time = time.time()
self.waiting_for_first_audio = True self.waiting_for_first_audio = True
self.log_event("", "Bot started speaking") self.log_event("", "Bot started speaking")
elif event_type == "trackEnd": elif event_type == "output.audio.end":
self.track_ended = True self.track_ended = True
self.log_event("", "Bot finished speaking") self.log_event("", "Bot finished speaking")
elif event_type == "interrupt": elif event_type == "response.interrupted":
self.log_event("", "Bot interrupted!") self.log_event("", "Bot interrupted!")
elif event_type == "error": elif event_type == "error":
self.log_event("!", f"Error: {event.get('error')}") self.log_event("!", f"Error: {event.get('message')}")
elif event_type == "hangup": elif event_type == "session.stopped":
self.log_event("", f"Hangup: {event.get('reason')}") self.log_event("", f"Session stopped: {event.get('reason')}")
self.running = False self.running = False
else: else:
self.log_event("", f"Event: {event_type}") self.log_event("", f"Event: {event_type}")
@@ -359,11 +364,15 @@ class WavFileClient:
# Connect to server # Connect to server
await self.connect() await self.connect()
# Wait for answer
await asyncio.sleep(0.5)
# Start receiver task # Start receiver task
receiver_task = asyncio.create_task(self.receiver()) receiver_task = asyncio.create_task(self.receiver())
# Wait for session.started before streaming audio
ready_start = time.time()
while self.running and not self.session_ready:
if time.time() - ready_start > 8.0:
raise TimeoutError("Timeout waiting for session.started")
await asyncio.sleep(0.05)
# Send audio # Send audio
await self.audio_sender(audio_data) await self.audio_sender(audio_data)