fix legacy protocol in wav client
@@ -105,6 +105,7 @@ class WavFileClient:
         self.track_started = False
         self.track_ended = False
         self.send_completed = False
+        self.session_ready = False
 
         # Events log
         self.events_log = []
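The new `session_ready` flag joins three other per-run booleans. As an aside, a hypothetical grouping of those flags into one resettable container (not part of this commit) would look like:

    from dataclasses import dataclass, fields

    @dataclass
    class SessionState:
        """Per-run flags mirrored from WavFileClient (hypothetical grouping)."""
        session_ready: bool = False
        track_started: bool = False
        track_ended: bool = False
        send_completed: bool = False

        def reset(self) -> None:
            # Restore every flag to its declared default in one place.
            for f in fields(self):
                setattr(self, f.name, f.default)

    state = SessionState()
    state.session_ready = True
    state.reset()
    assert state.session_ready is False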
@@ -132,12 +133,17 @@ class WavFileClient:
         self.running = True
         self.log_event("←", "Connected!")
 
-        # Send invite command
+        # WS v1 handshake: hello -> session.start
         await self.send_command({
-            "command": "invite",
-            "option": {
-                "codec": "pcm",
-                "sampleRate": self.sample_rate
+            "type": "hello",
+            "version": "v1",
+        })
+        await self.send_command({
+            "type": "session.start",
+            "audio": {
+                "encoding": "pcm_s16le",
+                "sample_rate_hz": self.sample_rate,
+                "channels": 1
             }
         })
 
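For reference, the two frames this handshake puts on the wire, as a standalone sketch. The `ws://localhost:8765` URL and the `websockets` package are assumptions here; only the message shapes come from the commit:

    import asyncio
    import json

    import websockets  # assumed available; the client already speaks WebSocket

    async def handshake(url: str = "ws://localhost:8765", sample_rate: int = 16000) -> None:
        async with websockets.connect(url) as ws:
            # Frame 1: version negotiation.
            await ws.send(json.dumps({"type": "hello", "version": "v1"}))
            # Frame 2: declare the PCM format before any audio is streamed.
            await ws.send(json.dumps({
                "type": "session.start",
                "audio": {
                    "encoding": "pcm_s16le",
                    "sample_rate_hz": sample_rate,
                    "channels": 1,
                },
            }))

    asyncio.run(handshake())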
@@ -145,12 +151,12 @@ class WavFileClient:
         """Send JSON command to server."""
         if self.ws:
             await self.ws.send(json.dumps(cmd))
-            self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")
+            self.log_event("→", f"Command: {cmd.get('type', 'unknown')}")
 
     async def send_hangup(self, reason: str = "Session complete") -> None:
         """Send hangup command."""
         await self.send_command({
-            "command": "hangup",
+            "type": "session.stop",
             "reason": reason
         })
 
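Every v1 frame is keyed by "type" now that the legacy "command" key is gone. A hypothetical guard (not in this commit) that rejects malformed outgoing frames before they hit the socket:

    def validate_v1_frame(cmd: dict) -> dict:
        """Reject outgoing frames that lack the v1 'type' key (hypothetical helper)."""
        if "type" not in cmd:
            raise ValueError(f"v1 frame missing 'type': {cmd!r}")
        return cmd

    # Inside send_command this would read:
    #     await self.ws.send(json.dumps(validate_v1_frame(cmd)))
    assert validate_v1_frame({"type": "session.stop", "reason": "done"})["type"] == "session.stop"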
@@ -277,51 +283,50 @@ class WavFileClient:
 
     async def _handle_event(self, event: dict) -> None:
         """Handle incoming event."""
-        event_type = event.get("event", "unknown")
+        event_type = event.get("type", "unknown")
 
-        if event_type == "answer":
+        if event_type == "hello.ack":
+            self.log_event("←", "Handshake acknowledged")
+        elif event_type == "session.started":
+            self.session_ready = True
             self.log_event("←", "Session ready!")
-        elif event_type == "speaking":
+        elif event_type == "input.speech_started":
             self.log_event("←", "Speech detected")
-        elif event_type == "silence":
+        elif event_type == "input.speech_stopped":
             self.log_event("←", "Silence detected")
-        elif event_type == "transcript":
-            # ASR transcript (interim = asrDelta-style, final = asrFinal-style)
+        elif event_type == "transcript.delta":
             text = event.get("text", "")
-            is_final = event.get("isFinal", False)
-            if is_final:
-                # Clear interim line and print final
-                print(" " * 80, end="\r")
-                self.log_event("←", f"→ You: {text}")
-            else:
-                # Interim result - show with indicator (overwrite same line, as in mic_client)
-                display_text = text[:60] + "..." if len(text) > 60 else text
-                print(f" [listening] {display_text}".ljust(80), end="\r")
-        elif event_type == "ttfb":
+            display_text = text[:60] + "..." if len(text) > 60 else text
+            print(f" [listening] {display_text}".ljust(80), end="\r")
+        elif event_type == "transcript.final":
+            text = event.get("text", "")
+            print(" " * 80, end="\r")
+            self.log_event("←", f"→ You: {text}")
+        elif event_type == "metrics.ttfb":
             latency_ms = event.get("latencyMs", 0)
             self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
-        elif event_type == "llmResponse":
+        elif event_type == "assistant.response.delta":
             text = event.get("text", "")
-            is_final = event.get("isFinal", False)
-            if is_final:
-                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
-            elif self.verbose:
-                # Show streaming chunks only in verbose mode
+            if self.verbose and text:
                 self.log_event("←", f"LLM: {text}")
-        elif event_type == "trackStart":
+        elif event_type == "assistant.response.final":
+            text = event.get("text", "")
+            if text:
+                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
+        elif event_type == "output.audio.start":
             self.track_started = True
             self.response_start_time = time.time()
             self.waiting_for_first_audio = True
             self.log_event("←", "Bot started speaking")
-        elif event_type == "trackEnd":
+        elif event_type == "output.audio.end":
             self.track_ended = True
             self.log_event("←", "Bot finished speaking")
-        elif event_type == "interrupt":
+        elif event_type == "response.interrupted":
             self.log_event("←", "Bot interrupted!")
         elif event_type == "error":
-            self.log_event("!", f"Error: {event.get('error')}")
-        elif event_type == "hangup":
-            self.log_event("←", f"Hangup: {event.get('reason')}")
+            self.log_event("!", f"Error: {event.get('message')}")
+        elif event_type == "session.stopped":
+            self.log_event("←", f"Session stopped: {event.get('reason')}")
             self.running = False
         else:
             self.log_event("←", f"Event: {event_type}")
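The if/elif ladder is now a dozen arms long. A dispatch-table alternative, sketched with an illustrative subset of the v1 event names above (a possible refactor, not what this commit does):

    from typing import Callable

    def make_dispatcher(log: Callable[[str], None]) -> Callable[[dict], None]:
        """Map v1 event types to handlers (illustrative subset)."""
        handlers: dict[str, Callable[[dict], None]] = {
            "hello.ack": lambda e: log("Handshake acknowledged"),
            "session.started": lambda e: log("Session ready!"),
            "transcript.final": lambda e: log(f"You: {e.get('text', '')}"),
            "error": lambda e: log(f"Error: {e.get('message')}"),
        }

        def dispatch(event: dict) -> None:
            event_type = event.get("type", "unknown")
            # Unknown types fall through to a generic logger, like the ladder's else.
            handlers.get(event_type, lambda e: log(f"Event: {event_type}"))(event)

        return dispatch

    dispatch = make_dispatcher(print)
    dispatch({"type": "transcript.final", "text": "hello there"})
    dispatch({"type": "metrics.ttfb", "latencyMs": 120})  # unhandled -> default logger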
@@ -359,12 +364,16 @@ class WavFileClient:
         # Connect to server
         await self.connect()
 
-        # Wait for answer
-        await asyncio.sleep(0.5)
-
         # Start receiver task
         receiver_task = asyncio.create_task(self.receiver())
 
+        # Wait for session.started before streaming audio
+        ready_start = time.time()
+        while self.running and not self.session_ready:
+            if time.time() - ready_start > 8.0:
+                raise TimeoutError("Timeout waiting for session.started")
+            await asyncio.sleep(0.05)
+
         # Send audio
         await self.audio_sender(audio_data)
 
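The readiness wait polls `session_ready` every 50 ms. A polling-free shape, assuming the flag were replaced with an `asyncio.Event` that `_handle_event` sets on `session.started`:

    import asyncio

    async def main() -> None:
        session_ready = asyncio.Event()

        async def fake_receiver() -> None:
            # Stand-in for _handle_event: set the event on "session.started".
            await asyncio.sleep(0.2)
            session_ready.set()

        receiver = asyncio.create_task(fake_receiver())
        try:
            await asyncio.wait_for(session_ready.wait(), timeout=8.0)
        except asyncio.TimeoutError:
            raise TimeoutError("Timeout waiting for session.started") from None
        print("session.started received; safe to stream audio")

    asyncio.run(main())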