Update engine

2026-02-23 17:16:18 +08:00
parent 01c0de0a4d
commit c6c84b5af9
9 changed files with 991 additions and 186 deletions
--- a/examples/simple_client.py
+++ b/examples/simple_client.py
@@ -52,9 +52,21 @@ if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
 class SimpleVoiceClient:
    """Simple voice client with reliable audio playback."""
    
-    def __init__(self, url: str, sample_rate: int = 16000):
+    def __init__(
+        self,
+        url: str,
+        sample_rate: int = 16000,
+        app_id: str = "assistant_demo",
+        channel: str = "simple_client",
+        config_version_id: str = "local-dev",
+        track_debug: bool = False,
+    ):
        self.url = url
        self.sample_rate = sample_rate
+        self.app_id = app_id
+        self.channel = channel
+        self.config_version_id = config_version_id
+        self.track_debug = track_debug
        self.ws = None
        self.running = False
        
@@ -75,6 +87,17 @@ class SimpleVoiceClient:
        
        # Interrupt handling - discard audio until next trackStart
        self._discard_audio = False
+
+    @staticmethod
+    def _event_ids_suffix(event: dict) -> str:
+        """Format the correlation IDs carried by an event as a printable suffix.
+
+        Args:
+            event: Decoded JSON event. Each ID is looked up in ``event["data"]``
+                (when that value is a dict); the top-level ``event`` value is
+                used only for keys that are absent from ``data``.
+
+        Returns:
+            ``" [key=value key=value ...]"`` (with a leading space) listing the
+            IDs that have a truthy value, in the fixed key order below, or
+            ``""`` when none of them is set.
+        """
+        # A missing or non-dict "data" is treated as empty so the lookups below are safe.
+        data = event.get("data") if isinstance(event.get("data"), dict) else {}
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
+        parts = []
+        for key in keys:
+            # The nested "data" value wins whenever the key is present there,
+            # even if that value is falsy; otherwise fall back to the top level.
+            value = data.get(key, event.get(key))
+            # Falsy values (None, "", 0) are left out of the suffix.
+            if value:
+                parts.append(f"{key}={value}")
+        # The leading space lets the suffix be appended directly to a log line.
+        return f" [{' '.join(parts)}]" if parts else ""
    
    async def connect(self):
        """Connect to server."""
@@ -83,12 +106,25 @@ class SimpleVoiceClient:
        self.running = True
        print("Connected!")
        
-        # Send invite
+        # WS v1 handshake: hello -> session.start
        await self.ws.send(json.dumps({
-            "command": "invite",
-            "option": {"codec": "pcm", "sampleRate": self.sample_rate}
+            "type": "hello",
+            "version": "v1",
        }))
-        print("-> invite")
+        await self.ws.send(json.dumps({
+            "type": "session.start",
+            "audio": {
+                "encoding": "pcm_s16le",
+                "sample_rate_hz": self.sample_rate,
+                "channels": 1,
+            },
+            "metadata": {
+                "appId": self.app_id,
+                "channel": self.channel,
+                "configVersionId": self.config_version_id,
+            },
+        }))
+        print("-> hello/session.start")
    
    async def send_chat(self, text: str):
        """Send chat message."""
@@ -96,8 +132,8 @@ class SimpleVoiceClient:
        self.request_start_time = time.time()
        self.first_audio_received = False
        
-        await self.ws.send(json.dumps({"command": "chat", "text": text}))
-        print(f"-> chat: {text}")
+        await self.ws.send(json.dumps({"type": "input.text", "text": text}))
+        print(f"-> input.text: {text}")
    
    def play_audio(self, audio_data: bytes):
        """Play audio data immediately."""
@@ -152,34 +188,39 @@ class SimpleVoiceClient:
                else:
                    # JSON event
                    event = json.loads(msg)
-                    etype = event.get("event", "?")
+                    etype = event.get("type", event.get("event", "?"))
+                    ids = self._event_ids_suffix(event)
+                    if self.track_debug:
+                        print(f"[track-debug] event={etype} trackId={event.get('trackId')}{ids}")
                    
-                    if etype == "transcript":
+                    if etype in {"transcript", "transcript.delta", "transcript.final"}:
                        # User speech transcription
                        text = event.get("text", "")
-                        is_final = event.get("isFinal", False)
+                        is_final = etype == "transcript.final" or bool(event.get("isFinal"))
                        if is_final:
-                            print(f"<- You said: {text}")
+                            print(f"<- You said: {text}{ids}")
                        else:
                            print(f"<- [listening] {text}", end="\r")
-                    elif etype == "ttfb":
+                    elif etype in {"ttfb", "metrics.ttfb"}:
                        # Server-side TTFB event
                        latency_ms = event.get("latencyMs", 0)
                        print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
-                    elif etype == "trackStart":
+                    elif etype in {"trackStart", "output.audio.start"}:
                        # New track starting - accept audio again
                        self._discard_audio = False
-                        print(f"<- {etype}")
-                    elif etype == "interrupt":
+                        print(f"<- {etype}{ids}")
+                    elif etype in {"interrupt", "response.interrupted"}:
                        # Interrupt - discard audio until next trackStart
                        self._discard_audio = True
-                        print(f"<- {etype} (discarding audio until new track)")
-                    elif etype == "hangup":
-                        print(f"<- {etype}")
+                        print(f"<- {etype}{ids} (discarding audio until new track)")
+                    elif etype in {"hangup", "session.stopped"}:
+                        print(f"<- {etype}{ids}")
                        self.running = False
                        break
+                    elif etype == "config.resolved":
+                        print(f"<- config.resolved {event.get('config', {}).get('output', {})}{ids}")
                    else:
-                        print(f"<- {etype}")
+                        print(f"<- {etype}{ids}")
                        
            except asyncio.TimeoutError:
                continue
@@ -270,6 +311,10 @@ async def main():
    parser.add_argument("--text", help="Send text and play response")
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--sample-rate", type=int, default=16000)
+    parser.add_argument("--app-id", default="assistant_demo")
+    parser.add_argument("--channel", default="simple_client")
+    parser.add_argument("--config-version-id", default="local-dev")
+    parser.add_argument("--track-debug", action="store_true")
    
    args = parser.parse_args()
    
@@ -277,7 +322,14 @@ async def main():
        list_audio_devices()
        return
    
-    client = SimpleVoiceClient(args.url, args.sample_rate)
+    client = SimpleVoiceClient(
+        args.url,
+        args.sample_rate,
+        app_id=args.app_id,
+        channel=args.channel,
+        config_version_id=args.config_version_id,
+        track_debug=args.track_debug,
+    )
    await client.run(args.text)