Unify db api

2026-02-26 01:58:39 +08:00
parent 56f8aa2191
commit 72ed7d0512
40 changed files with 3926 additions and 593 deletions
--- a/engine/examples/mic_client.py
+++ b/engine/examples/mic_client.py
@@ -59,8 +59,12 @@ class MicrophoneClient:
        url: str,
        sample_rate: int = 16000,
        chunk_duration_ms: int = 20,
+        app_id: str = "assistant_demo",
+        channel: str = "mic_client",
+        config_version_id: str = "local-dev",
        input_device: int = None,
-        output_device: int = None
+        output_device: int = None,
+        track_debug: bool = False,
    ):
        """
        Initialize microphone client.
@@ -76,8 +80,12 @@ class MicrophoneClient:
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms
        self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
+        self.app_id = app_id
+        self.channel = channel
+        self.config_version_id = config_version_id
        self.input_device = input_device
        self.output_device = output_device
+        self.track_debug = track_debug
        
        # WebSocket connection
        self.ws = None
@@ -106,6 +114,17 @@ class MicrophoneClient:
        
        # Verbose mode for streaming LLM responses
        self.verbose = False
+
+    @staticmethod
+    def _event_ids_suffix(event: dict) -> str:  # Build a " [key=value ...]" suffix from the ID fields found in an event.
+        data = event.get("data") if isinstance(event.get("data"), dict) else {}  # use the nested "data" payload only when it is a dict
+        keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")  # ID fields reported, in output order
+        parts = []
+        for key in keys:
+            value = data.get(key, event.get(key))  # "data" entry wins; the top-level key is used only when absent from "data"
+            if value:  # falsy values (None, "", 0) are left out of the suffix
+                parts.append(f"{key}={value}")
+        return f" [{' '.join(parts)}]" if parts else ""  # leading space included; empty string when no IDs were found
    
    async def connect(self) -> None:
        """Connect to WebSocket server."""
@@ -114,20 +133,30 @@ class MicrophoneClient:
        self.running = True
        print("Connected!")
        
-        # Send invite command
+        # WS v1 handshake: hello -> session.start
        await self.send_command({
-            "command": "invite",
-            "option": {
-                "codec": "pcm",
-                "sampleRate": self.sample_rate
-            }
+            "type": "hello",
+            "version": "v1",
+        })
+        await self.send_command({
+            "type": "session.start",
+            "audio": {
+                "encoding": "pcm_s16le",
+                "sample_rate_hz": self.sample_rate,
+                "channels": 1,
+            },
+            "metadata": {
+                "appId": self.app_id,
+                "channel": self.channel,
+                "configVersionId": self.config_version_id,
+            },
        })
    
    async def send_command(self, cmd: dict) -> None:
        """Send JSON command to server."""
        if self.ws:
            await self.ws.send(json.dumps(cmd))
-            print(f"→ Command: {cmd.get('command', 'unknown')}")
+            print(f"→ Command: {cmd.get('type', 'unknown')}")
    
    async def send_chat(self, text: str) -> None:
        """Send chat message (text input)."""
@@ -136,7 +165,7 @@ class MicrophoneClient:
        self.first_audio_received = False
        
        await self.send_command({
-            "command": "chat",
+            "type": "input.text",
            "text": text
        })
        print(f"→ Chat: {text}")
@@ -144,13 +173,14 @@ class MicrophoneClient:
    async def send_interrupt(self) -> None:
        """Send interrupt command."""
        await self.send_command({
-            "command": "interrupt"
+            "type": "response.cancel",
+            "graceful": False,
        })
    
    async def send_hangup(self, reason: str = "User quit") -> None:
        """Send hangup command."""
        await self.send_command({
-            "command": "hangup",
+            "type": "session.stop",
            "reason": reason
        })
    
@@ -295,43 +325,48 @@ class MicrophoneClient:
    
    async def _handle_event(self, event: dict) -> None:
        """Handle incoming event."""
-        event_type = event.get("event", "unknown")
+        event_type = event.get("type", event.get("event", "unknown"))
+        ids = self._event_ids_suffix(event)
+        if self.track_debug:
+            print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")
        
-        if event_type == "answer":
-            print("← Session ready!")
-        elif event_type == "speaking":
-            print("← User speech detected")
-        elif event_type == "silence":
-            print("← User silence detected")
-        elif event_type == "transcript":
+        if event_type in {"hello.ack", "session.started"}:
+            print(f"← Session ready!{ids}")
+        elif event_type == "config.resolved":
+            print(f"← Config resolved: {event.get('config', {}).get('output', {})}{ids}")
+        elif event_type == "input.speech_started":
+            print(f"← User speech detected{ids}")
+        elif event_type == "input.speech_stopped":
+            print(f"← User silence detected{ids}")
+        elif event_type in {"transcript", "transcript.delta", "transcript.final"}:
            # Display user speech transcription
            text = event.get("text", "")
-            is_final = event.get("isFinal", False)
+            is_final = event_type == "transcript.final" or bool(event.get("isFinal"))
            if is_final:
                # Clear the interim line and print final
                print(" " * 80, end="\r")  # Clear previous interim text
-                print(f"→ You: {text}")
+                print(f"→ You: {text}{ids}")
            else:
                # Interim result - show with indicator (overwrite same line)
                display_text = text[:60] + "..." if len(text) > 60 else text
                print(f"  [listening] {display_text}".ljust(80), end="\r")
-        elif event_type == "ttfb":
+        elif event_type in {"ttfb", "metrics.ttfb"}:
            # Server-side TTFB event
            latency_ms = event.get("latencyMs", 0)
            print(f"← [TTFB] Server reported latency: {latency_ms}ms")
-        elif event_type == "llmResponse":
+        elif event_type in {"llmResponse", "assistant.response.delta", "assistant.response.final"}:
            # LLM text response
            text = event.get("text", "")
-            is_final = event.get("isFinal", False)
+            is_final = event_type == "assistant.response.final" or bool(event.get("isFinal"))
            if is_final:
                # Print final LLM response
                print(f"← AI: {text}")
            elif self.verbose:
                # Show streaming chunks only in verbose mode
                display_text = text[:60] + "..." if len(text) > 60 else text
-                print(f"  [streaming] {display_text}")
-        elif event_type == "trackStart":
-            print("← Bot started speaking")
+                print(f"  [streaming] {display_text}{ids}")
+        elif event_type in {"trackStart", "output.audio.start"}:
+            print(f"← Bot started speaking{ids}")
            # IMPORTANT: Accept audio again after trackStart
            self._discard_audio = False
            self._audio_sequence += 1
@@ -342,13 +377,13 @@ class MicrophoneClient:
            # Clear any old audio in buffer
            with self.audio_output_lock:
                self.audio_output_buffer = b""
-        elif event_type == "trackEnd":
-            print("← Bot finished speaking")
+        elif event_type in {"trackEnd", "output.audio.end"}:
+            print(f"← Bot finished speaking{ids}")
            # Reset TTFB tracking after response completes
            self.request_start_time = None
            self.first_audio_received = False
-        elif event_type == "interrupt":
-            print("← Bot interrupted!")
+        elif event_type in {"interrupt", "response.interrupted"}:
+            print(f"← Bot interrupted!{ids}")
            # IMPORTANT: Discard all audio until next trackStart
            self._discard_audio = True
            # Clear audio buffer immediately
@@ -357,12 +392,12 @@ class MicrophoneClient:
                self.audio_output_buffer = b""
                print(f"   (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
        elif event_type == "error":
-            print(f"← Error: {event.get('error')}")
-        elif event_type == "hangup":
-            print(f"← Hangup: {event.get('reason')}")
+            print(f"← Error: {event.get('error')}{ids}")
+        elif event_type in {"hangup", "session.stopped"}:
+            print(f"← Hangup: {event.get('reason')}{ids}")
            self.running = False
        else:
-            print(f"← Event: {event_type}")
+            print(f"← Event: {event_type}{ids}")
    
    async def interactive_mode(self) -> None:
        """Run interactive mode for text chat."""
@@ -573,6 +608,26 @@ async def main():
        action="store_true",
        help="Show streaming LLM response chunks"
    )
+    parser.add_argument(
+        "--app-id",
+        default="assistant_demo",
+        help="Stable app/assistant identifier for server-side config lookup"
+    )
+    parser.add_argument(
+        "--channel",
+        default="mic_client",
+        help="Client channel name"
+    )
+    parser.add_argument(
+        "--config-version-id",
+        default="local-dev",
+        help="Optional config version identifier"
+    )
+    parser.add_argument(
+        "--track-debug",
+        action="store_true",
+        help="Print event trackId for protocol debugging"
+    )
    
    args = parser.parse_args()
    
@@ -583,8 +638,12 @@ async def main():
    client = MicrophoneClient(
        url=args.url,
        sample_rate=args.sample_rate,
+        app_id=args.app_id,
+        channel=args.channel,
+        config_version_id=args.config_version_id,
        input_device=args.input_device,
-        output_device=args.output_device
+        output_device=args.output_device,
+        track_debug=args.track_debug,
    )
    client.verbose = args.verbose