Unify db api

This commit is contained in:
Xin Wang
2026-02-26 01:58:39 +08:00
parent 56f8aa2191
commit 72ed7d0512
40 changed files with 3926 additions and 593 deletions

View File

@@ -59,8 +59,12 @@ class MicrophoneClient:
url: str,
sample_rate: int = 16000,
chunk_duration_ms: int = 20,
app_id: str = "assistant_demo",
channel: str = "mic_client",
config_version_id: str = "local-dev",
input_device: int = None,
output_device: int = None
output_device: int = None,
track_debug: bool = False,
):
"""
Initialize microphone client.
@@ -76,8 +80,12 @@ class MicrophoneClient:
self.sample_rate = sample_rate
self.chunk_duration_ms = chunk_duration_ms
self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
self.app_id = app_id
self.channel = channel
self.config_version_id = config_version_id
self.input_device = input_device
self.output_device = output_device
self.track_debug = track_debug
# WebSocket connection
self.ws = None
@@ -106,6 +114,17 @@ class MicrophoneClient:
# Verbose mode for streaming LLM responses
self.verbose = False
@staticmethod
def _event_ids_suffix(event: dict) -> str:
    """Return a bracketed " [key=value ...]" suffix of correlation ids, or "".

    Each id is looked up first in ``event["data"]`` (when that is a dict)
    and then on the event itself; falsy/missing values are omitted.
    """
    raw = event.get("data")
    payload = raw if isinstance(raw, dict) else {}
    id_keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
    # Walrus binds each lookup once; falsy values drop out of the list.
    pairs = [
        f"{name}={value}"
        for name in id_keys
        if (value := payload.get(name, event.get(name)))
    ]
    return f" [{' '.join(pairs)}]" if pairs else ""
async def connect(self) -> None:
"""Connect to WebSocket server."""
@@ -114,20 +133,30 @@ class MicrophoneClient:
self.running = True
print("Connected!")
# Send invite command
# WS v1 handshake: hello -> session.start
await self.send_command({
"command": "invite",
"option": {
"codec": "pcm",
"sampleRate": self.sample_rate
}
"type": "hello",
"version": "v1",
})
await self.send_command({
"type": "session.start",
"audio": {
"encoding": "pcm_s16le",
"sample_rate_hz": self.sample_rate,
"channels": 1,
},
"metadata": {
"appId": self.app_id,
"channel": self.channel,
"configVersionId": self.config_version_id,
},
})
async def send_command(self, cmd: dict) -> None:
    """Send a JSON-encoded protocol command to the server.

    Args:
        cmd: WS v1 message dict; its "type" key names the command.

    Silently does nothing when no WebSocket connection is open
    (``self.ws`` is falsy).
    """
    # The stripped diff showed both the pre-change ('command') and
    # post-change ('type') log lines; only the post-commit one is kept.
    if self.ws:
        await self.ws.send(json.dumps(cmd))
        # NOTE(review): diff rendering lost indentation — the log line is
        # assumed to sit inside the connected branch; confirm against the file.
        print(f"→ Command: {cmd.get('type', 'unknown')}")
async def send_chat(self, text: str) -> None:
"""Send chat message (text input)."""
@@ -136,7 +165,7 @@ class MicrophoneClient:
self.first_audio_received = False
await self.send_command({
"command": "chat",
"type": "input.text",
"text": text
})
print(f"→ Chat: {text}")
@@ -144,13 +173,14 @@ class MicrophoneClient:
async def send_interrupt(self) -> None:
    """Send interrupt command.

    Emits a WS v1 "response.cancel" with ``graceful: False``, asking the
    server to cut the in-flight response immediately rather than letting
    it drain.
    """
    # The stripped diff showed the removed '"command": "interrupt"' line
    # interleaved with the added lines; only the post-commit payload is kept.
    await self.send_command({
        "type": "response.cancel",
        "graceful": False,
    })
async def send_hangup(self, reason: str = "User quit") -> None:
    """Send hangup command.

    Emits a WS v1 "session.stop" carrying a human-readable reason.

    Args:
        reason: Text forwarded to the server explaining why the
            session is being stopped.
    """
    # The stripped diff showed the removed '"command": "hangup"' line
    # interleaved with the added one; only the post-commit payload is kept.
    await self.send_command({
        "type": "session.stop",
        "reason": reason
    })
@@ -295,43 +325,48 @@ class MicrophoneClient:
async def _handle_event(self, event: dict) -> None:
"""Handle incoming event."""
event_type = event.get("event", "unknown")
event_type = event.get("type", event.get("event", "unknown"))
ids = self._event_ids_suffix(event)
if self.track_debug:
print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")
if event_type == "answer":
print("← Session ready!")
elif event_type == "speaking":
print("User speech detected")
elif event_type == "silence":
print("← User silence detected")
elif event_type == "transcript":
if event_type in {"hello.ack", "session.started"}:
print(f"← Session ready!{ids}")
elif event_type == "config.resolved":
print(f"Config resolved: {event.get('config', {}).get('output', {})}{ids}")
elif event_type == "input.speech_started":
print(f"← User speech detected{ids}")
elif event_type == "input.speech_stopped":
print(f"← User silence detected{ids}")
elif event_type in {"transcript", "transcript.delta", "transcript.final"}:
# Display user speech transcription
text = event.get("text", "")
is_final = event.get("isFinal", False)
is_final = event_type == "transcript.final" or bool(event.get("isFinal"))
if is_final:
# Clear the interim line and print final
print(" " * 80, end="\r") # Clear previous interim text
print(f"→ You: {text}")
print(f"→ You: {text}{ids}")
else:
# Interim result - show with indicator (overwrite same line)
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [listening] {display_text}".ljust(80), end="\r")
elif event_type == "ttfb":
elif event_type in {"ttfb", "metrics.ttfb"}:
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
elif event_type == "llmResponse":
elif event_type in {"llmResponse", "assistant.response.delta", "assistant.response.final"}:
# LLM text response
text = event.get("text", "")
is_final = event.get("isFinal", False)
is_final = event_type == "assistant.response.final" or bool(event.get("isFinal"))
if is_final:
# Print final LLM response
print(f"← AI: {text}")
elif self.verbose:
# Show streaming chunks only in verbose mode
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [streaming] {display_text}")
elif event_type == "trackStart":
print("← Bot started speaking")
print(f" [streaming] {display_text}{ids}")
elif event_type in {"trackStart", "output.audio.start"}:
print(f"← Bot started speaking{ids}")
# IMPORTANT: Accept audio again after trackStart
self._discard_audio = False
self._audio_sequence += 1
@@ -342,13 +377,13 @@ class MicrophoneClient:
# Clear any old audio in buffer
with self.audio_output_lock:
self.audio_output_buffer = b""
elif event_type == "trackEnd":
print("← Bot finished speaking")
elif event_type in {"trackEnd", "output.audio.end"}:
print(f"← Bot finished speaking{ids}")
# Reset TTFB tracking after response completes
self.request_start_time = None
self.first_audio_received = False
elif event_type == "interrupt":
print("← Bot interrupted!")
elif event_type in {"interrupt", "response.interrupted"}:
print(f"← Bot interrupted!{ids}")
# IMPORTANT: Discard all audio until next trackStart
self._discard_audio = True
# Clear audio buffer immediately
@@ -357,12 +392,12 @@ class MicrophoneClient:
self.audio_output_buffer = b""
print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
elif event_type == "error":
print(f"← Error: {event.get('error')}")
elif event_type == "hangup":
print(f"← Hangup: {event.get('reason')}")
print(f"← Error: {event.get('error')}{ids}")
elif event_type in {"hangup", "session.stopped"}:
print(f"← Hangup: {event.get('reason')}{ids}")
self.running = False
else:
print(f"← Event: {event_type}")
print(f"← Event: {event_type}{ids}")
async def interactive_mode(self) -> None:
"""Run interactive mode for text chat."""
@@ -573,6 +608,26 @@ async def main():
action="store_true",
help="Show streaming LLM response chunks"
)
parser.add_argument(
"--app-id",
default="assistant_demo",
help="Stable app/assistant identifier for server-side config lookup"
)
parser.add_argument(
"--channel",
default="mic_client",
help="Client channel name"
)
parser.add_argument(
"--config-version-id",
default="local-dev",
help="Optional config version identifier"
)
parser.add_argument(
"--track-debug",
action="store_true",
help="Print event trackId for protocol debugging"
)
args = parser.parse_args()
@@ -583,8 +638,12 @@ async def main():
client = MicrophoneClient(
url=args.url,
sample_rate=args.sample_rate,
app_id=args.app_id,
channel=args.channel,
config_version_id=args.config_version_id,
input_device=args.input_device,
output_device=args.output_device
output_device=args.output_device,
track_debug=args.track_debug,
)
client.verbose = args.verbose