Unify db api
This commit is contained in:
@@ -59,8 +59,12 @@ class MicrophoneClient:
|
||||
url: str,
|
||||
sample_rate: int = 16000,
|
||||
chunk_duration_ms: int = 20,
|
||||
app_id: str = "assistant_demo",
|
||||
channel: str = "mic_client",
|
||||
config_version_id: str = "local-dev",
|
||||
input_device: int = None,
|
||||
output_device: int = None
|
||||
output_device: int = None,
|
||||
track_debug: bool = False,
|
||||
):
|
||||
"""
|
||||
Initialize microphone client.
|
||||
@@ -76,8 +80,12 @@ class MicrophoneClient:
|
||||
self.sample_rate = sample_rate
|
||||
self.chunk_duration_ms = chunk_duration_ms
|
||||
self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
|
||||
self.app_id = app_id
|
||||
self.channel = channel
|
||||
self.config_version_id = config_version_id
|
||||
self.input_device = input_device
|
||||
self.output_device = output_device
|
||||
self.track_debug = track_debug
|
||||
|
||||
# WebSocket connection
|
||||
self.ws = None
|
||||
@@ -106,6 +114,17 @@ class MicrophoneClient:
|
||||
|
||||
# Verbose mode for streaming LLM responses
|
||||
self.verbose = False
|
||||
|
||||
@staticmethod
def _event_ids_suffix(event: dict) -> str:
    """Build a bracketed suffix listing the correlation IDs found on an event.

    Each known ID key is looked up in the event's nested ``data`` dict first
    (only when ``data`` is actually a dict) and then on the event itself.

    Args:
        event: Decoded event message.

    Returns:
        A string such as ``" [turn_id=t1 tts_id=a9]"`` (note the leading
        space), or ``""`` when none of the known IDs has a truthy value.
    """
    nested = event.get("data")
    if not isinstance(nested, dict):
        nested = {}
    keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
    parts = []
    for key in keys:
        # Fall back to the top-level field when the nested one is missing
        # *or* empty. A dict.get() default only covers the missing case, so
        # a null nested value would otherwise hide a usable top-level ID.
        value = nested.get(key) or event.get(key)
        if value:
            parts.append(f"{key}={value}")
    return f" [{' '.join(parts)}]" if parts else ""
|
||||
|
||||
async def connect(self) -> None:
|
||||
"""Connect to WebSocket server."""
|
||||
@@ -114,20 +133,30 @@ class MicrophoneClient:
|
||||
self.running = True
|
||||
print("Connected!")
|
||||
|
||||
# Send invite command
|
||||
# WS v1 handshake: hello -> session.start
|
||||
await self.send_command({
|
||||
"command": "invite",
|
||||
"option": {
|
||||
"codec": "pcm",
|
||||
"sampleRate": self.sample_rate
|
||||
}
|
||||
"type": "hello",
|
||||
"version": "v1",
|
||||
})
|
||||
await self.send_command({
|
||||
"type": "session.start",
|
||||
"audio": {
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate_hz": self.sample_rate,
|
||||
"channels": 1,
|
||||
},
|
||||
"metadata": {
|
||||
"appId": self.app_id,
|
||||
"channel": self.channel,
|
||||
"configVersionId": self.config_version_id,
|
||||
},
|
||||
})
|
||||
|
||||
async def send_command(self, cmd: dict) -> None:
|
||||
"""Send JSON command to server."""
|
||||
if self.ws:
|
||||
await self.ws.send(json.dumps(cmd))
|
||||
print(f"→ Command: {cmd.get('command', 'unknown')}")
|
||||
print(f"→ Command: {cmd.get('type', 'unknown')}")
|
||||
|
||||
async def send_chat(self, text: str) -> None:
|
||||
"""Send chat message (text input)."""
|
||||
@@ -136,7 +165,7 @@ class MicrophoneClient:
|
||||
self.first_audio_received = False
|
||||
|
||||
await self.send_command({
|
||||
"command": "chat",
|
||||
"type": "input.text",
|
||||
"text": text
|
||||
})
|
||||
print(f"→ Chat: {text}")
|
||||
@@ -144,13 +173,14 @@ class MicrophoneClient:
|
||||
async def send_interrupt(self) -> None:
|
||||
"""Send interrupt command."""
|
||||
await self.send_command({
|
||||
"command": "interrupt"
|
||||
"type": "response.cancel",
|
||||
"graceful": False,
|
||||
})
|
||||
|
||||
async def send_hangup(self, reason: str = "User quit") -> None:
|
||||
"""Send hangup command."""
|
||||
await self.send_command({
|
||||
"command": "hangup",
|
||||
"type": "session.stop",
|
||||
"reason": reason
|
||||
})
|
||||
|
||||
@@ -295,43 +325,48 @@ class MicrophoneClient:
|
||||
|
||||
async def _handle_event(self, event: dict) -> None:
|
||||
"""Handle incoming event."""
|
||||
event_type = event.get("event", "unknown")
|
||||
event_type = event.get("type", event.get("event", "unknown"))
|
||||
ids = self._event_ids_suffix(event)
|
||||
if self.track_debug:
|
||||
print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")
|
||||
|
||||
if event_type == "answer":
|
||||
print("← Session ready!")
|
||||
elif event_type == "speaking":
|
||||
print("← User speech detected")
|
||||
elif event_type == "silence":
|
||||
print("← User silence detected")
|
||||
elif event_type == "transcript":
|
||||
if event_type in {"hello.ack", "session.started"}:
|
||||
print(f"← Session ready!{ids}")
|
||||
elif event_type == "config.resolved":
|
||||
print(f"← Config resolved: {event.get('config', {}).get('output', {})}{ids}")
|
||||
elif event_type == "input.speech_started":
|
||||
print(f"← User speech detected{ids}")
|
||||
elif event_type == "input.speech_stopped":
|
||||
print(f"← User silence detected{ids}")
|
||||
elif event_type in {"transcript", "transcript.delta", "transcript.final"}:
|
||||
# Display user speech transcription
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
is_final = event_type == "transcript.final" or bool(event.get("isFinal"))
|
||||
if is_final:
|
||||
# Clear the interim line and print final
|
||||
print(" " * 80, end="\r") # Clear previous interim text
|
||||
print(f"→ You: {text}")
|
||||
print(f"→ You: {text}{ids}")
|
||||
else:
|
||||
# Interim result - show with indicator (overwrite same line)
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [listening] {display_text}".ljust(80), end="\r")
|
||||
elif event_type == "ttfb":
|
||||
elif event_type in {"ttfb", "metrics.ttfb"}:
|
||||
# Server-side TTFB event
|
||||
latency_ms = event.get("latencyMs", 0)
|
||||
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
|
||||
elif event_type == "llmResponse":
|
||||
elif event_type in {"llmResponse", "assistant.response.delta", "assistant.response.final"}:
|
||||
# LLM text response
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
is_final = event_type == "assistant.response.final" or bool(event.get("isFinal"))
|
||||
if is_final:
|
||||
# Print final LLM response
|
||||
print(f"← AI: {text}")
|
||||
elif self.verbose:
|
||||
# Show streaming chunks only in verbose mode
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [streaming] {display_text}")
|
||||
elif event_type == "trackStart":
|
||||
print("← Bot started speaking")
|
||||
print(f" [streaming] {display_text}{ids}")
|
||||
elif event_type in {"trackStart", "output.audio.start"}:
|
||||
print(f"← Bot started speaking{ids}")
|
||||
# IMPORTANT: Accept audio again after trackStart
|
||||
self._discard_audio = False
|
||||
self._audio_sequence += 1
|
||||
@@ -342,13 +377,13 @@ class MicrophoneClient:
|
||||
# Clear any old audio in buffer
|
||||
with self.audio_output_lock:
|
||||
self.audio_output_buffer = b""
|
||||
elif event_type == "trackEnd":
|
||||
print("← Bot finished speaking")
|
||||
elif event_type in {"trackEnd", "output.audio.end"}:
|
||||
print(f"← Bot finished speaking{ids}")
|
||||
# Reset TTFB tracking after response completes
|
||||
self.request_start_time = None
|
||||
self.first_audio_received = False
|
||||
elif event_type == "interrupt":
|
||||
print("← Bot interrupted!")
|
||||
elif event_type in {"interrupt", "response.interrupted"}:
|
||||
print(f"← Bot interrupted!{ids}")
|
||||
# IMPORTANT: Discard all audio until next trackStart
|
||||
self._discard_audio = True
|
||||
# Clear audio buffer immediately
|
||||
@@ -357,12 +392,12 @@ class MicrophoneClient:
|
||||
self.audio_output_buffer = b""
|
||||
print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
|
||||
elif event_type == "error":
|
||||
print(f"← Error: {event.get('error')}")
|
||||
elif event_type == "hangup":
|
||||
print(f"← Hangup: {event.get('reason')}")
|
||||
print(f"← Error: {event.get('error')}{ids}")
|
||||
elif event_type in {"hangup", "session.stopped"}:
|
||||
print(f"← Hangup: {event.get('reason')}{ids}")
|
||||
self.running = False
|
||||
else:
|
||||
print(f"← Event: {event_type}")
|
||||
print(f"← Event: {event_type}{ids}")
|
||||
|
||||
async def interactive_mode(self) -> None:
|
||||
"""Run interactive mode for text chat."""
|
||||
@@ -573,6 +608,26 @@ async def main():
|
||||
action="store_true",
|
||||
help="Show streaming LLM response chunks"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--app-id",
|
||||
default="assistant_demo",
|
||||
help="Stable app/assistant identifier for server-side config lookup"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--channel",
|
||||
default="mic_client",
|
||||
help="Client channel name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config-version-id",
|
||||
default="local-dev",
|
||||
help="Optional config version identifier"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--track-debug",
|
||||
action="store_true",
|
||||
help="Print event trackId for protocol debugging"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -583,8 +638,12 @@ async def main():
|
||||
client = MicrophoneClient(
|
||||
url=args.url,
|
||||
sample_rate=args.sample_rate,
|
||||
app_id=args.app_id,
|
||||
channel=args.channel,
|
||||
config_version_id=args.config_version_id,
|
||||
input_device=args.input_device,
|
||||
output_device=args.output_device
|
||||
output_device=args.output_device,
|
||||
track_debug=args.track_debug,
|
||||
)
|
||||
client.verbose = args.verbose
|
||||
|
||||
|
||||
Reference in New Issue
Block a user