Unify db api
This commit is contained in:
@@ -57,10 +57,15 @@ class WavFileClient:
|
||||
url: str,
|
||||
input_file: str,
|
||||
output_file: str,
|
||||
app_id: str = "assistant_demo",
|
||||
channel: str = "wav_client",
|
||||
config_version_id: str = "local-dev",
|
||||
sample_rate: int = 16000,
|
||||
chunk_duration_ms: int = 20,
|
||||
wait_time: float = 15.0,
|
||||
verbose: bool = False
|
||||
verbose: bool = False,
|
||||
track_debug: bool = False,
|
||||
tail_silence_ms: int = 800,
|
||||
):
|
||||
"""
|
||||
Initialize WAV file client.
|
||||
@@ -77,11 +82,17 @@ class WavFileClient:
|
||||
self.url = url
|
||||
self.input_file = Path(input_file)
|
||||
self.output_file = Path(output_file)
|
||||
self.app_id = app_id
|
||||
self.channel = channel
|
||||
self.config_version_id = config_version_id
|
||||
self.sample_rate = sample_rate
|
||||
self.chunk_duration_ms = chunk_duration_ms
|
||||
self.chunk_samples = int(sample_rate * chunk_duration_ms / 1000)
|
||||
self.wait_time = wait_time
|
||||
self.verbose = verbose
|
||||
self.track_debug = track_debug
|
||||
self.tail_silence_ms = max(0, int(tail_silence_ms))
|
||||
self.frame_bytes = 640 # 16k mono pcm_s16le, 20ms
|
||||
|
||||
# WebSocket connection
|
||||
self.ws = None
|
||||
@@ -105,6 +116,7 @@ class WavFileClient:
|
||||
self.track_started = False
|
||||
self.track_ended = False
|
||||
self.send_completed = False
|
||||
self.session_ready = False
|
||||
|
||||
# Events log
|
||||
self.events_log = []
|
||||
@@ -124,6 +136,17 @@ class WavFileClient:
|
||||
# Replace problematic characters for console output
|
||||
safe_message = message.encode('ascii', errors='replace').decode('ascii')
|
||||
print(f"{direction} {safe_message}")
|
||||
|
||||
@staticmethod
|
||||
def _event_ids_suffix(event: dict) -> str:
|
||||
data = event.get("data") if isinstance(event.get("data"), dict) else {}
|
||||
keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
|
||||
parts = []
|
||||
for key in keys:
|
||||
value = data.get(key, event.get(key))
|
||||
if value:
|
||||
parts.append(f"{key}={value}")
|
||||
return f" [{' '.join(parts)}]" if parts else ""
|
||||
|
||||
async def connect(self) -> None:
|
||||
"""Connect to WebSocket server."""
|
||||
@@ -131,26 +154,36 @@ class WavFileClient:
|
||||
self.ws = await websockets.connect(self.url)
|
||||
self.running = True
|
||||
self.log_event("←", "Connected!")
|
||||
|
||||
# Send invite command
|
||||
|
||||
# WS v1 handshake: hello -> session.start
|
||||
await self.send_command({
|
||||
"command": "invite",
|
||||
"option": {
|
||||
"codec": "pcm",
|
||||
"sampleRate": self.sample_rate
|
||||
}
|
||||
"type": "hello",
|
||||
"version": "v1",
|
||||
})
|
||||
await self.send_command({
|
||||
"type": "session.start",
|
||||
"audio": {
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate_hz": self.sample_rate,
|
||||
"channels": 1
|
||||
},
|
||||
"metadata": {
|
||||
"appId": self.app_id,
|
||||
"channel": self.channel,
|
||||
"configVersionId": self.config_version_id,
|
||||
},
|
||||
})
|
||||
|
||||
async def send_command(self, cmd: dict) -> None:
|
||||
"""Send JSON command to server."""
|
||||
if self.ws:
|
||||
await self.ws.send(json.dumps(cmd))
|
||||
self.log_event("→", f"Command: {cmd.get('command', 'unknown')}")
|
||||
self.log_event("→", f"Command: {cmd.get('type', 'unknown')}")
|
||||
|
||||
async def send_hangup(self, reason: str = "Session complete") -> None:
|
||||
"""Send hangup command."""
|
||||
await self.send_command({
|
||||
"command": "hangup",
|
||||
"type": "session.stop",
|
||||
"reason": reason
|
||||
})
|
||||
|
||||
@@ -210,6 +243,10 @@ class WavFileClient:
|
||||
end_sample = min(sent_samples + chunk_size, total_samples)
|
||||
chunk = audio_data[sent_samples:end_sample]
|
||||
chunk_bytes = chunk.tobytes()
|
||||
if len(chunk_bytes) % self.frame_bytes != 0:
|
||||
# v1 audio framing requires 640-byte (20ms) PCM units.
|
||||
pad = self.frame_bytes - (len(chunk_bytes) % self.frame_bytes)
|
||||
chunk_bytes += b"\x00" * pad
|
||||
|
||||
# Send to server
|
||||
if self.ws:
|
||||
@@ -226,6 +263,16 @@ class WavFileClient:
|
||||
# Delay to simulate real-time streaming
|
||||
# Server expects audio at real-time pace for VAD/ASR to work properly
|
||||
await asyncio.sleep(self.chunk_duration_ms / 1000)
|
||||
|
||||
# Add a short silence tail to help VAD/EOU close the final utterance.
|
||||
if self.tail_silence_ms > 0 and self.ws:
|
||||
tail_frames = max(1, self.tail_silence_ms // 20)
|
||||
silence = b"\x00" * self.frame_bytes
|
||||
for _ in range(tail_frames):
|
||||
await self.ws.send(silence)
|
||||
self.bytes_sent += len(silence)
|
||||
await asyncio.sleep(0.02)
|
||||
self.log_event("→", f"Sent trailing silence: {self.tail_silence_ms}ms")
|
||||
|
||||
self.send_completed = True
|
||||
elapsed = time.time() - self.send_start_time
|
||||
@@ -277,54 +324,59 @@ class WavFileClient:
|
||||
|
||||
async def _handle_event(self, event: dict) -> None:
|
||||
"""Handle incoming event."""
|
||||
event_type = event.get("event", "unknown")
|
||||
|
||||
if event_type == "answer":
|
||||
self.log_event("←", "Session ready!")
|
||||
elif event_type == "speaking":
|
||||
self.log_event("←", "Speech detected")
|
||||
elif event_type == "silence":
|
||||
self.log_event("←", "Silence detected")
|
||||
elif event_type == "transcript":
|
||||
# ASR transcript (interim = asrDelta-style, final = asrFinal-style)
|
||||
event_type = event.get("type", "unknown")
|
||||
ids = self._event_ids_suffix(event)
|
||||
if self.track_debug:
|
||||
print(f"[track-debug] event={event_type} trackId={event.get('trackId')}{ids}")
|
||||
|
||||
if event_type == "hello.ack":
|
||||
self.log_event("←", f"Handshake acknowledged{ids}")
|
||||
elif event_type == "session.started":
|
||||
self.session_ready = True
|
||||
self.log_event("←", f"Session ready!{ids}")
|
||||
elif event_type == "config.resolved":
|
||||
config = event.get("config", {})
|
||||
self.log_event("←", f"Config resolved (output={config.get('output', {})}){ids}")
|
||||
elif event_type == "input.speech_started":
|
||||
self.log_event("←", f"Speech detected{ids}")
|
||||
elif event_type == "input.speech_stopped":
|
||||
self.log_event("←", f"Silence detected{ids}")
|
||||
elif event_type == "transcript.delta":
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
if is_final:
|
||||
# Clear interim line and print final
|
||||
print(" " * 80, end="\r")
|
||||
self.log_event("←", f"→ You: {text}")
|
||||
else:
|
||||
# Interim result - show with indicator (overwrite same line, as in mic_client)
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [listening] {display_text}".ljust(80), end="\r")
|
||||
elif event_type == "ttfb":
|
||||
display_text = text[:60] + "..." if len(text) > 60 else text
|
||||
print(f" [listening] {display_text}".ljust(80), end="\r")
|
||||
elif event_type == "transcript.final":
|
||||
text = event.get("text", "")
|
||||
print(" " * 80, end="\r")
|
||||
self.log_event("←", f"→ You: {text}{ids}")
|
||||
elif event_type == "metrics.ttfb":
|
||||
latency_ms = event.get("latencyMs", 0)
|
||||
self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
|
||||
elif event_type == "llmResponse":
|
||||
elif event_type == "assistant.response.delta":
|
||||
text = event.get("text", "")
|
||||
is_final = event.get("isFinal", False)
|
||||
if is_final:
|
||||
self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
|
||||
elif self.verbose:
|
||||
# Show streaming chunks only in verbose mode
|
||||
self.log_event("←", f"LLM: {text}")
|
||||
elif event_type == "trackStart":
|
||||
if self.verbose and text:
|
||||
self.log_event("←", f"LLM: {text}{ids}")
|
||||
elif event_type == "assistant.response.final":
|
||||
text = event.get("text", "")
|
||||
if text:
|
||||
self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}{ids}")
|
||||
elif event_type == "output.audio.start":
|
||||
self.track_started = True
|
||||
self.response_start_time = time.time()
|
||||
self.waiting_for_first_audio = True
|
||||
self.log_event("←", "Bot started speaking")
|
||||
elif event_type == "trackEnd":
|
||||
self.log_event("←", f"Bot started speaking{ids}")
|
||||
elif event_type == "output.audio.end":
|
||||
self.track_ended = True
|
||||
self.log_event("←", "Bot finished speaking")
|
||||
elif event_type == "interrupt":
|
||||
self.log_event("←", "Bot interrupted!")
|
||||
self.log_event("←", f"Bot finished speaking{ids}")
|
||||
elif event_type == "response.interrupted":
|
||||
self.log_event("←", f"Bot interrupted!{ids}")
|
||||
elif event_type == "error":
|
||||
self.log_event("!", f"Error: {event.get('error')}")
|
||||
elif event_type == "hangup":
|
||||
self.log_event("←", f"Hangup: {event.get('reason')}")
|
||||
self.log_event("!", f"Error: {event.get('message')}{ids}")
|
||||
elif event_type == "session.stopped":
|
||||
self.log_event("←", f"Session stopped: {event.get('reason')}{ids}")
|
||||
self.running = False
|
||||
else:
|
||||
self.log_event("←", f"Event: {event_type}")
|
||||
self.log_event("←", f"Event: {event_type}{ids}")
|
||||
|
||||
def save_output_wav(self) -> None:
|
||||
"""Save received audio to output WAV file."""
|
||||
@@ -359,11 +411,15 @@ class WavFileClient:
|
||||
# Connect to server
|
||||
await self.connect()
|
||||
|
||||
# Wait for answer
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Start receiver task
|
||||
receiver_task = asyncio.create_task(self.receiver())
|
||||
|
||||
# Wait for session.started before streaming audio
|
||||
ready_start = time.time()
|
||||
while self.running and not self.session_ready:
|
||||
if time.time() - ready_start > 8.0:
|
||||
raise TimeoutError("Timeout waiting for session.started")
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
# Send audio
|
||||
await self.audio_sender(audio_data)
|
||||
@@ -464,6 +520,21 @@ async def main():
|
||||
default=16000,
|
||||
help="Target sample rate for audio (default: 16000)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--app-id",
|
||||
default="assistant_demo",
|
||||
help="Stable app/assistant identifier for server-side config lookup"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--channel",
|
||||
default="wav_client",
|
||||
help="Client channel name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config-version-id",
|
||||
default="local-dev",
|
||||
help="Optional config version identifier"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-duration",
|
||||
type=int,
|
||||
@@ -481,6 +552,17 @@ async def main():
|
||||
action="store_true",
|
||||
help="Enable verbose output"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--track-debug",
|
||||
action="store_true",
|
||||
help="Print event trackId for protocol debugging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tail-silence-ms",
|
||||
type=int,
|
||||
default=800,
|
||||
help="Trailing silence to send after WAV playback for EOU detection (default: 800)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -488,10 +570,15 @@ async def main():
|
||||
url=args.url,
|
||||
input_file=args.input,
|
||||
output_file=args.output,
|
||||
app_id=args.app_id,
|
||||
channel=args.channel,
|
||||
config_version_id=args.config_version_id,
|
||||
sample_rate=args.sample_rate,
|
||||
chunk_duration_ms=args.chunk_duration,
|
||||
wait_time=args.wait_time,
|
||||
verbose=args.verbose
|
||||
verbose=args.verbose,
|
||||
track_debug=args.track_debug,
|
||||
tail_silence_ms=args.tail_silence_ms,
|
||||
)
|
||||
|
||||
await client.run()
|
||||
|
||||
Reference in New Issue
Block a user