Update engine

This commit is contained in:
Xin Wang
2026-02-23 17:16:18 +08:00
parent 01c0de0a4d
commit c6c84b5af9
9 changed files with 991 additions and 186 deletions

View File

@@ -52,9 +52,21 @@ if not PYAUDIO_AVAILABLE and not SD_AVAILABLE:
class SimpleVoiceClient:
"""Simple voice client with reliable audio playback."""
def __init__(self, url: str, sample_rate: int = 16000):
def __init__(
self,
url: str,
sample_rate: int = 16000,
app_id: str = "assistant_demo",
channel: str = "simple_client",
config_version_id: str = "local-dev",
track_debug: bool = False,
):
self.url = url
self.sample_rate = sample_rate
self.app_id = app_id
self.channel = channel
self.config_version_id = config_version_id
self.track_debug = track_debug
self.ws = None
self.running = False
@@ -75,6 +87,17 @@ class SimpleVoiceClient:
# Interrupt handling - discard audio until next trackStart
self._discard_audio = False
@staticmethod
def _event_ids_suffix(event: dict) -> str:
    """Build a bracketed suffix listing the correlation ids found on *event*.

    Each known id key is looked up first in the nested ``event["data"]``
    mapping (when that value is a dict) and then on the event itself.
    Only truthy values are reported.

    Returns:
        A string such as ``" [turn_id=t1 tts_id=a2]"`` (note the leading
        space, so it can be appended directly to a log line), or ``""``
        when no id is present.
    """
    nested = event.get("data")
    if not isinstance(nested, dict):
        # A missing or non-mapping payload simply contributes no ids.
        nested = {}

    keys = ("turn_id", "utterance_id", "response_id", "tool_call_id", "tts_id")
    parts = []
    for key in keys:
        # Prefer the nested value, but fall back to the top-level field when
        # the nested one is absent *or* empty/None, so that a null placeholder
        # inside "data" does not hide a real id on the event.
        value = nested.get(key) or event.get(key)
        if value:
            parts.append(f"{key}={value}")
    return f" [{' '.join(parts)}]" if parts else ""
async def connect(self):
"""Connect to server."""
@@ -83,12 +106,25 @@ class SimpleVoiceClient:
self.running = True
print("Connected!")
# Send invite
# WS v1 handshake: hello -> session.start
await self.ws.send(json.dumps({
"command": "invite",
"option": {"codec": "pcm", "sampleRate": self.sample_rate}
"type": "hello",
"version": "v1",
}))
print("-> invite")
await self.ws.send(json.dumps({
"type": "session.start",
"audio": {
"encoding": "pcm_s16le",
"sample_rate_hz": self.sample_rate,
"channels": 1,
},
"metadata": {
"appId": self.app_id,
"channel": self.channel,
"configVersionId": self.config_version_id,
},
}))
print("-> hello/session.start")
async def send_chat(self, text: str):
"""Send chat message."""
@@ -96,8 +132,8 @@ class SimpleVoiceClient:
self.request_start_time = time.time()
self.first_audio_received = False
await self.ws.send(json.dumps({"command": "chat", "text": text}))
print(f"-> chat: {text}")
await self.ws.send(json.dumps({"type": "input.text", "text": text}))
print(f"-> input.text: {text}")
def play_audio(self, audio_data: bytes):
"""Play audio data immediately."""
@@ -152,34 +188,39 @@ class SimpleVoiceClient:
else:
# JSON event
event = json.loads(msg)
etype = event.get("event", "?")
etype = event.get("type", event.get("event", "?"))
ids = self._event_ids_suffix(event)
if self.track_debug:
print(f"[track-debug] event={etype} trackId={event.get('trackId')}{ids}")
if etype == "transcript":
if etype in {"transcript", "transcript.delta", "transcript.final"}:
# User speech transcription
text = event.get("text", "")
is_final = event.get("isFinal", False)
is_final = etype == "transcript.final" or bool(event.get("isFinal"))
if is_final:
print(f"<- You said: {text}")
print(f"<- You said: {text}{ids}")
else:
print(f"<- [listening] {text}", end="\r")
elif etype == "ttfb":
elif etype in {"ttfb", "metrics.ttfb"}:
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
elif etype == "trackStart":
elif etype in {"trackStart", "output.audio.start"}:
# New track starting - accept audio again
self._discard_audio = False
print(f"<- {etype}")
elif etype == "interrupt":
print(f"<- {etype}{ids}")
elif etype in {"interrupt", "response.interrupted"}:
# Interrupt - discard audio until next trackStart
self._discard_audio = True
print(f"<- {etype} (discarding audio until new track)")
elif etype == "hangup":
print(f"<- {etype}")
print(f"<- {etype}{ids} (discarding audio until new track)")
elif etype in {"hangup", "session.stopped"}:
print(f"<- {etype}{ids}")
self.running = False
break
elif etype == "config.resolved":
print(f"<- config.resolved {event.get('config', {}).get('output', {})}{ids}")
else:
print(f"<- {etype}")
print(f"<- {etype}{ids}")
except asyncio.TimeoutError:
continue
@@ -270,6 +311,10 @@ async def main():
parser.add_argument("--text", help="Send text and play response")
parser.add_argument("--list-devices", action="store_true")
parser.add_argument("--sample-rate", type=int, default=16000)
parser.add_argument("--app-id", default="assistant_demo")
parser.add_argument("--channel", default="simple_client")
parser.add_argument("--config-version-id", default="local-dev")
parser.add_argument("--track-debug", action="store_true")
args = parser.parse_args()
@@ -277,7 +322,14 @@ async def main():
list_audio_devices()
return
client = SimpleVoiceClient(args.url, args.sample_rate)
client = SimpleVoiceClient(
args.url,
args.sample_rate,
app_id=args.app_id,
channel=args.channel,
config_version_id=args.config_version_id,
track_debug=args.track_debug,
)
await client.run(args.text)