api has llm response event

Xin Wang
2026-02-04 12:00:52 +08:00
parent 5aa9a12ca8
commit 7d255468ab
4 changed files with 63 additions and 9 deletions

View File

@@ -426,6 +426,15 @@ class DuplexPipeline:
sentence_buffer += text_chunk
await self.conversation.update_assistant_text(text_chunk)
# Send LLM response streaming event to client
await self.transport.send_event({
"event": "llmResponse",
"trackId": self.session_id,
"text": text_chunk,
"isFinal": False,
"timestamp": self._get_timestamp_ms()
})
# Check for sentence completion - synthesize immediately for low latency
while any(end in sentence_buffer for end in sentence_ends):
# Find first sentence end
@@ -454,6 +463,16 @@ class DuplexPipeline:
else:
break
# Send final LLM response event
if full_response and not self._interrupt_event.is_set():
await self.transport.send_event({
"event": "llmResponse",
"trackId": self.session_id,
"text": full_response,
"isFinal": True,
"timestamp": self._get_timestamp_ms()
})
# Speak any remaining text
if sentence_buffer.strip() and not self._interrupt_event.is_set():
if not first_audio_sent:
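For reference, here is a minimal sketch of what these new llmResponse frames could look like on the wire, assuming send_event() JSON-encodes the dict as a text frame and _get_timestamp_ms() returns Unix epoch milliseconds (neither helper is shown in this diff; the text values are made up):

    import json, time, uuid

    session_id = str(uuid.uuid4())  # hypothetical trackId value

    # One streaming chunk, emitted per LLM text delta (isFinal=False)
    streaming_chunk = {
        "event": "llmResponse",
        "trackId": session_id,
        "text": "Sure, I can help",
        "isFinal": False,
        "timestamp": int(time.time() * 1000),
    }

    # Final event carrying the accumulated full_response (isFinal=True)
    final_event = {
        "event": "llmResponse",
        "trackId": session_id,
        "text": "Sure, I can help with that.",
        "isFinal": True,
        "timestamp": int(time.time() * 1000),
    }

    print(json.dumps(streaming_chunk))
    print(json.dumps(final_event))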

View File

@@ -4,10 +4,12 @@ Microphone client for testing duplex voice conversation.
This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.
It also displays the LLM's text responses in the console.
Usage:
python examples/mic_client.py --url ws://localhost:8000/ws
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
python examples/mic_client.py --url ws://localhost:8000/ws --verbose
Requirements:
pip install sounddevice soundfile websockets numpy
@@ -102,6 +104,9 @@ class MicrophoneClient:
self._discard_audio = False
self._audio_sequence = 0 # Track audio sequence to detect stale chunks
# Verbose mode for streaming LLM responses
self.verbose = False
async def connect(self) -> None:
"""Connect to WebSocket server."""
print(f"Connecting to {self.url}...")
@@ -314,6 +319,17 @@ class MicrophoneClient:
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
elif event_type == "llmResponse":
# LLM text response
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
# Print final LLM response
print(f"← AI: {text}")
elif self.verbose:
# Show streaming chunks only in verbose mode
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [streaming] {display_text}")
elif event_type == "trackStart": elif event_type == "trackStart":
print("← Bot started speaking") print("← Bot started speaking")
# IMPORTANT: Accept audio again after trackStart # IMPORTANT: Accept audio again after trackStart
@@ -552,6 +568,11 @@ async def main():
action="store_true", action="store_true",
help="Disable interactive mode" help="Disable interactive mode"
) )
parser.add_argument(
"--verbose", "-v",
action="store_true",
help="Show streaming LLM response chunks"
)
args = parser.parse_args()
@@ -565,6 +586,7 @@ async def main():
input_device=args.input_device,
output_device=args.output_device
)
client.verbose = args.verbose
await client.run(
chat_message=args.chat,
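Beyond mic_client.py, any WebSocket client can consume the new event. Below is a minimal standalone listener, a sketch only: it assumes the same ws://localhost:8000/ws endpoint and that JSON events arrive as text frames while audio arrives as binary frames, as mic_client.py implies.

    import asyncio
    import json

    import websockets

    async def watch_llm(url: str = "ws://localhost:8000/ws") -> None:
        async with websockets.connect(url) as ws:
            async for message in ws:
                if isinstance(message, bytes):
                    continue  # binary frames carry audio; skip them here
                event = json.loads(message)
                if event.get("event") != "llmResponse":
                    continue
                label = "AI (final)" if event.get("isFinal") else "AI (chunk)"
                print(f"{label}: {event.get('text', '')}")

    if __name__ == "__main__":
        asyncio.run(watch_llm())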

View File

@@ -115,7 +115,13 @@ class WavFileClient:
"direction": direction, "direction": direction,
"message": message "message": message
}) })
print(f"{direction} {message}") # Handle encoding errors on Windows
try:
print(f"{direction} {message}")
except UnicodeEncodeError:
# Replace problematic characters for console output
safe_message = message.encode('ascii', errors='replace').decode('ascii')
print(f"{direction} {safe_message}")
async def connect(self) -> None:
"""Connect to WebSocket server."""
@@ -285,6 +291,14 @@ class WavFileClient:
elif event_type == "ttfb": elif event_type == "ttfb":
latency_ms = event.get("latencyMs", 0) latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms") self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "llmResponse":
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif self.verbose:
# Show streaming chunks only in verbose mode
self.log_event("", f"LLM: {text}")
elif event_type == "trackStart": elif event_type == "trackStart":
self.track_started = True self.track_started = True
self.log_event("", "Bot started speaking") self.log_event("", "Bot started speaking")

View File

@@ -6,7 +6,7 @@ Creates a 16kHz mono WAV file with real speech segments separated by
configurable silence (for VAD/testing).
Usage:
python scripts/generate_test_audio.py [OPTIONS]
python generate_test_audio.py [OPTIONS]
Options:
-o, --output PATH Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
@@ -18,19 +18,18 @@ Options:
Examples:
# Default utterances and output
python scripts/generate_test_audio.py
python generate_test_audio.py
# Custom output path
python scripts/generate_test_audio.py -o out.wav
python generate_test_audio.py -o out.wav
# Utterances from command line
python scripts/generate_test_audio.py -u "Hello" -u "World" -o test.wav
python generate_test_audio.py -u "Hello" -u "World" -o test.wav
# Utterances from JSON file
python scripts/generate_test_audio.py -j utterances.json -o test.wav
python generate_test_audio.py -j utterances.json -o test.wav
# Custom silence (1s between utterances)
python scripts/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
Requires SILICONFLOW_API_KEY in .env.
"""
@@ -47,7 +46,7 @@ from dotenv import load_dotenv
# Load .env file from project root
project_root = Path(__file__).parent.parent
project_root = Path(__file__).parent.parent.parent
load_dotenv(project_root / ".env")
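The extra .parent suggests the script now sits one directory deeper relative to the repository root. A small illustration of the path arithmetic, using a purely hypothetical layout of <root>/examples/scripts/generate_test_audio.py (the actual path is not shown in this diff):

    from pathlib import Path

    script = Path("/repo/examples/scripts/generate_test_audio.py")
    print(script.parent)                # /repo/examples/scripts
    print(script.parent.parent)         # /repo/examples
    print(script.parent.parent.parent)  # /repo, where load_dotenv() looks for .env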