api: add llmResponse event
@@ -426,6 +426,15 @@ class DuplexPipeline:
             sentence_buffer += text_chunk
             await self.conversation.update_assistant_text(text_chunk)
 
+            # Send LLM response streaming event to client
+            await self.transport.send_event({
+                "event": "llmResponse",
+                "trackId": self.session_id,
+                "text": text_chunk,
+                "isFinal": False,
+                "timestamp": self._get_timestamp_ms()
+            })
+
             # Check for sentence completion - synthesize immediately for low latency
             while any(end in sentence_buffer for end in sentence_ends):
                 # Find first sentence end
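Each streaming chunk is pushed to the client as soon as it arrives from the LLM. Decoded on the client side (the transport presumably serializes the dict as a JSON text frame over the WebSocket), one such event has the shape below; only the keys come from the code above, the values are placeholders for illustration:

    {
        "event": "llmResponse",
        "trackId": "<session id>",       # illustrative placeholder
        "text": "Sure, I can help",      # the newest chunk only, not the accumulated reply
        "isFinal": False,
        "timestamp": 1712345678901       # illustrative millisecond timestamp
    }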
@@ -454,6 +463,16 @@ class DuplexPipeline:
                 else:
                     break
 
+            # Send final LLM response event
+            if full_response and not self._interrupt_event.is_set():
+                await self.transport.send_event({
+                    "event": "llmResponse",
+                    "trackId": self.session_id,
+                    "text": full_response,
+                    "isFinal": True,
+                    "timestamp": self._get_timestamp_ms()
+                })
+
             # Speak any remaining text
             if sentence_buffer.strip() and not self._interrupt_event.is_set():
                 if not first_audio_sent:
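Taken together, one assistant turn now produces several llmResponse events with isFinal False, each carrying only the latest chunk, followed (unless the turn was interrupted) by a single event with isFinal True whose text field carries the full concatenated response. An illustrative trace of one turn, with made-up text and chunk boundaries:

    llmResponse  isFinal=False  text="Hello"
    llmResponse  isFinal=False  text="! How can I"
    llmResponse  isFinal=False  text=" help you today?"
    llmResponse  isFinal=True   text="Hello! How can I help you today?"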
@@ -4,10 +4,12 @@ Microphone client for testing duplex voice conversation.
 
 This client captures audio from the microphone, sends it to the server,
 and plays back the AI's voice response through the speakers.
+It also displays the LLM's text responses in the console.
 
 Usage:
     python examples/mic_client.py --url ws://localhost:8000/ws
     python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
+    python examples/mic_client.py --url ws://localhost:8000/ws --verbose
 
 Requirements:
     pip install sounddevice soundfile websockets numpy
@@ -102,6 +104,9 @@ class MicrophoneClient:
         self._discard_audio = False
         self._audio_sequence = 0 # Track audio sequence to detect stale chunks
 
+        # Verbose mode for streaming LLM responses
+        self.verbose = False
+
     async def connect(self) -> None:
         """Connect to WebSocket server."""
         print(f"Connecting to {self.url}...")
@@ -314,6 +319,17 @@ class MicrophoneClient:
             # Server-side TTFB event
             latency_ms = event.get("latencyMs", 0)
             print(f"← [TTFB] Server reported latency: {latency_ms}ms")
+        elif event_type == "llmResponse":
+            # LLM text response
+            text = event.get("text", "")
+            is_final = event.get("isFinal", False)
+            if is_final:
+                # Print final LLM response
+                print(f"← AI: {text}")
+            elif self.verbose:
+                # Show streaming chunks only in verbose mode
+                display_text = text[:60] + "..." if len(text) > 60 else text
+                print(f" [streaming] {display_text}")
         elif event_type == "trackStart":
             print("← Bot started speaking")
             # IMPORTANT: Accept audio again after trackStart
@@ -552,6 +568,11 @@ async def main():
         action="store_true",
         help="Disable interactive mode"
     )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Show streaming LLM response chunks"
+    )
 
     args = parser.parse_args()
 
@@ -565,6 +586,7 @@ async def main():
         input_device=args.input_device,
         output_device=args.output_device
     )
+    client.verbose = args.verbose
 
     await client.run(
         chat_message=args.chat,
@@ -115,7 +115,13 @@ class WavFileClient:
             "direction": direction,
             "message": message
         })
-        print(f"{direction} {message}")
+        # Handle encoding errors on Windows
+        try:
+            print(f"{direction} {message}")
+        except UnicodeEncodeError:
+            # Replace problematic characters for console output
+            safe_message = message.encode('ascii', errors='replace').decode('ascii')
+            print(f"{direction} {safe_message}")
 
     async def connect(self) -> None:
         """Connect to WebSocket server."""
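The fallback strips anything the console encoding cannot represent while keeping the rest of the message readable. A quick illustration of the encode/decode round trip used above (standard Python behavior, not project code):

    >>> "← AI: café".encode('ascii', errors='replace').decode('ascii')
    '? AI: caf?'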
@@ -285,6 +291,14 @@ class WavFileClient:
         elif event_type == "ttfb":
             latency_ms = event.get("latencyMs", 0)
             self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
+        elif event_type == "llmResponse":
+            text = event.get("text", "")
+            is_final = event.get("isFinal", False)
+            if is_final:
+                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
+            elif self.verbose:
+                # Show streaming chunks only in verbose mode
+                self.log_event("←", f"LLM: {text}")
         elif event_type == "trackStart":
             self.track_started = True
             self.log_event("←", "Bot started speaking")
@@ -6,7 +6,7 @@ Creates a 16kHz mono WAV file with real speech segments separated by
 configurable silence (for VAD/testing).
 
 Usage:
-    python scripts/generate_test_audio.py [OPTIONS]
+    python generate_test_audio.py [OPTIONS]
 
 Options:
     -o, --output PATH   Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
@@ -18,19 +18,18 @@ Options:
 
 Examples:
     # Default utterances and output
-    python scripts/generate_test_audio.py
+    python generate_test_audio.py
 
     # Custom output path
-    python scripts/generate_test_audio.py -o out.wav
+    python generate_test_audio.py -o out.wav
 
     # Utterances from command line
-    python scripts/generate_test_audio.py -u "Hello" -u "World" -o test.wav
+    python generate_test_audio.py -u "Hello" -u "World" -o test.wav
 
     # Utterances from JSON file
-    python scripts/generate_test_audio.py -j utterances.json -o test.wav
+    python generate_test_audio.py -j utterances.json -o test.wav
 
     # Custom silence (1s between utterances)
-    python scripts/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
+    python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
 
 Requires SILICONFLOW_API_KEY in .env.
 """
@@ -47,7 +46,7 @@ from dotenv import load_dotenv
 
 
 # Load .env file from project root
-project_root = Path(__file__).parent.parent
+project_root = Path(__file__).parent.parent.parent
 load_dotenv(project_root / ".env")
 
 
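The extra .parent means the script is now expected to live one directory deeper than before. A minimal sketch of how the chain resolves, assuming a hypothetical location two levels below the repository root (the actual new path is not shown in this diff):

    from pathlib import Path

    # Hypothetical layout: <root>/scripts/tools/generate_test_audio.py
    script = Path("/repo/scripts/tools/generate_test_audio.py")
    script.parent                 # /repo/scripts/tools
    script.parent.parent          # /repo/scripts
    script.parent.parent.parent   # /repo, where .env is loaded from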