diff --git a/core/duplex_pipeline.py b/core/duplex_pipeline.py
index 4dd0bae..6ddb8fd 100644
--- a/core/duplex_pipeline.py
+++ b/core/duplex_pipeline.py
@@ -426,6 +426,15 @@ class DuplexPipeline:
             sentence_buffer += text_chunk
             await self.conversation.update_assistant_text(text_chunk)
 
+            # Send LLM response streaming event to client
+            await self.transport.send_event({
+                "event": "llmResponse",
+                "trackId": self.session_id,
+                "text": text_chunk,
+                "isFinal": False,
+                "timestamp": self._get_timestamp_ms()
+            })
+
             # Check for sentence completion - synthesize immediately for low latency
             while any(end in sentence_buffer for end in sentence_ends):
                 # Find first sentence end
@@ -454,6 +463,16 @@
                 else:
                     break
 
+        # Send final LLM response event
+        if full_response and not self._interrupt_event.is_set():
+            await self.transport.send_event({
+                "event": "llmResponse",
+                "trackId": self.session_id,
+                "text": full_response,
+                "isFinal": True,
+                "timestamp": self._get_timestamp_ms()
+            })
+
         # Speak any remaining text
         if sentence_buffer.strip() and not self._interrupt_event.is_set():
             if not first_audio_sent:
diff --git a/examples/mic_client.py b/examples/mic_client.py
index 3d9acc7..509aeaa 100644
--- a/examples/mic_client.py
+++ b/examples/mic_client.py
@@ -4,10 +4,12 @@ Microphone client for testing duplex voice conversation.
 
 This client captures audio from the microphone, sends it to the server,
 and plays back the AI's voice response through the speakers.
+It also displays the LLM's text responses in the console.
 
 Usage:
     python examples/mic_client.py --url ws://localhost:8000/ws
     python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
+    python examples/mic_client.py --url ws://localhost:8000/ws --verbose
 
 Requirements:
     pip install sounddevice soundfile websockets numpy
@@ -101,6 +103,9 @@ class MicrophoneClient:
         # Interrupt handling - discard audio until next trackStart
         self._discard_audio = False
         self._audio_sequence = 0  # Track audio sequence to detect stale chunks
+
+        # Verbose mode for streaming LLM responses
+        self.verbose = False
 
     async def connect(self) -> None:
         """Connect to WebSocket server."""
@@ -314,6 +319,17 @@ class MicrophoneClient:
             # Server-side TTFB event
             latency_ms = event.get("latencyMs", 0)
             print(f"← [TTFB] Server reported latency: {latency_ms}ms")
+        elif event_type == "llmResponse":
+            # LLM text response
+            text = event.get("text", "")
+            is_final = event.get("isFinal", False)
+            if is_final:
+                # Print final LLM response
+                print(f"← AI: {text}")
+            elif self.verbose:
+                # Show streaming chunks only in verbose mode
+                display_text = text[:60] + "..." if len(text) > 60 else text
+                print(f"  [streaming] {display_text}")
         elif event_type == "trackStart":
             print("← Bot started speaking")
             # IMPORTANT: Accept audio again after trackStart
@@ -552,6 +568,11 @@ async def main():
         action="store_true",
         help="Disable interactive mode"
     )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Show streaming LLM response chunks"
+    )
 
     args = parser.parse_args()
 
@@ -565,6 +586,7 @@ async def main():
         input_device=args.input_device,
         output_device=args.output_device
     )
+    client.verbose = args.verbose
 
     await client.run(
         chat_message=args.chat,
diff --git a/examples/wav_client.py b/examples/wav_client.py
index d784bba..5385acf 100644
--- a/examples/wav_client.py
+++ b/examples/wav_client.py
@@ -115,7 +115,13 @@ class WavFileClient:
             "direction": direction,
             "message": message
         })
-        print(f"{direction} {message}")
+        # Handle encoding errors on Windows
+        try:
+            print(f"{direction} {message}")
+        except UnicodeEncodeError:
+            # Replace problematic characters for console output
+            safe_message = message.encode('ascii', errors='replace').decode('ascii')
+            print(f"{direction} {safe_message}")
 
     async def connect(self) -> None:
         """Connect to WebSocket server."""
@@ -285,6 +291,14 @@ class WavFileClient:
         elif event_type == "ttfb":
             latency_ms = event.get("latencyMs", 0)
             self.log_event("←", f"[TTFB] Server latency: {latency_ms}ms")
+        elif event_type == "llmResponse":
+            text = event.get("text", "")
+            is_final = event.get("isFinal", False)
+            if is_final:
+                self.log_event("←", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
+            elif self.verbose:
+                # Show streaming chunks only in verbose mode
+                self.log_event("←", f"LLM: {text}")
         elif event_type == "trackStart":
             self.track_started = True
             self.log_event("←", "Bot started speaking")
diff --git a/scripts/generate_test_audio/generate_test_audio.py b/scripts/generate_test_audio/generate_test_audio.py
index 66c1908..9b37f5f 100644
--- a/scripts/generate_test_audio/generate_test_audio.py
+++ b/scripts/generate_test_audio/generate_test_audio.py
@@ -6,7 +6,7 @@ Creates a 16kHz mono WAV file with real speech segments separated by
 configurable silence (for VAD/testing).
 
 Usage:
-    python scripts/generate_test_audio.py [OPTIONS]
+    python generate_test_audio.py [OPTIONS]
 
 Options:
     -o, --output PATH    Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
@@ -18,19 +18,19 @@ Options:
 
 Examples:
     # Default utterances and output
-    python scripts/generate_test_audio.py
+    python generate_test_audio.py
 
     # Custom output path
-    python scripts/generate_test_audio.py -o out.wav
+    python generate_test_audio.py -o out.wav
 
     # Utterances from command line
-    python scripts/generate_test_audio.py -u "Hello" -u "World" -o test.wav
+    python generate_test_audio.py -u "Hello" -u "World" -o test.wav
 
     # Utterances from JSON file
-    python scripts/generate_test_audio.py -j utterances.json -o test.wav
+    python generate_test_audio.py -j utterances.json -o test.wav
 
     # Custom silence (1s between utterances)
-    python scripts/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
+    python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
 
 Requires SILICONFLOW_API_KEY in .env.
 """
@@ -47,7 +47,7 @@
 from dotenv import load_dotenv
 
 
 # Load .env file from project root
-project_root = Path(__file__).parent.parent
+project_root = Path(__file__).parent.parent.parent
 load_dotenv(project_root / ".env")
 