api has llm response event

Xin Wang
2026-02-04 12:00:52 +08:00
parent 5aa9a12ca8
commit 7d255468ab
4 changed files with 63 additions and 9 deletions

View File

@@ -426,6 +426,15 @@ class DuplexPipeline:
sentence_buffer += text_chunk
await self.conversation.update_assistant_text(text_chunk)
# Send LLM response streaming event to client
await self.transport.send_event({
"event": "llmResponse",
"trackId": self.session_id,
"text": text_chunk,
"isFinal": False,
"timestamp": self._get_timestamp_ms()
})
# Check for sentence completion - synthesize immediately for low latency
while any(end in sentence_buffer for end in sentence_ends):
# Find first sentence end
@@ -454,6 +463,16 @@ class DuplexPipeline:
else:
break
# Send final LLM response event
if full_response and not self._interrupt_event.is_set():
await self.transport.send_event({
"event": "llmResponse",
"trackId": self.session_id,
"text": full_response,
"isFinal": True,
"timestamp": self._get_timestamp_ms()
})
# Speak any remaining text
if sentence_buffer.strip() and not self._interrupt_event.is_set():
if not first_audio_sent:
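For reference, here is a minimal sketch of what these new llmResponse frames could look like on the wire, assuming send_event() JSON-encodes the dict as a text frame and _get_timestamp_ms() returns Unix epoch milliseconds (neither helper is shown in this diff; the text values are made up):

    import json, time, uuid

    session_id = str(uuid.uuid4())  # hypothetical trackId value

    # One streaming chunk, emitted per LLM text delta (isFinal=False)
    streaming_chunk = {
        "event": "llmResponse",
        "trackId": session_id,
        "text": "Sure, I can help",
        "isFinal": False,
        "timestamp": int(time.time() * 1000),
    }

    # Final event carrying the accumulated full_response (isFinal=True)
    final_event = {
        "event": "llmResponse",
        "trackId": session_id,
        "text": "Sure, I can help with that.",
        "isFinal": True,
        "timestamp": int(time.time() * 1000),
    }

    print(json.dumps(streaming_chunk))
    print(json.dumps(final_event))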

View File

@@ -4,10 +4,12 @@ Microphone client for testing duplex voice conversation.
This client captures audio from the microphone, sends it to the server,
and plays back the AI's voice response through the speakers.
It also displays the LLM's text responses in the console.
Usage:
python examples/mic_client.py --url ws://localhost:8000/ws
python examples/mic_client.py --url ws://localhost:8000/ws --chat "Hello!"
python examples/mic_client.py --url ws://localhost:8000/ws --verbose
Requirements:
pip install sounddevice soundfile websockets numpy
@@ -102,6 +104,9 @@ class MicrophoneClient:
self._discard_audio = False
self._audio_sequence = 0 # Track audio sequence to detect stale chunks
# Verbose mode for streaming LLM responses
self.verbose = False
async def connect(self) -> None:
"""Connect to WebSocket server."""
print(f"Connecting to {self.url}...")
@@ -314,6 +319,17 @@ class MicrophoneClient:
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
elif event_type == "llmResponse":
# LLM text response
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
# Print final LLM response
print(f"← AI: {text}")
elif self.verbose:
# Show streaming chunks only in verbose mode
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [streaming] {display_text}")
elif event_type == "trackStart": elif event_type == "trackStart":
print("← Bot started speaking") print("← Bot started speaking")
# IMPORTANT: Accept audio again after trackStart # IMPORTANT: Accept audio again after trackStart
@@ -552,6 +568,11 @@ async def main():
action="store_true", action="store_true",
help="Disable interactive mode" help="Disable interactive mode"
) )
parser.add_argument(
"--verbose", "-v",
action="store_true",
help="Show streaming LLM response chunks"
)
args = parser.parse_args()
@@ -565,6 +586,7 @@ async def main():
input_device=args.input_device,
output_device=args.output_device
)
client.verbose = args.verbose
await client.run(
chat_message=args.chat,
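Beyond mic_client.py, any WebSocket client can consume the new event. Below is a minimal standalone listener, a sketch only: it assumes the same ws://localhost:8000/ws endpoint and that JSON events arrive as text frames while audio arrives as binary frames, as mic_client.py implies.

    import asyncio
    import json

    import websockets

    async def watch_llm(url: str = "ws://localhost:8000/ws") -> None:
        async with websockets.connect(url) as ws:
            async for message in ws:
                if isinstance(message, bytes):
                    continue  # binary frames carry audio; skip them here
                event = json.loads(message)
                if event.get("event") != "llmResponse":
                    continue
                label = "AI (final)" if event.get("isFinal") else "AI (chunk)"
                print(f"{label}: {event.get('text', '')}")

    if __name__ == "__main__":
        asyncio.run(watch_llm())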

View File

@@ -115,7 +115,13 @@ class WavFileClient:
"direction": direction, "direction": direction,
"message": message "message": message
}) })
print(f"{direction} {message}") # Handle encoding errors on Windows
try:
print(f"{direction} {message}")
except UnicodeEncodeError:
# Replace problematic characters for console output
safe_message = message.encode('ascii', errors='replace').decode('ascii')
print(f"{direction} {safe_message}")
async def connect(self) -> None:
"""Connect to WebSocket server."""
@@ -285,6 +291,14 @@ class WavFileClient:
elif event_type == "ttfb": elif event_type == "ttfb":
latency_ms = event.get("latencyMs", 0) latency_ms = event.get("latencyMs", 0)
self.log_event("", f"[TTFB] Server latency: {latency_ms}ms") self.log_event("", f"[TTFB] Server latency: {latency_ms}ms")
elif event_type == "llmResponse":
text = event.get("text", "")
is_final = event.get("isFinal", False)
if is_final:
self.log_event("", f"LLM Response (final): {text[:100]}{'...' if len(text) > 100 else ''}")
elif self.verbose:
# Show streaming chunks only in verbose mode
self.log_event("", f"LLM: {text}")
elif event_type == "trackStart": elif event_type == "trackStart":
self.track_started = True self.track_started = True
self.log_event("", "Bot started speaking") self.log_event("", "Bot started speaking")

View File

@@ -6,7 +6,7 @@ Creates a 16kHz mono WAV file with real speech segments separated by
configurable silence (for VAD/testing).
Usage:
python scripts/generate_test_audio.py [OPTIONS]
python generate_test_audio.py [OPTIONS]
Options:
-o, --output PATH Output WAV path (default: data/audio_examples/two_utterances_16k.wav)
@@ -18,19 +18,18 @@ Options:
Examples:
# Default utterances and output
python scripts/generate_test_audio.py
python generate_test_audio.py
# Custom output path
python scripts/generate_test_audio.py -o out.wav
python generate_test_audio.py -o out.wav
# Utterances from command line
python scripts/generate_test_audio.py -u "Hello" -u "World" -o test.wav
python generate_test_audio.py -u "Hello" -u "World" -o test.wav
# Utterances from JSON file
python scripts/generate_test_audio.py -j utterances.json -o test.wav
python generate_test_audio.py -j utterances.json -o test.wav
# Custom silence (1s between utterances)
python scripts/generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
python generate_test_audio.py -u "One" -u "Two" --silence-ms 1000 -o test.wav
Requires SILICONFLOW_API_KEY in .env.
"""
@@ -47,7 +46,7 @@ from dotenv import load_dotenv
# Load .env file from project root
project_root = Path(__file__).parent.parent
project_root = Path(__file__).parent.parent.parent
load_dotenv(project_root / ".env")
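The extra .parent suggests the script now sits one directory deeper relative to the repository root. A small illustration of the path arithmetic, using a purely hypothetical layout of <root>/examples/scripts/generate_test_audio.py (the actual path is not shown in this diff):

    from pathlib import Path

    script = Path("/repo/examples/scripts/generate_test_audio.py")
    print(script.parent)                # /repo/examples/scripts
    print(script.parent.parent)         # /repo/examples
    print(script.parent.parent.parent)  # /repo, where load_dotenv() looks for .env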