diff --git a/engine/app/config.py b/engine/app/config.py
index f1bb729..2eb4bbe 100644
--- a/engine/app/config.py
+++ b/engine/app/config.py
@@ -53,6 +53,10 @@ class Settings(BaseSettings):
     siliconflow_asr_model: str = Field(default="FunAudioLLM/SenseVoiceSmall", description="SiliconFlow ASR model")
     asr_interim_interval_ms: int = Field(default=500, description="Interval for interim ASR results in ms")
     asr_min_audio_ms: int = Field(default=300, description="Minimum audio duration before first ASR result")
+    asr_start_min_speech_ms: int = Field(
+        default=160,
+        description="Minimum continuous speech duration before ASR capture starts"
+    )
     asr_pre_speech_ms: int = Field(
         default=240,
         description="Audio context (ms) prepended before detected speech to avoid clipping first phoneme"
diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py
index 2f45387..a4fe91b 100644
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -230,6 +230,11 @@ class DuplexPipeline:
         self._audio_buffer: bytes = b""
         max_buffer_seconds = settings.max_audio_buffer_seconds if hasattr(settings, "max_audio_buffer_seconds") else 30
         self._max_audio_buffer_bytes = int(settings.sample_rate * 2 * max_buffer_seconds)
+        self._asr_start_min_speech_ms: int = (
+            settings.asr_start_min_speech_ms if hasattr(settings, "asr_start_min_speech_ms") else 160
+        )
+        self._asr_capture_active: bool = False
+        self._pending_speech_audio: bytes = b""
         # Keep a short rolling pre-speech window so VAD transition latency
         # does not clip the first phoneme/character sent to ASR.
         pre_speech_ms = settings.asr_pre_speech_ms if hasattr(settings, "asr_pre_speech_ms") else 240
@@ -646,7 +651,7 @@ class DuplexPipeline:
 
                 # Update state based on VAD
                 if vad_status == "Speech" and self._last_vad_status != "Speech":
-                    await self._on_speech_start(current_chunk=pcm_bytes)
+                    await self._on_speech_start()
 
                 self._last_vad_status = vad_status
 
@@ -685,20 +690,44 @@ class DuplexPipeline:
                     self._barge_in_speech_frames = 0
                     self._barge_in_silence_frames = 0
 
-                # 3. Buffer audio for ASR
-                if vad_status == "Speech" or self.conversation.state == ConversationState.LISTENING:
-                    self._audio_buffer += pcm_bytes
-                    if len(self._audio_buffer) > self._max_audio_buffer_bytes:
-                        # Keep only the most recent audio to cap memory usage
-                        self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
-                    await self.asr_service.send_audio(pcm_bytes)
+                # 3. Buffer audio for ASR.
+                # Gate ASR startup by a short speech-duration threshold to reduce
+                # false positives from micro noises, then always close the turn
+                # by EOU once ASR has started.
+                just_started_asr = False
+                if vad_status == "Speech" and not self._asr_capture_active:
+                    self._pending_speech_audio += pcm_bytes
+                    pending_ms = (len(self._pending_speech_audio) / (settings.sample_rate * 2)) * 1000.0
+                    if pending_ms >= self._asr_start_min_speech_ms:
+                        await self._start_asr_capture()
+                        just_started_asr = True
+
+                if self._asr_capture_active:
+                    if not just_started_asr:
+                        self._audio_buffer += pcm_bytes
+                        if len(self._audio_buffer) > self._max_audio_buffer_bytes:
+                            # Keep only the most recent audio to cap memory usage
+                            self._audio_buffer = self._audio_buffer[-self._max_audio_buffer_bytes:]
+                        await self.asr_service.send_audio(pcm_bytes)
 
                     # For SiliconFlow ASR, trigger interim transcription periodically
                     # The service handles timing internally via start_interim_transcription()
 
                 # 4. Check for End of Utterance - this triggers LLM response
-                if self.eou_detector.process(vad_status):
+                if self.eou_detector.process(vad_status, force_eligible=self._asr_capture_active):
                     await self._on_end_of_utterance()
+                elif (
+                    vad_status == "Silence"
+                    and not self.eou_detector.is_speaking
+                    and not self._asr_capture_active
+                    and self.conversation.state == ConversationState.LISTENING
+                ):
+                    # Speech was too short to pass ASR gate; reset turn so next
+                    # utterance can start cleanly.
+                    self._pending_speech_audio = b""
+                    self._audio_buffer = b""
+                    self._last_sent_transcript = ""
+                    await self.conversation.set_state(ConversationState.IDLE)
 
         except Exception as e:
             logger.error(f"Pipeline audio processing error: {e}", exc_info=True)
@@ -757,32 +786,44 @@ class DuplexPipeline:
             logger.info(f"[ASR] ASR interim: {text[:100]}")
         logger.debug(f"Sent transcript ({'final' if is_final else 'interim'}): {text[:50]}...")
 
-    async def _on_speech_start(self, current_chunk: bytes = b"") -> None:
+    async def _on_speech_start(self) -> None:
         """Handle user starting to speak."""
         if self.conversation.state in (ConversationState.IDLE, ConversationState.INTERRUPTED):
             await self.conversation.start_user_turn()
             self._audio_buffer = b""
             self._last_sent_transcript = ""
             self.eou_detector.reset()
+            self._asr_capture_active = False
+            self._pending_speech_audio = b""
 
-            # Clear ASR buffer and start interim transcriptions
+            # Clear ASR buffer. Interim starts only after ASR capture is activated.
             if hasattr(self.asr_service, 'clear_buffer'):
                 self.asr_service.clear_buffer()
-            if hasattr(self.asr_service, 'start_interim_transcription'):
-                await self.asr_service.start_interim_transcription()
-            # Prime ASR with a short pre-speech context window so the utterance
-            # start isn't lost while waiting for VAD to transition to Speech.
-            pre_roll = self._pre_speech_buffer
-            if current_chunk and len(pre_roll) > len(current_chunk):
-                pre_roll = pre_roll[:-len(current_chunk)]
-            elif current_chunk:
-                pre_roll = b""
-            if pre_roll:
-                await self.asr_service.send_audio(pre_roll)
-                self._audio_buffer = pre_roll
 
             logger.debug("User speech started")
 
+    async def _start_asr_capture(self) -> None:
+        """Start ASR capture for the current turn after min speech gate passes."""
+        if self._asr_capture_active:
+            return
+
+        if hasattr(self.asr_service, 'start_interim_transcription'):
+            await self.asr_service.start_interim_transcription()
+
+        # Prime ASR with a short pre-speech context window so the utterance
+        # start isn't lost while waiting for VAD to transition to Speech.
+        pre_roll = self._pre_speech_buffer
+        capture_audio = pre_roll + self._pending_speech_audio
+        if capture_audio:
+            await self.asr_service.send_audio(capture_audio)
+            self._audio_buffer = capture_audio[-self._max_audio_buffer_bytes:]
+
+        self._asr_capture_active = True
+        logger.debug(
+            f"ASR capture started after speech gate ({self._asr_start_min_speech_ms}ms), "
+            f"capture={len(capture_audio)} bytes"
+        )
+
     async def _on_end_of_utterance(self) -> None:
         """Handle end of user utterance."""
         if self.conversation.state not in (ConversationState.LISTENING, ConversationState.INTERRUPTED):
@@ -813,6 +854,8 @@ class DuplexPipeline:
             # Reset for next utterance
             self._audio_buffer = b""
             self._last_sent_transcript = ""
+            self._asr_capture_active = False
+            self._pending_speech_audio = b""
             # Return to idle; don't force LISTENING which causes buffering on silence
             await self.conversation.set_state(ConversationState.IDLE)
             return
@@ -833,6 +876,8 @@ class DuplexPipeline:
         # Clear buffers
         self._audio_buffer = b""
         self._last_sent_transcript = ""
+        self._asr_capture_active = False
+        self._pending_speech_audio = b""
 
         # Process the turn - trigger LLM response
         # Cancel any existing turn to avoid overlapping assistant responses
@@ -1493,6 +1538,8 @@ class DuplexPipeline:
         await self.conversation.start_user_turn()
         self._audio_buffer = b""
         self.eou_detector.reset()
+        self._asr_capture_active = False
+        self._pending_speech_audio = b""
 
     async def _stop_current_speech(self) -> None:
         """Stop any current speech task."""
diff --git a/engine/processors/eou.py b/engine/processors/eou.py
index baf6807..22d104f 100644
--- a/engine/processors/eou.py
+++ b/engine/processors/eou.py
@@ -30,7 +30,7 @@ class EouDetector:
         self.silence_start_time: Optional[float] = None
         self.triggered = False
 
-    def process(self, vad_status: str) -> bool:
+    def process(self, vad_status: str, force_eligible: bool = False) -> bool:
         """
         Process VAD status and detect end of utterance.
 
@@ -58,7 +58,7 @@ class EouDetector:
                 self.silence_start_time = now
 
             speech_duration = self.silence_start_time - self.speech_start_time
-            if speech_duration < self.min_speech:
+            if speech_duration < self.min_speech and not force_eligible:
                 self.is_speaking = False
                 self.silence_start_time = None
                 return False