diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04a8eca84..7c15476b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - For `LmntTTSService`, changed the default `model` to `blizzard`, LMNT's
   recommended model.
 
+### Fixed
+
+- Fixed an issue where, in some edge cases, an `EmulateUserStartedSpeakingFrame`
+  could be created even when there was no transcription.
+
 ## [0.0.76] - 2025-07-11
 
 ### Added
diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py
index 8c9212ea0..568314f77 100644
--- a/src/pipecat/processors/aggregators/llm_response.py
+++ b/src/pipecat/processors/aggregators/llm_response.py
@@ -693,7 +693,11 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         # to emulate VAD (i.e. user start/stopped speaking), but we do it only
         # if the bot is not speaking. If the bot is speaking and we really have
         # a short utterance we don't really want to interrupt the bot.
-        if not self._user_speaking and not self._waiting_for_aggregation:
+        if (
+            not self._user_speaking
+            and not self._waiting_for_aggregation
+            and len(self._aggregation) > 0
+        ):
             if self._bot_speaking:
                 # If we reached this case and the bot is speaking, let's ignore
                 # what the user said.