diff --git a/CHANGELOG.md b/CHANGELOG.md index 04a8eca84..7c15476b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - For `LmntTTSService`, changed the default `model` to `blizzard`, LMNT's recommended model. +### Fixed + +- Fixed an issue where, in some edge cases, the `EmulateUserStartedSpeakingFrame` + could be created even if we didn't have a transcription. + ## [0.0.76] - 2025-07-11 ### Added diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 8c9212ea0..568314f77 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -693,7 +693,11 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): # to emulate VAD (i.e. user start/stopped speaking), but we do it only # if the bot is not speaking. If the bot is speaking and we really have # a short utterance we don't really want to interrupt the bot. - if not self._user_speaking and not self._waiting_for_aggregation: + if ( + not self._user_speaking + and not self._waiting_for_aggregation + and len(self._aggregation) > 0 + ): if self._bot_speaking: # If we reached this case and the bot is speaking, let's ignore # what the user said.