From 727af2e6fb7d59758b63779fea441d511eeab0aa Mon Sep 17 00:00:00 2001 From: Filipi Fuchter Date: Mon, 14 Jul 2025 17:38:03 -0300 Subject: [PATCH] Only create the EmulateUserStartedSpeakingFrame if we have received a transcription. --- CHANGELOG.md | 5 +++++ src/pipecat/processors/aggregators/llm_response.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04a8eca84..7c15476b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - For `LmntTTSService`, changed the default `model` to `blizzard`, LMNT's recommended model. +### Fixed + +- Fixed an issue where, in some edge cases, the `EmulateUserStartedSpeakingFrame` + could be created even if we didn't have a transcription. + ## [0.0.76] - 2025-07-11 ### Added diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 8c9212ea0..568314f77 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -693,7 +693,11 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): # to emulate VAD (i.e. user start/stopped speaking), but we do it only # if the bot is not speaking. If the bot is speaking and we really have # a short utterance we don't really want to interrupt the bot. - if not self._user_speaking and not self._waiting_for_aggregation: + if ( + not self._user_speaking + and not self._waiting_for_aggregation + and len(self._aggregation) > 0 + ): if self._bot_speaking: # If we reached this case and the bot is speaking, let's ignore # what the user said.