From 99a4154cbc68d5f291f22aef5614419fead8de5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 25 Apr 2025 11:30:00 -0700 Subject: [PATCH] LLMUserContextAggregator: ignore short uterrances while bot speaking --- CHANGELOG.md | 4 ++ .../processors/aggregators/llm_response.py | 45 ++++++++++++++----- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 741d86c4b..92ee7d565 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Short utterances not detected by VAD while the bot is speaking are now + ignored. This reduces the amount of bot interruptions significantly providing + a more natural conversation experience. + - Updated `GladiaSTTService` to output a `TranslationFrame` when specifying a `translation` and `translation_config`. diff --git a/src/pipecat/processors/aggregators/llm_response.py b/src/pipecat/processors/aggregators/llm_response.py index 1f7a33ed1..2d9421634 100644 --- a/src/pipecat/processors/aggregators/llm_response.py +++ b/src/pipecat/processors/aggregators/llm_response.py @@ -12,6 +12,7 @@ from typing import Dict, List, Literal, Set from loguru import logger from pipecat.frames.frames import ( + BotStartedSpeakingFrame, BotStoppedSpeakingFrame, CancelFrame, EmulateUserStartedSpeakingFrame, @@ -259,9 +260,10 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): self._params.aggregation_timeout = kwargs["aggregation_timeout"] - self._seen_interim_results = False self._user_speaking = False + self._bot_speaking = False self._emulating_vad = False + self._seen_interim_results = False self._waiting_for_aggregation = False self._aggregation_event = asyncio.Event() @@ -297,6 +299,12 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): elif isinstance(frame, UserStoppedSpeakingFrame): await self._handle_user_stopped_speaking(frame) await self.push_frame(frame, direction) + elif isinstance(frame, BotStartedSpeakingFrame): + await self._handle_bot_started_speaking(frame) + await self.push_frame(frame, direction) + elif isinstance(frame, BotStoppedSpeakingFrame): + await self._handle_bot_stopped_speaking(frame) + await self.push_frame(frame, direction) elif isinstance(frame, TranscriptionFrame): await self._handle_transcription(frame) elif isinstance(frame, InterimTranscriptionFrame): @@ -352,6 +360,12 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): if not self._seen_interim_results: await self.push_aggregation() + async def _handle_bot_started_speaking(self, _: BotStartedSpeakingFrame): + self._bot_speaking = True + + async def _handle_bot_stopped_speaking(self, _: BotStoppedSpeakingFrame): + self._bot_speaking = False + async def _handle_transcription(self, frame: TranscriptionFrame): text = frame.text @@ -383,7 +397,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): await asyncio.wait_for( self._aggregation_event.wait(), self._params.aggregation_timeout ) - await self._maybe_push_bot_interruption() + await self._maybe_emulate_user_speaking() except asyncio.TimeoutError: if not self._user_speaking: await self.push_aggregation() @@ -398,18 +412,27 @@ class LLMUserContextAggregator(LLMContextResponseAggregator): finally: self._aggregation_event.clear() - async def _maybe_push_bot_interruption(self): - """If the user stopped speaking a while back and we got a transcription - frame we might want to interrupt the bot. + async def _maybe_emulate_user_speaking(self): + """Emulate user speaking if we got a transcription but it was not + detected by VAD. Only do that if the bot is not speaking. """ + # Check if we received a transcription but VAD was not able to detect + # voice (e.g. when you whisper a short utterance). In that case, we need + # to emulate VAD (i.e. user start/stopped speaking), but we do it only + # if the bot is not speaking. If the bot is speaking and we really have + # a short utterance we don't really want to interrupt the bot. if not self._user_speaking and not self._waiting_for_aggregation: - # If we reach this case we received a transcription but VAD was not - # able to detect voice (e.g. when you whisper a short - # utterance). So, we need to emulate VAD (i.e. user start/stopped - # speaking). - await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM) - self._emulating_vad = True + if self._bot_speaking: + # If we reached this case and the bot is speaking, let's ignore + # what the user said. + logger.debug("Ignoring user speaking emulation, bot is speaking.") + self.reset() + else: + # The bot is not speaking so, let's trigger user speaking + # emulation. + await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM) + self._emulating_vad = True class LLMAssistantContextAggregator(LLMContextResponseAggregator):