LLMUserContextAggregator: ignore short utterances while bot speaking
This commit is contained in:
@@ -25,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Changed
|
||||
|
||||
- Short utterances not detected by VAD while the bot is speaking are now
|
||||
ignored. This significantly reduces the number of bot interruptions, providing
|
||||
a more natural conversation experience.
|
||||
|
||||
- Updated `GladiaSTTService` to output a `TranslationFrame` when specifying a
|
||||
`translation` and `translation_config`.
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ from typing import Dict, List, Literal, Set
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
CancelFrame,
|
||||
EmulateUserStartedSpeakingFrame,
|
||||
@@ -259,9 +260,10 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
|
||||
self._params.aggregation_timeout = kwargs["aggregation_timeout"]
|
||||
|
||||
self._seen_interim_results = False
|
||||
self._user_speaking = False
|
||||
self._bot_speaking = False
|
||||
self._emulating_vad = False
|
||||
self._seen_interim_results = False
|
||||
self._waiting_for_aggregation = False
|
||||
|
||||
self._aggregation_event = asyncio.Event()
|
||||
@@ -297,6 +299,12 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
await self._handle_user_stopped_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, BotStartedSpeakingFrame):
|
||||
await self._handle_bot_started_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._handle_bot_stopped_speaking(frame)
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TranscriptionFrame):
|
||||
await self._handle_transcription(frame)
|
||||
elif isinstance(frame, InterimTranscriptionFrame):
|
||||
@@ -352,6 +360,12 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
if not self._seen_interim_results:
|
||||
await self.push_aggregation()
|
||||
|
||||
async def _handle_bot_started_speaking(self, _: BotStartedSpeakingFrame):
|
||||
self._bot_speaking = True
|
||||
|
||||
async def _handle_bot_stopped_speaking(self, _: BotStoppedSpeakingFrame):
|
||||
self._bot_speaking = False
|
||||
|
||||
async def _handle_transcription(self, frame: TranscriptionFrame):
|
||||
text = frame.text
|
||||
|
||||
@@ -383,7 +397,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
await asyncio.wait_for(
|
||||
self._aggregation_event.wait(), self._params.aggregation_timeout
|
||||
)
|
||||
await self._maybe_push_bot_interruption()
|
||||
await self._maybe_emulate_user_speaking()
|
||||
except asyncio.TimeoutError:
|
||||
if not self._user_speaking:
|
||||
await self.push_aggregation()
|
||||
@@ -398,18 +412,27 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
|
||||
finally:
|
||||
self._aggregation_event.clear()
|
||||
|
||||
async def _maybe_push_bot_interruption(self):
|
||||
"""If the user stopped speaking a while back and we got a transcription
|
||||
frame we might want to interrupt the bot.
|
||||
async def _maybe_emulate_user_speaking(self):
|
||||
"""Emulate user speaking if we got a transcription but it was not
|
||||
detected by VAD. Only do that if the bot is not speaking.
|
||||
|
||||
"""
|
||||
# Check if we received a transcription but VAD was not able to detect
|
||||
# voice (e.g. when you whisper a short utterance). In that case, we need
|
||||
# to emulate VAD (i.e. user start/stopped speaking), but we do it only
|
||||
# if the bot is not speaking. If the bot is speaking and we really have
|
||||
# a short utterance we don't really want to interrupt the bot.
|
||||
if not self._user_speaking and not self._waiting_for_aggregation:
|
||||
# If we reach this case we received a transcription but VAD was not
|
||||
# able to detect voice (e.g. when you whisper a short
|
||||
# utterance). So, we need to emulate VAD (i.e. user start/stopped
|
||||
# speaking).
|
||||
await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
self._emulating_vad = True
|
||||
if self._bot_speaking:
|
||||
# If we reached this case and the bot is speaking, let's ignore
|
||||
# what the user said.
|
||||
logger.debug("Ignoring user speaking emulation, bot is speaking.")
|
||||
self.reset()
|
||||
else:
|
||||
# The bot is not speaking so, let's trigger user speaking
|
||||
# emulation.
|
||||
await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
self._emulating_vad = True
|
||||
|
||||
|
||||
class LLMAssistantContextAggregator(LLMContextResponseAggregator):
|
||||
|
||||
Reference in New Issue
Block a user