From d33c72a8b0bd7b874f70bcd5099f6f3aa20193cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Thu, 11 Dec 2025 10:29:56 -0800 Subject: [PATCH] LLMUserAggregator: allow external user started/stopped speaking frames --- .../aggregators/llm_response_universal.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/pipecat/processors/aggregators/llm_response_universal.py b/src/pipecat/processors/aggregators/llm_response_universal.py index df3f707c3..a007c5ebd 100644 --- a/src/pipecat/processors/aggregators/llm_response_universal.py +++ b/src/pipecat/processors/aggregators/llm_response_universal.py @@ -15,6 +15,7 @@ import asyncio import json import warnings from abc import abstractmethod +from dataclasses import dataclass from typing import Any, Dict, List, Literal, Optional, Set from loguru import logger @@ -58,7 +59,6 @@ from pipecat.processors.aggregators.llm_context import ( ) from pipecat.processors.aggregators.llm_response import ( LLMAssistantAggregatorParams, - LLMUserAggregatorParams, ) from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.turns.bot.base_bot_turn_start_strategy import BaseBotTurnStartStrategy @@ -67,6 +67,21 @@ from pipecat.utils.string import TextPartForConcatenation, concatenate_aggregate from pipecat.utils.time import time_now_iso8601 +@dataclass +class LLMUserAggregatorParams: + """Parameters for configuring LLM user aggregation behavior. + + Parameters: + enable_user_speaking_frames: If True, the aggregator will emit frames + indicating when the user starts and stops speaking, as well as + interruption frames. This is enabled by default, but you may want + to disable it if another component (e.g., an STT service) is already + generating these frames. + """ + + enable_user_speaking_frames: bool = True + + class LLMContextAggregator(FrameProcessor): """Base LLM aggregator that uses an LLMContext for conversation storage. @@ -370,15 +385,15 @@ class LLMUserAggregator(LLMContextAggregator): self._user_speaking = True - logger.debug(f"User started speaking (user turn start strategy: {strategy})") - # Reset all user turn start strategies to start fresh. if self.turn_start_strategies: for s in self.turn_start_strategies.user: await s.reset() - await self.push_frame(UserStartedSpeakingFrame()) - await self.push_frame(InterruptionFrame()) + if self._params.enable_user_speaking_frames: + logger.debug(f"User started speaking (user turn start strategy: {strategy})") + await self.push_frame(UserStartedSpeakingFrame()) + await self.push_frame(InterruptionFrame()) async def _trigger_bot_turn_start(self, strategy: BaseBotTurnStartStrategy): if not self._user_speaking: @@ -386,14 +401,16 @@ class LLMUserAggregator(LLMContextAggregator): self._user_speaking = False - logger.debug(f"User stopped speaking (bot turn start strategy: {strategy})") - # Reset all bot turn start strategies to start fresh. if self.turn_start_strategies: for s in self.turn_start_strategies.bot: await s.reset() - await self.push_frame(UserStoppedSpeakingFrame()) + if self._params.enable_user_speaking_frames: + logger.debug(f"User stopped speaking (bot turn start strategy: {strategy})") + await self.push_frame(UserStoppedSpeakingFrame()) + + # Always push context frame. await self.push_aggregation()