From 5803936838950ceb166c7cb2b0902602f406f9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 26 Aug 2025 12:38:30 -0700 Subject: [PATCH 1/3] TextFrame: add skip_tts field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This lets a text frame bypass TTS while still being included in the LLM context. Useful for cases like structured text that isn’t meant to be spoken but should still contribute to context. --- CHANGELOG.md | 4 ++++ src/pipecat/frames/frames.py | 5 +++++ src/pipecat/services/tts_service.py | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a643b7a3e..14b779f47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `skip_tts` field to `TextFrame`. This lets a text frame bypass TTS while + still being included in the LLM context. Useful for cases like structured text + that isn’t meant to be spoken but should still contribute to context. + - Added a `cancel_timeout_secs` argument to `PipelineTask` which defines how long the pipeline has to complete cancellation. 
When `PipelineTask.cancel()` is called, a `CancelFrame` is pushed through the pipeline and must reach the diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 139857297..18ab13bac 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -305,6 +305,11 @@ class TextFrame(DataFrame): """ text: str + skip_tts: bool = field(init=False) + + def __post_init__(self): + super().__post_init__() + self.skip_tts = False def __str__(self): pts = format_pts(self.pts) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 2a210f093..afe5da62d 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -296,7 +296,9 @@ class TTSService(AIService): """ await super().process_frame(frame, direction) - if ( + if isinstance(frame, TextFrame) and frame.skip_tts: + await self.push_frame(frame, direction) + elif ( isinstance(frame, TextFrame) and not isinstance(frame, InterimTranscriptionFrame) and not isinstance(frame, TranscriptionFrame) From 16f57be72cec9f718ab1d42df258baac82b39f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 26 Aug 2025 14:01:48 -0700 Subject: [PATCH 2/3] LLMConfigureOutputFrame: allow configuring LLM output --- CHANGELOG.md | 6 ++++++ src/pipecat/frames/frames.py | 15 +++++++++++++++ .../processors/aggregators/dtmf_aggregator.py | 2 +- src/pipecat/services/llm_service.py | 17 +++++++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14b779f47..8ae588300 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added support for switching between audio+text and text-only modes within the + same pipeline. This is done by pushing + `LLMConfigureOutputFrame(skip_tts=True)` to enter text-only mode, and + `LLMConfigureOutputFrame(skip_tts=False)` to return to audio+text. 
The LLM will still generate tokens and + add them to the context, but they will not be sent to TTS. + - Added `skip_tts` field to `TextFrame`. This lets a text frame bypass TTS while still being included in the LLM context. Useful for cases like structured text that isn’t meant to be spoken but should still contribute to context. diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 18ab13bac..5052c0df2 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -607,6 +607,21 @@ class LLMEnablePromptCachingFrame(DataFrame): enable: bool +@dataclass +class LLMConfigureOutputFrame(DataFrame): + """Frame to configure LLM output. + + This frame is used to configure how the LLM produces output. For example, it + can tell the LLM to generate tokens that should be added to the context but + not spoken by the TTS service (if one is present in the pipeline). + + Parameters: + skip_tts: Whether LLM tokens should skip the TTS service (if any). + """ + + skip_tts: bool + + @dataclass class TTSSpeakFrame(DataFrame): """Frame containing text that should be spoken by TTS. diff --git a/src/pipecat/processors/aggregators/dtmf_aggregator.py b/src/pipecat/processors/aggregators/dtmf_aggregator.py index 24ef2a1e1..38e1296f6 100644 --- a/src/pipecat/processors/aggregators/dtmf_aggregator.py +++ b/src/pipecat/processors/aggregators/dtmf_aggregator.py @@ -103,7 +103,7 @@ class DTMFAggregator(FrameProcessor): digit_value = frame.button.value self._aggregation += digit_value - # For first digit, schedule interruption in separate task + # For first digit, schedule interruption. 
if is_first_digit: await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM) diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py index 3152a0083..63f7659b3 100644 --- a/src/pipecat/services/llm_service.py +++ b/src/pipecat/services/llm_service.py @@ -37,6 +37,8 @@ from pipecat.frames.frames import ( FunctionCallResultFrame, FunctionCallResultProperties, FunctionCallsStartedFrame, + LLMConfigureOutputFrame, + LLMTextFrame, StartFrame, StartInterruptionFrame, UserImageRequestFrame, @@ -179,6 +181,7 @@ class LLMService(AIService): self._function_call_tasks: Dict[asyncio.Task, FunctionCallRunnerItem] = {} self._sequential_runner_task: Optional[asyncio.Task] = None self._tracing_enabled: bool = False + self._skip_tts: bool = False self._register_event_handler("on_function_calls_started") self._register_event_handler("on_completion_timeout") @@ -272,6 +275,20 @@ class LLMService(AIService): if isinstance(frame, StartInterruptionFrame): await self._handle_interruptions(frame) + elif isinstance(frame, LLMConfigureOutputFrame): + self._skip_tts = frame.skip_tts + + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + """Pushes a frame. + + Args: + frame: The frame to push. + direction: The direction of frame pushing. 
+ """ + if isinstance(frame, LLMTextFrame): + frame.skip_tts = self._skip_tts + + await super().push_frame(frame, direction) async def _handle_interruptions(self, _: StartInterruptionFrame): for function_name, entry in self._functions.items(): From eb248fedc1286f89fa9cd9288b6db3932f4b7c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Tue, 26 Aug 2025 14:47:19 -0700 Subject: [PATCH 3/3] add skip_tts to LLMFullResponseStartFrame/LLMFullResponseEndFrame --- src/pipecat/frames/frames.py | 12 ++++++++++-- src/pipecat/services/llm_service.py | 5 +++-- src/pipecat/services/tts_service.py | 5 ++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py index 5052c0df2..691a09f5b 100644 --- a/src/pipecat/frames/frames.py +++ b/src/pipecat/frames/frames.py @@ -1351,14 +1351,22 @@ class LLMFullResponseStartFrame(ControlFrame): more TextFrames and a final LLMFullResponseEndFrame. """ - pass + skip_tts: bool = field(init=False) + + def __post_init__(self): + super().__post_init__() + self.skip_tts = False @dataclass class LLMFullResponseEndFrame(ControlFrame): """Frame indicating the end of an LLM response.""" - pass + skip_tts: bool = field(init=False) + + def __post_init__(self): + super().__post_init__() + self.skip_tts = False @dataclass diff --git a/src/pipecat/services/llm_service.py b/src/pipecat/services/llm_service.py index 63f7659b3..a44f7ab26 100644 --- a/src/pipecat/services/llm_service.py +++ b/src/pipecat/services/llm_service.py @@ -14,7 +14,6 @@ from typing import ( Awaitable, Callable, Dict, - List, Mapping, Optional, Protocol, @@ -38,6 +37,8 @@ from pipecat.frames.frames import ( FunctionCallResultProperties, FunctionCallsStartedFrame, LLMConfigureOutputFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, LLMTextFrame, StartFrame, StartInterruptionFrame, @@ -285,7 +286,7 @@ class LLMService(AIService): frame: The frame to push. 
direction: The direction of frame pushing. """ - if isinstance(frame, LLMTextFrame): + if isinstance(frame, (LLMTextFrame, LLMFullResponseStartFrame, LLMFullResponseEndFrame)): frame.skip_tts = self._skip_tts await super().push_frame(frame, direction) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index afe5da62d..9e34adf25 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -296,7 +296,10 @@ class TTSService(AIService): """ await super().process_frame(frame, direction) - if isinstance(frame, TextFrame) and frame.skip_tts: + if ( + isinstance(frame, (TextFrame, LLMFullResponseStartFrame, LLMFullResponseEndFrame)) + and frame.skip_tts + ): await self.push_frame(frame, direction) elif ( isinstance(frame, TextFrame)