Add direct muting to the STTMuteFilter

2025-05-02 14:46:00 -04:00
4 changed files with 80 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Added two new frames, `RequestSTTMuteFrame` and `RequestSTTUnmuteFrame`.
+  These frames tell the `STTMuteFilter` to directly mute or unmute the user. A
+  processed `RequestSTTMuteFrame` takes precedence over the mute strategies
+  until a `RequestSTTUnmuteFrame` is processed.
+
 - `BaseOutputTransport` now allows multiple destinations if the transport
  implementation supports it (e.g. Daily's custom tracks). With multiple
  destinations it is possible to send different audio or video tracks with a
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -695,6 +695,20 @@ class STTMuteFrame(SystemFrame):
    mute: bool


+@dataclass
+class RequestSTTMuteFrame(Frame):
+    """Request to mute the STT service."""
+
+    pass
+
+
+@dataclass
+class RequestSTTUnmuteFrame(Frame):
+    """Request to unmute the STT service."""
+
+    pass
+
+
 @dataclass
 class TransportMessageUrgentFrame(SystemFrame):
    message: Any
--- a/src/pipecat/processors/filters/stt_mute_filter.py
+++ b/src/pipecat/processors/filters/stt_mute_filter.py
@@ -25,6 +25,8 @@ from pipecat.frames.frames import (
    FunctionCallResultFrame,
    InputAudioRawFrame,
    InterimTranscriptionFrame,
+    RequestSTTMuteFrame,
+    RequestSTTUnmuteFrame,
    StartFrame,
    StartInterruptionFrame,
    StopInterruptionFrame,
@@ -101,6 +103,7 @@ class STTMuteFilter(FrameProcessor):
        self._bot_is_speaking = False
        self._function_call_in_progress = False
        self._is_muted = False  # Initialize as unmuted, will set state on StartFrame if needed
+        self._frame_requested_mute = False

    @property
    def is_muted(self) -> bool:
@@ -116,6 +119,10 @@ class STTMuteFilter(FrameProcessor):

    async def _should_mute(self) -> bool:
        """Determines if STT should be muted based on current state and strategy."""
+        # First check if a RequestSTTMuteFrame was received
+        if self._frame_requested_mute:
+            return True
+
        for strategy in self._config.strategies:
            match strategy:
                case STTMuteStrategy.FUNCTION_CALL:
@@ -151,7 +158,13 @@ class STTMuteFilter(FrameProcessor):
        should_mute = None

        # Process frames to determine mute state
-        if isinstance(frame, StartFrame):
+        if isinstance(frame, RequestSTTMuteFrame):
+            self._frame_requested_mute = True
+            should_mute = await self._should_mute()
+        elif isinstance(frame, RequestSTTUnmuteFrame):
+            self._frame_requested_mute = False
+            should_mute = await self._should_mute()
+        elif isinstance(frame, StartFrame):
            should_mute = await self._should_mute()
        elif isinstance(frame, FunctionCallInProgressFrame):
            self._function_call_in_progress = True
--- a/tests/test_stt_mute_filter.py
+++ b/tests/test_stt_mute_filter.py
@@ -287,3 +287,50 @@ class TestSTTMuteFilter(unittest.IsolatedAsyncioTestCase):
            frames_to_send=frames_to_send,
            expected_down_frames=expected_returned_frames,
        )
+
+    async def test_direct_frame_muting(self):
+        """Test that RequestSTTMuteFrame and RequestSTTUnmuteFrame directly control muting."""
+        from pipecat.frames.frames import RequestSTTMuteFrame, RequestSTTUnmuteFrame
+
+        # Create filter with no strategies to isolate direct frame control
+        filter = STTMuteFilter(config=STTMuteConfig(strategies=set()))
+
+        frames_to_send = [
+            # Initially unmuted - frames should pass through
+            UserStartedSpeakingFrame(),
+            InputAudioRawFrame(audio=b"", sample_rate=16000, num_channels=1),
+            UserStoppedSpeakingFrame(),
+            # Mute via frame - subsequent frames should be suppressed
+            RequestSTTMuteFrame(),
+            SleepFrame(sleep=0.1),
+            UserStartedSpeakingFrame(),  # Should be suppressed
+            InputAudioRawFrame(
+                audio=b"", sample_rate=16000, num_channels=1
+            ),  # Should be suppressed
+            UserStoppedSpeakingFrame(),  # Should be suppressed
+            # Unmute via frame - frames should pass through again
+            RequestSTTUnmuteFrame(),
+            SleepFrame(sleep=0.1),
+            UserStartedSpeakingFrame(),
+            InputAudioRawFrame(audio=b"", sample_rate=16000, num_channels=1),
+            UserStoppedSpeakingFrame(),
+        ]
+
+        expected_returned_frames = [
+            UserStartedSpeakingFrame,
+            InputAudioRawFrame,
+            UserStoppedSpeakingFrame,
+            STTMuteFrame,  # mute=True
+            RequestSTTMuteFrame,
+            STTMuteFrame,  # mute=False
+            RequestSTTUnmuteFrame,
+            UserStartedSpeakingFrame,
+            InputAudioRawFrame,
+            UserStoppedSpeakingFrame,
+        ]
+
+        await run_test(
+            filter,
+            frames_to_send=frames_to_send,
+            expected_down_frames=expected_returned_frames,
+        )