Merge pull request #1702 from pipecat-ai/mb/stt-mute-transcription-frames

Add InterimTranscriptionFrame and TranscriptionFrame to STTMuteFilter…
2025-04-30 17:54:24 -04:00
parent 27d4c927a8 20a59e8c56
commit 8e05f2f1a1
3 changed files with 49 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Changed

+- The `STTMuteFilter` now mutes `InterimTranscriptionFrame` and
+  `TranscriptionFrame`, which allows the `STTMuteFilter` to be used in
+  conjunction with transports that generate transcripts, e.g. `DailyTransport`.
+
 - Function calls now receive a single parameter `FunctionCallParams` instead of
  `(function_name, tool_call_id, args, llm, context, result_callback)` which is
  now deprecated.
--- a/src/pipecat/processors/filters/stt_mute_filter.py
+++ b/src/pipecat/processors/filters/stt_mute_filter.py
@@ -24,10 +24,12 @@ from pipecat.frames.frames import (
    FunctionCallInProgressFrame,
    FunctionCallResultFrame,
    InputAudioRawFrame,
+    InterimTranscriptionFrame,
    StartFrame,
    StartInterruptionFrame,
    StopInterruptionFrame,
    STTMuteFrame,
+    TranscriptionFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame,
 )
@@ -175,6 +177,8 @@ class STTMuteFilter(FrameProcessor):
                UserStartedSpeakingFrame,
                UserStoppedSpeakingFrame,
                InputAudioRawFrame,
+                InterimTranscriptionFrame,
+                TranscriptionFrame,
            ),
        ):
            # Only pass VAD-related frames when not muted
--- a/tests/test_stt_mute_filter.py
+++ b/tests/test_stt_mute_filter.py
@@ -12,7 +12,9 @@ from pipecat.frames.frames import (
    FunctionCallInProgressFrame,
    FunctionCallResultFrame,
    InputAudioRawFrame,
+    InterimTranscriptionFrame,
    STTMuteFrame,
+    TranscriptionFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame,
 )
@@ -100,6 +102,45 @@ class TestSTTMuteFilter(unittest.IsolatedAsyncioTestCase):
            expected_down_frames=expected_returned_frames,
        )

+    async def test_transcription_frames_with_always_strategy(self):
+        filter = STTMuteFilter(config=STTMuteConfig(strategies={STTMuteStrategy.ALWAYS}))
+
+        frames_to_send = [
+            # Bot speaking - should mute
+            BotStartedSpeakingFrame(),
+            SleepFrame(sleep=0.1),  # Wait for StartedSpeaking to process
+            InterimTranscriptionFrame(
+                user_id="user1", text="This should be suppressed", timestamp="1234567890"
+            ),
+            TranscriptionFrame(
+                user_id="user1", text="This should be suppressed", timestamp="1234567890"
+            ),
+            SleepFrame(sleep=0.1),  # Wait for transcription frames to queue
+            BotStoppedSpeakingFrame(),
+            # Bot not speaking - should pass through
+            InterimTranscriptionFrame(
+                user_id="user1", text="This should pass", timestamp="1234567891"
+            ),
+            TranscriptionFrame(
+                user_id="user1", text="This should pass through", timestamp="1234567891"
+            ),
+        ]
+
+        expected_returned_frames = [
+            BotStartedSpeakingFrame,
+            STTMuteFrame,  # mute=True
+            BotStoppedSpeakingFrame,
+            STTMuteFrame,  # mute=False
+            InterimTranscriptionFrame,  # Only passes through after bot stops speaking
+            TranscriptionFrame,  # Only passes through after bot stops speaking
+        ]
+
+        await run_test(
+            filter,
+            frames_to_send=frames_to_send,
+            expected_down_frames=expected_returned_frames,
+        )
+
    # TODO: Revisit once we figure out how to test SystemFrames and DataFrames
    # async def test_function_call_strategy(self):
    #     filter = STTMuteFilter(config=STTMuteConfig(strategies={STTMuteStrategy.FUNCTION_CALL}))