From 20a59e8c567af81dcef98e410aaf9e7ec1246680 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 30 Apr 2025 10:44:31 -0400 Subject: [PATCH] Add InterimTranscriptionFrame and TranscriptionFrame to STTMuteFilter frame processing --- CHANGELOG.md | 4 ++ .../processors/filters/stt_mute_filter.py | 4 ++ tests/test_stt_mute_filter.py | 41 +++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e5bb8849..e0153d3a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- The `STTMuteFilter` now mutes `InterimTranscriptionFrame` and + `TranscriptionFrame` which allows the `STTMuteFilter` to be used in + conjunction with transports that generate transcripts, e.g. `DailyTransport`. + - Function calls now receive a single parameter `FunctionCallParams` instead of `(function_name, tool_call_id, args, llm, context, result_callback)` which is now deprecated. diff --git a/src/pipecat/processors/filters/stt_mute_filter.py b/src/pipecat/processors/filters/stt_mute_filter.py index ae81acc1e..e85a3e581 100644 --- a/src/pipecat/processors/filters/stt_mute_filter.py +++ b/src/pipecat/processors/filters/stt_mute_filter.py @@ -24,10 +24,12 @@ from pipecat.frames.frames import ( FunctionCallInProgressFrame, FunctionCallResultFrame, InputAudioRawFrame, + InterimTranscriptionFrame, StartFrame, StartInterruptionFrame, StopInterruptionFrame, STTMuteFrame, + TranscriptionFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) @@ -175,6 +177,8 @@ class STTMuteFilter(FrameProcessor): UserStartedSpeakingFrame, UserStoppedSpeakingFrame, InputAudioRawFrame, + InterimTranscriptionFrame, + TranscriptionFrame, ), ): # Only pass VAD-related frames when not muted diff --git a/tests/test_stt_mute_filter.py b/tests/test_stt_mute_filter.py index a55c4609e..f0c0d7d57 100644 --- a/tests/test_stt_mute_filter.py +++ b/tests/test_stt_mute_filter.py @@ -12,7 +12,9 @@ from pipecat.frames.frames import ( FunctionCallInProgressFrame, FunctionCallResultFrame, InputAudioRawFrame, + InterimTranscriptionFrame, STTMuteFrame, + TranscriptionFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) @@ -100,6 +102,45 @@ class TestSTTMuteFilter(unittest.IsolatedAsyncioTestCase): expected_down_frames=expected_returned_frames, ) + async def test_transcription_frames_with_always_strategy(self): + filter = STTMuteFilter(config=STTMuteConfig(strategies={STTMuteStrategy.ALWAYS})) + + frames_to_send = [ + # Bot speaking - should mute + BotStartedSpeakingFrame(), + SleepFrame(sleep=0.1), # Wait for StartedSpeaking to process + InterimTranscriptionFrame( + user_id="user1", text="This should be suppressed", timestamp="1234567890" + ), + TranscriptionFrame( + user_id="user1", text="This should be suppressed", timestamp="1234567890" + ), + SleepFrame(sleep=0.1), # Wait for transcription frames to queue + BotStoppedSpeakingFrame(), + # Bot not speaking - should pass through + InterimTranscriptionFrame( + user_id="user1", text="This should pass", timestamp="1234567891" + ), + TranscriptionFrame( + user_id="user1", text="This should pass through", timestamp="1234567891" + ), + ] + + expected_returned_frames = [ + BotStartedSpeakingFrame, + STTMuteFrame, # mute=True + BotStoppedSpeakingFrame, + STTMuteFrame, # mute=False + InterimTranscriptionFrame, # Only passes through after bot stops speaking + TranscriptionFrame, # Only passes through after bot stops speaking + ] + + await run_test( + filter, + frames_to_send=frames_to_send, + expected_down_frames=expected_returned_frames, + ) + # TODO: Revisit once we figure out how to test SystemFrames and DataFrames # async def test_function_call_strategy(self): # filter = STTMuteFilter(config=STTMuteConfig(strategies={STTMuteStrategy.FUNCTION_CALL}))