Merge pull request #1501 from pipecat-ai/aleix/transcription-processor-interruption

TranscriptProcessor: send TranscriptionUpdateFrame after interruption
2025-04-01 10:46:16 -07:00
parent 9ea81bc982 3a37b11e56
commit 43008c8c5b
4 changed files with 24 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -74,6 +74,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an issue that could cause the `TranscriptionUpdateFrame` being pushed
+  because of an interruption to be discarded.
+
 - Fixed an issue that would cause `SegmentedSTTService` based services
  (e.g. `OpenAISTTService`) to try to transcribe non-spoken audio, causing
  invalid transcriptions.
--- a/src/pipecat/processors/transcript_processor.py
+++ b/src/pipecat/processors/transcript_processor.py
@@ -175,22 +175,28 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
        """
        await super().process_frame(frame, direction)

-        if isinstance(frame, TTSTextFrame):
+        if isinstance(frame, (StartInterruptionFrame, CancelFrame)):
+            # Push frame first otherwise our emitted transcription update frame
+            # might get cleaned up.
+            await self.push_frame(frame, direction)
+            # Emit accumulated text with interruptions
+            await self._emit_aggregated_text()
+        elif isinstance(frame, TTSTextFrame):
            # Start timestamp on first text part
            if not self._aggregation_start_time:
                self._aggregation_start_time = time_now_iso8601()

            self._current_text_parts.append(frame.text)

-        elif isinstance(frame, (BotStoppedSpeakingFrame, StartInterruptionFrame, CancelFrame)):
-            # Emit accumulated text when bot finishes speaking or is interrupted
+            # Push frame.
+            await self.push_frame(frame, direction)
+        elif isinstance(frame, (BotStoppedSpeakingFrame, EndFrame)):
+            # Emit accumulated text when bot finishes speaking or pipeline ends.
            await self._emit_aggregated_text()
-
-        elif isinstance(frame, EndFrame):
-            # Emit any remaining text when pipeline ends
-            await self._emit_aggregated_text()
-
-        await self.push_frame(frame, direction)
+            # Push frame.
+            await self.push_frame(frame, direction)
+        else:
+            await self.push_frame(frame, direction)


 class TranscriptProcessor:
--- a/tests/test_context_aggregators.py
+++ b/tests/test_context_aggregators.py
@@ -41,7 +41,10 @@ from pipecat.services.google.llm import (
    GoogleLLMContext,
    GoogleUserContextAggregator,
 )
-from pipecat.services.openai import OpenAIAssistantContextAggregator, OpenAIUserContextAggregator
+from pipecat.services.openai.llm import (
+    OpenAIAssistantContextAggregator,
+    OpenAIUserContextAggregator,
+)
 from pipecat.tests.utils import SleepFrame, run_test

 AGGREGATION_TIMEOUT = 0.1
--- a/tests/test_transcript_processor.py
+++ b/tests/test_transcript_processor.py
@@ -238,8 +238,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
            TTSTextFrame(text="world!"),
            SleepFrame(sleep=0.1),
            StartInterruptionFrame(),  # User interrupts here
-            BotStartedSpeakingFrame(),
            SleepFrame(sleep=0.1),
+            BotStartedSpeakingFrame(),
            TTSTextFrame(text="New"),
            TTSTextFrame(text="response"),
            SleepFrame(sleep=0.1),
@@ -251,8 +251,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
            BotStartedSpeakingFrame,
            TTSTextFrame,  # "Hello"
            TTSTextFrame,  # "world!"
+            StartInterruptionFrame,
            TranscriptionUpdateFrame,  # First message (emitted due to interruption)
-            StartInterruptionFrame,  # Interruption frame comes after the update
            BotStartedSpeakingFrame,
            TTSTextFrame,  # "New"
            TTSTextFrame,  # "response"