Merge pull request #1501 from pipecat-ai/aleix/transcription-processor-interruption
TranscriptProcessor: send TranscriptionUpdateFrame after interruption
This commit is contained in:
@@ -74,6 +74,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue that could cause the `TranscriptionUpdateFrame` being pushed
|
||||
because of an interruption to be discarded.
|
||||
|
||||
- Fixed an issue that would cause `SegmentedSTTService` based services
|
||||
(e.g. `OpenAISTTService`) to try to transcribe non-spoken audio, causing
|
||||
invalid transcriptions.
|
||||
|
||||
@@ -175,22 +175,28 @@ class AssistantTranscriptProcessor(BaseTranscriptProcessor):
|
||||
"""
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TTSTextFrame):
|
||||
if isinstance(frame, (StartInterruptionFrame, CancelFrame)):
|
||||
# Push frame first otherwise our emitted transcription update frame
|
||||
# might get cleaned up.
|
||||
await self.push_frame(frame, direction)
|
||||
# Emit accumulated text with interruptions
|
||||
await self._emit_aggregated_text()
|
||||
elif isinstance(frame, TTSTextFrame):
|
||||
# Start timestamp on first text part
|
||||
if not self._aggregation_start_time:
|
||||
self._aggregation_start_time = time_now_iso8601()
|
||||
|
||||
self._current_text_parts.append(frame.text)
|
||||
|
||||
elif isinstance(frame, (BotStoppedSpeakingFrame, StartInterruptionFrame, CancelFrame)):
|
||||
# Emit accumulated text when bot finishes speaking or is interrupted
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, (BotStoppedSpeakingFrame, EndFrame)):
|
||||
# Emit accumulated text when bot finishes speaking or pipeline ends.
|
||||
await self._emit_aggregated_text()
|
||||
|
||||
elif isinstance(frame, EndFrame):
|
||||
# Emit any remaining text when pipeline ends
|
||||
await self._emit_aggregated_text()
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
# Push frame.
|
||||
await self.push_frame(frame, direction)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class TranscriptProcessor:
|
||||
|
||||
@@ -41,7 +41,10 @@ from pipecat.services.google.llm import (
|
||||
GoogleLLMContext,
|
||||
GoogleUserContextAggregator,
|
||||
)
|
||||
from pipecat.services.openai import OpenAIAssistantContextAggregator, OpenAIUserContextAggregator
|
||||
from pipecat.services.openai.llm import (
|
||||
OpenAIAssistantContextAggregator,
|
||||
OpenAIUserContextAggregator,
|
||||
)
|
||||
from pipecat.tests.utils import SleepFrame, run_test
|
||||
|
||||
AGGREGATION_TIMEOUT = 0.1
|
||||
|
||||
@@ -238,8 +238,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
TTSTextFrame(text="world!"),
|
||||
SleepFrame(sleep=0.1),
|
||||
StartInterruptionFrame(), # User interrupts here
|
||||
BotStartedSpeakingFrame(),
|
||||
SleepFrame(sleep=0.1),
|
||||
BotStartedSpeakingFrame(),
|
||||
TTSTextFrame(text="New"),
|
||||
TTSTextFrame(text="response"),
|
||||
SleepFrame(sleep=0.1),
|
||||
@@ -251,8 +251,8 @@ class TestUserTranscriptProcessor(unittest.IsolatedAsyncioTestCase):
|
||||
BotStartedSpeakingFrame,
|
||||
TTSTextFrame, # "Hello"
|
||||
TTSTextFrame, # "world!"
|
||||
StartInterruptionFrame,
|
||||
TranscriptionUpdateFrame, # First message (emitted due to interruption)
|
||||
StartInterruptionFrame, # Interruption frame comes after the update
|
||||
BotStartedSpeakingFrame,
|
||||
TTSTextFrame, # "New"
|
||||
TTSTextFrame, # "response"
|
||||
|
||||
Reference in New Issue
Block a user