services(azure): allow transcriptions during interruptions

If the user interrupts we can't just discard transcriptions because the user is
actually interrupting and talking.
This commit is contained in:
Aleix Conchillo Flaqué
2024-06-25 21:33:06 -07:00
parent d302cbb114
commit dac033fe61

View File

@@ -20,7 +20,6 @@ from pipecat.frames.frames import (
Frame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TranscriptionFrame,
URLImageRawFrame)
@@ -143,7 +142,7 @@ class AzureSTTService(AIService):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, StartInterruptionFrame) or isinstance(frame, StopInterruptionFrame):
if isinstance(frame, StartInterruptionFrame):
await self._handle_interruptions(frame)
elif isinstance(frame, SystemFrame):
await self.push_frame(frame, direction)
@@ -166,21 +165,14 @@ class AzureSTTService(AIService):
await self._push_frame_task
async def _handle_interruptions(self, frame: Frame):
if isinstance(frame, StartInterruptionFrame):
# Indicate we are interrupted, we should ignore any out-of-band
# transcriptions.
self._is_interrupted_event.set()
# Cancel the task. This will stop pushing frames downstream.
self._push_frame_task.cancel()
await self._push_frame_task
# Push an out-of-band frame (i.e. not using the ordered push
# frame task).
await self.push_frame(frame)
# Create a new queue and task.
self._create_push_task()
elif isinstance(frame, StopInterruptionFrame):
# We should now be able to receive transcriptions again.
self._is_interrupted_event.clear()
# Cancel the task. This will stop pushing frames downstream.
self._push_frame_task.cancel()
await self._push_frame_task
# Push an out-of-band frame (i.e. not using the ordered push
# frame task).
await self.push_frame(frame)
# Create a new queue and task.
self._create_push_task()
def _create_push_task(self):
self._push_queue = asyncio.Queue()
@@ -197,9 +189,6 @@ class AzureSTTService(AIService):
break
def _on_handle_recognized(self, event):
if self._is_interrupted_event.is_set():
return
if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:
direction = FrameDirection.DOWNSTREAM
frame = TranscriptionFrame(event.result.text, "", int(time.time_ns() / 1000000))