From b640b2d02417eef44cb641d9644caf356f068ea8 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Oct 2024 20:58:17 -0700 Subject: [PATCH] send user started/stopped speaking event from openai realtime events send user started/stopped speaking event from openai realtime events --- examples/foundational/19-openai-realtime-beta.py | 2 +- .../openai_realtime_beta/llm_and_context.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index feab9c10f..16b291ef2 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -76,7 +76,7 @@ async def main(): audio_out_enabled=True, audio_out_sample_rate=24000, transcription_enabled=False, - vad_enabled=True, + vad_enabled=False, vad_analyzer=SileroVADAnalyzer(), vad_audio_passthrough=True, ), diff --git a/src/pipecat/services/openai_realtime_beta/llm_and_context.py b/src/pipecat/services/openai_realtime_beta/llm_and_context.py index 5cf4f6985..35397d95d 100644 --- a/src/pipecat/services/openai_realtime_beta/llm_and_context.py +++ b/src/pipecat/services/openai_realtime_beta/llm_and_context.py @@ -22,11 +22,14 @@ from pipecat.frames.frames import ( LLMUpdateSettingsFrame, StartFrame, StartInterruptionFrame, + StopInterruptionFrame, TextFrame, TranscriptionFrame, TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, + UserStartedSpeakingFrame, + UserStoppedSpeakingFrame, ) from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.openai_llm_context import ( @@ -120,6 +123,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService): session_properties: events.SessionProperties = events.SessionProperties(), start_audio_paused: bool = False, send_transcription_frames: bool = True, + send_user_started_speaking_frames: bool = True, **kwargs, ): super().__init__(base_url=base_url, **kwargs) @@ -129,6 +133,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService): self._session_properties = session_properties self._audio_input_paused = start_audio_paused self._send_transcription_frames = send_transcription_frames + self._send_user_started_speaking_frames = send_user_started_speaking_frames self._websocket = None self._receive_task = None self._context = None @@ -213,10 +218,19 @@ class OpenAILLMServiceRealtimeBeta(LLMService): elif evt.type == "input_audio_buffer.speech_started": # user started speaking # todo: send user started speaking if configured + if self._send_user_started_speaking_frames: + await self.push_frame(UserStartedSpeakingFrame()) + await self.push_frame(StartInterruptionFrame()) + logger.debug("User started speaking") pass elif evt.type == "input_audio_buffer.speech_stopped": # user stopped speaking # todo: send user stopped speaking if configured + if self._send_user_started_speaking_frames: + await self.push_frame(UserStoppedSpeakingFrame()) + await self.push_frame(StopInterruptionFrame()) + + logger.debug("User stopped speaking") await self.start_processing_metrics() await self.start_ttfb_metrics() elif evt.type == "conversation.item.created":