send user started/stopped speaking frames (and start/stop interruption frames) in response to OpenAI realtime speech_started/speech_stopped events

2024-10-07 20:58:17 -07:00
parent bd0649e3ed
commit b640b2d024
2 changed files with 15 additions and 1 deletions
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -76,7 +76,7 @@ async def main():
                audio_out_enabled=True,
                audio_out_sample_rate=24000,
                transcription_enabled=False,
-                vad_enabled=True,
+                vad_enabled=False,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
--- a/src/pipecat/services/openai_realtime_beta/llm_and_context.py
+++ b/src/pipecat/services/openai_realtime_beta/llm_and_context.py
@@ -22,11 +22,14 @@ from pipecat.frames.frames import (
    LLMUpdateSettingsFrame,
    StartFrame,
    StartInterruptionFrame,
+    StopInterruptionFrame,
    TextFrame,
    TranscriptionFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.openai_llm_context import (
@@ -120,6 +123,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
        session_properties: events.SessionProperties = events.SessionProperties(),
        start_audio_paused: bool = False,
        send_transcription_frames: bool = True,
+        send_user_started_speaking_frames: bool = True,
        **kwargs,
    ):
        super().__init__(base_url=base_url, **kwargs)
@@ -129,6 +133,7 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
        self._session_properties = session_properties
        self._audio_input_paused = start_audio_paused
        self._send_transcription_frames = send_transcription_frames
+        self._send_user_started_speaking_frames = send_user_started_speaking_frames
        self._websocket = None
        self._receive_task = None
        self._context = None
@@ -213,10 +218,19 @@ class OpenAILLMServiceRealtimeBeta(LLMService):
                elif evt.type == "input_audio_buffer.speech_started":
                    # user started speaking
                    # todo: send user started speaking if configured
+                    if self._send_user_started_speaking_frames:
+                        await self.push_frame(UserStartedSpeakingFrame())
+                        await self.push_frame(StartInterruptionFrame())
+                        logger.debug("User started speaking")
                    pass
                elif evt.type == "input_audio_buffer.speech_stopped":
                    # user stopped speaking
                    # todo: send user stopped speaking if configured
+                    if self._send_user_started_speaking_frames:
+                        await self.push_frame(UserStoppedSpeakingFrame())
+                        await self.push_frame(StopInterruptionFrame())
+
+                        logger.debug("User stopped speaking")
                    await self.start_processing_metrics()
                    await self.start_ttfb_metrics()
                elif evt.type == "conversation.item.created":