From 71cd0f1c87c3caa7d0dced712f58ddbf5598ee68 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 13 Aug 2025 09:08:15 -0400 Subject: [PATCH] fix: Add text support to OpenAIRealtimeBetaLLMService --- CHANGELOG.md | 6 +++++- .../foundational/19-openai-realtime-beta.py | 1 + .../services/openai_realtime_beta/openai.py | 21 +++++++++++++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c80ac4fd..8410aec66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,6 +66,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed an issue where `SmallWebRTCTransport` ended before TTS finished. +- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying `text` in + `modalities` didn't result in text being output by the model. + - Fixed a `WatchdogPriorityQueue` issue that could cause an exception when compating watchdog cancel sentinel items with other items in the queue. @@ -307,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 callbacks. - Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures. -- +- + ### Changed - Changed the default `url` for `NeuphonicTTSService` to diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index b6bd2d864..655439d7c 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -113,6 +113,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): session_properties = SessionProperties( input_audio_transcription=InputAudioTranscription(), + modalities=["text", "audio"], # Set openai TurnDetection parameters. 
Not setting this at all will turn it # on by default turn_detection=SemanticTurnDetection(), diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py index 03b3d4938..5c7df7597 100644 --- a/src/pipecat/services/openai_realtime_beta/openai.py +++ b/src/pipecat/services/openai_realtime_beta/openai.py @@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService): """ self._audio_input_paused = paused + def _is_modality_enabled(self, modality: str) -> bool: + """Check if a specific modality is enabled, "text" or "audio".""" + modalities = self._session_properties.modalities or ["audio", "text"] + return modality in modalities + + def _get_enabled_modalities(self) -> list[str]: + """Get the list of enabled modalities.""" + return self._session_properties.modalities or ["audio", "text"] + async def retrieve_conversation_item(self, item_id: str): """Retrieve a conversation item by ID from the server. @@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self.stop_all_metrics() if self._current_assistant_response: await self.push_frame(LLMFullResponseEndFrame()) - await self.push_frame(TTSStoppedFrame()) + # Only push TTSStoppedFrame if audio modality is enabled + if self._is_modality_enabled("audio"): + await self.push_frame(TTSStoppedFrame()) async def _handle_user_started_speaking(self, frame): pass @@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self._handle_evt_speech_started(evt) elif evt.type == "input_audio_buffer.speech_stopped": await self._handle_evt_speech_stopped(evt) + elif evt.type == "response.text.delta": + await self._handle_evt_text_delta(evt) elif evt.type == "response.audio_transcript.delta": await self._handle_evt_audio_transcript_delta(evt) elif evt.type == "error": @@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService): # Response message without preceding user message. Add it to the context. 
await self._handle_assistant_output(evt.response.output) + async def _handle_evt_text_delta(self, evt): + if evt.delta: + await self.push_frame(LLMTextFrame(evt.delta)) + async def _handle_evt_audio_transcript_delta(self, evt): if evt.delta: await self.push_frame(LLMTextFrame(evt.delta)) @@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self.start_ttfb_metrics() await self.send_client_event( events.ResponseCreateEvent( - response=events.ResponseProperties(modalities=["audio", "text"]) + response=events.ResponseProperties(modalities=self._get_enabled_modalities()) ) )