From 71cd0f1c87c3caa7d0dced712f58ddbf5598ee68 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 13 Aug 2025 09:08:15 -0400
Subject: [PATCH] fix: Add text support to OpenAIRealtimeBetaLLMService

---
 CHANGELOG.md                                  |  6 +++++-
 .../foundational/19-openai-realtime-beta.py   |  1 +
 .../services/openai_realtime_beta/openai.py   | 21 +++++++++++++++++--
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9c80ac4fd..8410aec66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,6 +66,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.
 
+- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying `text` in
+  `modalities` didn't result in text being output by the model.
+
 - Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
   compating watchdog cancel sentinel items with other items in the queue.
 
@@ -307,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   callbacks.
 
 - Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures.
-- 
+-
+
 ### Changed
 
 - Changed the default `url` for `NeuphonicTTSService` to
diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py
index b6bd2d864..655439d7c 100644
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -113,6 +113,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
 
     session_properties = SessionProperties(
         input_audio_transcription=InputAudioTranscription(),
+        modalities=["text", "audio"],
         # Set openai TurnDetection parameters. Not setting this at all will turn it
         # on by default
         turn_detection=SemanticTurnDetection(),
diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py
index 03b3d4938..5c7df7597 100644
--- a/src/pipecat/services/openai_realtime_beta/openai.py
+++ b/src/pipecat/services/openai_realtime_beta/openai.py
@@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
         """
         self._audio_input_paused = paused
 
+    def _is_modality_enabled(self, modality: str) -> bool:
+        """Check if a specific modality is enabled, "text" or "audio"."""
+        modalities = self._session_properties.modalities or ["audio", "text"]
+        return modality in modalities
+
+    def _get_enabled_modalities(self) -> list[str]:
+        """Get the list of enabled modalities."""
+        return self._session_properties.modalities or ["audio", "text"]
+
     async def retrieve_conversation_item(self, item_id: str):
         """Retrieve a conversation item by ID from the server.
 
@@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
         await self.stop_all_metrics()
         if self._current_assistant_response:
             await self.push_frame(LLMFullResponseEndFrame())
-            await self.push_frame(TTSStoppedFrame())
+            # Only push TTSStoppedFrame if audio modality is enabled
+            if self._is_modality_enabled("audio"):
+                await self.push_frame(TTSStoppedFrame())
 
     async def _handle_user_started_speaking(self, frame):
         pass
@@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService):
                 await self._handle_evt_speech_started(evt)
             elif evt.type == "input_audio_buffer.speech_stopped":
                 await self._handle_evt_speech_stopped(evt)
+            elif evt.type == "response.text.delta":
+                await self._handle_evt_text_delta(evt)
             elif evt.type == "response.audio_transcript.delta":
                 await self._handle_evt_audio_transcript_delta(evt)
             elif evt.type == "error":
@@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService):
             # Response message without preceding user message. Add it to the context.
             await self._handle_assistant_output(evt.response.output)
 
+    async def _handle_evt_text_delta(self, evt):
+        if evt.delta:
+            await self.push_frame(LLMTextFrame(evt.delta))
+
     async def _handle_evt_audio_transcript_delta(self, evt):
         if evt.delta:
             await self.push_frame(LLMTextFrame(evt.delta))
@@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
         await self.start_ttfb_metrics()
         await self.send_client_event(
             events.ResponseCreateEvent(
-                response=events.ResponseProperties(modalities=["audio", "text"])
+                response=events.ResponseProperties(modalities=self._get_enabled_modalities())
             )
         )