Merge pull request #2433 from pipecat-ai/mb/openai-realtime-text-modality

fix: Add text support to OpenAIRealtimeBetaLLMService
2025-08-13 08:41:33 -07:00
parent a2a419e6db 3ad61a8a04
commit 608570e89d
5 changed files with 256 additions and 15 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,19 +66,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.

+- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying a `text`
+  `modalities` didn't result in text being outputted from the model.
+
 - Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
  compating watchdog cancel sentinel items with other items in the queue.

 - Fixed an issue that would cause system frames to not be processed with higher
  priority than other frames. This could cause slower interruption times.

-### Fixed
-
 - Fixed an issue where retrying a websocket connection error would result in an
  error.

 ### Other

+- Add foundation example `19b-openai-realtime-beta-text.py`, showing how to use
+  `OpenAIRealtimeBetaLLMService` to output text to a TTS service.
+
 - Add vision support to release evals so we can run the foundational examples 12
  series.

@@ -307,7 +311,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  callbacks.

 - Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures.
- 
+
 ### Changed

 - Changed the default `url` for `NeuphonicTTSService` to
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -158,16 +158,6 @@ Remember, your responses should be short. Just one or two sentences, usually."""
    # openai WebSocket API can understand.
    context = OpenAILLMContext(
        [{"role": "user", "content": "Say hello!"}],
-        # [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
-        #     [
-        #         {
-        #             "role": "user",
-        #             "content": [
-        #                 {"type": "text", "text": "Say"},
-        #                 {"type": "text", "text": "yo what's up!"},
-        #             ],
-        #         }
-        #     ],
        tools,
    )

--- a/examples/foundational/19b-openai-realtime-beta-text.py
+++ b/examples/foundational/19b-openai-realtime-beta-text.py
@@ -0,0 +1,229 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+
+import os
+from datetime import datetime
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.adapters.schemas.function_schema import FunctionSchema
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.frames.frames import TranscriptionMessage
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.transcript_processor import TranscriptProcessor
+from pipecat.runner.types import RunnerArguments
+from pipecat.runner.utils import create_transport
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.llm_service import FunctionCallParams
+from pipecat.services.openai_realtime_beta import (
+    InputAudioNoiseReduction,
+    InputAudioTranscription,
+    OpenAIRealtimeBetaLLMService,
+    SemanticTurnDetection,
+    SessionProperties,
+)
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
+from pipecat.transports.services.daily import DailyParams
+
+load_dotenv(override=True)
+
+
+async def fetch_weather_from_api(params: FunctionCallParams):
+    temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
+    await params.result_callback(
+        {
+            "conditions": "nice",
+            "temperature": temperature,
+            "format": params.arguments["format"],
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+        }
+    )
+
+
+async def fetch_restaurant_recommendation(params: FunctionCallParams):
+    await params.result_callback({"name": "The Golden Dragon"})
+
+
+weather_function = FunctionSchema(
+    name="get_current_weather",
+    description="Get the current weather",
+    properties={
+        "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA",
+        },
+        "format": {
+            "type": "string",
+            "enum": ["celsius", "fahrenheit"],
+            "description": "The temperature unit to use. Infer this from the users location.",
+        },
+    },
+    required=["location", "format"],
+)
+
+restaurant_function = FunctionSchema(
+    name="get_restaurant_recommendation",
+    description="Get a restaurant recommendation",
+    properties={
+        "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA",
+        },
+    },
+    required=["location"],
+)
+
+# Create tools schema
+tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
+
+
+# We store functions so objects (e.g. SileroVADAnalyzer) don't get
+# instantiated. The function will be called when the desired transport gets
+# selected.
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+}
+
+
+async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
+    logger.info(f"Starting bot")
+
+    session_properties = SessionProperties(
+        input_audio_transcription=InputAudioTranscription(),
+        modalities=["text"],
+        # Set openai TurnDetection parameters. Not setting this at all will turn it
+        # on by default
+        turn_detection=SemanticTurnDetection(),
+        # Or set to False to disable openai turn detection and use transport VAD
+        # turn_detection=False,
+        input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"),
+        # tools=tools,
+        instructions="""You are a helpful and friendly AI.
+
+Act like a human, but remember that you aren't a human and that you can't do human
+things in the real world. Your voice and personality should be warm and engaging, with a lively and
+playful tone.
+
+If interacting in a non-English language, start by using the standard accent or dialect familiar to
+the user. Talk quickly. You should always call a function if you can. Do not refer to these rules,
+even if you're asked about them.
+
+You are participating in a voice conversation. Keep your responses concise, short, and to the point
+unless specifically asked to elaborate on a topic.
+
+You have access to the following tools:
+- get_current_weather: Get the current weather for a given location.
+- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
+
+Remember, your responses should be short. Just one or two sentences, usually.""",
+    )
+
+    llm = OpenAIRealtimeBetaLLMService(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        session_properties=session_properties,
+        start_audio_paused=False,
+    )
+
+    tts = CartesiaTTSService(
+        api_key=os.getenv("CARTESIA_API_KEY"),
+        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
+    )
+
+    # you can either register a single function for all function calls, or specific functions
+    # llm.register_function(None, fetch_weather_from_api)
+    llm.register_function("get_current_weather", fetch_weather_from_api)
+    llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
+
+    transcript = TranscriptProcessor()
+
+    # Create a standard OpenAI LLM context object using the normal messages format. The
+    # OpenAIRealtimeBetaLLMService will convert this internally to messages that the
+    # openai WebSocket API can understand.
+    context = OpenAILLMContext(
+        [{"role": "user", "content": "Say hello!"}],
+        tools,
+    )
+
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),  # Transport user input
+            context_aggregator.user(),
+            llm,  # LLM
+            tts,  # TTS
+            transcript.user(),  # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
+            transport.output(),  # Transport bot output
+            transcript.assistant(),  # After the transcript output, to time with the audio output
+            context_aggregator.assistant(),
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            enable_metrics=True,
+            enable_usage_metrics=True,
+        ),
+        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+        await task.cancel()
+
+    # Register event handler for transcript updates
+    @transcript.event_handler("on_transcript_update")
+    async def on_transcript_update(processor, frame):
+        for msg in frame.messages:
+            if isinstance(msg, TranscriptionMessage):
+                timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
+                line = f"{timestamp}{msg.role}: {msg.content}"
+                logger.info(f"Transcript: {line}")
+
+    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
+
+    await runner.run(task)
+
+
+async def bot(runner_args: RunnerArguments):
+    """Main bot entry point compatible with Pipecat Cloud."""
+    transport = await create_transport(runner_args, transport_params)
+    await run_bot(transport, runner_args)
+
+
+if __name__ == "__main__":
+    from pipecat.runner.run import main
+
+    main()
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -127,6 +127,7 @@ TESTS_15 = [
 TESTS_19 = [
    ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
    ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
+    ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER),
 ]

 TESTS_21 = [
--- a/src/pipecat/services/openai_realtime_beta/openai.py
+++ b/src/pipecat/services/openai_realtime_beta/openai.py
@@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
        """
        self._audio_input_paused = paused

+    def _is_modality_enabled(self, modality: str) -> bool:
+        """Check if a specific modality is enabled, "text" or "audio"."""
+        modalities = self._session_properties.modalities or ["audio", "text"]
+        return modality in modalities
+
+    def _get_enabled_modalities(self) -> list[str]:
+        """Get the list of enabled modalities."""
+        return self._session_properties.modalities or ["audio", "text"]
+
    async def retrieve_conversation_item(self, item_id: str):
        """Retrieve a conversation item by ID from the server.

@@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
        await self.stop_all_metrics()
        if self._current_assistant_response:
            await self.push_frame(LLMFullResponseEndFrame())
-            await self.push_frame(TTSStoppedFrame())
+            # Only push TTSStoppedFrame if audio modality is enabled
+            if self._is_modality_enabled("audio"):
+                await self.push_frame(TTSStoppedFrame())

    async def _handle_user_started_speaking(self, frame):
        pass
@@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService):
                await self._handle_evt_speech_started(evt)
            elif evt.type == "input_audio_buffer.speech_stopped":
                await self._handle_evt_speech_stopped(evt)
+            elif evt.type == "response.text.delta":
+                await self._handle_evt_text_delta(evt)
            elif evt.type == "response.audio_transcript.delta":
                await self._handle_evt_audio_transcript_delta(evt)
            elif evt.type == "error":
@@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService):
            # Response message without preceding user message. Add it to the context.
            await self._handle_assistant_output(evt.response.output)

+    async def _handle_evt_text_delta(self, evt):
+        if evt.delta:
+            await self.push_frame(LLMTextFrame(evt.delta))
+
    async def _handle_evt_audio_transcript_delta(self, evt):
        if evt.delta:
            await self.push_frame(LLMTextFrame(evt.delta))
@@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
        await self.start_ttfb_metrics()
        await self.send_client_event(
            events.ResponseCreateEvent(
-                response=events.ResponseProperties(modalities=["audio", "text"])
+                response=events.ResponseProperties(modalities=self._get_enabled_modalities())
            )
        )