Merge pull request #4450 from pipecat-ai/mb/gpt-realtime-whisper

Default OpenAI Realtime transcription to gpt-realtime-whisper
2026-05-13 09:48:33 -04:00
parent 9148e307cc 3e8c5c08f4
commit 5fef239b68
11 changed files with 141 additions and 60 deletions
--- a/changelog/4450.changed.md
+++ b/changelog/4450.changed.md
@@ -0,0 +1 @@
+- Changed the default OpenAI Realtime input audio transcription model from `gpt-4o-transcribe` to `gpt-realtime-whisper` for both `OpenAIRealtimeSTTService` and `OpenAIRealtimeLLMService`. The new model does not accept the `prompt` parameter; if a prompt is supplied alongside `gpt-realtime-whisper`, it is dropped automatically and a warning is logged. To keep using prompt hints, explicitly pin `model="gpt-4o-transcribe"` (or `"gpt-4o-mini-transcribe"`).
--- a/examples/function-calling/function-calling-openai-async.py
+++ b/examples/function-calling/function-calling-openai-async.py
@@ -29,7 +29,7 @@ from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.llm_service import FunctionCallParams
 from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.services.openai.stt import OpenAISTTService
+from pipecat.services.openai.stt import OpenAIRealtimeSTTService
 from pipecat.services.openai.tts import OpenAITTSService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
@@ -69,13 +69,7 @@ transport_params = {
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

-    stt = OpenAISTTService(
-        api_key=os.environ["OPENAI_API_KEY"],
-        settings=OpenAISTTService.Settings(
-            model="gpt-4o-transcribe",
-            prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
-        ),
-    )
+    stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])

    tts = OpenAITTSService(
        api_key=os.environ["OPENAI_API_KEY"],
--- a/examples/function-calling/function-calling-openai.py
+++ b/examples/function-calling/function-calling-openai.py
@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
 from pipecat.services.llm_service import FunctionCallParams
 from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.services.openai.stt import OpenAISTTService
+from pipecat.services.openai.stt import OpenAIRealtimeSTTService
 from pipecat.services.openai.tts import OpenAITTSService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
@@ -63,13 +63,7 @@ transport_params = {
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

-    stt = OpenAISTTService(
-        api_key=os.environ["OPENAI_API_KEY"],
-        settings=OpenAISTTService.Settings(
-            model="gpt-4o-transcribe",
-            prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
-        ),
-    )
+    stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])

    tts = OpenAITTSService(
        api_key=os.environ["OPENAI_API_KEY"],
--- a/examples/transcription/transcription-openai.py
+++ b/examples/transcription/transcription-openai.py
@@ -49,13 +49,7 @@ transport_params = {
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

-    stt = OpenAIRealtimeSTTService(
-        api_key=os.environ["OPENAI_API_KEY"],
-        settings=OpenAIRealtimeSTTService.Settings(
-            model="gpt-4o-transcribe",
-            prompt="Expect words related to dogs, such as breed names.",
-        ),
-    )
+    stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])

    tl = TranscriptionLogger()
    vad_processor = VADProcessor(vad_analyzer=SileroVADAnalyzer())
--- a/examples/voice/voice-openai.py
+++ b/examples/voice/voice-openai.py
@@ -25,7 +25,6 @@ from pipecat.runner.utils import create_transport
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.openai.stt import OpenAIRealtimeSTTService
 from pipecat.services.openai.tts import OpenAITTSService
-from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
 from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -53,14 +52,7 @@ transport_params = {
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    logger.info(f"Starting bot")

-    stt = OpenAIRealtimeSTTService(
-        api_key=os.environ["OPENAI_API_KEY"],
-        settings=OpenAIRealtimeSTTService.Settings(
-            model="gpt-4o-transcribe",
-            prompt="Expect words related to dogs, such as breed names.",
-            language=Language.EN,
-        ),
-    )
+    stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])

    tts = OpenAITTSService(
        api_key=os.environ["OPENAI_API_KEY"],
@@ -72,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    llm = OpenAILLMService(
        api_key=os.environ["OPENAI_API_KEY"],
        settings=OpenAILLMService.Settings(
-            system_instruction="You are very knowledgable about dogs. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
+            system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
        ),
    )

--- a/src/pipecat/services/openai/_constants.py
+++ b/src/pipecat/services/openai/_constants.py
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2024-2026, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Internal constants for OpenAI service integrations."""
+
+OPENAI_SAMPLE_RATE = 24000
+OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
--- a/src/pipecat/services/openai/realtime/events.py
+++ b/src/pipecat/services/openai/realtime/events.py
@@ -13,6 +13,7 @@ from typing import Any, Literal
 from pydantic import BaseModel, ConfigDict, Field

 from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE

 #
 # session properties
@@ -34,7 +35,7 @@ class PCMAudioFormat(AudioFormat):
    """

    type: Literal["audio/pcm"] = "audio/pcm"
-    rate: Literal[24000] = 24000
+    rate: Literal[24000] = OPENAI_SAMPLE_RATE


 class PCMUAudioFormat(AudioFormat):
@@ -60,20 +61,21 @@ class PCMAAudioFormat(AudioFormat):
 class InputAudioTranscription(BaseModel):
    """Configuration for audio transcription settings."""

-    model: str = "gpt-4o-transcribe"
+    model: str = OPENAI_REALTIME_WHISPER_MODEL
    language: str | None
    prompt: str | None

    def __init__(
        self,
-        model: str | None = "gpt-4o-transcribe",
+        model: str | None = OPENAI_REALTIME_WHISPER_MODEL,
        language: str | None = None,
        prompt: str | None = None,
    ):
        """Initialize InputAudioTranscription.

        Args:
-            model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
+            model: Transcription model to use (e.g., "gpt-realtime-whisper",
+                "gpt-4o-transcribe", "whisper-1").
            language: Optional language code for transcription.
            prompt: Optional transcription hint text.
        """
--- a/src/pipecat/services/openai/realtime/llm.py
+++ b/src/pipecat/services/openai/realtime/llm.py
@@ -52,6 +52,7 @@ from pipecat.processors.aggregators import async_tool_messages
 from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
 from pipecat.services.settings import (
    NOT_GIVEN,
    LLMSettings,
@@ -290,6 +291,8 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        if settings is not None:
            default_settings.apply_update(settings)

+        self._omit_unsupported_input_audio_transcription_prompt(default_settings.session_properties)
+
        # Build WebSocket URL with model query parameter
        # Source: https://platform.openai.com/docs/guides/realtime-websocket
        full_url = f"{base_url}?model={default_settings.model}"
@@ -330,6 +333,29 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        self._register_event_handler("on_conversation_item_updated")
        self._retrieve_conversation_item_futures = {}

+    @staticmethod
+    def _omit_unsupported_input_audio_transcription_prompt(
+        session_properties: events.SessionProperties,
+    ) -> bool:
+        """Drop input transcription prompt settings unsupported by the selected model."""
+        transcription = (
+            session_properties.audio.input.transcription
+            if session_properties.audio
+            and session_properties.audio.input
+            and session_properties.audio.input.transcription
+            else None
+        )
+        if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL:
+            if transcription.prompt:
+                transcription.prompt = None
+                logger.warning(
+                    f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt "
+                    "parameter; omitting prompt from OpenAI Realtime input audio "
+                    "transcription settings."
+                )
+                return True
+        return False
+
    def can_generate_metrics(self) -> bool:
        """Check if the service can generate usage metrics.

@@ -487,7 +513,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        self._current_audio_response = None

    def _calculate_audio_duration_ms(
-        self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
+        self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2
    ) -> int:
        """Calculate audio duration in milliseconds based on PCM audio parameters."""
        samples = total_bytes / bytes_per_sample
@@ -656,8 +682,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
    async def _update_settings(self, delta):
        """Apply a settings delta, sending a session update when needed."""
        changed = await super()._update_settings(delta)
+        prompt_omitted = self._omit_unsupported_input_audio_transcription_prompt(
+            assert_given(self._settings.session_properties)
+        )
        handled = {"session_properties", "system_instruction"}
-        if changed.keys() & handled:
+        handled_settings_changed = bool(changed.keys() & handled)
+        if handled_settings_changed or prompt_omitted:
            await self._send_session_update()
        self._warn_unhandled_updated_settings(changed.keys() - handled)
        return changed
@@ -816,7 +846,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        self._current_audio_response.total_size += len(audio)
        frame = TTSAudioRawFrame(
            audio=audio,
-            sample_rate=24000,
+            sample_rate=OPENAI_SAMPLE_RATE,
            num_channels=1,
        )
        await self.push_frame(frame)
--- a/src/pipecat/services/openai/stt.py
+++ b/src/pipecat/services/openai/stt.py
@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
    VADUserStoppedSpeakingFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
 from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given
 from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99
 from pipecat.services.stt_service import WebsocketSTTService
@@ -178,15 +179,13 @@ class OpenAISTTService(BaseWhisperSTTService):
        return await self._client.audio.transcriptions.create(**kwargs)


-_OPENAI_SAMPLE_RATE = 24000
-
-
@dataclass
 class OpenAIRealtimeSTTSettings(STTSettings):
    """Settings for OpenAIRealtimeSTTService.

    Parameters:
-        prompt: Optional prompt text to guide transcription style.
+        prompt: Optional prompt text to guide transcription style. Not supported by
+            ``"gpt-realtime-whisper"``.
        noise_reduction: Noise reduction mode. ``"near_field"`` for close
            microphones, ``"far_field"`` for distant microphones, or ``None``
            to disable.
@@ -227,7 +226,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        stt = OpenAIRealtimeSTTService(
            api_key="sk-...",
            settings=OpenAIRealtimeSTTService.Settings(
-                model="gpt-4o-transcribe",
+                model="gpt-realtime-whisper",
                noise_reduction="near_field",
            ),
        )
@@ -255,7 +254,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):

        Args:
            api_key: OpenAI API key for authentication.
-            model: Transcription model. Supported values are
+            model: Transcription model. For low-latency streaming
+                transcription, use ``"gpt-realtime-whisper"``. Other
+                supported transcription models include
                ``"gpt-4o-transcribe"`` and ``"gpt-4o-mini-transcribe"``.

                .. deprecated:: 0.0.105
@@ -269,7 +270,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
                    Use ``settings=OpenAIRealtimeSTTService.Settings(language=...)`` instead.

            prompt: Optional prompt text to guide transcription style
-                or provide keyword hints.
+                or provide keyword hints. Not supported by
+                ``"gpt-realtime-whisper"``.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAIRealtimeSTTService.Settings(prompt=...)`` instead.
@@ -303,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):

        # --- 1. Hardcoded defaults ---
        default_settings = self.Settings(
-            model="gpt-4o-transcribe",
+            model=OPENAI_REALTIME_WHISPER_MODEL,
            language=Language.EN,
            prompt=None,
            noise_reduction=None,
@@ -329,6 +331,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        if settings is not None:
            default_settings.apply_update(settings)

+        self._omit_unsupported_prompt(default_settings)
+
        super().__init__(
            ttfs_p99_latency=ttfs_p99_latency,
            settings=default_settings,
@@ -349,6 +353,19 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        # Set to None or a dict to enable server-side VAD.
        self._server_vad_enabled = turn_detection is not False

+    @staticmethod
+    def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]:
+        """Drop prompt settings that are not accepted by the selected model."""
+        if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
+            old_prompt = settings.prompt
+            settings.prompt = None
+            logger.warning(
+                f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
+                "omitting prompt from OpenAI Realtime transcription session."
+            )
+            return {"prompt": old_prompt}
+        return {}
+
    @staticmethod
    def _language_to_code(language: Language) -> str:
        """Convert a Language enum value to an ISO-639-1 code.
@@ -382,6 +399,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
            Dict mapping changed field names to their previous values.
        """
        changed = await super()._update_settings(delta)
+        for field, previous_value in self._omit_unsupported_prompt(self._settings).items():
+            changed.setdefault(field, previous_value)

        if changed and self._session_ready:
            await self._send_session_update()
@@ -550,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        input_audio: dict = {
            "format": {
                "type": "audio/pcm",
-                "rate": _OPENAI_SAMPLE_RATE,
+                "rate": OPENAI_SAMPLE_RATE,
            },
            "transcription": transcription,
        }
@@ -587,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        Args:
            audio: Raw audio bytes at the pipeline sample rate.
        """
-        audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE)
+        audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE)
        if not audio:
            return
        payload = base64.b64encode(audio).decode("utf-8")
@@ -676,9 +695,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
    async def _handle_transcription_delta(self, evt: dict):
        """Handle incremental transcription text.

-        For ``gpt-4o-transcribe`` and ``gpt-4o-mini-transcribe``, deltas
-        contain streaming partial text. For ``whisper-1``, each delta
-        contains the full turn transcript.
+        For ``gpt-realtime-whisper``, ``gpt-4o-transcribe``, and
+        ``gpt-4o-mini-transcribe``, deltas contain low-latency streaming partial
+        text. For ``whisper-1``, each delta contains the full turn transcript.

        Args:
            evt: The delta event from the server.
--- a/src/pipecat/services/openai/tts.py
+++ b/src/pipecat/services/openai/tts.py
@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
    StartFrame,
    TTSAudioRawFrame,
 )
+from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
 from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
 from pipecat.services.tts_service import TTSService
 from pipecat.utils.tracing.service_decorators import traced_tts
@@ -85,8 +86,6 @@ class OpenAITTSService(TTSService):
    Settings = OpenAITTSSettings
    _settings: Settings

-    OPENAI_SAMPLE_RATE = 24000  # OpenAI TTS always outputs at 24kHz
-
    class InputParams(BaseModel):
        """Input parameters for OpenAI TTS configuration.

@@ -150,9 +149,9 @@ class OpenAITTSService(TTSService):
                parameters, ``settings`` values take precedence.
            **kwargs: Additional keyword arguments passed to TTSService.
        """
-        if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
+        if sample_rate and sample_rate != OPENAI_SAMPLE_RATE:
            logger.warning(
-                f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
+                f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )

@@ -217,9 +216,9 @@ class OpenAITTSService(TTSService):
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
-        if self.sample_rate != self.OPENAI_SAMPLE_RATE:
+        if self.sample_rate != OPENAI_SAMPLE_RATE:
            logger.warning(
-                f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
+                f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

--- a/tests/test_settings.py
+++ b/tests/test_settings.py
@@ -12,8 +12,12 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings
 from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
 from pipecat.services.inworld.realtime import events as inworld_events
 from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL
 from pipecat.services.openai.realtime import events
-from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMSettings
+from pipecat.services.openai.realtime.llm import (
+    OpenAIRealtimeLLMService,
+    OpenAIRealtimeLLMSettings,
+)
 from pipecat.services.settings import (
    NOT_GIVEN,
    LLMSettings,
@@ -747,6 +751,48 @@ class TestOpenAIRealtimeSettingsApplyUpdate:
        assert store.session_properties.instructions == "Keep me."


+class TestOpenAIRealtimeSessionProperties:
+    def test_realtime_whisper_prompt_is_omitted(self):
+        """gpt-realtime-whisper does not support input transcription prompt."""
+        session_properties = events.SessionProperties(
+            audio=events.AudioConfiguration(
+                input=events.AudioInput(
+                    transcription=events.InputAudioTranscription(
+                        model=OPENAI_REALTIME_WHISPER_MODEL,
+                        prompt="Keywords: metoprolol",
+                    )
+                )
+            )
+        )
+
+        changed = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt(
+            session_properties
+        )
+
+        assert changed is True
+        assert session_properties.audio.input.transcription.prompt is None
+
+    def test_supported_transcription_model_keeps_prompt(self):
+        """Other input transcription models can keep prompt settings."""
+        session_properties = events.SessionProperties(
+            audio=events.AudioConfiguration(
+                input=events.AudioInput(
+                    transcription=events.InputAudioTranscription(
+                        model="gpt-4o-transcribe",
+                        prompt="Keywords: metoprolol",
+                    )
+                )
+            )
+        )
+
+        changed = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt(
+            session_properties
+        )
+
+        assert changed is False
+        assert session_properties.audio.input.transcription.prompt == "Keywords: metoprolol"
+
+
 # ---------------------------------------------------------------------------
 # OpenAIRealtimeLLMSettings: from_mapping
 # ---------------------------------------------------------------------------
				`@@ -0,0 +1 @@`
				- Changed the default OpenAI Realtime input audio transcription model from `gpt-4o-transcribe` to `gpt-realtime-whisper` for both `OpenAIRealtimeSTTService` and `OpenAIRealtimeLLMService`. The new model does not accept the `prompt` parameter; if a prompt is supplied alongside `gpt-realtime-whisper`, it is dropped automatically and a warning is logged. To keep using prompt hints, explicitly pin `model="gpt-4o-transcribe"` (or `"gpt-4o-mini-transcribe"`).