Merge pull request #4450 from pipecat-ai/mb/gpt-realtime-whisper

Default OpenAI Realtime transcription to gpt-realtime-whisper
This commit is contained in:
Mark Backman
2026-05-13 09:48:33 -04:00
committed by GitHub
11 changed files with 141 additions and 60 deletions

View File

@@ -0,0 +1 @@
- Changed the default OpenAI Realtime input audio transcription model from `gpt-4o-transcribe` to `gpt-realtime-whisper` for both `OpenAIRealtimeSTTService` and `OpenAIRealtimeLLMService`. The new model does not accept the `prompt` parameter; if a prompt is supplied alongside `gpt-realtime-whisper`, it is dropped automatically and a warning is logged. To keep using prompt hints, explicitly pin `model="gpt-4o-transcribe"` (or `"gpt-4o-mini-transcribe"`).

View File

@@ -29,7 +29,7 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -69,13 +69,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = OpenAISTTService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAISTTService.Settings(
model="gpt-4o-transcribe",
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
),
)
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
tts = OpenAITTSService(
api_key=os.environ["OPENAI_API_KEY"],

View File

@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
@@ -63,13 +63,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = OpenAISTTService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAISTTService.Settings(
model="gpt-4o-transcribe",
prompt="Expect words related weather, such as temperature and conditions. And restaurant names.",
),
)
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
tts = OpenAITTSService(
api_key=os.environ["OPENAI_API_KEY"],

View File

@@ -49,13 +49,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = OpenAIRealtimeSTTService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAIRealtimeSTTService.Settings(
model="gpt-4o-transcribe",
prompt="Expect words related to dogs, such as breed names.",
),
)
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
tl = TranscriptionLogger()
vad_processor = VADProcessor(vad_analyzer=SileroVADAnalyzer())

View File

@@ -25,7 +25,6 @@ from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAIRealtimeSTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -53,14 +52,7 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = OpenAIRealtimeSTTService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAIRealtimeSTTService.Settings(
model="gpt-4o-transcribe",
prompt="Expect words related to dogs, such as breed names.",
language=Language.EN,
),
)
stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"])
tts = OpenAITTSService(
api_key=os.environ["OPENAI_API_KEY"],
@@ -72,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
llm = OpenAILLMService(
api_key=os.environ["OPENAI_API_KEY"],
settings=OpenAILLMService.Settings(
system_instruction="You are very knowledgable about dogs. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.",
),
)

View File

@@ -0,0 +1,10 @@
#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Internal constants for OpenAI service integrations."""
# Audio sample rate, in Hz, shared by the OpenAI integrations: it is the PCM
# rate used for Realtime audio and the rate the OpenAI TTS service expects.
OPENAI_SAMPLE_RATE = 24000
# Model identifier used as the default for OpenAI Realtime input audio
# transcription. Callers compare against it to detect the model that does not
# accept a transcription ``prompt``.
OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"

View File

@@ -13,6 +13,7 @@ from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
#
# session properties
@@ -34,7 +35,7 @@ class PCMAudioFormat(AudioFormat):
"""
type: Literal["audio/pcm"] = "audio/pcm"
rate: Literal[24000] = 24000
rate: Literal[24000] = OPENAI_SAMPLE_RATE
class PCMUAudioFormat(AudioFormat):
@@ -60,20 +61,21 @@ class PCMAAudioFormat(AudioFormat):
class InputAudioTranscription(BaseModel):
"""Configuration for audio transcription settings."""
model: str = "gpt-4o-transcribe"
model: str = OPENAI_REALTIME_WHISPER_MODEL
language: str | None
prompt: str | None
def __init__(
self,
model: str | None = "gpt-4o-transcribe",
model: str | None = OPENAI_REALTIME_WHISPER_MODEL,
language: str | None = None,
prompt: str | None = None,
):
"""Initialize InputAudioTranscription.
Args:
model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
model: Transcription model to use (e.g., "gpt-realtime-whisper",
"gpt-4o-transcribe", "whisper-1").
language: Optional language code for transcription.
prompt: Optional transcription hint text.
"""

View File

@@ -52,6 +52,7 @@ from pipecat.processors.aggregators import async_tool_messages
from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
from pipecat.services.settings import (
NOT_GIVEN,
LLMSettings,
@@ -290,6 +291,8 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
if settings is not None:
default_settings.apply_update(settings)
self._omit_unsupported_input_audio_transcription_prompt(default_settings.session_properties)
# Build WebSocket URL with model query parameter
# Source: https://platform.openai.com/docs/guides/realtime-websocket
full_url = f"{base_url}?model={default_settings.model}"
@@ -330,6 +333,29 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
self._register_event_handler("on_conversation_item_updated")
self._retrieve_conversation_item_futures = {}
@staticmethod
def _omit_unsupported_input_audio_transcription_prompt(
    session_properties: events.SessionProperties,
) -> bool:
    """Clear the input transcription prompt when the selected model rejects it.

    Walks ``session_properties.audio.input.transcription`` and, if the
    configured model is the realtime whisper model and a prompt is set,
    removes the prompt in place and logs a warning.

    Args:
        session_properties: Session properties to inspect; modified in place
            when a prompt has to be removed.

    Returns:
        True if a prompt was removed, False otherwise.
    """
    # Narrow step by step; any missing level means there is nothing to check.
    audio = session_properties.audio
    audio_input = audio.input if audio else None
    transcription = audio_input.transcription if audio_input else None
    if not transcription:
        return False

    if transcription.model != OPENAI_REALTIME_WHISPER_MODEL:
        return False

    if not transcription.prompt:
        return False

    transcription.prompt = None
    logger.warning(
        f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt "
        "parameter; omitting prompt from OpenAI Realtime input audio "
        "transcription settings."
    )
    return True
def can_generate_metrics(self) -> bool:
"""Check if the service can generate usage metrics.
@@ -487,7 +513,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
self._current_audio_response = None
def _calculate_audio_duration_ms(
self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2
) -> int:
"""Calculate audio duration in milliseconds based on PCM audio parameters."""
samples = total_bytes / bytes_per_sample
@@ -656,8 +682,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
async def _update_settings(self, delta):
"""Apply a settings delta, sending a session update when needed."""
changed = await super()._update_settings(delta)
prompt_omitted = self._omit_unsupported_input_audio_transcription_prompt(
assert_given(self._settings.session_properties)
)
handled = {"session_properties", "system_instruction"}
if changed.keys() & handled:
handled_settings_changed = bool(changed.keys() & handled)
if handled_settings_changed or prompt_omitted:
await self._send_session_update()
self._warn_unhandled_updated_settings(changed.keys() - handled)
return changed
@@ -816,7 +846,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
self._current_audio_response.total_size += len(audio)
frame = TTSAudioRawFrame(
audio=audio,
sample_rate=24000,
sample_rate=OPENAI_SAMPLE_RATE,
num_channels=1,
)
await self.push_frame(frame)

View File

@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given
from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99
from pipecat.services.stt_service import WebsocketSTTService
@@ -178,15 +179,13 @@ class OpenAISTTService(BaseWhisperSTTService):
return await self._client.audio.transcriptions.create(**kwargs)
_OPENAI_SAMPLE_RATE = 24000
@dataclass
class OpenAIRealtimeSTTSettings(STTSettings):
"""Settings for OpenAIRealtimeSTTService.
Parameters:
prompt: Optional prompt text to guide transcription style.
prompt: Optional prompt text to guide transcription style. Not supported by
``"gpt-realtime-whisper"``.
noise_reduction: Noise reduction mode. ``"near_field"`` for close
microphones, ``"far_field"`` for distant microphones, or ``None``
to disable.
@@ -227,7 +226,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
stt = OpenAIRealtimeSTTService(
api_key="sk-...",
settings=OpenAIRealtimeSTTService.Settings(
model="gpt-4o-transcribe",
model="gpt-realtime-whisper",
noise_reduction="near_field",
),
)
@@ -255,7 +254,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
Args:
api_key: OpenAI API key for authentication.
model: Transcription model. Supported values are
model: Transcription model. For low-latency streaming
transcription, use ``"gpt-realtime-whisper"``. Other
supported transcription models include
``"gpt-4o-transcribe"`` and ``"gpt-4o-mini-transcribe"``.
.. deprecated:: 0.0.105
@@ -269,7 +270,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
Use ``settings=OpenAIRealtimeSTTService.Settings(language=...)`` instead.
prompt: Optional prompt text to guide transcription style
or provide keyword hints.
or provide keyword hints. Not supported by
``"gpt-realtime-whisper"``.
.. deprecated:: 0.0.105
Use ``settings=OpenAIRealtimeSTTService.Settings(prompt=...)`` instead.
@@ -303,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
# --- 1. Hardcoded defaults ---
default_settings = self.Settings(
model="gpt-4o-transcribe",
model=OPENAI_REALTIME_WHISPER_MODEL,
language=Language.EN,
prompt=None,
noise_reduction=None,
@@ -329,6 +331,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
if settings is not None:
default_settings.apply_update(settings)
self._omit_unsupported_prompt(default_settings)
super().__init__(
ttfs_p99_latency=ttfs_p99_latency,
settings=default_settings,
@@ -349,6 +353,19 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
# Set to None or a dict to enable server-side VAD.
self._server_vad_enabled = turn_detection is not False
@staticmethod
def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]:
    """Remove the prompt from ``settings`` when the selected model rejects it.

    Args:
        settings: Settings to inspect; ``prompt`` is reset to ``None`` in
            place when the model is the realtime whisper model.

    Returns:
        A mapping from ``"prompt"`` to the value that was removed, or an
        empty dict when nothing was changed.
    """
    prompt_is_supported = settings.model != OPENAI_REALTIME_WHISPER_MODEL
    if prompt_is_supported or not settings.prompt:
        return {}

    # Capture the previous value before clearing it so it can be reported.
    removed = {"prompt": settings.prompt}
    settings.prompt = None
    logger.warning(
        f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
        "omitting prompt from OpenAI Realtime transcription session."
    )
    return removed
@staticmethod
def _language_to_code(language: Language) -> str:
"""Convert a Language enum value to an ISO-639-1 code.
@@ -382,6 +399,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
Dict mapping changed field names to their previous values.
"""
changed = await super()._update_settings(delta)
for field, previous_value in self._omit_unsupported_prompt(self._settings).items():
changed.setdefault(field, previous_value)
if changed and self._session_ready:
await self._send_session_update()
@@ -550,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
input_audio: dict = {
"format": {
"type": "audio/pcm",
"rate": _OPENAI_SAMPLE_RATE,
"rate": OPENAI_SAMPLE_RATE,
},
"transcription": transcription,
}
@@ -587,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
Args:
audio: Raw audio bytes at the pipeline sample rate.
"""
audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE)
audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE)
if not audio:
return
payload = base64.b64encode(audio).decode("utf-8")
@@ -676,9 +695,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
async def _handle_transcription_delta(self, evt: dict):
"""Handle incremental transcription text.
For ``gpt-4o-transcribe`` and ``gpt-4o-mini-transcribe``, deltas
contain streaming partial text. For ``whisper-1``, each delta
contains the full turn transcript.
For ``gpt-realtime-whisper``, ``gpt-4o-transcribe``, and
``gpt-4o-mini-transcribe``, deltas contain low-latency streaming
partial text.
Args:
evt: The delta event from the server.

View File

@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
StartFrame,
TTSAudioRawFrame,
)
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
from pipecat.services.tts_service import TTSService
from pipecat.utils.tracing.service_decorators import traced_tts
@@ -85,8 +86,6 @@ class OpenAITTSService(TTSService):
Settings = OpenAITTSSettings
_settings: Settings
OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
class InputParams(BaseModel):
"""Input parameters for OpenAI TTS configuration.
@@ -150,9 +149,9 @@ class OpenAITTSService(TTSService):
parameters, ``settings`` values take precedence.
**kwargs: Additional keyword arguments passed to TTSService.
"""
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
if sample_rate and sample_rate != OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {sample_rate}Hz may cause issues."
)
@@ -217,9 +216,9 @@ class OpenAITTSService(TTSService):
frame: The start frame containing initialization parameters.
"""
await super().start(frame)
if self.sample_rate != self.OPENAI_SAMPLE_RATE:
if self.sample_rate != OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {self.sample_rate}Hz may cause issues."
)

View File

@@ -12,8 +12,12 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
from pipecat.services.inworld.realtime import events as inworld_events
from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL
from pipecat.services.openai.realtime import events
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMSettings
from pipecat.services.openai.realtime.llm import (
OpenAIRealtimeLLMService,
OpenAIRealtimeLLMSettings,
)
from pipecat.services.settings import (
NOT_GIVEN,
LLMSettings,
@@ -747,6 +751,48 @@ class TestOpenAIRealtimeSettingsApplyUpdate:
assert store.session_properties.instructions == "Keep me."
class TestOpenAIRealtimeSessionProperties:
    """Checks for dropping unsupported input transcription prompts."""

    # Prompt text shared by both scenarios.
    _PROMPT = "Keywords: metoprolol"

    @staticmethod
    def _session_properties_for(model, prompt):
        """Build session properties carrying only input transcription settings."""
        transcription = events.InputAudioTranscription(model=model, prompt=prompt)
        audio_input = events.AudioInput(transcription=transcription)
        return events.SessionProperties(audio=events.AudioConfiguration(input=audio_input))

    def test_realtime_whisper_prompt_is_omitted(self):
        """gpt-realtime-whisper does not support input transcription prompt."""
        props = self._session_properties_for(OPENAI_REALTIME_WHISPER_MODEL, self._PROMPT)

        was_omitted = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt(
            props
        )

        assert was_omitted is True
        assert props.audio.input.transcription.prompt is None

    def test_supported_transcription_model_keeps_prompt(self):
        """Other input transcription models can keep prompt settings."""
        props = self._session_properties_for("gpt-4o-transcribe", self._PROMPT)

        was_omitted = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt(
            props
        )

        assert was_omitted is False
        assert props.audio.input.transcription.prompt == "Keywords: metoprolol"
# ---------------------------------------------------------------------------
# OpenAIRealtimeLLMSettings: from_mapping
# ---------------------------------------------------------------------------