diff --git a/changelog/4450.changed.md b/changelog/4450.changed.md new file mode 100644 index 000000000..88cff3421 --- /dev/null +++ b/changelog/4450.changed.md @@ -0,0 +1 @@ +- Changed the default OpenAI Realtime input audio transcription model from `gpt-4o-transcribe` to `gpt-realtime-whisper` for both `OpenAIRealtimeSTTService` and `OpenAIRealtimeLLMService`. The new model does not accept the `prompt` parameter; if a prompt is supplied alongside `gpt-realtime-whisper`, it is dropped automatically and a warning is logged. To keep using prompt hints, explicitly pin `model="gpt-4o-transcribe"` (or `"gpt-4o-mini-transcribe"`). diff --git a/examples/function-calling/function-calling-openai-async.py b/examples/function-calling/function-calling-openai-async.py index 47039711f..de0d1a014 100644 --- a/examples/function-calling/function-calling-openai-async.py +++ b/examples/function-calling/function-calling-openai-async.py @@ -29,7 +29,7 @@ from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport from pipecat.services.llm_service import FunctionCallParams from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.openai.stt import OpenAISTTService +from pipecat.services.openai.stt import OpenAIRealtimeSTTService from pipecat.services.openai.tts import OpenAITTSService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams @@ -69,13 +69,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = OpenAISTTService( - api_key=os.environ["OPENAI_API_KEY"], - settings=OpenAISTTService.Settings( - model="gpt-4o-transcribe", - prompt="Expect words related weather, such as temperature and conditions. And restaurant names.", - ), - ) + stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"]) tts = OpenAITTSService( api_key=os.environ["OPENAI_API_KEY"], diff --git a/examples/function-calling/function-calling-openai.py b/examples/function-calling/function-calling-openai.py index 2a9779742..38c9d6385 100644 --- a/examples/function-calling/function-calling-openai.py +++ b/examples/function-calling/function-calling-openai.py @@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport from pipecat.services.llm_service import FunctionCallParams from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.openai.stt import OpenAISTTService +from pipecat.services.openai.stt import OpenAIRealtimeSTTService from pipecat.services.openai.tts import OpenAITTSService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams @@ -63,13 +63,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = OpenAISTTService( - api_key=os.environ["OPENAI_API_KEY"], - settings=OpenAISTTService.Settings( - model="gpt-4o-transcribe", - prompt="Expect words related weather, such as temperature and conditions. And restaurant names.", - ), - ) + stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"]) tts = OpenAITTSService( api_key=os.environ["OPENAI_API_KEY"], diff --git a/examples/transcription/transcription-openai.py b/examples/transcription/transcription-openai.py index 5f3072177..47228ce8b 100644 --- a/examples/transcription/transcription-openai.py +++ b/examples/transcription/transcription-openai.py @@ -49,13 +49,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = OpenAIRealtimeSTTService( - api_key=os.environ["OPENAI_API_KEY"], - settings=OpenAIRealtimeSTTService.Settings( - model="gpt-4o-transcribe", - prompt="Expect words related to dogs, such as breed names.", - ), - ) + stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"]) tl = TranscriptionLogger() vad_processor = VADProcessor(vad_analyzer=SileroVADAnalyzer()) diff --git a/examples/voice/voice-openai.py b/examples/voice/voice-openai.py index 8f162671f..f5c30d681 100644 --- a/examples/voice/voice-openai.py +++ b/examples/voice/voice-openai.py @@ -25,7 +25,6 @@ from pipecat.runner.utils import create_transport from pipecat.services.openai.llm import OpenAILLMService from pipecat.services.openai.stt import OpenAIRealtimeSTTService from pipecat.services.openai.tts import OpenAITTSService -from pipecat.transcriptions.language import Language from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams @@ -53,14 +52,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = OpenAIRealtimeSTTService( - api_key=os.environ["OPENAI_API_KEY"], - settings=OpenAIRealtimeSTTService.Settings( - model="gpt-4o-transcribe", - prompt="Expect words related to dogs, such as breed names.", - language=Language.EN, - ), - ) + stt = OpenAIRealtimeSTTService(api_key=os.environ["OPENAI_API_KEY"]) tts = OpenAITTSService( api_key=os.environ["OPENAI_API_KEY"], @@ -72,7 +64,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm = OpenAILLMService( api_key=os.environ["OPENAI_API_KEY"], settings=OpenAILLMService.Settings( - system_instruction="You are very knowledgable about dogs. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", + system_instruction="You are a helpful assistant in a voice conversation. Your responses will be spoken aloud, so avoid emojis, bullet points, or other formatting that can't be spoken. Respond to what the user said in a creative, helpful, and brief way.", ), ) diff --git a/src/pipecat/services/openai/_constants.py b/src/pipecat/services/openai/_constants.py new file mode 100644 index 000000000..110c95cdb --- /dev/null +++ b/src/pipecat/services/openai/_constants.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Internal constants for OpenAI service integrations.""" + +OPENAI_SAMPLE_RATE = 24000 +OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper" diff --git a/src/pipecat/services/openai/realtime/events.py b/src/pipecat/services/openai/realtime/events.py index 621f3687f..556eaade4 100644 --- a/src/pipecat/services/openai/realtime/events.py +++ b/src/pipecat/services/openai/realtime/events.py @@ -13,6 +13,7 @@ from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field from pipecat.adapters.schemas.tools_schema import ToolsSchema +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE # # session properties @@ -34,7 +35,7 @@ class PCMAudioFormat(AudioFormat): """ type: Literal["audio/pcm"] = "audio/pcm" - rate: Literal[24000] = 24000 + rate: Literal[24000] = OPENAI_SAMPLE_RATE class PCMUAudioFormat(AudioFormat): @@ -60,20 +61,21 @@ class PCMAAudioFormat(AudioFormat): class InputAudioTranscription(BaseModel): """Configuration for audio transcription settings.""" - model: str = "gpt-4o-transcribe" + model: str = OPENAI_REALTIME_WHISPER_MODEL language: str | None prompt: str | None def __init__( self, - model: str | None = "gpt-4o-transcribe", + model: str | None = OPENAI_REALTIME_WHISPER_MODEL, language: str | None = None, prompt: str | None = None, ): """Initialize InputAudioTranscription. Args: - model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1"). + model: Transcription model to use (e.g., "gpt-realtime-whisper", + "gpt-4o-transcribe", "whisper-1"). language: Optional language code for transcription. prompt: Optional transcription hint text. """ diff --git a/src/pipecat/services/openai/realtime/llm.py b/src/pipecat/services/openai/realtime/llm.py index ae3074607..19fa0f717 100644 --- a/src/pipecat/services/openai/realtime/llm.py +++ b/src/pipecat/services/openai/realtime/llm.py @@ -52,6 +52,7 @@ from pipecat.processors.aggregators import async_tool_messages from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import FunctionCallFromLLM, LLMService +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE from pipecat.services.settings import ( NOT_GIVEN, LLMSettings, @@ -290,6 +291,8 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): if settings is not None: default_settings.apply_update(settings) + self._omit_unsupported_input_audio_transcription_prompt(default_settings.session_properties) + # Build WebSocket URL with model query parameter # Source: https://platform.openai.com/docs/guides/realtime-websocket full_url = f"{base_url}?model={default_settings.model}" @@ -330,6 +333,29 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): self._register_event_handler("on_conversation_item_updated") self._retrieve_conversation_item_futures = {} + @staticmethod + def _omit_unsupported_input_audio_transcription_prompt( + session_properties: events.SessionProperties, + ) -> bool: + """Drop input transcription prompt settings unsupported by the selected model.""" + transcription = ( + session_properties.audio.input.transcription + if session_properties.audio + and session_properties.audio.input + and session_properties.audio.input.transcription + else None + ) + if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL: + if transcription.prompt: + transcription.prompt = None + logger.warning( + f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt " + "parameter; omitting prompt from OpenAI Realtime input audio " + "transcription settings." + ) + return True + return False + def can_generate_metrics(self) -> bool: """Check if the service can generate usage metrics. @@ -487,7 +513,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): self._current_audio_response = None def _calculate_audio_duration_ms( - self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2 + self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2 ) -> int: """Calculate audio duration in milliseconds based on PCM audio parameters.""" samples = total_bytes / bytes_per_sample @@ -656,8 +682,12 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): async def _update_settings(self, delta): """Apply a settings delta, sending a session update when needed.""" changed = await super()._update_settings(delta) + prompt_omitted = self._omit_unsupported_input_audio_transcription_prompt( + assert_given(self._settings.session_properties) + ) handled = {"session_properties", "system_instruction"} - if changed.keys() & handled: + handled_settings_changed = bool(changed.keys() & handled) + if handled_settings_changed or prompt_omitted: await self._send_session_update() self._warn_unhandled_updated_settings(changed.keys() - handled) return changed @@ -816,7 +846,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): self._current_audio_response.total_size += len(audio) frame = TTSAudioRawFrame( audio=audio, - sample_rate=24000, + sample_rate=OPENAI_SAMPLE_RATE, num_channels=1, ) await self.push_frame(frame) diff --git a/src/pipecat/services/openai/stt.py b/src/pipecat/services/openai/stt.py index 6f635639a..862c0f3a6 100644 --- a/src/pipecat/services/openai/stt.py +++ b/src/pipecat/services/openai/stt.py @@ -36,6 +36,7 @@ from pipecat.frames.frames import ( VADUserStoppedSpeakingFrame, ) from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99 from pipecat.services.stt_service import WebsocketSTTService @@ -178,15 +179,13 @@ class OpenAISTTService(BaseWhisperSTTService): return await self._client.audio.transcriptions.create(**kwargs) -_OPENAI_SAMPLE_RATE = 24000 - - @dataclass class OpenAIRealtimeSTTSettings(STTSettings): """Settings for OpenAIRealtimeSTTService. Parameters: - prompt: Optional prompt text to guide transcription style. + prompt: Optional prompt text to guide transcription style. Not supported by + ``"gpt-realtime-whisper"``. noise_reduction: Noise reduction mode. ``"near_field"`` for close microphones, ``"far_field"`` for distant microphones, or ``None`` to disable. @@ -227,7 +226,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): stt = OpenAIRealtimeSTTService( api_key="sk-...", settings=OpenAIRealtimeSTTService.Settings( - model="gpt-4o-transcribe", + model="gpt-realtime-whisper", noise_reduction="near_field", ), ) @@ -255,7 +254,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): Args: api_key: OpenAI API key for authentication. - model: Transcription model. Supported values are + model: Transcription model. For low-latency streaming + transcription, use ``"gpt-realtime-whisper"``. Other + supported transcription models include ``"gpt-4o-transcribe"`` and ``"gpt-4o-mini-transcribe"``. .. deprecated:: 0.0.105 @@ -269,7 +270,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): Use ``settings=OpenAIRealtimeSTTService.Settings(language=...)`` instead. prompt: Optional prompt text to guide transcription style - or provide keyword hints. + or provide keyword hints. Not supported by + ``"gpt-realtime-whisper"``. .. deprecated:: 0.0.105 Use ``settings=OpenAIRealtimeSTTService.Settings(prompt=...)`` instead. @@ -303,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): # --- 1. Hardcoded defaults --- default_settings = self.Settings( - model="gpt-4o-transcribe", + model=OPENAI_REALTIME_WHISPER_MODEL, language=Language.EN, prompt=None, noise_reduction=None, @@ -329,6 +331,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): if settings is not None: default_settings.apply_update(settings) + self._omit_unsupported_prompt(default_settings) + super().__init__( ttfs_p99_latency=ttfs_p99_latency, settings=default_settings, @@ -349,6 +353,19 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): # Set to None or a dict to enable server-side VAD. self._server_vad_enabled = turn_detection is not False + @staticmethod + def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]: + """Drop prompt settings that are not accepted by the selected model.""" + if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt: + old_prompt = settings.prompt + settings.prompt = None + logger.warning( + f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; " + "omitting prompt from OpenAI Realtime transcription session." + ) + return {"prompt": old_prompt} + return {} + @staticmethod def _language_to_code(language: Language) -> str: """Convert a Language enum value to an ISO-639-1 code. @@ -382,6 +399,8 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): Dict mapping changed field names to their previous values. """ changed = await super()._update_settings(delta) + for field, previous_value in self._omit_unsupported_prompt(self._settings).items(): + changed.setdefault(field, previous_value) if changed and self._session_ready: await self._send_session_update() @@ -550,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): input_audio: dict = { "format": { "type": "audio/pcm", - "rate": _OPENAI_SAMPLE_RATE, + "rate": OPENAI_SAMPLE_RATE, }, "transcription": transcription, } @@ -587,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): Args: audio: Raw audio bytes at the pipeline sample rate. """ - audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE) + audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE) if not audio: return payload = base64.b64encode(audio).decode("utf-8") @@ -676,9 +695,9 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): async def _handle_transcription_delta(self, evt: dict): """Handle incremental transcription text. - For ``gpt-4o-transcribe`` and ``gpt-4o-mini-transcribe``, deltas - contain streaming partial text. For ``whisper-1``, each delta - contains the full turn transcript. + For ``gpt-realtime-whisper``, ``gpt-4o-transcribe``, and + ``gpt-4o-mini-transcribe``, deltas contain low-latency streaming + partial text. Args: evt: The delta event from the server. diff --git a/src/pipecat/services/openai/tts.py b/src/pipecat/services/openai/tts.py index a6528f59e..e6ebb4dd6 100644 --- a/src/pipecat/services/openai/tts.py +++ b/src/pipecat/services/openai/tts.py @@ -24,6 +24,7 @@ from pipecat.frames.frames import ( StartFrame, TTSAudioRawFrame, ) +from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given from pipecat.services.tts_service import TTSService from pipecat.utils.tracing.service_decorators import traced_tts @@ -85,8 +86,6 @@ class OpenAITTSService(TTSService): Settings = OpenAITTSSettings _settings: Settings - OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz - class InputParams(BaseModel): """Input parameters for OpenAI TTS configuration. @@ -150,9 +149,9 @@ class OpenAITTSService(TTSService): parameters, ``settings`` values take precedence. **kwargs: Additional keyword arguments passed to TTSService. """ - if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE: + if sample_rate and sample_rate != OPENAI_SAMPLE_RATE: logger.warning( - f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. " + f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. " f"Current rate of {sample_rate}Hz may cause issues." ) @@ -217,9 +216,9 @@ class OpenAITTSService(TTSService): frame: The start frame containing initialization parameters. """ await super().start(frame) - if self.sample_rate != self.OPENAI_SAMPLE_RATE: + if self.sample_rate != OPENAI_SAMPLE_RATE: logger.warning( - f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. " + f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. " f"Current rate of {self.sample_rate}Hz may cause issues." ) diff --git a/tests/test_settings.py b/tests/test_settings.py index 6c45d53f5..ab76a8c6c 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -12,8 +12,12 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings from pipecat.services.inworld.realtime import events as inworld_events from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL from pipecat.services.openai.realtime import events -from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMSettings +from pipecat.services.openai.realtime.llm import ( + OpenAIRealtimeLLMService, + OpenAIRealtimeLLMSettings, +) from pipecat.services.settings import ( NOT_GIVEN, LLMSettings, @@ -747,6 +751,48 @@ class TestOpenAIRealtimeSettingsApplyUpdate: assert store.session_properties.instructions == "Keep me." +class TestOpenAIRealtimeSessionProperties: + def test_realtime_whisper_prompt_is_omitted(self): + """gpt-realtime-whisper does not support input transcription prompt.""" + session_properties = events.SessionProperties( + audio=events.AudioConfiguration( + input=events.AudioInput( + transcription=events.InputAudioTranscription( + model=OPENAI_REALTIME_WHISPER_MODEL, + prompt="Keywords: metoprolol", + ) + ) + ) + ) + + changed = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt( + session_properties + ) + + assert changed is True + assert session_properties.audio.input.transcription.prompt is None + + def test_supported_transcription_model_keeps_prompt(self): + """Other input transcription models can keep prompt settings.""" + session_properties = events.SessionProperties( + audio=events.AudioConfiguration( + input=events.AudioInput( + transcription=events.InputAudioTranscription( + model="gpt-4o-transcribe", + prompt="Keywords: metoprolol", + ) + ) + ) + ) + + changed = OpenAIRealtimeLLMService._omit_unsupported_input_audio_transcription_prompt( + session_properties + ) + + assert changed is False + assert session_properties.audio.input.transcription.prompt == "Keywords: metoprolol" + + # --------------------------------------------------------------------------- # OpenAIRealtimeLLMSettings: from_mapping # ---------------------------------------------------------------------------