From 644030584fb3ee9238f7f00fbfe2ed492022db4e Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 12 May 2026 15:38:20 -0400 Subject: [PATCH] Centralize OpenAI audio constants --- src/pipecat/services/openai/_constants.py | 14 ++++++++++++++ src/pipecat/services/openai/realtime/events.py | 9 ++++----- src/pipecat/services/openai/realtime/llm.py | 9 +++++---- src/pipecat/services/openai/stt.py | 15 ++++++--------- src/pipecat/services/openai/tts.py | 11 +++++------ tests/test_settings.py | 3 ++- 6 files changed, 36 insertions(+), 25 deletions(-) create mode 100644 src/pipecat/services/openai/_constants.py diff --git a/src/pipecat/services/openai/_constants.py b/src/pipecat/services/openai/_constants.py new file mode 100644 index 000000000..110c95cdb --- /dev/null +++ b/src/pipecat/services/openai/_constants.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Internal constants for OpenAI service integrations.""" + +from typing import Final + +# Declared Final so type checkers keep the literal values; this lets +# OPENAI_SAMPLE_RATE be used as the default of a Literal[24000] field. +OPENAI_SAMPLE_RATE: Final = 24000 +OPENAI_REALTIME_WHISPER_MODEL: Final = "gpt-realtime-whisper" diff --git a/src/pipecat/services/openai/realtime/events.py b/src/pipecat/services/openai/realtime/events.py index 42d3b7e4d..215ab1b23 100644 --- a/src/pipecat/services/openai/realtime/events.py +++ b/src/pipecat/services/openai/realtime/events.py @@ -13,13 +13,12 @@ from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field from pipecat.adapters.schemas.tools_schema import ToolsSchema +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE # # session properties # -GPT_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper" - class AudioFormat(BaseModel): """Base class for audio format configuration.""" @@ -36,7 +35,7 @@ class PCMAudioFormat(AudioFormat): """ type: Literal["audio/pcm"] = "audio/pcm" - rate: Literal[24000] = 24000 + rate: Literal[24000] = OPENAI_SAMPLE_RATE class PCMUAudioFormat(AudioFormat): @@ -62,13 +61,13 @@ class 
PCMAAudioFormat(AudioFormat): class InputAudioTranscription(BaseModel): """Configuration for audio transcription settings.""" - model: str = GPT_REALTIME_WHISPER_MODEL + model: str = OPENAI_REALTIME_WHISPER_MODEL language: str | None prompt: str | None def __init__( self, - model: str | None = GPT_REALTIME_WHISPER_MODEL, + model: str | None = OPENAI_REALTIME_WHISPER_MODEL, language: str | None = None, prompt: str | None = None, ): diff --git a/src/pipecat/services/openai/realtime/llm.py b/src/pipecat/services/openai/realtime/llm.py index e39299dc0..a95ceccfd 100644 --- a/src/pipecat/services/openai/realtime/llm.py +++ b/src/pipecat/services/openai/realtime/llm.py @@ -51,6 +51,7 @@ from pipecat.metrics.metrics import LLMTokenUsage from pipecat.processors.aggregators.llm_context import LLMContext from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import FunctionCallFromLLM, LLMService +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE from pipecat.services.settings import ( NOT_GIVEN, LLMSettings, @@ -337,11 +338,11 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): and session_properties.audio.input.transcription else None ) - if transcription and transcription.model == events.GPT_REALTIME_WHISPER_MODEL: + if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL: if transcription.prompt: transcription.prompt = None logger.warning( - f"{events.GPT_REALTIME_WHISPER_MODEL} does not support the prompt " + f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt " "parameter; omitting prompt from OpenAI Realtime input audio " "transcription settings." 
) @@ -505,7 +506,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): self._current_audio_response = None def _calculate_audio_duration_ms( - self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2 + self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2 ) -> int: """Calculate audio duration in milliseconds based on PCM audio parameters.""" samples = total_bytes / bytes_per_sample @@ -797,7 +798,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]): self._current_audio_response.total_size += len(audio) frame = TTSAudioRawFrame( audio=audio, - sample_rate=24000, + sample_rate=OPENAI_SAMPLE_RATE, num_channels=1, ) await self.push_frame(frame) diff --git a/src/pipecat/services/openai/stt.py b/src/pipecat/services/openai/stt.py index 8fb82b8df..862c0f3a6 100644 --- a/src/pipecat/services/openai/stt.py +++ b/src/pipecat/services/openai/stt.py @@ -36,6 +36,7 @@ from pipecat.frames.frames import ( VADUserStoppedSpeakingFrame, ) from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99 from pipecat.services.stt_service import WebsocketSTTService @@ -178,10 +179,6 @@ class OpenAISTTService(BaseWhisperSTTService): return await self._client.audio.transcriptions.create(**kwargs) -_OPENAI_SAMPLE_RATE = 24000 -_OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper" - - @dataclass class OpenAIRealtimeSTTSettings(STTSettings): """Settings for OpenAIRealtimeSTTService. @@ -308,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): # --- 1. 
Hardcoded defaults --- default_settings = self.Settings( - model=_OPENAI_REALTIME_WHISPER_MODEL, + model=OPENAI_REALTIME_WHISPER_MODEL, language=Language.EN, prompt=None, noise_reduction=None, @@ -359,11 +356,11 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): @staticmethod def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]: """Drop prompt settings that are not accepted by the selected model.""" - if settings.model == _OPENAI_REALTIME_WHISPER_MODEL and settings.prompt: + if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt: old_prompt = settings.prompt settings.prompt = None logger.warning( - f"{_OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; " + f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; " "omitting prompt from OpenAI Realtime transcription session." ) return {"prompt": old_prompt} @@ -572,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): input_audio: dict = { "format": { "type": "audio/pcm", - "rate": _OPENAI_SAMPLE_RATE, + "rate": OPENAI_SAMPLE_RATE, }, "transcription": transcription, } @@ -609,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService): Args: audio: Raw audio bytes at the pipeline sample rate. 
""" - audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE) + audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE) if not audio: return payload = base64.b64encode(audio).decode("utf-8") diff --git a/src/pipecat/services/openai/tts.py b/src/pipecat/services/openai/tts.py index a6528f59e..e6ebb4dd6 100644 --- a/src/pipecat/services/openai/tts.py +++ b/src/pipecat/services/openai/tts.py @@ -24,6 +24,7 @@ from pipecat.frames.frames import ( StartFrame, TTSAudioRawFrame, ) +from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given from pipecat.services.tts_service import TTSService from pipecat.utils.tracing.service_decorators import traced_tts @@ -85,8 +86,6 @@ class OpenAITTSService(TTSService): Settings = OpenAITTSSettings _settings: Settings - OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz - class InputParams(BaseModel): """Input parameters for OpenAI TTS configuration. @@ -150,9 +149,9 @@ class OpenAITTSService(TTSService): parameters, ``settings`` values take precedence. **kwargs: Additional keyword arguments passed to TTSService. """ - if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE: + if sample_rate and sample_rate != OPENAI_SAMPLE_RATE: logger.warning( - f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. " + f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. " f"Current rate of {sample_rate}Hz may cause issues." ) @@ -217,9 +216,9 @@ class OpenAITTSService(TTSService): frame: The start frame containing initialization parameters. """ await super().start(frame) - if self.sample_rate != self.OPENAI_SAMPLE_RATE: + if self.sample_rate != OPENAI_SAMPLE_RATE: logger.warning( - f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. " + f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. " f"Current rate of {self.sample_rate}Hz may cause issues." 
) diff --git a/tests/test_settings.py b/tests/test_settings.py index 78c1f30b8..ab76a8c6c 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -12,6 +12,7 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings from pipecat.services.inworld.realtime import events as inworld_events from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings +from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL from pipecat.services.openai.realtime import events from pipecat.services.openai.realtime.llm import ( OpenAIRealtimeLLMService, @@ -757,7 +758,7 @@ class TestOpenAIRealtimeSessionProperties: audio=events.AudioConfiguration( input=events.AudioInput( transcription=events.InputAudioTranscription( - model=events.GPT_REALTIME_WHISPER_MODEL, + model=OPENAI_REALTIME_WHISPER_MODEL, prompt="Keywords: metoprolol", ) )