Centralize OpenAI audio constants

2026-05-12 15:38:20 -04:00
parent e2bfa6352f
commit 644030584f
6 changed files with 32 additions and 25 deletions
--- a/src/pipecat/services/openai/_constants.py
+++ b/src/pipecat/services/openai/_constants.py
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2024-2026, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Internal constants for OpenAI service integrations."""
+
+OPENAI_SAMPLE_RATE = 24000
+OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
--- a/src/pipecat/services/openai/realtime/events.py
+++ b/src/pipecat/services/openai/realtime/events.py
@@ -13,13 +13,12 @@ from typing import Any, Literal
 from pydantic import BaseModel, ConfigDict, Field

 from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE

 #
 # session properties
 #

-GPT_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
-

 class AudioFormat(BaseModel):
    """Base class for audio format configuration."""
@@ -36,7 +35,7 @@ class PCMAudioFormat(AudioFormat):
    """

    type: Literal["audio/pcm"] = "audio/pcm"
-    rate: Literal[24000] = 24000
+    rate: Literal[24000] = OPENAI_SAMPLE_RATE


 class PCMUAudioFormat(AudioFormat):
@@ -62,13 +61,13 @@ class PCMAAudioFormat(AudioFormat):
 class InputAudioTranscription(BaseModel):
    """Configuration for audio transcription settings."""

-    model: str = GPT_REALTIME_WHISPER_MODEL
+    model: str = OPENAI_REALTIME_WHISPER_MODEL
    language: str | None
    prompt: str | None

    def __init__(
        self,
-        model: str | None = GPT_REALTIME_WHISPER_MODEL,
+        model: str | None = OPENAI_REALTIME_WHISPER_MODEL,
        language: str | None = None,
        prompt: str | None = None,
    ):
--- a/src/pipecat/services/openai/realtime/llm.py
+++ b/src/pipecat/services/openai/realtime/llm.py
@@ -51,6 +51,7 @@ from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
 from pipecat.services.settings import (
    NOT_GIVEN,
    LLMSettings,
@@ -337,11 +338,11 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
            and session_properties.audio.input.transcription
            else None
        )
-        if transcription and transcription.model == events.GPT_REALTIME_WHISPER_MODEL:
+        if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL:
            if transcription.prompt:
                transcription.prompt = None
                logger.warning(
-                    f"{events.GPT_REALTIME_WHISPER_MODEL} does not support the prompt "
+                    f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt "
                    "parameter; omitting prompt from OpenAI Realtime input audio "
                    "transcription settings."
                )
@@ -505,7 +506,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        self._current_audio_response = None

    def _calculate_audio_duration_ms(
-        self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
+        self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2
    ) -> int:
        """Calculate audio duration in milliseconds based on PCM audio parameters."""
        samples = total_bytes / bytes_per_sample
@@ -797,7 +798,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
        self._current_audio_response.total_size += len(audio)
        frame = TTSAudioRawFrame(
            audio=audio,
-            sample_rate=24000,
+            sample_rate=OPENAI_SAMPLE_RATE,
            num_channels=1,
        )
        await self.push_frame(frame)
--- a/src/pipecat/services/openai/stt.py
+++ b/src/pipecat/services/openai/stt.py
@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
    VADUserStoppedSpeakingFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
 from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given
 from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99
 from pipecat.services.stt_service import WebsocketSTTService
@@ -178,10 +179,6 @@ class OpenAISTTService(BaseWhisperSTTService):
        return await self._client.audio.transcriptions.create(**kwargs)


-_OPENAI_SAMPLE_RATE = 24000
-_OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
-
-
 @dataclass
 class OpenAIRealtimeSTTSettings(STTSettings):
    """Settings for OpenAIRealtimeSTTService.
@@ -308,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):

        # --- 1. Hardcoded defaults ---
        default_settings = self.Settings(
-            model=_OPENAI_REALTIME_WHISPER_MODEL,
+            model=OPENAI_REALTIME_WHISPER_MODEL,
            language=Language.EN,
            prompt=None,
            noise_reduction=None,
@@ -359,11 +356,11 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
    @staticmethod
    def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]:
        """Drop prompt settings that are not accepted by the selected model."""
-        if settings.model == _OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
+        if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
            old_prompt = settings.prompt
            settings.prompt = None
            logger.warning(
-                f"{_OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
+                f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
                "omitting prompt from OpenAI Realtime transcription session."
            )
            return {"prompt": old_prompt}
@@ -572,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        input_audio: dict = {
            "format": {
                "type": "audio/pcm",
-                "rate": _OPENAI_SAMPLE_RATE,
+                "rate": OPENAI_SAMPLE_RATE,
            },
            "transcription": transcription,
        }
@@ -609,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
        Args:
            audio: Raw audio bytes at the pipeline sample rate.
        """
-        audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE)
+        audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE)
        if not audio:
            return
        payload = base64.b64encode(audio).decode("utf-8")
--- a/src/pipecat/services/openai/tts.py
+++ b/src/pipecat/services/openai/tts.py
@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
    StartFrame,
    TTSAudioRawFrame,
 )
+from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
 from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
 from pipecat.services.tts_service import TTSService
 from pipecat.utils.tracing.service_decorators import traced_tts
@@ -85,8 +86,6 @@ class OpenAITTSService(TTSService):
    Settings = OpenAITTSSettings
    _settings: Settings

-    OPENAI_SAMPLE_RATE = 24000  # OpenAI TTS always outputs at 24kHz
-
    class InputParams(BaseModel):
        """Input parameters for OpenAI TTS configuration.

@@ -150,9 +149,9 @@ class OpenAITTSService(TTSService):
                parameters, ``settings`` values take precedence.
            **kwargs: Additional keyword arguments passed to TTSService.
        """
-        if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
+        if sample_rate and sample_rate != OPENAI_SAMPLE_RATE:
            logger.warning(
-                f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
+                f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )

@@ -217,9 +216,9 @@ class OpenAITTSService(TTSService):
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
-        if self.sample_rate != self.OPENAI_SAMPLE_RATE:
+        if self.sample_rate != OPENAI_SAMPLE_RATE:
            logger.warning(
-                f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
+                f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

--- a/tests/test_settings.py
+++ b/tests/test_settings.py
@@ -12,6 +12,7 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings
 from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
 from pipecat.services.inworld.realtime import events as inworld_events
 from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings
+from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL
 from pipecat.services.openai.realtime import events
 from pipecat.services.openai.realtime.llm import (
    OpenAIRealtimeLLMService,
@@ -757,7 +758,7 @@ class TestOpenAIRealtimeSessionProperties:
            audio=events.AudioConfiguration(
                input=events.AudioInput(
                    transcription=events.InputAudioTranscription(
-                        model=events.GPT_REALTIME_WHISPER_MODEL,
+                        model=OPENAI_REALTIME_WHISPER_MODEL,
                        prompt="Keywords: metoprolol",
                    )
                )