Centralize OpenAI audio constants

This commit is contained in:
Mark Backman
2026-05-12 15:38:20 -04:00
parent e2bfa6352f
commit 644030584f
6 changed files with 32 additions and 25 deletions

View File

@@ -0,0 +1,10 @@
#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Internal constants for OpenAI service integrations."""
from typing import Final

# Sample rate, in Hz, shared by the OpenAI audio integrations: OpenAI TTS
# always outputs 24 kHz audio, and the Realtime PCM audio format is pinned to
# the same rate. Declared ``Final`` so type checkers infer ``Literal[24000]``,
# which lets the constant be used as the default of ``Literal[24000]`` fields.
OPENAI_SAMPLE_RATE: Final = 24000

# Identifier of the Realtime Whisper transcription model. This model does not
# support the ``prompt`` parameter, so callers strip any prompt before sending
# transcription settings that select it.
OPENAI_REALTIME_WHISPER_MODEL: Final = "gpt-realtime-whisper"

View File

@@ -13,13 +13,12 @@ from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
#
# session properties
#
GPT_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
class AudioFormat(BaseModel):
"""Base class for audio format configuration."""
@@ -36,7 +35,7 @@ class PCMAudioFormat(AudioFormat):
"""
type: Literal["audio/pcm"] = "audio/pcm"
rate: Literal[24000] = 24000
rate: Literal[24000] = OPENAI_SAMPLE_RATE
class PCMUAudioFormat(AudioFormat):
@@ -62,13 +61,13 @@ class PCMAAudioFormat(AudioFormat):
class InputAudioTranscription(BaseModel):
"""Configuration for audio transcription settings."""
model: str = GPT_REALTIME_WHISPER_MODEL
model: str = OPENAI_REALTIME_WHISPER_MODEL
language: str | None
prompt: str | None
def __init__(
self,
model: str | None = GPT_REALTIME_WHISPER_MODEL,
model: str | None = OPENAI_REALTIME_WHISPER_MODEL,
language: str | None = None,
prompt: str | None = None,
):

View File

@@ -51,6 +51,7 @@ from pipecat.metrics.metrics import LLMTokenUsage
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
from pipecat.services.settings import (
NOT_GIVEN,
LLMSettings,
@@ -337,11 +338,11 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
and session_properties.audio.input.transcription
else None
)
if transcription and transcription.model == events.GPT_REALTIME_WHISPER_MODEL:
if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL:
if transcription.prompt:
transcription.prompt = None
logger.warning(
f"{events.GPT_REALTIME_WHISPER_MODEL} does not support the prompt "
f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt "
"parameter; omitting prompt from OpenAI Realtime input audio "
"transcription settings."
)
@@ -505,7 +506,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
self._current_audio_response = None
def _calculate_audio_duration_ms(
self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2
) -> int:
"""Calculate audio duration in milliseconds based on PCM audio parameters."""
samples = total_bytes / bytes_per_sample
@@ -797,7 +798,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
self._current_audio_response.total_size += len(audio)
frame = TTSAudioRawFrame(
audio=audio,
sample_rate=24000,
sample_rate=OPENAI_SAMPLE_RATE,
num_channels=1,
)
await self.push_frame(frame)

View File

@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
VADUserStoppedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given
from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99
from pipecat.services.stt_service import WebsocketSTTService
@@ -178,10 +179,6 @@ class OpenAISTTService(BaseWhisperSTTService):
return await self._client.audio.transcriptions.create(**kwargs)
_OPENAI_SAMPLE_RATE = 24000
_OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
@dataclass
class OpenAIRealtimeSTTSettings(STTSettings):
"""Settings for OpenAIRealtimeSTTService.
@@ -308,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
# --- 1. Hardcoded defaults ---
default_settings = self.Settings(
model=_OPENAI_REALTIME_WHISPER_MODEL,
model=OPENAI_REALTIME_WHISPER_MODEL,
language=Language.EN,
prompt=None,
noise_reduction=None,
@@ -359,11 +356,11 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
@staticmethod
def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]:
"""Drop prompt settings that are not accepted by the selected model."""
if settings.model == _OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
old_prompt = settings.prompt
settings.prompt = None
logger.warning(
f"{_OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
"omitting prompt from OpenAI Realtime transcription session."
)
return {"prompt": old_prompt}
@@ -572,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
input_audio: dict = {
"format": {
"type": "audio/pcm",
"rate": _OPENAI_SAMPLE_RATE,
"rate": OPENAI_SAMPLE_RATE,
},
"transcription": transcription,
}
@@ -609,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
Args:
audio: Raw audio bytes at the pipeline sample rate.
"""
audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE)
audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE)
if not audio:
return
payload = base64.b64encode(audio).decode("utf-8")

View File

@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
StartFrame,
TTSAudioRawFrame,
)
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
from pipecat.services.tts_service import TTSService
from pipecat.utils.tracing.service_decorators import traced_tts
@@ -85,8 +86,6 @@ class OpenAITTSService(TTSService):
Settings = OpenAITTSSettings
_settings: Settings
OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
class InputParams(BaseModel):
"""Input parameters for OpenAI TTS configuration.
@@ -150,9 +149,9 @@ class OpenAITTSService(TTSService):
parameters, ``settings`` values take precedence.
**kwargs: Additional keyword arguments passed to TTSService.
"""
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
if sample_rate and sample_rate != OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {sample_rate}Hz may cause issues."
)
@@ -217,9 +216,9 @@ class OpenAITTSService(TTSService):
frame: The start frame containing initialization parameters.
"""
await super().start(frame)
if self.sample_rate != self.OPENAI_SAMPLE_RATE:
if self.sample_rate != OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {self.sample_rate}Hz may cause issues."
)

View File

@@ -12,6 +12,7 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
from pipecat.services.inworld.realtime import events as inworld_events
from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL
from pipecat.services.openai.realtime import events
from pipecat.services.openai.realtime.llm import (
OpenAIRealtimeLLMService,
@@ -757,7 +758,7 @@ class TestOpenAIRealtimeSessionProperties:
audio=events.AudioConfiguration(
input=events.AudioInput(
transcription=events.InputAudioTranscription(
model=events.GPT_REALTIME_WHISPER_MODEL,
model=OPENAI_REALTIME_WHISPER_MODEL,
prompt="Keywords: metoprolol",
)
)