Centralize OpenAI audio constants
This commit is contained in:
10
src/pipecat/services/openai/_constants.py
Normal file
10
src/pipecat/services/openai/_constants.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Internal constants for OpenAI service integrations."""
|
||||
|
||||
# Audio sample rate, in Hz, shared by the OpenAI Realtime, STT and TTS
# integrations (24 kHz).
OPENAI_SAMPLE_RATE = 24_000
|
||||
# Model identifier for the OpenAI Realtime Whisper transcription model.
# The Realtime LLM and STT services compare the configured model against this
# value and drop the `prompt` setting (with a warning) when it matches.
OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
|
||||
@@ -13,13 +13,12 @@ from typing import Any, Literal
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
||||
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
|
||||
|
||||
#
|
||||
# session properties
|
||||
#
|
||||
|
||||
GPT_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
|
||||
|
||||
|
||||
class AudioFormat(BaseModel):
|
||||
"""Base class for audio format configuration."""
|
||||
@@ -36,7 +35,7 @@ class PCMAudioFormat(AudioFormat):
|
||||
"""
|
||||
|
||||
type: Literal["audio/pcm"] = "audio/pcm"
|
||||
rate: Literal[24000] = 24000
|
||||
rate: Literal[24000] = OPENAI_SAMPLE_RATE
|
||||
|
||||
|
||||
class PCMUAudioFormat(AudioFormat):
|
||||
@@ -62,13 +61,13 @@ class PCMAAudioFormat(AudioFormat):
|
||||
class InputAudioTranscription(BaseModel):
|
||||
"""Configuration for audio transcription settings."""
|
||||
|
||||
model: str = GPT_REALTIME_WHISPER_MODEL
|
||||
model: str = OPENAI_REALTIME_WHISPER_MODEL
|
||||
language: str | None
|
||||
prompt: str | None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str | None = GPT_REALTIME_WHISPER_MODEL,
|
||||
model: str | None = OPENAI_REALTIME_WHISPER_MODEL,
|
||||
language: str | None = None,
|
||||
prompt: str | None = None,
|
||||
):
|
||||
|
||||
@@ -51,6 +51,7 @@ from pipecat.metrics.metrics import LLMTokenUsage
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
|
||||
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
|
||||
from pipecat.services.settings import (
|
||||
NOT_GIVEN,
|
||||
LLMSettings,
|
||||
@@ -337,11 +338,11 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
|
||||
and session_properties.audio.input.transcription
|
||||
else None
|
||||
)
|
||||
if transcription and transcription.model == events.GPT_REALTIME_WHISPER_MODEL:
|
||||
if transcription and transcription.model == OPENAI_REALTIME_WHISPER_MODEL:
|
||||
if transcription.prompt:
|
||||
transcription.prompt = None
|
||||
logger.warning(
|
||||
f"{events.GPT_REALTIME_WHISPER_MODEL} does not support the prompt "
|
||||
f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt "
|
||||
"parameter; omitting prompt from OpenAI Realtime input audio "
|
||||
"transcription settings."
|
||||
)
|
||||
@@ -505,7 +506,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
|
||||
self._current_audio_response = None
|
||||
|
||||
def _calculate_audio_duration_ms(
|
||||
self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
|
||||
self, total_bytes: int, sample_rate: int = OPENAI_SAMPLE_RATE, bytes_per_sample: int = 2
|
||||
) -> int:
|
||||
"""Calculate audio duration in milliseconds based on PCM audio parameters."""
|
||||
samples = total_bytes / bytes_per_sample
|
||||
@@ -797,7 +798,7 @@ class OpenAIRealtimeLLMService(LLMService[OpenAIRealtimeLLMAdapter]):
|
||||
self._current_audio_response.total_size += len(audio)
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=audio,
|
||||
sample_rate=24000,
|
||||
sample_rate=OPENAI_SAMPLE_RATE,
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
@@ -36,6 +36,7 @@ from pipecat.frames.frames import (
|
||||
VADUserStoppedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL, OPENAI_SAMPLE_RATE
|
||||
from pipecat.services.settings import NOT_GIVEN, STTSettings, _NotGiven, assert_given
|
||||
from pipecat.services.stt_latency import OPENAI_REALTIME_TTFS_P99, OPENAI_TTFS_P99
|
||||
from pipecat.services.stt_service import WebsocketSTTService
|
||||
@@ -178,10 +179,6 @@ class OpenAISTTService(BaseWhisperSTTService):
|
||||
return await self._client.audio.transcriptions.create(**kwargs)
|
||||
|
||||
|
||||
_OPENAI_SAMPLE_RATE = 24000
|
||||
_OPENAI_REALTIME_WHISPER_MODEL = "gpt-realtime-whisper"
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpenAIRealtimeSTTSettings(STTSettings):
|
||||
"""Settings for OpenAIRealtimeSTTService.
|
||||
@@ -308,7 +305,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
|
||||
|
||||
# --- 1. Hardcoded defaults ---
|
||||
default_settings = self.Settings(
|
||||
model=_OPENAI_REALTIME_WHISPER_MODEL,
|
||||
model=OPENAI_REALTIME_WHISPER_MODEL,
|
||||
language=Language.EN,
|
||||
prompt=None,
|
||||
noise_reduction=None,
|
||||
@@ -359,11 +356,11 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
|
||||
@staticmethod
|
||||
def _omit_unsupported_prompt(settings: OpenAIRealtimeSTTSettings) -> dict[str, Any]:
|
||||
"""Drop prompt settings that are not accepted by the selected model."""
|
||||
if settings.model == _OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
|
||||
if settings.model == OPENAI_REALTIME_WHISPER_MODEL and settings.prompt:
|
||||
old_prompt = settings.prompt
|
||||
settings.prompt = None
|
||||
logger.warning(
|
||||
f"{_OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
|
||||
f"{OPENAI_REALTIME_WHISPER_MODEL} does not support the prompt parameter; "
|
||||
"omitting prompt from OpenAI Realtime transcription session."
|
||||
)
|
||||
return {"prompt": old_prompt}
|
||||
@@ -572,7 +569,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
|
||||
input_audio: dict = {
|
||||
"format": {
|
||||
"type": "audio/pcm",
|
||||
"rate": _OPENAI_SAMPLE_RATE,
|
||||
"rate": OPENAI_SAMPLE_RATE,
|
||||
},
|
||||
"transcription": transcription,
|
||||
}
|
||||
@@ -609,7 +606,7 @@ class OpenAIRealtimeSTTService(WebsocketSTTService):
|
||||
Args:
|
||||
audio: Raw audio bytes at the pipeline sample rate.
|
||||
"""
|
||||
audio = await self._resampler.resample(audio, self.sample_rate, _OPENAI_SAMPLE_RATE)
|
||||
audio = await self._resampler.resample(audio, self.sample_rate, OPENAI_SAMPLE_RATE)
|
||||
if not audio:
|
||||
return
|
||||
payload = base64.b64encode(audio).decode("utf-8")
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.frames.frames import (
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
)
|
||||
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
|
||||
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
@@ -85,8 +86,6 @@ class OpenAITTSService(TTSService):
|
||||
Settings = OpenAITTSSettings
|
||||
_settings: Settings
|
||||
|
||||
OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for OpenAI TTS configuration.
|
||||
|
||||
@@ -150,9 +149,9 @@ class OpenAITTSService(TTSService):
|
||||
parameters, ``settings`` values take precedence.
|
||||
**kwargs: Additional keyword arguments passed to TTSService.
|
||||
"""
|
||||
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
|
||||
if sample_rate and sample_rate != OPENAI_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
|
||||
f"OpenAI TTS only supports {OPENAI_SAMPLE_RATE}Hz sample rate. "
|
||||
f"Current rate of {sample_rate}Hz may cause issues."
|
||||
)
|
||||
|
||||
@@ -217,9 +216,9 @@ class OpenAITTSService(TTSService):
|
||||
frame: The start frame containing initialization parameters.
|
||||
"""
|
||||
await super().start(frame)
|
||||
if self.sample_rate != self.OPENAI_SAMPLE_RATE:
|
||||
if self.sample_rate != OPENAI_SAMPLE_RATE:
|
||||
logger.warning(
|
||||
f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
|
||||
f"OpenAI TTS requires {OPENAI_SAMPLE_RATE}Hz sample rate. "
|
||||
f"Current rate of {self.sample_rate}Hz may cause issues."
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ from pipecat.services.deepgram.sagemaker.stt import DeepgramSageMakerSTTSettings
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
|
||||
from pipecat.services.inworld.realtime import events as inworld_events
|
||||
from pipecat.services.inworld.realtime.llm import InworldRealtimeLLMSettings
|
||||
from pipecat.services.openai._constants import OPENAI_REALTIME_WHISPER_MODEL
|
||||
from pipecat.services.openai.realtime import events
|
||||
from pipecat.services.openai.realtime.llm import (
|
||||
OpenAIRealtimeLLMService,
|
||||
@@ -757,7 +758,7 @@ class TestOpenAIRealtimeSessionProperties:
|
||||
audio=events.AudioConfiguration(
|
||||
input=events.AudioInput(
|
||||
transcription=events.InputAudioTranscription(
|
||||
model=events.GPT_REALTIME_WHISPER_MODEL,
|
||||
model=OPENAI_REALTIME_WHISPER_MODEL,
|
||||
prompt="Keywords: metoprolol",
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user