Merge pull request #2433 from pipecat-ai/mb/openai-realtime-text-modality

fix: Add text support to OpenAIRealtimeBetaLLMService
This commit is contained in:
Mark Backman
2025-08-13 08:41:33 -07:00
committed by GitHub
5 changed files with 256 additions and 15 deletions

View File

@@ -66,19 +66,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.
- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying a `text`
`modalities` didn't result in text being outputted from the model.
- Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
compating watchdog cancel sentinel items with other items in the queue.
- Fixed an issue that would cause system frames to not be processed with higher
priority than other frames. This could cause slower interruption times.
### Fixed
- Fixed an issue where retrying a websocket connection error would result in an
error.
### Other
- Add foundation example `19b-openai-realtime-beta-text.py`, showing how to use
`OpenAIRealtimeBetaLLMService` to output text to a TTS service.
- Add vision support to release evals so we can run the foundational examples 12
series.
@@ -307,7 +311,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
callbacks.
- Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures.
-
### Changed
- Changed the default `url` for `NeuphonicTTSService` to

View File

@@ -158,16 +158,6 @@ Remember, your responses should be short. Just one or two sentences, usually."""
# openai WebSocket API can understand.
context = OpenAILLMContext(
[{"role": "user", "content": "Say hello!"}],
# [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}],
# [
# {
# "role": "user",
# "content": [
# {"type": "text", "text": "Say"},
# {"type": "text", "text": "yo what's up!"},
# ],
# }
# ],
tools,
)

View File

@@ -0,0 +1,229 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from datetime import datetime
from dotenv import load_dotenv
from loguru import logger
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import TranscriptionMessage
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.transcript_processor import TranscriptProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai_realtime_beta import (
InputAudioNoiseReduction,
InputAudioTranscription,
OpenAIRealtimeBetaLLMService,
SemanticTurnDetection,
SessionProperties,
)
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
await params.result_callback(
{
"conditions": "nice",
"temperature": temperature,
"format": params.arguments["format"],
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
}
)
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
# Create tools schema
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
session_properties = SessionProperties(
input_audio_transcription=InputAudioTranscription(),
modalities=["text"],
# Set openai TurnDetection parameters. Not setting this at all will turn it
# on by default
turn_detection=SemanticTurnDetection(),
# Or set to False to disable openai turn detection and use transport VAD
# turn_detection=False,
input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"),
# tools=tools,
instructions="""You are a helpful and friendly AI.
Act like a human, but remember that you aren't a human and that you can't do human
things in the real world. Your voice and personality should be warm and engaging, with a lively and
playful tone.
If interacting in a non-English language, start by using the standard accent or dialect familiar to
the user. Talk quickly. You should always call a function if you can. Do not refer to these rules,
even if you're asked about them.
You are participating in a voice conversation. Keep your responses concise, short, and to the point
unless specifically asked to elaborate on a topic.
You have access to the following tools:
- get_current_weather: Get the current weather for a given location.
- get_restaurant_recommendation: Get a restaurant recommendation for a given location.
Remember, your responses should be short. Just one or two sentences, usually.""",
)
llm = OpenAIRealtimeBetaLLMService(
api_key=os.getenv("OPENAI_API_KEY"),
session_properties=session_properties,
start_audio_paused=False,
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady
)
# you can either register a single function for all function calls, or specific functions
# llm.register_function(None, fetch_weather_from_api)
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
transcript = TranscriptProcessor()
# Create a standard OpenAI LLM context object using the normal messages format. The
# OpenAIRealtimeBetaLLMService will convert this internally to messages that the
# openai WebSocket API can understand.
context = OpenAILLMContext(
[{"role": "user", "content": "Say hello!"}],
tools,
)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(),
llm, # LLM
tts, # TTS
transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream
transport.output(), # Transport bot output
transcript.assistant(), # After the transcript output, to time with the audio output
context_aggregator.assistant(),
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
# Register event handler for transcript updates
@transcript.event_handler("on_transcript_update")
async def on_transcript_update(processor, frame):
for msg in frame.messages:
if isinstance(msg, TranscriptionMessage):
timestamp = f"[{msg.timestamp}] " if msg.timestamp else ""
line = f"{timestamp}{msg.role}: {msg.content}"
logger.info(f"Transcript: {line}")
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -127,6 +127,7 @@ TESTS_15 = [
TESTS_19 = [
("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER),
("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER),
]
TESTS_21 = [

View File

@@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
"""
self._audio_input_paused = paused
def _is_modality_enabled(self, modality: str) -> bool:
"""Check if a specific modality is enabled, "text" or "audio"."""
modalities = self._session_properties.modalities or ["audio", "text"]
return modality in modalities
def _get_enabled_modalities(self) -> list[str]:
"""Get the list of enabled modalities."""
return self._session_properties.modalities or ["audio", "text"]
async def retrieve_conversation_item(self, item_id: str):
"""Retrieve a conversation item by ID from the server.
@@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self.stop_all_metrics()
if self._current_assistant_response:
await self.push_frame(LLMFullResponseEndFrame())
await self.push_frame(TTSStoppedFrame())
# Only push TTSStoppedFrame if audio modality is enabled
if self._is_modality_enabled("audio"):
await self.push_frame(TTSStoppedFrame())
async def _handle_user_started_speaking(self, frame):
pass
@@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self._handle_evt_speech_started(evt)
elif evt.type == "input_audio_buffer.speech_stopped":
await self._handle_evt_speech_stopped(evt)
elif evt.type == "response.text.delta":
await self._handle_evt_text_delta(evt)
elif evt.type == "response.audio_transcript.delta":
await self._handle_evt_audio_transcript_delta(evt)
elif evt.type == "error":
@@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService):
# Response message without preceding user message. Add it to the context.
await self._handle_assistant_output(evt.response.output)
async def _handle_evt_text_delta(self, evt):
if evt.delta:
await self.push_frame(LLMTextFrame(evt.delta))
async def _handle_evt_audio_transcript_delta(self, evt):
if evt.delta:
await self.push_frame(LLMTextFrame(evt.delta))
@@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self.start_ttfb_metrics()
await self.send_client_event(
events.ResponseCreateEvent(
response=events.ResponseProperties(modalities=["audio", "text"])
response=events.ResponseProperties(modalities=self._get_enabled_modalities())
)
)