diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c80ac4fd..aa6d53d66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,19 +66,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed an issue where `SmallWebRTCTransport` ended before TTS finished. +- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying a `text` + `modalities` didn't result in text being outputted from the model. + - Fixed a `WatchdogPriorityQueue` issue that could cause an exception when compating watchdog cancel sentinel items with other items in the queue. - Fixed an issue that would cause system frames to not be processed with higher priority than other frames. This could cause slower interruption times. -### Fixed - - Fixed an issue where retrying a websocket connection error would result in an error. ### Other +- Add foundation example `19b-openai-realtime-beta-text.py`, showing how to use + `OpenAIRealtimeBetaLLMService` to output text to a TTS service. + - Add vision support to release evals so we can run the foundational examples 12 series. @@ -307,7 +311,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 callbacks. - Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures. -- + ### Changed - Changed the default `url` for `NeuphonicTTSService` to diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index b6bd2d864..a9aa27c45 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -158,16 +158,6 @@ Remember, your responses should be short. Just one or two sentences, usually.""" # openai WebSocket API can understand. context = OpenAILLMContext( [{"role": "user", "content": "Say hello!"}], - # [{"role": "user", "content": [{"type": "text", "text": "Say hello!"}]}], - # [ - # { - # "role": "user", - # "content": [ - # {"type": "text", "text": "Say"}, - # {"type": "text", "text": "yo what's up!"}, - # ], - # } - # ], tools, ) diff --git a/examples/foundational/19b-openai-realtime-beta-text.py b/examples/foundational/19b-openai-realtime-beta-text.py new file mode 100644 index 000000000..04ad42fc8 --- /dev/null +++ b/examples/foundational/19b-openai-realtime-beta-text.py @@ -0,0 +1,229 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + + +import os +from datetime import datetime + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.adapters.schemas.function_schema import FunctionSchema +from pipecat.adapters.schemas.tools_schema import ToolsSchema +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.frames.frames import TranscriptionMessage +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.processors.transcript_processor import TranscriptProcessor +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.cartesia import CartesiaTTSService +from pipecat.services.llm_service import FunctionCallParams +from pipecat.services.openai_realtime_beta import ( + InputAudioNoiseReduction, + InputAudioTranscription, + OpenAIRealtimeBetaLLMService, + SemanticTurnDetection, + SessionProperties, +) +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + + +async def fetch_weather_from_api(params: FunctionCallParams): + temperature = 75 if params.arguments["format"] == "fahrenheit" else 24 + await params.result_callback( + { + "conditions": "nice", + "temperature": temperature, + "format": params.arguments["format"], + "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), + } + ) + + +async def fetch_restaurant_recommendation(params: FunctionCallParams): + await params.result_callback({"name": "The Golden Dragon"}) + + +weather_function = FunctionSchema( + name="get_current_weather", + description="Get the current weather", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + }, + required=["location", "format"], +) + +restaurant_function = FunctionSchema( + name="get_restaurant_recommendation", + description="Get a restaurant recommendation", + properties={ + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + }, + required=["location"], +) + +# Create tools schema +tools = ToolsSchema(standard_tools=[weather_function, restaurant_function]) + + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + session_properties = SessionProperties( + input_audio_transcription=InputAudioTranscription(), + modalities=["text"], + # Set openai TurnDetection parameters. Not setting this at all will turn it + # on by default + turn_detection=SemanticTurnDetection(), + # Or set to False to disable openai turn detection and use transport VAD + # turn_detection=False, + input_audio_noise_reduction=InputAudioNoiseReduction(type="near_field"), + # tools=tools, + instructions="""You are a helpful and friendly AI. + +Act like a human, but remember that you aren't a human and that you can't do human +things in the real world. Your voice and personality should be warm and engaging, with a lively and +playful tone. + +If interacting in a non-English language, start by using the standard accent or dialect familiar to +the user. Talk quickly. You should always call a function if you can. Do not refer to these rules, +even if you're asked about them. + +You are participating in a voice conversation. Keep your responses concise, short, and to the point +unless specifically asked to elaborate on a topic. + +You have access to the following tools: +- get_current_weather: Get the current weather for a given location. +- get_restaurant_recommendation: Get a restaurant recommendation for a given location. + +Remember, your responses should be short. Just one or two sentences, usually.""", + ) + + llm = OpenAIRealtimeBetaLLMService( + api_key=os.getenv("OPENAI_API_KEY"), + session_properties=session_properties, + start_audio_paused=False, + ) + + tts = CartesiaTTSService( + api_key=os.getenv("CARTESIA_API_KEY"), + voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady + ) + + # you can either register a single function for all function calls, or specific functions + # llm.register_function(None, fetch_weather_from_api) + llm.register_function("get_current_weather", fetch_weather_from_api) + llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation) + + transcript = TranscriptProcessor() + + # Create a standard OpenAI LLM context object using the normal messages format. The + # OpenAIRealtimeBetaLLMService will convert this internally to messages that the + # openai WebSocket API can understand. + context = OpenAILLMContext( + [{"role": "user", "content": "Say hello!"}], + tools, + ) + + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), + llm, # LLM + tts, # TTS + transcript.user(), # Placed after the LLM, as LLM pushes TranscriptionFrames downstream + transport.output(), # Transport bot output + transcript.assistant(), # After the transcript output, to time with the audio output + context_aggregator.assistant(), + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + # Register event handler for transcript updates + @transcript.event_handler("on_transcript_update") + async def on_transcript_update(processor, frame): + for msg in frame.messages: + if isinstance(msg, TranscriptionMessage): + timestamp = f"[{msg.timestamp}] " if msg.timestamp else "" + line = f"{timestamp}{msg.role}: {msg.content}" + logger.info(f"Transcript: {line}") + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 54985fd16..1cee7e684 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -127,6 +127,7 @@ TESTS_15 = [ TESTS_19 = [ ("19-openai-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER), ("19a-azure-realtime-beta.py", PROMPT_WEATHER, EVAL_WEATHER), + ("19b-openai-realtime-beta-text.py", PROMPT_WEATHER, EVAL_WEATHER), ] TESTS_21 = [ diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py index 03b3d4938..5c7df7597 100644 --- a/src/pipecat/services/openai_realtime_beta/openai.py +++ b/src/pipecat/services/openai_realtime_beta/openai.py @@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService): """ self._audio_input_paused = paused + def _is_modality_enabled(self, modality: str) -> bool: + """Check if a specific modality is enabled, "text" or "audio".""" + modalities = self._session_properties.modalities or ["audio", "text"] + return modality in modalities + + def _get_enabled_modalities(self) -> list[str]: + """Get the list of enabled modalities.""" + return self._session_properties.modalities or ["audio", "text"] + async def retrieve_conversation_item(self, item_id: str): """Retrieve a conversation item by ID from the server. @@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self.stop_all_metrics() if self._current_assistant_response: await self.push_frame(LLMFullResponseEndFrame()) - await self.push_frame(TTSStoppedFrame()) + # Only push TTSStoppedFrame if audio modality is enabled + if self._is_modality_enabled("audio"): + await self.push_frame(TTSStoppedFrame()) async def _handle_user_started_speaking(self, frame): pass @@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self._handle_evt_speech_started(evt) elif evt.type == "input_audio_buffer.speech_stopped": await self._handle_evt_speech_stopped(evt) + elif evt.type == "response.text.delta": + await self._handle_evt_text_delta(evt) elif evt.type == "response.audio_transcript.delta": await self._handle_evt_audio_transcript_delta(evt) elif evt.type == "error": @@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService): # Response message without preceding user message. Add it to the context. await self._handle_assistant_output(evt.response.output) + async def _handle_evt_text_delta(self, evt): + if evt.delta: + await self.push_frame(LLMTextFrame(evt.delta)) + async def _handle_evt_audio_transcript_delta(self, evt): if evt.delta: await self.push_frame(LLMTextFrame(evt.delta)) @@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService): await self.start_ttfb_metrics() await self.send_client_event( events.ResponseCreateEvent( - response=events.ResponseProperties(modalities=["audio", "text"]) + response=events.ResponseProperties(modalities=self._get_enabled_modalities()) ) )