fix: Add text support to OpenAIRealtimeBetaLLMService

This commit is contained in:
Mark Backman
2025-08-13 09:08:15 -04:00
parent a2a419e6db
commit 71cd0f1c87
3 changed files with 25 additions and 3 deletions

View File

@@ -66,6 +66,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue where `SmallWebRTCTransport` ended before TTS finished.
- Fixed an issue in `OpenAIRealtimeBetaLLMService` where specifying `text` in
`modalities` didn't result in text being output by the model.
- Fixed a `WatchdogPriorityQueue` issue that could cause an exception when
comparing watchdog cancel sentinel items with other items in the queue.
@@ -307,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
callbacks.
- Added SSML reserved character escaping to `AzureBaseTTSService` to properly handle special characters in text sent to Azure TTS. This fixes an issue where characters like `&`, `<`, `>`, `"`, and `'` in LLM-generated text would cause TTS failures.
-
-
### Changed
- Changed the default `url` for `NeuphonicTTSService` to

View File

@@ -113,6 +113,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
session_properties = SessionProperties(
input_audio_transcription=InputAudioTranscription(),
modalities=["text", "audio"],
# Set openai TurnDetection parameters. Not setting this at all will turn it
# on by default
turn_detection=SemanticTurnDetection(),

View File

@@ -171,6 +171,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
"""
self._audio_input_paused = paused
def _is_modality_enabled(self, modality: str) -> bool:
    """Report whether the given modality ("text" or "audio") is active.

    When the session properties leave ``modalities`` unset or empty, both
    "audio" and "text" are treated as enabled.
    """
    configured = self._session_properties.modalities
    if not configured:
        # Nothing configured: fall back to the audio + text default.
        configured = ["audio", "text"]
    return modality in configured
def _get_enabled_modalities(self) -> list[str]:
    """Return the session's configured modalities.

    Falls back to ``["audio", "text"]`` when the session properties leave
    ``modalities`` unset or empty.
    """
    configured = self._session_properties.modalities
    return configured if configured else ["audio", "text"]
async def retrieve_conversation_item(self, item_id: str):
"""Retrieve a conversation item by ID from the server.
@@ -243,7 +252,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self.stop_all_metrics()
if self._current_assistant_response:
await self.push_frame(LLMFullResponseEndFrame())
await self.push_frame(TTSStoppedFrame())
# Only push TTSStoppedFrame if audio modality is enabled
if self._is_modality_enabled("audio"):
await self.push_frame(TTSStoppedFrame())
async def _handle_user_started_speaking(self, frame):
pass
@@ -469,6 +480,8 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self._handle_evt_speech_started(evt)
elif evt.type == "input_audio_buffer.speech_stopped":
await self._handle_evt_speech_stopped(evt)
elif evt.type == "response.text.delta":
await self._handle_evt_text_delta(evt)
elif evt.type == "response.audio_transcript.delta":
await self._handle_evt_audio_transcript_delta(evt)
elif evt.type == "error":
@@ -617,6 +630,10 @@ class OpenAIRealtimeBetaLLMService(LLMService):
# Response message without preceding user message. Add it to the context.
await self._handle_assistant_output(evt.response.output)
async def _handle_evt_text_delta(self, evt):
    """Push the text carried by a text-delta event as an ``LLMTextFrame``.

    Events whose ``delta`` is falsy (e.g. an empty string) are ignored.
    """
    if not evt.delta:
        return
    text_frame = LLMTextFrame(evt.delta)
    await self.push_frame(text_frame)
async def _handle_evt_audio_transcript_delta(self, evt):
    """Push the text of an audio-transcript delta event as an ``LLMTextFrame``.

    Events whose ``delta`` is falsy (e.g. an empty string) are ignored.
    """
    if not evt.delta:
        return
    transcript_frame = LLMTextFrame(evt.delta)
    await self.push_frame(transcript_frame)
@@ -723,7 +740,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
await self.start_ttfb_metrics()
await self.send_client_event(
events.ResponseCreateEvent(
response=events.ResponseProperties(modalities=["audio", "text"])
response=events.ResponseProperties(modalities=self._get_enabled_modalities())
)
)