From 348fa5a7194ee9d23e65c76924fb4c552189ca03 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Sat, 20 Dec 2025 08:02:48 -0500 Subject: [PATCH] Improve SessionProperties initialization: remove voice from args, set default for TurnDetection --- examples/foundational/51-grok-realtime.py | 6 +---- src/pipecat/services/grok/realtime/events.py | 8 +++++-- src/pipecat/services/grok/realtime/llm.py | 25 +++++++------------- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/examples/foundational/51-grok-realtime.py b/examples/foundational/51-grok-realtime.py index 355453d5f..efa1c9bac 100644 --- a/examples/foundational/51-grok-realtime.py +++ b/examples/foundational/51-grok-realtime.py @@ -52,7 +52,6 @@ from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport from pipecat.services.grok.realtime.events import ( SessionProperties, - TurnDetection, WebSearchTool, XSearchTool, ) @@ -172,11 +171,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): # Configure Grok session properties session_properties = SessionProperties( - # Voice options: Ara (warm, friendly), Rex (confident), Sal (smooth), - # Eve (energetic), Leo (authoritative) + # Voice options: Ara, Rex, Sal, Eve, Leo voice="Ara", - # Enable server-side VAD for automatic turn detection - turn_detection=TurnDetection(type="server_vad"), # System instructions instructions="""You are a helpful and friendly AI assistant powered by Grok. diff --git a/src/pipecat/services/grok/realtime/events.py b/src/pipecat/services/grok/realtime/events.py index d93aad233..513e4f287 100644 --- a/src/pipecat/services/grok/realtime/events.py +++ b/src/pipecat/services/grok/realtime/events.py @@ -205,7 +205,9 @@ class SessionProperties(BaseModel): Parameters: instructions: System instructions for the assistant. voice: The voice the model uses to respond. Options: Ara, Rex, Sal, Eve, Leo. - turn_detection: Configuration for turn detection, or None for manual. + Defaults to "Ara". + turn_detection: Configuration for turn detection. Defaults to server-side VAD. + Set to None for manual turn detection. audio: Configuration for input and output audio. tools: Available tools for the assistant (web_search, x_search, file_search, function). """ @@ -215,7 +217,9 @@ class SessionProperties(BaseModel): instructions: Optional[str] = None voice: Optional[GrokVoice] = "Ara" - turn_detection: Optional[TurnDetection] = None + turn_detection: Optional[TurnDetection] = Field( + default_factory=lambda: TurnDetection(type="server_vad") + ) audio: Optional[AudioConfiguration] = None # Tools can be ToolsSchema when provided by user, or list of dicts for API tools: Optional[ToolsSchema | List[GrokTool]] = None diff --git a/src/pipecat/services/grok/realtime/llm.py b/src/pipecat/services/grok/realtime/llm.py index 88cb4e1fc..acbb3f83c 100644 --- a/src/pipecat/services/grok/realtime/llm.py +++ b/src/pipecat/services/grok/realtime/llm.py @@ -107,7 +107,6 @@ class GrokRealtimeLLMService(LLMService): self, *, api_key: str, - voice: events.GrokVoice = "Ara", base_url: str = "wss://api.x.ai/v1/realtime", session_properties: Optional[events.SessionProperties] = None, start_audio_paused: bool = False, @@ -117,12 +116,15 @@ class GrokRealtimeLLMService(LLMService): Args: api_key: xAI API key for authentication. - voice: Voice to use for responses. Options: Ara, Rex, Sal, Eve, Leo. - Defaults to "Ara". base_url: WebSocket base URL for the realtime API. Defaults to "wss://api.x.ai/v1/realtime". session_properties: Configuration properties for the realtime session. - If None, uses default SessionProperties with the specified voice. + If None, uses default SessionProperties with voice "Ara". + To set a different voice, configure it in session_properties: + + session_properties = events.SessionProperties(voice="Rex") + + Available voices: Ara, Rex, Sal, Eve, Leo. start_audio_paused: Whether to start with audio input paused. Defaults to False. **kwargs: Additional arguments passed to parent LLMService. """ @@ -130,20 +132,11 @@ class GrokRealtimeLLMService(LLMService): self.api_key = api_key self.base_url = base_url - self._voice = voice # Initialize session_properties - if session_properties: - self._session_properties = session_properties - # Ensure voice is set - if not self._session_properties.voice: - self._session_properties.voice = voice - else: - self._session_properties = events.SessionProperties( - voice=voice, - turn_detection=events.TurnDetection(type="server_vad"), - # Audio config will be set in start() based on PipelineParams - ) + self._session_properties: events.SessionProperties = ( + session_properties or events.SessionProperties() + ) self._audio_input_paused = start_audio_paused self._websocket = None