diff --git a/examples/foundational/51-grok-realtime.py b/examples/foundational/51-grok-realtime.py index 61eaeb062..355453d5f 100644 --- a/examples/foundational/51-grok-realtime.py +++ b/examples/foundational/51-grok-realtime.py @@ -25,7 +25,6 @@ Usage: python 50-grok-realtime.py --transport daily """ -import asyncio import os from datetime import datetime @@ -37,7 +36,7 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema # Note: Grok has built-in server-side VAD, so we don't need local VAD # from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame, LLMSetToolsFrame, TranscriptionMessage +from pipecat.frames.frames import LLMRunFrame, TranscriptionMessage from pipecat.observers.loggers.transcription_log_observer import ( TranscriptionLogObserver, ) diff --git a/src/pipecat/services/grok/realtime/llm.py b/src/pipecat/services/grok/realtime/llm.py index ec4f263bd..88cb4e1fc 100644 --- a/src/pipecat/services/grok/realtime/llm.py +++ b/src/pipecat/services/grok/realtime/llm.py @@ -27,14 +27,12 @@ from pipecat.frames.frames import ( EndFrame, Frame, InputAudioRawFrame, - InterimTranscriptionFrame, InterruptionFrame, LLMContextFrame, LLMFullResponseEndFrame, LLMFullResponseStartFrame, LLMMessagesAppendFrame, LLMSetToolsFrame, - LLMTextFrame, LLMUpdateSettingsFrame, StartFrame, TranscriptionFrame, @@ -57,7 +55,6 @@ from pipecat.processors.aggregators.llm_response_universal import ( from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext from pipecat.processors.frame_processor import FrameDirection from pipecat.services.llm_service import FunctionCallFromLLM, LLMService -from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 from . import events @@ -114,7 +111,6 @@ class GrokRealtimeLLMService(LLMService): base_url: str = "wss://api.x.ai/v1/realtime", session_properties: Optional[events.SessionProperties] = None, start_audio_paused: bool = False, - sample_rate: int = 24000, **kwargs, ): """Initialize the Grok Realtime Voice Agent LLM service. @@ -128,18 +124,15 @@ class GrokRealtimeLLMService(LLMService): session_properties: Configuration properties for the realtime session. If None, uses default SessionProperties with the specified voice. start_audio_paused: Whether to start with audio input paused. Defaults to False. - sample_rate: Audio sample rate in Hz. Supported: 8000, 16000, 21050, 24000, - 32000, 44100, 48000. Defaults to 24000. **kwargs: Additional arguments passed to parent LLMService. """ super().__init__(base_url=base_url, **kwargs) self.api_key = api_key self.base_url = base_url - self._sample_rate = sample_rate self._voice = voice - # Initialize session_properties with voice and audio config + # Initialize session_properties if session_properties: self._session_properties = session_properties # Ensure voice is set @@ -149,10 +142,7 @@ class GrokRealtimeLLMService(LLMService): self._session_properties = events.SessionProperties( voice=voice, turn_detection=events.TurnDetection(type="server_vad"), - audio=events.AudioConfiguration( - input=events.AudioInput(format=events.PCMAudioFormat(rate=sample_rate)), - output=events.AudioOutput(format=events.PCMAudioFormat(rate=sample_rate)), - ), + # Audio config will be set in start() based on PipelineParams ) self._audio_input_paused = start_audio_paused @@ -192,6 +182,50 @@ class GrokRealtimeLLMService(LLMService): """ self._audio_input_paused = paused + def _get_configured_sample_rate(self, direction: str) -> Optional[int]: + """Get manually configured sample rate for input or output. + + Args: + direction: Either "input" or "output". + + Returns: + Configured sample rate or None if not manually configured. + For PCMU/PCMA formats, returns 8000 Hz (G.711 standard). + """ + if not self._session_properties.audio: + return None + + audio_config = ( + self._session_properties.audio.input + if direction == "input" + else self._session_properties.audio.output + ) + + if audio_config and audio_config.format: + # PCM format has configurable rate + if hasattr(audio_config.format, "rate"): + return audio_config.format.rate + # PCMU/PCMA formats are fixed at 8000 Hz (G.711 standard) + elif audio_config.format.type in ("audio/pcmu", "audio/pcma"): + return 8000 + + return None + + def _get_output_sample_rate(self) -> int: + """Get the output sample rate from session properties. + + Returns: + Output sample rate in Hz. + + Note: + This assumes start() has been called, which guarantees + session_properties.audio.output exists. + """ + rate = self._get_configured_sample_rate("output") + if rate is None: + raise RuntimeError("Output sample rate not configured.") + return rate + def _is_turn_detection_enabled(self) -> bool: """Check if server-side VAD is enabled.""" if self._session_properties.turn_detection: @@ -230,7 +264,7 @@ class GrokRealtimeLLMService(LLMService): ) -> int: """Calculate audio duration in milliseconds based on PCM audio parameters.""" if sample_rate is None: - sample_rate = self._sample_rate + sample_rate = self._get_output_sample_rate() samples = total_bytes / bytes_per_sample duration_seconds = samples / sample_rate return int(duration_seconds * 1000) @@ -260,6 +294,23 @@ class GrokRealtimeLLMService(LLMService): frame: The start frame triggering service initialization. """ await super().start(frame) + + # Ensure audio configuration exists with both input and output + if not self._session_properties.audio: + self._session_properties.audio = events.AudioConfiguration() + + # Fill in missing input configuration + if not self._session_properties.audio.input: + self._session_properties.audio.input = events.AudioInput( + format=events.PCMAudioFormat(rate=frame.audio_in_sample_rate) + ) + + # Fill in missing output configuration + if not self._session_properties.audio.output: + self._session_properties.audio.output = events.AudioOutput( + format=events.PCMAudioFormat(rate=frame.audio_out_sample_rate) + ) + await self._connect() async def stop(self, frame: EndFrame): @@ -501,7 +552,7 @@ class GrokRealtimeLLMService(LLMService): frame = TTSAudioRawFrame( audio=audio, - sample_rate=self._sample_rate, + sample_rate=self._get_output_sample_rate(), num_channels=1, ) await self.push_frame(frame) diff --git a/uv.lock b/uv.lock index 560620aee..24b9a13c6 100644 --- a/uv.lock +++ b/uv.lock @@ -612,11 +612,11 @@ wheels = [ [[package]] name = "certifi" -version = "2025.8.3" +version = "2025.11.12" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, + { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, ] [[package]] @@ -4044,7 +4044,7 @@ soundfile = [ { name = "soundfile" }, ] speechmatics = [ - { name = "speechmatics-rt" }, + { name = "speechmatics-voice", extra = ["smart"] }, ] strands = [ { name = "strands-agents" }, @@ -4192,7 +4192,7 @@ requires-dist = [ { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=1.0.3" }, { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.1" }, { name = "soxr", specifier = "~=0.5.0" }, - { name = "speechmatics-rt", marker = "extra == 'speechmatics'", specifier = ">=0.5.0" }, + { name = "speechmatics-voice", extras = ["smart"], marker = "extra == 'speechmatics'", specifier = ">=0.2.4" }, { name = "strands-agents", marker = "extra == 'strands'", specifier = ">=1.9.1,<2" }, { name = "tenacity", marker = "extra == 'livekit'", specifier = ">=8.2.3,<10.0.0" }, { name = "timm", marker = "extra == 'moondream'", specifier = "~=1.0.13" }, @@ -5917,14 +5917,35 @@ wheels = [ [[package]] name = "speechmatics-rt" -version = "0.5.0" +version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/57/26/10359e1f16c2aa6a198eb11a9056f4a86a8bb8d4e610bbbe4a118b227b59/speechmatics_rt-0.5.0.tar.gz", hash = "sha256:ca974a186a012f946fd997deeaf3bf1c4f203f6d6e05a866172d27709183afc8", size = 26832, upload-time = "2025-10-15T15:54:25.695Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c0/a3/bb4d063a4405744951066c45ffbf7cd714a6fc00a20ef0cc83fe2494ed79/speechmatics_rt-0.5.3.tar.gz", hash = "sha256:c98d21041e5a0c90a66e463c3d5b98879c17eac0bbebb4100fd9d0f2b330bb19", size = 27333, upload-time = "2025-12-16T19:20:50.199Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/47/2e/9931ebe9360e9d385c68826b33137c2c9a4cfa361cd929d1ac6e72ebfe53/speechmatics_rt-0.5.0-py3-none-any.whl", hash = "sha256:58151488f891fa00cf7054f0cfab1b1eb94b55c3441be587f7941c726caef991", size = 32850, upload-time = "2025-10-15T15:54:24.5Z" }, + { url = "https://files.pythonhosted.org/packages/9c/5a/35dd924f9bfeb1604e01806ad0e16a9c596f3c44d13e66794f10d10f828b/speechmatics_rt-0.5.3-py3-none-any.whl", hash = "sha256:12f97f19bb989852b8ff3c6d1e28f4f0ea6fd9356e19da75d0e9877545931ce6", size = 33365, upload-time = "2025-12-16T19:20:49.031Z" }, +] + +[[package]] +name = "speechmatics-voice" +version = "0.2.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pydantic" }, + { name = "speechmatics-rt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/f9/9d81e4abe9ae1c8745372eaf43523213b0333e9721699fb0f3d3bff6c17e/speechmatics_voice-0.2.4.tar.gz", hash = "sha256:e3b5c7a8c24fa7d555b80a72ab181797665c74944400468ca5fb7e54b5f9eae6", size = 60852, upload-time = "2025-12-17T23:22:13.437Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/a6/401dba9be6be914e57b7814360ba0bece55f24140bb7d5c3dc5f07bcd77f/speechmatics_voice-0.2.4-py3-none-any.whl", hash = "sha256:71d0f5272c2db1221422ab19b6c898ea7b38f9fb7f523904f54a4d8c3e4cef12", size = 57056, upload-time = "2025-12-17T23:22:11.837Z" }, +] + +[package.optional-dependencies] +smart = [ + { name = "certifi" }, + { name = "onnxruntime" }, + { name = "transformers" }, ] [[package]] @@ -6508,7 +6529,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.56.2" +version = "4.57.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -6522,9 +6543,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e5/82/0bcfddd134cdf53440becb5e738257cc3cf34cf229d63b57bfd288e6579f/transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529", size = 9844296, upload-time = "2025-09-19T15:16:26.778Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/70/26/2591b48412bde75e33bfd292034103ffe41743cacd03120e3242516cd143/transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde", size = 11608055, upload-time = "2025-09-19T15:16:23.736Z" }, + { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" }, ] [[package]]