From 02b97035f87b8177231cf510eb5357ce549abdfe Mon Sep 17 00:00:00 2001 From: Nicholas Zhao Date: Fri, 13 Mar 2026 14:55:54 -0700 Subject: [PATCH] Add xAI TTS service --- README.md | 28 ++-- pyproject.toml | 1 + src/pipecat/services/xai/__init__.py | 13 ++ src/pipecat/services/xai/tts.py | 213 +++++++++++++++++++++++++++ tests/test_xai_tts.py | 91 ++++++++++++ 5 files changed, 332 insertions(+), 14 deletions(-) create mode 100644 src/pipecat/services/xai/__init__.py create mode 100644 src/pipecat/services/xai/tts.py create mode 100644 tests/test_xai_tts.py diff --git a/README.md b/README.md index 842863548..8d2322200 100644 --- a/README.md +++ b/README.md @@ -85,20 +85,20 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout ## 🧩 Available services -| Category | Services | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | -| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks 
AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA 
Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | -| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), | -| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | -| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) | -| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | -| Memory | 
[mem0](https://docs.pipecat.ai/server/services/memory/mem0) | -| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | -| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) | -| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | -| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) | +| Category | Services | +| ------------------- | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), 
[DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | +| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), 
[Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), xAI, [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox) | +| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | +| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) | +| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus),
[Simli](https://docs.pipecat.ai/server/services/video/simli) | +| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | +| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | +| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) | +| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) | 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services) diff --git a/pyproject.toml b/pyproject.toml index cba80aee4..e1f2dcf16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,6 +127,7 @@ webrtc = [ "aiortc>=1.14.0,<2", "opencv-python>=4.11.0.86,<5" ] websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<1" ] websockets-base = [ "websockets>=13.1,<16.0" ] whisper = [ "faster-whisper~=1.2.1" ] +xai = [] [dependency-groups] dev = [ diff --git a/src/pipecat/services/xai/__init__.py b/src/pipecat/services/xai/__init__.py new file mode 100644 index 000000000..5433f9431 --- /dev/null +++ b/src/pipecat/services/xai/__init__.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import sys + +from pipecat.services import DeprecatedModuleProxy + +from .tts import * + +sys.modules[__name__] = DeprecatedModuleProxy(globals(), "xai", "xai.tts") diff --git a/src/pipecat/services/xai/tts.py 
b/src/pipecat/services/xai/tts.py new file mode 100644 index 000000000..37a3db80a --- /dev/null +++ b/src/pipecat/services/xai/tts.py @@ -0,0 +1,213 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""xAI text-to-speech service implementation. + +Uses xAI's HTTP TTS endpoint documented at: +https://docs.x.ai/developers/model-capabilities/audio/text-to-speech +""" + +from dataclasses import dataclass +from typing import AsyncGenerator, Optional + +import aiohttp +from loguru import logger +from pydantic import BaseModel + +from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame +from pipecat.services.settings import TTSSettings +from pipecat.services.tts_service import TTSService +from pipecat.transcriptions.language import Language +from pipecat.utils.tracing.service_decorators import traced_tts + + +@dataclass +class XAITTSSettings(TTSSettings): + """Settings for XAITTSService.""" + + pass + + +class XAITTSService(TTSService): + """xAI HTTP text-to-speech service. + + The service requests raw PCM audio so emitted ``TTSAudioRawFrame`` objects + match Pipecat's downstream expectations without extra decoding. + """ + + Settings = XAITTSSettings + _settings: Settings + + XAI_DEFAULT_SAMPLE_RATE = 24000 + XAI_PCM_CODEC = "pcm" + + class InputParams(BaseModel): + """Input parameters for xAI TTS configuration. + + .. deprecated:: 0.0.105 + Use ``settings=XAITTSService.Settings(...)`` instead. + + Parameters: + language: Language for speech synthesis. + """ + + language: Optional[Language] = None + + def __init__( + self, + *, + api_key: str, + base_url: str = "https://api.x.ai/v1/tts", + voice: Optional[str] = None, + language: Optional[str | Language] = None, + sample_rate: Optional[int] = None, + aiohttp_session: Optional[aiohttp.ClientSession] = None, + params: Optional[InputParams] = None, + settings: Optional[Settings] = None, + **kwargs, + ): + """Initialize the xAI TTS service. 
+ + Args: + api_key: xAI API key for authentication. + base_url: xAI TTS endpoint. Defaults to ``https://api.x.ai/v1/tts``. + voice: Voice identifier. Defaults to ``"eve"``. + + .. deprecated:: 0.0.105 + Use ``settings=XAITTSService.Settings(voice=...)`` instead. + + language: BCP-47 or base language code (for example ``"en"`` or ``"pt-BR"``). + Defaults to ``"en"``. + + .. deprecated:: 0.0.105 + Use ``settings=XAITTSService.Settings(language=...)`` instead. + + sample_rate: Output sample rate for PCM audio. Defaults to 24000 Hz. + aiohttp_session: Optional shared aiohttp session. + params: Deprecated input parameters object. + settings: Runtime-updatable settings. When provided alongside deprecated + parameters, ``settings`` values take precedence. + **kwargs: Additional keyword arguments passed to ``TTSService``. + """ + default_settings = self.Settings( + model=None, + voice="eve", + language="en", + ) + + if voice is not None: + self._warn_init_param_moved_to_settings("voice", "voice") + default_settings.voice = voice + if language is not None: + self._warn_init_param_moved_to_settings("language", "language") + default_settings.language = ( + self.language_to_service_language(language) + if isinstance(language, Language) + else language + ) + + if params is not None: + self._warn_init_param_moved_to_settings("params") + if not settings and params.language is not None: + default_settings.language = self.language_to_service_language(params.language) + + if settings is not None: + default_settings.apply_update(settings) + + super().__init__( + pause_frame_processing=True, + push_start_frame=True, + push_stop_frames=True, + sample_rate=sample_rate or self.XAI_DEFAULT_SAMPLE_RATE, + settings=default_settings, + **kwargs, + ) + + self._api_key = api_key + self._base_url = base_url + self._session = aiohttp_session + self._session_owner = aiohttp_session is None + + def can_generate_metrics(self) -> bool: + """Check if this service can generate processing metrics.""" 
+ return True + + def language_to_service_language(self, language: Language) -> Optional[str]: + """Convert a Language enum to xAI's language format.""" + return str(language) + + async def start(self, frame): + """Start the xAI TTS service.""" + await super().start(frame) + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + self._session_owner = True + + async def stop(self, frame): + """Stop the xAI TTS service.""" + await super().stop(frame) + await self._close_session() + + async def cancel(self, frame): + """Cancel the xAI TTS service.""" + await super().cancel(frame) + await self._close_session() + + async def _close_session(self): + if self._session_owner and self._session and not self._session.closed: + await self._session.close() + if self._session_owner: + self._session = None + + @traced_tts + async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: + """Generate speech from text using xAI's TTS API.""" + logger.debug(f"{self}: Generating TTS [{text}]") + + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + self._session_owner = True + + payload = { + "text": text, + "voice_id": self._settings.voice, + "output_format": { + "codec": self.XAI_PCM_CODEC, + "sample_rate": self.sample_rate, + }, + } + if self._settings.language: + payload["language"] = str(self._settings.language) + + headers = { + "Authorization": f"Bearer {self._api_key}", + "Content-Type": "application/json", + } + + measuring_ttfb = True + try: + async with self._session.post(self._base_url, json=payload, headers=headers) as response: + if response.status != 200: + error = await response.text(errors="ignore") + logger.error( + f"{self} error getting audio (status: {response.status}, error: {error})" + ) + yield ErrorFrame( + error=f"Error getting audio (status: {response.status}, error: {error})" + ) + return + + await self.start_tts_usage_metrics(text) + + async for chunk in 
response.content.iter_chunked(self.chunk_size): + if not chunk: + continue + if measuring_ttfb: + await self.stop_ttfb_metrics() + measuring_ttfb = False + yield TTSAudioRawFrame(chunk, self.sample_rate, 1, context_id=context_id) + except Exception as e: + yield ErrorFrame(error=f"Unknown error occurred: {e}") diff --git a/tests/test_xai_tts.py b/tests/test_xai_tts.py new file mode 100644 index 000000000..b4c1513f6 --- /dev/null +++ b/tests/test_xai_tts.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Tests for XAITTSService.""" + +import asyncio +import unittest + +import aiohttp +import pytest +from aiohttp import web + +from pipecat.frames.frames import ( + AggregatedTextFrame, + TTSAudioRawFrame, + TTSSpeakFrame, + TTSStartedFrame, + TTSStoppedFrame, + TTSTextFrame, +) +from pipecat.services.xai.tts import XAITTSService +from pipecat.tests.utils import run_test + + +@pytest.mark.asyncio +async def test_run_xai_tts_success(aiohttp_client): + """xAI TTS should send the documented request body and emit PCM frames.""" + + request_bodies = [] + + async def handler(request): + request_bodies.append(await request.json()) + + response = web.StreamResponse( + status=200, + reason="OK", + headers={"Content-Type": "audio/pcm"}, + ) + await response.prepare(request) + await response.write(b"\x00\x01\x02\x03" * 1024) + await asyncio.sleep(0.01) + await response.write(b"\x04\x05\x06\x07" * 1024) + await response.write_eof() + return response + + app = web.Application() + app.router.add_post("/v1/tts", handler) + client = await aiohttp_client(app) + base_url = str(client.make_url("/v1/tts")) + + async with aiohttp.ClientSession() as session: + tts_service = XAITTSService( + api_key="test-key", + base_url=base_url, + aiohttp_session=session, + sample_rate=24000, + ) + + down_frames, _ = await run_test( + tts_service, + frames_to_send=[TTSSpeakFrame(text="Hello from xAI.")], + ) + + frame_types = [type(frame) 
for frame in down_frames] + assert AggregatedTextFrame in frame_types + assert TTSStartedFrame in frame_types + assert TTSStoppedFrame in frame_types + assert TTSTextFrame in frame_types + + audio_frames = [frame for frame in down_frames if isinstance(frame, TTSAudioRawFrame)] + assert audio_frames + assert all(frame.sample_rate == 24000 for frame in audio_frames) + assert all(frame.num_channels == 1 for frame in audio_frames) + + assert len(request_bodies) == 1 + assert request_bodies[0] == { + "text": "Hello from xAI.", + "voice_id": "eve", + "language": "en", + "output_format": { + "codec": "pcm", + "sample_rate": 24000, + }, + } + + +if __name__ == "__main__": + unittest.main()