Add xAI TTS service
This commit is contained in:
committed by
Mark Backman
parent
86e086c6b5
commit
02b97035f8
28
README.md
28
README.md
@@ -85,20 +85,20 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout
|
||||
|
||||
## 🧩 Available services
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) |
|
||||
| Category | Services |
|
||||
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), xAI, [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) |
|
||||
| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/video/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) |
|
||||
| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) |
|
||||
| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) |
|
||||
| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) |
|
||||
| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) |
|
||||
| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) |
|
||||
|
||||
📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services)
|
||||
|
||||
|
||||
@@ -127,6 +127,7 @@ webrtc = [ "aiortc>=1.14.0,<2", "opencv-python>=4.11.0.86,<5" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<1" ]
|
||||
websockets-base = [ "websockets>=13.1,<16.0" ]
|
||||
whisper = [ "faster-whisper~=1.2.1" ]
|
||||
xai = []
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
|
||||
13
src/pipecat/services/xai/__init__.py
Normal file
13
src/pipecat/services/xai/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .tts import *
|
||||
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "xai", "xai.tts")
|
||||
213
src/pipecat/services/xai/tts.py
Normal file
213
src/pipecat/services/xai/tts.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""xAI text-to-speech service implementation.
|
||||
|
||||
Uses xAI's HTTP TTS endpoint documented at:
|
||||
https://docs.x.ai/developers/model-capabilities/audio/text-to-speech
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
|
||||
from pipecat.services.settings import TTSSettings
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
|
||||
@dataclass
|
||||
class XAITTSSettings(TTSSettings):
|
||||
"""Settings for XAITTSService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class XAITTSService(TTSService):
|
||||
"""xAI HTTP text-to-speech service.
|
||||
|
||||
The service requests raw PCM audio so emitted ``TTSAudioRawFrame`` objects
|
||||
match Pipecat's downstream expectations without extra decoding.
|
||||
"""
|
||||
|
||||
Settings = XAITTSSettings
|
||||
_settings: Settings
|
||||
|
||||
XAI_DEFAULT_SAMPLE_RATE = 24000
|
||||
XAI_PCM_CODEC = "pcm"
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for xAI TTS configuration.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=XAITTSService.Settings(...)`` instead.
|
||||
|
||||
Parameters:
|
||||
language: Language for speech synthesis.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
base_url: str = "https://api.x.ai/v1/tts",
|
||||
voice: Optional[str] = None,
|
||||
language: Optional[str | Language] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
aiohttp_session: Optional[aiohttp.ClientSession] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[Settings] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the xAI TTS service.
|
||||
|
||||
Args:
|
||||
api_key: xAI API key for authentication.
|
||||
base_url: xAI TTS endpoint. Defaults to ``https://api.x.ai/v1/tts``.
|
||||
voice: Voice identifier. Defaults to ``"eve"``.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=XAITTSService.Settings(voice=...)`` instead.
|
||||
|
||||
language: BCP-47 or base language code (for example ``"en"`` or ``"pt-BR"``).
|
||||
Defaults to ``"en"``.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=XAITTSService.Settings(language=...)`` instead.
|
||||
|
||||
sample_rate: Output sample rate for PCM audio. Defaults to 24000 Hz.
|
||||
aiohttp_session: Optional shared aiohttp session.
|
||||
params: Deprecated input parameters object.
|
||||
settings: Runtime-updatable settings. When provided alongside deprecated
|
||||
parameters, ``settings`` values take precedence.
|
||||
**kwargs: Additional keyword arguments passed to ``TTSService``.
|
||||
"""
|
||||
default_settings = self.Settings(
|
||||
model=None,
|
||||
voice="eve",
|
||||
language="en",
|
||||
)
|
||||
|
||||
if voice is not None:
|
||||
self._warn_init_param_moved_to_settings("voice", "voice")
|
||||
default_settings.voice = voice
|
||||
if language is not None:
|
||||
self._warn_init_param_moved_to_settings("language", "language")
|
||||
default_settings.language = (
|
||||
self.language_to_service_language(language)
|
||||
if isinstance(language, Language)
|
||||
else language
|
||||
)
|
||||
|
||||
if params is not None:
|
||||
self._warn_init_param_moved_to_settings("params")
|
||||
if not settings and params.language is not None:
|
||||
default_settings.language = self.language_to_service_language(params.language)
|
||||
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
super().__init__(
|
||||
pause_frame_processing=True,
|
||||
push_start_frame=True,
|
||||
push_stop_frames=True,
|
||||
sample_rate=sample_rate or self.XAI_DEFAULT_SAMPLE_RATE,
|
||||
settings=default_settings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
self._session = aiohttp_session
|
||||
self._session_owner = aiohttp_session is None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics."""
|
||||
return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to xAI's language format."""
|
||||
return str(language)
|
||||
|
||||
async def start(self, frame):
|
||||
"""Start the xAI TTS service."""
|
||||
await super().start(frame)
|
||||
if self._session is None or self._session.closed:
|
||||
self._session = aiohttp.ClientSession()
|
||||
self._session_owner = True
|
||||
|
||||
async def stop(self, frame):
|
||||
"""Stop the xAI TTS service."""
|
||||
await super().stop(frame)
|
||||
await self._close_session()
|
||||
|
||||
async def cancel(self, frame):
|
||||
"""Cancel the xAI TTS service."""
|
||||
await super().cancel(frame)
|
||||
await self._close_session()
|
||||
|
||||
async def _close_session(self):
|
||||
if self._session_owner and self._session and not self._session.closed:
|
||||
await self._session.close()
|
||||
if self._session_owner:
|
||||
self._session = None
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using xAI's TTS API."""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
if self._session is None or self._session.closed:
|
||||
self._session = aiohttp.ClientSession()
|
||||
self._session_owner = True
|
||||
|
||||
payload = {
|
||||
"text": text,
|
||||
"voice_id": self._settings.voice,
|
||||
"output_format": {
|
||||
"codec": self.XAI_PCM_CODEC,
|
||||
"sample_rate": self.sample_rate,
|
||||
},
|
||||
}
|
||||
if self._settings.language:
|
||||
payload["language"] = str(self._settings.language)
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self._api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
measuring_ttfb = True
|
||||
try:
|
||||
async with self._session.post(self._base_url, json=payload, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
error = await response.text(errors="ignore")
|
||||
logger.error(
|
||||
f"{self} error getting audio (status: {response.status}, error: {error})"
|
||||
)
|
||||
yield ErrorFrame(
|
||||
error=f"Error getting audio (status: {response.status}, error: {error})"
|
||||
)
|
||||
return
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
async for chunk in response.content.iter_chunked(self.chunk_size):
|
||||
if not chunk:
|
||||
continue
|
||||
if measuring_ttfb:
|
||||
await self.stop_ttfb_metrics()
|
||||
measuring_ttfb = False
|
||||
yield TTSAudioRawFrame(chunk, self.sample_rate, 1, context_id=context_id)
|
||||
except Exception as e:
|
||||
yield ErrorFrame(error=f"Unknown error occurred: {e}")
|
||||
91
tests/test_xai_tts.py
Normal file
91
tests/test_xai_tts.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""Tests for XAITTSService."""
|
||||
|
||||
import asyncio
|
||||
import unittest
|
||||
|
||||
import aiohttp
|
||||
import pytest
|
||||
from aiohttp import web
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
AggregatedTextFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSSpeakFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TTSTextFrame,
|
||||
)
|
||||
from pipecat.services.xai.tts import XAITTSService
|
||||
from pipecat.tests.utils import run_test
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_xai_tts_success(aiohttp_client):
|
||||
"""xAI TTS should send the documented request body and emit PCM frames."""
|
||||
|
||||
request_bodies = []
|
||||
|
||||
async def handler(request):
|
||||
request_bodies.append(await request.json())
|
||||
|
||||
response = web.StreamResponse(
|
||||
status=200,
|
||||
reason="OK",
|
||||
headers={"Content-Type": "audio/pcm"},
|
||||
)
|
||||
await response.prepare(request)
|
||||
await response.write(b"\x00\x01\x02\x03" * 1024)
|
||||
await asyncio.sleep(0.01)
|
||||
await response.write(b"\x04\x05\x06\x07" * 1024)
|
||||
await response.write_eof()
|
||||
return response
|
||||
|
||||
app = web.Application()
|
||||
app.router.add_post("/v1/tts", handler)
|
||||
client = await aiohttp_client(app)
|
||||
base_url = str(client.make_url("/v1/tts"))
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tts_service = XAITTSService(
|
||||
api_key="test-key",
|
||||
base_url=base_url,
|
||||
aiohttp_session=session,
|
||||
sample_rate=24000,
|
||||
)
|
||||
|
||||
down_frames, _ = await run_test(
|
||||
tts_service,
|
||||
frames_to_send=[TTSSpeakFrame(text="Hello from xAI.")],
|
||||
)
|
||||
|
||||
frame_types = [type(frame) for frame in down_frames]
|
||||
assert AggregatedTextFrame in frame_types
|
||||
assert TTSStartedFrame in frame_types
|
||||
assert TTSStoppedFrame in frame_types
|
||||
assert TTSTextFrame in frame_types
|
||||
|
||||
audio_frames = [frame for frame in down_frames if isinstance(frame, TTSAudioRawFrame)]
|
||||
assert audio_frames
|
||||
assert all(frame.sample_rate == 24000 for frame in audio_frames)
|
||||
assert all(frame.num_channels == 1 for frame in audio_frames)
|
||||
|
||||
assert len(request_bodies) == 1
|
||||
assert request_bodies[0] == {
|
||||
"text": "Hello from xAI.",
|
||||
"voice_id": "eve",
|
||||
"language": "en",
|
||||
"output_format": {
|
||||
"codec": "pcm",
|
||||
"sample_rate": 24000,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user