From 44993fe9e3c55c76c063ce9ccba7b539ae0a2576 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Wed, 25 Feb 2026 14:09:17 -0500 Subject: [PATCH] Remove PlayHT TTS services --- README.md | 26 +- changelog/3838.removed.md | 1 + docs/api/README.md | 3 +- env.example | 4 - .../07e-interruptible-playht-http.py | 125 ---- .../foundational/07e-interruptible-playht.py | 127 ---- .../55t-update-settings-playht-tts.py | 126 ---- pyproject.toml | 1 - src/pipecat/services/playht/__init__.py | 13 - src/pipecat/services/playht/tts.py | 699 ------------------ uv.lock | 6 +- 11 files changed, 16 insertions(+), 1115 deletions(-) create mode 100644 changelog/3838.removed.md delete mode 100644 examples/foundational/07e-interruptible-playht-http.py delete mode 100644 examples/foundational/07e-interruptible-playht.py delete mode 100644 examples/foundational/55t-update-settings-playht-tts.py delete mode 100644 src/pipecat/services/playht/__init__.py delete mode 100644 src/pipecat/services/playht/tts.py diff --git a/README.md b/README.md index 2221e807e..05874be81 100644 --- a/README.md +++ b/README.md @@ -81,19 +81,19 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout ## 🧩 Available services -| Category | Services | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [Hathora](https://docs.pipecat.ai/server/services/stt/hathora), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | -| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hathora](https://docs.pipecat.ai/server/services/tts/hathora), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | -| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), | -| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | -| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) | -| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | -| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | -| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | -| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) | -| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Category | Services | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [Hathora](https://docs.pipecat.ai/server/services/stt/hathora), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) | +| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hathora](https://docs.pipecat.ai/server/services/tts/hathora), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), | +| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | +| Serializers | [Exotel](https://docs.pipecat.ai/server/utilities/serializers/exotel), [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/utilities/serializers/vonage) | +| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | +| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | +| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | +| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter) | +| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services) diff --git a/changelog/3838.removed.md b/changelog/3838.removed.md new file mode 100644 index 000000000..fa811cb71 --- /dev/null +++ b/changelog/3838.removed.md @@ -0,0 +1 @@ +- ⚠️ Removed `PlayHTTTSService` and `PlayHTHttpTTSService`. PlayHT has been shut down and is no longer available. diff --git a/docs/api/README.md b/docs/api/README.md index 22b62d45e..e181bc898 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -42,7 +42,7 @@ This script: - Creates a fresh virtual environment - Installs all dependencies as specified in requirements files -- Handles conflicting dependencies (like grpcio versions for Riva and PlayHT) +- Handles conflicting dependencies (like grpcio versions for Riva) - Builds the documentation in an isolated environment - Provides detailed logging of the build process @@ -74,7 +74,6 @@ start _build/html/index.html ├── index.rst # Main documentation entry point ├── requirements-base.txt # Base documentation dependencies ├── requirements-riva.txt # Riva-specific dependencies -├── requirements-playht.txt # PlayHT-specific dependencies ├── build-docs.sh # Local build script └── rtd-test.py # ReadTheDocs test build script ``` diff --git a/env.example b/env.example index 2b850dd19..82308812e 100644 --- a/env.example +++ b/env.example @@ -147,10 +147,6 @@ KOALA_ACCESS_KEY=... # Piper PIPER_BASE_URL=... -# PlayHT -PLAYHT_USER_ID=... -PLAYHT_API_KEY=... - # Plivo PLIVO_AUTH_ID=... PLIVO_AUTH_TOKEN=... diff --git a/examples/foundational/07e-interruptible-playht-http.py b/examples/foundational/07e-interruptible-playht-http.py deleted file mode 100644 index c56de3b9f..000000000 --- a/examples/foundational/07e-interruptible-playht-http.py +++ /dev/null @@ -1,125 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - - -import os - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.runner.types import RunnerArguments -from pipecat.runner.utils import create_transport -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.playht.tts import PlayHTHttpTTSService -from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.transports.daily.transport import DailyParams -from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams - -load_dotenv(override=True) - -# We use lambdas to defer transport parameter creation until the transport -# type is selected at runtime. -transport_params = { - "daily": lambda: DailyParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "twilio": lambda: FastAPIWebsocketParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "webrtc": lambda: TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), -} - - -async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): - logger.info(f"Starting bot") - - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - tts = PlayHTHttpTTSService( - user_id=os.getenv("PLAYHT_USER_ID"), - api_key=os.getenv("PLAYHT_API_KEY"), - voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json", - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", - }, - ] - - context = LLMContext(messages) - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), - ) - - pipeline = Pipeline( - [ - transport.input(), # Transport user input - stt, - user_aggregator, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - assistant_aggregator, # Assistant spoken responses - ] - ) - - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, - ) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - # Kick off the conversation. - messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([LLMRunFrame()]) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - await task.cancel() - - runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) - - await runner.run(task) - - -async def bot(runner_args: RunnerArguments): - """Main bot entry point compatible with Pipecat Cloud.""" - transport = await create_transport(runner_args, transport_params) - await run_bot(transport, runner_args) - - -if __name__ == "__main__": - from pipecat.runner.run import main - - main() diff --git a/examples/foundational/07e-interruptible-playht.py b/examples/foundational/07e-interruptible-playht.py deleted file mode 100644 index b42f8f6a2..000000000 --- a/examples/foundational/07e-interruptible-playht.py +++ /dev/null @@ -1,127 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - - -import os - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.runner.types import RunnerArguments -from pipecat.runner.utils import create_transport -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.playht.tts import PlayHTTTSService -from pipecat.transcriptions.language import Language -from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.transports.daily.transport import DailyParams -from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams - -load_dotenv(override=True) - -# We use lambdas to defer transport parameter creation until the transport -# type is selected at runtime. -transport_params = { - "daily": lambda: DailyParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "twilio": lambda: FastAPIWebsocketParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "webrtc": lambda: TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), -} - - -async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): - logger.info(f"Starting bot") - - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - tts = PlayHTTTSService( - user_id=os.getenv("PLAYHT_USER_ID"), - api_key=os.getenv("PLAYHT_API_KEY"), - voice_url="s3://voice-cloning-zero-shot/e46b4027-b38d-4d24-b292-38fbca2be0ef/original/manifest.json", - params=PlayHTTTSService.InputParams(language=Language.EN), - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", - }, - ] - - context = LLMContext(messages) - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), - ) - - pipeline = Pipeline( - [ - transport.input(), # Transport user input - stt, - user_aggregator, # User responses - llm, # LLM - tts, # TTS - transport.output(), # Transport bot output - assistant_aggregator, # Assistant spoken responses - ] - ) - - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, - ) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - # Kick off the conversation. - messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([LLMRunFrame()]) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - await task.cancel() - - runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) - - await runner.run(task) - - -async def bot(runner_args: RunnerArguments): - """Main bot entry point compatible with Pipecat Cloud.""" - transport = await create_transport(runner_args, transport_params) - await run_bot(transport, runner_args) - - -if __name__ == "__main__": - from pipecat.runner.run import main - - main() diff --git a/examples/foundational/55t-update-settings-playht-tts.py b/examples/foundational/55t-update-settings-playht-tts.py deleted file mode 100644 index d79120d99..000000000 --- a/examples/foundational/55t-update-settings-playht-tts.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import asyncio -import os - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame, TTSUpdateSettingsFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.runner.types import RunnerArguments -from pipecat.runner.utils import create_transport -from pipecat.services.deepgram.stt import DeepgramSTTService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.services.playht.tts import PlayHTTTSService, PlayHTTTSSettings -from pipecat.transports.base_transport import BaseTransport, TransportParams -from pipecat.transports.daily.transport import DailyParams -from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams - -load_dotenv(override=True) - -transport_params = { - "daily": lambda: DailyParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "twilio": lambda: FastAPIWebsocketParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), - "webrtc": lambda: TransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ), -} - - -async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): - logger.info(f"Starting bot") - - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) - - tts = PlayHTTTSService( - api_key=os.getenv("PLAYHT_API_KEY"), - user_id=os.getenv("PLAYHT_USER_ID"), - voice_url=os.getenv("PLAYHT_VOICE_URL", ""), - ) - - llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - - messages = [ - { - "role": "system", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", - }, - ] - - context = LLMContext(messages) - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()), - ) - - pipeline = Pipeline( - [ - transport.input(), - stt, - user_aggregator, - llm, - tts, - transport.output(), - assistant_aggregator, - ] - ) - - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, - ) - - @transport.event_handler("on_client_connected") - async def on_client_connected(transport, client): - logger.info(f"Client connected") - messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([LLMRunFrame()]) - - await asyncio.sleep(10) - logger.info("Updating PlayHT TTS settings: speed=1.3") - await task.queue_frame(TTSUpdateSettingsFrame(delta=PlayHTTTSSettings(speed=1.3))) - - @transport.event_handler("on_client_disconnected") - async def on_client_disconnected(transport, client): - logger.info(f"Client disconnected") - await task.cancel() - - runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) - - await runner.run(task) - - -async def bot(runner_args: RunnerArguments): - """Main bot entry point compatible with Pipecat Cloud.""" - transport = await create_transport(runner_args, transport_params) - await run_bot(transport, runner_args) - - -if __name__ == "__main__": - from pipecat.runner.run import main - - main() diff --git a/pyproject.toml b/pyproject.toml index a45ebb3b3..a925e70d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,6 @@ openpipe = [ "openpipe>=4.50.0,<6" ] openrouter = [] perplexity = [] piper = [ "piper-tts>=1.3.0,<2", "requests>=2.32.5,<3" ] -playht = [ "pipecat-ai[websockets-base]" ] qwen = [] remote-smart-turn = [] resembleai = [ "pipecat-ai[websockets-base]" ] diff --git a/src/pipecat/services/playht/__init__.py b/src/pipecat/services/playht/__init__.py deleted file mode 100644 index 500ea0fdc..000000000 --- a/src/pipecat/services/playht/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -import sys - -from pipecat.services import DeprecatedModuleProxy - -from .tts import * - -sys.modules[__name__] = DeprecatedModuleProxy(globals(), "playht", "playht.tts") diff --git a/src/pipecat/services/playht/tts.py b/src/pipecat/services/playht/tts.py deleted file mode 100644 index 08a87209c..000000000 --- a/src/pipecat/services/playht/tts.py +++ /dev/null @@ -1,699 +0,0 @@ -# -# Copyright (c) 2024-2026, Daily -# -# SPDX-License-Identifier: BSD 2-Clause License -# - -"""PlayHT text-to-speech service implementations. - -This module provides integration with PlayHT's text-to-speech API -supporting both WebSocket streaming and HTTP-based synthesis. -""" - -import io -import json -import struct -import uuid -import warnings -from dataclasses import dataclass, field -from typing import Any, AsyncGenerator, Optional - -import aiohttp -from loguru import logger -from pydantic import BaseModel - -from pipecat.frames.frames import ( - CancelFrame, - EndFrame, - ErrorFrame, - Frame, - InterruptionFrame, - StartFrame, - TTSAudioRawFrame, - TTSStartedFrame, - TTSStoppedFrame, -) -from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven -from pipecat.services.tts_service import InterruptibleTTSService, TTSService -from pipecat.transcriptions.language import Language, resolve_language -from pipecat.utils.tracing.service_decorators import traced_tts - -try: - from websockets.asyncio.client import connect as websocket_connect - from websockets.protocol import State -except ModuleNotFoundError as e: - logger.error(f"Exception: {e}") - logger.error("In order to use PlayHTTTSService, you need to `pip install pipecat-ai[playht]`.") - raise Exception(f"Missing module: {e}") - - -def language_to_playht_language(language: Language) -> Optional[str]: - """Convert a Language enum to PlayHT language code. - - Args: - language: The Language enum value to convert. - - Returns: - The corresponding PlayHT language code, or None if not supported. - """ - LANGUAGE_MAP = { - Language.AF: "afrikans", - Language.AM: "amharic", - Language.AR: "arabic", - Language.BN: "bengali", - Language.BG: "bulgarian", - Language.CA: "catalan", - Language.CS: "czech", - Language.DA: "danish", - Language.DE: "german", - Language.EL: "greek", - Language.EN: "english", - Language.ES: "spanish", - Language.FR: "french", - Language.GL: "galician", - Language.HE: "hebrew", - Language.HI: "hindi", - Language.HR: "croatian", - Language.HU: "hungarian", - Language.ID: "indonesian", - Language.IT: "italian", - Language.JA: "japanese", - Language.KO: "korean", - Language.MS: "malay", - Language.NL: "dutch", - Language.PL: "polish", - Language.PT: "portuguese", - Language.RU: "russian", - Language.SQ: "albanian", - Language.SR: "serbian", - Language.SV: "swedish", - Language.TH: "thai", - Language.TL: "tagalog", - Language.TR: "turkish", - Language.UK: "ukrainian", - Language.UR: "urdu", - Language.XH: "xhosa", - Language.ZH: "mandarin", - } - - return resolve_language(language, LANGUAGE_MAP, use_base_code=False) - - -@dataclass -class PlayHTTTSSettings(TTSSettings): - """Settings for PlayHT TTS services. - - Parameters: - output_format: Audio output format. - voice_engine: Voice engine to use. - speed: Speech speed multiplier. Defaults to 1.0. - seed: Random seed for voice consistency. - playht_sample_rate: Audio sample rate sent to the API. - """ - - output_format: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - voice_engine: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - seed: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - playht_sample_rate: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - - -class PlayHTTTSService(InterruptibleTTSService): - """PlayHT WebSocket-based text-to-speech service. - - .. deprecated:: 0.0.88 - - This class is deprecated and will be removed in a future version. - PlayHT is shutting down their API on December 31st, 2025. - - Provides real-time text-to-speech synthesis using PlayHT's WebSocket API. - Supports streaming audio generation with configurable voice engines and - language settings. - """ - - _settings: PlayHTTTSSettings - - class InputParams(BaseModel): - """Input parameters for PlayHT TTS configuration. - - Parameters: - language: Language for synthesis. Defaults to English. - speed: Speech speed multiplier. Defaults to 1.0. - seed: Random seed for voice consistency. - """ - - language: Optional[Language] = Language.EN - speed: Optional[float] = 1.0 - seed: Optional[int] = None - - def __init__( - self, - *, - api_key: str, - user_id: str, - voice_url: str, - voice_engine: str = "Play3.0-mini", - sample_rate: Optional[int] = None, - output_format: str = "wav", - params: Optional[InputParams] = None, - **kwargs, - ): - """Initialize the PlayHT WebSocket TTS service. - - Args: - api_key: PlayHT API key for authentication. - user_id: PlayHT user ID for authentication. - voice_url: URL of the voice to use for synthesis. - voice_engine: Voice engine to use. Defaults to "Play3.0-mini". - sample_rate: Audio sample rate. If None, uses default. - output_format: Audio output format. Defaults to "wav". - params: Additional input parameters for voice customization. - **kwargs: Additional arguments passed to parent InterruptibleTTSService. - """ - super().__init__( - pause_frame_processing=True, - sample_rate=sample_rate, - **kwargs, - ) - - with warnings.catch_warnings(): - warnings.simplefilter("always") - warnings.warn( - "PlayHT is shutting down their API on December 31st, 2025. " - "'PlayHTTTSService' is deprecated and will be removed in a future version.", - DeprecationWarning, - stacklevel=2, - ) - - params = params or PlayHTTTSService.InputParams() - - self._api_key = api_key - self._user_id = user_id - self._websocket_url = None - self._receive_task = None - self._context_id = None - - self._settings = PlayHTTTSSettings( - model=voice_engine, - voice=voice_url, - language=self.language_to_service_language(params.language) - if params.language - else "english", - output_format=output_format, - voice_engine=voice_engine, - speed=params.speed, - seed=params.seed, - playht_sample_rate=0, - ) - self._sync_model_name_to_metrics() - - def can_generate_metrics(self) -> bool: - """Check if this service can generate processing metrics. - - Returns: - True, as PlayHT service supports metrics generation. - """ - return True - - async def _update_settings(self, delta: TTSSettings) -> dict[str, Any]: - """Apply a settings delta. - - Settings are stored but not applied to the active connection. - """ - changed = await super()._update_settings(delta) - - if not changed: - return changed - - # TODO: someday we could reconnect here to apply updated settings. - # Code might look something like the below: - # await self._disconnect() - # await self._connect() - - self._warn_unhandled_updated_settings(changed) - - return changed - - def language_to_service_language(self, language: Language) -> Optional[str]: - """Convert a Language enum to PlayHT service language format. - - Args: - language: The language to convert. - - Returns: - The PlayHT-specific language code, or None if not supported. - """ - return language_to_playht_language(language) - - async def start(self, frame: StartFrame): - """Start the PlayHT TTS service. - - Args: - frame: The start frame containing initialization parameters. - """ - await super().start(frame) - await self._connect() - - async def stop(self, frame: EndFrame): - """Stop the PlayHT TTS service. - - Args: - frame: The end frame. - """ - await super().stop(frame) - await self._disconnect() - - async def cancel(self, frame: CancelFrame): - """Cancel the PlayHT TTS service. - - Args: - frame: The cancel frame. - """ - await super().cancel(frame) - await self._disconnect() - - async def _connect(self): - """Connect to PlayHT WebSocket and start receive task.""" - await super()._connect() - - await self._connect_websocket() - - if self._websocket and not self._receive_task: - self._receive_task = self.create_task(self._receive_task_handler(self._report_error)) - - async def _disconnect(self): - """Disconnect from PlayHT WebSocket and clean up tasks.""" - await super()._disconnect() - - if self._receive_task: - await self.cancel_task(self._receive_task) - self._receive_task = None - - await self._disconnect_websocket() - - async def _connect_websocket(self): - """Connect to PlayHT websocket.""" - try: - if self._websocket and self._websocket.state is State.OPEN: - return - - logger.debug("Connecting to PlayHT") - - if not self._websocket_url: - await self._get_websocket_url() - - if not isinstance(self._websocket_url, str): - raise ValueError("WebSocket URL is not a string") - - self._websocket = await websocket_connect(self._websocket_url) - - await self._call_event_handler("on_connected") - except ValueError as e: - logger.error(f"{self} initialization error: {e}") - self._websocket = None - await self._call_event_handler("on_connection_error", f"{e}") - except Exception as e: - await self.push_error(error_msg=f"Error connecting: {e}", exception=e) - self._websocket = None - await self._call_event_handler("on_connection_error", f"{e}") - - async def _disconnect_websocket(self): - """Disconnect from PlayHT websocket.""" - try: - await self.stop_all_metrics() - - if self._websocket: - logger.debug("Disconnecting from PlayHT") - await self._websocket.close() - except Exception as e: - await self.push_error(error_msg=f"Error disconnecting: {e}", exception=e) - finally: - self._context_id = None - self._websocket = None - await self._call_event_handler("on_disconnected") - - async def _get_websocket_url(self): - """Retrieve WebSocket URL from PlayHT API.""" - async with aiohttp.ClientSession() as session: - async with session.post( - "https://api.play.ht/api/v4/websocket-auth", - headers={ - "Authorization": f"Bearer {self._api_key}", - "X-User-Id": self._user_id, - "Content-Type": "application/json", - }, - ) as response: - if response.status in (200, 201): - data = await response.json() - # Handle the new response format with multiple URLs - if "websocket_urls" in data: - # Select URL based on voice_engine - if self._settings.voice_engine in data["websocket_urls"]: - self._websocket_url = data["websocket_urls"][ - self._settings.voice_engine - ] - else: - raise ValueError( - f"Unsupported voice engine: {self._settings.voice_engine}" - ) - else: - raise ValueError("Invalid response: missing websocket_urls") - else: - raise Exception(f"Failed to get WebSocket URL: {response.status}") - - def _get_websocket(self): - """Get the WebSocket connection if available.""" - if self._websocket: - return self._websocket - raise Exception("Websocket not connected") - - def create_context_id(self) -> str: - """Generate a unique context ID for a TTS request in case we don't have one already in progress. - - Returns: - A unique string identifier for the TTS context. - """ - # If a context ID does not exist, create a new one. - # If an ID exists, continue using the current ID. - # When interruptions happen, user speech results in - # an interruption, which resets the context ID. - if not self._context_id: - return str(uuid.uuid4()) - return self._context_id - - async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection): - """Handle interruption by stopping metrics and clearing request ID.""" - await super()._handle_interruption(frame, direction) - await self.stop_all_metrics() - self._context_id = None - - async def _receive_messages(self): - """Receive messages from PlayHT websocket.""" - async for message in self._get_websocket(): - if isinstance(message, bytes): - # Skip the WAV header message - if message.startswith(b"RIFF"): - continue - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(message, self.sample_rate, 1, context_id=self._context_id) - await self.push_frame(frame) - else: - logger.debug(f"Received text message: {message}") - try: - msg = json.loads(message) - if msg.get("type") == "start": - # Handle start of stream - logger.debug(f"Started processing request: {msg.get('request_id')}") - elif msg.get("type") == "end": - # Handle end of stream - if "request_id" in msg and msg["request_id"] == self._context_id: - await self.push_frame(TTSStoppedFrame(context_id=self._context_id)) - self._context_id = None - elif "error" in msg: - await self.push_error(error_msg=f"Error: {msg['error']}") - except json.JSONDecodeError: - logger.error(f"Invalid JSON message: {message}") - - @traced_tts - async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: - """Generate TTS audio from text using PlayHT's WebSocket API. - - Args: - text: The text to synthesize into speech. - context_id: The context ID for tracking audio frames. - - Yields: - Frame: Audio frames containing the synthesized speech. - """ - logger.debug(f"{self}: Generating TTS [{text}]") - - try: - # Reconnect if the websocket is closed - if not self._websocket or self._websocket.state is State.CLOSED: - await self._connect() - - if not self._context_id: - await self.start_ttfb_metrics() - yield TTSStartedFrame(context_id=context_id) - self._context_id = context_id - - tts_command = { - "text": text, - "voice": self._settings.voice, - "voice_engine": self._settings.voice_engine, - "output_format": self._settings.output_format, - "sample_rate": self.sample_rate, - "language": self._settings.language, - "speed": self._settings.speed, - "seed": self._settings.seed, - "request_id": self._context_id, - } - - try: - await self._get_websocket().send(json.dumps(tts_command)) - await self.start_tts_usage_metrics(text) - except Exception as e: - yield ErrorFrame(error=f"Unknown error occurred: {e}") - yield TTSStoppedFrame(context_id=context_id) - await self._disconnect() - await self._connect() - return - - # The actual audio frames will be handled in _receive_task_handler - yield None - - except Exception as e: - yield ErrorFrame(error=f"Unknown error occurred: {e}") - - -class PlayHTHttpTTSService(TTSService): - """PlayHT HTTP-based text-to-speech service. - - .. deprecated:: 0.0.88 - - This class is deprecated and will be removed in a future version. - PlayHT is shutting down their API on December 31st, 2025. - - Provides text-to-speech synthesis using PlayHT's HTTP API for simpler, - non-streaming synthesis. Suitable for use cases where streaming is not - required and simpler integration is preferred. - """ - - _settings: PlayHTTTSSettings - - class InputParams(BaseModel): - """Input parameters for PlayHT HTTP TTS configuration. - - Parameters: - language: Language for synthesis. Defaults to English. - speed: Speech speed multiplier. Defaults to 1.0. - seed: Random seed for voice consistency. - """ - - language: Optional[Language] = Language.EN - speed: Optional[float] = 1.0 - seed: Optional[int] = None - - def __init__( - self, - *, - api_key: str, - user_id: str, - voice_url: str, - voice_engine: str = "Play3.0-mini", - protocol: Optional[str] = None, - output_format: str = "wav", - sample_rate: Optional[int] = None, - params: Optional[InputParams] = None, - **kwargs, - ): - """Initialize the PlayHT HTTP TTS service. - - Args: - api_key: PlayHT API key for authentication. - user_id: PlayHT user ID for authentication. - voice_url: URL of the voice to use for synthesis. - voice_engine: Voice engine to use. Defaults to "Play3.0-mini". - protocol: Protocol to use ("http" or "ws"). - - .. deprecated:: 0.0.80 - This parameter no longer has any effect and will be removed in a future version. - Use PlayHTTTSService for WebSocket or PlayHTHttpTTSService for HTTP. - - output_format: Audio output format. Defaults to "wav". - sample_rate: Audio sample rate. If None, uses default. - params: Additional input parameters for voice customization. - **kwargs: Additional arguments passed to parent TTSService. - """ - super().__init__(sample_rate=sample_rate, **kwargs) - - # Warn about deprecated protocol parameter if explicitly provided - if protocol: - with warnings.catch_warnings(): - warnings.simplefilter("always") - warnings.warn( - "The 'protocol' parameter is deprecated and will be removed in a future version.", - DeprecationWarning, - stacklevel=2, - ) - - with warnings.catch_warnings(): - warnings.simplefilter("always") - warnings.warn( - "PlayHT is shutting down their API on December 31st, 2025. " - "'PlayHTHttpTTSService' is deprecated and will be removed in a future version.", - DeprecationWarning, - stacklevel=2, - ) - - params = params or PlayHTHttpTTSService.InputParams() - - self._user_id = user_id - self._api_key = api_key - - # Check if voice_engine contains protocol information (backward compatibility) - if "-http" in voice_engine: - # Extract the base engine name - voice_engine = voice_engine.replace("-http", "") - elif "-ws" in voice_engine: - # Extract the base engine name - voice_engine = voice_engine.replace("-ws", "") - - self._settings = PlayHTTTSSettings( - model=voice_engine, - voice=voice_url, - language=self.language_to_service_language(params.language) - if params.language - else "english", - output_format=output_format, - voice_engine=voice_engine, - speed=params.speed, - seed=params.seed, - playht_sample_rate=0, - ) - self._sync_model_name_to_metrics() - - async def start(self, frame: StartFrame): - """Start the PlayHT HTTP TTS service. - - Args: - frame: The start frame containing initialization parameters. - """ - await super().start(frame) - self._settings.playht_sample_rate = self.sample_rate - - def can_generate_metrics(self) -> bool: - """Check if this service can generate processing metrics. - - Returns: - True, as PlayHT HTTP service supports metrics generation. - """ - return True - - def language_to_service_language(self, language: Language) -> Optional[str]: - """Convert a Language enum to PlayHT service language format. - - Args: - language: The language to convert. - - Returns: - The PlayHT-specific language code, or None if not supported. - """ - return language_to_playht_language(language) - - @traced_tts - async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: - """Generate TTS audio from text using PlayHT's HTTP API. - - Args: - text: The text to synthesize into speech. - context_id: The context ID for tracking audio frames. - - Yields: - Frame: Audio frames containing the synthesized speech. - """ - logger.debug(f"{self}: Generating TTS [{text}]") - - try: - await self.start_ttfb_metrics() - - # Prepare the request payload - payload = { - "text": text, - "voice": self._settings.voice, - "voice_engine": self._settings.voice_engine, - "output_format": self._settings.output_format, - "sample_rate": self.sample_rate, - "language": self._settings.language, - } - - # Add optional parameters if they exist - if self._settings.speed is not None: - payload["speed"] = self._settings.speed - if self._settings.seed is not None: - payload["seed"] = self._settings.seed - - headers = { - "Authorization": f"Bearer {self._api_key}", - "X-User-Id": self._user_id, - "Content-Type": "application/json", - "Accept": "*/*", - } - - await self.start_tts_usage_metrics(text) - - yield TTSStartedFrame(context_id=context_id) - - async with aiohttp.ClientSession() as session: - async with session.post( - "https://api.play.ht/api/v2/tts/stream", - headers=headers, - json=payload, - ) as response: - if response.status not in (200, 201): - error_text = await response.text() - raise Exception(f"PlayHT API error {response.status}: {error_text}") - - in_header = True - buffer = b"" - - CHUNK_SIZE = self.chunk_size - - async for chunk in response.content.iter_chunked(CHUNK_SIZE): - if len(chunk) == 0: - continue - - # Skip the RIFF header - if in_header: - buffer += chunk - if len(buffer) <= 36: - continue - else: - fh = io.BytesIO(buffer) - fh.seek(36) - (data, size) = struct.unpack("<4sI", fh.read(8)) - while data != b"data": - fh.read(size) - (data, size) = struct.unpack("<4sI", fh.read(8)) - # Extract audio data after header - audio_data = buffer[fh.tell() :] - if len(audio_data) > 0: - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame( - audio_data, self.sample_rate, 1, context_id=context_id - ) - yield frame - in_header = False - elif len(chunk) > 0: - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame( - chunk, self.sample_rate, 1, context_id=context_id - ) - yield frame - - except Exception as e: - yield ErrorFrame(error=f"Unknown error occurred: {e}") - finally: - await self.stop_ttfb_metrics() - yield TTSStoppedFrame(context_id=context_id) diff --git a/uv.lock b/uv.lock index bd2f64639..e2615b170 100644 --- a/uv.lock +++ b/uv.lock @@ -4550,9 +4550,6 @@ piper = [ { name = "piper-tts" }, { name = "requests" }, ] -playht = [ - { name = "websockets" }, -] resembleai = [ { name = "websockets" }, ] @@ -4722,7 +4719,6 @@ requires-dist = [ { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'lmnt'" }, { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'neuphonic'" }, { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'openai'" }, - { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'playht'" }, { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'resembleai'" }, { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'rime'" }, { name = "pipecat-ai", extras = ["websockets-base"], marker = "extra == 'sarvam'" }, @@ -4763,7 +4759,7 @@ requires-dist = [ { name = "wait-for2", marker = "python_full_version < '3.12'", specifier = ">=0.4.1" }, { name = "websockets", marker = "extra == 'websockets-base'", specifier = ">=13.1,<16.0" }, ] -provides-extras = ["aic", "anthropic", "assemblyai", "asyncai", "aws", "aws-nova-sonic", "azure", "cartesia", "camb", "cerebras", "daily", "deepgram", "deepseek", "elevenlabs", "fal", "fireworks", "fish", "gladia", "google", "gradium", "grok", "groq", "gstreamer", "heygen", "hume", "inworld", "koala", "kokoro", "krisp", "langchain", "livekit", "lmnt", "local", "local-smart-turn", "mcp", "mem0", "mistral", "mlx-whisper", "moondream", "neuphonic", "noisereduce", "nvidia", "openai", "rnnoise", "openpipe", "openrouter", "perplexity", "piper", "playht", "qwen", "remote-smart-turn", "resembleai", "rime", "riva", "runner", "sagemaker", "sambanova", "sarvam", "sentry", "silero", "simli", "soniox", "soundfile", "speechmatics", "strands", "tavus", "together", "tracing", "ultravox", "webrtc", "websocket", "websockets-base", "whisper"] +provides-extras = ["aic", "anthropic", "assemblyai", "asyncai", "aws", "aws-nova-sonic", "azure", "cartesia", "camb", "cerebras", "daily", "deepgram", "deepseek", "elevenlabs", "fal", "fireworks", "fish", "gladia", "google", "gradium", "grok", "groq", "gstreamer", "heygen", "hume", "inworld", "koala", "kokoro", "krisp", "langchain", "livekit", "lmnt", "local", "local-smart-turn", "mcp", "mem0", "mistral", "mlx-whisper", "moondream", "neuphonic", "noisereduce", "nvidia", "openai", "rnnoise", "openpipe", "openrouter", "perplexity", "piper", "qwen", "remote-smart-turn", "resembleai", "rime", "riva", "runner", "sagemaker", "sambanova", "sarvam", "sentry", "silero", "simli", "soniox", "soundfile", "speechmatics", "strands", "tavus", "together", "tracing", "ultravox", "webrtc", "websocket", "websockets-base", "whisper"] [package.metadata.requires-dev] dev = [