add Hume example, small fixes
This commit is contained in:
@@ -81,7 +81,7 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
|
||||
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |
|
||||
|
||||
124
examples/foundational/07ad-interruptible-hume.py
Normal file
124
examples/foundational/07ad-interruptible-hume.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import StartFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
|
||||
from pipecat.transports.services.daily import DailyParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
# Each entry maps a transport name to a zero-argument factory instead of a
# ready-made params object. Objects such as SileroVADAnalyzer are therefore
# only instantiated when the factory for the selected transport is called.
transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
        # Only this entry sets the transport's output rate to HUME_SAMPLE_RATE.
        audio_out_sample_rate=HUME_SAMPLE_RATE,
    ),
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run a voice bot that answers through Hume text-to-speech.

    Assembles a pipeline of transport input, Deepgram STT, user context
    aggregation, OpenAI LLM, Hume TTS, transport output and assistant context
    aggregation. It then registers client connect/disconnect handlers on the
    transport and awaits the pipeline runner.

    Args:
        transport: Transport supplying the pipeline's input and output
            processors and emitting the client connection events.
        runner_args: Runner arguments; provides ``pipeline_idle_timeout_secs``
            for the task and ``handle_sigint`` for the runner.
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = HumeTTSService(
        api_key=os.getenv("HUME_API_KEY"),
        # Replace with your Hume voice ID
        voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
    )

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    # This list is handed to the LLM context below and is appended to again
    # when a client connects.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
        },
    ]

    context = OpenAILLMContext(messages)
    context_aggregator = llm.create_context_aggregator(context)

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")
        # Kick off the conversation.
        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
        await task.queue_frames([context_aggregator.user().get_context_frame()])

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Cancel the pipeline task once the client has gone away.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
    """Entry point for the bot, compatible with Pipecat Cloud.

    Creates the transport from ``runner_args`` and ``transport_params``, then
    hands both to ``run_bot``.

    Args:
        runner_args: Runner arguments describing the session to serve.
    """
    # NOTE(review): this unconditionally overwrites any `transport` value
    # already present on runner_args. Presumably it pins the example to the
    # "webrtc" entry of transport_params - confirm this is intentional.
    runner_args.transport = "webrtc"

    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The runner is imported here, so it is only loaded when this file is
    # executed as a script.
    from pipecat.runner.run import main as runner_main

    runner_main()
|
||||
@@ -112,6 +112,11 @@ webrtc = [ "aiortc>=1.13.0,<2", "opencv-python>=4.11.0.86,<5" ]
|
||||
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.117.0" ]
|
||||
websockets-base = [ "websockets>=13.1,<16.0" ]
|
||||
whisper = [ "faster-whisper~=1.1.1" ]
|
||||
fastapi = [
|
||||
"fastapi",
|
||||
"uvicorn",
|
||||
"websockets",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
|
||||
@@ -103,6 +103,7 @@ TESTS_07 = [
|
||||
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
("07ad-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
|
||||
# Needs a Krisp license.
|
||||
|
||||
@@ -3,11 +3,3 @@
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .tts import *
|
||||
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")
|
||||
@@ -3,6 +3,7 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
|
||||
"""Hume Text-to-Speech service implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
@@ -26,8 +27,8 @@ from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
try:
|
||||
from hume import AsyncHumeClient
|
||||
from hume.tts import (
|
||||
PostedUtterance,
|
||||
FormatPcm,
|
||||
PostedUtterance,
|
||||
PostedUtteranceVoiceWithId,
|
||||
)
|
||||
except ModuleNotFoundError as e: # pragma: no cover - import-time guidance
|
||||
@@ -45,24 +46,21 @@ class HumeTTSService(TTSService):
|
||||
Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
|
||||
using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
api_key:
|
||||
Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
|
||||
voice_id:
|
||||
**Required**: ID of the voice to use (ID-only; names are not supported here).
|
||||
params:
|
||||
Optional synthesis controls (acting instructions, speed, trailing silence).
|
||||
sample_rate:
|
||||
Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
|
||||
Supported features:
|
||||
|
||||
- Generates speech from text using Hume TTS.
|
||||
- Streams PCM audio.
|
||||
- Supports dynamic updates of voice and synthesis parameters at runtime.
|
||||
- Provides metrics for Time To First Byte (TTFB) and TTS usage.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Optional synthesis parameters for Hume TTS.
|
||||
|
||||
description: Natural-language acting directions (≤100 chars)
|
||||
speed: Speaking-rate multiplier (0.5-2.0)
|
||||
trailing_silence: Seconds of silence to append at the end (0-5)
|
||||
Parameters:
|
||||
description: Natural-language acting directions (up to 100 characters).
|
||||
speed: Speaking-rate multiplier (0.5-2.0).
|
||||
trailing_silence: Seconds of silence to append at the end (0-5).
|
||||
"""
|
||||
|
||||
description: Optional[str] = None
|
||||
@@ -78,6 +76,15 @@ class HumeTTSService(TTSService):
|
||||
sample_rate: Optional[int] = HUME_SAMPLE_RATE,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Initialize the HumeTTSService.
|
||||
|
||||
Args:
|
||||
api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
|
||||
voice_id: ID of the voice to use (ID-only; names are not supported here).
|
||||
params: Optional synthesis controls (acting instructions, speed, trailing silence).
|
||||
sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
|
||||
**kwargs: Additional arguments passed to the parent class.
|
||||
"""
|
||||
api_key = api_key or os.getenv("HUME_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
|
||||
@@ -88,9 +95,6 @@ class HumeTTSService(TTSService):
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
aggregate_sentences=True,
|
||||
push_text_frames=False,
|
||||
push_stop_frames=True,
|
||||
pause_frame_processing=True,
|
||||
sample_rate=sample_rate,
|
||||
**kwargs,
|
||||
@@ -102,20 +106,34 @@ class HumeTTSService(TTSService):
|
||||
# Store voice in the base class (mirrors other services)
|
||||
self.set_voice(voice_id)
|
||||
|
||||
self._audio_bytes = b""
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Can generate metrics.
|
||||
|
||||
Returns:
|
||||
True if metrics can be generated, False otherwise.
|
||||
"""
|
||||
return True
|
||||
|
||||
async def start(self, frame: StartFrame) -> None:
|
||||
"""Start the service.
|
||||
|
||||
Args:
|
||||
frame: The start frame.
|
||||
"""
|
||||
await super().start(frame)
|
||||
|
||||
async def update_setting(self, key: str, value: Any) -> None:
|
||||
"""Runtime updates via `TTSUpdateSettingsFrame`.
|
||||
|
||||
Recognized keys:
|
||||
- "voice_id"
|
||||
- "description"
|
||||
- "speed"
|
||||
- "trailing_silence"
|
||||
Args:
|
||||
key: The name of the setting to update. Recognized keys are:
|
||||
- "voice_id"
|
||||
- "description"
|
||||
- "speed"
|
||||
- "trailing_silence"
|
||||
value: The new value for the setting.
|
||||
"""
|
||||
key_l = (key or "").lower()
|
||||
|
||||
@@ -134,13 +152,22 @@ class HumeTTSService(TTSService):
|
||||
|
||||
@traced_tts
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using Hume TTS."""
|
||||
"""Generate speech from text using Hume TTS.
|
||||
|
||||
Args:
|
||||
text: The text to be synthesized.
|
||||
|
||||
Returns:
|
||||
An async generator that yields `Frame` objects, including
|
||||
`TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
|
||||
`TTSStoppedFrame`.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating Hume TTS: [{text}]")
|
||||
|
||||
# Build the request payload
|
||||
utterance_kwargs: dict[str, Any] = {
|
||||
"text": text,
|
||||
"voice": PostedUtteranceVoiceWithId(id=self.voice),
|
||||
"voice": PostedUtteranceVoiceWithId(id=self._voice_id),
|
||||
}
|
||||
if self._params.description is not None:
|
||||
utterance_kwargs["description"] = self._params.description
|
||||
@@ -161,6 +188,10 @@ class HumeTTSService(TTSService):
|
||||
|
||||
try:
|
||||
# Instant mode is always enabled here (not user-configurable)
|
||||
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
|
||||
# We buffer audio bytes before sending to prevent glitches.
|
||||
self._audio_bytes = b""
|
||||
first_audio_sent = False
|
||||
async for chunk in self._client.tts.synthesize_json_streaming(
|
||||
utterances=[utterance],
|
||||
format=pcm_fmt,
|
||||
@@ -171,18 +202,34 @@ class HumeTTSService(TTSService):
|
||||
continue
|
||||
|
||||
pcm_bytes = base64.b64decode(audio_b64)
|
||||
self._audio_bytes += pcm_bytes
|
||||
|
||||
if measuring_ttfb:
|
||||
await self.stop_ttfb_metrics()
|
||||
measuring_ttfb = False
|
||||
# Send the first audio chunk immediately to avoid client-side delays.
|
||||
if not first_audio_sent:
|
||||
if self._audio_bytes:
|
||||
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
|
||||
if measuring_ttfb:
|
||||
await self.stop_ttfb_metrics()
|
||||
measuring_ttfb = False
|
||||
first_audio_sent = True
|
||||
# Do NOT clear _audio_bytes here. Subsequent chunks will build on this.
|
||||
continue
|
||||
|
||||
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
|
||||
yield TTSAudioRawFrame(pcm_bytes, self.sample_rate, 1)
|
||||
# Buffer audio until we have enough to avoid glitches
|
||||
if len(self._audio_bytes) < self.chunk_size:
|
||||
continue
|
||||
|
||||
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
|
||||
self._audio_bytes = b""
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
yield ErrorFrame(error=str(e))
|
||||
finally:
|
||||
# Yield any remaining audio
|
||||
if self._audio_bytes:
|
||||
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
|
||||
|
||||
# Ensure TTFB timer is stopped even on early failures
|
||||
if measuring_ttfb:
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
@@ -142,7 +142,6 @@ class TTSService(AIService):
|
||||
"""
|
||||
return self._sample_rate
|
||||
|
||||
@property
|
||||
def chunk_size(self) -> int:
|
||||
"""Get the recommended chunk size for audio streaming.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user