add Hume example, small fixes

This commit is contained in:
ivaaan
2025-09-30 17:18:56 -07:00
parent b489de2fc3
commit 4ffdabcfde
8 changed files with 4035 additions and 5792 deletions

View File

@@ -81,7 +81,7 @@ You can connect to Pipecat from any platform using our official SDKs:
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local |
| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) |

View File

@@ -0,0 +1,124 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import StartFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
audio_out_sample_rate=HUME_SAMPLE_RATE,
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = HumeTTSService(
api_key=os.getenv("HUME_API_KEY"),
# Replace with your Hume voice ID
voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
runner_args.transport = "webrtc"
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -112,6 +112,11 @@ webrtc = [ "aiortc>=1.13.0,<2", "opencv-python>=4.11.0.86,<5" ]
websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.117.0" ]
websockets-base = [ "websockets>=13.1,<16.0" ]
whisper = [ "faster-whisper~=1.1.1" ]
fastapi = [
"fastapi",
"uvicorn",
"websockets",
]
[dependency-groups]
dev = [

View File

@@ -103,6 +103,7 @@ TESTS_07 = [
("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
("07ad-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# Needs a local XTTS docker instance running.
# ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
# Needs a Krisp license.

View File

@@ -3,11 +3,3 @@
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import sys
from pipecat.services import DeprecatedModuleProxy
from .tts import *
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")

View File

@@ -3,6 +3,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
"""Hume Text-to-Speech service implementation."""
from __future__ import annotations
import base64
@@ -26,8 +27,8 @@ from pipecat.utils.tracing.service_decorators import traced_tts
try:
from hume import AsyncHumeClient
from hume.tts import (
PostedUtterance,
FormatPcm,
PostedUtterance,
PostedUtteranceVoiceWithId,
)
except ModuleNotFoundError as e: # pragma: no cover - import-time guidance
@@ -45,24 +46,21 @@ class HumeTTSService(TTSService):
Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.
Parameters
----------
api_key:
Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
voice_id:
**Required**: ID of the voice to use (ID-only; names are not supported here).
params:
Optional synthesis controls (acting instructions, speed, trailing silence).
sample_rate:
Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
Supported features:
- Generates speech from text using Hume TTS.
- Streams PCM audio.
- Supports dynamic updates of voice and synthesis parameters at runtime.
- Provides metrics for Time To First Byte (TTFB) and TTS usage.
"""
class InputParams(BaseModel):
"""Optional synthesis parameters for Hume TTS.
description: Natural-language acting directions (≤100 chars)
speed: Speaking-rate multiplier (0.5-2.0)
trailing_silence: Seconds of silence to append at the end (0-5)
Parameters:
description: Natural-language acting directions (up to 100 characters).
speed: Speaking-rate multiplier (0.5-2.0).
trailing_silence: Seconds of silence to append at the end (0-5).
"""
description: Optional[str] = None
@@ -78,6 +76,15 @@ class HumeTTSService(TTSService):
sample_rate: Optional[int] = HUME_SAMPLE_RATE,
**kwargs,
) -> None:
"""Initialize the HumeTTSService.
Args:
api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
voice_id: ID of the voice to use (ID-only; names are not supported here).
params: Optional synthesis controls (acting instructions, speed, trailing silence).
sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
**kwargs: Additional arguments passed to the parent class.
"""
api_key = api_key or os.getenv("HUME_API_KEY")
if not api_key:
raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
@@ -88,9 +95,6 @@ class HumeTTSService(TTSService):
)
super().__init__(
aggregate_sentences=True,
push_text_frames=False,
push_stop_frames=True,
pause_frame_processing=True,
sample_rate=sample_rate,
**kwargs,
@@ -102,20 +106,34 @@ class HumeTTSService(TTSService):
# Store voice in the base class (mirrors other services)
self.set_voice(voice_id)
self._audio_bytes = b""
def can_generate_metrics(self) -> bool:
"""Can generate metrics.
Returns:
True if metrics can be generated, False otherwise.
"""
return True
async def start(self, frame: StartFrame) -> None:
"""Start the service.
Args:
frame: The start frame.
"""
await super().start(frame)
async def update_setting(self, key: str, value: Any) -> None:
"""Runtime updates via `TTSUpdateSettingsFrame`.
Recognized keys:
- "voice_id"
- "description"
- "speed"
- "trailing_silence"
Args:
key: The name of the setting to update. Recognized keys are:
- "voice_id"
- "description"
- "speed"
- "trailing_silence"
value: The new value for the setting.
"""
key_l = (key or "").lower()
@@ -134,13 +152,22 @@ class HumeTTSService(TTSService):
@traced_tts
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text using Hume TTS."""
"""Generate speech from text using Hume TTS.
Args:
text: The text to be synthesized.
Returns:
An async generator that yields `Frame` objects, including
`TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
`TTSStoppedFrame`.
"""
logger.debug(f"{self}: Generating Hume TTS: [{text}]")
# Build the request payload
utterance_kwargs: dict[str, Any] = {
"text": text,
"voice": PostedUtteranceVoiceWithId(id=self.voice),
"voice": PostedUtteranceVoiceWithId(id=self._voice_id),
}
if self._params.description is not None:
utterance_kwargs["description"] = self._params.description
@@ -161,6 +188,10 @@ class HumeTTSService(TTSService):
try:
# Instant mode is always enabled here (not user-configurable)
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
# We buffer audio bytes before sending to prevent glitches.
self._audio_bytes = b""
first_audio_sent = False
async for chunk in self._client.tts.synthesize_json_streaming(
utterances=[utterance],
format=pcm_fmt,
@@ -171,18 +202,34 @@ class HumeTTSService(TTSService):
continue
pcm_bytes = base64.b64decode(audio_b64)
self._audio_bytes += pcm_bytes
if measuring_ttfb:
await self.stop_ttfb_metrics()
measuring_ttfb = False
# Send the first audio chunk immediately to avoid client-side delays.
if not first_audio_sent:
if self._audio_bytes:
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
if measuring_ttfb:
await self.stop_ttfb_metrics()
measuring_ttfb = False
first_audio_sent = True
# Do NOT clear _audio_bytes here. Subsequent chunks will build on this.
continue
# Hume emits mono PCM at 48 kHz; downstream can resample if needed.
yield TTSAudioRawFrame(pcm_bytes, self.sample_rate, 1)
# Buffer audio until we have enough to avoid glitches
if len(self._audio_bytes) < self.chunk_size:
continue
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
self._audio_bytes = b""
except Exception as e:
logger.exception(f"{self} error generating TTS: {e}")
yield ErrorFrame(error=str(e))
finally:
# Yield any remaining audio
if self._audio_bytes:
yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
# Ensure TTFB timer is stopped even on early failures
if measuring_ttfb:
await self.stop_ttfb_metrics()

View File

@@ -142,7 +142,6 @@ class TTSService(AIService):
"""
return self._sample_rate
@property
def chunk_size(self) -> int:
"""Get the recommended chunk size for audio streaming.

9581
uv.lock generated

File diff suppressed because it is too large Load Diff