From b489de2fc3e37b63dfc4735e948a4c33db210669 Mon Sep 17 00:00:00 2001
From: zach
Date: Tue, 26 Aug 2025 16:13:55 -0700
Subject: [PATCH] adds hume tts service

---
Review note: tts.py below was revised in review (frames labelled with Hume's
48 kHz rate, TTSStoppedFrame moved out of `finally`, docstrings added); the
blob id on its `index` line predates those edits.

 src/pipecat/services/hume/__init__.py |  13 ++
 src/pipecat/services/hume/tts.py      | 221 ++++++++++++++++++++++++++
 2 files changed, 234 insertions(+)
 create mode 100644 src/pipecat/services/hume/__init__.py
 create mode 100644 src/pipecat/services/hume/tts.py

diff --git a/src/pipecat/services/hume/__init__.py b/src/pipecat/services/hume/__init__.py
new file mode 100644
index 000000000..f73fd43d9
--- /dev/null
+++ b/src/pipecat/services/hume/__init__.py
@@ -0,0 +1,13 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import sys
+
+from pipecat.services import DeprecatedModuleProxy
+
+from .tts import *
+
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")
\ No newline at end of file
diff --git a/src/pipecat/services/hume/tts.py b/src/pipecat/services/hume/tts.py
new file mode 100644
index 000000000..25ed66edd
--- /dev/null
+++ b/src/pipecat/services/hume/tts.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+
+"""Hume Text-to-Speech service implementation."""
+from __future__ import annotations
+
+import base64
+import os
+from typing import Any, AsyncGenerator, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    ErrorFrame,
+    Frame,
+    StartFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.tts_service import TTSService
+from pipecat.utils.tracing.service_decorators import traced_tts
+
+try:
+    from hume import AsyncHumeClient
+    from hume.tts import (
+        PostedUtterance,
+        FormatPcm,
+        PostedUtteranceVoiceWithId,
+    )
+except ModuleNotFoundError as e:  # pragma: no cover - import-time guidance
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
+    raise
+
+
+HUME_SAMPLE_RATE = 48_000  # Hume TTS streams at 48 kHz
+
+
+class HumeTTSService(TTSService):
+    """Hume Octave Text-to-Speech service.
+
+    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
+    using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.
+
+    Parameters
+    ----------
+    api_key:
+        Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
+    voice_id:
+        **Required**: ID of the voice to use (ID-only; names are not supported here).
+    params:
+        Optional synthesis controls (acting instructions, speed, trailing silence).
+    sample_rate:
+        Sample rate handed to the base `TTSService`. Defaults to 48_000 (Hume).
+        Emitted frames are always labelled 48_000 Hz, the rate Hume streams at;
+        any other value here only triggers a warning.
+    """
+
+    class InputParams(BaseModel):
+        """Optional synthesis parameters for Hume TTS.
+
+        description: Natural-language acting directions (≤100 chars)
+        speed: Speaking-rate multiplier (0.5-2.0)
+        trailing_silence: Seconds of silence to append at the end (0-5)
+        """
+
+        description: Optional[str] = None
+        speed: Optional[float] = None
+        trailing_silence: Optional[float] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        voice_id: str,
+        params: Optional[InputParams] = None,
+        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
+        **kwargs,
+    ) -> None:
+        """Create the service and its Hume client.
+
+        Raises
+        ------
+        ValueError
+            If no API key is given and ``HUME_API_KEY`` is unset or empty.
+        """
+        api_key = api_key or os.getenv("HUME_API_KEY")
+        if not api_key:
+            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
+
+        if sample_rate != HUME_SAMPLE_RATE:
+            logger.warning(
+                f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
+            )
+
+        super().__init__(
+            aggregate_sentences=True,
+            push_text_frames=False,
+            push_stop_frames=True,
+            pause_frame_processing=True,
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        self._client = AsyncHumeClient(api_key=api_key)
+        self._params = params or HumeTTSService.InputParams()
+
+        # Store voice in the base class (mirrors other services)
+        self.set_voice(voice_id)
+
+    def can_generate_metrics(self) -> bool:
+        """Report that this service produces metrics (always ``True``)."""
+        return True
+
+    async def start(self, frame: StartFrame) -> None:
+        """Start the service by delegating to the base `TTSService`."""
+        await super().start(frame)
+
+    async def update_setting(self, key: str, value: Any) -> None:
+        """Runtime updates via `TTSUpdateSettingsFrame`.
+
+        Recognized keys:
+          - "voice_id"
+          - "description"
+          - "speed"
+          - "trailing_silence"
+
+        Keys are matched case-insensitively. For the three synthesis parameters
+        ``None`` clears the value; anything else is cast to `str` or `float`.
+        """
+        key_l = (key or "").lower()
+
+        if key_l == "voice_id":
+            self.set_voice(str(value))
+            logger.info(f"HumeTTSService voice_id set to: {self.voice}")
+        elif key_l == "description":
+            self._params.description = None if value is None else str(value)
+        elif key_l == "speed":
+            self._params.speed = None if value is None else float(value)
+        elif key_l == "trailing_silence":
+            self._params.trailing_silence = None if value is None else float(value)
+        else:
+            # Defer unknown keys to the base class
+            await super().update_setting(key, value)
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Hume TTS.
+
+        Yields a `TTSStartedFrame`, one `TTSAudioRawFrame` per non-empty audio
+        chunk, an `ErrorFrame` if synthesis raises, then a `TTSStoppedFrame`.
+
+        Parameters
+        ----------
+        text:
+            Text to synthesize.
+        """
+        logger.debug(f"{self}: Generating Hume TTS: [{text}]")
+
+        # Build the request payload
+        utterance_kwargs: dict[str, Any] = {
+            "text": text,
+            "voice": PostedUtteranceVoiceWithId(id=self.voice),
+        }
+        if self._params.description is not None:
+            utterance_kwargs["description"] = self._params.description
+        if self._params.speed is not None:
+            utterance_kwargs["speed"] = self._params.speed
+        if self._params.trailing_silence is not None:
+            utterance_kwargs["trailing_silence"] = self._params.trailing_silence
+
+        utterance = PostedUtterance(**utterance_kwargs)
+
+        # Request raw PCM chunks in the streaming JSON
+        pcm_fmt = FormatPcm(type="pcm")
+
+        measuring_ttfb = True
+        await self.start_ttfb_metrics()
+        await self.start_tts_usage_metrics(text)
+        yield TTSStartedFrame()
+
+        try:
+            # Instant mode is always enabled here (not user-configurable)
+            async for chunk in self._client.tts.synthesize_json_streaming(
+                utterances=[utterance],
+                format=pcm_fmt,
+                instant_mode=True,
+            ):
+                audio_b64 = getattr(chunk, "audio", None)
+                if not audio_b64:
+                    continue
+
+                pcm_bytes = base64.b64decode(audio_b64)
+
+                if measuring_ttfb:
+                    await self.stop_ttfb_metrics()
+                    measuring_ttfb = False
+
+                # Label frames with the rate Hume actually streams, not the
+                # configured one, so downstream resampling starts from the truth.
+                yield TTSAudioRawFrame(pcm_bytes, HUME_SAMPLE_RATE, 1)
+
+        except Exception as e:
+            logger.exception(f"{self} error generating TTS: {e}")
+            yield ErrorFrame(error=str(e))
+        finally:
+            # Ensure TTFB timer is stopped even on early failures
+            if measuring_ttfb:
+                await self.stop_ttfb_metrics()
+
+        # Emitted after the try block rather than from `finally`: a yield there
+        # also runs while the generator is being closed, where Python raises
+        # RuntimeError ("async generator ignored GeneratorExit"). Normal
+        # completion and handled errors still reach this line.
+        yield TTSStoppedFrame()
+
+
+__all__ = ["HumeTTSService"]