adds hume tts service
This commit is contained in:
13
src/pipecat/services/hume/__init__.py
Normal file
13
src/pipecat/services/hume/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .tts import *
|
||||
|
||||
# Replace this package's entry in sys.modules with a DeprecatedModuleProxy built
# from this module's globals (which include everything re-exported from .tts).
# NOTE(review): presumably the proxy warns that importing from "hume" is
# deprecated in favor of "hume.tts" — confirm against DeprecatedModuleProxy.
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")
|
||||
192
src/pipecat/services/hume/tts.py
Normal file
192
src/pipecat/services/hume/tts.py
Normal file
@@ -0,0 +1,192 @@
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
|
||||
"""Hume Text-to-Speech service implementation."""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import os
|
||||
from typing import Any, AsyncGenerator, Optional
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
try:
|
||||
from hume import AsyncHumeClient
|
||||
from hume.tts import (
|
||||
PostedUtterance,
|
||||
FormatPcm,
|
||||
PostedUtteranceVoiceWithId,
|
||||
)
|
||||
except ModuleNotFoundError as e: # pragma: no cover - import-time guidance
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
|
||||
raise
|
||||
|
||||
|
||||
# Sample rate, in Hz, at which Hume TTS streams PCM audio. It is the default
# `sample_rate` of HumeTTSService, which logs a warning when configured otherwise.
HUME_SAMPLE_RATE = 48_000 # Hume TTS streams at 48 kHz
|
||||
|
||||
|
||||
class HumeTTSService(TTSService):
    """Hume Octave Text-to-Speech service.

    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
    using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.

    Parameters
    ----------
    api_key:
        Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
    voice_id:
        **Required**: ID of the voice to use (ID-only; names are not supported here).
    params:
        Optional synthesis controls (acting instructions, speed, trailing silence).
    sample_rate:
        Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
    **kwargs:
        Forwarded unchanged to the ``TTSService`` constructor.

    Raises
    ------
    ValueError
        If no API key is passed and ``HUME_API_KEY`` is unset or empty.
    """

    class InputParams(BaseModel):
        """Optional synthesis parameters for Hume TTS.

        Every field defaults to ``None``; a ``None`` field is left out of the
        request built in ``run_tts``. The ranges below are documentation only
        and are not validated by this model.

        description: Natural-language acting directions (≤100 chars)
        speed: Speaking-rate multiplier (0.5-2.0)
        trailing_silence: Seconds of silence to append at the end (0-5)
        """

        description: Optional[str] = None
        speed: Optional[float] = None
        trailing_silence: Optional[float] = None

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        voice_id: str,
        params: Optional[InputParams] = None,
        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
        **kwargs,
    ) -> None:
        """Create the Hume client and configure the base TTS service.

        See the class docstring for the meaning of each argument.
        """
        # An explicit argument wins; the environment variable is the fallback.
        api_key = api_key or os.getenv("HUME_API_KEY")
        if not api_key:
            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")

        # Only warn: the configured rate is still passed through to the base class.
        if sample_rate != HUME_SAMPLE_RATE:
            logger.warning(
                f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
            )

        super().__init__(
            aggregate_sentences=True,
            push_text_frames=False,
            push_stop_frames=True,
            pause_frame_processing=True,
            sample_rate=sample_rate,
            **kwargs,
        )

        self._client = AsyncHumeClient(api_key=api_key)
        self._params = params or HumeTTSService.InputParams()

        # Store voice in the base class (mirrors other services)
        self.set_voice(voice_id)

    def can_generate_metrics(self) -> bool:
        """Report that this service produces metrics (always ``True``)."""
        return True

    async def start(self, frame: StartFrame) -> None:
        """Start the service; delegates entirely to the base class."""
        await super().start(frame)

    async def update_setting(self, key: str, value: Any) -> None:
        """Runtime updates via `TTSUpdateSettingsFrame`.

        Keys are matched case-insensitively. For the three synthesis
        parameters, a ``None`` value clears the setting; any other value is
        coerced (``str`` for the description, ``float`` for the numeric ones).

        Recognized keys:
        - "voice_id"
        - "description"
        - "speed"
        - "trailing_silence"

        Any other key is passed to the base class with its original spelling.
        """
        # A falsy key (e.g. None) normalizes to "" and falls through to the base class.
        key_l = (key or "").lower()

        if key_l == "voice_id":
            self.set_voice(str(value))
            logger.info(f"HumeTTSService voice_id set to: {self.voice}")
        elif key_l == "description":
            self._params.description = None if value is None else str(value)
        elif key_l == "speed":
            self._params.speed = None if value is None else float(value)
        elif key_l == "trailing_silence":
            self._params.trailing_silence = None if value is None else float(value)
        else:
            # Defer unknown keys to the base class
            # NOTE(review): assumes the base class defines `update_setting` — confirm.
            await super().update_setting(key, value)

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Hume TTS.

        Yields a ``TTSStartedFrame``, then one ``TTSAudioRawFrame`` per
        non-empty audio chunk received from Hume, then a ``TTSStoppedFrame``.
        If synthesis raises an ``Exception``, it is logged and an
        ``ErrorFrame`` is yielded before the ``TTSStoppedFrame``.

        Parameters
        ----------
        text:
            The text to synthesize.
        """
        logger.debug(f"{self}: Generating Hume TTS: [{text}]")

        # Build the request payload; unset optional parameters are omitted.
        utterance_kwargs: dict[str, Any] = {
            "text": text,
            "voice": PostedUtteranceVoiceWithId(id=self.voice),
        }
        if self._params.description is not None:
            utterance_kwargs["description"] = self._params.description
        if self._params.speed is not None:
            utterance_kwargs["speed"] = self._params.speed
        if self._params.trailing_silence is not None:
            utterance_kwargs["trailing_silence"] = self._params.trailing_silence

        utterance = PostedUtterance(**utterance_kwargs)

        # Request raw PCM chunks in the streaming JSON
        pcm_fmt = FormatPcm(type="pcm")

        # True until the first audio chunk arrives (or the request ends without one).
        measuring_ttfb = True
        await self.start_ttfb_metrics()
        await self.start_tts_usage_metrics(text)
        yield TTSStartedFrame()

        try:
            # Instant mode is always enabled here (not user-configurable)
            async for chunk in self._client.tts.synthesize_json_streaming(
                utterances=[utterance],
                format=pcm_fmt,
                instant_mode=True,
            ):
                # Skip chunks that carry no audio payload.
                audio_b64 = getattr(chunk, "audio", None)
                if not audio_b64:
                    continue

                pcm_bytes = base64.b64decode(audio_b64)

                if measuring_ttfb:
                    await self.stop_ttfb_metrics()
                    measuring_ttfb = False

                # Hume emits mono PCM at 48 kHz; downstream can resample if needed.
                # NOTE(review): the frame is labelled with self.sample_rate, not
                # HUME_SAMPLE_RATE. If the two differ the audio would presumably be
                # tagged with the wrong rate — confirm and resample or relabel.
                yield TTSAudioRawFrame(pcm_bytes, self.sample_rate, 1)

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            yield ErrorFrame(error=str(e))
        finally:
            # Ensure TTFB timer is stopped even on early failures
            if measuring_ttfb:
                await self.stop_ttfb_metrics()

        # Deliberately outside `finally`: yielding while the generator is being
        # closed raises RuntimeError, and yielding during cancellation would
        # swallow the CancelledError. On the normal and error paths this still
        # runs after the cleanup above.
        yield TTSStoppedFrame()
|
||||
|
||||
|
||||
# Public API of this module: only the service class is exported by `import *`.
__all__ = ["HumeTTSService"]
|
||||
Reference in New Issue
Block a user