Adds Hume TTS service

2025-08-26 16:13:55 -07:00
parent d9656cbb1a
commit b489de2fc3
2 changed files with 205 additions and 0 deletions
--- a/src/pipecat/services/hume/__init__.py
+++ b/src/pipecat/services/hume/__init__.py
@@ -0,0 +1,13 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import sys
+
+from pipecat.services import DeprecatedModuleProxy
+
+# Re-export the public names of the sibling `tts` module (its `__all__` lists
+# only `HumeTTSService`) so they are reachable from this package.
+from .tts import *
+
+# Swap this module's entry in `sys.modules` for a proxy built from its globals.
+# NOTE(review): presumably the proxy warns that importing from "hume" is
+# deprecated in favour of "hume.tts" -- confirm against
+# `pipecat.services.DeprecatedModuleProxy`.
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")
--- a/src/pipecat/services/hume/tts.py
+++ b/src/pipecat/services/hume/tts.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+
+"""Hume Text-to-Speech service implementation."""
+from __future__ import annotations
+
+import base64
+import os
+from typing import Any, AsyncGenerator, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    ErrorFrame,
+    Frame,
+    StartFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.tts_service import TTSService
+from pipecat.utils.tracing.service_decorators import traced_tts
+
+try:
+    from hume import AsyncHumeClient
+    from hume.tts import (
+        PostedUtterance,
+        FormatPcm,
+        PostedUtteranceVoiceWithId,
+    )
+except ModuleNotFoundError as e:  # pragma: no cover - import-time guidance
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
+    raise
+
+
+HUME_SAMPLE_RATE = 48_000  # Hume TTS streams at 48 kHz
+
+
+class HumeTTSService(TTSService):
+    """Hume Octave Text-to-Speech service.
+
+    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
+    using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.
+
+    Parameters
+    ----------
+    api_key:
+        Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
+    voice_id:
+        **Required**: ID of the voice to use (ID-only; names are not supported here).
+    params:
+        Optional synthesis controls (acting instructions, speed, trailing silence).
+    sample_rate:
+        Sample rate handed to the base service. Defaults to 48_000 (Hume). A
+        warning is logged when it differs from 48_000; emitted audio frames are
+        always tagged with Hume's native 48 kHz rate.
+
+    Raises
+    ------
+    ValueError
+        If no API key is passed and ``HUME_API_KEY`` is unset or empty.
+    """
+
+    class InputParams(BaseModel):
+        """Optional synthesis parameters for Hume TTS.
+
+        The ranges below are documentation only; this model does not validate them.
+
+        description: Natural-language acting directions (≤100 chars)
+        speed: Speaking-rate multiplier (0.5-2.0)
+        trailing_silence: Seconds of silence to append at the end (0-5)
+        """
+
+        description: Optional[str] = None
+        speed: Optional[float] = None
+        trailing_silence: Optional[float] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        voice_id: str,
+        params: Optional[InputParams] = None,
+        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
+        **kwargs,
+    ) -> None:
+        """Create the Hume client and configure the base TTS service.
+
+        See the class docstring for the meaning of each argument. Extra keyword
+        arguments are forwarded unchanged to ``TTSService.__init__``.
+        """
+        api_key = api_key or os.getenv("HUME_API_KEY")
+        if not api_key:
+            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
+
+        # Only warn: a non-native rate is still accepted and passed to the base class.
+        if sample_rate != HUME_SAMPLE_RATE:
+            logger.warning(
+                f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
+            )
+
+        super().__init__(
+            aggregate_sentences=True,
+            push_text_frames=False,
+            push_stop_frames=True,
+            pause_frame_processing=True,
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        self._client = AsyncHumeClient(api_key=api_key)
+        self._params = params or HumeTTSService.InputParams()
+
+        # Store voice in the base class (mirrors other services)
+        self.set_voice(voice_id)
+
+    def can_generate_metrics(self) -> bool:
+        """Return ``True``; `run_tts` records TTFB and usage metrics."""
+        return True
+
+    async def start(self, frame: StartFrame) -> None:
+        """Forward the start frame to the base class; no Hume-specific setup is done."""
+        await super().start(frame)
+
+    async def update_setting(self, key: str, value: Any) -> None:
+        """Runtime updates via `TTSUpdateSettingsFrame`.
+
+        Keys are matched case-insensitively. A value of ``None`` clears
+        "description", "speed" or "trailing_silence"; other values are coerced
+        with ``str``/``float``. Unrecognized keys are passed to the base class.
+
+        Recognized keys:
+          - "voice_id"
+          - "description"
+          - "speed"
+          - "trailing_silence"
+        """
+        key_l = (key or "").lower()
+
+        if key_l == "voice_id":
+            self.set_voice(str(value))
+            logger.info(f"HumeTTSService voice_id set to: {self.voice}")
+        elif key_l == "description":
+            self._params.description = None if value is None else str(value)
+        elif key_l == "speed":
+            self._params.speed = None if value is None else float(value)
+        elif key_l == "trailing_silence":
+            self._params.trailing_silence = None if value is None else float(value)
+        else:
+            # Defer unknown keys to the base class
+            await super().update_setting(key, value)
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Hume TTS.
+
+        Yields a `TTSStartedFrame`, then one `TTSAudioRawFrame` per streamed
+        chunk that carries audio, then a `TTSStoppedFrame`. If the request or
+        stream raises an ``Exception``, it is logged and an `ErrorFrame` is
+        yielded before the stop frame; the exception is not re-raised.
+        """
+        logger.debug(f"{self}: Generating Hume TTS: [{text}]")
+
+        # Build the request payload; optional controls are sent only when set.
+        utterance_kwargs: dict[str, Any] = {
+            "text": text,
+            "voice": PostedUtteranceVoiceWithId(id=self.voice),
+        }
+        if self._params.description is not None:
+            utterance_kwargs["description"] = self._params.description
+        if self._params.speed is not None:
+            utterance_kwargs["speed"] = self._params.speed
+        if self._params.trailing_silence is not None:
+            utterance_kwargs["trailing_silence"] = self._params.trailing_silence
+
+        utterance = PostedUtterance(**utterance_kwargs)
+
+        # Request raw PCM chunks in the streaming JSON
+        pcm_fmt = FormatPcm(type="pcm")
+
+        measuring_ttfb = True
+        await self.start_ttfb_metrics()
+        await self.start_tts_usage_metrics(text)
+        yield TTSStartedFrame()
+
+        try:
+            # Instant mode is always enabled here (not user-configurable)
+            async for chunk in self._client.tts.synthesize_json_streaming(
+                utterances=[utterance],
+                format=pcm_fmt,
+                instant_mode=True,
+            ):
+                # Skip chunks that carry no audio payload.
+                audio_b64 = getattr(chunk, "audio", None)
+                if not audio_b64:
+                    continue
+
+                pcm_bytes = base64.b64decode(audio_b64)
+
+                # The first audio chunk marks time-to-first-byte.
+                if measuring_ttfb:
+                    await self.stop_ttfb_metrics()
+                    measuring_ttfb = False
+
+                # Hume emits mono PCM at 48 kHz. Tag frames with that native rate
+                # (not the configured one) so the label matches the samples and
+                # downstream can resample if needed.
+                # NOTE(review): assumes consumers resample from frame.sample_rate -- confirm.
+                yield TTSAudioRawFrame(pcm_bytes, HUME_SAMPLE_RATE, 1)
+
+        except Exception as e:
+            logger.exception(f"{self} error generating TTS: {e}")
+            yield ErrorFrame(error=str(e))
+        finally:
+            # Ensure TTFB timer is stopped even on early failures
+            if measuring_ttfb:
+                await self.stop_ttfb_metrics()
+
+        # Emitted after the try block rather than inside `finally`: yielding from
+        # `finally` while the generator is being closed raises RuntimeError.
+        yield TTSStoppedFrame()
+
+
+__all__ = ["HumeTTSService"]