add Hume example, small fixes

2025-09-30 17:18:56 -07:00
parent b489de2fc3
commit 4ffdabcfde
8 changed files with 4035 additions and 5792 deletions
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ You can connect to Pipecat from any platform using our official SDKs:
 | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                    |
 | LLMs                | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together)                                                                                          |
-| Text-to-Speech      | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
+| Text-to-Speech      | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
 | Speech-to-Speech    | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
 | Transport           | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
 | Serializers         | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
--- a/examples/foundational/07ad-interruptible-hume.py
+++ b/examples/foundational/07ad-interruptible-hume.py
@@ -0,0 +1,124 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.frames.frames import StartFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.runner.types import RunnerArguments
+from pipecat.runner.utils import create_transport
+from pipecat.services.deepgram.stt import DeepgramSTTService
+from pipecat.services.hume.tts import HUME_SAMPLE_RATE, HumeTTSService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
+from pipecat.transports.services.daily import DailyParams
+
+load_dotenv(override=True)
+
+# We store functions so objects (e.g. SileroVADAnalyzer) don't get
+# instantiated. The function will be called when the desired transport gets
+# selected.
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+        audio_out_sample_rate=HUME_SAMPLE_RATE,  # match the Hume TTS output sample rate
+    ),
+}
+
+
+async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
+    """Run a Deepgram STT -> OpenAI LLM -> Hume TTS voice pipeline on the given transport."""
+    logger.info("Starting bot")
+
+    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
+
+    tts = HumeTTSService(
+        api_key=os.getenv("HUME_API_KEY"),
+        # Replace with your Hume voice ID
+        voice_id="f898a92e-685f-43fa-985b-a46920f0650b",
+    )
+
+    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    context = OpenAILLMContext(messages)
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),  # Transport user input
+            stt,  # STT
+            context_aggregator.user(),  # User responses
+            llm,  # LLM
+            tts,  # TTS
+            transport.output(),  # Transport bot output
+            context_aggregator.assistant(),  # Assistant spoken responses
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            enable_metrics=True,
+            enable_usage_metrics=True,
+        ),
+        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info("Client connected")
+        # Kick off the conversation.
+        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info("Client disconnected")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
+    await runner.run(task)
+
+
+async def bot(runner_args: RunnerArguments):
+    """Main bot entry point compatible with Pipecat Cloud."""
+    runner_args.transport = "webrtc"  # NOTE(review): presumably forces WebRTC — confirm intended
+    transport = await create_transport(runner_args, transport_params)
+
+    await run_bot(transport, runner_args)
+
+
+if __name__ == "__main__":
+    from pipecat.runner.run import main  # deferred import: only needed when run as a script
+
+    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,11 @@ webrtc = [ "aiortc>=1.13.0,<2", "opencv-python>=4.11.0.86,<5" ]
 websocket = [ "pipecat-ai[websockets-base]", "fastapi>=0.115.6,<0.117.0" ]
 websockets-base = [ "websockets>=13.1,<16.0" ]
 whisper = [ "faster-whisper~=1.1.1" ]
+fastapi = [
+    "fastapi>=0.115.6,<0.117.0",
+    "uvicorn",  # TODO(review): add version bounds like the other extras
+    "pipecat-ai[websockets-base]",
+]

 [dependency-groups]
 dev = [
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -103,6 +103,7 @@ TESTS_07 = [
    ("07w-interruptible-fal.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    ("07y-interruptible-minimax.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    ("07z-interruptible-sarvam.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
+    ("07ad-interruptible-hume.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    # Needs a local XTTS docker instance running.
    # ("07i-interruptible-xtts.py", PROMPT_SIMPLE_MATH, EVAL_SIMPLE_MATH, BOT_SPEAKS_FIRST),
    # Needs a Krisp license.
--- a/src/pipecat/services/hume/init.py
+++ b/src/pipecat/services/hume/init.py
@@ -3,11 +3,3 @@
 #
 # SPDX-License-Identifier: BSD 2-Clause License
 #
-
-import sys
-
-from pipecat.services import DeprecatedModuleProxy
-
-from .tts import *
-
-sys.modules[__name__] = DeprecatedModuleProxy(globals(), "hume", "hume.tts")
--- a/src/pipecat/services/hume/tts.py
+++ b/src/pipecat/services/hume/tts.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License

 """Hume Text-to-Speech service implementation."""
+
 from __future__ import annotations

 import base64
@@ -26,8 +27,8 @@ from pipecat.utils.tracing.service_decorators import traced_tts
 try:
    from hume import AsyncHumeClient
    from hume.tts import (
-        PostedUtterance,
        FormatPcm,
+        PostedUtterance,
        PostedUtteranceVoiceWithId,
    )
 except ModuleNotFoundError as e:  # pragma: no cover - import-time guidance
@@ -45,24 +46,21 @@ class HumeTTSService(TTSService):
    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
    using the Python SDK and emits `TTSAudioRawFrame`s suitable for Pipecat transports.

-    Parameters
-    ----------
-    api_key:
-        Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
-    voice_id:
-        **Required**: ID of the voice to use (ID-only; names are not supported here).
-    params:
-        Optional synthesis controls (acting instructions, speed, trailing silence).
-    sample_rate:
-        Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
+    Supported features:
+
+    - Generates speech from text using Hume TTS.
+    - Streams PCM audio.
+    - Supports dynamic updates of voice and synthesis parameters at runtime.
+    - Provides metrics for Time To First Byte (TTFB) and TTS usage.
    """

    class InputParams(BaseModel):
        """Optional synthesis parameters for Hume TTS.

-        description: Natural-language acting directions (≤100 chars)
-        speed: Speaking-rate multiplier (0.5-2.0)
-        trailing_silence: Seconds of silence to append at the end (0-5)
+        Parameters:
+            description: Natural-language acting directions (up to 100 characters).
+            speed: Speaking-rate multiplier (0.5-2.0).
+            trailing_silence: Seconds of silence to append at the end (0-5).
        """

        description: Optional[str] = None
@@ -78,6 +76,15 @@ class HumeTTSService(TTSService):
        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
        **kwargs,
    ) -> None:
+        """Initialize the HumeTTSService.
+
+        Args:
+            api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
+            voice_id: ID of the voice to use (ID-only; names are not supported here).
+            params: Optional synthesis controls (acting instructions, speed, trailing silence).
+            sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
+            **kwargs: Additional arguments passed to the parent class.
+        """
        api_key = api_key or os.getenv("HUME_API_KEY")
        if not api_key:
            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
@@ -88,9 +95,6 @@ class HumeTTSService(TTSService):
            )

        super().__init__(
-            aggregate_sentences=True,
-            push_text_frames=False,
-            push_stop_frames=True,
            pause_frame_processing=True,
            sample_rate=sample_rate,
            **kwargs,
@@ -102,20 +106,34 @@ class HumeTTSService(TTSService):
        # Store voice in the base class (mirrors other services)
        self.set_voice(voice_id)

+        self._audio_bytes = b""
+
    def can_generate_metrics(self) -> bool:
+        """Can generate metrics.
+
+        Returns:
+            True if metrics can be generated, False otherwise.
+        """
        return True

    async def start(self, frame: StartFrame) -> None:
+        """Start the service.
+
+        Args:
+            frame: The start frame.
+        """
        await super().start(frame)

    async def update_setting(self, key: str, value: Any) -> None:
        """Runtime updates via `TTSUpdateSettingsFrame`.

-        Recognized keys:
-          - "voice_id"
-          - "description"
-          - "speed"
-          - "trailing_silence"
+        Args:
+            key: The name of the setting to update. Recognized keys are:
+                - "voice_id"
+                - "description"
+                - "speed"
+                - "trailing_silence"
+            value: The new value for the setting.
        """
        key_l = (key or "").lower()

@@ -134,13 +152,22 @@ class HumeTTSService(TTSService):

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using Hume TTS."""
+        """Generate speech from text using Hume TTS.
+
+        Args:
+            text: The text to be synthesized.
+
+        Returns:
+            An async generator that yields `Frame` objects, including
+            `TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
+            `TTSStoppedFrame`.
+        """
        logger.debug(f"{self}: Generating Hume TTS: [{text}]")

        # Build the request payload
        utterance_kwargs: dict[str, Any] = {
            "text": text,
-            "voice": PostedUtteranceVoiceWithId(id=self.voice),
+            "voice": PostedUtteranceVoiceWithId(id=self._voice_id),
        }
        if self._params.description is not None:
            utterance_kwargs["description"] = self._params.description
@@ -161,6 +188,10 @@ class HumeTTSService(TTSService):

        try:
            # Instant mode is always enabled here (not user-configurable)
+            # Hume emits mono PCM at 48 kHz; downstream can resample if needed.
+            # We buffer audio bytes before sending to prevent glitches.
+            self._audio_bytes = b""
+            first_audio_sent = False
            async for chunk in self._client.tts.synthesize_json_streaming(
                utterances=[utterance],
                format=pcm_fmt,
@@ -171,18 +202,34 @@ class HumeTTSService(TTSService):
                    continue

                pcm_bytes = base64.b64decode(audio_b64)
+                self._audio_bytes += pcm_bytes

-                if measuring_ttfb:
-                    await self.stop_ttfb_metrics()
-                    measuring_ttfb = False
+                # Send the first audio chunk immediately to avoid client-side delays.
+                if not first_audio_sent:
+                    if self._audio_bytes:
+                        if measuring_ttfb:  # stop the timer before yielding
+                            await self.stop_ttfb_metrics()
+                            measuring_ttfb = False
+                        first, self._audio_bytes = self._audio_bytes, b""  # drain; never re-sent
+                        first_audio_sent = True
+                        yield TTSAudioRawFrame(first, self.sample_rate, 1)
+                    continue

-                # Hume emits mono PCM at 48 kHz; downstream can resample if needed.
-                yield TTSAudioRawFrame(pcm_bytes, self.sample_rate, 1)
+                # Buffer audio until we have enough to avoid glitches
+                if len(self._audio_bytes) < self.chunk_size:
+                    continue
+
+                yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
+                self._audio_bytes = b""

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            yield ErrorFrame(error=str(e))
        finally:
+            # Yield any remaining audio
+            if self._audio_bytes:
+                yield TTSAudioRawFrame(self._audio_bytes, self.sample_rate, 1)
+
            # Ensure TTFB timer is stopped even on early failures
            if measuring_ttfb:
                await self.stop_ttfb_metrics()
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -142,7 +142,7 @@ class TTSService(AIService):
        """
        return self._sample_rate

    @property
    def chunk_size(self) -> int:
        """Get the recommended chunk size for audio streaming.

--- a/uv.lock
+++ b/uv.lock