Added word/timestamp pairs to ElevenLabsHttpTTSService

2025-04-15 21:29:19 -04:00
parent f6f01ea7e4
commit 384f80983f
2 changed files with 149 additions and 20 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  you to control aggregator settings. You can now pass these arguments when
  creating aggregator pairs with `create_context_aggregator()`.

+- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
+
 - It is now possible to disable `SoundfileMixer` when created. You can then use
  `MixerEnableFrame` to dynamically enable it when necessary.

@@ -55,7 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed an issue in `SmallWebRTCTransport` where an error was thrown if the
  client did not create a video transceiver.

- Fixed an  issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
+- Fixed an issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
  unexpected behavior during inference.

 ## [0.0.63] - 2025-04-11
--- a/src/pipecat/services/elevenlabs/tts.py
+++ b/src/pipecat/services/elevenlabs/tts.py
@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
    TTSStoppedFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.tts_service import InterruptibleWordTTSService, TTSService
+from pipecat.services.tts_service import InterruptibleWordTTSService, WordTTSService
 from pipecat.transcriptions.language import Language

 # See .env.example for ElevenLabs configuration needed
@@ -441,8 +441,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
            logger.error(f"{self} exception: {e}")


-class ElevenLabsHttpTTSService(TTSService):
-    """ElevenLabs Text-to-Speech service using HTTP streaming.
+class ElevenLabsHttpTTSService(WordTTSService):
+    """ElevenLabs Text-to-Speech service using HTTP streaming with word timestamps.

    Args:
        api_key: ElevenLabs API key
@@ -475,7 +475,13 @@ class ElevenLabsHttpTTSService(TTSService):
        params: InputParams = InputParams(),
        **kwargs,
    ):
-        super().__init__(sample_rate=sample_rate, **kwargs)
+        super().__init__(
+            aggregate_sentences=True,
+            push_text_frames=False,
+            push_stop_frames=True,
+            sample_rate=sample_rate,
+            **kwargs,
+        )

        self._api_key = api_key
        self._base_url = base_url
@@ -498,28 +504,109 @@ class ElevenLabsHttpTTSService(TTSService):
        self._output_format = ""  # initialized in start()
        self._voice_settings = self._set_voice_settings()

+        # Track cumulative time to properly sequence word timestamps across utterances
+        self._cumulative_time = 0
+        self._started = False
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert pipecat Language to ElevenLabs language code."""
+        return language_to_elevenlabs_language(language)
+
    def can_generate_metrics(self) -> bool:
+        """Indicate that this service can generate usage metrics."""
        return True

    def _set_voice_settings(self):
        return build_elevenlabs_voice_settings(self._settings)

    async def start(self, frame: StartFrame):
+        """Initialize the service upon receiving a StartFrame."""
        await super().start(frame)
        self._output_format = output_format_from_sample_rate(self.sample_rate)
+        self._cumulative_time = 0
+        self._started = False

-    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        """Generate speech from text using ElevenLabs streaming API.
+    async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        await super().push_frame(frame, direction)
+        if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
+            # Reset timing on interruption or stop
+            self._started = False
+            self._cumulative_time = 0
+            if isinstance(frame, TTSStoppedFrame):
+                await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
+
+    def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
+        """Calculate word timing from character alignment data.
+
+        Example input data:
+        {
+            "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
+            "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
+            "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+        }
+
+        Would produce word times (with cumulative_time=0):
+        [("Hello", 0.1), ("world", 0.5)]

        Args:
-            text: The text to convert to speech
+            alignment_info: Character timing data from ElevenLabs
+
+        Returns:
+            List of (word, timestamp) pairs
+        """
+        chars = alignment_info.get("characters", [])
+        char_start_times = alignment_info.get("character_start_times_seconds", [])
+
+        if not chars or not char_start_times or len(chars) != len(char_start_times):
+            logger.warning(
+                f"Invalid alignment data: chars={len(chars)}, times={len(char_start_times)}"
+            )
+            return []
+
+        # Build the words and find their start times
+        words = []
+        word_start_times = []
+        current_word = ""
+        first_char_idx = -1
+
+        for i, char in enumerate(chars):
+            if char == " ":
+                if current_word:  # Only add non-empty words
+                    words.append(current_word)
+                    # Use time of the first character of the word, offset by cumulative time
+                    word_start_times.append(
+                        self._cumulative_time + char_start_times[first_char_idx]
+                    )
+                    current_word = ""
+                    first_char_idx = -1
+            else:
+                if not current_word:  # This is the first character of a new word
+                    first_char_idx = i
+                current_word += char
+
+        # Don't forget the last word if there's no trailing space
+        if current_word and first_char_idx >= 0:
+            words.append(current_word)
+            word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
+
+        # Create word-time pairs
+        word_times = list(zip(words, word_start_times))
+
+        return word_times
+
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using ElevenLabs streaming API with timestamps.
+
+        Args:
+            text: Text to convert to speech

        Yields:
-            Frames containing audio data and status information
+            Audio and control frames
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

-        url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream"
+        # Use the with-timestamps endpoint
+        url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream/with-timestamps"

        payload: Dict[str, Union[str, Dict[str, Union[float, bool]]]] = {
            "text": text,
@@ -550,8 +637,6 @@ class ElevenLabsHttpTTSService(TTSService):
        if self._settings["optimize_streaming_latency"] is not None:
            params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]

-        logger.debug(f"{self} ElevenLabs request - payload: {payload}, params: {params}")
-
        try:
            await self.start_ttfb_metrics()

@@ -566,17 +651,59 @@ class ElevenLabsHttpTTSService(TTSService):

                await self.start_tts_usage_metrics(text)

-                # Process the streaming response
-                CHUNK_SIZE = 1024
+                # Start TTS sequence if not already started
+                if not self._started:
+                    self.start_word_timestamps()
+                    yield TTSStartedFrame()
+                    self._started = True
+
+                # Track the duration of this utterance based on the last character's end time
+                utterance_duration = 0
+                async for line in response.content:
+                    line_str = line.decode("utf-8").strip()
+                    if not line_str:
+                        continue
+
+                    try:
+                        # Parse the JSON object
+                        data = json.loads(line_str)
+
+                        # Process audio if present
+                        if data and "audio_base64" in data:
+                            await self.stop_ttfb_metrics()
+                            audio = base64.b64decode(data["audio_base64"])
+                            yield TTSAudioRawFrame(audio, self.sample_rate, 1)
+
+                        # Process alignment if present
+                        if data and "alignment" in data:
+                            alignment = data["alignment"]
+                            if alignment:  # Ensure alignment is not None
+                                # Get end time of the last character in this chunk
+                                char_end_times = alignment.get("character_end_times_seconds", [])
+                                if char_end_times:
+                                    chunk_end_time = char_end_times[-1]
+                                    # Update to the longest end time seen so far
+                                    utterance_duration = max(utterance_duration, chunk_end_time)
+
+                                # Calculate word timestamps
+                                word_times = self.calculate_word_times(alignment)
+                                if word_times:
+                                    await self.add_word_timestamps(word_times)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Failed to parse JSON from stream: {e}")
+                        continue
+                    except Exception as e:
+                        logger.error(f"Error processing response: {e}", exc_info=True)
+                        continue
+
+                # After processing all chunks, add the total utterance duration
+                # to the cumulative time to ensure next utterance starts after this one
+                if utterance_duration > 0:
+                    self._cumulative_time += utterance_duration

-                yield TTSStartedFrame()
-                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
-                    if len(chunk) > 0:
-                        await self.stop_ttfb_metrics()
-                        yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
        except Exception as e:
            logger.error(f"Error in run_tts: {e}")
            yield ErrorFrame(error=str(e))
        finally:
            await self.stop_ttfb_metrics()
-            yield TTSStoppedFrame()
+            # Let the parent class handle TTSStoppedFrame