Added word/timestamp pairs to ElevenLabsHttpTTSService

This commit is contained in:
Mark Backman
2025-04-15 21:29:19 -04:00
parent f6f01ea7e4
commit 384f80983f
2 changed files with 149 additions and 20 deletions

View File

@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
you to control aggregator settings. You can now pass these arguments when
creating aggregator pairs with `create_context_aggregator()`.
- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
- It is now possible to disable `SoundfileMixer` when created. You can then use
`MixerEnableFrame` to dynamically enable it when necessary.
@@ -55,7 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue in `SmallWebRTCTransport` where an error was thrown if the
client did not create a video transceiver.
- Fixed an issue where LLM input parameters were not being applied correctly in `GoogleVertexLLMService`, causing
- Fixed an issue where LLM input parameters were not being applied correctly in `GoogleVertexLLMService`, causing
unexpected behavior during inference.
## [0.0.63] - 2025-04-11

View File

@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.tts_service import InterruptibleWordTTSService, TTSService
from pipecat.services.tts_service import InterruptibleWordTTSService, WordTTSService
from pipecat.transcriptions.language import Language
# See .env.example for ElevenLabs configuration needed
@@ -441,8 +441,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
logger.error(f"{self} exception: {e}")
class ElevenLabsHttpTTSService(TTSService):
"""ElevenLabs Text-to-Speech service using HTTP streaming.
class ElevenLabsHttpTTSService(WordTTSService):
"""ElevenLabs Text-to-Speech service using HTTP streaming with word timestamps.
Args:
api_key: ElevenLabs API key
@@ -475,7 +475,13 @@ class ElevenLabsHttpTTSService(TTSService):
params: InputParams = InputParams(),
**kwargs,
):
super().__init__(sample_rate=sample_rate, **kwargs)
super().__init__(
aggregate_sentences=True,
push_text_frames=False,
push_stop_frames=True,
sample_rate=sample_rate,
**kwargs,
)
self._api_key = api_key
self._base_url = base_url
@@ -498,28 +504,109 @@ class ElevenLabsHttpTTSService(TTSService):
self._output_format = "" # initialized in start()
self._voice_settings = self._set_voice_settings()
# Track cumulative time to properly sequence word timestamps across utterances
self._cumulative_time = 0
self._started = False
def language_to_service_language(self, language: Language) -> Optional[str]:
    """Translate a pipecat ``Language`` into an ElevenLabs language code.

    Args:
        language: The pipecat language value to convert.

    Returns:
        Whatever ``language_to_elevenlabs_language`` yields for ``language``.
    """
    service_language = language_to_elevenlabs_language(language)
    return service_language
def can_generate_metrics(self) -> bool:
    """Report that this service is able to generate usage metrics.

    Returns:
        Always ``True``.
    """
    supports_metrics = True
    return supports_metrics
def _set_voice_settings(self):
    """Build the voice settings from this service's ``_settings``.

    Returns:
        The result of ``build_elevenlabs_voice_settings(self._settings)``.
    """
    voice_settings = build_elevenlabs_voice_settings(self._settings)
    return voice_settings
async def start(self, frame: StartFrame):
    """Prepare the service when a ``StartFrame`` arrives.

    Runs the parent ``start`` first, then derives the output format from
    the current sample rate and clears the word-timing state.

    Args:
        frame: The start frame being handled.
    """
    await super().start(frame)
    # Reset timing state so word timestamps begin from zero again.
    self._cumulative_time, self._started = 0, False
    self._output_format = output_format_from_sample_rate(self.sample_rate)
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text using ElevenLabs streaming API.
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
    """Forward a frame via the parent class, then reset timing state if needed.

    A ``StartInterruptionFrame`` or ``TTSStoppedFrame`` clears the started
    flag and the cumulative time. A ``TTSStoppedFrame`` additionally queues
    two zero-timestamp entries through ``add_word_timestamps``.

    Args:
        frame: The frame to push.
        direction: Direction in which the frame travels.
    """
    await super().push_frame(frame, direction)

    # Any other frame type leaves the timing state untouched.
    if not isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
        return

    self._started = False
    self._cumulative_time = 0

    if isinstance(frame, TTSStoppedFrame):
        # NOTE(review): these look like sentinel entries interpreted by the
        # word-timestamp base class -- confirm against WordTTSService.
        await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
    """Calculate word timing from character alignment data.

    A word is a maximal run of characters that are not a single space
    (``" "``). Each word is stamped with the start time of its first
    character, offset by ``self._cumulative_time``.

    Example input data:
    {
        "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
        "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }

    Would produce word times (with cumulative_time=0):
    [("Hello", 0.1), ("world", 0.5)]

    Args:
        alignment_info: Character timing data from ElevenLabs

    Returns:
        List of (word, timestamp) pairs. Empty when the character or
        start-time lists are missing, null, empty, or differ in length.
    """
    # `or []` (instead of a .get default) also covers keys that are present
    # but null; otherwise the len() calls in the warning below would raise.
    chars = alignment_info.get("characters") or []
    char_start_times = alignment_info.get("character_start_times_seconds") or []

    if not chars or not char_start_times or len(chars) != len(char_start_times):
        logger.warning(
            f"Invalid alignment data: chars={len(chars)}, times={len(char_start_times)}"
        )
        return []

    offset = self._cumulative_time
    word_times: List[Tuple[str, float]] = []
    current_word = ""
    word_start = 0.0

    for char, start_time in zip(chars, char_start_times):
        if char == " ":
            # A space closes the word in progress; repeated spaces are skipped.
            if current_word:
                word_times.append((current_word, offset + word_start))
                current_word = ""
        else:
            if not current_word:
                # First character of a new word fixes the word's timestamp.
                word_start = start_time
            current_word += char

    # Flush the last word when the data does not end with a space.
    if current_word:
        word_times.append((current_word, offset + word_start))

    return word_times
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text using ElevenLabs streaming API with timestamps.
Args:
text: Text to convert to speech
Yields:
Frames containing audio data and status information
Audio and control frames
"""
logger.debug(f"{self}: Generating TTS [{text}]")
url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream"
# Use the with-timestamps endpoint
url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream/with-timestamps"
payload: Dict[str, Union[str, Dict[str, Union[float, bool]]]] = {
"text": text,
@@ -550,8 +637,6 @@ class ElevenLabsHttpTTSService(TTSService):
if self._settings["optimize_streaming_latency"] is not None:
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
logger.debug(f"{self} ElevenLabs request - payload: {payload}, params: {params}")
try:
await self.start_ttfb_metrics()
@@ -566,17 +651,59 @@ class ElevenLabsHttpTTSService(TTSService):
await self.start_tts_usage_metrics(text)
# Process the streaming response
CHUNK_SIZE = 1024
# Start TTS sequence if not already started
if not self._started:
self.start_word_timestamps()
yield TTSStartedFrame()
self._started = True
# Track the duration of this utterance based on the last character's end time
utterance_duration = 0
async for line in response.content:
line_str = line.decode("utf-8").strip()
if not line_str:
continue
try:
# Parse the JSON object
data = json.loads(line_str)
# Process audio if present
if data and "audio_base64" in data:
await self.stop_ttfb_metrics()
audio = base64.b64decode(data["audio_base64"])
yield TTSAudioRawFrame(audio, self.sample_rate, 1)
# Process alignment if present
if data and "alignment" in data:
alignment = data["alignment"]
if alignment: # Ensure alignment is not None
# Get end time of the last character in this chunk
char_end_times = alignment.get("character_end_times_seconds", [])
if char_end_times:
chunk_end_time = char_end_times[-1]
# Update to the longest end time seen so far
utterance_duration = max(utterance_duration, chunk_end_time)
# Calculate word timestamps
word_times = self.calculate_word_times(alignment)
if word_times:
await self.add_word_timestamps(word_times)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON from stream: {e}")
continue
except Exception as e:
logger.error(f"Error processing response: {e}", exc_info=True)
continue
# After processing all chunks, add the total utterance duration
# to the cumulative time to ensure next utterance starts after this one
if utterance_duration > 0:
self._cumulative_time += utterance_duration
yield TTSStartedFrame()
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
except Exception as e:
logger.error(f"Error in run_tts: {e}")
yield ErrorFrame(error=str(e))
finally:
await self.stop_ttfb_metrics()
yield TTSStoppedFrame()
# Let the parent class handle TTSStoppedFrame