Added word/timestamp pairs to ElevenLabsHttpTTSService
This commit is contained in:
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
you to control aggregator settings. You can now pass these arguments when
|
||||
creating aggregator pairs with `create_context_aggregator()`.
|
||||
|
||||
- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
|
||||
|
||||
- It is now possible to disable `SoundfileMixer` when created. You can then use
|
||||
`MixerEnableFrame` to dynamically enable it when necessary.
|
||||
|
||||
@@ -55,7 +57,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Fixed an issue in `SmallWebRTCTransport` where an error was thrown if the
|
||||
client did not create a video transceiver.
|
||||
|
||||
- Fixed an issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
|
||||
- Fixed an issue where LLM input parameters were not working and applied correctly in `GoogleVertexLLMService`, causing
|
||||
unexpected behavior during inference.
|
||||
|
||||
## [0.0.63] - 2025-04-11
|
||||
|
||||
@@ -25,7 +25,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleWordTTSService, TTSService
|
||||
from pipecat.services.tts_service import InterruptibleWordTTSService, WordTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
# See .env.example for ElevenLabs configuration needed
|
||||
@@ -441,8 +441,8 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
|
||||
logger.error(f"{self} exception: {e}")
|
||||
|
||||
|
||||
class ElevenLabsHttpTTSService(TTSService):
|
||||
"""ElevenLabs Text-to-Speech service using HTTP streaming.
|
||||
class ElevenLabsHttpTTSService(WordTTSService):
|
||||
"""ElevenLabs Text-to-Speech service using HTTP streaming with word timestamps.
|
||||
|
||||
Args:
|
||||
api_key: ElevenLabs API key
|
||||
@@ -475,7 +475,13 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
params: InputParams = InputParams(),
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(sample_rate=sample_rate, **kwargs)
|
||||
super().__init__(
|
||||
aggregate_sentences=True,
|
||||
push_text_frames=False,
|
||||
push_stop_frames=True,
|
||||
sample_rate=sample_rate,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
@@ -498,28 +504,109 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
self._output_format = "" # initialized in start()
|
||||
self._voice_settings = self._set_voice_settings()
|
||||
|
||||
# Track cumulative time to properly sequence word timestamps across utterances
|
||||
self._cumulative_time = 0
|
||||
self._started = False
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert pipecat Language to ElevenLabs language code."""
|
||||
return language_to_elevenlabs_language(language)
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Indicate that this service can generate usage metrics."""
|
||||
return True
|
||||
|
||||
def _set_voice_settings(self):
|
||||
return build_elevenlabs_voice_settings(self._settings)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Initialize the service upon receiving a StartFrame."""
|
||||
await super().start(frame)
|
||||
self._output_format = output_format_from_sample_rate(self.sample_rate)
|
||||
self._cumulative_time = 0
|
||||
self._started = False
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using ElevenLabs streaming API.
|
||||
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
|
||||
await super().push_frame(frame, direction)
|
||||
if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
|
||||
# Reset timing on interruption or stop
|
||||
self._started = False
|
||||
self._cumulative_time = 0
|
||||
if isinstance(frame, TTSStoppedFrame):
|
||||
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
|
||||
|
||||
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
|
||||
"""Calculate word timing from character alignment data.
|
||||
|
||||
Example input data:
|
||||
{
|
||||
"characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
|
||||
"character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
|
||||
"character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
|
||||
}
|
||||
|
||||
Would produce word times (with cumulative_time=0):
|
||||
[("Hello", 0.1), ("world", 0.5)]
|
||||
|
||||
Args:
|
||||
text: The text to convert to speech
|
||||
alignment_info: Character timing data from ElevenLabs
|
||||
|
||||
Returns:
|
||||
List of (word, timestamp) pairs
|
||||
"""
|
||||
chars = alignment_info.get("characters", [])
|
||||
char_start_times = alignment_info.get("character_start_times_seconds", [])
|
||||
|
||||
if not chars or not char_start_times or len(chars) != len(char_start_times):
|
||||
logger.warning(
|
||||
f"Invalid alignment data: chars={len(chars)}, times={len(char_start_times)}"
|
||||
)
|
||||
return []
|
||||
|
||||
# Build the words and find their start times
|
||||
words = []
|
||||
word_start_times = []
|
||||
current_word = ""
|
||||
first_char_idx = -1
|
||||
|
||||
for i, char in enumerate(chars):
|
||||
if char == " ":
|
||||
if current_word: # Only add non-empty words
|
||||
words.append(current_word)
|
||||
# Use time of the first character of the word, offset by cumulative time
|
||||
word_start_times.append(
|
||||
self._cumulative_time + char_start_times[first_char_idx]
|
||||
)
|
||||
current_word = ""
|
||||
first_char_idx = -1
|
||||
else:
|
||||
if not current_word: # This is the first character of a new word
|
||||
first_char_idx = i
|
||||
current_word += char
|
||||
|
||||
# Don't forget the last word if there's no trailing space
|
||||
if current_word and first_char_idx >= 0:
|
||||
words.append(current_word)
|
||||
word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
|
||||
|
||||
# Create word-time pairs
|
||||
word_times = list(zip(words, word_start_times))
|
||||
|
||||
return word_times
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text using ElevenLabs streaming API with timestamps.
|
||||
|
||||
Args:
|
||||
text: Text to convert to speech
|
||||
|
||||
Yields:
|
||||
Frames containing audio data and status information
|
||||
Audio and control frames
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
|
||||
url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream"
|
||||
# Use the with-timestamps endpoint
|
||||
url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream/with-timestamps"
|
||||
|
||||
payload: Dict[str, Union[str, Dict[str, Union[float, bool]]]] = {
|
||||
"text": text,
|
||||
@@ -550,8 +637,6 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
if self._settings["optimize_streaming_latency"] is not None:
|
||||
params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
|
||||
|
||||
logger.debug(f"{self} ElevenLabs request - payload: {payload}, params: {params}")
|
||||
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
@@ -566,17 +651,59 @@ class ElevenLabsHttpTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
# Process the streaming response
|
||||
CHUNK_SIZE = 1024
|
||||
# Start TTS sequence if not already started
|
||||
if not self._started:
|
||||
self.start_word_timestamps()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
# Track the duration of this utterance based on the last character's end time
|
||||
utterance_duration = 0
|
||||
async for line in response.content:
|
||||
line_str = line.decode("utf-8").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Parse the JSON object
|
||||
data = json.loads(line_str)
|
||||
|
||||
# Process audio if present
|
||||
if data and "audio_base64" in data:
|
||||
await self.stop_ttfb_metrics()
|
||||
audio = base64.b64decode(data["audio_base64"])
|
||||
yield TTSAudioRawFrame(audio, self.sample_rate, 1)
|
||||
|
||||
# Process alignment if present
|
||||
if data and "alignment" in data:
|
||||
alignment = data["alignment"]
|
||||
if alignment: # Ensure alignment is not None
|
||||
# Get end time of the last character in this chunk
|
||||
char_end_times = alignment.get("character_end_times_seconds", [])
|
||||
if char_end_times:
|
||||
chunk_end_time = char_end_times[-1]
|
||||
# Update to the longest end time seen so far
|
||||
utterance_duration = max(utterance_duration, chunk_end_time)
|
||||
|
||||
# Calculate word timestamps
|
||||
word_times = self.calculate_word_times(alignment)
|
||||
if word_times:
|
||||
await self.add_word_timestamps(word_times)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse JSON from stream: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing response: {e}", exc_info=True)
|
||||
continue
|
||||
|
||||
# After processing all chunks, add the total utterance duration
|
||||
# to the cumulative time to ensure next utterance starts after this one
|
||||
if utterance_duration > 0:
|
||||
self._cumulative_time += utterance_duration
|
||||
|
||||
yield TTSStartedFrame()
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in run_tts: {e}")
|
||||
yield ErrorFrame(error=str(e))
|
||||
finally:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSStoppedFrame()
|
||||
# Let the parent class handle TTSStoppedFrame
|
||||
|
||||
Reference in New Issue
Block a user