From 21d8d148b8adebfaaf5986fc7058a85aa4b8b4ec Mon Sep 17 00:00:00 2001 From: Pyae Sone Myo <66440980+Rickaym@users.noreply.github.com> Date: Sun, 12 Oct 2025 22:10:11 +0630 Subject: [PATCH] fix: handle partial words across alignment chunks gracefully --- src/pipecat/services/elevenlabs/tts.py | 60 ++++++++++++++++---------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py index 3d6d5bd4c..4c91c4f38 100644 --- a/src/pipecat/services/elevenlabs/tts.py +++ b/src/pipecat/services/elevenlabs/tts.py @@ -168,16 +168,24 @@ def build_elevenlabs_voice_settings( def calculate_word_times( - alignment_info: Mapping[str, Any], cumulative_time: float -) -> List[Tuple[str, float]]: + alignment_info: Mapping[str, Any], + cumulative_time: float, + partial_word: str = "", + partial_word_start_time: float = 0.0, +) -> tuple[List[Tuple[str, float]], str, float]: """Calculate word timestamps from character alignment information. Args: alignment_info: Character alignment data from ElevenLabs API. cumulative_time: Base time offset for this chunk. + partial_word: Partial word carried over from previous chunk. + partial_word_start_time: Start time of the partial word. Returns: - List of (word, timestamp) tuples. + Tuple of (word_times, new_partial_word, new_partial_word_start_time): + - word_times: List of (word, timestamp) tuples for complete words + - new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space) + - new_partial_word_start_time: Start time of the incomplete word """ chars = alignment_info["chars"] char_start_times_ms = alignment_info["charStartTimesMs"] @@ -186,41 +194,37 @@ def calculate_word_times( logger.error( f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}" ) - return [] + return ([], partial_word, partial_word_start_time) # Build words and track their start positions words = [] - word_start_indices = [] - current_word = "" - word_start_index = None + word_start_times = [] + current_word = partial_word # Start with any partial word from previous chunk + word_start_time = partial_word_start_time if partial_word else None for i, char in enumerate(chars): if char == " ": # End of current word if current_word: # Only add non-empty words words.append(current_word) - word_start_indices.append(word_start_index) + word_start_times.append(word_start_time) current_word = "" - word_start_index = None + word_start_time = None else: # Building a word - if word_start_index is None: # First character of new word - word_start_index = i + if word_start_time is None: # First character of new word + # Convert from milliseconds to seconds and add cumulative offset + word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0) current_word += char - # Handle the last word if there's no trailing space - if current_word and word_start_index is not None: - words.append(current_word) - word_start_indices.append(word_start_index) + # Build result for complete words + word_times = list(zip(words, word_start_times)) - # Calculate timestamps for each word - word_times = [] - for word, start_idx in zip(words, word_start_indices): - # Convert from milliseconds to seconds and add cumulative offset - start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0) - word_times.append((word, start_time_seconds)) + # Return any incomplete word at the end of this chunk + new_partial_word = current_word if current_word else "" + new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0 - return word_times + return (word_times, new_partial_word, new_partial_word_start_time) class ElevenLabsTTSService(AudioContextWordTTSService): @@ -332,6 +336,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService): # there's an interruption or TTSStoppedFrame. self._started = False self._cumulative_time = 0 + # Track partial words that span across alignment chunks + self._partial_word = "" + self._partial_word_start_time = 0.0 # Context management for v1 multi API self._context_id = None @@ -609,7 +616,12 @@ class ElevenLabsTTSService(AudioContextWordTTSService): if msg.get("alignment"): alignment = msg["alignment"] - word_times = calculate_word_times(alignment, self._cumulative_time) + word_times, self._partial_word, self._partial_word_start_time = calculate_word_times( + alignment, + self._cumulative_time, + self._partial_word, + self._partial_word_start_time, + ) if word_times: await self.add_word_timestamps(word_times) @@ -683,6 +695,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService): yield TTSStartedFrame() self._started = True self._cumulative_time = 0 + self._partial_word = "" + self._partial_word_start_time = 0.0 # If a context ID does not exist, create a new one and # register it. If an ID exists, that means the Pipeline is # configured for allow_interruptions=False, so continue