Files
pipecat/tests/test_elevenlabs_tts.py
Mark Backman a5e6886b80 Fix ElevenLabs keepalive racing context-init (1008 disconnects)
The keepalive could fire for a new turn's context before that context's
voice_settings context-init was sent, making the keepalive the context's
first message (no voice_settings) and causing ElevenLabs to reject the
later init with a 1008 policy violation. The keepalive now only targets a
context once its context-init has been sent (tracked in _context_init_sent).
2026-05-20 08:59:01 -04:00

292 lines
8.2 KiB
Python

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Tests for ElevenLabs TTS alignment handling."""
import json
from typing import Any
import pytest
from websockets.protocol import State
from pipecat.services.elevenlabs.tts import (
ElevenLabsTTSService,
_select_alignment,
_strip_utterance_leading_spaces,
calculate_word_times,
)
# Alignment field names handed to _strip_utterance_leading_spaces in these tests;
# the same three keys appear in the payloads built by _chunk.
_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
def _chunk(text: str) -> dict[str, list[Any]]:
    """Build a synthetic alignment payload for ``text``.

    Each character gets a fixed 100 ms duration and the characters are laid
    out back to back, so the i-th character starts at ``i * 100`` ms.
    """
    count = len(text)
    return {
        "chars": [*text],
        "charStartTimesMs": list(range(0, count * 100, 100)),
        "charDurationsMs": [100] * count,
    }
def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
    """Feed alignment chunks through the word-timing helpers and return the words.

    Only the first chunk is passed to ``_strip_utterance_leading_spaces`` with
    the "strip" flag set. The partial word and the running time offset are
    carried from one chunk to the next, and a partial word still pending after
    the last chunk is emitted as a final word.
    """
    words: list[str] = []
    elapsed_s = 0.0
    pending_word = ""
    pending_start = 0.0

    for index, raw_chunk in enumerate(chunks):
        is_first_chunk = index == 0
        alignment = _strip_utterance_leading_spaces(raw_chunk, _WS_ALIGNMENT_KEYS, is_first_chunk)

        timed_words, pending_word, pending_start = calculate_word_times(
            alignment, elapsed_s, pending_word, pending_start
        )
        words.extend(word for word, _ in timed_words)

        start_ms = alignment["charStartTimesMs"]
        duration_ms = alignment["charDurationsMs"]
        if start_ms and duration_ms:
            # Advance the offset by the end time of this chunk's last character.
            elapsed_s += (start_ms[-1] + duration_ms[-1]) / 1000.0

    # A word split across chunks that never saw a terminator is still a word.
    if pending_word:
        words.append(pending_word)
    return words
def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
    """Chunk-leading spaces still separate words; a word split across chunks is rejoined."""
    chunk_texts = (
        " Why did the math book",
        " look so sad? ",
        " Because it had too m",
        "any problems. ",
    )
    expected_words = [
        "Why", "did", "the", "math", "book",
        "look", "so", "sad?",
        "Because", "it", "had", "too", "many", "problems.",
    ]

    words = _words_from_chunks([_chunk(text) for text in chunk_texts])

    assert words == expected_words
def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
    """The leading space is removed when the flag is True and kept when it is False."""
    at_utterance_start = _strip_utterance_leading_spaces(
        _chunk(" Hello"), _WS_ALIGNMENT_KEYS, True
    )
    mid_utterance = _strip_utterance_leading_spaces(
        _chunk(" world"), _WS_ALIGNMENT_KEYS, False
    )

    assert at_utterance_start["chars"] == [*"Hello"]
    assert mid_utterance["chars"] == [*" world"]
def test_select_alignment_default_prefers_alignment():
    """With ``prefer_normalized=False`` and both entries present, ``alignment`` is chosen."""
    chosen = _select_alignment(
        {"alignment": _chunk("Hello"), "normalizedAlignment": _chunk(" Hello")},
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )

    assert chosen is not None
    assert chosen["chars"] == [*"Hello"]
def test_select_alignment_dictionary_mode_prefers_normalized():
    """With ``prefer_normalized=True`` and both entries present, the normalized one is chosen."""
    chosen = _select_alignment(
        {"alignment": _chunk("Hello"), "normalizedAlignment": _chunk(" Hello")},
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=True,
    )

    assert chosen is not None
    assert chosen["chars"] == [*" Hello"]
def test_select_alignment_falls_back_when_preferred_missing():
    """If the preferred entry is absent from the message, the other entry is returned."""
    key_names = {"normalized_key": "normalizedAlignment", "alignment_key": "alignment"}

    # Raw alignment preferred, but only the normalized one is present.
    only_normalized = {"normalizedAlignment": _chunk(" Hello")}
    chosen = _select_alignment(only_normalized, prefer_normalized=False, **key_names)
    assert chosen is not None
    assert chosen["chars"] == [*" Hello"]

    # Normalized alignment preferred, but only the raw one is present.
    only_raw = {"alignment": _chunk("Hello")}
    chosen = _select_alignment(only_raw, prefer_normalized=True, **key_names)
    assert chosen is not None
    assert chosen["chars"] == [*"Hello"]
def test_select_alignment_falls_back_when_preferred_null():
    """A preferred entry that is present but ``None`` is treated like a missing one."""
    chosen = _select_alignment(
        {"alignment": None, "normalizedAlignment": _chunk(" Hello")},
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )

    assert chosen is not None
    assert chosen["chars"] == [*" Hello"]
def test_select_alignment_returns_none_when_both_missing():
    """``None`` is returned when neither entry is usable (absent, or both ``None``)."""
    key_names = {"normalized_key": "normalizedAlignment", "alignment_key": "alignment"}

    nothing_present = _select_alignment({}, prefer_normalized=False, **key_names)
    assert nothing_present is None

    both_null = _select_alignment(
        {"alignment": None, "normalizedAlignment": None},
        prefer_normalized=True,
        **key_names,
    )
    assert both_null is None
def test_select_alignment_works_with_http_field_names():
    """Selection is driven by the supplied key names, here the snake_case HTTP-style ones."""
    message = {
        "alignment": {"characters": list("Hi")},
        "normalized_alignment": {"characters": list(" Hi")},
    }
    key_names = {"normalized_key": "normalized_alignment", "alignment_key": "alignment"}

    raw_choice = _select_alignment(message, prefer_normalized=False, **key_names)
    assert raw_choice is not None
    assert raw_choice["characters"] == [*"Hi"]

    normalized_choice = _select_alignment(message, prefer_normalized=True, **key_names)
    assert normalized_choice is not None
    assert normalized_choice["characters"] == [*" Hi"]
# ---------------------------------------------------------------------------
# Keepalive vs context-init race
#
# The keepalive must only stamp a context_id once its context-init (carrying
# voice_settings) has been sent. Stamping it earlier makes the keepalive the
# context's first message, with no voice_settings, and ElevenLabs rejects the
# later context-init with a 1008 policy violation.
# ---------------------------------------------------------------------------
class _FakeWebSocket:
    """Test double for the service websocket: reports itself open and records sends."""

    def __init__(self):
        # JSON-decoded payloads passed to send(), in call order.
        self.sent: list[dict] = []
        # The fake always presents itself as an open connection.
        self.state = State.OPEN

    async def send(self, data: str):
        """Decode ``data`` as JSON and append the result to ``sent``."""
        payload = json.loads(data)
        self.sent.append(payload)
def _make_service() -> ElevenLabsTTSService:
    """Create an ``ElevenLabsTTSService`` with a dummy API key and fixed voice settings."""
    voice_settings = ElevenLabsTTSService.Settings(
        voice="test-voice",
        stability=0.55,
        similarity_boost=0.85,
        use_speaker_boost=True,
        speed=0.81,
    )
    return ElevenLabsTTSService(api_key="test-key", settings=voice_settings)
@pytest.mark.asyncio
async def test_keepalive_does_not_stamp_context_before_init():
    """A keepalive sent before the context-init must not carry the new context_id."""
    fake_ws = _FakeWebSocket()
    tts = _make_service()
    tts._websocket = fake_ws

    # Start-of-turn state: the turn context id is already set (TTSService does
    # this on LLMFullResponseStartFrame), but run_tts has not yet sent the
    # voice_settings init for it.
    tts._playing_context_id = None
    tts._turn_context_id = "ctx-1"
    assert "ctx-1" not in tts._context_init_sent

    await tts._send_keepalive()

    # The keepalive goes out without a context_id, so the real context-init
    # remains the context's first message and is not rejected with 1008.
    assert fake_ws.sent == [{"text": ""}]
@pytest.mark.asyncio
async def test_keepalive_stamps_context_after_init():
    """After the context-init has gone out, the keepalive carries that context_id."""
    fake_ws = _FakeWebSocket()
    tts = _make_service()
    tts._websocket = fake_ws
    tts._playing_context_id = None
    tts._turn_context_id = "ctx-1"

    # Mirror what run_tts does once the voice_settings init has been sent.
    tts._context_init_sent.add("ctx-1")

    await tts._send_keepalive()

    expected_message = {"text": "", "context_id": "ctx-1"}
    assert fake_ws.sent == [expected_message]
@pytest.mark.asyncio
async def test_keepalive_without_active_context_sends_empty():
    """When neither a turn nor a playing context is set, a bare empty-text message is sent."""
    fake_ws = _FakeWebSocket()
    tts = _make_service()
    tts._websocket = fake_ws
    tts._playing_context_id = None
    tts._turn_context_id = None

    await tts._send_keepalive()

    assert fake_ws.sent == [{"text": ""}]