The keepalive could fire for a new turn's context before that context's voice_settings context-init was sent, making the keepalive the context's first message (no voice_settings) and causing ElevenLabs to reject the later init with a 1008 policy violation. The keepalive now only targets a context once its context-init has been sent (tracked in _context_init_sent).
292 lines
8.2 KiB
Python
292 lines
8.2 KiB
Python
#
|
|
# Copyright (c) 2024-2026, Daily
|
|
#
|
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
#
|
|
|
|
"""Tests for ElevenLabs TTS alignment handling."""
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
import pytest
|
|
from websockets.protocol import State
|
|
|
|
from pipecat.services.elevenlabs.tts import (
|
|
ElevenLabsTTSService,
|
|
_select_alignment,
|
|
_strip_utterance_leading_spaces,
|
|
calculate_word_times,
|
|
)
|
|
|
|
# Field names of the alignment payloads built by `_chunk`; passed to
# `_strip_utterance_leading_spaces` to tell it which keys to operate on.
# NOTE(review): "WS" presumably stands for the WebSocket API field names
# (the HTTP-style payloads in this file use "characters" instead) — confirm.
_WS_ALIGNMENT_KEYS = ("chars", "charStartTimesMs", "charDurationsMs")
|
|
|
|
|
|
def _chunk(text: str) -> dict[str, list[Any]]:
    """Build a synthetic alignment payload for ``text``.

    Every character of ``text`` becomes one entry. Character ``i`` starts at
    ``i * 100`` ms and lasts 100 ms, so timings are contiguous and restart
    from zero for every chunk.

    Args:
        text: The characters the fake alignment chunk should contain.

    Returns:
        A dict with the ``chars``, ``charStartTimesMs`` and
        ``charDurationsMs`` lists, all of the same length as ``text``.
    """
    chars = list(text)
    return {
        "chars": chars,
        "charStartTimesMs": [i * 100 for i in range(len(chars))],
        "charDurationsMs": [100] * len(chars),
    }
|
|
|
|
|
|
def _words_from_chunks(chunks: list[dict[str, list[Any]]]) -> list[str]:
    """Feed alignment chunks through the word-timing helpers and return the words.

    Each chunk is first passed to ``_strip_utterance_leading_spaces`` (with the
    "first chunk" flag set only for the first chunk) and then to
    ``calculate_word_times``, carrying the partial word and its start time from
    one chunk to the next. A partial word still pending after the last chunk is
    appended as a final word.

    Args:
        chunks: Alignment payloads in arrival order, e.g. built with ``_chunk``.

    Returns:
        The words in order, with their timestamps dropped.
    """
    cumulative_time = 0.0
    partial_word = ""
    partial_word_start_time = 0.0
    word_times: list[tuple[str, float]] = []
    alignment_started = False

    for chunk in chunks:
        # Only the very first chunk is treated as the start of the utterance.
        alignment = _strip_utterance_leading_spaces(
            chunk,
            _WS_ALIGNMENT_KEYS,
            not alignment_started,
        )
        alignment_started = True
        chunk_word_times, partial_word, partial_word_start_time = calculate_word_times(
            alignment,
            cumulative_time,
            partial_word,
            partial_word_start_time,
        )
        word_times.extend(chunk_word_times)

        # Advance the running offset by the end of this chunk's last character
        # (chunk timings are in milliseconds, the offset is in seconds).
        starts = alignment["charStartTimesMs"]
        durations = alignment["charDurationsMs"]
        if starts and durations:
            cumulative_time += (starts[-1] + durations[-1]) / 1000.0

    # Flush a word that was still being assembled when the chunks ran out.
    if partial_word:
        word_times.append((partial_word, partial_word_start_time))

    return [word for word, _ in word_times]
|
|
|
|
|
|
def test_elevenlabs_flash_alignment_preserves_inter_word_chunk_space():
    """Leading spaces on later chunks still separate words; split words rejoin."""
    chunks = [
        _chunk(" Why did the math book"),
        _chunk(" look so sad? "),
        _chunk(" Because it had too m"),
        _chunk("any problems. "),
    ]

    assert _words_from_chunks(chunks) == [
        "Why",
        "did",
        "the",
        "math",
        "book",
        "look",
        "so",
        "sad?",
        "Because",
        "it",
        "had",
        "too",
        "many",
        "problems.",
    ]
|
|
|
|
|
|
def test_elevenlabs_alignment_strips_only_utterance_leading_spaces():
    """The leading space is dropped for the first chunk and kept for later ones."""
    first = _strip_utterance_leading_spaces(_chunk(" Hello"), _WS_ALIGNMENT_KEYS, True)
    subsequent = _strip_utterance_leading_spaces(_chunk(" world"), _WS_ALIGNMENT_KEYS, False)

    assert first["chars"] == list("Hello")
    assert subsequent["chars"] == list(" world")
|
|
|
|
|
|
def test_select_alignment_default_prefers_alignment():
    """With ``prefer_normalized=False`` the plain alignment is selected."""
    msg = {
        "alignment": _chunk("Hello"),
        "normalizedAlignment": _chunk(" Hello"),
    }
    selected = _select_alignment(
        msg,
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )
    assert selected is not None
    assert selected["chars"] == list("Hello")
|
|
|
|
|
|
def test_select_alignment_dictionary_mode_prefers_normalized():
    """With ``prefer_normalized=True`` the normalized alignment is selected."""
    msg = {
        "alignment": _chunk("Hello"),
        "normalizedAlignment": _chunk(" Hello"),
    }
    selected = _select_alignment(
        msg,
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=True,
    )
    assert selected is not None
    assert selected["chars"] == list(" Hello")
|
|
|
|
|
|
def test_select_alignment_falls_back_when_preferred_missing():
    """If the preferred key is absent, the other alignment is returned instead."""
    # Plain alignment preferred, but only the normalized one is present.
    msg_default = {"normalizedAlignment": _chunk(" Hello")}
    selected = _select_alignment(
        msg_default,
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )
    assert selected is not None
    assert selected["chars"] == list(" Hello")

    # Normalized alignment preferred, but only the plain one is present.
    msg_dict = {"alignment": _chunk("Hello")}
    selected = _select_alignment(
        msg_dict,
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=True,
    )
    assert selected is not None
    assert selected["chars"] == list("Hello")
|
|
|
|
|
|
def test_select_alignment_falls_back_when_preferred_null():
    """A preferred key that is present but ``None`` also triggers the fallback."""
    msg = {"alignment": None, "normalizedAlignment": _chunk(" Hello")}
    selected = _select_alignment(
        msg,
        normalized_key="normalizedAlignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )
    assert selected is not None
    assert selected["chars"] == list(" Hello")
|
|
|
|
|
|
def test_select_alignment_returns_none_when_both_missing():
    """``None`` is returned when neither alignment is usable (absent or null)."""
    assert (
        _select_alignment(
            {},
            normalized_key="normalizedAlignment",
            alignment_key="alignment",
            prefer_normalized=False,
        )
        is None
    )
    assert (
        _select_alignment(
            {"alignment": None, "normalizedAlignment": None},
            normalized_key="normalizedAlignment",
            alignment_key="alignment",
            prefer_normalized=True,
        )
        is None
    )
|
|
|
|
|
|
def test_select_alignment_works_with_http_field_names():
    """Selection honours caller-supplied key names (snake_case HTTP-style here)."""
    msg = {
        "alignment": {"characters": list("Hi")},
        "normalized_alignment": {"characters": list(" Hi")},
    }
    selected = _select_alignment(
        msg,
        normalized_key="normalized_alignment",
        alignment_key="alignment",
        prefer_normalized=False,
    )
    assert selected is not None
    assert selected["characters"] == list("Hi")

    selected = _select_alignment(
        msg,
        normalized_key="normalized_alignment",
        alignment_key="alignment",
        prefer_normalized=True,
    )
    assert selected is not None
    assert selected["characters"] == list(" Hi")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Keepalive vs context-init race
#
# The keepalive must only stamp a context_id once its context-init (carrying
# voice_settings) has been sent. Stamping it earlier makes the keepalive the
# context's first message, with no voice_settings, and ElevenLabs rejects the
# later context-init with a 1008 policy violation.
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class _FakeWebSocket:
    """Minimal stand-in for the ElevenLabs websocket that records sends.

    Exposes only what the tests in this file touch: a ``state`` attribute
    fixed to ``State.OPEN`` and an async ``send`` that stores each outgoing
    message, JSON-decoded, in ``sent``.
    """

    def __init__(self) -> None:
        # Report an open connection to whatever inspects ``state``.
        self.state = State.OPEN
        # Every message passed to ``send``, decoded, in send order.
        self.sent: list[dict] = []

    async def send(self, data: str) -> None:
        """Decode ``data`` as JSON and record it instead of sending it."""
        self.sent.append(json.loads(data))
|
|
|
|
|
|
def _make_service() -> ElevenLabsTTSService:
    """Create an ``ElevenLabsTTSService`` with a dummy API key and fixed voice settings."""
    return ElevenLabsTTSService(
        api_key="test-key",
        settings=ElevenLabsTTSService.Settings(
            voice="test-voice",
            stability=0.55,
            similarity_boost=0.85,
            use_speaker_boost=True,
            speed=0.81,
        ),
    )
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_keepalive_does_not_stamp_context_before_init():
    """During the pre-init window the keepalive must not stamp the new context_id."""
    service = _make_service()
    ws = _FakeWebSocket()
    service._websocket = ws

    # Simulate the start of an LLM turn: TTSService sets the turn context id on
    # LLMFullResponseStartFrame, before run_tts sends the voice_settings init.
    service._turn_context_id = "ctx-1"
    service._playing_context_id = None
    assert "ctx-1" not in service._context_init_sent

    await service._send_keepalive()

    # Context-less keepalive: the real context-init stays the context's first
    # message, so ElevenLabs won't reject it with 1008.
    assert ws.sent == [{"text": ""}]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_keepalive_stamps_context_after_init():
    """Once the context-init has been sent, the keepalive targets that context."""
    service = _make_service()
    ws = _FakeWebSocket()
    service._websocket = ws
    service._turn_context_id = "ctx-1"
    service._playing_context_id = None
    # run_tts records the context once its voice_settings init has gone out.
    service._context_init_sent.add("ctx-1")

    await service._send_keepalive()

    assert ws.sent == [{"text": "", "context_id": "ctx-1"}]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_keepalive_without_active_context_sends_empty():
    """With no active context, the keepalive sends a plain empty message."""
    service = _make_service()
    ws = _FakeWebSocket()
    service._websocket = ws
    service._turn_context_id = None
    service._playing_context_id = None

    await service._send_keepalive()

    assert ws.sent == [{"text": ""}]
|