From 44a698cbccccd40199b84e0390bde3f1355eed47 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 23 Dec 2025 10:31:46 -0500 Subject: [PATCH] Add AudioContextTTSService base class, update AudioContextWordTTSService inheritance --- changelog/3289.added.md | 1 + src/pipecat/services/tts_service.py | 38 ++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 changelog/3289.added.md diff --git a/changelog/3289.added.md b/changelog/3289.added.md new file mode 100644 index 000000000..fb19607eb --- /dev/null +++ b/changelog/3289.added.md @@ -0,0 +1 @@ +- Added a new `AudioContextTTSService` to the TTS service base classes. The `AudioContextWordTTSService` now inherits from `AudioContextTTSService` and `WebsocketWordTTSService`. diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 39a6078e0..08e3c5220 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -901,12 +901,11 @@ class InterruptibleWordTTSService(WebsocketWordTTSService): self._bot_speaking = False -class AudioContextWordTTSService(WebsocketWordTTSService): - """Websocket-based TTS service with word timestamps and audio context management. +class AudioContextTTSService(WebsocketService): + """Base class for websocket-based TTS services with audio context management. - This is a base class for websocket-based TTS services that support word - timestamps and also allow correlating the generated audio with the requested - text. + This is a base class for websocket-based TTS services that allow correlating + the generated audio with the requested text through audio contexts. Each request could be multiple sentences long which are grouped by context. For this to work, the TTS service needs to support handling @@ -917,13 +916,14 @@ class AudioContextWordTTSService(WebsocketWordTTSService): audio from context ID "A" will be played first. """ - def __init__(self, **kwargs): - """Initialize the Audio Context Word TTS service. + def __init__(self, *, reconnect_on_error: bool = True, **kwargs): + """Initialize the Audio Context TTS service. Args: - **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. + reconnect_on_error: Whether to automatically reconnect on websocket errors. + **kwargs: Additional arguments passed to the parent WebsocketService. """ - super().__init__(**kwargs) + super().__init__(reconnect_on_error=reconnect_on_error, **kwargs) self._contexts: Dict[str, asyncio.Queue] = {} self._audio_context_task = None @@ -1064,3 +1064,23 @@ class AudioContextWordTTSService(WebsocketWordTTSService): # We didn't get audio, so let's consider this context finished. logger.trace(f"{self} time out on audio context {context_id}") break + + +class AudioContextWordTTSService(AudioContextTTSService, WebsocketWordTTSService): + """Websocket-based TTS service with word timestamps and audio context management. + + This is a base class for websocket-based TTS services that support word + timestamps and also allow correlating the generated audio with the requested + text through audio contexts. + + Combines the audio context management capabilities of AudioContextTTSService + with the word timestamp functionality of WebsocketWordTTSService. + """ + + def __init__(self, **kwargs): + """Initialize the Audio Context Word TTS service. + + Args: + **kwargs: Additional arguments passed to parent classes. + """ + super().__init__(**kwargs)