Merge pull request #1681 from pipecat-ai/aleix/tts-service-llm-full-response-end-fix

TTSService: do not push LLMFullResponseEndFrame if not needed
This commit is contained in:
Aleix Conchillo Flaqué
2025-04-28 12:00:55 -07:00
committed by GitHub
5 changed files with 18 additions and 11 deletions

View File

@@ -73,6 +73,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed a TTS services issue that could cause assistant output not to be
aggregated to the context when also using `TTSSpeakFrame`s.
- Fixed an issue where the `SmartTurnMetricsData` was reporting 0ms for
inference and processing time when using the `FalSmartTurnAnalyzer`.

View File

@@ -250,9 +250,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
continue
if msg["type"] == "done":
await self.stop_ttfb_metrics()
await self.add_word_timestamps(
[("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0), ("Reset", 0)]
)
await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
await self.remove_audio_context(msg["context_id"])
elif msg["type"] == "timestamps":
await self.add_word_timestamps(

View File

@@ -287,7 +287,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
self._started = False
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])
async def _connect(self):
await self._connect_websocket()
@@ -526,7 +526,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
self._reset_state()
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])
elif isinstance(frame, LLMFullResponseEndFrame):
# End of turn - reset previous text

View File

@@ -304,7 +304,7 @@ class RimeTTSService(AudioContextWordTTSService):
await super().push_frame(frame, direction)
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text.

View File

@@ -19,6 +19,7 @@ from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
StartFrame,
StartInterruptionFrame,
TextFrame,
@@ -308,6 +309,7 @@ class WordTTSService(TTSService):
self._initial_word_timestamp = -1
self._words_queue = asyncio.Queue()
self._words_task = None
self._llm_response_started: bool = False
def start_word_timestamps(self):
if self._initial_word_timestamp == -1:
@@ -335,11 +337,14 @@ class WordTTSService(TTSService):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
if isinstance(frame, LLMFullResponseStartFrame):
self._llm_response_started = True
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
await self.flush_audio()
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
await super()._handle_interruption(frame, direction)
self._llm_response_started = False
self.reset_word_timestamps()
def _create_words_task(self):
@@ -354,13 +359,14 @@ class WordTTSService(TTSService):
async def _words_task_handler(self):
last_pts = 0
while True:
frame = None
(word, timestamp) = await self._words_queue.get()
if word == "Reset" and timestamp == 0:
self.reset_word_timestamps()
frame = None
elif word == "LLMFullResponseEndFrame" and timestamp == 0:
frame = LLMFullResponseEndFrame()
frame.pts = last_pts
if self._llm_response_started:
self._llm_response_started = False
frame = LLMFullResponseEndFrame()
frame.pts = last_pts
elif word == "TTSStoppedFrame" and timestamp == 0:
frame = TTSStoppedFrame()
frame.pts = last_pts