Merge pull request #1681 from pipecat-ai/aleix/tts-service-llm-full-response-end-fix
TTSService: do not push LLMFullResponseEndFrame if not needed
This commit is contained in:
@@ -73,6 +73,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a TTS services issue that could cause assistant output not to be
|
||||
aggregated to the context when also using `TTSSpeakFrame`s.
|
||||
|
||||
- Fixed an issue where the `SmartTurnMetricsData` was reporting 0ms for
|
||||
inference and processing time when using the `FalSmartTurnAnalyzer`.
|
||||
|
||||
|
||||
@@ -250,9 +250,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
||||
continue
|
||||
if msg["type"] == "done":
|
||||
await self.stop_ttfb_metrics()
|
||||
await self.add_word_timestamps(
|
||||
[("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0), ("Reset", 0)]
|
||||
)
|
||||
await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
|
||||
await self.remove_audio_context(msg["context_id"])
|
||||
elif msg["type"] == "timestamps":
|
||||
await self.add_word_timestamps(
|
||||
|
||||
@@ -287,7 +287,7 @@ class ElevenLabsTTSService(InterruptibleWordTTSService):
|
||||
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
|
||||
self._started = False
|
||||
if isinstance(frame, TTSStoppedFrame):
|
||||
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
|
||||
await self.add_word_timestamps([("Reset", 0)])
|
||||
|
||||
async def _connect(self):
|
||||
await self._connect_websocket()
|
||||
@@ -526,7 +526,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
||||
self._reset_state()
|
||||
|
||||
if isinstance(frame, TTSStoppedFrame):
|
||||
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
|
||||
await self.add_word_timestamps([("Reset", 0)])
|
||||
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
# End of turn - reset previous text
|
||||
|
||||
@@ -304,7 +304,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
||||
await super().push_frame(frame, direction)
|
||||
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
|
||||
if isinstance(frame, TTSStoppedFrame):
|
||||
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
|
||||
await self.add_word_timestamps([("Reset", 0)])
|
||||
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
"""Generate speech from text.
|
||||
|
||||
@@ -19,6 +19,7 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
TextFrame,
|
||||
@@ -308,6 +309,7 @@ class WordTTSService(TTSService):
|
||||
self._initial_word_timestamp = -1
|
||||
self._words_queue = asyncio.Queue()
|
||||
self._words_task = None
|
||||
self._llm_response_started: bool = False
|
||||
|
||||
def start_word_timestamps(self):
|
||||
if self._initial_word_timestamp == -1:
|
||||
@@ -335,11 +337,14 @@ class WordTTSService(TTSService):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
|
||||
if isinstance(frame, LLMFullResponseStartFrame):
|
||||
self._llm_response_started = True
|
||||
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
|
||||
await self.flush_audio()
|
||||
|
||||
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
||||
await super()._handle_interruption(frame, direction)
|
||||
self._llm_response_started = False
|
||||
self.reset_word_timestamps()
|
||||
|
||||
def _create_words_task(self):
|
||||
@@ -354,13 +359,14 @@ class WordTTSService(TTSService):
|
||||
async def _words_task_handler(self):
|
||||
last_pts = 0
|
||||
while True:
|
||||
frame = None
|
||||
(word, timestamp) = await self._words_queue.get()
|
||||
if word == "Reset" and timestamp == 0:
|
||||
self.reset_word_timestamps()
|
||||
frame = None
|
||||
elif word == "LLMFullResponseEndFrame" and timestamp == 0:
|
||||
frame = LLMFullResponseEndFrame()
|
||||
frame.pts = last_pts
|
||||
if self._llm_response_started:
|
||||
self._llm_response_started = False
|
||||
frame = LLMFullResponseEndFrame()
|
||||
frame.pts = last_pts
|
||||
elif word == "TTSStoppedFrame" and timestamp == 0:
|
||||
frame = TTSStoppedFrame()
|
||||
frame.pts = last_pts
|
||||
|
||||
Reference in New Issue
Block a user