Merge pull request #4306 from YFortin/fix/azure-tts-last-word-race

fix(azure-tts): Route completion through word boundary queue to prevent last word from being missed
This commit is contained in:
Mark Backman
2026-05-19 22:27:50 -04:00
committed by GitHub
2 changed files with 27 additions and 11 deletions

1
changelog/4306.fixed.md Normal file
View File

@@ -0,0 +1 @@
- Fixed the last word of Azure TTS output being missed by observers and the RTVI UI. The completion signal raced with word timestamp processing, so the final word's `TTSTextFrame` could arrive after `TTSStoppedFrame`. Completion is now routed through the word boundary queue, ensuring all words are processed before the end of the audio stream is signaled.

View File

@@ -540,14 +540,25 @@ class AzureTTSService(TTSService, AzureBaseTTSService):
self._last_timestamp = timestamp
async def _word_processor_task_handler(self):
"""Process word timestamps from the queue and call add_word_timestamps."""
"""Process word timestamps from the queue and call add_word_timestamps.
Also handles a None sentinel from _handle_completed: once all pending
words have been drained, it signals audio stream completion via
_audio_queue so that run_tts exits only after the last word has been
processed.
"""
while True:
try:
word, timestamp_seconds = await self._word_boundary_queue.get()
if self._current_context_id:
await self.add_word_timestamps(
[(word, timestamp_seconds)], self._current_context_id
)
item = await self._word_boundary_queue.get()
if item is None:
# All words drained — now signal audio completion.
self._audio_queue.put_nowait(None)
else:
word, timestamp_seconds = item
if self._current_context_id:
await self.add_word_timestamps(
[(word, timestamp_seconds)], self._current_context_id
)
self._word_boundary_queue.task_done()
except asyncio.CancelledError:
break
@@ -569,17 +580,21 @@ class AzureTTSService(TTSService, AzureBaseTTSService):
Args:
evt: Completion event from Azure Speech SDK.
"""
# Store duration for cumulative offset calculation
if evt.result and evt.result.audio_duration:
self._current_sentence_duration = evt.result.audio_duration.total_seconds()
# Flush any pending word before completing
if self._last_word is not None:
self._word_boundary_queue.put_nowait((self._last_word, self._last_timestamp))
self._last_word = None
self._last_timestamp = None
# Store duration for cumulative offset calculation
if evt.result and evt.result.audio_duration:
self._current_sentence_duration = evt.result.audio_duration.total_seconds()
self._audio_queue.put_nowait(None) # Signal completion
# Route completion through the word boundary queue so the word processor
# task drains all pending words before signaling audio stream completion.
# Without this, the last word's TTSTextFrame may arrive after
# TTSStoppedFrame, causing it to be missed by observers and the UI.
self._word_boundary_queue.put_nowait(None)
def _handle_canceled(self, evt):
"""Handle synthesis cancellation.