Call start_word_timestamps() when the first audio chunk arrives

2026-01-07 13:10:41 +09:00
parent 137bbb3d2c
commit f62c262f23
1 changed file with 7 additions and 1 deletion
--- a/src/pipecat/services/azure/tts.py
+++ b/src/pipecat/services/azure/tts.py
@@ -441,9 +441,9 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
            try:
                if not self._started:
                    await self.start_ttfb_metrics()
-                    await self.start_word_timestamps()
                    yield TTSStartedFrame()
                    self._started = True
+                    self._first_chunk = True
                    self._cumulative_audio_offset = 0.0

                ssml = self._construct_ssml(text)
@@ -457,6 +457,12 @@ class AzureTTSService(WordTTSService, AzureBaseTTSService):
                        break

                    await self.stop_ttfb_metrics()
+
+                    # Start word timestamps when the first audio chunk arrives
+                    if self._first_chunk:
+                        await self.start_word_timestamps()
+                        self._first_chunk = False
+
                    frame = TTSAudioRawFrame(
                        audio=chunk,
                        sample_rate=self.sample_rate,