From 67cf4b400a5e99c76585bf49c6cb856f36c02eab Mon Sep 17 00:00:00 2001
From: James Hush <james@daily.co>
Date: Wed, 17 Sep 2025 14:51:09 +0800
Subject: [PATCH] Fix AsyncAI TTS repeated transcription issue

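- Add _transcription_generated flag to track whether the transcription has already been generated for the current text
- Generate word timestamps only once per text input instead of on every audio chunk
- Emit the "TTSStoppedFrame"/"Reset" stop markers together with that single timestamp rather than after every audio chunk
- Reset the flag when new text is processed
- Prevent the same transcription (e.g. 'One, two, three') from being emitted repeatedly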
---
 src/pipecat/services/asyncai/tts.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py
index 47db09dca..4878f5911 100644
--- a/src/pipecat/services/asyncai/tts.py
+++ b/src/pipecat/services/asyncai/tts.py
@@ -149,6 +149,7 @@ class AsyncAITTSService(WebsocketWordTTSService):
         self._keepalive_task = None
         self._started = False
         self._current_text = ""  # Track current text for generating timestamps
+        self._transcription_generated = False  # Track if transcription was already generated
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
@@ -289,11 +290,14 @@ class AsyncAITTSService(WebsocketWordTTSService):
                 await self.stop_ttfb_metrics()
 
                 # Start word timestamps and add the entire text as one "word"
-                # This generates transcription frames for the bot
-                self.start_word_timestamps()
-                if self._current_text.strip():
+                # This generates transcription frames for the bot - but only once per text
+                if not self._transcription_generated and self._current_text.strip():
+                    self.start_word_timestamps()
                     # Add the entire text as a single timestamp at time 0
                     await self.add_word_timestamps([(self._current_text, 0.0)])
+                    # Add stop markers to end the word timestamps
+                    await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
+                    self._transcription_generated = True
 
                 frame = TTSAudioRawFrame(
                     audio=base64.b64decode(msg["audio"]),
@@ -302,9 +306,6 @@ class AsyncAITTSService(WebsocketWordTTSService):
                 )
                 await self.push_frame(frame)
 
-                # Add stop markers to end the word timestamps
-                await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
-
             elif msg.get("error_code"):
                 logger.error(f"{self} error: {msg}")
                 await self.push_frame(TTSStoppedFrame())
@@ -350,6 +351,7 @@ class AsyncAITTSService(WebsocketWordTTSService):
 
             # Store the current text for generating timestamps
             self._current_text = text
+            self._transcription_generated = False  # Reset for new text
             msg = self._build_msg(text=text, force=True)
 
             try: