Fix AsyncAI TTS repeated transcription issue

- Add a `_transcription_generated` flag to track whether the transcription was already created.
- Only generate word timestamps once per text input, not on every audio chunk.
- Reset the flag when new text is processed.
- Prevent duplicate transcription entries (e.g. 'One, two, three' being repeated multiple times).
2025-09-17 14:51:09 +08:00
parent 0340431608
commit 67cf4b400a
1 changed files with 8 additions and 6 deletions
--- a/src/pipecat/services/asyncai/tts.py
+++ b/src/pipecat/services/asyncai/tts.py
@@ -149,6 +149,7 @@ class AsyncAITTSService(WebsocketWordTTSService):
        self._keepalive_task = None
        self._started = False
        self._current_text = ""  # Track current text for generating timestamps
+        self._transcription_generated = False  # Track if transcription was already generated

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.
@@ -289,11 +290,14 @@ class AsyncAITTSService(WebsocketWordTTSService):
                await self.stop_ttfb_metrics()

                # Start word timestamps and add the entire text as one "word"
-                # This generates transcription frames for the bot
-                self.start_word_timestamps()
-                if self._current_text.strip():
+                # This generates transcription frames for the bot - but only once per text
+                if not self._transcription_generated and self._current_text.strip():
+                    self.start_word_timestamps()
                    # Add the entire text as a single timestamp at time 0
                    await self.add_word_timestamps([(self._current_text, 0.0)])
+                    # Add stop markers to end the word timestamps
+                    await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
+                    self._transcription_generated = True

                frame = TTSAudioRawFrame(
                    audio=base64.b64decode(msg["audio"]),
@@ -302,9 +306,6 @@ class AsyncAITTSService(WebsocketWordTTSService):
                )
                await self.push_frame(frame)

-                # Add stop markers to end the word timestamps
-                await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
-
            elif msg.get("error_code"):
                logger.error(f"{self} error: {msg}")
                await self.push_frame(TTSStoppedFrame())
@@ -350,6 +351,7 @@ class AsyncAITTSService(WebsocketWordTTSService):

            # Store the current text for generating timestamps
            self._current_text = text
+            self._transcription_generated = False  # Reset for new text
            msg = self._build_msg(text=text, force=True)

            try: