From 67cf4b400a5e99c76585bf49c6cb856f36c02eab Mon Sep 17 00:00:00 2001 From: James Hush Date: Wed, 17 Sep 2025 14:51:09 +0800 Subject: [PATCH] Fix AsyncAI TTS repeated transcription issue - Add _transcription_generated flag to track if transcription was already created - Only generate word timestamps once per text input, not on every audio chunk - Reset flag when new text is processed - Prevents duplicate transcription entries like 'One, two, three' being repeated multiple times --- src/pipecat/services/asyncai/tts.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py index 47db09dca..4878f5911 100644 --- a/src/pipecat/services/asyncai/tts.py +++ b/src/pipecat/services/asyncai/tts.py @@ -149,6 +149,7 @@ class AsyncAITTSService(WebsocketWordTTSService): self._keepalive_task = None self._started = False self._current_text = "" # Track current text for generating timestamps + self._transcription_generated = False # Track if transcription was already generated def can_generate_metrics(self) -> bool: """Check if this service can generate processing metrics. @@ -289,11 +290,14 @@ class AsyncAITTSService(WebsocketWordTTSService): await self.stop_ttfb_metrics() # Start word timestamps and add the entire text as one "word" - # This generates transcription frames for the bot - self.start_word_timestamps() - if self._current_text.strip(): + # This generates transcription frames for the bot - but only once per text + if not self._transcription_generated and self._current_text.strip(): + self.start_word_timestamps() # Add the entire text as a single timestamp at time 0 await self.add_word_timestamps([(self._current_text, 0.0)]) + # Add stop markers to end the word timestamps + await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)]) + self._transcription_generated = True frame = TTSAudioRawFrame( audio=base64.b64decode(msg["audio"]), @@ -302,9 +306,6 @@ class AsyncAITTSService(WebsocketWordTTSService): ) await self.push_frame(frame) - # Add stop markers to end the word timestamps - await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)]) - elif msg.get("error_code"): logger.error(f"{self} error: {msg}") await self.push_frame(TTSStoppedFrame()) @@ -350,6 +351,7 @@ class AsyncAITTSService(WebsocketWordTTSService): # Store the current text for generating timestamps self._current_text = text + self._transcription_generated = False # Reset for new text msg = self._build_msg(text=text, force=True) try: