Compare commits

...

5 Commits

Author SHA1 Message Date
James Hush
1672570060 Final fix for AsyncAI TTS transcription duplication
Debug logging confirmed that the issue is resolved by the flag tracking combined
with the reset_word_timestamps() call. AsyncAI TTS now generates transcription frames
once per text input, matching Cartesia TTS behavior.

Working solution:
- Track transcription generation with _transcription_generated flag
- Reset flag and word timestamps for each new text input
- Only generate the transcription on the first audio chunk of each text input
- Proper cleanup ensures no duplicate transcriptions
2025-09-17 15:01:49 +08:00
James Hush
77d8abcce5 Add debug logging to track transcription generation
- Add logging when new text is set and transcription flag is reset
- Add logging when transcription is generated vs skipped
- Add reset_word_timestamps() call to ensure clean state for new text
- This will help identify why transcriptions are still being repeated
2025-09-17 14:54:56 +08:00
James Hush
67cf4b400a Fix AsyncAI TTS repeated transcription issue
- Add _transcription_generated flag to track if transcription was already created
- Only generate word timestamps once per text input, not on every audio chunk
- Reset flag when new text is processed
- Prevents transcription entries such as 'One, two, three' from being repeated multiple times
2025-09-17 14:51:09 +08:00
copilot-swe-agent[bot]
0340431608 Initial plan 2025-09-17 06:31:00 +00:00
James Hush
451e4a9050 Checkpoint from VS Code for coding agent session 2025-09-17 14:30:53 +08:00

View File

@@ -27,7 +27,7 @@ from pipecat.frames.frames import (
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
from pipecat.services.tts_service import TTSService, WebsocketWordTTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.tracing.service_decorators import traced_tts
@@ -71,7 +71,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
return result
class AsyncAITTSService(InterruptibleTTSService):
class AsyncAITTSService(WebsocketWordTTSService):
"""Async TTS service with WebSocket streaming.
Provides text-to-speech using Async's streaming WebSocket API.
@@ -148,6 +148,8 @@ class AsyncAITTSService(InterruptibleTTSService):
self._receive_task = None
self._keepalive_task = None
self._started = False
self._current_text = "" # Track current text for generating timestamps
self._transcription_generated = False # Track if transcription was already generated
def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.
@@ -286,12 +288,28 @@ class AsyncAITTSService(InterruptibleTTSService):
elif msg.get("audio"):
await self.stop_ttfb_metrics()
# Start word timestamps and add the entire text as one "word"
# This generates transcription frames for the bot - but only once per text
if not self._transcription_generated and self._current_text.strip():
logger.debug(
f"{self}: Generating transcription for text: '{self._current_text}'"
)
self.start_word_timestamps()
# Add the entire text as a single timestamp at time 0
await self.add_word_timestamps([(self._current_text, 0.0)])
# Add stop markers to end the word timestamps
await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
self._transcription_generated = True
logger.debug(f"{self}: Transcription flag set to True")
frame = TTSAudioRawFrame(
audio=base64.b64decode(msg["audio"]),
sample_rate=self.sample_rate,
num_channels=1,
)
await self.push_frame(frame)
elif msg.get("error_code"):
logger.error(f"{self} error: {msg}")
await self.push_frame(TTSStoppedFrame())
@@ -324,7 +342,7 @@ class AsyncAITTSService(InterruptibleTTSService):
Yields:
Frame: Audio frames containing the synthesized speech.
"""
logger.debug(f"{self}: Generating TTS [{text}]")
logger.debug(f"{self}: AsyncAI Generating TTS [{text}]")
try:
if not self._websocket or self._websocket.state is State.CLOSED:
@@ -335,6 +353,12 @@ class AsyncAITTSService(InterruptibleTTSService):
yield TTSStartedFrame()
self._started = True
# Store the current text for generating timestamps
self._current_text = text
self._transcription_generated = False # Reset for new text
# Reset word timestamps for new text to ensure clean state
self.reset_word_timestamps()
logger.debug(f"{self}: New text set: '{text}', transcription flag reset to False")
msg = self._build_msg(text=text, force=True)
try:
@@ -346,7 +370,6 @@ class AsyncAITTSService(InterruptibleTTSService):
await self._disconnect()
await self._connect()
return
yield None
except Exception as e:
logger.error(f"{self} exception: {e}")