Compare commits
5 Commits
hush/aggre
...
copilot/vs
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1672570060 | ||
|
|
77d8abcce5 | ||
|
|
67cf4b400a | ||
|
|
0340431608 | ||
|
|
451e4a9050 |
@@ -27,7 +27,7 @@ from pipecat.frames.frames import (
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.tts_service import InterruptibleTTSService, TTSService
|
||||
from pipecat.services.tts_service import TTSService, WebsocketWordTTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.tracing.service_decorators import traced_tts
|
||||
|
||||
@@ -71,7 +71,7 @@ def language_to_async_language(language: Language) -> Optional[str]:
|
||||
return result
|
||||
|
||||
|
||||
class AsyncAITTSService(InterruptibleTTSService):
|
||||
class AsyncAITTSService(WebsocketWordTTSService):
|
||||
"""Async TTS service with WebSocket streaming.
|
||||
|
||||
Provides text-to-speech using Async's streaming WebSocket API.
|
||||
@@ -148,6 +148,8 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
self._receive_task = None
|
||||
self._keepalive_task = None
|
||||
self._started = False
|
||||
self._current_text = "" # Track current text for generating timestamps
|
||||
self._transcription_generated = False # Track if transcription was already generated
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -286,12 +288,28 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
|
||||
elif msg.get("audio"):
|
||||
await self.stop_ttfb_metrics()
|
||||
|
||||
# Start word timestamps and add the entire text as one "word"
|
||||
# This generates transcription frames for the bot - but only once per text
|
||||
if not self._transcription_generated and self._current_text.strip():
|
||||
logger.debug(
|
||||
f"{self}: Generating transcription for text: '{self._current_text}'"
|
||||
)
|
||||
self.start_word_timestamps()
|
||||
# Add the entire text as a single timestamp at time 0
|
||||
await self.add_word_timestamps([(self._current_text, 0.0)])
|
||||
# Add stop markers to end the word timestamps
|
||||
await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
|
||||
self._transcription_generated = True
|
||||
logger.debug(f"{self}: Transcription flag set to True")
|
||||
|
||||
frame = TTSAudioRawFrame(
|
||||
audio=base64.b64decode(msg["audio"]),
|
||||
sample_rate=self.sample_rate,
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
|
||||
elif msg.get("error_code"):
|
||||
logger.error(f"{self} error: {msg}")
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
@@ -324,7 +342,7 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
Yields:
|
||||
Frame: Audio frames containing the synthesized speech.
|
||||
"""
|
||||
logger.debug(f"{self}: Generating TTS [{text}]")
|
||||
logger.debug(f"{self}: AsyncAI Generating TTS [{text}]")
|
||||
|
||||
try:
|
||||
if not self._websocket or self._websocket.state is State.CLOSED:
|
||||
@@ -335,6 +353,12 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
# Store the current text for generating timestamps
|
||||
self._current_text = text
|
||||
self._transcription_generated = False # Reset for new text
|
||||
# Reset word timestamps for new text to ensure clean state
|
||||
self.reset_word_timestamps()
|
||||
logger.debug(f"{self}: New text set: '{text}', transcription flag reset to False")
|
||||
msg = self._build_msg(text=text, force=True)
|
||||
|
||||
try:
|
||||
@@ -346,7 +370,6 @@ class AsyncAITTSService(InterruptibleTTSService):
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
return
|
||||
yield None
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user