Merge pull request #1103 from pipecat-ai/aleix/tts-service-push-silence-before-tts-stop-frame

services(tts): allow pushing silence audio before TTSStoppedFrame
2025-01-30 08:48:41 -08:00
parent 4119d7a115 665a6017f9
commit bc4fdd587a
2 changed files with 20 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Allow pushing silence audio frames before `TTSStoppedFrame`. This can be
+  useful for testing, e.g. when passing bot audio to an STT service, which
+  usually needs additional audio data to detect that the utterance has stopped.
+
 - `TwilioSerializer` now supports transport message frames. With this we can
  create Twilio emulators.

--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -208,6 +208,10 @@ class TTSService(AIService):
        push_stop_frames: bool = False,
        # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame
        stop_frame_timeout_s: float = 1.0,
+        # if True, TTSService will push silence audio frames before TTSStoppedFrame
+        push_silence_after_stop: bool = False,
+        # if push_silence_after_stop is True, push this many seconds of audio silence
+        silence_time_s: float = 2.0,
        # TTS output sample rate
        sample_rate: int = 24000,
        text_filter: Optional[BaseTextFilter] = None,
@@ -218,6 +222,8 @@ class TTSService(AIService):
        self._push_text_frames: bool = push_text_frames
        self._push_stop_frames: bool = push_stop_frames
        self._stop_frame_timeout_s: float = stop_frame_timeout_s
+        self._push_silence_after_stop: bool = push_silence_after_stop
+        self._silence_time_s: float = silence_time_s
        self._sample_rate: int = sample_rate
        self._voice_id: str = ""
        self._settings: Dict[str, Any] = {}
@@ -314,6 +320,16 @@ class TTSService(AIService):
            await self.push_frame(frame, direction)

    async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        if self._push_silence_after_stop and isinstance(frame, TTSStoppedFrame):
+            silence_num_bytes = int(self._silence_time_s * self.sample_rate * 2)  # 16-bit
+            await self.push_frame(
+                TTSAudioRawFrame(
+                    audio=b"\x00" * silence_num_bytes,
+                    sample_rate=self.sample_rate,
+                    num_channels=1,
+                )
+            )
+
        await super().push_frame(frame, direction)

        if self._push_stop_frames and (