diff --git a/CHANGELOG.md b/CHANGELOG.md index 950e6889b..86a4da21b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Allow pushing silence audio frames before `TTSStoppedFrame`. This might be + useful for testing purposes, for example, passing bot audio to an STT service + which usually needs additional audio data to detect the utterance stopped. + - `TwilioSerializer` now supports transport message frames. With this we can create Twilio emulators. diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index bee15ceab..056ee17f2 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -208,6 +208,10 @@ class TTSService(AIService): push_stop_frames: bool = False, # if push_stop_frames is True, wait for this idle period before pushing TTSStoppedFrame stop_frame_timeout_s: float = 1.0, + # if True, TTSService will push silence audio frames after TTSStoppedFrame + push_silence_after_stop: bool = False, + # if push_silence_after_stop is True, send this amount of audio silence + silence_time_s: float = 2.0, # TTS output sample rate sample_rate: int = 24000, text_filter: Optional[BaseTextFilter] = None, @@ -218,6 +222,8 @@ class TTSService(AIService): self._push_text_frames: bool = push_text_frames self._push_stop_frames: bool = push_stop_frames self._stop_frame_timeout_s: float = stop_frame_timeout_s + self._push_silence_after_stop: bool = push_silence_after_stop + self._silence_time_s: float = silence_time_s self._sample_rate: int = sample_rate self._voice_id: str = "" self._settings: Dict[str, Any] = {} @@ -314,6 +320,16 @@ class TTSService(AIService): await self.push_frame(frame, direction) async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + if self._push_silence_after_stop and isinstance(frame, TTSStoppedFrame): + silence_num_bytes = int(self._silence_time_s * self.sample_rate * 2) # 16-bit + await self.push_frame( + TTSAudioRawFrame( + audio=b"\x00" * silence_num_bytes, + sample_rate=self.sample_rate, + num_channels=1, + ) + ) + await super().push_frame(frame, direction) if self._push_stop_frames and (