Allowing to buffer audio inside the base output transport.

2026-05-21 11:54:01 -03:00
parent 359c9394d0
commit 7c938102ad
3 changed files with 38 additions and 1 deletions
--- a/src/pipecat/transports/base_output.py
+++ b/src/pipecat/transports/base_output.py
@@ -823,6 +823,23 @@ class BaseOutputTransport(FrameProcessor):

        async def _audio_task_handler(self):
            """Main audio processing task handler."""
+            # Pre-buffer: accumulate audio before sending anything to the transport.
+            #
+            # prebuffer is a list while we are still accumulating, and None once the
+            # threshold has been reached and all held frames have been flushed. Using
+            # None as the sentinel avoids a boolean flag and makes the steady-state
+            # branch a simple identity check.
+            #
+            # The pre-buffer resets automatically on each interruption because the
+            # audio task is cancelled and recreated, giving the next utterance a fresh
+            # local variable.
+            min_prebuffer_bytes = (
+                int(self._sample_rate * self._params.audio_out_prebuffer_secs)
+                * 2
+                * self._params.audio_out_channels
+            )
+            prebuffer: list[OutputAudioRawFrame] | None = [] if min_prebuffer_bytes > 0 else None
+
            async for frame in self._next_frame():
                # No need to push EndFrame, it's pushed from process_frame().
                if isinstance(frame, EndFrame):
@@ -840,7 +857,20 @@ class BaseOutputTransport(FrameProcessor):
                # Try to send audio to the transport.
                try:
                    if isinstance(frame, OutputAudioRawFrame):
-                        push_downstream = await self._transport.write_audio_frame(frame)
+                        if prebuffer is not None:
+                            # Accumulation phase: hold frames until we have enough audio.
+                            prebuffer.append(frame)
+                            if sum(len(f.audio) for f in prebuffer) >= min_prebuffer_bytes:
+                                # Threshold reached: flush all held frames at once, then
+                                # switch to direct-write mode for the rest of the utterance.
+                                for f in prebuffer:
+                                    await self._transport.write_audio_frame(f)
+                                prebuffer = None
+                            # push_downstream stays True so frames flow through the
+                            # pipeline even while we are still accumulating.
+                        else:
+                            # Steady-state: write directly to the transport.
+                            push_downstream = await self._transport.write_audio_frame(frame)
                except Exception as e:
                    logger.error(f"{self} Error writing {frame} to transport: {e}")
                    push_downstream = False
--- a/src/pipecat/transports/base_transport.py
+++ b/src/pipecat/transports/base_transport.py
@@ -34,6 +34,8 @@ class TransportParams(BaseModel):
        audio_out_mixer: Audio mixer instance or destination mapping.
        audio_out_destinations: List of audio output destination identifiers.
        audio_out_end_silence_secs: How much silence to send after an EndFrame (0 for no silence).
+        audio_out_prebuffer_secs: Seconds of audio to accumulate before sending anything to the
+            transport. Resets automatically on each interruption. Defaults to 0.0 (disabled).
        audio_out_auto_silence: Insert silence frames when the audio output queue is empty.
            When False, the transport will wait for audio data instead of inserting silence.
        audio_in_enabled: Enable audio input streaming.
@@ -70,6 +72,7 @@ class TransportParams(BaseModel):
    audio_out_mixer: BaseAudioMixer | Mapping[str | None, BaseAudioMixer] | None = None
    audio_out_destinations: list[str] = Field(default_factory=list)
    audio_out_end_silence_secs: int = 2
+    audio_out_prebuffer_secs: float = 0.0
    audio_out_auto_silence: bool = True
    audio_in_enabled: bool = False
    audio_in_sample_rate: int | None = None
--- a/src/pipecat/transports/tavus/transport.py
+++ b/src/pipecat/transports/tavus/transport.py
@@ -168,11 +168,15 @@ class TavusParams(DailyParams):
        audio_in_enabled: Whether to enable audio input from participants.
        audio_out_enabled: Whether to enable audio output to participants.
        microphone_out_enabled: Whether to enable microphone output track.
+        audio_out_prebuffer_secs: Seconds of audio to accumulate before sending to WebRTC.
+            Absorbs TTS jitter to prevent the WebRTC jitter buffer from injecting silence.
+            Defaults to 0.5.
    """

    audio_in_enabled: bool = True
    audio_out_enabled: bool = True
    microphone_out_enabled: bool = False
+    audio_out_prebuffer_secs: float = 0.5


 class TavusTransportClient: