Allowing to buffer audio inside the base output transport.

This commit is contained in:
filipi87
2026-05-21 11:54:01 -03:00
parent 359c9394d0
commit 7c938102ad
3 changed files with 38 additions and 1 deletions

View File

@@ -823,6 +823,23 @@ class BaseOutputTransport(FrameProcessor):
async def _audio_task_handler(self):
"""Main audio processing task handler."""
# Pre-buffer: accumulate audio before sending anything to the transport.
#
# prebuffer is a list while we are still accumulating, and None once the
# threshold has been reached and all held frames have been flushed. Using
# None as the sentinel avoids a boolean flag and makes the steady-state
# branch a simple identity check.
#
# The pre-buffer resets automatically on each interruption because the
# audio task is cancelled and recreated, giving the next utterance a fresh
# local variable.
min_prebuffer_bytes = (
int(self._sample_rate * self._params.audio_out_prebuffer_secs)
* 2
* self._params.audio_out_channels
)
prebuffer: list[OutputAudioRawFrame] | None = [] if min_prebuffer_bytes > 0 else None
async for frame in self._next_frame():
# No need to push EndFrame, it's pushed from process_frame().
if isinstance(frame, EndFrame):
@@ -840,7 +857,20 @@ class BaseOutputTransport(FrameProcessor):
# Try to send audio to the transport.
try:
if isinstance(frame, OutputAudioRawFrame):
push_downstream = await self._transport.write_audio_frame(frame)
if prebuffer is not None:
# Accumulation phase: hold frames until we have enough audio.
prebuffer.append(frame)
if sum(len(f.audio) for f in prebuffer) >= min_prebuffer_bytes:
# Threshold reached: flush all held frames at once, then
# switch to direct-write mode for the rest of the utterance.
for f in prebuffer:
await self._transport.write_audio_frame(f)
prebuffer = None
# push_downstream stays True so frames flow through the
# pipeline even while we are still accumulating.
else:
# Steady-state: write directly to the transport.
push_downstream = await self._transport.write_audio_frame(frame)
except Exception as e:
logger.error(f"{self} Error writing {frame} to transport: {e}")
push_downstream = False

View File

@@ -34,6 +34,8 @@ class TransportParams(BaseModel):
audio_out_mixer: Audio mixer instance or destination mapping.
audio_out_destinations: List of audio output destination identifiers.
audio_out_end_silence_secs: How much silence to send after an EndFrame (0 for no silence).
audio_out_prebuffer_secs: Seconds of audio to accumulate before sending anything to the
transport. Resets automatically on each interruption. Defaults to 0.0 (disabled).
audio_out_auto_silence: Insert silence frames when the audio output queue is empty.
When False, the transport will wait for audio data instead of inserting silence.
audio_in_enabled: Enable audio input streaming.
@@ -70,6 +72,7 @@ class TransportParams(BaseModel):
audio_out_mixer: BaseAudioMixer | Mapping[str | None, BaseAudioMixer] | None = None
audio_out_destinations: list[str] = Field(default_factory=list)
audio_out_end_silence_secs: int = 2
audio_out_prebuffer_secs: float = 0.0
audio_out_auto_silence: bool = True
audio_in_enabled: bool = False
audio_in_sample_rate: int | None = None

View File

@@ -168,11 +168,15 @@ class TavusParams(DailyParams):
audio_in_enabled: Whether to enable audio input from participants.
audio_out_enabled: Whether to enable audio output to participants.
microphone_out_enabled: Whether to enable microphone output track.
audio_out_prebuffer_secs: Seconds of audio to accumulate before sending to WebRTC.
Absorbs TTS jitter to prevent the WebRTC jitter buffer from injecting silence.
Defaults to 0.5.
"""
audio_in_enabled: bool = True
audio_out_enabled: bool = True
microphone_out_enabled: bool = False
audio_out_prebuffer_secs: float = 0.5
class TavusTransportClient: