AudioBufferProcessor: record with lowest sample rate

Fixes #1653
2025-06-19 14:18:54 -07:00
5 changed files with 27 additions and 28 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,8 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

- Added `lexicon_names` parameter to `AWSPollyTTSService.InputParams`.
-
 - Added reconnection logic and audio buffer management to `GladiaSTTService`.

 - Added Polish support to `AWSTranscribeSTTService`.
@@ -39,15 +37,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Upgraded `daily-python` to 0.19.3.

+### Deprecated
+
+- `AudioBufferProcessor` parameter `user_continuos_stream` is deprecated.
+
 ### Fixed

- Fixed function calling in `AWSNovaSonicLLMService`.
+- Fixed an `AudioBufferProcessor` issue that was causing crackling on the audio
+  stream with lower sample rate (due to upsampling the other stream). We now
+  record with the lowest sample rate to avoid upsampling.

 - Fixed an issue that would cause multiple `PipelineTask.on_idle_timeout`
  events to be triggered repeatedly.

- Fixed an issue that was causing user and bot speech to not be synchronized
-  during recordings.
+- Fixed an `AudioBufferProcessor` issue that was causing user and bot speech to
+  not be synchronized during recordings.

 - Fixed an issue where voice settings weren't applied to ElevenLabsTTSService.

@@ -59,10 +63,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 - Fixed an issue where `GoogleLLMService`'s TTFB value was incorrect.

-### Deprecated
-
- `AudioBufferProcessor` parameter `user_continuos_stream` is deprecated.
-
 ### Other

 - Rename `14e-function-calling-gemini.py` to `14e-function-calling-google.py`.
--- a/src/pipecat/processors/audio/audio_buffer_processor.py
+++ b/src/pipecat/processors/audio/audio_buffer_processor.py
@@ -7,6 +7,8 @@
 import time
 from typing import Optional

+from loguru import logger
+
 from pipecat.audio.utils import create_default_resampler, interleave_stereo_audio, mix_audio
 from pipecat.frames.frames import (
    AudioRawFrame,
@@ -181,7 +183,14 @@ class AudioBufferProcessor(FrameProcessor):
        await self.push_frame(frame, direction)

    def _update_sample_rate(self, frame: StartFrame):
-        self._sample_rate = self._init_sample_rate or frame.audio_out_sample_rate
+        # Record to the minimum sample rate to avoid possible upsampling
+        # artifacts.
+        min_sample_rate = min(frame.audio_in_sample_rate, frame.audio_out_sample_rate)
+        if frame.audio_in_sample_rate != frame.audio_out_sample_rate:
+            logger.debug(
+                f"{self} Input and output sample rates don't match, recording with smaller sample rate: {min_sample_rate} (this might get fixed in the future)"
+            )
+        self._sample_rate = self._init_sample_rate or min_sample_rate
        self._audio_buffer_size_1s = self._sample_rate * 2

    async def _process_recording(self, frame: Frame):
--- a/src/pipecat/processors/frameworks/rtvi.py
+++ b/src/pipecat/processors/frameworks/rtvi.py
@@ -400,8 +400,6 @@ class RTVIObserverParams:
    """
    Parameters for configuring RTVI Observer behavior.

-    Protip: Set these all to `False` if the bot will talk to another bot.
-
    Attributes:
        bot_llm_enabled (bool): Indicates if the bot's LLM messages should be sent.
        bot_tts_enabled (bool): Indicates if the bot's TTS messages should be sent.
@@ -804,7 +802,7 @@ class RTVIProcessor(FrameProcessor):
            await self._message_queue.put(message)
        except ValidationError as e:
            await self.send_error(f"Invalid RTVI transport message: {e}")
-            logger.warning(f"Invalid RTVI transport message '{transport_message}': {e}")
+            logger.warning(f"Invalid RTVI transport message: {e}")

    async def _handle_message(self, message: RTVIMessage):
        try:
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -6,7 +6,7 @@

 import asyncio
 import os
-from typing import AsyncGenerator, List, Optional
+from typing import AsyncGenerator, Optional

 from loguru import logger
 from pydantic import BaseModel
@@ -115,7 +115,6 @@ class AWSPollyTTSService(TTSService):
        pitch: Optional[str] = None
        rate: Optional[str] = None
        volume: Optional[str] = None
-        lexicon_names: Optional[List[str]] = None

    def __init__(
        self,
@@ -148,7 +147,6 @@ class AWSPollyTTSService(TTSService):
            "pitch": params.pitch,
            "rate": params.rate,
            "volume": params.volume,
-            "lexicon_names": params.lexicon_names,
        }

        self._resampler = create_default_resampler()
@@ -237,7 +235,6 @@ class AWSPollyTTSService(TTSService):
                "Engine": self._settings["engine"],
                # AWS only supports 8000 and 16000 for PCM. We select 16000.
                "SampleRate": "16000",
-                "LexiconNames": self._settings["lexicon_names"],
            }

            # Filter out None values
--- a/src/pipecat/services/aws_nova_sonic/aws.py
+++ b/src/pipecat/services/aws_nova_sonic/aws.py
@@ -25,7 +25,6 @@ from pipecat.frames.frames import (
    CancelFrame,
    EndFrame,
    Frame,
-    FunctionCallFromLLM,
    InputAudioRawFrame,
    InterimTranscriptionFrame,
    LLMFullResponseEndFrame,
@@ -805,16 +804,12 @@ class AWSNovaSonicLLMService(LLMService):
        # Call tool function
        if self.has_function(function_name):
            if function_name in self._functions.keys() or None in self._functions.keys():
-                function_calls_llm = [
-                    FunctionCallFromLLM(
-                        context=self._context,
-                        tool_call_id=tool_call_id,
-                        function_name=function_name,
-                        arguments=arguments,
-                    )
-                ]
-
-                await self.run_function_calls(function_calls_llm)
+                await self.call_function(
+                    context=self._context,
+                    tool_call_id=tool_call_id,
+                    function_name=function_name,
+                    arguments=arguments,
+                )
        else:
            raise AWSNovaSonicUnhandledFunctionException(
                f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."