Compare commits

..

1 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
5a682f8c1f AudioBufferProcessor: record with lowest sample rate
Fixes #1653
2025-06-19 14:18:54 -07:00
5 changed files with 27 additions and 28 deletions

View File

@@ -9,8 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `lexicon_names` parameter to `AWSPollyTTSService.InputParams`.
- Added reconnection logic and audio buffer management to `GladiaSTTService`.
- Added Polish support to `AWSTranscribeSTTService`.
@@ -39,15 +37,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Upgraded `daily-python` to 0.19.3.
### Deprecated
- `AudioBufferProcessor` parameter `user_continuous_stream` is deprecated.
### Fixed
- Fixed function calling in `AWSNovaSonicLLMService`.
- Fixed an `AudioBufferProcessor` issue that was causing crackling on the audio
stream with the lower sample rate (due to upsampling it to match the other
stream). We now record with the lowest sample rate to avoid upsampling.
- Fixed an issue that would cause multiple `PipelineTask.on_idle_timeout`
events to be triggered repeatedly.
- Fixed an issue that was causing user and bot speech to not be synchronized
during recordings.
- Fixed an `AudioBufferProcessor` issue that was causing user and bot speech to
not be synchronized during recordings.
- Fixed an issue where voice settings weren't applied to ElevenLabsTTSService.
@@ -59,10 +63,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue where `GoogleLLMService`'s TTFB value was incorrect.
### Deprecated
- `AudioBufferProcessor` parameter `user_continuous_stream` is deprecated.
### Other
- Renamed `14e-function-calling-gemini.py` to `14e-function-calling-google.py`.

View File

@@ -7,6 +7,8 @@
import time
from typing import Optional
from loguru import logger
from pipecat.audio.utils import create_default_resampler, interleave_stereo_audio, mix_audio
from pipecat.frames.frames import (
AudioRawFrame,
@@ -181,7 +183,14 @@ class AudioBufferProcessor(FrameProcessor):
await self.push_frame(frame, direction)
def _update_sample_rate(self, frame: StartFrame):
self._sample_rate = self._init_sample_rate or frame.audio_out_sample_rate
# Record to the minimum sample rate to avoid possible upsampling
# artifacts.
min_sample_rate = min(frame.audio_in_sample_rate, frame.audio_out_sample_rate)
if frame.audio_in_sample_rate != frame.audio_out_sample_rate:
logger.debug(
f"{self} Input and output sample rates don't match, recording with smaller sample rate: {min_sample_rate} (this might get fixed in the future)"
)
self._sample_rate = self._init_sample_rate or min_sample_rate
self._audio_buffer_size_1s = self._sample_rate * 2
async def _process_recording(self, frame: Frame):

View File

@@ -400,8 +400,6 @@ class RTVIObserverParams:
"""
Parameters for configuring RTVI Observer behavior.
Protip: Set these all to `False` if the bot will talk to another bot.
Attributes:
bot_llm_enabled (bool): Indicates if the bot's LLM messages should be sent.
bot_tts_enabled (bool): Indicates if the bot's TTS messages should be sent.
@@ -804,7 +802,7 @@ class RTVIProcessor(FrameProcessor):
await self._message_queue.put(message)
except ValidationError as e:
await self.send_error(f"Invalid RTVI transport message: {e}")
logger.warning(f"Invalid RTVI transport message '{transport_message}': {e}")
logger.warning(f"Invalid RTVI transport message: {e}")
async def _handle_message(self, message: RTVIMessage):
try:

View File

@@ -6,7 +6,7 @@
import asyncio
import os
from typing import AsyncGenerator, List, Optional
from typing import AsyncGenerator, Optional
from loguru import logger
from pydantic import BaseModel
@@ -115,7 +115,6 @@ class AWSPollyTTSService(TTSService):
pitch: Optional[str] = None
rate: Optional[str] = None
volume: Optional[str] = None
lexicon_names: Optional[List[str]] = None
def __init__(
self,
@@ -148,7 +147,6 @@ class AWSPollyTTSService(TTSService):
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,
"lexicon_names": params.lexicon_names,
}
self._resampler = create_default_resampler()
@@ -237,7 +235,6 @@ class AWSPollyTTSService(TTSService):
"Engine": self._settings["engine"],
# AWS only supports 8000 and 16000 for PCM. We select 16000.
"SampleRate": "16000",
"LexiconNames": self._settings["lexicon_names"],
}
# Filter out None values

View File

@@ -25,7 +25,6 @@ from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
FunctionCallFromLLM,
InputAudioRawFrame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
@@ -805,16 +804,12 @@ class AWSNovaSonicLLMService(LLMService):
# Call tool function
if self.has_function(function_name):
if function_name in self._functions.keys() or None in self._functions.keys():
function_calls_llm = [
FunctionCallFromLLM(
context=self._context,
tool_call_id=tool_call_id,
function_name=function_name,
arguments=arguments,
)
]
await self.run_function_calls(function_calls_llm)
await self.call_function(
context=self._context,
tool_call_id=tool_call_id,
function_name=function_name,
arguments=arguments,
)
else:
raise AWSNovaSonicUnhandledFunctionException(
f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."