Drop aic-sdk v1 support from the AIC filter and VAD analyzer.
This commit is contained in:
@@ -48,7 +48,7 @@ Issues = "https://github.com/pipecat-ai/pipecat/issues"
|
||||
Changelog = "https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md"
|
||||
|
||||
[project.optional-dependencies]
|
||||
aic = [ "aic-sdk>=1.2.0" ]
|
||||
aic = [ "aic-sdk>=2.0.0" ]
|
||||
anthropic = [ "anthropic~=0.49.0" ]
|
||||
assemblyai = [ "pipecat-ai[websockets-base]" ]
|
||||
asyncai = [ "pipecat-ai[websockets-base]" ]
|
||||
|
||||
@@ -11,8 +11,7 @@ enhance audio streams in real time. It mirrors the structure of other filters li
|
||||
the Koala filter and integrates with Pipecat's input transport pipeline.
|
||||
|
||||
Classes:
|
||||
AICFilter: For aic-sdk < 2.0.0 (uses 'aic' module)
|
||||
AICFilterV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
|
||||
AICFilter: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -31,258 +30,8 @@ class AICFilter(BaseAudioFilter):
|
||||
Buffers incoming audio to the model's preferred block size and processes
|
||||
planar frames in-place using float32 samples in the linear -1..+1 range.
|
||||
|
||||
.. note::
|
||||
This class requires aic-sdk < 2.0.0 (uses 'aic' module).
|
||||
For aic-sdk >= 2.0.0, use :class:`AICFilterV2` instead.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    *,
    license_key: str = "",
    model_type: Optional["AICModelType"] = None,
    enhancement_level: Optional[float] = 1.0,
    voice_gain: Optional[float] = 1.0,
    noise_gate_enable: Optional[bool] = True,
) -> None:
    """Initialize the AIC filter.

    Only configuration is stored here; the SDK model itself is created later
    in start(), because the model constructor needs the sample rate.

    Args:
        license_key: ai-coustics license key for authentication.
        model_type: Model variant to load. If None, defaults to AICModelType.QUAIL_STT.
        enhancement_level: Optional overall enhancement strength (0.0..1.0).
        voice_gain: Optional linear gain applied to detected speech (0.0..4.0).
        noise_gate_enable: Optional enable/disable noise gate (default: True).

            .. deprecated:: 1.3.0
                The `noise_gate_enable` parameter is deprecated and no longer has any effect.
                It will be removed in a future version.

    Raises:
        ImportError: Propagated from check_aic_sdk_version("v1") or from the
            `aic` import when a compatible aic-sdk is not available.
    """
    from pipecat.audio.utils import check_aic_sdk_version

    check_aic_sdk_version("v1")

    # Import AIC SDK v1 types
    from aic import AICModelType

    self._license_key = license_key
    self._model_type = model_type if model_type is not None else AICModelType.QUAIL_STT

    self._enhancement_level = enhancement_level
    self._voice_gain = voice_gain
    # NOTE(review): the default is True (not None), so this warning is emitted
    # on every construction unless the caller passes noise_gate_enable=None
    # explicitly - confirm whether that is intended.
    if noise_gate_enable is not None:
        import warnings

        with warnings.catch_warnings():
            # Force the warning through even if DeprecationWarning is filtered.
            warnings.simplefilter("always")
            warnings.warn(
                "Parameter `noise_gate_enable` is deprecated and no longer has any effect. "
                "It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).",
                DeprecationWarning,
                # Attribute the warning to the code constructing the filter,
                # not to this module.
                stacklevel=2,
            )

    self._noise_gate_enable = noise_gate_enable

    self._enabled = True
    self._sample_rate = 0
    self._aic_ready = False
    self._frames_per_block = 0
    self._audio_buffer = bytearray()
    # Model will be created in start() since the API now requires sample_rate
    self._aic = None
|
||||
|
||||
def get_vad_factory(self):
    """Build a deferred constructor for the model's voice activity detector.

    Returns:
        A callable taking no arguments. Each call returns the result of
        ``create_vad()`` on the filter's current AIC model, or raises
        RuntimeError while ``self._aic`` is still None (i.e. before start()
        has created the model).
    """

    def _build_vad():
        model = self._aic
        if model is not None:
            return model.create_vad()
        raise RuntimeError("AIC model not initialized yet. Call start(sample_rate) first.")

    return _build_vad
|
||||
|
||||
def create_vad_analyzer(
    self,
    *,
    lookback_buffer_size: Optional[float] = None,
    sensitivity: Optional[float] = None,
):
    """Create an AICVADAnalyzer wired to this filter's deferred VAD factory.

    The analyzer receives the factory from get_vad_factory(), so it can be
    built before the filter's model exists.

    AIC VAD parameters:
      - lookback_buffer_size:
          Number of window-length audio buffers used as a lookback buffer.
          Higher values increase prediction stability but add latency.
          Range: 1.0 .. 20.0, Default (SDK): 6.0
      - sensitivity:
          Energy threshold sensitivity. Energy threshold = 10 ** (-sensitivity).
          Range: 1.0 .. 15.0, Default (SDK): 6.0

    Args:
        lookback_buffer_size: Optional lookback buffer size forwarded to the
            analyzer. Range: 1.0 .. 20.0. If None, SDK default is used.
        sensitivity: Optional sensitivity (energy threshold) forwarded to the
            analyzer. Range: 1.0 .. 15.0. If None, SDK default is used.

    Returns:
        An AICVADAnalyzer constructed with this filter's VAD factory and the
        two optional parameters.
    """
    from pipecat.audio.vad.aic_vad import AICVADAnalyzer

    analyzer_kwargs = {
        "vad_factory": self.get_vad_factory(),
        "lookback_buffer_size": lookback_buffer_size,
        "sensitivity": sensitivity,
    }
    return AICVADAnalyzer(**analyzer_kwargs)
|
||||
|
||||
async def start(self, sample_rate: int):
    """Initialize the filter with the transport's sample rate.

    Creates the SDK model (mono), reads its preferred block size, applies the
    optional enhancement level and voice gain, and marks the filter ready.
    Any exception raised along the way is logged and leaves the filter in the
    not-ready state, so filter() passes audio through unchanged.

    Args:
        sample_rate: The sample rate of the input transport in Hz.

    Returns:
        None
    """
    from aic import AICParameter, Model

    self._sample_rate = sample_rate

    try:
        # Create model with required runtime parameters
        self._aic = Model(
            model_type=self._model_type,
            license_key=self._license_key or None,
            sample_rate=self._sample_rate,
            channels=1,
        )
        self._frames_per_block = self._aic.optimal_num_frames()

        # Optional parameter configuration
        if self._enhancement_level is not None:
            self._aic.set_parameter(
                AICParameter.ENHANCEMENT_LEVEL,
                # A disabled filter keeps the model but applies no enhancement.
                float(self._enhancement_level if self._enabled else 0.0),
            )
        if self._voice_gain is not None:
            self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain))

        self._aic_ready = True

        # Log processor information
        logger.debug("ai-coustics filter started:")
        logger.debug(f" Sample rate: {self._sample_rate} Hz")
        logger.debug(f" Frames per chunk: {self._frames_per_block}")
        # enhancement_level may legitimately be None (parameter left untouched).
        # Formatting it unguarded raised TypeError here, which the handler
        # below turned into a spurious "initialization failed".
        if self._enhancement_level is not None:
            logger.debug(f" Enhancement strength: {int(self._enhancement_level * 100)}%")
        else:
            logger.debug(" Enhancement strength: SDK default")
        logger.debug(f" Optimal input buffer size: {self._aic.optimal_num_frames()} samples")
        logger.debug(f" Optimal sample rate: {self._aic.optimal_sample_rate()} Hz")
        logger.debug(
            f" Current algorithmic latency: {self._aic.processing_latency() / self._sample_rate * 1000:.2f}ms"
        )
    except Exception as e:  # noqa: BLE001 - surfacing SDK initialization errors
        logger.error(f"AIC model initialization failed: {e}")
        self._aic_ready = False
|
||||
|
||||
async def stop(self):
    """Release the AIC model and reset the filter's runtime state.

    Closes the model if one exists. Whether or not closing succeeds, the
    model reference is dropped, the ready flag is cleared and any buffered
    audio is discarded.

    Returns:
        None
    """
    model = self._aic
    try:
        if model is not None:
            model.close()
    finally:
        # Reset unconditionally so a failing close() cannot leave stale state.
        self._aic_ready = False
        self._aic = None
        self._audio_buffer.clear()
|
||||
|
||||
async def process_frame(self, frame: FilterControlFrame):
    """Handle control frames that switch enhancement on or off.

    Only FilterEnableFrame is acted upon: the enabled flag is updated and, if
    a model exists, its enhancement level is set to the configured level
    (enabled) or 0.0 (disabled). Failures while setting the parameter are
    logged, not raised.

    Args:
        frame: The control frame containing filter commands.

    Returns:
        None
    """
    if not isinstance(frame, FilterEnableFrame):
        return

    from aic import AICParameter

    self._enabled = frame.enable
    if self._aic is None:
        # No model yet; start() applies the level using the stored flag.
        return

    try:
        target_level = float(self._enhancement_level) if self._enabled else 0.0
        self._aic.set_parameter(AICParameter.ENHANCEMENT_LEVEL, target_level)
    except Exception as e:  # noqa: BLE001
        logger.error(f"AIC set_parameter failed: {e}")
|
||||
|
||||
async def filter(self, audio: bytes) -> bytes:
    """Apply AIC enhancement to audio data.

    Buffers incoming audio and processes it in chunks that match the AIC
    model's required block length. Samples that do not fill a whole block
    stay buffered for the next call, so the returned data may be shorter
    than (or empty for) a given input.

    Args:
        audio: Raw audio data as bytes to be filtered (int16 PCM, planar).

    Returns:
        Enhanced audio data as bytes (int16 PCM, planar). The input is
        returned unchanged when the model is not ready or no valid block
        size is known.
    """
    frames_per_block = self._frames_per_block
    # A non-positive block size would never drain the buffer (endless loop),
    # so treat it like "not ready" and pass the audio through.
    if not self._aic_ready or self._aic is None or frames_per_block <= 0:
        return audio

    self._audio_buffer.extend(audio)

    # Mono int16: two bytes per frame.
    bytes_per_block = frames_per_block * 2
    filtered_chunks = []

    while len(self._audio_buffer) >= bytes_per_block:
        # Copy one block out of the buffer so the buffer can be resized below.
        block_bytes = bytes(self._audio_buffer[:bytes_per_block])

        # Convert to float32 in -1..+1 range and reshape to planar (channels, frames)
        block_i16 = np.frombuffer(block_bytes, dtype=np.int16)
        block_f32 = (block_i16.astype(np.float32) / 32768.0).reshape((1, frames_per_block))

        # Process planar block; the result is used as an ndarray of the same shape
        out_f32 = await self._aic.process_async(block_f32)

        # Convert back to int16 bytes, planar layout
        out_i16 = np.clip(out_f32 * 32768.0, -32768, 32767).astype(np.int16)
        filtered_chunks.append(out_i16.reshape(-1).tobytes())

        # Slide buffer in place (only after successful processing) instead of
        # copying the remainder into a new bytearray for every block.
        del self._audio_buffer[:bytes_per_block]

    # Do not flush incomplete frames; keep them buffered for the next call
    return b"".join(filtered_chunks)
|
||||
|
||||
|
||||
class AICFilterV2(BaseAudioFilter):
|
||||
"""Audio filter using ai-coustics' AIC SDK v2 for real-time enhancement.
|
||||
|
||||
Buffers incoming audio to the model's preferred block size and processes
|
||||
planar frames in-place using float32 samples in the linear -1..+1 range.
|
||||
|
||||
.. note::
|
||||
This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module).
|
||||
For aic-sdk < 2.0.0, use :class:`AICFilter` instead.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -311,10 +60,6 @@ class AICFilterV2(BaseAudioFilter):
|
||||
Raises:
|
||||
ValueError: If neither model_id nor model_path is provided.
|
||||
"""
|
||||
from pipecat.audio.utils import check_aic_sdk_version
|
||||
|
||||
check_aic_sdk_version("v2")
|
||||
|
||||
if model_id is None and model_path is None:
|
||||
raise ValueError(
|
||||
"Either 'model_id' or 'model_path' must be provided. "
|
||||
@@ -337,7 +82,7 @@ class AICFilterV2(BaseAudioFilter):
|
||||
self._frames_per_block = 0
|
||||
self._audio_buffer = bytearray()
|
||||
|
||||
# v2 API objects
|
||||
# AIC SDK objects
|
||||
self._model = None
|
||||
self._processor = None
|
||||
self._processor_ctx = None
|
||||
@@ -362,7 +107,7 @@ class AICFilterV2(BaseAudioFilter):
|
||||
):
|
||||
"""Return an analyzer that will lazily instantiate the AIC VAD when ready.
|
||||
|
||||
AIC VAD parameters (v2):
|
||||
AIC VAD parameters:
|
||||
- speech_hold_duration:
|
||||
How long VAD continues detecting after speech ends (in seconds).
|
||||
Range: 0.0 .. 20x model window length, Default (SDK): 0.05s
|
||||
@@ -377,12 +122,12 @@ class AICFilterV2(BaseAudioFilter):
|
||||
Range: 1.0 .. 15.0. If None, SDK default (6.0) is used.
|
||||
|
||||
Returns:
|
||||
A lazily-initialized AICVADAnalyzerV2 that will bind to the VAD context
|
||||
A lazily-initialized AICVADAnalyzer that will bind to the VAD context
|
||||
once the filter's processor has been created (after start(sample_rate)).
|
||||
"""
|
||||
from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2
|
||||
from pipecat.audio.vad.aic_vad import AICVADAnalyzer
|
||||
|
||||
return AICVADAnalyzerV2(
|
||||
return AICVADAnalyzer(
|
||||
vad_context_factory=lambda: self.get_vad_context(),
|
||||
speech_hold_duration=speech_hold_duration,
|
||||
sensitivity=sensitivity,
|
||||
@@ -446,7 +191,7 @@ class AICFilterV2(BaseAudioFilter):
|
||||
self._aic_ready = True
|
||||
|
||||
# Log processor information
|
||||
logger.debug(f"ai-coustics filter (v2) started:")
|
||||
logger.debug(f"ai-coustics filter started:")
|
||||
logger.debug(f" Model ID: {self._model.get_id()}")
|
||||
logger.debug(f" Sample rate: {self._sample_rate} Hz")
|
||||
logger.debug(f" Frames per chunk: {self._frames_per_block}")
|
||||
|
||||
@@ -14,10 +14,9 @@ various audio formats used in Pipecat pipelines.
|
||||
import audioop
|
||||
from typing import Literal
|
||||
|
||||
from loguru import logger
|
||||
|
||||
import numpy as np
|
||||
import pyloudnorm as pyln
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
||||
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
|
||||
@@ -314,69 +313,3 @@ def is_silence(pcm_bytes: bytes) -> bool:
|
||||
|
||||
# If max value is lower than SPEAKING_THRESHOLD, consider it as silence
|
||||
return max_value <= SPEAKING_THRESHOLD
|
||||
|
||||
|
||||
def is_aic_sdk_v2() -> bool:
    """Report which aic-sdk generation is importable, based on its module name.

    The v2 SDK is imported as 'aic_sdk' and the v1 SDK as 'aic'. The v2 name
    is probed first, so it wins when both are importable.

    Returns:
        True if the 'aic_sdk' module (v2) can be imported, False if only the
        'aic' module (v1) can.

    Raises:
        ImportError: If neither aic nor aic_sdk module is installed.
    """
    try:
        import aic_sdk  # noqa: F401
    except ModuleNotFoundError:
        # Not v2; fall through to the v1 probe.
        pass
    else:
        return True

    try:
        import aic  # noqa: F401
    except ModuleNotFoundError:
        logger.error("In order to use the AIC filter, you need to `pip install pipecat-ai[aic]`.")
        raise ImportError(
            "aic-sdk is not installed. Install with 'pip install pipecat-ai[aic]'."
        )
    else:
        return False
|
||||
|
||||
|
||||
def check_aic_sdk_version(required_version: Literal["v1", "v2"]) -> None:
    """Verify that the installed aic-sdk matches the generation a module needs.

    Detection is delegated to is_aic_sdk_v2(), which distinguishes the SDK
    generations by module name ('aic_sdk' for v2, 'aic' for v1). On a
    mismatch the explanation is logged and raised as ImportError; otherwise
    the function returns silently.

    Args:
        required_version: Either "v1" (for aic-sdk < 2.0.0) or "v2" (for aic-sdk >= 2.0.0).

    Raises:
        ImportError: If aic-sdk is not installed or version is incompatible.
    """
    detected_v2 = is_aic_sdk_v2()

    if required_version == "v1" and detected_v2:
        mismatch = (
            "aic-sdk v2 (aic_sdk module) detected, but v1 (aic module) is required. "
            "Please use the v2 classes instead: "
            "'from pipecat.audio.filters.aic_filter import AICFilterV2' or "
            "'from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2'."
        )
    elif required_version == "v2" and not detected_v2:
        mismatch = (
            "aic-sdk v1 (aic module) detected, but v2 (aic_sdk module) is required. "
            "Please update with 'pip install --upgrade aic-sdk>=2.0.0' "
            "or use the v1 classes: "
            "'from pipecat.audio.filters.aic_filter import AICFilter' or "
            "'from pipecat.audio.vad.aic_vad import AICVADAnalyzer'."
        )
    else:
        # Installed SDK generation matches the requirement.
        return

    logger.error(mismatch)
    raise ImportError(mismatch)
|
||||
|
||||
@@ -5,8 +5,7 @@ is_speech_detected() and map it to a float confidence (1.0/0.0). They use
|
||||
10 ms windows based on the sample rate and apply optional AIC VAD parameters.
|
||||
|
||||
Classes:
|
||||
AICVADAnalyzer: For aic-sdk < 2.0.0 (uses 'aic' module)
|
||||
AICVADAnalyzerV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
|
||||
AICVADAnalyzer: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, Optional
|
||||
@@ -17,155 +16,6 @@ from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
|
||||
|
||||
|
||||
class AICVADAnalyzer(VADAnalyzer):
    """VAD analyzer that lazily instantiates the AIC VoiceActivityDetector via a factory.

    The analyzer can be constructed before the AIC Model exists. Once the filter has
    started and the Model is available, the provided factory will succeed and the
    backend VAD will be created. Audio is requested in 10 ms windows (see
    num_frames_required()) and confidence is derived from the backend's
    boolean is_speech_detected() state.

    AIC VAD runtime parameters:
      - lookback_buffer_size:
          Controls the lookback buffer size used by the VAD, i.e. the number of
          window-length audio buffers used as a lookback buffer. Larger values improve
          stability but increase latency.
          Range: 1.0 .. 20.0
          Default (SDK): 6.0
      - sensitivity:
          Controls the energy threshold sensitivity. Higher values make the detector
          less sensitive (require more energy to count as speech).
          Range: 1.0 .. 15.0
          Formula: Energy threshold = 10 ** (-sensitivity)
          Default (SDK): 6.0

    .. note::
        This class requires aic-sdk < 2.0.0 (uses 'aic' module).
        For aic-sdk >= 2.0.0, use :class:`AICVADAnalyzerV2` instead.
    """

    def __init__(
        self,
        *,
        vad_factory: Optional[Callable[[], Any]] = None,
        lookback_buffer_size: Optional[float] = None,
        sensitivity: Optional[float] = None,
    ):
        """Create an AIC VAD analyzer.

        Args:
            vad_factory:
                Zero-arg callable that returns an initialized AIC VoiceActivityDetector.
                This may raise until the filter's Model has been created; the analyzer
                will retry on set_sample_rate/first use.
            lookback_buffer_size:
                Optional override for AIC VAD lookback buffer size.
                Range: 1.0 .. 20.0. Larger values increase stability at the cost of latency.
                If None, the SDK default (6.0) is used.
            sensitivity:
                Optional override for AIC VAD sensitivity (energy threshold).
                Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity).
                If None, the SDK default (6.0) is used.

        Raises:
            ImportError: Propagated from check_aic_sdk_version("v1") when a
                compatible aic-sdk is not available.
        """
        from pipecat.audio.utils import check_aic_sdk_version

        check_aic_sdk_version("v1")

        # Use fixed VAD parameters for AIC: no user override
        fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0)
        super().__init__(sample_rate=None, params=fixed_params)
        self._vad_factory = vad_factory
        self._backend_vad: Optional[Any] = None
        # Held until the backend exists; applied in _apply_backend_params().
        self._pending_lookback: Optional[float] = lookback_buffer_size
        self._pending_sensitivity: Optional[float] = sensitivity

    def bind_vad_factory(self, vad_factory: Callable[[], Any]):
        """Attach or replace the factory post-construction.

        Args:
            vad_factory: Zero-arg callable returning the AIC VoiceActivityDetector.
        """
        self._vad_factory = vad_factory
        self._ensure_backend_initialized()

    def _apply_backend_params(self):
        """Apply the optional AIC VAD parameters to the backend, if one exists.

        Failures while setting a parameter are logged at debug level and
        otherwise ignored.
        """
        from aic import AICVadParameter

        if self._backend_vad is None:
            return
        try:
            if self._pending_lookback is not None:
                self._backend_vad.set_parameter(
                    AICVadParameter.LOOKBACK_BUFFER_SIZE, float(self._pending_lookback)
                )
            if self._pending_sensitivity is not None:
                self._backend_vad.set_parameter(
                    AICVadParameter.SENSITIVITY, float(self._pending_sensitivity)
                )
        except Exception as e:  # noqa: BLE001
            logger.debug(f"AIC VAD parameter application deferred/failed: {e}")

    def _ensure_backend_initialized(self):
        """Create the backend VAD through the factory if it does not exist yet.

        Does nothing when a backend already exists or no factory is bound. A
        failing factory is expected before the filter has started, so the
        error is logged at debug level and initialization is retried on the
        next call.
        """
        if self._backend_vad is not None:
            return
        if not self._vad_factory:
            return
        try:
            self._backend_vad = self._vad_factory()
            self._apply_backend_params()
            # With backend ready, recompute internal frame sizing
            super().set_params(self._params)
            logger.debug("AIC VAD backend initialized in analyzer.")
        except Exception as e:  # noqa: BLE001
            # Filter may not be started yet; try again later
            logger.debug(f"Deferring AIC VAD backend initialization: {e}")

    def set_sample_rate(self, sample_rate: int):
        """Set the sample rate for audio processing.

        Args:
            sample_rate: Audio sample rate in Hz.
        """
        # Set rate and attempt backend initialization once we know SR
        self._sample_rate = self._init_sample_rate or sample_rate
        self._ensure_backend_initialized()
        # Ensure params are initialized even if backend not ready yet
        try:
            super().set_params(self._params)
        except Exception as e:  # noqa: BLE001
            # Best-effort: keep going, but leave a trace instead of hiding the error.
            logger.debug(f"AIC VAD set_params failed in set_sample_rate: {e}")

    def num_frames_required(self) -> int:
        """Get the number of audio frames required for analysis.

        Returns:
            The frame count of a 10 ms window at the current sample rate, or
            160 when the sample rate is not positive.
        """
        # Use 10 ms windows based on sample rate
        return int(self.sample_rate * 0.01) if self.sample_rate > 0 else 160

    def voice_confidence(self, buffer: bytes) -> float:
        """Calculate voice activity confidence for the given audio buffer.

        Args:
            buffer: Audio buffer to analyze. It is not inspected here; only
                the backend's current speech state is queried.

        Returns:
            1.0 if the backend reports speech, otherwise 0.0. Also 0.0 when
            no backend exists yet or the query raises.
        """
        # Ensure backend exists (filter might have started since last call)
        self._ensure_backend_initialized()
        if self._backend_vad is None:
            return 0.0

        # We do not need to analyze 'buffer' here since the model's VAD is updated
        # as part of the enhancement pipeline. Simply query the boolean and map it.
        try:
            is_speech = self._backend_vad.is_speech_detected()
            return 1.0 if is_speech else 0.0
        except Exception as e:  # noqa: BLE001
            logger.error(f"AIC VAD inference error: {e}")
            return 0.0
|
||||
|
||||
|
||||
class AICVADAnalyzerV2(VADAnalyzer):
|
||||
"""VAD analyzer that lazily binds to the AIC VadContext via a factory.
|
||||
|
||||
The analyzer can be constructed before the AIC Processor exists. Once the filter has
|
||||
@@ -173,7 +23,7 @@ class AICVADAnalyzerV2(VADAnalyzer):
|
||||
VadContext will be obtained. We then use the context's is_speech_detected() state
|
||||
to derive confidence values.
|
||||
|
||||
AIC VAD runtime parameters (v2):
|
||||
AIC VAD runtime parameters:
|
||||
- speech_hold_duration:
|
||||
Controls for how long the VAD continues to detect speech after the audio signal
|
||||
no longer contains speech (in seconds).
|
||||
@@ -188,7 +38,6 @@ class AICVADAnalyzerV2(VADAnalyzer):
|
||||
|
||||
.. note::
|
||||
This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module).
|
||||
For aic-sdk < 2.0.0, use :class:`AICVADAnalyzer` instead.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -214,10 +63,6 @@ class AICVADAnalyzerV2(VADAnalyzer):
|
||||
Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity).
|
||||
If None, the SDK default (6.0) is used.
|
||||
"""
|
||||
from pipecat.audio.utils import check_aic_sdk_version
|
||||
|
||||
check_aic_sdk_version("v2")
|
||||
|
||||
# Use fixed VAD parameters for AIC: no user override
|
||||
fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0)
|
||||
super().__init__(sample_rate=None, params=fixed_params)
|
||||
@@ -259,7 +104,7 @@ class AICVADAnalyzerV2(VADAnalyzer):
|
||||
self._apply_vad_params()
|
||||
# With VAD context ready, recompute internal frame sizing
|
||||
super().set_params(self._params)
|
||||
logger.debug("AIC VAD context (v2) initialized in analyzer.")
|
||||
logger.debug("AIC VAD context initialized in analyzer.")
|
||||
except Exception as e: # noqa: BLE001
|
||||
# Filter may not be started yet; try again later
|
||||
logger.debug(f"Deferring AIC VAD context initialization: {e}")
|
||||
|
||||
Reference in New Issue
Block a user