Drop aic-sdk v1 support from the AIC filter and VAD analyzer.

This commit is contained in:
Gökmen Görgen
2026-01-16 12:31:41 +01:00
parent d3bdd2d246
commit a90c15362c
4 changed files with 12 additions and 489 deletions

View File

@@ -48,7 +48,7 @@ Issues = "https://github.com/pipecat-ai/pipecat/issues"
Changelog = "https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md"
[project.optional-dependencies]
aic = [ "aic-sdk>=1.2.0" ]
aic = [ "aic-sdk>=2.0.0" ]
anthropic = [ "anthropic~=0.49.0" ]
assemblyai = [ "pipecat-ai[websockets-base]" ]
asyncai = [ "pipecat-ai[websockets-base]" ]

View File

@@ -11,8 +11,7 @@ enhance audio streams in real time. It mirrors the structure of other filters li
the Koala filter and integrates with Pipecat's input transport pipeline.
Classes:
AICFilter: For aic-sdk < 2.0.0 (uses 'aic' module)
AICFilterV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
AICFilter: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
"""
import os
@@ -31,258 +30,8 @@ class AICFilter(BaseAudioFilter):
Buffers incoming audio to the model's preferred block size and processes
planar frames in-place using float32 samples in the linear -1..+1 range.
.. note::
This class requires aic-sdk < 2.0.0 (uses 'aic' module).
For aic-sdk >= 2.0.0, use :class:`AICFilterV2` instead.
"""
def __init__(
self,
*,
license_key: str = "",
model_type: Optional["AICModelType"] = None,
enhancement_level: Optional[float] = 1.0,
voice_gain: Optional[float] = 1.0,
noise_gate_enable: Optional[bool] = True,
) -> None:
"""Initialize the AIC filter.

Only configuration is stored here; the SDK model itself is created later
in start(), once the sample rate is known.

Args:
license_key: ai-coustics license key for authentication.
model_type: Model variant to load. If None, defaults to AICModelType.QUAIL_STT.
enhancement_level: Optional overall enhancement strength (0.0..1.0).
voice_gain: Optional linear gain applied to detected speech (0.0..4.0).
noise_gate_enable: Optional enable/disable noise gate (default: True).
.. deprecated:: 1.3.0
The `noise_gate_enable` parameter is deprecated and no longer has any effect.
It will be removed in a future version.

Raises:
ImportError: Propagated from check_aic_sdk_version("v1") when the SDK is
missing or the v2 module is installed instead.
"""
# Imported lazily so that the module can be imported without the optional SDK.
from pipecat.audio.utils import check_aic_sdk_version
check_aic_sdk_version("v1")
# Import AIC SDK v1 types
from aic import AICModelType
self._license_key = license_key
self._model_type = model_type if model_type is not None else AICModelType.QUAIL_STT
self._enhancement_level = enhancement_level
self._voice_gain = voice_gain
# NOTE(review): the parameter defaults to True, not None, so this branch runs and
# the DeprecationWarning is emitted even when the caller never passes
# noise_gate_enable. Confirm that is intended.
if noise_gate_enable is not None:
import warnings
# simplefilter("always") is scoped to this context manager, so the global
# warning filters are restored on exit.
with warnings.catch_warnings():
warnings.simplefilter("always")
# No stacklevel is passed, so the warning is attributed to this line rather
# than to the caller's constructor call.
warnings.warn(
"Parameter `noise_gate_enable` is deprecated and no longer has any effect. "
"It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).",
DeprecationWarning,
)
# Stored only; nothing else in this method reads it.
self._noise_gate_enable = noise_gate_enable
self._enabled = True
self._sample_rate = 0
self._aic_ready = False
self._frames_per_block = 0
# Carries leftover bytes between filter() calls until a full block is available.
self._audio_buffer = bytearray()
# Model will be created in start() since the API now requires sample_rate
self._aic = None
def get_vad_factory(self):
"""Return a zero-arg factory that will create the VAD once the model exists.

The factory closes over this filter, so it can be handed out before
start() has run and will see the model as soon as it is assigned.

Returns:
A zero-argument callable that, when invoked, returns an initialized
VoiceActivityDetector bound to the underlying AIC model. Raises a
RuntimeError if the model has not been initialized (i.e. start()
has not been called successfully).
"""
def _factory():
# Only the presence of the model object is checked; _aic_ready is not consulted.
if self._aic is None:
raise RuntimeError("AIC model not initialized yet. Call start(sample_rate) first.")
# Every invocation asks the model for a VAD via create_vad().
return self._aic.create_vad()
return _factory
def create_vad_analyzer(
self,
*,
lookback_buffer_size: Optional[float] = None,
sensitivity: Optional[float] = None,
):
"""Return an analyzer that will lazily instantiate the AIC VAD when ready.

Both arguments are forwarded unchanged to AICVADAnalyzer; no range
validation happens in this method.

AIC VAD parameters:
- lookback_buffer_size:
Number of window-length audio buffers used as a lookback buffer.
Higher values increase prediction stability but add latency.
Range: 1.0 .. 20.0, Default (SDK): 6.0
- sensitivity:
Energy threshold sensitivity. Energy threshold = 10 ** (-sensitivity).
Range: 1.0 .. 15.0, Default (SDK): 6.0
Args:
lookback_buffer_size: Optional lookback buffer size to configure on the VAD.
Range: 1.0 .. 20.0. If None, SDK default is used.
sensitivity: Optional sensitivity (energy threshold) to configure on the VAD.
Range: 1.0 .. 15.0. If None, SDK default is used.
Returns:
A lazily-initialized AICVADAnalyzer that will bind to the VAD backend
once the filter's model has been created (after start(sample_rate)).
"""
# Function-scope import: the analyzer module is only loaded when a VAD is requested.
from pipecat.audio.vad.aic_vad import AICVADAnalyzer
return AICVADAnalyzer(
# The factory defers VAD creation until the model exists.
vad_factory=self.get_vad_factory(),
lookback_buffer_size=lookback_buffer_size,
sensitivity=sensitivity,
)
async def start(self, sample_rate: int):
    """Initialize the filter with the transport's sample rate.

    Creates the SDK model for a single channel at ``sample_rate``, reads the
    model's preferred block size, applies the optional enhancement level and
    voice gain, and marks the filter ready. Any exception raised along the
    way is logged and leaves the filter in pass-through mode
    (``_aic_ready`` is False), it is not re-raised.

    Args:
        sample_rate: The sample rate of the input transport in Hz.

    Returns:
        None
    """
    from aic import AICParameter, Model

    self._sample_rate = sample_rate
    try:
        # Create model with required runtime parameters
        self._aic = Model(
            model_type=self._model_type,
            license_key=self._license_key or None,
            sample_rate=self._sample_rate,
            channels=1,
        )
        self._frames_per_block = self._aic.optimal_num_frames()

        # Optional parameter configuration
        if self._enhancement_level is not None:
            self._aic.set_parameter(
                AICParameter.ENHANCEMENT_LEVEL,
                float(self._enhancement_level if self._enabled else 0.0),
            )
        if self._voice_gain is not None:
            self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain))
        self._aic_ready = True

        # enhancement_level may legitimately be None (SDK default is kept above).
        # Formatting it unconditionally raised TypeError here, which the handler
        # below then reported as an initialization failure and disabled a model
        # that had in fact started correctly.
        if self._enhancement_level is None:
            strength = "SDK default"
        else:
            strength = f"{int(self._enhancement_level * 100)}%"

        # Log processor information
        logger.debug("ai-coustics filter started:")
        logger.debug(f" Sample rate: {self._sample_rate} Hz")
        logger.debug(f" Frames per chunk: {self._frames_per_block}")
        logger.debug(f" Enhancement strength: {strength}")
        logger.debug(f" Optimal input buffer size: {self._aic.optimal_num_frames()} samples")
        logger.debug(f" Optimal sample rate: {self._aic.optimal_sample_rate()} Hz")
        logger.debug(
            f" Current algorithmic latency: {self._aic.processing_latency() / self._sample_rate * 1000:.2f}ms"
        )
    except Exception as e:  # noqa: BLE001 - surfacing SDK initialization errors
        logger.error(f"AIC model initialization failed: {e}")
        self._aic_ready = False
async def stop(self):
"""Clean up the AIC model when stopping.

Closes the model if one exists, then resets the filter to its
not-started state and discards any buffered audio.

Returns:
None
"""
try:
if self._aic is not None:
self._aic.close()
finally:
# Runs even if close() raises; there is no except clause, so that exception
# still propagates to the caller after the state has been reset.
self._aic = None
self._aic_ready = False
self._audio_buffer.clear()
async def process_frame(self, frame: FilterControlFrame):
"""Process control frames to enable/disable filtering.

Only FilterEnableFrame is handled; any other frame type is ignored.
Disabling does not bypass the model: it sets the enhancement level
parameter to 0.0 on the running model.

Args:
frame: The control frame containing filter commands.
Returns:
None
"""
if isinstance(frame, FilterEnableFrame):
from aic import AICParameter
self._enabled = frame.enable
# Before start() there is no model; the new flag is picked up when start()
# applies the enhancement level.
if self._aic is not None:
try:
# NOTE(review): if the filter was constructed with enhancement_level=None,
# re-enabling evaluates float(None), which raises TypeError and is
# reported by the handler below. Confirm the intended behaviour.
level = float(self._enhancement_level if self._enabled else 0.0)
self._aic.set_parameter(AICParameter.ENHANCEMENT_LEVEL, level)
except Exception as e: # noqa: BLE001
logger.error(f"AIC set_parameter failed: {e}")
async def filter(self, audio: bytes) -> bytes:
    """Apply AIC enhancement to audio data.

    Incoming bytes are appended to an internal buffer and consumed in
    blocks of exactly ``_frames_per_block`` int16 samples. Each block is
    converted to float32 in the -1..+1 range with shape ``(1, frames)``,
    passed to the model's ``process_async``, and converted back to int16.
    Bytes that do not fill a whole block stay buffered for the next call,
    so the returned data can be shorter than the input (or empty).

    Args:
        audio: Raw audio data as bytes to be filtered (int16 PCM, planar).

    Returns:
        Enhanced audio data as bytes (int16 PCM, planar). When the model is
        not ready, or no valid block size is known, ``audio`` is returned
        unchanged.
    """
    if not self._aic_ready or self._aic is None:
        return audio

    frames_per_block = self._frames_per_block
    if frames_per_block <= 0:
        # A zero block size would make the loop below spin forever while
        # consuming nothing; pass the audio through instead.
        return audio

    bytes_per_block = frames_per_block * 2  # int16 mono: 2 bytes per frame
    buffer = self._audio_buffer
    buffer.extend(audio)

    filtered_chunks: List[bytes] = []
    while len(buffer) >= bytes_per_block:
        # Copy out exactly one block; the copy is independent of the buffer,
        # so the buffer can be trimmed while the block is being processed.
        block_i16 = np.frombuffer(bytes(buffer[:bytes_per_block]), dtype=np.int16)

        # Convert to float32 in -1..+1 range and reshape to planar (channels, frames)
        block_f32 = (block_i16.astype(np.float32) / 32768.0).reshape((1, frames_per_block))

        # Process planar in-place; returns ndarray (same shape)
        out_f32 = await self._aic.process_async(block_f32)

        # Convert back to int16 bytes, planar layout
        out_i16 = np.clip(out_f32 * 32768.0, -32768, 32767).astype(np.int16)
        filtered_chunks.append(out_i16.reshape(-1).tobytes())

        # Drop the consumed block in place instead of rebuilding the
        # bytearray from a slice on every iteration.
        del buffer[:bytes_per_block]

    # Do not flush incomplete frames; keep them buffered for the next call
    return b"".join(filtered_chunks)
class AICFilterV2(BaseAudioFilter):
"""Audio filter using ai-coustics' AIC SDK v2 for real-time enhancement.
Buffers incoming audio to the model's preferred block size and processes
planar frames in-place using float32 samples in the linear -1..+1 range.
.. note::
This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module).
For aic-sdk < 2.0.0, use :class:`AICFilter` instead.
"""
def __init__(
@@ -311,10 +60,6 @@ class AICFilterV2(BaseAudioFilter):
Raises:
ValueError: If neither model_id nor model_path is provided.
"""
from pipecat.audio.utils import check_aic_sdk_version
check_aic_sdk_version("v2")
if model_id is None and model_path is None:
raise ValueError(
"Either 'model_id' or 'model_path' must be provided. "
@@ -337,7 +82,7 @@ class AICFilterV2(BaseAudioFilter):
self._frames_per_block = 0
self._audio_buffer = bytearray()
# v2 API objects
# AIC SDK objects
self._model = None
self._processor = None
self._processor_ctx = None
@@ -362,7 +107,7 @@ class AICFilterV2(BaseAudioFilter):
):
"""Return an analyzer that will lazily instantiate the AIC VAD when ready.
AIC VAD parameters (v2):
AIC VAD parameters:
- speech_hold_duration:
How long VAD continues detecting after speech ends (in seconds).
Range: 0.0 .. 20x model window length, Default (SDK): 0.05s
@@ -377,12 +122,12 @@ class AICFilterV2(BaseAudioFilter):
Range: 1.0 .. 15.0. If None, SDK default (6.0) is used.
Returns:
A lazily-initialized AICVADAnalyzerV2 that will bind to the VAD context
A lazily-initialized AICVADAnalyzer that will bind to the VAD context
once the filter's processor has been created (after start(sample_rate)).
"""
from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2
from pipecat.audio.vad.aic_vad import AICVADAnalyzer
return AICVADAnalyzerV2(
return AICVADAnalyzer(
vad_context_factory=lambda: self.get_vad_context(),
speech_hold_duration=speech_hold_duration,
sensitivity=sensitivity,
@@ -446,7 +191,7 @@ class AICFilterV2(BaseAudioFilter):
self._aic_ready = True
# Log processor information
logger.debug(f"ai-coustics filter (v2) started:")
logger.debug(f"ai-coustics filter started:")
logger.debug(f" Model ID: {self._model.get_id()}")
logger.debug(f" Sample rate: {self._sample_rate} Hz")
logger.debug(f" Frames per chunk: {self._frames_per_block}")

View File

@@ -14,10 +14,9 @@ various audio formats used in Pipecat pipelines.
import audioop
from typing import Literal
from loguru import logger
import numpy as np
import pyloudnorm as pyln
from loguru import logger
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
@@ -314,69 +313,3 @@ def is_silence(pcm_bytes: bytes) -> bool:
# If max value is lower than SPEAKING_THRESHOLD, consider it as silence
return max_value <= SPEAKING_THRESHOLD
def is_aic_sdk_v2() -> bool:
"""Detect if aic-sdk v2 is installed by checking the module name.

In v2, the module was renamed from 'aic' to 'aic_sdk'. Detection works by
actually importing the module, and 'aic_sdk' is tried first, so when both
modules are importable the result is True.

Returns:
True if aic-sdk v2 (aic_sdk module) is installed, False if v1 (aic module).
Raises:
ImportError: If neither aic nor aic_sdk module is installed.
"""
try:
import aic_sdk # noqa: F401
return True
except ModuleNotFoundError:
# Only a missing module falls through to the v1 probe; any other error raised
# while importing aic_sdk propagates to the caller.
pass
try:
import aic # noqa: F401
return False
except ModuleNotFoundError:
# Neither module is available: log the install hint, then raise for the caller.
logger.error("In order to use the AIC filter, you need to `pip install pipecat-ai[aic]`.")
raise ImportError(
"aic-sdk is not installed. Install with 'pip install pipecat-ai[aic]'."
)
def check_aic_sdk_version(required_version: Literal["v1", "v2"]) -> None:
    """Check if the aic-sdk is installed and compatible with the module.

    This function checks both that the aic-sdk is installed and that its version
    is compatible with the module requirements. Version detection is based on
    the module name: v2 uses 'aic_sdk', v1 uses 'aic'. On a mismatch the
    message is logged at error level and then raised.

    Args:
        required_version: Either "v1" (for aic-sdk < 2.0.0) or "v2" (for aic-sdk >= 2.0.0).
            Any other value matches neither check and returns without raising.

    Raises:
        ImportError: If aic-sdk is not installed or version is incompatible.
    """
    is_v2 = is_aic_sdk_v2()

    if required_version == "v1" and is_v2:
        error_msg = (
            "aic-sdk v2 (aic_sdk module) detected, but v1 (aic module) is required. "
            "Please use the v2 classes instead: "
            "'from pipecat.audio.filters.aic_filter import AICFilterV2' or "
            "'from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2'."
        )
        logger.error(error_msg)
        raise ImportError(error_msg)

    if required_version == "v2" and not is_v2:
        # The requirement is quoted in the suggested command: pasted unquoted
        # into a POSIX shell, ">=2.0.0" is parsed as an output redirection and
        # pip would install an unconstrained "aic-sdk".
        error_msg = (
            "aic-sdk v1 (aic module) detected, but v2 (aic_sdk module) is required. "
            "Please update with 'pip install --upgrade \"aic-sdk>=2.0.0\"' "
            "or use the v1 classes: "
            "'from pipecat.audio.filters.aic_filter import AICFilter' or "
            "'from pipecat.audio.vad.aic_vad import AICVADAnalyzer'."
        )
        logger.error(error_msg)
        raise ImportError(error_msg)

View File

@@ -5,8 +5,7 @@ is_speech_detected() and map it to a float confidence (1.0/0.0). They use
10 ms windows based on the sample rate and apply optional AIC VAD parameters.
Classes:
AICVADAnalyzer: For aic-sdk < 2.0.0 (uses 'aic' module)
AICVADAnalyzerV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
AICVADAnalyzer: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module)
"""
from typing import Any, Callable, Optional
@@ -17,155 +16,6 @@ from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
class AICVADAnalyzer(VADAnalyzer):
"""VAD analyzer that lazily instantiates the AIC VoiceActivityDetector via a factory.
The analyzer can be constructed before the AIC Model exists. Once the filter has
started and the Model is available, the provided factory will succeed and the
backend VAD will be created. We then switch to single-sample updates where
num_frames_required() returns 1 and confidence is derived from the backend's
boolean is_speech_detected() state.
AIC VAD runtime parameters:
- lookback_buffer_size:
Controls the lookback buffer size used by the VAD, i.e. the number of
window-length audio buffers used as a lookback buffer. Larger values improve
stability but increase latency.
Range: 1.0 .. 20.0
Default (SDK): 6.0
- sensitivity:
Controls the energy threshold sensitivity. Higher values make the detector
less sensitive (require more energy to count as speech).
Range: 1.0 .. 15.0
Formula: Energy threshold = 10 ** (-sensitivity)
Default (SDK): 6.0
.. note::
This class requires aic-sdk < 2.0.0 (uses 'aic' module).
For aic-sdk >= 2.0.0, use :class:`AICVADAnalyzerV2` instead.
"""
def __init__(
self,
*,
vad_factory: Optional[Callable[[], Any]] = None,
lookback_buffer_size: Optional[float] = None,
sensitivity: Optional[float] = None,
):
"""Create an AIC VAD analyzer.

The backend VAD is not created here; only the factory and the requested
parameter overrides are stored for later use.

Args:
vad_factory:
Zero-arg callable that returns an initialized AIC VoiceActivityDetector.
This may raise until the filter's Model has been created; the analyzer
will retry on set_sample_rate/first use.
lookback_buffer_size:
Optional override for AIC VAD lookback buffer size.
Range: 1.0 .. 20.0. Larger values increase stability at the cost of latency.
If None, the SDK default (6.0) is used.
sensitivity:
Optional override for AIC VAD sensitivity (energy threshold).
Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity).
If None, the SDK default (6.0) is used.

Raises:
ImportError: Propagated from check_aic_sdk_version("v1").
"""
from pipecat.audio.utils import check_aic_sdk_version
check_aic_sdk_version("v1")
# Use fixed VAD parameters for AIC: no user override
fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0)
# The sample rate is passed as None here and supplied later via set_sample_rate().
super().__init__(sample_rate=None, params=fixed_params)
self._vad_factory = vad_factory
# Stays None until _ensure_backend_initialized() gets a VAD from the factory.
self._backend_vad: Optional[Any] = None
# Overrides are held as "pending" and applied once the backend exists.
self._pending_lookback: Optional[float] = lookback_buffer_size
self._pending_sensitivity: Optional[float] = sensitivity
def bind_vad_factory(self, vad_factory: Callable[[], Any]):
"""Attach or replace the factory post-construction.

Args:
vad_factory: Zero-arg callable that returns the backend VAD.
"""
self._vad_factory = vad_factory
# Tries to create the backend right away; this is a no-op if a backend already
# exists, so replacing the factory does not rebuild an existing backend.
self._ensure_backend_initialized()
def _apply_backend_params(self):
"""Apply optional AIC VAD parameters if available.

Pushes the pending lookback buffer size and sensitivity overrides to the
backend VAD. Overrides left as None are skipped. Failures are logged at
debug level and not raised.
"""
from aic import AICVadParameter
# NOTE(review): the import above either binds AICVadParameter or raises, so the
# `is None` half of this check looks redundant; confirm before removing.
if self._backend_vad is None or AICVadParameter is None:
return
try:
# Both parameters share one try block: if setting the lookback size raises,
# the sensitivity override is not attempted.
if self._pending_lookback is not None:
self._backend_vad.set_parameter(
AICVadParameter.LOOKBACK_BUFFER_SIZE, float(self._pending_lookback)
)
if self._pending_sensitivity is not None:
self._backend_vad.set_parameter(
AICVadParameter.SENSITIVITY, float(self._pending_sensitivity)
)
except Exception as e: # noqa: BLE001
logger.debug(f"AIC VAD parameter application deferred/failed: {e}")
def _ensure_backend_initialized(self):
"""Create the backend VAD from the factory if it does not exist yet.

Safe to call repeatedly: it returns immediately once a backend exists or
when no factory is set. A failing factory is logged at debug level and
retried on the next call.
"""
if self._backend_vad is not None:
return
if not self._vad_factory:
return
try:
self._backend_vad = self._vad_factory()
self._apply_backend_params()
# With backend ready, recompute internal frame sizing
super().set_params(self._params)
logger.debug("AIC VAD backend initialized in analyzer.")
except Exception as e: # noqa: BLE001
# Filter may not be started yet; try again later
# NOTE(review): if set_params() raises after the factory succeeded, _backend_vad
# stays assigned and later calls return early without retrying set_params().
logger.debug(f"Deferring AIC VAD backend initialization: {e}")
def set_sample_rate(self, sample_rate: int):
    """Set the sample rate for audio processing.

    A sample rate fixed at construction time (``_init_sample_rate``) takes
    precedence over the value passed here. Once the rate is known, the
    backend VAD is initialized if possible and the analyzer parameters are
    re-applied.

    Args:
        sample_rate: Audio sample rate in Hz.
    """
    # Set rate and attempt backend initialization once we know SR
    self._sample_rate = self._init_sample_rate or sample_rate
    self._ensure_backend_initialized()

    # Ensure params are initialized even if backend not ready yet
    try:
        super().set_params(self._params)
    except Exception as e:  # noqa: BLE001
        # Still best-effort, but leave a trace instead of discarding the
        # error silently, matching the other handlers in this class.
        logger.debug(f"AIC VAD set_params deferred/failed: {e}")
def num_frames_required(self) -> int:
"""Get the number of audio frames required for analysis.

Returns:
Number of frames needed for VAD processing: 10 ms worth of frames at the
current sample rate, or 160 when the sample rate is not positive.
"""
# Use 10 ms windows based on sample rate
# The 160 fallback equals 10 ms at 16 kHz.
return int(self.sample_rate * 0.01) if self.sample_rate > 0 else 160
def voice_confidence(self, buffer: bytes) -> float:
"""Calculate voice activity confidence for the given audio buffer.

The buffer contents are not inspected; the result mirrors the backend
VAD's current boolean speech state.

Args:
buffer: Audio buffer to analyze.
Returns:
Voice confidence score is 0.0 or 1.0. 0.0 is also returned when the
backend is not available yet or when querying it raises.
"""
# Ensure backend exists (filter might have started since last call)
self._ensure_backend_initialized()
if self._backend_vad is None:
return 0.0
# We do not need to analyze 'buffer' here since the model's VAD is updated
# as part of the enhancement pipeline. Simply query the boolean and map it.
try:
is_speech = self._backend_vad.is_speech_detected()
return 1.0 if is_speech else 0.0
except Exception as e: # noqa: BLE001
# Backend errors are reported as "no speech" rather than raised.
logger.error(f"AIC VAD inference error: {e}")
return 0.0
class AICVADAnalyzerV2(VADAnalyzer):
"""VAD analyzer that lazily binds to the AIC VadContext via a factory.
The analyzer can be constructed before the AIC Processor exists. Once the filter has
@@ -173,7 +23,7 @@ class AICVADAnalyzerV2(VADAnalyzer):
VadContext will be obtained. We then use the context's is_speech_detected() state
to derive confidence values.
AIC VAD runtime parameters (v2):
AIC VAD runtime parameters:
- speech_hold_duration:
Controls for how long the VAD continues to detect speech after the audio signal
no longer contains speech (in seconds).
@@ -188,7 +38,6 @@ class AICVADAnalyzerV2(VADAnalyzer):
.. note::
This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module).
For aic-sdk < 2.0.0, use :class:`AICVADAnalyzer` instead.
"""
def __init__(
@@ -214,10 +63,6 @@ class AICVADAnalyzerV2(VADAnalyzer):
Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity).
If None, the SDK default (6.0) is used.
"""
from pipecat.audio.utils import check_aic_sdk_version
check_aic_sdk_version("v2")
# Use fixed VAD parameters for AIC: no user override
fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0)
super().__init__(sample_rate=None, params=fixed_params)
@@ -259,7 +104,7 @@ class AICVADAnalyzerV2(VADAnalyzer):
self._apply_vad_params()
# With VAD context ready, recompute internal frame sizing
super().set_params(self._params)
logger.debug("AIC VAD context (v2) initialized in analyzer.")
logger.debug("AIC VAD context initialized in analyzer.")
except Exception as e: # noqa: BLE001
# Filter may not be started yet; try again later
logger.debug(f"Deferring AIC VAD context initialization: {e}")