diff --git a/pyproject.toml b/pyproject.toml index f209a4c64..65a2a4bf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ Issues = "https://github.com/pipecat-ai/pipecat/issues" Changelog = "https://github.com/pipecat-ai/pipecat/blob/main/CHANGELOG.md" [project.optional-dependencies] -aic = [ "aic-sdk>=1.2.0" ] +aic = [ "aic-sdk>=2.0.0" ] anthropic = [ "anthropic~=0.49.0" ] assemblyai = [ "pipecat-ai[websockets-base]" ] asyncai = [ "pipecat-ai[websockets-base]" ] diff --git a/src/pipecat/audio/filters/aic_filter.py b/src/pipecat/audio/filters/aic_filter.py index f71c09700..d9d7a18a1 100644 --- a/src/pipecat/audio/filters/aic_filter.py +++ b/src/pipecat/audio/filters/aic_filter.py @@ -11,8 +11,7 @@ enhance audio streams in real time. It mirrors the structure of other filters li the Koala filter and integrates with Pipecat's input transport pipeline. Classes: - AICFilter: For aic-sdk < 2.0.0 (uses 'aic' module) - AICFilterV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module) + AICFilter: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module) """ import os @@ -31,258 +30,8 @@ class AICFilter(BaseAudioFilter): Buffers incoming audio to the model's preferred block size and processes planar frames in-place using float32 samples in the linear -1..+1 range. - .. note:: - This class requires aic-sdk < 2.0.0 (uses 'aic' module). - For aic-sdk >= 2.0.0, use :class:`AICFilterV2` instead. - """ - - def __init__( - self, - *, - license_key: str = "", - model_type: Optional["AICModelType"] = None, - enhancement_level: Optional[float] = 1.0, - voice_gain: Optional[float] = 1.0, - noise_gate_enable: Optional[bool] = True, - ) -> None: - """Initialize the AIC filter. - - Args: - license_key: ai-coustics license key for authentication. - model_type: Model variant to load. If None, defaults to AICModelType.QUAIL_STT. - enhancement_level: Optional overall enhancement strength (0.0..1.0). - voice_gain: Optional linear gain applied to detected speech (0.0..4.0). - noise_gate_enable: Optional enable/disable noise gate (default: True). - - .. deprecated:: 1.3.0 - The `noise_gate_enable` parameter is deprecated and no longer has any effect. - It will be removed in a future version. - """ - from pipecat.audio.utils import check_aic_sdk_version - - check_aic_sdk_version("v1") - - # Import AIC SDK v1 types - from aic import AICModelType - - self._license_key = license_key - self._model_type = model_type if model_type is not None else AICModelType.QUAIL_STT - - self._enhancement_level = enhancement_level - self._voice_gain = voice_gain - if noise_gate_enable is not None: - import warnings - - with warnings.catch_warnings(): - warnings.simplefilter("always") - warnings.warn( - "Parameter `noise_gate_enable` is deprecated and no longer has any effect. " - "It will be removed in a future version. Use AIC VAD instead (create_vad_analyzer()).", - DeprecationWarning, - ) - - self._noise_gate_enable = noise_gate_enable - - self._enabled = True - self._sample_rate = 0 - self._aic_ready = False - self._frames_per_block = 0 - self._audio_buffer = bytearray() - # Model will be created in start() since the API now requires sample_rate - self._aic = None - - def get_vad_factory(self): - """Return a zero-arg factory that will create the VAD once the model exists. - - Returns: - A zero-argument callable that, when invoked, returns an initialized - VoiceActivityDetector bound to the underlying AIC model. Raises a - RuntimeError if the model has not been initialized (i.e. start() - has not been called successfully). - """ - - def _factory(): - if self._aic is None: - raise RuntimeError("AIC model not initialized yet. Call start(sample_rate) first.") - return self._aic.create_vad() - - return _factory - - def create_vad_analyzer( - self, - *, - lookback_buffer_size: Optional[float] = None, - sensitivity: Optional[float] = None, - ): - """Return an analyzer that will lazily instantiate the AIC VAD when ready. - - AIC VAD parameters: - - lookback_buffer_size: - Number of window-length audio buffers used as a lookback buffer. - Higher values increase prediction stability but add latency. - Range: 1.0 .. 20.0, Default (SDK): 6.0 - - sensitivity: - Energy threshold sensitivity. Energy threshold = 10 ** (-sensitivity). - Range: 1.0 .. 15.0, Default (SDK): 6.0 - - Args: - lookback_buffer_size: Optional lookback buffer size to configure on the VAD. - Range: 1.0 .. 20.0. If None, SDK default is used. - sensitivity: Optional sensitivity (energy threshold) to configure on the VAD. - Range: 1.0 .. 15.0. If None, SDK default is used. - - Returns: - A lazily-initialized AICVADAnalyzer that will bind to the VAD backend - once the filter's model has been created (after start(sample_rate)). - """ - from pipecat.audio.vad.aic_vad import AICVADAnalyzer - - return AICVADAnalyzer( - vad_factory=self.get_vad_factory(), - lookback_buffer_size=lookback_buffer_size, - sensitivity=sensitivity, - ) - - async def start(self, sample_rate: int): - """Initialize the filter with the transport's sample rate. - - Args: - sample_rate: The sample rate of the input transport in Hz. - - Returns: - None - """ - from aic import AICParameter, Model - - self._sample_rate = sample_rate - - try: - # Create model with required runtime parameters - self._aic = Model( - model_type=self._model_type, - license_key=self._license_key or None, - sample_rate=self._sample_rate, - channels=1, - ) - self._frames_per_block = self._aic.optimal_num_frames() - - # Optional parameter configuration - if self._enhancement_level is not None: - self._aic.set_parameter( - AICParameter.ENHANCEMENT_LEVEL, - float(self._enhancement_level if self._enabled else 0.0), - ) - if self._voice_gain is not None: - self._aic.set_parameter(AICParameter.VOICE_GAIN, float(self._voice_gain)) - - self._aic_ready = True - - # Log processor information - logger.debug(f"ai-coustics filter started:") - logger.debug(f" Sample rate: {self._sample_rate} Hz") - logger.debug(f" Frames per chunk: {self._frames_per_block}") - logger.debug(f" Enhancement strength: {int(self._enhancement_level * 100)}%") - logger.debug(f" Optimal input buffer size: {self._aic.optimal_num_frames()} samples") - logger.debug(f" Optimal sample rate: {self._aic.optimal_sample_rate()} Hz") - logger.debug( - f" Current algorithmic latency: {self._aic.processing_latency() / self._sample_rate * 1000:.2f}ms" - ) - except Exception as e: # noqa: BLE001 - surfacing SDK initialization errors - logger.error(f"AIC model initialization failed: {e}") - self._aic_ready = False - - async def stop(self): - """Clean up the AIC model when stopping. - - Returns: - None - """ - try: - if self._aic is not None: - self._aic.close() - finally: - self._aic = None - self._aic_ready = False - self._audio_buffer.clear() - - async def process_frame(self, frame: FilterControlFrame): - """Process control frames to enable/disable filtering. - - Args: - frame: The control frame containing filter commands. - - Returns: - None - """ - if isinstance(frame, FilterEnableFrame): - from aic import AICParameter - - self._enabled = frame.enable - if self._aic is not None: - try: - level = float(self._enhancement_level if self._enabled else 0.0) - self._aic.set_parameter(AICParameter.ENHANCEMENT_LEVEL, level) - except Exception as e: # noqa: BLE001 - logger.error(f"AIC set_parameter failed: {e}") - - async def filter(self, audio: bytes) -> bytes: - """Apply AIC enhancement to audio data. - - Buffers incoming audio and processes it in chunks that match the AIC - model's required block length. Returns enhanced audio data. - - Args: - audio: Raw audio data as bytes to be filtered (int16 PCM, planar). - - Returns: - Enhanced audio data as bytes (int16 PCM, planar). - """ - if not self._aic_ready or self._aic is None: - return audio - - self._audio_buffer.extend(audio) - - filtered_chunks: List[bytes] = [] - - # Number of int16 samples currently buffered - available_frames = len(self._audio_buffer) // 2 - - while available_frames >= self._frames_per_block: - # Consume exactly one block worth of frames - samples_to_consume = self._frames_per_block * 1 - bytes_to_consume = samples_to_consume * 2 - block_bytes = bytes(self._audio_buffer[:bytes_to_consume]) - - # Convert to float32 in -1..+1 range and reshape to planar (channels, frames) - block_i16 = np.frombuffer(block_bytes, dtype=np.int16) - block_f32 = (block_i16.astype(np.float32) / 32768.0).reshape( - (1, self._frames_per_block) - ) - - # Process planar in-place; returns ndarray (same shape) - out_f32 = await self._aic.process_async(block_f32) - - # Convert back to int16 bytes, planar layout - out_i16 = np.clip(out_f32 * 32768.0, -32768, 32767).astype(np.int16) - filtered_chunks.append(out_i16.reshape(-1).tobytes()) - - # Slide buffer - self._audio_buffer = self._audio_buffer[bytes_to_consume:] - available_frames = len(self._audio_buffer) // 2 - - # Do not flush incomplete frames; keep them buffered for the next call - return b"".join(filtered_chunks) - - -class AICFilterV2(BaseAudioFilter): - """Audio filter using ai-coustics' AIC SDK v2 for real-time enhancement. - - Buffers incoming audio to the model's preferred block size and processes - planar frames in-place using float32 samples in the linear -1..+1 range. - .. note:: This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module). - For aic-sdk < 2.0.0, use :class:`AICFilter` instead. """ def __init__( @@ -311,10 +60,6 @@ class AICFilterV2(BaseAudioFilter): Raises: ValueError: If neither model_id nor model_path is provided. """ - from pipecat.audio.utils import check_aic_sdk_version - - check_aic_sdk_version("v2") - if model_id is None and model_path is None: raise ValueError( "Either 'model_id' or 'model_path' must be provided. " @@ -337,7 +82,7 @@ class AICFilterV2(BaseAudioFilter): self._frames_per_block = 0 self._audio_buffer = bytearray() - # v2 API objects + # AIC SDK objects self._model = None self._processor = None self._processor_ctx = None @@ -362,7 +107,7 @@ class AICFilterV2(BaseAudioFilter): ): """Return an analyzer that will lazily instantiate the AIC VAD when ready. - AIC VAD parameters (v2): + AIC VAD parameters: - speech_hold_duration: How long VAD continues detecting after speech ends (in seconds). Range: 0.0 .. 20x model window length, Default (SDK): 0.05s @@ -377,12 +122,12 @@ class AICFilterV2(BaseAudioFilter): Range: 1.0 .. 15.0. If None, SDK default (6.0) is used. Returns: - A lazily-initialized AICVADAnalyzerV2 that will bind to the VAD context + A lazily-initialized AICVADAnalyzer that will bind to the VAD context once the filter's processor has been created (after start(sample_rate)). """ - from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2 + from pipecat.audio.vad.aic_vad import AICVADAnalyzer - return AICVADAnalyzerV2( + return AICVADAnalyzer( vad_context_factory=lambda: self.get_vad_context(), speech_hold_duration=speech_hold_duration, sensitivity=sensitivity, @@ -446,7 +191,7 @@ class AICFilterV2(BaseAudioFilter): self._aic_ready = True # Log processor information - logger.debug(f"ai-coustics filter (v2) started:") + logger.debug(f"ai-coustics filter started:") logger.debug(f" Model ID: {self._model.get_id()}") logger.debug(f" Sample rate: {self._sample_rate} Hz") logger.debug(f" Frames per chunk: {self._frames_per_block}") diff --git a/src/pipecat/audio/utils.py b/src/pipecat/audio/utils.py index c61aba4dd..29fc44ea9 100644 --- a/src/pipecat/audio/utils.py +++ b/src/pipecat/audio/utils.py @@ -14,10 +14,9 @@ various audio formats used in Pipecat pipelines. import audioop from typing import Literal -from loguru import logger - import numpy as np import pyloudnorm as pyln +from loguru import logger from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler @@ -314,69 +313,3 @@ def is_silence(pcm_bytes: bytes) -> bool: # If max value is lower than SPEAKING_THRESHOLD, consider it as silence return max_value <= SPEAKING_THRESHOLD - - -def is_aic_sdk_v2() -> bool: - """Detect if aic-sdk v2 is installed by checking the module name. - - In v2, the module was renamed from 'aic' to 'aic_sdk'. - - Returns: - True if aic-sdk v2 (aic_sdk module) is installed, False if v1 (aic module). - - Raises: - ImportError: If neither aic nor aic_sdk module is installed. - """ - try: - import aic_sdk # noqa: F401 - - return True - except ModuleNotFoundError: - pass - - try: - import aic # noqa: F401 - - return False - except ModuleNotFoundError: - logger.error("In order to use the AIC filter, you need to `pip install pipecat-ai[aic]`.") - raise ImportError( - "aic-sdk is not installed. Install with 'pip install pipecat-ai[aic]'." - ) - - -def check_aic_sdk_version(required_version: Literal["v1", "v2"]) -> None: - """Check if the aic-sdk is installed and compatible with the module. - - This function checks both that the aic-sdk is installed and that its version - is compatible with the module requirements. Version detection is based on - the module name: v2 uses 'aic_sdk', v1 uses 'aic'. - - Args: - required_version: Either "v1" (for aic-sdk < 2.0.0) or "v2" (for aic-sdk >= 2.0.0). - - Raises: - ImportError: If aic-sdk is not installed or version is incompatible. - """ - is_v2 = is_aic_sdk_v2() - - if required_version == "v1" and is_v2: - error_msg = ( - "aic-sdk v2 (aic_sdk module) detected, but v1 (aic module) is required. " - "Please use the v2 classes instead: " - "'from pipecat.audio.filters.aic_filter import AICFilterV2' or " - "'from pipecat.audio.vad.aic_vad import AICVADAnalyzerV2'." - ) - logger.error(error_msg) - raise ImportError(error_msg) - - if required_version == "v2" and not is_v2: - error_msg = ( - "aic-sdk v1 (aic module) detected, but v2 (aic_sdk module) is required. " - "Please update with 'pip install --upgrade aic-sdk>=2.0.0' " - "or use the v1 classes: " - "'from pipecat.audio.filters.aic_filter import AICFilter' or " - "'from pipecat.audio.vad.aic_vad import AICVADAnalyzer'." - ) - logger.error(error_msg) - raise ImportError(error_msg) diff --git a/src/pipecat/audio/vad/aic_vad.py b/src/pipecat/audio/vad/aic_vad.py index 9ff19feec..05b576ce8 100644 --- a/src/pipecat/audio/vad/aic_vad.py +++ b/src/pipecat/audio/vad/aic_vad.py @@ -5,8 +5,7 @@ is_speech_detected() and map it to a float confidence (1.0/0.0). They use 10 ms windows based on the sample rate and apply optional AIC VAD parameters. Classes: - AICVADAnalyzer: For aic-sdk < 2.0.0 (uses 'aic' module) - AICVADAnalyzerV2: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module) + AICVADAnalyzer: For aic-sdk >= 2.0.0 (uses 'aic_sdk' module) """ from typing import Any, Callable, Optional @@ -17,155 +16,6 @@ from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams class AICVADAnalyzer(VADAnalyzer): - """VAD analyzer that lazily instantiates the AIC VoiceActivityDetector via a factory. - - The analyzer can be constructed before the AIC Model exists. Once the filter has - started and the Model is available, the provided factory will succeed and the - backend VAD will be created. We then switch to single-sample updates where - num_frames_required() returns 1 and confidence is derived from the backend's - boolean is_speech_detected() state. - - AIC VAD runtime parameters: - - lookback_buffer_size: - Controls the lookback buffer size used by the VAD, i.e. the number of - window-length audio buffers used as a lookback buffer. Larger values improve - stability but increase latency. - Range: 1.0 .. 20.0 - Default (SDK): 6.0 - - sensitivity: - Controls the energy threshold sensitivity. Higher values make the detector - less sensitive (require more energy to count as speech). - Range: 1.0 .. 15.0 - Formula: Energy threshold = 10 ** (-sensitivity) - Default (SDK): 6.0 - - .. note:: - This class requires aic-sdk < 2.0.0 (uses 'aic' module). - For aic-sdk >= 2.0.0, use :class:`AICVADAnalyzerV2` instead. - """ - - def __init__( - self, - *, - vad_factory: Optional[Callable[[], Any]] = None, - lookback_buffer_size: Optional[float] = None, - sensitivity: Optional[float] = None, - ): - """Create an AIC VAD analyzer. - - Args: - vad_factory: - Zero-arg callable that returns an initialized AIC VoiceActivityDetector. - This may raise until the filter's Model has been created; the analyzer - will retry on set_sample_rate/first use. - lookback_buffer_size: - Optional override for AIC VAD lookback buffer size. - Range: 1.0 .. 20.0. Larger values increase stability at the cost of latency. - If None, the SDK default (6.0) is used. - sensitivity: - Optional override for AIC VAD sensitivity (energy threshold). - Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity). - If None, the SDK default (6.0) is used. - """ - from pipecat.audio.utils import check_aic_sdk_version - - check_aic_sdk_version("v1") - - # Use fixed VAD parameters for AIC: no user override - fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0) - super().__init__(sample_rate=None, params=fixed_params) - self._vad_factory = vad_factory - self._backend_vad: Optional[Any] = None - self._pending_lookback: Optional[float] = lookback_buffer_size - self._pending_sensitivity: Optional[float] = sensitivity - - def bind_vad_factory(self, vad_factory: Callable[[], Any]): - """Attach or replace the factory post-construction.""" - self._vad_factory = vad_factory - self._ensure_backend_initialized() - - def _apply_backend_params(self): - """Apply optional AIC VAD parameters if available.""" - from aic import AICVadParameter - - if self._backend_vad is None or AICVadParameter is None: - return - try: - if self._pending_lookback is not None: - self._backend_vad.set_parameter( - AICVadParameter.LOOKBACK_BUFFER_SIZE, float(self._pending_lookback) - ) - if self._pending_sensitivity is not None: - self._backend_vad.set_parameter( - AICVadParameter.SENSITIVITY, float(self._pending_sensitivity) - ) - except Exception as e: # noqa: BLE001 - logger.debug(f"AIC VAD parameter application deferred/failed: {e}") - - def _ensure_backend_initialized(self): - if self._backend_vad is not None: - return - if not self._vad_factory: - return - try: - self._backend_vad = self._vad_factory() - self._apply_backend_params() - # With backend ready, recompute internal frame sizing - super().set_params(self._params) - logger.debug("AIC VAD backend initialized in analyzer.") - except Exception as e: # noqa: BLE001 - # Filter may not be started yet; try again later - logger.debug(f"Deferring AIC VAD backend initialization: {e}") - - def set_sample_rate(self, sample_rate: int): - """Set the sample rate for audio processing. - - Args: - sample_rate: Audio sample rate in Hz. - """ - # Set rate and attempt backend initialization once we know SR - self._sample_rate = self._init_sample_rate or sample_rate - self._ensure_backend_initialized() - # Ensure params are initialized even if backend not ready yet - try: - super().set_params(self._params) - except Exception: - pass - - def num_frames_required(self) -> int: - """Get the number of audio frames required for analysis. - - Returns: - Number of frames needed for VAD processing. - """ - # Use 10 ms windows based on sample rate - return int(self.sample_rate * 0.01) if self.sample_rate > 0 else 160 - - def voice_confidence(self, buffer: bytes) -> float: - """Calculate voice activity confidence for the given audio buffer. - - Args: - buffer: Audio buffer to analyze. - - Returns: - Voice confidence score is 0.0 or 1.0. - """ - # Ensure backend exists (filter might have started since last call) - self._ensure_backend_initialized() - if self._backend_vad is None: - return 0.0 - - # We do not need to analyze 'buffer' here since the model's VAD is updated - # as part of the enhancement pipeline. Simply query the boolean and map it. - try: - is_speech = self._backend_vad.is_speech_detected() - return 1.0 if is_speech else 0.0 - except Exception as e: # noqa: BLE001 - logger.error(f"AIC VAD inference error: {e}") - return 0.0 - - -class AICVADAnalyzerV2(VADAnalyzer): """VAD analyzer that lazily binds to the AIC VadContext via a factory. The analyzer can be constructed before the AIC Processor exists. Once the filter has @@ -173,7 +23,7 @@ class AICVADAnalyzerV2(VADAnalyzer): VadContext will be obtained. We then use the context's is_speech_detected() state to derive confidence values. - AIC VAD runtime parameters (v2): + AIC VAD runtime parameters: - speech_hold_duration: Controls for how long the VAD continues to detect speech after the audio signal no longer contains speech (in seconds). @@ -188,7 +38,6 @@ class AICVADAnalyzerV2(VADAnalyzer): .. note:: This class requires aic-sdk >= 2.0.0 (uses 'aic_sdk' module). - For aic-sdk < 2.0.0, use :class:`AICVADAnalyzer` instead. """ def __init__( @@ -214,10 +63,6 @@ class AICVADAnalyzerV2(VADAnalyzer): Range: 1.0 .. 15.0. Energy threshold = 10 ** (-sensitivity). If None, the SDK default (6.0) is used. """ - from pipecat.audio.utils import check_aic_sdk_version - - check_aic_sdk_version("v2") - # Use fixed VAD parameters for AIC: no user override fixed_params = VADParams(confidence=0.5, start_secs=0.0, stop_secs=0.0, min_volume=0.0) super().__init__(sample_rate=None, params=fixed_params) @@ -259,7 +104,7 @@ class AICVADAnalyzerV2(VADAnalyzer): self._apply_vad_params() # With VAD context ready, recompute internal frame sizing super().set_params(self._params) - logger.debug("AIC VAD context (v2) initialized in analyzer.") + logger.debug("AIC VAD context initialized in analyzer.") except Exception as e: # noqa: BLE001 # Filter may not be started yet; try again later logger.debug(f"Deferring AIC VAD context initialization: {e}")