Files
pipecat/scripts/krisp/audio_file_utils.py
Aleix Conchillo Flaqué b3bb6fdaa5 Modernize Python typing across the codebase
Automated via ruff UP006, UP007, UP035, UP045 rules (target: py311):

- Replace `typing.List`, `Dict`, `Tuple`, `Set`, `FrozenSet`, `Type`
  with their built-in equivalents (`list`, `dict`, `tuple`, etc.)
- Replace `typing.Optional[X]` with `X | None`
- Replace `typing.Union[X, Y]` with `X | Y`
- Move `Mapping`, `Sequence`, `Callable`, `Awaitable`,
  `MutableMapping`, `MutableSequence`, `Iterator`, `AsyncIterator`,
  `AsyncGenerator` imports from `typing` to `collections.abc`
- Remove now-unused `typing` imports
- Add `from __future__ import annotations` to 5 files that use
  forward-reference strings in `X | "Y"` annotations
2026-04-16 09:28:23 -07:00

148 lines
4.9 KiB
Python

"""Utility functions for reading and writing audio files in integration tests.
This module provides consistent audio file I/O operations for test scripts,
handling format detection and conversion to int16 PCM format.
"""
import sys
import numpy as np
import soundfile as sf
def read_audio_file(input_path: str, verbose: bool = False) -> tuple[np.ndarray, int]:
"""Read an audio file and convert to int16 mono format.
This function:
- Detects the audio format from the file header
- Reads PCM_16 files directly without conversion
- Converts float formats by scaling to int16 range
- Converts stereo to mono
- Validates the audio data range
Args:
input_path: Path to the input audio file
verbose: If True, print detailed format information
Returns:
Tuple of (audio_data, sample_rate) where audio_data is int16 mono
Raises:
SystemExit: If the audio format is not supported
"""
if verbose:
print(f"Loading audio from: {input_path}")
# Get audio file info to determine the format
info = sf.info(input_path)
if verbose:
print(
f"Audio file format: {info.subtype}, {info.channels} channel(s), {info.samplerate} Hz"
)
# Read audio data based on the source format
if info.subtype in ["PCM_16", "PCM_S16"]:
# File is already int16, read directly to avoid unnecessary conversion
audio_data, sample_rate = sf.read(input_path, dtype="int16")
if verbose:
print("Read as int16 (native format)")
elif info.subtype in ["FLOAT", "DOUBLE"]:
# File is float format, read as float32 and scale to int16
audio_data, sample_rate = sf.read(input_path, dtype="float32")
# Convert float32 (-1.0 to 1.0) to int16 (-32768 to 32767)
audio_data = (audio_data * 32767).astype(np.int16)
if verbose:
print("Read as float32 and scaled to int16")
else:
print(f"Error: Unsupported audio format: {info.subtype}")
print(f"Supported formats: PCM_16, PCM_S16, FLOAT, DOUBLE")
sys.exit(1)
# Convert stereo to mono if needed
if len(audio_data.shape) > 1:
if verbose:
print(f"Converting from {audio_data.shape[1]} channels to mono")
if audio_data.dtype == np.int16:
# For int16, convert to int32 for averaging to avoid overflow
audio_data = audio_data.astype(np.int32).mean(axis=1).astype(np.int16)
else:
audio_data = audio_data.mean(axis=1).astype(np.int16)
# Verify the audio has proper range
audio_max = abs(audio_data.max())
audio_min = abs(audio_data.min())
audio_range = max(audio_max, audio_min)
if audio_range < 100:
print(
f"⚠️ WARNING: Audio values are very small (max: {audio_data.max()}, min: {audio_data.min()})"
)
print(f" Expected int16 range: -32768 to 32767")
print(f" This may indicate a format conversion issue.")
elif verbose:
print(f"Audio range: {audio_data.min()} to {audio_data.max()}")
if verbose:
print(
f"Audio info: {len(audio_data)} samples, {sample_rate} Hz, {len(audio_data) / sample_rate:.2f} seconds"
)
return audio_data, sample_rate
def write_audio_file(
output_path: str, audio_data: np.ndarray, sample_rate: int, verbose: bool = False
) -> None:
"""Write audio data to a file.
Args:
output_path: Path to the output audio file
audio_data: Audio data as numpy array (int16)
sample_rate: Sample rate in Hz
verbose: If True, print status information
Raises:
ValueError: If output file extension is not supported
"""
# Validate output file extension
valid_extensions = [".wav", ".flac", ".ogg"]
output_ext = output_path[output_path.rfind(".") :].lower() if "." in output_path else ""
if output_ext not in valid_extensions:
raise ValueError(
f"Invalid output file extension: '{output_ext}'. "
f"Supported formats: {', '.join(valid_extensions)}"
)
if verbose:
print(f"Saving audio to: {output_path}")
print(f" - Format: {output_ext[1:].upper()}")
print(f" - Samples: {len(audio_data)}")
print(f" - Sample rate: {sample_rate} Hz")
# Write the audio file
sf.write(output_path, audio_data, sample_rate)
if verbose:
print(f"✓ Audio saved successfully")
def calculate_audio_stats(audio_data: np.ndarray) -> dict:
"""Calculate statistics for audio data.
Args:
audio_data: Audio data as numpy array
Returns:
Dictionary with audio statistics
"""
rms = np.sqrt(np.mean(audio_data.astype(np.float32) ** 2))
return {
"min": int(audio_data.min()),
"max": int(audio_data.max()),
"mean": float(audio_data.mean()),
"std": float(audio_data.std()),
"rms": float(rms),
"samples": len(audio_data),
}