Audio Signal Processing Fundamentals
Audio signal processing forms the foundation for all speech and audio AI systems. Understanding how sound is captured, represented, and transformed is essential for building effective models for speech recognition, synthesis, and audio understanding. This section covers the core concepts of digital audio, spectral analysis, and feature extraction techniques that underpin modern audio AI.
Digital Audio Basics
Sound waves must be converted to digital form for processing:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, List, Dict, Union
from dataclasses import dataclass
import matplotlib.pyplot as plt
@dataclass
class AudioConfig:
"""Configuration for audio processing."""
sample_rate: int = 16000 # Samples per second (Hz)
n_fft: int = 400 # FFT window size
hop_length: int = 160 # Samples between STFT frames
n_mels: int = 80 # Number of mel filterbank channels
f_min: float = 0.0 # Minimum frequency for mel filterbank
f_max: Optional[float] = 8000.0 # Maximum frequency
window: str = "hann" # Window function
def audio_fundamentals():
"""
Digital Audio Fundamentals.
Key concepts:
- Sampling: Converting continuous signal to discrete samples
- Sample rate: Number of samples per second (Hz)
- Nyquist theorem: Sample rate must be > 2x highest frequency
- Bit depth: Bits per sample (quantization resolution)
"""
# Common sample rates
sample_rates = {
8000: "Telephone quality",
16000: "Speech recognition standard",
22050: "Low-quality music",
44100: "CD quality",
48000: "Professional audio/video",
96000: "High-resolution audio"
}
# Nyquist frequency = sample_rate / 2
# Human hearing: ~20 Hz to ~20,000 Hz
# CD quality (44.1 kHz) can represent up to 22.05 kHz
print("Common Sample Rates:")
for rate, description in sample_rates.items():
nyquist = rate // 2
print(f" {rate:5d} Hz: {description} (Nyquist: {nyquist} Hz)")
return sample_rates
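# Illustrative sketch of the Nyquist limit (added example, not part of the
# original classes): a 5 kHz tone sampled at 8 kHz aliases down to 3 kHz,
# while 16 kHz sampling represents it correctly.
def nyquist_demo():
    """Show aliasing when a tone exceeds the Nyquist frequency."""
    tone_hz = 5000
    for sr in (8000, 16000):
        t = np.arange(0, 0.05, 1 / sr)                 # 50 ms of samples
        x = np.sin(2 * np.pi * tone_hz * t)
        spectrum = np.abs(np.fft.rfft(x))
        peak_hz = np.fft.rfftfreq(len(x), d=1 / sr)[spectrum.argmax()]
        print(f"  {sr} Hz sampling -> dominant frequency ~{peak_hz:.0f} Hz")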
class AudioLoader:
"""
Load and preprocess audio files.
"""
def __init__(self, config: AudioConfig):
self.config = config
def load(
self,
path: str,
normalize: bool = True
) -> Tuple[torch.Tensor, int]:
"""
Load audio file and resample if needed.
Args:
path: Path to audio file
normalize: Whether to normalize amplitude
        Returns:
            waveform: [1, num_samples] mono tensor (stereo is downmixed)
            sample_rate: Target sample rate from the config
"""
import torchaudio
waveform, sr = torchaudio.load(path)
# Resample if necessary
if sr != self.config.sample_rate:
resampler = torchaudio.transforms.Resample(
orig_freq=sr,
new_freq=self.config.sample_rate
)
waveform = resampler(waveform)
# Convert stereo to mono
if waveform.size(0) > 1:
waveform = waveform.mean(dim=0, keepdim=True)
# Normalize
if normalize:
waveform = waveform / (waveform.abs().max() + 1e-8)
return waveform, self.config.sample_rate
def load_batch(
self,
paths: List[str],
max_length: Optional[int] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Load and pad batch of audio files.
Returns:
waveforms: [batch, max_samples] padded tensor
lengths: [batch] actual lengths
"""
waveforms = []
lengths = []
for path in paths:
wav, _ = self.load(path)
waveforms.append(wav.squeeze(0))
lengths.append(wav.size(1))
# Pad to max length
if max_length is None:
max_length = max(lengths)
padded = torch.zeros(len(waveforms), max_length)
for i, (wav, length) in enumerate(zip(waveforms, lengths)):
actual_length = min(length, max_length)
padded[i, :actual_length] = wav[:actual_length]
lengths[i] = actual_length
return padded, torch.tensor(lengths)
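# Usage sketch (file names below are hypothetical placeholders; this needs
# torchaudio and real audio files on disk, so it is shown but not executed):
# loader = AudioLoader(AudioConfig())
# waveform, sr = loader.load("speech.wav")                  # [1, num_samples], 16000
# batch, lengths = loader.load_batch(["a.wav", "b.wav"], max_length=16000)
# print(batch.shape, lengths)                               # [2, 16000], lengths in samples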
def generate_waveforms():
"""Generate example waveforms for visualization."""
sample_rate = 16000
duration = 0.1 # 100ms
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
# Pure sine wave
freq = 440 # A4 note
sine_wave = np.sin(2 * np.pi * freq * t)
# Complex wave (multiple harmonics)
complex_wave = (
np.sin(2 * np.pi * freq * t) + # Fundamental
0.5 * np.sin(2 * np.pi * 2 * freq * t) + # 2nd harmonic
0.25 * np.sin(2 * np.pi * 3 * freq * t) # 3rd harmonic
)
    complex_wave = complex_wave / np.abs(complex_wave).max()
# White noise
noise = np.random.randn(len(t))
    noise = noise / np.abs(noise).max()
return {
'sine': (t, sine_wave),
'complex': (t, complex_wave),
'noise': (t, noise)
    }

Fourier Transform and Spectral Analysis
The Fourier transform reveals frequency content of audio signals:
class SpectralAnalysis:
"""
Spectral analysis tools for audio.
"""
def __init__(self, config: AudioConfig):
self.config = config
def fft(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute Fast Fourier Transform.
Transforms time-domain signal to frequency domain.
Args:
waveform: [batch, samples] or [samples]
Returns:
spectrum: Complex frequency spectrum
"""
return torch.fft.fft(waveform)
def magnitude_spectrum(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute magnitude spectrum (amplitude at each frequency).
"""
spectrum = self.fft(waveform)
magnitude = torch.abs(spectrum)
# Only keep positive frequencies (up to Nyquist)
n = waveform.size(-1)
return magnitude[..., :n // 2 + 1]
def power_spectrum(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute power spectrum (squared magnitude).
"""
magnitude = self.magnitude_spectrum(waveform)
return magnitude ** 2
def frequency_bins(self, n_fft: int) -> np.ndarray:
"""
Get frequency values for each FFT bin.
"""
return np.fft.rfftfreq(n_fft, d=1/self.config.sample_rate)
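# Quick sanity check (illustrative sketch): the magnitude spectrum of a
# 440 Hz sine should peak at the FFT bin nearest 440 Hz. With one second
# of audio, each bin is exactly 1 Hz wide.
def fft_peak_demo():
    config = AudioConfig()
    analysis = SpectralAnalysis(config)
    n = config.sample_rate                                  # 1 second of audio
    t = torch.arange(n, dtype=torch.float32) / config.sample_rate
    sine = torch.sin(2 * np.pi * 440.0 * t)
    magnitude = analysis.magnitude_spectrum(sine)
    freqs = analysis.frequency_bins(n)                      # 1 Hz per bin here
    print(f"Peak at {freqs[magnitude.argmax().item()]:.1f} Hz")  # ~440.0 Hz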
class STFT(nn.Module):
"""
Short-Time Fourier Transform.
Analyzes how frequency content changes over time by applying
FFT to overlapping windows of the signal.
"""
def __init__(self, config: AudioConfig):
super().__init__()
self.config = config
# Create window function
if config.window == "hann":
window = torch.hann_window(config.n_fft)
elif config.window == "hamming":
window = torch.hamming_window(config.n_fft)
else:
window = torch.ones(config.n_fft)
self.register_buffer('window', window)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute STFT.
Args:
waveform: [batch, samples] audio signal
Returns:
stft: [batch, freq_bins, time_frames] complex spectrogram
"""
        # Add batch dim if needed
if waveform.dim() == 1:
waveform = waveform.unsqueeze(0)
# Compute STFT
stft = torch.stft(
waveform,
n_fft=self.config.n_fft,
hop_length=self.config.hop_length,
win_length=self.config.n_fft,
window=self.window,
center=True,
pad_mode='reflect',
normalized=False,
onesided=True,
return_complex=True
)
return stft
def spectrogram(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute magnitude spectrogram.
Returns:
spectrogram: [batch, freq_bins, time_frames]
"""
stft = self.forward(waveform)
return torch.abs(stft)
def power_spectrogram(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute power spectrogram.
"""
return self.spectrogram(waveform) ** 2
def phase(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Extract phase information.
"""
stft = self.forward(waveform)
return torch.angle(stft)
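# Shape check (illustrative sketch): with center=True, torch.stft produces
# 1 + num_samples // hop_length frames and n_fft // 2 + 1 frequency bins.
def stft_shape_demo():
    config = AudioConfig()                                  # n_fft=400, hop=160
    stft = STFT(config)
    waveform = torch.randn(1, config.sample_rate)           # 1 s at 16 kHz
    spec = stft.spectrogram(waveform)
    # Expected: [1, 201, 101] -> 201 = 400 // 2 + 1, 101 = 1 + 16000 // 160
    print(spec.shape)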
class InverseSTFT(nn.Module):
"""
Inverse Short-Time Fourier Transform.
Reconstructs waveform from STFT representation.
"""
def __init__(self, config: AudioConfig):
super().__init__()
self.config = config
if config.window == "hann":
window = torch.hann_window(config.n_fft)
else:
window = torch.ones(config.n_fft)
self.register_buffer('window', window)
def forward(
self,
stft: torch.Tensor,
length: Optional[int] = None
) -> torch.Tensor:
"""
Reconstruct waveform from STFT.
Args:
stft: [batch, freq_bins, time_frames] complex spectrogram
length: Target output length
Returns:
waveform: [batch, samples]
"""
waveform = torch.istft(
stft,
n_fft=self.config.n_fft,
hop_length=self.config.hop_length,
win_length=self.config.n_fft,
window=self.window,
center=True,
normalized=False,
onesided=True,
length=length
)
return waveform
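# Round-trip sketch: STFT followed by inverse STFT reconstructs the waveform
# almost exactly when the same window and hop length are used.
def stft_roundtrip_demo():
    config = AudioConfig()
    stft, istft = STFT(config), InverseSTFT(config)
    waveform = torch.randn(1, 16000)
    complex_spec = stft(waveform)
    reconstructed = istft(complex_spec, length=waveform.size(-1))
    error = (waveform - reconstructed).abs().max().item()
    print(f"Max reconstruction error: {error:.2e}")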
def stft_parameters_explained():
"""Explain STFT parameters and their effects."""
parameters = {
'n_fft': {
'description': 'FFT window size (samples)',
'effect': 'Larger = better frequency resolution, worse time resolution',
'typical': '400-2048 for speech, 2048-4096 for music',
'tradeoff': 'Time-frequency resolution tradeoff (uncertainty principle)'
},
'hop_length': {
'description': 'Samples between consecutive frames',
'effect': 'Smaller = more frames, finer time resolution',
'typical': 'n_fft // 4 for 75% overlap',
'tradeoff': 'Computation vs. time resolution'
},
'window': {
'description': 'Window function applied to each frame',
'options': ['hann', 'hamming', 'blackman', 'rectangular'],
'effect': 'Reduces spectral leakage (artifacts from non-periodic signals)',
'typical': 'Hann window most common'
},
'center': {
'description': 'Whether to pad signal so frames are centered',
'effect': 'True = first/last frames centered on signal edges',
'typical': 'True for most applications'
}
}
print("STFT Parameters:")
print("=" * 60)
for param, info in parameters.items():
print(f"\n{param}:")
for k, v in info.items():
print(f" {k}: {v}")Mel Spectrograms
Mel spectrograms provide a perceptually-motivated representation:
class MelSpectrogram(nn.Module):
"""
Mel Spectrogram computation.
Applies mel filterbank to power spectrogram, mimicking
human auditory perception (logarithmic frequency scaling).
"""
def __init__(self, config: AudioConfig):
super().__init__()
self.config = config
self.stft = STFT(config)
# Create mel filterbank
mel_fb = self._create_mel_filterbank()
self.register_buffer('mel_filterbank', mel_fb)
def _create_mel_filterbank(self) -> torch.Tensor:
"""
Create mel filterbank matrix.
Maps linear frequency bins to mel frequency bins.
"""
n_freqs = self.config.n_fft // 2 + 1
# Mel scale conversion functions
def hz_to_mel(hz):
return 2595 * np.log10(1 + hz / 700)
def mel_to_hz(mel):
return 700 * (10 ** (mel / 2595) - 1)
# Frequency range
f_min = self.config.f_min
f_max = self.config.f_max or self.config.sample_rate / 2
# Mel points
mel_min = hz_to_mel(f_min)
mel_max = hz_to_mel(f_max)
mel_points = np.linspace(mel_min, mel_max, self.config.n_mels + 2)
hz_points = mel_to_hz(mel_points)
# Convert to FFT bin indices
bin_points = np.floor(
(self.config.n_fft + 1) * hz_points / self.config.sample_rate
).astype(int)
# Create filterbank
filterbank = np.zeros((self.config.n_mels, n_freqs))
for i in range(self.config.n_mels):
# Rising edge
for j in range(bin_points[i], bin_points[i + 1]):
filterbank[i, j] = (j - bin_points[i]) / (bin_points[i + 1] - bin_points[i])
# Falling edge
for j in range(bin_points[i + 1], bin_points[i + 2]):
filterbank[i, j] = (bin_points[i + 2] - j) / (bin_points[i + 2] - bin_points[i + 1])
return torch.FloatTensor(filterbank)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute mel spectrogram.
Args:
waveform: [batch, samples] audio signal
Returns:
mel_spec: [batch, n_mels, time_frames]
"""
# Compute power spectrogram
power_spec = self.stft.power_spectrogram(waveform)
# Apply mel filterbank
mel_spec = torch.matmul(self.mel_filterbank, power_spec)
return mel_spec
def log_mel_spectrogram(
self,
waveform: torch.Tensor,
log_offset: float = 1e-6
) -> torch.Tensor:
"""
Compute log mel spectrogram.
Log compression matches human loudness perception.
"""
mel_spec = self.forward(waveform)
return torch.log(mel_spec + log_offset)
def normalized_log_mel(
self,
waveform: torch.Tensor,
mean: Optional[torch.Tensor] = None,
std: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""
Compute normalized log mel spectrogram.
Normalization improves model training stability.
"""
log_mel = self.log_mel_spectrogram(waveform)
if mean is not None and std is not None:
log_mel = (log_mel - mean) / (std + 1e-6)
        else:
            # Per-utterance normalization (statistics computed per example)
            utt_mean = log_mel.mean(dim=(-2, -1), keepdim=True)
            utt_std = log_mel.std(dim=(-2, -1), keepdim=True)
            log_mel = (log_mel - utt_mean) / (utt_std + 1e-6)
return log_mel
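# Usage sketch: log mel features for one second of synthetic audio.
# With hop_length=160 at 16 kHz, each frame covers 10 ms.
def mel_demo():
    config = AudioConfig()
    mel = MelSpectrogram(config)
    t = torch.arange(16000, dtype=torch.float32) / config.sample_rate
    waveform = torch.sin(2 * np.pi * 440.0 * t).unsqueeze(0)   # [1, 16000]
    log_mel = mel.log_mel_spectrogram(waveform)
    print(log_mel.shape)                                       # [1, 80, 101]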
class MelScale:
"""
Mel scale utilities.
The mel scale approximates human perception of pitch,
where equal distances sound equally different to humans.
"""
@staticmethod
def hz_to_mel(hz: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
"""Convert Hz to mel scale."""
return 2595 * np.log10(1 + hz / 700)
@staticmethod
def mel_to_hz(mel: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
"""Convert mel to Hz."""
return 700 * (10 ** (mel / 2595) - 1)
@staticmethod
def visualize_mel_scale():
"""Visualize mel scale vs linear frequency."""
hz = np.linspace(0, 8000, 100)
mel = MelScale.hz_to_mel(hz)
print("Mel Scale Mapping:")
print("-" * 40)
for h in [100, 500, 1000, 2000, 4000, 8000]:
m = MelScale.hz_to_mel(h)
print(f" {h:5d} Hz -> {m:7.1f} mel")
MelScale.visualize_mel_scale()

MFCCs (Mel-Frequency Cepstral Coefficients)
MFCCs are a compact representation widely used in speech processing:
class MFCC(nn.Module):
"""
Mel-Frequency Cepstral Coefficients.
Compact representation that captures spectral envelope
while removing fine harmonic structure.
"""
def __init__(
self,
config: AudioConfig,
n_mfcc: int = 13,
include_deltas: bool = True
):
super().__init__()
self.config = config
self.n_mfcc = n_mfcc
self.include_deltas = include_deltas
self.mel_spec = MelSpectrogram(config)
# DCT matrix for cepstral coefficients
dct_matrix = self._create_dct_matrix()
self.register_buffer('dct_matrix', dct_matrix)
def _create_dct_matrix(self) -> torch.Tensor:
"""
Create DCT-II matrix.
DCT decorrelates mel bands and compacts energy
into lower coefficients.
"""
n_mels = self.config.n_mels
n_mfcc = self.n_mfcc
# DCT-II matrix
dct = np.zeros((n_mfcc, n_mels))
for k in range(n_mfcc):
for n in range(n_mels):
dct[k, n] = np.cos(np.pi * k * (n + 0.5) / n_mels)
# Orthonormalize
dct[0] *= 1 / np.sqrt(2)
dct *= np.sqrt(2 / n_mels)
return torch.FloatTensor(dct)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute MFCCs.
Args:
waveform: [batch, samples]
Returns:
            mfcc: [batch, n_mfcc, time_frames], or [batch, 3 * n_mfcc, time_frames] with deltas
"""
# Log mel spectrogram
log_mel = self.mel_spec.log_mel_spectrogram(waveform)
# Apply DCT
mfcc = torch.matmul(self.dct_matrix, log_mel)
if self.include_deltas:
# First derivative (delta)
delta = self._compute_delta(mfcc)
# Second derivative (delta-delta)
delta2 = self._compute_delta(delta)
# Concatenate
mfcc = torch.cat([mfcc, delta, delta2], dim=1)
return mfcc
def _compute_delta(
self,
features: torch.Tensor,
order: int = 2
) -> torch.Tensor:
"""
Compute delta (derivative) features.
Uses regression over nearby frames.
"""
# Pad for edge frames
padded = F.pad(features, (order, order), mode='replicate')
# Compute weighted sum
delta = torch.zeros_like(features)
norm = 2 * sum(i ** 2 for i in range(1, order + 1))
for i in range(1, order + 1):
delta += i * (
padded[..., order + i:padded.size(-1) - order + i] -
padded[..., order - i:padded.size(-1) - order - i]
)
return delta / norm
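# Usage sketch: 13 cepstral coefficients plus deltas and delta-deltas give
# 39 features per frame, the classic front-end for traditional ASR systems.
def mfcc_demo():
    config = AudioConfig()
    mfcc = MFCC(config, n_mfcc=13, include_deltas=True)
    waveform = torch.randn(1, 16000)                        # 1 s of noise
    features = mfcc(waveform)
    print(features.shape)                                   # [1, 39, 101]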
def feature_comparison():
"""Compare different audio features."""
features = {
'Waveform': {
'dimensions': 'samples',
'info': 'Raw time-domain signal',
'pros': 'Complete information, end-to-end learning',
'cons': 'Very high dimensional, hard to learn from',
'use_cases': 'WaveNet, raw audio models'
},
'Spectrogram': {
'dimensions': 'freq_bins x time_frames',
'info': 'Time-frequency representation',
'pros': 'Shows frequency content over time',
'cons': 'Linear frequency scale not perceptually motivated',
'use_cases': 'General audio analysis'
},
'Mel Spectrogram': {
'dimensions': 'n_mels x time_frames (e.g., 80 x T)',
'info': 'Perceptually-motivated frequency scale',
'pros': 'Matches human perception, compact',
'cons': 'Loses phase information',
'use_cases': 'ASR, TTS, speaker recognition'
},
'Log Mel Spectrogram': {
'dimensions': 'n_mels x time_frames',
'info': 'Log-compressed mel spectrogram',
'pros': 'Log matches loudness perception, better dynamic range',
'cons': 'Still loses phase',
'use_cases': 'Most modern speech models (Whisper, etc.)'
},
'MFCC': {
'dimensions': 'n_mfcc x time_frames (e.g., 13-39 x T)',
'info': 'Cepstral coefficients from mel spectrum',
'pros': 'Very compact, decorrelated features',
'cons': 'Loses some information, may hurt end-to-end learning',
'use_cases': 'Traditional ASR, speaker verification'
}
}
print("Audio Feature Comparison:")
print("=" * 70)
for name, info in features.items():
print(f"\n{name}:")
for k, v in info.items():
print(f" {k}: {v}")
feature_comparison()

Audio Data Augmentation
Augmentation improves model robustness:
class AudioAugmentation:
"""
Data augmentation for audio signals.
Improves model robustness to noise, variations in
recording conditions, and speaker differences.
"""
def __init__(self, config: AudioConfig):
self.config = config
def add_noise(
self,
waveform: torch.Tensor,
snr_db: float = 20.0
) -> torch.Tensor:
"""
Add Gaussian noise at specified SNR.
Args:
waveform: Input audio
snr_db: Signal-to-noise ratio in dB
"""
signal_power = waveform.pow(2).mean()
noise_power = signal_power / (10 ** (snr_db / 10))
noise = torch.randn_like(waveform) * noise_power.sqrt()
return waveform + noise
def time_stretch(
self,
waveform: torch.Tensor,
rate: float = 1.0
) -> torch.Tensor:
"""
Time stretch without changing pitch.
Args:
rate: Stretch factor (>1 = slower, <1 = faster)
"""
# Simple resampling (changes pitch too)
# For true time stretch, use phase vocoder
n_samples = int(waveform.size(-1) * rate)
return F.interpolate(
waveform.unsqueeze(0),
size=n_samples,
mode='linear',
align_corners=False
).squeeze(0)
def pitch_shift(
self,
waveform: torch.Tensor,
semitones: float = 0.0
) -> torch.Tensor:
"""
Shift pitch by semitones.
Uses resampling followed by time stretch to maintain duration.
"""
ratio = 2 ** (semitones / 12)
# Resample (changes pitch and duration)
n_samples = int(waveform.size(-1) / ratio)
resampled = F.interpolate(
waveform.unsqueeze(0),
size=n_samples,
mode='linear',
align_corners=False
).squeeze(0)
# Time stretch back to original duration
return F.interpolate(
resampled.unsqueeze(0),
size=waveform.size(-1),
mode='linear',
align_corners=False
).squeeze(0)
def random_crop(
self,
waveform: torch.Tensor,
crop_length: int
) -> torch.Tensor:
"""
Random crop to fixed length.
"""
if waveform.size(-1) <= crop_length:
# Pad if too short
padding = crop_length - waveform.size(-1)
return F.pad(waveform, (0, padding))
start = torch.randint(0, waveform.size(-1) - crop_length, (1,)).item()
return waveform[..., start:start + crop_length]
def volume_perturbation(
self,
waveform: torch.Tensor,
gain_db_range: Tuple[float, float] = (-10, 10)
) -> torch.Tensor:
"""
Random volume change.
"""
gain_db = torch.empty(1).uniform_(*gain_db_range).item()
gain = 10 ** (gain_db / 20)
return waveform * gain
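# Usage sketch: chain waveform-level augmentations during training.
# Each transform returns a tensor of the same (or cropped) length.
def augment_demo():
    config = AudioConfig()
    aug = AudioAugmentation(config)
    waveform = torch.randn(1, 32000)                        # 2 s of audio
    waveform = aug.add_noise(waveform, snr_db=15.0)
    waveform = aug.volume_perturbation(waveform, gain_db_range=(-6, 6))
    waveform = aug.random_crop(waveform, crop_length=16000)
    print(waveform.shape)                                   # [1, 16000]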
class SpecAugment(nn.Module):
"""
SpecAugment: Spectrogram augmentation.
Applies time and frequency masking to spectrograms
during training for regularization.
"""
def __init__(
self,
freq_mask_param: int = 27,
time_mask_param: int = 100,
n_freq_masks: int = 2,
n_time_masks: int = 2
):
super().__init__()
self.freq_mask_param = freq_mask_param
self.time_mask_param = time_mask_param
self.n_freq_masks = n_freq_masks
self.n_time_masks = n_time_masks
def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
"""
Apply SpecAugment.
Args:
spectrogram: [batch, freq, time] or [freq, time]
Returns:
Augmented spectrogram
"""
augmented = spectrogram.clone()
if augmented.dim() == 2:
augmented = augmented.unsqueeze(0)
batch_size, n_freq, n_time = augmented.shape
for _ in range(self.n_freq_masks):
f = torch.randint(0, self.freq_mask_param, (1,)).item()
f0 = torch.randint(0, max(1, n_freq - f), (1,)).item()
augmented[:, f0:f0 + f, :] = 0
for _ in range(self.n_time_masks):
t = torch.randint(0, self.time_mask_param, (1,)).item()
t = min(t, int(n_time * 0.5)) # Limit to 50% of length
t0 = torch.randint(0, max(1, n_time - t), (1,)).item()
augmented[:, :, t0:t0 + t] = 0
return augmented.squeeze(0) if spectrogram.dim() == 2 else augmented
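# Usage sketch: SpecAugment is applied to log mel features at training time
# only, never at inference. The masks zero out random frequency and time bands.
def specaugment_demo():
    config = AudioConfig()
    mel = MelSpectrogram(config)
    spec_augment = SpecAugment(freq_mask_param=27, time_mask_param=40)
    log_mel = mel.log_mel_spectrogram(torch.randn(1, 16000))  # [1, 80, 101]
    augmented = spec_augment(log_mel)
    masked_fraction = (augmented == 0).float().mean().item()
    print(augmented.shape, f"masked fraction: {masked_fraction:.2f}")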
class RoomSimulator:
"""
Simulate room acoustics with reverberation.
"""
def __init__(self, sample_rate: int = 16000):
self.sample_rate = sample_rate
def generate_impulse_response(
self,
rt60: float = 0.5,
room_dim: Tuple[float, float, float] = (5, 4, 3)
) -> torch.Tensor:
"""
Generate synthetic room impulse response.
Args:
rt60: Reverberation time (time for 60dB decay)
room_dim: Room dimensions in meters (length, width, height)
Returns:
Impulse response
"""
# Simple exponential decay model
duration = rt60 * 2
n_samples = int(duration * self.sample_rate)
# Generate noise and apply exponential decay
ir = torch.randn(n_samples)
decay = torch.exp(-6.9 * torch.arange(n_samples).float() / (rt60 * self.sample_rate))
ir = ir * decay
# Normalize
ir = ir / ir.abs().max()
return ir
def apply_reverb(
self,
waveform: torch.Tensor,
impulse_response: torch.Tensor,
wet_level: float = 0.3
) -> torch.Tensor:
"""
Apply reverberation using convolution.
"""
        # Full convolution with the IR (conv1d computes cross-correlation,
        # so the IR is flipped to obtain a true convolution)
        reverb = F.conv1d(
            waveform.reshape(1, 1, -1),
            impulse_response.flip(0).reshape(1, 1, -1),
            padding=impulse_response.size(0) - 1
        ).squeeze()
# Truncate to original length
reverb = reverb[:waveform.size(-1)]
# Mix dry and wet
        return (1 - wet_level) * waveform + wet_level * reverb

Key Takeaways
Audio signal processing provides the foundational representations for all speech and audio AI. Key concepts include: (1) digital audio fundamentals like sampling rate and Nyquist frequency that determine what frequencies can be captured, (2) the Short-Time Fourier Transform (STFT) that reveals how frequency content changes over time, (3) mel spectrograms that provide perceptually-motivated frequency scaling matching human hearing, and (4) MFCCs that offer compact, decorrelated features. Modern deep learning models typically use log mel spectrograms as input, with SpecAugment providing regularization during training. Understanding these representations is essential for building effective speech recognition, synthesis, and audio understanding systems.
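As a rough end-to-end sketch built from the classes above (the downstream model is left as a placeholder), a typical training-time feature pipeline chains waveform augmentation, log mel extraction, and SpecAugment:

# Hypothetical training-time feature pipeline assembled from the classes above.
config = AudioConfig()
augment = AudioAugmentation(config)
mel = MelSpectrogram(config)
spec_augment = SpecAugment()

waveform = torch.randn(1, 16000)                  # stand-in for loaded audio
waveform = augment.add_noise(waveform, snr_db=20.0)
features = mel.normalized_log_mel(waveform)       # [1, 80, 101]
features = spec_augment(features)                 # masked for regularization
# features would then be fed to an acoustic model (e.g. an ASR encoder)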