//! 音频特征提取模块 //! //! 实现从原始音频到模型输入特征的转换: //! - 预加重 //! - 分帧 & 加窗 //! - FFT + 功率谱 //! - Mel 滤波器组 //! - 对数能量 (log fbank) use realfft::{RealFftPlanner, num_complex::Complex}; /// 特征提取配置 #[derive(Debug, Clone)] pub struct FeatureConfig { /// 采样率 pub sample_rate: u32, /// FFT 窗口大小 pub n_fft: usize, /// 帧移 (hop length) pub hop_length: usize, /// 窗长 (win length) pub win_length: usize, /// Mel 滤波器数量 pub n_mels: usize, /// 最低频率 pub f_min: f32, /// 最高频率 pub f_max: f32, /// 预加重系数 pub pre_emphasis: f32, } impl Default for FeatureConfig { fn default() -> Self { Self { sample_rate: 16000, n_fft: 512, hop_length: 160, // 10ms at 16kHz win_length: 400, // 25ms at 16kHz n_mels: 80, f_min: 0.0, f_max: 8000.0, pre_emphasis: 0.97, } } } /// 从原始音频提取 log mel fbank 特征 pub fn extract_fbank(samples: &[f32], config: &FeatureConfig) -> Vec { // 1. 预加重 let emphasized = pre_emphasis(samples, config.pre_emphasis); // 2. 分帧加窗 let frames = frame(&emphasized, config.n_fft, config.hop_length, config.win_length); if frames.is_empty() { return vec![]; } // 3. FFT + 功率谱 + Mel 滤波器组 + 对数 let n_spec = config.n_fft / 2 + 1; let mut planner = RealFftPlanner::::new(); let r2c = planner.plan_fft_forward(config.n_fft); // 预计算汉宁窗和 mel 权重 let window: Vec = (0..config.win_length) .map(|i| { 0.5 * (1.0 - (2.0 * std::f32::consts::PI * i as f32 / (config.win_length - 1) as f32).cos()) }) .collect(); let mel_weights = create_mel_filterbank( config.n_fft, config.sample_rate, config.n_mels, config.f_min, config.f_max, ); // 复用缓冲区 let mut fft_input = vec![0.0f32; config.n_fft]; let mut fft_output = vec![Complex::new(0.0f32, 0.0f32); n_spec]; let mut mel_frame = vec![0.0f32; config.n_mels]; let mut all_features = Vec::new(); for frame_data in &frames { // 加窗 let copy_len = config.win_length.min(frame_data.len()); for i in 0..copy_len { fft_input[i] = frame_data[i] * window[i]; } for i in copy_len..config.n_fft { fft_input[i] = 0.0; } // FFT r2c.process(&mut fft_input, &mut fft_output).expect("FFT 失败"); // 计算 mel 能量 (直接在 FFT 输出上计算) for m in 0..config.n_mels { let mut energy = 0.0f32; for (i, weight) in mel_weights[m].iter().enumerate() { if i >= n_spec { break; } let re = fft_output[i].re; let im = fft_output[i].im; energy += (re * re + im * im) * weight * weight / config.n_fft as f32; } // 对数 mel_frame[m] = (energy + 1e-10).ln(); } all_features.extend_from_slice(&mel_frame); } all_features } /// 预加重滤波 fn pre_emphasis(samples: &[f32], coef: f32) -> Vec { if samples.len() < 2 { return samples.to_vec(); } let mut output = Vec::with_capacity(samples.len()); output.push(samples[0]); for i in 1..samples.len() { output.push(samples[i] - coef * samples[i - 1]); } output } /// 分帧 + 汉宁窗 fn frame(samples: &[f32], n_fft: usize, hop_length: usize, _win_length: usize) -> Vec> { let mut frames = Vec::new(); let mut start = 0; while start + n_fft <= samples.len() { let frame_data = samples[start..start + n_fft].to_vec(); frames.push(frame_data); start += hop_length; } frames } /// 频率到 mel fn hz_to_mel(hz: f32) -> f32 { 2595.0 * (1.0 + hz / 700.0).log10() } /// mel 到频率 fn mel_to_hz(mel: f32) -> f32 { 700.0 * (10.0_f32.powf(mel / 2595.0) - 1.0) } /// 创建 Mel 滤波器组 fn create_mel_filterbank( n_fft: usize, sample_rate: u32, n_mels: usize, f_min: f32, f_max: f32, ) -> Vec> { let n_spec = n_fft / 2 + 1; let mel_min = hz_to_mel(f_min); let mel_max = hz_to_mel(f_max.min(sample_rate as f32 / 2.0)); let mel_points: Vec = (0..=n_mels + 1) .map(|i| mel_min + (mel_max - mel_min) * i as f32 / (n_mels + 1) as f32) .collect(); let hz_points: Vec = mel_points.iter().map(|&m| mel_to_hz(m)).collect(); let bin_points: Vec = hz_points .iter() .map(|&h| ((n_fft as f32 + 1.0) * h / sample_rate as f32).floor() as usize) .collect(); let mut filterbank = vec![vec![0.0f32; n_spec]; n_mels]; for m in 0..n_mels { let left = bin_points[m]; let center = bin_points[m + 1]; let right = bin_points[m + 2].min(n_spec - 1); for i in left..center { if center > left { filterbank[m][i] = (i as f32 - left as f32) / (center as f32 - left as f32); } } for i in center..=right { if right > center { filterbank[m][i] = (right as f32 - i as f32) / (right as f32 - center as f32); } } } filterbank } /// 完整的特征提取管线: 原始音频 → 展平的 log mel fbank pub fn audio_to_features( samples: &[f32], sample_rate: u32, ) -> (Vec, usize, usize) { let config = FeatureConfig { sample_rate, ..Default::default() }; let features = extract_fbank(samples, &config); let n_frames = if config.n_mels > 0 { features.len() / config.n_mels } else { 0 }; (features, n_frames, config.n_mels) } #[cfg(test)] mod tests { use super::*; #[test] fn test_mel_conversion() { let mel = hz_to_mel(1000.0); assert!((mel_to_hz(mel) - 1000.0).abs() < 1.0); } #[test] fn test_pre_emphasis() { let input = vec![1.0, 1.0, 1.0, 1.0]; let output = pre_emphasis(&input, 0.97); assert_eq!(output[0], 1.0); assert_eq!(output[1], 1.0 - 0.97); } #[test] fn test_fbank_shape() { let samples = vec![0.0f32; 16000]; let (features, n_frames, n_mels) = audio_to_features(&samples, 16000); assert_eq!(n_mels, 80); assert!(n_frames > 0); assert_eq!(features.len(), n_frames * n_mels); } }