From 15b789d5c9e8c8e65374c47d8f62223f4f7583b8 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Sun, 4 Sep 2022 22:08:18 +0800 Subject: [PATCH] rm compliance&&add function test --- python/paddle/audio/__init__.py | 3 +- python/paddle/audio/compliance/__init__.py | 15 - python/paddle/audio/compliance/librosa.py | 442 --------------------- python/paddle/tests/test_audio_features.py | 384 ++++++++++-------- python/unittest_py/requirements.txt | 1 + 5 files changed, 219 insertions(+), 626 deletions(-) delete mode 100644 python/paddle/audio/compliance/__init__.py delete mode 100644 python/paddle/audio/compliance/librosa.py diff --git a/python/paddle/audio/__init__.py b/python/paddle/audio/__init__.py index 6895385ad3d08..e76a80300f5e6 100644 --- a/python/paddle/audio/__init__.py +++ b/python/paddle/audio/__init__.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import compliance from . import features from . import functional from . import utils -__all__ = ["compliance", "functional", "features", "utils"] +__all__ = ["functional", "features", "utils"] diff --git a/python/paddle/audio/compliance/__init__.py b/python/paddle/audio/compliance/__init__.py deleted file mode 100644 index 6083e3eba77ec..0000000000000 --- a/python/paddle/audio/compliance/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import librosa diff --git a/python/paddle/audio/compliance/librosa.py b/python/paddle/audio/compliance/librosa.py deleted file mode 100644 index ba8d1651e234d..0000000000000 --- a/python/paddle/audio/compliance/librosa.py +++ /dev/null @@ -1,442 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from librosa(https://github.com/librosa/librosa) - -import warnings -from typing import List -from typing import Optional -from typing import Union - -import numpy as np -import scipy -from numpy.lib.stride_tricks import as_strided -from scipy import signal -from scipy import fftpack -import paddle - -from ..utils import ParameterError -from ..functional import mel_frequencies -from ..functional import power_to_db - -__all__ = [ - # dsp - 'stft', - 'mfcc', - 'compute_fbank_matrix', - 'melspectrogram', - 'spectrogram', -] - - -def _pad_center(data: np.ndarray, - size: int, - axis: int = -1, - **kwargs) -> np.ndarray: - """Pad an array to a target length along a target axis. - - This differs from `np.pad` by centering the data prior to padding, - analogous to `str.center` - """ - - kwargs.setdefault("mode", "constant") - n = data.shape[axis] - lpad = int((size - n) // 2) - lengths = [(0, 0)] * data.ndim - lengths[axis] = (lpad, int(size - n - lpad)) - - if lpad < 0: - raise ParameterError(("Target size ({size:d}) must be " - "at least input size ({n:d})")) - - return np.pad(data, lengths, **kwargs) - - -def _split_frames(x: np.ndarray, - frame_length: int, - hop_length: int, - axis: int = -1) -> np.ndarray: - """Slice a data array into (overlapping) frames. - - This function is aligned with librosa.frame - """ - - if not isinstance(x, np.ndarray): - raise ParameterError( - f"Input must be of type numpy.ndarray, given type(x)={type(x)}") - - if x.shape[axis] < frame_length: - raise ParameterError(f"Input is too short (n={x.shape[axis]:d})" - f" for frame_length={frame_length:d}") - - if hop_length < 1: - raise ParameterError(f"Invalid hop_length: {hop_length:d}") - - if axis == -1 and not x.flags["F_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.asfortranarray(x) - elif axis == 0 and not x.flags["C_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.ascontiguousarray(x) - - n_frames = 1 + (x.shape[axis] - frame_length) // hop_length - strides = np.asarray(x.strides) - - new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize - - if axis == -1: - shape = list(x.shape)[:-1] + [frame_length, n_frames] - strides = list(strides) + [hop_length * new_stride] - - elif axis == 0: - shape = [n_frames, frame_length] + list(x.shape)[1:] - strides = [hop_length * new_stride] + list(strides) - - else: - raise ParameterError(f"Frame axis={axis} must be either 0 or -1") - - return as_strided(x, shape=shape, strides=strides) - - -def _check_audio(y, mono=True) -> bool: - """Determine whether a variable contains valid audio data. - - The audio y must be a np.ndarray, ether 1-channel or two channel - """ - if not isinstance(y, np.ndarray): - raise ParameterError("Audio data must be of type numpy.ndarray") - if y.ndim > 2: - raise ParameterError( - f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}") - - if mono and y.ndim == 2: - raise ParameterError( - f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}") - - if (mono and len(y) == 0) or (not mono and y.shape[1] < 0): - raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}") - - if not np.issubdtype(y.dtype, np.floating): - raise ParameterError("Audio data must be floating-point") - - if not np.isfinite(y).all(): - raise ParameterError("Audio buffer is not finite everywhere") - - return True - - -def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: - """Compute fourier frequencies. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - - Returns: - np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. - """ - return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int = 128, - fmin: float = 0.0, - fmax: Optional[float] = None, - htk: bool = False, - norm: str = "slaney", - dtype: type = np.float32) -> np.ndarray: - """Compute fbank matrix. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - n_mels (int, optional): Number of mel bins. Defaults to 128. - fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - htk (bool, optional): Use htk scaling. Defaults to False. - norm (str, optional): Type of normalization. Defaults to "slaney". - dtype (type, optional): Data type. Defaults to np.float32. - - - Returns: - np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. - """ - if norm != "slaney": - raise ParameterError('norm must set to slaney') - - if fmax is None: - fmax = float(sr) / 2 - - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f_t = mel_frequencies(n_mels=n_mels + 2, - f_min=fmin, - f_max=fmax, - htk=htk) - mel_f = mel_f_t.numpy() - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - if norm == "slaney": - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - # Only check weights if f_mel[0] is positive - if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): - # This means we have an empty channel somewhere - warnings.warn("Empty filters detected in mel frequency basis. " - "Some channels will produce empty responses. " - "Try increasing your sampling rate (and fmax) or " - "reducing n_mels.") - - return weights - - -def stft(x: np.ndarray, - n_fft: int = 2048, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, - window: str = "hann", - center: bool = True, - dtype: type = np.complex64, - pad_mode: str = "reflect") -> np.ndarray: - """Short-time Fourier transform (STFT). - - Args: - x (np.ndarray): Input waveform in one dimension. - n_fft (int, optional): FFT size. Defaults to 2048. - hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. - win_length (Optional[int], optional): The size of window. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - dtype (type, optional): Data type of STFT results. Defaults to np.complex64. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - - Returns: - np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. - """ - _check_audio(x) - - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - fft_window = signal.get_window(window, win_length, fftbins=True) - - # Pad the window out to n_fft size - fft_window = _pad_center(fft_window, n_fft) - - # Reshape so that the window can be broadcast - fft_window = fft_window.reshape((-1, 1)) - - # Pad the time series so that frames are centered - if center: - if n_fft > x.shape[-1]: - warnings.warn( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - x = np.pad(x, int(n_fft // 2), mode=pad_mode) - - elif n_fft > x.shape[-1]: - raise ParameterError( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - - # Window the time series. - x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) - # Pre-allocate the STFT matrix - stft_matrix = np.empty((int(1 + n_fft // 2), x_frames.shape[1]), - dtype=dtype, - order="F") - fft = np.fft # use numpy fft as default - # Constrain STFT block sizes to 256 KB - MAX_MEM_BLOCK = 2**8 * 2**10 - # how many columns can we fit within MAX_MEM_BLOCK? - n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) - n_columns = max(n_columns, 1) - - for bl_s in range(0, stft_matrix.shape[1], n_columns): - bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) - stft_matrix[:, - bl_s:bl_t] = fft.rfft(fft_window * x_frames[:, bl_s:bl_t], - axis=0) - - return stft_matrix - - -def mfcc(x: np.ndarray, - sr: int = 16000, - spect: Optional[np.ndarray] = None, - n_mfcc: int = 20, - dct_type: int = 2, - norm: str = "ortho", - lifter: int = 0, - **kwargs) -> np.ndarray: - """Mel-frequency cepstral coefficients (MFCCs) - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. - dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. - norm (str, optional): Type of normalization. Defaults to "ortho". - lifter (int, optional): Cepstral filtering. Defaults to 0. - - Returns: - np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. - """ - if spect is None: - spect = melspectrogram(x, sr=sr, **kwargs) - - M = fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] - - if lifter > 0: - factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / - lifter) - return M * factor[:, np.newaxis] - elif lifter == 0: - return M - else: - raise ParameterError( - f"MFCC lifter={lifter} must be a non-negative number") - - -def melspectrogram(x: np.ndarray, - sr: int = 16000, - window_size: int = 512, - hop_length: int = 320, - n_mels: int = 64, - fmin: float = 50.0, - fmax: Optional[float] = None, - window: str = 'hann', - center: bool = True, - pad_mode: str = 'reflect', - power: float = 2.0, - to_db: bool = True, - ref: float = 1.0, - amin: float = 1e-10, - top_db: Optional[float] = None) -> np.ndarray: - """Compute mel-spectrogram. - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - window_size (int, optional): Size of FFT and window length. Defaults to 512. - hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. - n_mels (int, optional): Number of mel bins. Defaults to 64. - fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - to_db (bool, optional): Enable db scale. Defaults to True. - ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. - - Returns: - np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`. - """ - _check_audio(x, mono=True) - if len(x) <= 0: - raise ParameterError('The input waveform is empty') - - if fmax is None: - fmax = sr // 2 - if fmin < 0 or fmin >= fmax: - raise ParameterError('fmin and fmax must statisfy 0 np.ndarray: - """Compute spectrogram. - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - window_size (int, optional): Size of FFT and window length. Defaults to 512. - hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - - Returns: - np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`. - """ - - s = stft(x, - n_fft=window_size, - hop_length=hop_length, - win_length=window_size, - window=window, - center=center, - pad_mode=pad_mode) - - return np.abs(s)**power diff --git a/python/paddle/tests/test_audio_features.py b/python/paddle/tests/test_audio_features.py index 5ac7f802c9d9a..036b11cdf59da 100644 --- a/python/paddle/tests/test_audio_features.py +++ b/python/paddle/tests/test_audio_features.py @@ -20,6 +20,12 @@ import paddle.audio from scipy import signal +import itertools +from parameterized import parameterized + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) class TestFeatures(unittest.TestCase): @@ -53,69 +59,105 @@ def get_wav_data(dtype: str, num_channels: int, num_frames: int): num_frames=self.duration * self.sr) self.waveform = waveform_tensor.numpy() - def test_audio_function(self): - mel1 = paddle.audio.functional.hz_to_mel(2.0, True) - mel2 = paddle.audio.functional.hz_to_mel(paddle.to_tensor([9.0])) - self.assertTrue(mel1 == 3.215392139848255) - self.assertTrue(mel2 == paddle.to_tensor([0.13499999])) - hz1 = paddle.audio.functional.mel_to_hz(paddle.to_tensor([9.0]), True) - hz2 = paddle.audio.functional.mel_to_hz(25.0) - self.assertTrue(hz1, paddle.to_tensor(5.61244488)) - self.assertTrue(hz2, 1988.77281) + @parameterize([1.0, 3.0, 9.0, 25.0], [True, False]) + def test_audio_function(self, val: float, htk_flag: bool): + mel_paddle = paddle.audio.functional.hz_to_mel(val, htk_flag) + mel_paddle_tensor = paddle.audio.functional.hz_to_mel( + paddle.to_tensor(val), htk_flag) + mel_librosa = librosa.hz_to_mel(val, htk_flag) + np.testing.assert_almost_equal(mel_paddle, mel_librosa, decimal=5) + np.testing.assert_almost_equal(mel_paddle_tensor.numpy(), + mel_librosa, + decimal=4) + + hz_paddle = paddle.audio.functional.mel_to_hz(val, htk_flag) + hz_paddle_tensor = paddle.audio.functional.mel_to_hz( + paddle.to_tensor(val), htk_flag) + hz_librosa = librosa.mel_to_hz(val, htk_flag) + np.testing.assert_almost_equal(hz_paddle, hz_librosa, decimal=4) + np.testing.assert_almost_equal(hz_paddle_tensor.numpy(), + hz_librosa, + decimal=4) + + decibel_paddle = paddle.audio.functional.power_to_db( + paddle.to_tensor(val)) + decibel_librosa = librosa.power_to_db(val) + np.testing.assert_almost_equal(decibel_paddle.numpy(), + decibel_paddle, + decimal=5) + + @parameterize([64, 128, 256], [0.0, 0.5, 1.0], [10000, 11025], + [False, True]) + def test_audio_function_mel(self, n_mels: int, f_min: float, f_max: float, + htk_flag: bool): + librosa_mel_freq = librosa.mel_frequencies(n_mels, f_min, f_max, + htk_flag) + paddle_mel_freq = paddle.audio.functional.mel_frequencies( + n_mels, f_min, f_max, htk_flag, 'float64') + np.testing.assert_almost_equal(paddle_mel_freq, + librosa_mel_freq, + decimal=3) + + @parameterize([8000, 16000], [64, 128, 256]) + def test_audio_function_fft(self, sr: int, n_fft: int): + librosa_fft = librosa.fft_frequencies(sr, n_fft) + paddle_fft = paddle.audio.functional.fft_frequencies(sr, n_fft) + np.testing.assert_almost_equal(paddle_fft, librosa_fft, decimal=5) + + @parameterize([1.0, 3.0, 9.0]) + def test_audio_function_exception(self, spect: float): try: - paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]), amin=0) + paddle.audio.functional.power_to_db(paddle.to_tensor([spect]), + amin=0) except Exception: pass try: - paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]), + paddle.audio.functional.power_to_db(paddle.to_tensor([spect]), ref_value=0) except Exception: pass try: - paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]), + paddle.audio.functional.power_to_db(paddle.to_tensor([spect]), top_db=-1) except Exception: pass - def test_window(self): - window_types = [ - "hamming", "hann", "triang", "bohman", "blackman", "cosine", - "tukey", "taylor" - ] - for window_type in window_types: - for n_fft in [1, self.n_fft]: - window_scipy = signal.get_window(window_type, n_fft) - window_paddle = paddle.audio.functional.get_window( - window_type, n_fft) - np.testing.assert_array_almost_equal(window_scipy, - window_paddle.numpy(), - decimal=5) - - for n_fft in [1, self.n_fft]: - window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) - window_paddle_gaussian = paddle.audio.functional.get_window( - ('gaussian', 7), n_fft, False) - np.testing.assert_array_almost_equal(window_scipy_gaussain, - window_paddle_gaussian.numpy(), - decimal=5) - window_scipy_general_gaussain = signal.windows.general_gaussian( - n_fft, 1, 7) - window_paddle_general_gaussian = paddle.audio.functional.get_window( - ('general_gaussian', 1, 7), n_fft, False) - np.testing.assert_array_almost_equal(window_scipy_gaussain, - window_paddle_gaussian.numpy(), - decimal=5) - - window_scipy_exp = signal.windows.exponential(n_fft) - window_paddle_exp = paddle.audio.functional.get_window( - ('exponential', None, 1), n_fft, False) - np.testing.assert_array_almost_equal(window_scipy_exp, - window_paddle_exp.numpy(), - decimal=5) + @parameterize([ + "hamming", "hann", "triang", "bohman", "blackman", "cosine", "tukey", + "taylor" + ], [1, 512]) + def test_window(self, window_type: str, n_fft: int): + window_scipy = signal.get_window(window_type, n_fft) + window_paddle = paddle.audio.functional.get_window(window_type, n_fft) + np.testing.assert_array_almost_equal(window_scipy, + window_paddle.numpy(), + decimal=5) + + @parameterize([1, 512]) + def test_gussian_window_and_exception(self, n_fft: int): + window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) + window_paddle_gaussian = paddle.audio.functional.get_window( + ('gaussian', 7), n_fft, False) + np.testing.assert_array_almost_equal(window_scipy_gaussain, + window_paddle_gaussian.numpy(), + decimal=5) + window_scipy_general_gaussain = signal.windows.general_gaussian( + n_fft, 1, 7) + window_paddle_general_gaussian = paddle.audio.functional.get_window( + ('general_gaussian', 1, 7), n_fft, False) + np.testing.assert_array_almost_equal(window_scipy_gaussain, + window_paddle_gaussian.numpy(), + decimal=5) + window_scipy_exp = signal.windows.exponential(n_fft) + window_paddle_exp = paddle.audio.functional.get_window( + ('exponential', None, 1), n_fft, False) + np.testing.assert_array_almost_equal(window_scipy_exp, + window_paddle_exp.numpy(), + decimal=5) try: window_paddle = paddle.audio.functional.get_window(("kaiser", 1.0), self.n_fft) @@ -138,41 +180,77 @@ def test_window(self): except ValueError: pass - def test_stft(self): + @parameterize([5, 13, 23], [257, 513, 1025]) + def test_create_dct(self, n_mfcc: int, n_mels: int): + + def dct(n_filters, n_input): + basis = np.empty((n_filters, n_input)) + basis[0, :] = 1.0 / np.sqrt(n_input) + samples = np.arange(1, 2 * n_input, 2) * np.pi / (2.0 * n_input) + + for i in range(1, n_filters): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / n_input) + return basis.T + + librosa_dct = dct(n_mfcc, n_mels) + paddle_dct = paddle.audio.functional.create_dct(n_mfcc, n_mels) + np.testing.assert_array_almost_equal(librosa_dct, paddle_dct, decimal=5) + + @parameterize([128, 256, 512], ["hamming", "hann", "triang", "bohman"], + [True, False]) + def test_stft_and_spect(self, n_fft: int, window_str: str, + center_flag: bool): + hop_length = int(n_fft / 4) if len(self.waveform.shape) == 2: # (C, T) self.waveform = self.waveform.squeeze( 0) # 1D input for librosa.feature.melspectrogram feature_librosa = librosa.core.stft( y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, + n_fft=n_fft, + hop_length=hop_length, win_length=None, - window=self.window_str, - center=True, + window=window_str, + center=center_flag, dtype=None, pad_mode=self.pad_mode, ) x = paddle.to_tensor(self.waveform).unsqueeze(0) - window = paddle.audio.functional.get_window(self.window_str, - self.n_fft, + window = paddle.audio.functional.get_window(window_str, + n_fft, dtype=x.dtype) feature_paddle = paddle.signal.stft( x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, + n_fft=n_fft, + hop_length=hop_length, win_length=None, window=window, - center=True, + center=center_flag, pad_mode=self.pad_mode, normalized=False, onesided=True, ).squeeze(0) - np.testing.assert_array_almost_equal(feature_librosa, feature_paddle, decimal=5) - def test_istft(self): + feature_bg = np.power(np.abs(feature_librosa), 2.0) + feature_extractor = paddle.audio.features.Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window_str, + power=2.0, + center=center_flag, + pad_mode=self.pad_mode, + ) + feature_layer = feature_extractor(x).squeeze(0) + np.testing.assert_array_almost_equal(feature_layer, + feature_bg, + decimal=4) + + @parameterize([128, 256, 512], [64, 82], + ["hamming", "hann", "triang", "bohman"]) + def test_istft(self, n_fft: int, hop_length: int, window_str: str): if len(self.waveform.shape) == 2: # (C, T) self.waveform = self.waveform.squeeze( 0) # 1D input for librosa.feature.melspectrogram @@ -180,31 +258,31 @@ def test_istft(self): # Get stft result from librosa. stft_matrix = librosa.core.stft( y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, + n_fft=n_fft, + hop_length=hop_length, win_length=None, - window=self.window_str, + window=window_str, center=True, pad_mode=self.pad_mode, ) feature_librosa = librosa.core.istft( stft_matrix=stft_matrix, - hop_length=self.hop_length, + hop_length=hop_length, win_length=None, - window=self.window_str, + window=window_str, center=True, dtype=None, length=None, ) x = paddle.to_tensor(stft_matrix).unsqueeze(0) - window = paddle.audio.functional.get_window(self.window_str, - self.n_fft, + window = paddle.audio.functional.get_window(window_str, + n_fft, dtype=paddle.to_tensor( self.waveform).dtype) feature_paddle = paddle.signal.istft( x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, + n_fft=n_fft, + hop_length=hop_length, win_length=None, window=window, center=True, @@ -218,160 +296,132 @@ def test_istft(self): feature_paddle, decimal=5) - def test_mel(self): + @parameterize([8000, 16000], [128, 256, 512], [64, 32], [0.0, 0.5, 1.0], + ['float32', 'float64']) + def test_mel(self, sr: int, n_fft: int, n_mels: int, fmin: float, + dtype: str): feature_librosa = librosa.filters.mel( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + fmin=fmin, fmax=None, htk=False, norm='slaney', - dtype=self.waveform.dtype, + dtype=np.dtype(dtype), ) - feature_compliance = paddle.audio.compliance.librosa.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=None, - htk=False, - norm='slaney', - dtype=self.waveform.dtype, - ) - x = paddle.to_tensor(self.waveform) + paddle_dtype = getattr(paddle, dtype) feature_functional = paddle.audio.functional.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - f_min=self.fmin, + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=fmin, f_max=None, htk=False, norm='slaney', - dtype=x.dtype, + dtype=paddle_dtype, ) - np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance) np.testing.assert_array_almost_equal(feature_librosa, feature_functional) - def test_melspect(self): + @parameterize([8000, 16000], [128, 256, 512], [64, 82], [40, 60, 80], + [0.0, 0.5, 1.0]) + def test_melspect(self, sr: int, n_fft: int, hop_length: int, n_mels: int, + fmin: int): if len(self.waveform.shape) == 2: # (C, T) self.waveform = self.waveform.squeeze( 0) # 1D input for librosa.feature.melspectrogram # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddle.audio.compliance.librosa: - feature_compliance = paddle.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - to_db=False) + feature_librosa = librosa.feature.melspectrogram(y=self.waveform, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + fmin=fmin) # paddle.audio.features.layer x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze( 0) # Add batch dim. feature_extractor = paddle.audio.features.MelSpectrogram( - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + f_min=fmin, dtype=x.dtype) feature_layer = feature_extractor(x).squeeze(0).numpy() - np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance, - decimal=5) np.testing.assert_array_almost_equal(feature_librosa, feature_layer, decimal=5) - def test_log_melspect(self): + @parameterize([16000, 8000], [512, 256, 128], [128, 64], [64, 32], + [0.0, 1.0, 50.0]) + def test_log_melspect(self, sr: int, n_fft: int, hop_length: int, + n_mels: int, fmin: float): if len(self.waveform.shape) == 2: # (C, T) self.waveform = self.waveform.squeeze( 0) # 1D input for librosa.feature.melspectrogram # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) + feature_librosa = librosa.feature.melspectrogram(y=self.waveform, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + center=True, + fmin=fmin, + pad_mode='reflect') feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) - # paddle.audio.compliance.librosa: - feature_compliance = paddle.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) + x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze( + 0) # Add batch dim. + feature_extractor = paddle.audio.features.LogMelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + center=True, + n_mels=n_mels, + f_min=fmin, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance, + feature_layer, decimal=4) - def test_mfcc(self): + @parameterize([16000, 8000], [512, 256, 128], [128, 64], [64, 32], + [0.0, 1.0, 50.0]) + def test_mfcc(self, sr: int, n_fft: int, hop_length: int, n_mels: int, + fmin: int): if len(self.waveform.shape) == 2: # (C, T) self.waveform = self.waveform.squeeze( 0) # 1D input for librosa.feature.melspectrogram # librosa: feature_librosa = librosa.feature.mfcc(y=self.waveform, - sr=self.sr, + sr=sr, S=None, n_mfcc=self.n_mfcc, dct_type=2, - norm='ortho', lifter=0, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - # paddle.audio.compliance.librosa: - feature_compliance = paddle.audio.compliance.librosa.mfcc( - x=self.waveform, - sr=self.sr, - n_mfcc=self.n_mfcc, - dct_type=2, - norm='ortho', - lifter=0, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - top_db=self.top_db) + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + fmin=fmin) # paddlespeech.audio.features.layer x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze( 0) # Add batch dim. - feature_extractor = paddle.audio.features.MFCC( - sr=self.sr, - n_mfcc=self.n_mfcc, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - top_db=self.top_db, - dtype=x.dtype) + feature_extractor = paddle.audio.features.MFCC(sr=sr, + n_mfcc=self.n_mfcc, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + f_min=fmin, + top_db=self.top_db, + dtype=x.dtype) feature_layer = feature_extractor(x).squeeze(0).numpy() - np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance, - decimal=4) np.testing.assert_array_almost_equal(feature_librosa, feature_layer, decimal=4) diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index eec2452e9409d..78c6518953bd2 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -16,3 +16,4 @@ distro numpy>=1.20,<1.22; python_version >= "3.7" autograd==1.4 librosa==0.8.1 +parameterized