From 15b789d5c9e8c8e65374c47d8f62223f4f7583b8 Mon Sep 17 00:00:00 2001
From: YangZhou <goat.zhou@qq.com>
Date: Sun, 4 Sep 2022 22:08:18 +0800
Subject: [PATCH] Remove audio compliance module and add functional tests

---
 python/paddle/audio/__init__.py            |   3 +-
 python/paddle/audio/compliance/__init__.py |  15 -
 python/paddle/audio/compliance/librosa.py  | 442 ---------------------
 python/paddle/tests/test_audio_features.py | 384 ++++++++++--------
 python/unittest_py/requirements.txt        |   1 +
 5 files changed, 219 insertions(+), 626 deletions(-)
 delete mode 100644 python/paddle/audio/compliance/__init__.py
 delete mode 100644 python/paddle/audio/compliance/librosa.py

diff --git a/python/paddle/audio/__init__.py b/python/paddle/audio/__init__.py
index 6895385ad3d08..e76a80300f5e6 100644
--- a/python/paddle/audio/__init__.py
+++ b/python/paddle/audio/__init__.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import compliance
 from . import features
 from . import functional
 from . import utils
 
-__all__ = ["compliance", "functional", "features", "utils"]
+__all__ = ["functional", "features", "utils"]
diff --git a/python/paddle/audio/compliance/__init__.py b/python/paddle/audio/compliance/__init__.py
deleted file mode 100644
index 6083e3eba77ec..0000000000000
--- a/python/paddle/audio/compliance/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import librosa
diff --git a/python/paddle/audio/compliance/librosa.py b/python/paddle/audio/compliance/librosa.py
deleted file mode 100644
index ba8d1651e234d..0000000000000
--- a/python/paddle/audio/compliance/librosa.py
+++ /dev/null
@@ -1,442 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from librosa(https://github.com/librosa/librosa)
-
-import warnings
-from typing import List
-from typing import Optional
-from typing import Union
-
-import numpy as np
-import scipy
-from numpy.lib.stride_tricks import as_strided
-from scipy import signal
-from scipy import fftpack
-import paddle
-
-from ..utils import ParameterError
-from ..functional import mel_frequencies
-from ..functional import power_to_db
-
-__all__ = [
-    # dsp
-    'stft',
-    'mfcc',
-    'compute_fbank_matrix',
-    'melspectrogram',
-    'spectrogram',
-]
-
-
-def _pad_center(data: np.ndarray,
-                size: int,
-                axis: int = -1,
-                **kwargs) -> np.ndarray:
-    """Pad an array to a target length along a target axis.
-
-    This differs from `np.pad` by centering the data prior to padding,
-    analogous to `str.center`
-    """
-
-    kwargs.setdefault("mode", "constant")
-    n = data.shape[axis]
-    lpad = int((size - n) // 2)
-    lengths = [(0, 0)] * data.ndim
-    lengths[axis] = (lpad, int(size - n - lpad))
-
-    if lpad < 0:
-        raise ParameterError(("Target size ({size:d}) must be "
-                              "at least input size ({n:d})"))
-
-    return np.pad(data, lengths, **kwargs)
-
-
-def _split_frames(x: np.ndarray,
-                  frame_length: int,
-                  hop_length: int,
-                  axis: int = -1) -> np.ndarray:
-    """Slice a data array into (overlapping) frames.
-
-    This function is aligned with librosa.frame
-    """
-
-    if not isinstance(x, np.ndarray):
-        raise ParameterError(
-            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
-
-    if x.shape[axis] < frame_length:
-        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
-                             f" for frame_length={frame_length:d}")
-
-    if hop_length < 1:
-        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
-
-    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
-        warnings.warn(f"librosa.util.frame called with axis={axis} "
-                      "on a non-contiguous input. This will result in a copy.")
-        x = np.asfortranarray(x)
-    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
-        warnings.warn(f"librosa.util.frame called with axis={axis} "
-                      "on a non-contiguous input. This will result in a copy.")
-        x = np.ascontiguousarray(x)
-
-    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
-    strides = np.asarray(x.strides)
-
-    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
-
-    if axis == -1:
-        shape = list(x.shape)[:-1] + [frame_length, n_frames]
-        strides = list(strides) + [hop_length * new_stride]
-
-    elif axis == 0:
-        shape = [n_frames, frame_length] + list(x.shape)[1:]
-        strides = [hop_length * new_stride] + list(strides)
-
-    else:
-        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
-
-    return as_strided(x, shape=shape, strides=strides)
-
-
-def _check_audio(y, mono=True) -> bool:
-    """Determine whether a variable contains valid audio data.
-
-    The audio y must be a np.ndarray, ether 1-channel or two channel
-    """
-    if not isinstance(y, np.ndarray):
-        raise ParameterError("Audio data must be of type numpy.ndarray")
-    if y.ndim > 2:
-        raise ParameterError(
-            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
-
-    if mono and y.ndim == 2:
-        raise ParameterError(
-            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
-
-    if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
-        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
-
-    if not np.issubdtype(y.dtype, np.floating):
-        raise ParameterError("Audio data must be floating-point")
-
-    if not np.isfinite(y).all():
-        raise ParameterError("Audio buffer is not finite everywhere")
-
-    return True
-
-
-def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
-    """Compute fourier frequencies.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): FFT size.
-
-    Returns:
-        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
-    """
-    return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
-
-
-def compute_fbank_matrix(sr: int,
-                         n_fft: int,
-                         n_mels: int = 128,
-                         fmin: float = 0.0,
-                         fmax: Optional[float] = None,
-                         htk: bool = False,
-                         norm: str = "slaney",
-                         dtype: type = np.float32) -> np.ndarray:
-    """Compute fbank matrix.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): FFT size.
-        n_mels (int, optional): Number of mel bins. Defaults to 128.
-        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
-        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-        norm (str, optional): Type of normalization. Defaults to "slaney".
-        dtype (type, optional): Data type. Defaults to np.float32.
-
-
-    Returns:
-        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
-    """
-    if norm != "slaney":
-        raise ParameterError('norm must set to slaney')
-
-    if fmax is None:
-        fmax = float(sr) / 2
-
-    # Initialize the weights
-    n_mels = int(n_mels)
-    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
-
-    # Center freqs of each FFT bin
-    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
-
-    # 'Center freqs' of mel bands - uniformly spaced between limits
-    mel_f_t = mel_frequencies(n_mels=n_mels + 2,
-                              f_min=fmin,
-                              f_max=fmax,
-                              htk=htk)
-    mel_f = mel_f_t.numpy()
-
-    fdiff = np.diff(mel_f)
-    ramps = np.subtract.outer(mel_f, fftfreqs)
-
-    for i in range(n_mels):
-        # lower and upper slopes for all bins
-        lower = -ramps[i] / fdiff[i]
-        upper = ramps[i + 2] / fdiff[i + 1]
-
-        # .. then intersect them with each other and zero
-        weights[i] = np.maximum(0, np.minimum(lower, upper))
-
-    if norm == "slaney":
-        # Slaney-style mel is scaled to be approx constant energy per channel
-        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
-        weights *= enorm[:, np.newaxis]
-
-    # Only check weights if f_mel[0] is positive
-    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
-        # This means we have an empty channel somewhere
-        warnings.warn("Empty filters detected in mel frequency basis. "
-                      "Some channels will produce empty responses. "
-                      "Try increasing your sampling rate (and fmax) or "
-                      "reducing n_mels.")
-
-    return weights
-
-
-def stft(x: np.ndarray,
-         n_fft: int = 2048,
-         hop_length: Optional[int] = None,
-         win_length: Optional[int] = None,
-         window: str = "hann",
-         center: bool = True,
-         dtype: type = np.complex64,
-         pad_mode: str = "reflect") -> np.ndarray:
-    """Short-time Fourier transform (STFT).
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        n_fft (int, optional): FFT size. Defaults to 2048.
-        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
-        win_length (Optional[int], optional): The size of window. Defaults to None.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-
-    Returns:
-        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
-    """
-    _check_audio(x)
-
-    # By default, use the entire frame
-    if win_length is None:
-        win_length = n_fft
-
-    # Set the default hop, if it's not already specified
-    if hop_length is None:
-        hop_length = int(win_length // 4)
-
-    fft_window = signal.get_window(window, win_length, fftbins=True)
-
-    # Pad the window out to n_fft size
-    fft_window = _pad_center(fft_window, n_fft)
-
-    # Reshape so that the window can be broadcast
-    fft_window = fft_window.reshape((-1, 1))
-
-    # Pad the time series so that frames are centered
-    if center:
-        if n_fft > x.shape[-1]:
-            warnings.warn(
-                f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
-            )
-        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
-
-    elif n_fft > x.shape[-1]:
-        raise ParameterError(
-            f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
-        )
-
-    # Window the time series.
-    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
-    # Pre-allocate the STFT matrix
-    stft_matrix = np.empty((int(1 + n_fft // 2), x_frames.shape[1]),
-                           dtype=dtype,
-                           order="F")
-    fft = np.fft  # use numpy fft as default
-    # Constrain STFT block sizes to 256 KB
-    MAX_MEM_BLOCK = 2**8 * 2**10
-    # how many columns can we fit within MAX_MEM_BLOCK?
-    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
-    n_columns = max(n_columns, 1)
-
-    for bl_s in range(0, stft_matrix.shape[1], n_columns):
-        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
-        stft_matrix[:,
-                    bl_s:bl_t] = fft.rfft(fft_window * x_frames[:, bl_s:bl_t],
-                                          axis=0)
-
-    return stft_matrix
-
-
-def mfcc(x: np.ndarray,
-         sr: int = 16000,
-         spect: Optional[np.ndarray] = None,
-         n_mfcc: int = 20,
-         dct_type: int = 2,
-         norm: str = "ortho",
-         lifter: int = 0,
-         **kwargs) -> np.ndarray:
-    """Mel-frequency cepstral coefficients (MFCCs)
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None.
-        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
-        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
-        norm (str, optional): Type of normalization. Defaults to "ortho".
-        lifter (int, optional): Cepstral filtering. Defaults to 0.
-
-    Returns:
-        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
-    """
-    if spect is None:
-        spect = melspectrogram(x, sr=sr, **kwargs)
-
-    M = fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
-
-    if lifter > 0:
-        factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
-                        lifter)
-        return M * factor[:, np.newaxis]
-    elif lifter == 0:
-        return M
-    else:
-        raise ParameterError(
-            f"MFCC lifter={lifter} must be a non-negative number")
-
-
-def melspectrogram(x: np.ndarray,
-                   sr: int = 16000,
-                   window_size: int = 512,
-                   hop_length: int = 320,
-                   n_mels: int = 64,
-                   fmin: float = 50.0,
-                   fmax: Optional[float] = None,
-                   window: str = 'hann',
-                   center: bool = True,
-                   pad_mode: str = 'reflect',
-                   power: float = 2.0,
-                   to_db: bool = True,
-                   ref: float = 1.0,
-                   amin: float = 1e-10,
-                   top_db: Optional[float] = None) -> np.ndarray:
-    """Compute mel-spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
-        to_db (bool, optional): Enable db scale. Defaults to True.
-        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): Minimum threshold. Defaults to 1e-10.
-        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
-
-    Returns:
-        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
-    """
-    _check_audio(x, mono=True)
-    if len(x) <= 0:
-        raise ParameterError('The input waveform is empty')
-
-    if fmax is None:
-        fmax = sr // 2
-    if fmin < 0 or fmin >= fmax:
-        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
-
-    s = stft(x,
-             n_fft=window_size,
-             hop_length=hop_length,
-             win_length=window_size,
-             window=window,
-             center=center,
-             pad_mode=pad_mode)
-
-    spect_power = np.abs(s)**power
-    fb_matrix = compute_fbank_matrix(sr=sr,
-                                     n_fft=window_size,
-                                     n_mels=n_mels,
-                                     fmin=fmin,
-                                     fmax=fmax)
-    mel_spect = np.matmul(fb_matrix, spect_power)
-    if to_db:
-        result = power_to_db(paddle.to_tensor(mel_spect),
-                             ref_value=ref,
-                             amin=amin,
-                             top_db=top_db)
-        return result.numpy()
-        #return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
-    else:
-        return mel_spect
-
-
-def spectrogram(x: np.ndarray,
-                sr: int = 16000,
-                window_size: int = 512,
-                hop_length: int = 320,
-                window: str = 'hann',
-                center: bool = True,
-                pad_mode: str = 'reflect',
-                power: float = 2.0) -> np.ndarray:
-    """Compute spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
-
-    Returns:
-        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
-    """
-
-    s = stft(x,
-             n_fft=window_size,
-             hop_length=hop_length,
-             win_length=window_size,
-             window=window,
-             center=center,
-             pad_mode=pad_mode)
-
-    return np.abs(s)**power
diff --git a/python/paddle/tests/test_audio_features.py b/python/paddle/tests/test_audio_features.py
index 5ac7f802c9d9a..036b11cdf59da 100644
--- a/python/paddle/tests/test_audio_features.py
+++ b/python/paddle/tests/test_audio_features.py
@@ -20,6 +20,12 @@
 
 import paddle.audio
 from scipy import signal
+import itertools
+from parameterized import parameterized
+
+
+def parameterize(*params):
+    return parameterized.expand(list(itertools.product(*params)))
 
 
 class TestFeatures(unittest.TestCase):
@@ -53,69 +59,105 @@ def get_wav_data(dtype: str, num_channels: int, num_frames: int):
                                        num_frames=self.duration * self.sr)
         self.waveform = waveform_tensor.numpy()
 
-    def test_audio_function(self):
-        mel1 = paddle.audio.functional.hz_to_mel(2.0, True)
-        mel2 = paddle.audio.functional.hz_to_mel(paddle.to_tensor([9.0]))
-        self.assertTrue(mel1 == 3.215392139848255)
-        self.assertTrue(mel2 == paddle.to_tensor([0.13499999]))
-        hz1 = paddle.audio.functional.mel_to_hz(paddle.to_tensor([9.0]), True)
-        hz2 = paddle.audio.functional.mel_to_hz(25.0)
-        self.assertTrue(hz1, paddle.to_tensor(5.61244488))
-        self.assertTrue(hz2, 1988.77281)
+    @parameterize([1.0, 3.0, 9.0, 25.0], [True, False])
+    def test_audio_function(self, val: float, htk_flag: bool):
+        mel_paddle = paddle.audio.functional.hz_to_mel(val, htk_flag)
+        mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
+            paddle.to_tensor(val), htk_flag)
+        mel_librosa = librosa.hz_to_mel(val, htk=htk_flag)
+        np.testing.assert_almost_equal(mel_paddle, mel_librosa, decimal=5)
+        np.testing.assert_almost_equal(mel_paddle_tensor.numpy(),
+                                       mel_librosa,
+                                       decimal=4)
+        # Inverse mapping (mel -> Hz), checked against librosa as well.
+        hz_paddle = paddle.audio.functional.mel_to_hz(val, htk_flag)
+        hz_paddle_tensor = paddle.audio.functional.mel_to_hz(
+            paddle.to_tensor(val), htk_flag)
+        hz_librosa = librosa.mel_to_hz(val, htk=htk_flag)
+        np.testing.assert_almost_equal(hz_paddle, hz_librosa, decimal=4)
+        np.testing.assert_almost_equal(hz_paddle_tensor.numpy(),
+                                       hz_librosa,
+                                       decimal=4)
+        # power_to_db must match librosa's reference value.
+        decibel_paddle = paddle.audio.functional.power_to_db(
+            paddle.to_tensor(val))
+        decibel_librosa = librosa.power_to_db(val)
+        np.testing.assert_almost_equal(decibel_paddle.numpy(),
+                                       decibel_librosa,
+                                       decimal=5)
+
+    @parameterize([64, 128, 256], [0.0, 0.5, 1.0], [10000, 11025],
+                  [False, True])
+    def test_audio_function_mel(self, n_mels: int, f_min: float, f_max: float,
+                                htk_flag: bool):
+        librosa_mel_freq = librosa.mel_frequencies(n_mels=n_mels, fmin=f_min,
+                                                   fmax=f_max, htk=htk_flag)
+        paddle_mel_freq = paddle.audio.functional.mel_frequencies(
+            n_mels, f_min, f_max, htk_flag, 'float64')
+        np.testing.assert_almost_equal(paddle_mel_freq,
+                                       librosa_mel_freq,
+                                       decimal=3)
+
+    @parameterize([8000, 16000], [64, 128, 256])
+    def test_audio_function_fft(self, sr: int, n_fft: int):
+        librosa_fft = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
+        paddle_fft = paddle.audio.functional.fft_frequencies(sr, n_fft)
+        np.testing.assert_almost_equal(paddle_fft, librosa_fft, decimal=5)
+
+    @parameterize([1.0, 3.0, 9.0])
+    def test_audio_function_exception(self, spect: float):
         try:
-            paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]), amin=0)
+            paddle.audio.functional.power_to_db(paddle.to_tensor([spect]),
+                                                amin=0)
         except Exception:
             pass
 
         try:
-            paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]),
+            paddle.audio.functional.power_to_db(paddle.to_tensor([spect]),
                                                 ref_value=0)
 
         except Exception:
             pass
 
         try:
-            paddle.audio.functional.power_to_db(paddle.to_tensor([9.0]),
+            paddle.audio.functional.power_to_db(paddle.to_tensor([spect]),
                                                 top_db=-1)
         except Exception:
             pass
 
-    def test_window(self):
-        window_types = [
-            "hamming", "hann", "triang", "bohman", "blackman", "cosine",
-            "tukey", "taylor"
-        ]
-        for window_type in window_types:
-            for n_fft in [1, self.n_fft]:
-                window_scipy = signal.get_window(window_type, n_fft)
-                window_paddle = paddle.audio.functional.get_window(
-                    window_type, n_fft)
-                np.testing.assert_array_almost_equal(window_scipy,
-                                                     window_paddle.numpy(),
-                                                     decimal=5)
-
-        for n_fft in [1, self.n_fft]:
-            window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7)
-            window_paddle_gaussian = paddle.audio.functional.get_window(
-                ('gaussian', 7), n_fft, False)
-            np.testing.assert_array_almost_equal(window_scipy_gaussain,
-                                                 window_paddle_gaussian.numpy(),
-                                                 decimal=5)
-            window_scipy_general_gaussain = signal.windows.general_gaussian(
-                n_fft, 1, 7)
-            window_paddle_general_gaussian = paddle.audio.functional.get_window(
-                ('general_gaussian', 1, 7), n_fft, False)
-            np.testing.assert_array_almost_equal(window_scipy_gaussain,
-                                                 window_paddle_gaussian.numpy(),
-                                                 decimal=5)
-
-            window_scipy_exp = signal.windows.exponential(n_fft)
-            window_paddle_exp = paddle.audio.functional.get_window(
-                ('exponential', None, 1), n_fft, False)
-            np.testing.assert_array_almost_equal(window_scipy_exp,
-                                                 window_paddle_exp.numpy(),
-                                                 decimal=5)
+    @parameterize([
+        "hamming", "hann", "triang", "bohman", "blackman", "cosine", "tukey",
+        "taylor"
+    ], [1, 512])
+    def test_window(self, window_type: str, n_fft: int):
+        window_scipy = signal.get_window(window_type, n_fft)
+        window_paddle = paddle.audio.functional.get_window(window_type, n_fft)
+        np.testing.assert_array_almost_equal(window_scipy,
+                                             window_paddle.numpy(),
+                                             decimal=5)
+
+    @parameterize([1, 512])
+    def test_gussian_window_and_exception(self, n_fft: int):
+        window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7)
+        window_paddle_gaussian = paddle.audio.functional.get_window(
+            ('gaussian', 7), n_fft, False)
+        np.testing.assert_array_almost_equal(window_scipy_gaussain,
+                                             window_paddle_gaussian.numpy(),
+                                             decimal=5)
+        window_scipy_general_gaussain = signal.windows.general_gaussian(
+            n_fft, 1, 7)
+        window_paddle_general_gaussian = paddle.audio.functional.get_window(
+            ('general_gaussian', 1, 7), n_fft, False)
+        np.testing.assert_array_almost_equal(
+            window_scipy_general_gaussain,
+            window_paddle_general_gaussian.numpy(), decimal=5)
 
+        window_scipy_exp = signal.windows.exponential(n_fft)
+        window_paddle_exp = paddle.audio.functional.get_window(
+            ('exponential', None, 1), n_fft, False)
+        np.testing.assert_array_almost_equal(window_scipy_exp,
+                                             window_paddle_exp.numpy(),
+                                             decimal=5)
         try:
             window_paddle = paddle.audio.functional.get_window(("kaiser", 1.0),
                                                                self.n_fft)
@@ -138,41 +180,77 @@ def test_window(self):
         except ValueError:
             pass
 
-    def test_stft(self):
+    @parameterize([5, 13, 23], [257, 513, 1025])
+    def test_create_dct(self, n_mfcc: int, n_mels: int):
+
+        def dct(n_filters, n_input):
+            basis = np.empty((n_filters, n_input))
+            basis[0, :] = 1.0 / np.sqrt(n_input)
+            samples = np.arange(1, 2 * n_input, 2) * np.pi / (2.0 * n_input)
+
+            for i in range(1, n_filters):
+                basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / n_input)
+            return basis.T
+
+        librosa_dct = dct(n_mfcc, n_mels)
+        paddle_dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
+        np.testing.assert_array_almost_equal(librosa_dct, paddle_dct, decimal=5)
+
+    @parameterize([128, 256, 512], ["hamming", "hann", "triang", "bohman"],
+                  [True, False])
+    def test_stft_and_spect(self, n_fft: int, window_str: str,
+                            center_flag: bool):
+        hop_length = int(n_fft / 4)
         if len(self.waveform.shape) == 2:  # (C, T)
             self.waveform = self.waveform.squeeze(
                 0)  # 1D input for librosa.feature.melspectrogram
         feature_librosa = librosa.core.stft(
             y=self.waveform,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
+            n_fft=n_fft,
+            hop_length=hop_length,
             win_length=None,
-            window=self.window_str,
-            center=True,
+            window=window_str,
+            center=center_flag,
             dtype=None,
             pad_mode=self.pad_mode,
         )
         x = paddle.to_tensor(self.waveform).unsqueeze(0)
-        window = paddle.audio.functional.get_window(self.window_str,
-                                                    self.n_fft,
+        window = paddle.audio.functional.get_window(window_str,
+                                                    n_fft,
                                                     dtype=x.dtype)
         feature_paddle = paddle.signal.stft(
             x=x,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
+            n_fft=n_fft,
+            hop_length=hop_length,
             win_length=None,
             window=window,
-            center=True,
+            center=center_flag,
             pad_mode=self.pad_mode,
             normalized=False,
             onesided=True,
         ).squeeze(0)
-
         np.testing.assert_array_almost_equal(feature_librosa,
                                              feature_paddle,
                                              decimal=5)
 
-    def test_istft(self):
+        feature_bg = np.power(np.abs(feature_librosa), 2.0)
+        feature_extractor = paddle.audio.features.Spectrogram(
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=None,
+            window=window_str,
+            power=2.0,
+            center=center_flag,
+            pad_mode=self.pad_mode,
+        )
+        feature_layer = feature_extractor(x).squeeze(0)
+        np.testing.assert_array_almost_equal(feature_layer,
+                                             feature_bg,
+                                             decimal=4)
+
+    @parameterize([128, 256, 512], [64, 82],
+                  ["hamming", "hann", "triang", "bohman"])
+    def test_istft(self, n_fft: int, hop_length: int, window_str: str):
         if len(self.waveform.shape) == 2:  # (C, T)
             self.waveform = self.waveform.squeeze(
                 0)  # 1D input for librosa.feature.melspectrogram
@@ -180,31 +258,31 @@ def test_istft(self):
         # Get stft result from librosa.
         stft_matrix = librosa.core.stft(
             y=self.waveform,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
+            n_fft=n_fft,
+            hop_length=hop_length,
             win_length=None,
-            window=self.window_str,
+            window=window_str,
             center=True,
             pad_mode=self.pad_mode,
         )
         feature_librosa = librosa.core.istft(
             stft_matrix=stft_matrix,
-            hop_length=self.hop_length,
+            hop_length=hop_length,
             win_length=None,
-            window=self.window_str,
+            window=window_str,
             center=True,
             dtype=None,
             length=None,
         )
         x = paddle.to_tensor(stft_matrix).unsqueeze(0)
-        window = paddle.audio.functional.get_window(self.window_str,
-                                                    self.n_fft,
+        window = paddle.audio.functional.get_window(window_str,
+                                                    n_fft,
                                                     dtype=paddle.to_tensor(
                                                         self.waveform).dtype)
         feature_paddle = paddle.signal.istft(
             x=x,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
+            n_fft=n_fft,
+            hop_length=hop_length,
             win_length=None,
             window=window,
             center=True,
@@ -218,160 +296,132 @@ def test_istft(self):
                                              feature_paddle,
                                              decimal=5)
 
-    def test_mel(self):
+    @parameterize([8000, 16000], [128, 256, 512], [64, 32], [0.0, 0.5, 1.0],
+                  ['float32', 'float64'])
+    def test_mel(self, sr: int, n_fft: int, n_mels: int, fmin: float,
+                 dtype: str):
         feature_librosa = librosa.filters.mel(
-            sr=self.sr,
-            n_fft=self.n_fft,
-            n_mels=self.n_mels,
-            fmin=self.fmin,
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            fmin=fmin,
             fmax=None,
             htk=False,
             norm='slaney',
-            dtype=self.waveform.dtype,
+            dtype=np.dtype(dtype),
         )
-        feature_compliance = paddle.audio.compliance.librosa.compute_fbank_matrix(
-            sr=self.sr,
-            n_fft=self.n_fft,
-            n_mels=self.n_mels,
-            fmin=self.fmin,
-            fmax=None,
-            htk=False,
-            norm='slaney',
-            dtype=self.waveform.dtype,
-        )
-        x = paddle.to_tensor(self.waveform)
+        paddle_dtype = getattr(paddle, dtype)
         feature_functional = paddle.audio.functional.compute_fbank_matrix(
-            sr=self.sr,
-            n_fft=self.n_fft,
-            n_mels=self.n_mels,
-            f_min=self.fmin,
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            f_min=fmin,
             f_max=None,
             htk=False,
             norm='slaney',
-            dtype=x.dtype,
+            dtype=paddle_dtype,
         )
 
-        np.testing.assert_array_almost_equal(feature_librosa,
-                                             feature_compliance)
         np.testing.assert_array_almost_equal(feature_librosa,
                                              feature_functional)
 
-    def test_melspect(self):
+    @parameterize([8000, 16000], [128, 256, 512], [64, 82], [40, 60, 80],
+                  [0.0, 0.5, 1.0])
+    def test_melspect(self, sr: int, n_fft: int, hop_length: int, n_mels: int,
+                      fmin: float):
         if len(self.waveform.shape) == 2:  # (C, T)
             self.waveform = self.waveform.squeeze(
                 0)  # 1D input for librosa.feature.melspectrogram
 
         # librosa:
-        feature_librosa = librosa.feature.melspectrogram(
-            y=self.waveform,
-            sr=self.sr,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            fmin=self.fmin)
-
-        # paddle.audio.compliance.librosa:
-        feature_compliance = paddle.audio.compliance.librosa.melspectrogram(
-            x=self.waveform,
-            sr=self.sr,
-            window_size=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            fmin=self.fmin,
-            to_db=False)
+        feature_librosa = librosa.feature.melspectrogram(y=self.waveform,
+                                                         sr=sr,
+                                                         n_fft=n_fft,
+                                                         hop_length=hop_length,
+                                                         n_mels=n_mels,
+                                                         fmin=fmin)
 
         # paddle.audio.features.layer
         x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze(
             0)  # Add batch dim.
         feature_extractor = paddle.audio.features.MelSpectrogram(
-            sr=self.sr,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            f_min=self.fmin,
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            n_mels=n_mels,
+            f_min=fmin,
             dtype=x.dtype)
         feature_layer = feature_extractor(x).squeeze(0).numpy()
 
-        np.testing.assert_array_almost_equal(feature_librosa,
-                                             feature_compliance,
-                                             decimal=5)
         np.testing.assert_array_almost_equal(feature_librosa,
                                              feature_layer,
                                              decimal=5)
 
-    def test_log_melspect(self):
+    @parameterize([16000, 8000], [512, 256, 128], [128, 64], [64, 32],
+                  [0.0, 1.0, 50.0])
+    def test_log_melspect(self, sr: int, n_fft: int, hop_length: int,
+                          n_mels: int, fmin: float):
         if len(self.waveform.shape) == 2:  # (C, T)
             self.waveform = self.waveform.squeeze(
                 0)  # 1D input for librosa.feature.melspectrogram
 
         # librosa:
-        feature_librosa = librosa.feature.melspectrogram(
-            y=self.waveform,
-            sr=self.sr,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            fmin=self.fmin)
+        feature_librosa = librosa.feature.melspectrogram(y=self.waveform,
+                                                         sr=sr,
+                                                         n_fft=n_fft,
+                                                         hop_length=hop_length,
+                                                         n_mels=n_mels,
+                                                         center=True,
+                                                         fmin=fmin,
+                                                         pad_mode='reflect')
         feature_librosa = librosa.power_to_db(feature_librosa, top_db=None)
-        # paddle.audio.compliance.librosa:
-        feature_compliance = paddle.audio.compliance.librosa.melspectrogram(
-            x=self.waveform,
-            sr=self.sr,
-            window_size=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            fmin=self.fmin)
+        x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze(
+            0)  # Add batch dim.
+        feature_extractor = paddle.audio.features.LogMelSpectrogram(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            center=True,
+            n_mels=n_mels,
+            f_min=fmin,
+            dtype=x.dtype)
+        feature_layer = feature_extractor(x).squeeze(0).numpy()
         np.testing.assert_array_almost_equal(feature_librosa,
-                                             feature_compliance,
+                                             feature_layer,
                                              decimal=4)
 
-    def test_mfcc(self):
+    @parameterize([16000, 8000], [512, 256, 128], [128, 64], [64, 32],
+                  [0.0, 1.0, 50.0])
+    def test_mfcc(self, sr: int, n_fft: int, hop_length: int, n_mels: int,
+                  fmin: float):
         if len(self.waveform.shape) == 2:  # (C, T)
             self.waveform = self.waveform.squeeze(
                 0)  # 1D input for librosa.feature.melspectrogram
 
         # librosa:
         feature_librosa = librosa.feature.mfcc(y=self.waveform,
-                                               sr=self.sr,
+                                               sr=sr,
                                                S=None,
                                                n_mfcc=self.n_mfcc,
                                                dct_type=2,
-                                               norm='ortho',
                                                lifter=0,
-                                               n_fft=self.n_fft,
-                                               hop_length=self.hop_length,
-                                               n_mels=self.n_mels,
-                                               fmin=self.fmin)
-        # paddle.audio.compliance.librosa:
-        feature_compliance = paddle.audio.compliance.librosa.mfcc(
-            x=self.waveform,
-            sr=self.sr,
-            n_mfcc=self.n_mfcc,
-            dct_type=2,
-            norm='ortho',
-            lifter=0,
-            window_size=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            fmin=self.fmin,
-            top_db=self.top_db)
+                                               n_fft=n_fft,
+                                               hop_length=hop_length,
+                                               n_mels=n_mels,
+                                               fmin=fmin)
         # paddlespeech.audio.features.layer
         x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze(
             0)  # Add batch dim.
-        feature_extractor = paddle.audio.features.MFCC(
-            sr=self.sr,
-            n_mfcc=self.n_mfcc,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            n_mels=self.n_mels,
-            f_min=self.fmin,
-            top_db=self.top_db,
-            dtype=x.dtype)
+        feature_extractor = paddle.audio.features.MFCC(sr=sr,
+                                                       n_mfcc=self.n_mfcc,
+                                                       n_fft=n_fft,
+                                                       hop_length=hop_length,
+                                                       n_mels=n_mels,
+                                                       f_min=fmin,
+                                                       top_db=self.top_db,
+                                                       dtype=x.dtype)
         feature_layer = feature_extractor(x).squeeze(0).numpy()
 
-        np.testing.assert_array_almost_equal(feature_librosa,
-                                             feature_compliance,
-                                             decimal=4)
         np.testing.assert_array_almost_equal(feature_librosa,
                                              feature_layer,
                                              decimal=4)
diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index eec2452e9409d..78c6518953bd2 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -16,3 +16,4 @@ distro
 numpy>=1.20,<1.22; python_version >= "3.7"
 autograd==1.4
 librosa==0.8.1
+parameterized