/
test_audio_logmel_feature.py
125 lines (108 loc) · 5.2 KB
/
test_audio_logmel_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import librosa
import numpy as np
import os
import paddle
import paddle.audio
from scipy import signal
import itertools
from parameterized import parameterized
def parameterize(*params):
    """Build a decorator that runs a test once per combination of values.

    Each positional argument is an iterable of candidate values for one test
    parameter. The Cartesian product of all of them is materialized into a
    list and handed to ``parameterized.expand``, whose result is returned.
    """
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations)
class TestFeatures(unittest.TestCase):
    """Check ``paddle.audio.features`` layers against librosa references.

    Every test computes one feature twice on the same synthetic waveform,
    once with librosa and once with the matching paddle layer, and asserts
    that the two results agree to a fixed number of decimals.
    """

    def setUp(self):
        """Rebuild the shared fixture before every test case."""
        self.initParmas()

    def initParmas(self):
        """Set default parameters and create the synthetic test waveform.

        The waveform is a linear ramp over [-1, 1] scaled by 0.1 and tiled
        once per channel. It is stored in ``self.waveform`` as a NumPy array.
        """

        def get_wav_data(dtype: str, num_channels: int, num_frames: int):
            # Resolve the dtype name (e.g. "float32") to the paddle dtype.
            dtype_ = getattr(paddle, dtype)
            base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1
            # One copy of the ramp per channel.
            return base.tile([num_channels, 1])

        self.fmin = 0.0
        self.top_db = 80.0
        self.duration = 0.5
        self.num_channels = 1
        self.sr = 16000
        self.dtype = "float32"
        # duration * sr is a float (8000.0); the sample count must be an int.
        num_frames = int(self.duration * self.sr)
        waveform_tensor = get_wav_data(self.dtype,
                                       self.num_channels,
                                       num_frames=num_frames)
        self.waveform = waveform_tensor.numpy()

    def _mono_waveform(self):
        """Return the fixture waveform as 1-D input without mutating it."""
        waveform = self.waveform
        if waveform.ndim == 2:  # (C, T)
            # 1D input for librosa: drop the leading channel axis.
            waveform = waveform.squeeze(0)
        return waveform

    @parameterize([16000], ["hamming", "bohman"], [128], [128, 64], [64, 32],
                  [0.0, 50.0])
    def test_log_melspect(self, sr: int, window_str: str, n_fft: int,
                          hop_length: int, n_mels: int, fmin: float):
        """Compare ``LogMelSpectrogram`` with librosa's dB mel spectrogram."""
        waveform = self._mono_waveform()

        # librosa reference: power mel spectrogram converted to dB.
        feature_librosa = librosa.feature.melspectrogram(y=waveform,
                                                         sr=sr,
                                                         n_fft=n_fft,
                                                         hop_length=hop_length,
                                                         window=window_str,
                                                         n_mels=n_mels,
                                                         center=True,
                                                         fmin=fmin,
                                                         pad_mode='reflect')
        # No dynamic-range clipping on the reference; the paddle layer below
        # is likewise constructed without a top_db argument.
        feature_librosa = librosa.power_to_db(feature_librosa, top_db=None)

        # paddle.audio.features
        x = paddle.to_tensor(waveform, dtype=paddle.float64).unsqueeze(
            0)  # Add batch dim.
        feature_extractor = paddle.audio.features.LogMelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            window=window_str,
            center=True,
            n_mels=n_mels,
            f_min=fmin,
            dtype=x.dtype)
        # Drop the batch dim again so the result lines up with librosa's.
        feature_layer = feature_extractor(x).squeeze(0).numpy()

        np.testing.assert_array_almost_equal(feature_librosa,
                                             feature_layer,
                                             decimal=3)

    @parameterize([16000, 8000], [256, 128], [40, 64], [64, 128])
    def test_mfcc(self, sr: int, n_fft: int, n_mfcc: int, n_mels: int):
        """Compare the ``MFCC`` layer with ``librosa.feature.mfcc``."""
        waveform = self._mono_waveform()
        # Shared by both implementations so they cannot drift apart.
        hop_length = 64
        fmin = 50.0

        # librosa reference
        feature_librosa = librosa.feature.mfcc(y=waveform,
                                               sr=sr,
                                               S=None,
                                               n_mfcc=n_mfcc,
                                               dct_type=2,
                                               lifter=0,
                                               n_fft=n_fft,
                                               hop_length=hop_length,
                                               n_mels=n_mels,
                                               fmin=fmin)

        # paddle.audio.features
        x = paddle.to_tensor(waveform,
                             dtype='float64').unsqueeze(0)  # Add batch dim.
        # f_min is passed explicitly instead of relying on the layer default
        # matching the value given to librosa.
        # NOTE(review): top_db=self.top_db (80.0) presumably mirrors the dB
        # clipping librosa applies inside mfcc -- confirm against librosa.
        feature_extractor = paddle.audio.features.MFCC(sr=sr,
                                                       n_mfcc=n_mfcc,
                                                       n_fft=n_fft,
                                                       hop_length=hop_length,
                                                       n_mels=n_mels,
                                                       f_min=fmin,
                                                       top_db=self.top_db,
                                                       dtype=x.dtype)
        feature_layer = feature_extractor(x).squeeze(0).numpy()

        np.testing.assert_array_almost_equal(feature_librosa,
                                             feature_layer,
                                             decimal=2)
# Script entry point: run this module's test cases when executed directly.
if __name__ == '__main__':
    unittest.main()