From e604a0f1659c104d0827e0c4dc95da0ebe053282 Mon Sep 17 00:00:00 2001 From: YangZhou Date: Tue, 6 Sep 2022 16:40:26 +0800 Subject: [PATCH] fix dtype in layers.mfcc && fix ci-static-check --- paddle/fluid/API.spec | 24 +++++++++++++++++++ python/paddle/audio/features/layers.py | 10 ++++---- .../paddle/tests/test_audio_logmel_feature.py | 5 ++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fc5aa11148359..4f6c530a7025e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,3 +1,27 @@ paddle.fluid.optimizer.PipelineOptimizer (paddle.fluid.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c')) paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.audio.features (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.features.layers.LogMelSpectrogram (ArgSpec(), ('document', 'c38b53606aa89215c4f00d3833e158b8')) +paddle.audio.features.layers.LogMelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '6c14f6f78dc697a6981cf90412e2f1ea')) +paddle.audio.features.layers.LogMelSpectrogram.load_dict (ArgSpec(args=[], varargs='args', varkw='kwargs', defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '01221a60445ee437f439a8cbe293f759')) +paddle.audio.features.layers.LogMelSpectrogram.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers', 'structured_name_prefix', 'use_hook'], varargs=None, varkw=None, defaults=(None, True, '', True), kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '0c01cb0c12220c9426ae49549b145b0b')) +paddle.audio.features.layers.MFCC (ArgSpec(), ('document', 'bcbe6499830d9228a4f746ddd63b6c0f')) +paddle.audio.features.layers.MFCC.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'd86bcaa345f26851089bfdb3efecd9e7')) +paddle.audio.features.layers.MelSpectrogram (ArgSpec(), ('document', 'adf4012310984568ae9da6170aa89f91')) +paddle.audio.features.layers.MelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '458e9d454c8773091567c6b400f48cf5')) +paddle.audio.features.layers.Spectrogram (ArgSpec(), ('document', '83811af6da032099bf147e3e01a458e1')) +paddle.audio.features.layers.Spectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'ab11e318fca1410f743b5432394dea35')) +paddle.audio.functional (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.functional.functional.compute_fbank_matrix (ArgSpec(args=['sr', 'n_fft', 'n_mels', 'f_min', 'f_max', 'htk', 'norm', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, None, False, 'slaney', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'n_mels': , 'f_min': , 'f_max': typing.Union[float, NoneType], 'htk': , 'norm': typing.Union[str, float], 'dtype': }), ('document', '3c5411caa6baedb68860b09c81e0147c')) +paddle.audio.functional.functional.create_dct (ArgSpec(args=['n_mfcc', 'n_mels', 'norm', 'dtype'], varargs=None, varkw=None, defaults=('ortho', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mfcc': , 'n_mels': , 'norm': typing.Union[str, NoneType], 'dtype': }), ('document', 'c9c57550671f9725b053769411d2f65a')) +paddle.audio.functional.functional.fft_frequencies (ArgSpec(args=['sr', 'n_fft', 'dtype'], varargs=None, varkw=None, defaults=('float32',), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'dtype': }), ('document', '057b990e79c9c780622407267c0a43c6')) +paddle.audio.functional.functional.hz_to_mel (ArgSpec(args=['freq', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[paddle.Tensor, float], 'freq': typing.Union[paddle.Tensor, float], 'htk': }), ('document', '7ca01521dd0bf26cd3f72c67f7168dc4')) +paddle.audio.functional.functional.mel_frequencies (ArgSpec(args=['n_mels', 'f_min', 'f_max', 'htk', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, 11025.0, False, 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mels': , 'f_min': , 'f_max': , 'htk': , 'dtype': }), ('document', '2af3cf997ed1274214ec240b2b59a98d')) +paddle.audio.functional.functional.mel_to_hz (ArgSpec(args=['mel', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[float, paddle.Tensor], 'mel': typing.Union[float, paddle.Tensor], 'htk': }), ('document', 'e93b432d382f98c60d7c7599489e7072')) +paddle.audio.functional.functional.power_to_db (ArgSpec(args=['spect', 'ref_value', 'amin', 'top_db'], varargs=None, varkw=None, defaults=(1.0, 1e-10, None), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'spect': , 'ref_value': , 'amin': , 'top_db': typing.Union[float, NoneType]}), ('document', '28bbb1973e8399e856bfaea0415cecb9')) +paddle.audio.functional.window.get_window (ArgSpec(args=['window', 'win_length', 'fftbins', 'dtype'], varargs=None, varkw=None, defaults=(True, 'float64'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'window': typing.Union[str, typing.Tuple[str, float]], 'win_length': , 'fftbins': , 'dtype': }), ('document', '2418d63da10c0cd5da9ecf0a88ddf783')) +paddle.audio.utils (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.utils.error.ParameterError (ArgSpec(), ('document', 'e12783df4d137af121ebadceb389bf7a')) +paddle.audio.utils.error.ParameterError.args (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.utils.error.ParameterError.with_traceback (ArgSpec(), ('document', '3f2d1353ad5034ed0f4628f2c9f066cc')) diff --git a/python/paddle/audio/features/layers.py b/python/paddle/audio/features/layers.py index fe0fcb9684a04..2625bb09d48dc 100644 --- a/python/paddle/audio/features/layers.py +++ b/python/paddle/audio/features/layers.py @@ -285,8 +285,7 @@ def __init__(self, norm: Union[str, float] = 'slaney', ref_value: float = 1.0, amin: float = 1e-10, - top_db: Optional[float] = None, - dtype: str = paddle.float32) -> None: + top_db: Optional[float] = None) -> None: super(MFCC, self).__init__() assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) @@ -306,8 +305,10 @@ def __init__(self, ref_value=ref_value, amin=amin, top_db=top_db, - dtype=dtype) - self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) + dtype=paddle.float64) + self.dct_matrix = create_dct(n_mfcc=n_mfcc, + n_mels=n_mels, + dtype=paddle.float64) self.register_buffer('dct_matrix', self.dct_matrix) def forward(self, x: Tensor) -> Tensor: @@ -318,6 +319,7 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. """ + x = paddle.cast(x, paddle.float64) log_mel_feature = self._log_melspectrogram(x) mfcc = paddle.matmul(log_mel_feature.transpose( (0, 2, 1)), self.dct_matrix).transpose((0, 2, 1)) # (B, n_mels, L) diff --git a/python/paddle/tests/test_audio_logmel_feature.py b/python/paddle/tests/test_audio_logmel_feature.py index 09a1e31251100..1a48c11abf475 100644 --- a/python/paddle/tests/test_audio_logmel_feature.py +++ b/python/paddle/tests/test_audio_logmel_feature.py @@ -106,14 +106,13 @@ def test_mfcc(self, sr: int, n_fft: int, n_mfcc: int, n_mels: int): fmin=50.0) # paddlespeech.audio.features.layer x = paddle.to_tensor(self.waveform, - dtype='float64').unsqueeze(0) # Add batch dim. + dtype='float32').unsqueeze(0) # Add batch dim. feature_extractor = paddle.audio.features.MFCC(sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=64, n_mels=n_mels, - top_db=self.top_db, - dtype=x.dtype) + top_db=self.top_db) feature_layer = feature_extractor(x).squeeze(0).numpy() np.testing.assert_array_almost_equal(feature_librosa,