From ec5b27f573314bac0817cd77221bad8fcec82b67 Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Thu, 20 Oct 2022 20:21:11 +0800 Subject: [PATCH] add paddle audio dataset && backend (#45939) * add audio feature dataset * fix coding style * fix coding style2 * rm librosa * rm voxceleb * rm librosa in test * add scipy fftpack * add functional * fix setup * fix setup2 * rm colorlog * refactor dataset __init__.py * fix coverage * fix librosa import error * fix windows test * fix windows ci * rm datasets * fix setup * remove testdata * add librosa in requirement * add librosa in requirement2 * change librosa to 0.8.1 * update ci docker * fix ci error * fix ci error2 * fix ci coverage * fix coverage * fix coverage * rm audio_base in test, notest,test=coverage * fix copyright * rm backend * add dataset in __init__ * rm compliance&&add function test * fix setup * fix windows * fix windows2 * fix test timeout * add backend & datasets * fix bugs * fix ci time issue * add dataset test * rm test_audio_feature * avoid windows issue, tmp * note windows issue * skip windows issue * refactor dataset test * add dataset.py * fix dtype in layers.mfcc * fix ci-static-check * fix dtype in layers.mfcc && fix ci-static-check * add relative accuracy * modify API.spec * skip cuda11.2 test * skip cuda11.2 test2 * skip cuda11.2 * change dataset name * fix format * update api.spec * update api.spec2 * fix coverage * add dataset test * rm download load dict * rm download load dict in init * update api.spec3 * fix dataset coverage * fix coverage * fix coverage2 * restore api.spec * restore api.spec2 * fix api-spec 3 * fix api-spec 4 * fix api.spec * fix api.spec6 * refactor init_backend * fix typo * change paddleaudio backend set * fix get_current_audio_backend() * fix format * fix format2 * remove format in parameters * fix format2 * add warning message in wave_backend && remove redundant audio util * rm audio util in print_signatures * fix format3 * 
add tess dataset license * format warning * add more info in warning msg * add paddleaudio version check * replace dataset esc50 with tess * add tess dataset && rm numpy transform in dataset.py * fix set audio backend bug * fix equal error * fix format && coverage error * add api example * fix format * fix error * fix typo * add noqa in __init__ * fix backend doc example error * rm seed in dataset * update backend example * fix typo * fix typo * fix example err * fix typo * fix ci dataset test * fix example file * try to fix ci * clean dataset doc * change get_current_audio_backend to get_current_backend * replace paddle.audio.backends.info with paddle.audio.info, same with load, save * fix ci error * replace api in test_audio_backend * fix save&&set_backend example --- paddle/fluid/API.spec | 10 + python/paddle/audio/__init__.py | 8 +- python/paddle/audio/backends/__init__.py | 25 ++ python/paddle/audio/backends/backend.py | 146 ++++++++++++ python/paddle/audio/backends/init_backend.py | 185 +++++++++++++++ python/paddle/audio/backends/wave_backend.py | 226 +++++++++++++++++++ python/paddle/audio/datasets/__init__.py | 18 ++ python/paddle/audio/datasets/dataset.py | 96 ++++++++ python/paddle/audio/datasets/esc50.py | 182 +++++++++++++++ python/paddle/audio/datasets/tess.py | 149 ++++++++++++ python/paddle/tests/test_audio_backend.py | 153 +++++++++++++ python/paddle/tests/test_audio_datasets.py | 123 ++++++++++ python/setup.py.in | 2 + tools/print_signatures.py | 52 ++++- 14 files changed, 1363 insertions(+), 12 deletions(-) create mode 100644 python/paddle/audio/backends/__init__.py create mode 100644 python/paddle/audio/backends/backend.py create mode 100644 python/paddle/audio/backends/init_backend.py create mode 100644 python/paddle/audio/backends/wave_backend.py create mode 100644 python/paddle/audio/datasets/__init__.py create mode 100644 python/paddle/audio/datasets/dataset.py create mode 100644 python/paddle/audio/datasets/esc50.py create mode 100644 
python/paddle/audio/datasets/tess.py create mode 100644 python/paddle/tests/test_audio_backend.py create mode 100644 python/paddle/tests/test_audio_datasets.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8a2e65922a114..5771a0abd75b6 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -21,3 +21,13 @@ paddle.audio.functional.functional.mel_frequencies (ArgSpec(args=['n_mels', 'f_m paddle.audio.functional.functional.mel_to_hz (ArgSpec(args=['mel', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[float, paddle.Tensor], 'mel': typing.Union[float, paddle.Tensor], 'htk': }), ('document', 'e93b432d382f98c60d7c7599489e7072')) paddle.audio.functional.functional.power_to_db (ArgSpec(args=['spect', 'ref_value', 'amin', 'top_db'], varargs=None, varkw=None, defaults=(1.0, 1e-10, 80.0), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'spect': , 'ref_value': , 'amin': , 'top_db': typing.Union[float, NoneType]}), ('document', '28bbb1973e8399e856bfaea0415cecb9')) paddle.audio.functional.window.get_window (ArgSpec(args=['window', 'win_length', 'fftbins', 'dtype'], varargs=None, varkw=None, defaults=(True, 'float64'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'window': typing.Union[str, typing.Tuple[str, float]], 'win_length': , 'fftbins': , 'dtype': }), ('document', '2418d63da10c0cd5da9ecf0a88ddf783')) +paddle.audio.backends (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.backends.init_backend.get_current_audio_backend (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': }), ('document', '3ff9fd62e8be1f3dc7e34afaf50e1645')) +paddle.audio.backends.init_backend.list_available_backends (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.List[str]}), ('document', 
'8eba49f1b69f7ec7fa139a0714a2724e')) +paddle.audio.backends.init_backend.set_backend (ArgSpec(args=['backend_name'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'backend_name': }), ('document', '9680247dd97274d345dee415e2787527')) +paddle.audio.backends.wave_backend.info (ArgSpec(args=['filepath', 'format'], varargs=None, varkw=None, defaults=(None,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'filepath': , 'format': typing.Union[str, NoneType]}), ('document', 'e0ffd3accd942a9b0a4c08463a9f60f6')) +paddle.audio.backends.wave_backend.load (ArgSpec(args=['filepath', 'frame_offset', 'num_frames', 'normalize', 'channels_first', 'format'], varargs=None, varkw=None, defaults=(0, -1, True, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Tuple[paddle.Tensor, int], 'filepath': typing.Union[str, pathlib.Path], 'frame_offset': , 'num_frames': , 'normalize': , 'channels_first': , 'format': typing.Union[str, NoneType]}), ('document', '4de50575ca516b4b7c7c82c7fdec808f')) +paddle.audio.backends.wave_backend.save (ArgSpec(args=['filepath', 'src', 'sample_rate', 'channels_first', 'compression', 'format', 'encoding', 'bits_per_sample'], varargs=None, varkw=None, defaults=(True, None, None, None, None), kwonlyargs=[], kwonlydefaults=None, annotations={'filepath': , 'src': , 'sample_rate': , 'channels_first': , 'compression': typing.Union[float, NoneType], 'format': typing.Union[str, NoneType], 'encoding': typing.Union[str, NoneType], 'bits_per_sample': typing.Union[int, NoneType]}), ('document', '4c85cfcd29a0dcdfc32e74db8c0c3961')) +paddle.audio.datasets (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.datasets.TESS (ArgSpec(), ('document', '3605f3aa2191ede7ddbe594cd27bb067')) +paddle.audio.datasets.TESS.meta_info (ArgSpec(), ('document', '60d548a6f71629c3b69bcda3a30d4819')) diff --git a/python/paddle/audio/__init__.py b/python/paddle/audio/__init__.py index 
aaf11b5b2c131..ee768ab6d029c 100644 --- a/python/paddle/audio/__init__.py +++ b/python/paddle/audio/__init__.py @@ -14,5 +14,11 @@ from . import features from . import functional +from . import datasets +from . import backends -__all__ = ["functional", "features"] +from .backends.backend import info, load, save + +__all__ = [ + "functional", "features", "datasets", "backends", "load", "info", "save" +] diff --git a/python/paddle/audio/backends/__init__.py b/python/paddle/audio/backends/__init__.py new file mode 100644 index 0000000000000..ac19a14c69a01 --- /dev/null +++ b/python/paddle/audio/backends/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import init_backend +from .init_backend import get_current_backend # noqa: F401 +from .init_backend import list_available_backends # noqa: F401 +from .init_backend import set_backend + +init_backend._init_set_audio_backend() + +__all__ = [ + 'get_current_backend', + 'list_available_backends', + 'set_backend', +] diff --git a/python/paddle/audio/backends/backend.py b/python/paddle/audio/backends/backend.py new file mode 100644 index 0000000000000..fbfd11d20e0b5 --- /dev/null +++ b/python/paddle/audio/backends/backend.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import paddle + +from pathlib import Path +from typing import Optional, Tuple, Union + + +class AudioInfo: + """ Audio info, return type of backend info function """ + + def __init__(self, sample_rate: int, num_samples: int, num_channels: int, + bits_per_sample: int, encoding: str): + self.sample_rate = sample_rate + self.num_samples = num_samples + self.num_channels = num_channels + self.bits_per_sample = bits_per_sample + self.encoding = encoding + + +def info(filepath: str) -> AudioInfo: + """Get signal information of input audio file. + + Args: + filepath: audio path or file object. + + Returns: + AudioInfo: info of the given audio. + + Example: + .. 
code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_info = paddle.audio.info(filepath) + """ + # for API doc + raise NotImplementedError("please set audio backend") + + +def load(filepath: Union[str, Path], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True) -> Tuple[paddle.Tensor, int]: + """Load audio data from file.Load the audio content start form frame_offset, and get num_frames. + + Args: + frame_offset: from 0 to total frames, + num_frames: from -1 (means total frames) or number frames which want to read, + normalize: + if True: return audio which norm to (-1, 1), dtype=float32 + if False: return audio with raw data, dtype=int16 + + channels_first: + if True: return audio with shape (channels, time) + + Return: + Tuple[paddle.Tensor, int]: (audio_content, sample rate) + + Exampels: + .. code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_data_read, sr = paddle.audio.load(filepath) + """ + # for API doc + raise NotImplementedError("please set audio backend") + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool = True, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = 16, +): + """ + Save audio tensor to file. 
+ + Args: + filepath: saved path + src: the audio tensor + sample_rate: the number of samples of audio per second. + channels_first: src channel infomation + if True, means input tensor is (channels, time) + if False, means input tensor is (time, channels) + encoding:encoding format, wave_backend only support PCM16 now. + bits_per_sample: bits per sample, wave_backend only support 16 bits now. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + filepath = "./test.wav" + + paddle.audio.save(filepath, waveform, sample_rate) + """ + # for API doc + raise NotImplementedError("please set audio backend") diff --git a/python/paddle/audio/backends/init_backend.py b/python/paddle/audio/backends/init_backend.py new file mode 100644 index 0000000000000..a066e4e23a64e --- /dev/null +++ b/python/paddle/audio/backends/init_backend.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import warnings +from . import wave_backend +from . 
import backend +from typing import List + +import paddle + + +def _check_version(version: str) -> bool: + # require paddleaudio >= 1.0.2 + ver_arr = version.split('.') + v0 = int(ver_arr[0]) + v1 = int(ver_arr[1]) + v2 = int(ver_arr[2]) + if v0 < 1: + return False + if v0 == 1 and v1 == 0 and v2 <= 1: + return False + return True + + +def list_available_backends() -> List[str]: + """ List available backends, the backends in paddleaudio and the default backend. + + Returns: + List[str]: The list of available backends. + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + backends = [] + try: + import paddleaudio + except ImportError: + package = "paddleaudio" + warn_msg = ( + "Failed importing {}. \n" + "only wave_banckend(only can deal with PCM16 WAV) supportted.\n" + "if want soundfile_backend(more audio type suppported),\n" + "please manually installed (usually with `pip install {} >= 1.0.2`). 
" + ).format(package, package) + warnings.warn(warn_msg) + + if "paddleaudio" in sys.modules: + version = paddleaudio.__version__ + if _check_version(version) == False: + err_msg = ( + "the version of paddleaudio installed is {},\n" + "please ensure the paddleaudio >= 1.0.2.").format(version) + raise ImportError(err_msg) + backends = paddleaudio.backends.list_audio_backends() + backends.append("wave_backend") + return backends + + +def get_current_backend() -> str: + """ Get the name of the current audio backend + + Returns: + str: The name of the current backend, + the wave_backend or backend imported from paddleaudio + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + current_backend = None + if "paddleaudio" in sys.modules: + import paddleaudio + current_backend = paddleaudio.backends.get_audio_backend() + if paddle.audio.load == paddleaudio.load: + return current_backend + return "wave_backend" + + +def set_backend(backend_name: str): + """Set the backend by one of the list_audio_backend return. + + Args: + backend (str): one of the list_audio_backend. "wave_backend" is the default. "soundfile" imported from paddleaudio. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + if backend_name not in list_available_backends(): + raise NotImplementedError() + + if backend_name == "wave_backend": + module = wave_backend + else: + import paddleaudio + paddleaudio.backends.set_audio_backend(backend_name) + module = paddleaudio + + for func in ["save", "load", "info"]: + setattr(backend, func, getattr(module, func)) + setattr(paddle.audio, func, getattr(module, func)) + + +def _init_set_audio_backend(): + # init the default wave_backend. + for func in ["save", "load", "info"]: + setattr(backend, func, getattr(wave_backend, func)) diff --git a/python/paddle/audio/backends/wave_backend.py b/python/paddle/audio/backends/wave_backend.py new file mode 100644 index 0000000000000..66f2d48fe19a5 --- /dev/null +++ b/python/paddle/audio/backends/wave_backend.py @@ -0,0 +1,226 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +import wave +import numpy as np +from pathlib import Path + +from typing import Optional, Tuple, Union +from .backend import AudioInfo + + +def _error_message(): + package = "paddleaudio" + warn_msg = ( + "only PCM16 WAV supportted. \n" + "if want support more other audio types, please " + "manually installed (usually with `pip install {}`). \n " + "and use paddle.audio.backends.set_backend('soundfile') to set audio backend" + ).format(package) + return warn_msg + + +def info(filepath: str) -> AudioInfo: + """Get signal information of input audio file. + + Args: + filepath: audio path or file object. + + Returns: + AudioInfo: info of the given audio. + + Example: + .. 
code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_info = paddle.audio.info(filepath) + """ + + if hasattr(filepath, 'read'): + file_obj = filepath + else: + file_obj = open(filepath, 'rb') + + try: + file_ = wave.open(file_obj) + except wave.Error: + file_obj.seek(0) + file_obj.close() + err_msg = _error_message() + raise NotImplementedError(err_msg) + + channels = file_.getnchannels() + sample_rate = file_.getframerate() + sample_frames = file_.getnframes() # audio frame + bits_per_sample = file_.getsampwidth() * 8 + encoding = "PCM_S" # default WAV encoding, only support + file_obj.close() + return AudioInfo(sample_rate, sample_frames, channels, bits_per_sample, + encoding) + + +def load(filepath: Union[str, Path], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True) -> Tuple[paddle.Tensor, int]: + """Load audio data from file. load the audio content start form frame_offset, and get num_frames. + + Args: + frame_offset: from 0 to total frames, + num_frames: from -1 (means total frames) or number frames which want to read, + normalize: + if True: return audio which norm to (-1, 1), dtype=float32 + if False: return audio with raw data, dtype=int16 + + channels_first: + if True: return audio with shape (channels, time) + + Return: + Tuple[paddle.Tensor, int]: (audio_content, sample rate) + + Exampels: + .. 
code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_data_read, sr = paddle.audio.load(filepath) + """ + if hasattr(filepath, 'read'): + file_obj = filepath + else: + file_obj = open(filepath, 'rb') + + try: + file_ = wave.open(file_obj) + except wave.Error: + file_obj.seek(0) + file_obj.close() + err_msg = _error_message() + raise NotImplementedError(err_msg) + + channels = file_.getnchannels() + sample_rate = file_.getframerate() + frames = file_.getnframes() # audio frame + + audio_content = file_.readframes(frames) + file_obj.close() + + # default_subtype = "PCM_16", only support PCM16 WAV + audio_as_np16 = np.frombuffer(audio_content, dtype=np.int16) + audio_as_np32 = audio_as_np16.astype(np.float32) + if normalize: + # dtype = "float32" + audio_norm = audio_as_np32 / (2**15) + else: + # dtype = "int16" + audio_norm = audio_as_np32 + + waveform = np.reshape(audio_norm, (frames, channels)) + if num_frames != -1: + waveform = waveform[frame_offset:frame_offset + num_frames, :] + waveform = paddle.to_tensor(waveform) + if channels_first: + waveform = paddle.transpose(waveform, perm=[1, 0]) + return waveform, sample_rate + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool = True, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = 16, +): + """ + Save audio tensor to file. + + Args: + filepath: saved path + src: the audio tensor + sample_rate: the number of samples of audio per second. 
+ channels_first: src channel infomation + if True, means input tensor is (channels, time) + if False, means input tensor is (time, channels) + encoding: audio encoding format, wave_backend only support PCM16 now. + bits_per_sample: bits per sample, wave_backend only support 16 bits now. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + filepath = "./test.wav" + + paddle.audio.save(filepath, waveform, sample_rate) + """ + assert src.ndim == 2, "Expected 2D tensor" + + audio_numpy = src.numpy() + + # change src shape to (time, channels) + if channels_first: + audio_numpy = np.transpose(audio_numpy) + + channels = audio_numpy.shape[1] + + # only support PCM16 + if bits_per_sample not in (None, 16): + raise ValueError("Invalid bits_per_sample, only supprt 16 bit") + + sample_width = int(bits_per_sample / 8) # 2 + + if src.dtype == paddle.float32: + audio_numpy = (audio_numpy * (2**15)).astype(" List[collections.namedtuple]: + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines()[1:]: + ret.append(self.meta_info(*line.strip().split(','))) + return ret + + def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ + not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download.get_path_from_url(self.archive['url'], + DATA_HOME, + self.archive['md5'], + decompress=True) + + meta_info = self._get_meta_info() + + files = [] + labels = [] + for sample in meta_info: + filename, fold, target, _, _, _, _ = sample + if mode == 'train' and int(fold) != split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + if mode != 'train' and int(fold) == split: + 
files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + return files, labels diff --git a/python/paddle/audio/datasets/tess.py b/python/paddle/audio/datasets/tess.py new file mode 100644 index 0000000000000..0f375aa2b0172 --- /dev/null +++ b/python/paddle/audio/datasets/tess.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os +from typing import List +from typing import Tuple + +from paddle.utils import download +from paddle.dataset.common import DATA_HOME +from .dataset import AudioClassificationDataset + +__all__ = ['TESS'] + + +class TESS(AudioClassificationDataset): + """ + TESS is a set of 200 target words were spoken in the carrier phrase + "Say the word _____' by two actresses (aged 26 and 64 years) and + recordings were made of the set portraying each of seven emotions(anger, + disgust, fear, happiness, pleasant surprise, sadness, and neutral). + There are 2800 stimuli in total. + + Reference: + Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487 + https://doi.org/10.5683/SP2/E8H2MF + + Args: + mode (str, optional): It identifies the dataset mode (train or dev). Defaults to train. + n_folds (int, optional): Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. Defaults to 5. + split (int, optional): It specify the fold of dev dataset. 
Defaults to 1. + feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Defaults to raw. + archive(dict): it tells where to download the audio archive. Defaults to None. + + Returns: + :ref:`api_paddle_io_Dataset`. An instance of TESS dataset. + + Examples: + + .. code-block:: python + + import paddle + + mode = 'dev' + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='raw') + for idx in range(5): + audio, label = tess_dataset[idx] + # do something with audio, label + print(audio.shape, label) + # [audio_data_length] , label_id + + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='mfcc', + n_mfcc=40) + for idx in range(5): + audio, label = tess_dataset[idx] + # do something with mfcc feature, label + print(audio.shape, label) + # [feature_dim, num_frames] , label_id + """ + + archive = { + 'url': + 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', + 'md5': '1465311b24d1de704c4c63e4ccc470c7', + } + + label_list = [ + 'angry', + 'disgust', + 'fear', + 'happy', + 'neutral', + 'ps', # pleasant surprise + 'sad', + ] + meta_info = collections.namedtuple('META_INFO', + ('speaker', 'word', 'emotion')) + audio_path = 'TESS_Toronto_emotional_speech_set' + + def __init__(self, + mode='train', + n_folds=5, + split=1, + feat_type='raw', + archive=None, + **kwargs): + """ + + """ + assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' + if archive is not None: + self.archive = archive + files, labels = self._get_data(mode, n_folds, split) + super(TESS, self).__init__(files=files, + labels=labels, + feat_type=feat_type, + **kwargs) + + def _get_meta_info(self, files) -> List[collections.namedtuple]: + ret = [] + for file in files: + basename_without_extend = os.path.basename(file)[:-4] + ret.append(self.meta_info(*basename_without_extend.split('_'))) + return ret + + def _get_data(self, mode, n_folds, split) -> 
Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)): + download.get_path_from_url(self.archive['url'], + DATA_HOME, + self.archive['md5'], + decompress=True) + + wav_files = [] + for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)): + for file in files: + if file.endswith('.wav'): + wav_files.append(os.path.join(root, file)) + + meta_info = self._get_meta_info(wav_files) + + files = [] + labels = [] + n_samples_per_fold = len(meta_info) // n_folds + for idx, sample in enumerate(meta_info): + _, _, emotion = sample + target = self.label_list.index(emotion) + fold = idx // n_samples_per_fold + 1 + + if mode == 'train' and int(fold) != split: + files.append(wav_files[idx]) + labels.append(target) + + if mode != 'train' and int(fold) == split: + files.append(wav_files[idx]) + labels.append(target) + + return files, labels diff --git a/python/paddle/tests/test_audio_backend.py b/python/paddle/tests/test_audio_backend.py new file mode 100644 index 0000000000000..79e793e2dc865 --- /dev/null +++ b/python/paddle/tests/test_audio_backend.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest
+
+import soundfile
+import numpy as np
+import os
+import paddle.audio
+
+
+class TestAudioBackends(unittest.TestCase):
+    """Checks paddle.audio save / info / load and backend selection on one wav file."""
+
+    def setUp(self):
+        self.initParmas()
+
+    def initParmas(self):
+        """Create the reference waveform: a linear ramp in [-0.1, 0.1], one channel."""
+
+        def get_wav_data(dtype: str, num_channels: int, num_frames: int):
+            dtype_ = getattr(paddle, dtype)
+            base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1
+            data = base.tile([num_channels, 1])
+            return data
+
+        self.duration = 0.5
+        self.num_channels = 1
+        self.sr = 16000
+        self.dtype = "float32"
+        self.window_size = 1024
+        # duration * sr is the float 8000.0; num_frames is a sample count
+        waveform_tensor = get_wav_data(self.dtype,
+                                       self.num_channels,
+                                       num_frames=int(self.duration * self.sr))
+        # shape (1, 8000)
+        self.waveform = waveform_tensor.numpy()
+
+    def test_backend(self):
+        """Write the waveform, read it back through every entry point, switch backends."""
+        base_dir = os.getcwd()
+        wave_wav_path = os.path.join(base_dir, "wave_test.wav")
+
+        def remove_wav():
+            if os.path.exists(wave_wav_path):
+                os.remove(wave_wav_path)
+
+        # registered before the first write so that a failing assertion
+        # cannot leave wave_test.wav behind in the working directory
+        self.addCleanup(remove_wav)
+
+        paddle.audio.save(wave_wav_path,
+                          paddle.to_tensor(self.waveform),
+                          self.sr,
+                          channels_first=True)
+
+        # test backends(wave)(wave_backend) info
+        # assertEqual, not assertTrue: assertTrue(a, b) only checks that `a`
+        # is truthy and uses `b` as the failure message
+        wav_info = paddle.audio.info(wave_wav_path)
+        self.assertEqual(wav_info.sample_rate, self.sr)
+        self.assertEqual(wav_info.num_channels, self.num_channels)
+        self.assertEqual(wav_info.bits_per_sample, 16)
+
+        with open(wave_wav_path, 'rb') as file_:
+            wav_info = paddle.audio.info(file_)
+            self.assertEqual(wav_info.sample_rate, self.sr)
+            self.assertEqual(wav_info.num_channels, self.num_channels)
+            self.assertEqual(wav_info.bits_per_sample, 16)
+
+        # test backends(wave_backend) load & save
+        wav_data, sr = paddle.audio.load(wave_wav_path)
+        np.testing.assert_array_almost_equal(wav_data, self.waveform, decimal=4)
+        # NOTE(review): _prepare_read is a private soundfile method
+        with soundfile.SoundFile(wave_wav_path, "r") as file_:
+            dtype = "float32"
+            frames = file_._prepare_read(0, None, -1)
+            waveform = file_.read(frames, dtype, always_2d=True)
+            waveform = waveform.T
+            np.testing.assert_array_almost_equal(wav_data, waveform)
+
+        # num_frames (10000) is larger than the 8000 frames in the file
+        with open(wave_wav_path, 'rb') as file_:
+            wav_data, sr = paddle.audio.load(file_,
+                                             normalize=False,
+                                             num_frames=10000)
+        with soundfile.SoundFile(wave_wav_path, "r") as file_:
+            dtype = "int16"
+            frames = file_._prepare_read(0, None, -1)
+            waveform = file_.read(frames, dtype, always_2d=True)
+            waveform = waveform.T
+            np.testing.assert_array_almost_equal(wav_data, waveform)
+
+        current_backend = paddle.audio.backends.get_current_backend()
+        self.assertIn(current_backend, ["wave_backend", "soundfile"])
+
+        paddle.audio.backends.set_backend("wave_backend")
+
+        backends = paddle.audio.backends.list_available_backends()
+        for backend in backends:
+            self.assertIn(backend, ["wave_backend", "soundfile"])
+
+        # Test error
+        # NotImplementedError is tolerated here, not required
+        try:
+            paddle.audio.backends.set_backend("jfiji")
+        except NotImplementedError:
+            pass
+
+        # paddleaudio is optional; this whole section is skipped without it
+        try:
+            import paddleaudio
+            backends = paddle.audio.backends.list_available_backends()
+            for backend in backends:
+                self.assertIn(backend, ["wave_backend", "soundfile"])
+            current_backend = paddle.audio.backends.get_current_backend()
+            self.assertEqual(current_backend, "wave_backend")
+            paddleaudio.backends.set_audio_backend("soundfile")
+            paddle.audio.backends.set_backend("soundfile")
+            current_backend = paddle.audio.backends.get_current_backend()
+            self.assertEqual(current_backend, "soundfile")
+            wav_info = paddle.audio.info(wave_wav_path)
+            self.assertEqual(wav_info.sample_rate, self.sr)
+            self.assertEqual(wav_info.num_channels, self.num_channels)
+            self.assertEqual(wav_info.bits_per_sample, 16)
+            paddle.audio.backends.set_backend("wave_backend")
+        except ImportError:
+            pass
+
+        # The three blocks below stay lenient on purpose: the exception is
+        # accepted when raised, but the test does not insist on it
+        # (presumably because the outcome depends on the active backend --
+        # TODO confirm before tightening to assertRaises).
+        try:
+            paddle.audio.save(wave_wav_path,
+                              paddle.to_tensor(self.waveform),
+                              self.sr,
+                              bits_per_sample=24,
+                              channels_first=True)
+        except ValueError:
+            pass
+
+        try:
+            paddle.audio.save(wave_wav_path,
+                              paddle.to_tensor(self.waveform).unsqueeze(0),
+                              self.sr)
+        except AssertionError:
+            pass
+
+        # overwrite the file with a DOUBLE-subtype wav written by soundfile
+        fake_data = np.array([0, 1, 2, 3, 4, 6], np.float32)
+        soundfile.write(wave_wav_path, fake_data, 1, subtype="DOUBLE")
+        try:
+            wav_info = paddle.audio.info(wave_wav_path)
+        except NotImplementedError:
+            pass
+
+        try:
+            wav_data = paddle.audio.load(wave_wav_path)
+        except NotImplementedError:
+            pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tests/test_audio_datasets.py b/python/paddle/tests/test_audio_datasets.py
new file mode 100644
index 0000000000000..59ba1d543bda6
--- /dev/null
+++ b/python/paddle/tests/test_audio_datasets.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import paddle
+import itertools
+from parameterized import parameterized
+
+
+def parameterize(*params):
+    return parameterized.expand(list(itertools.product(*params)))
+
+
+class TestAudioDatasets(unittest.TestCase):
+
+    @parameterize(["dev", "train"], [40, 64])
+    def test_tess_dataset(self, mode: str, params: int):
+        """
+        TESS dataset
+        Reference:
+            Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487
+            https://doi.org/10.5683/SP2/E8H2MF
+        """
+        archive = {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set_lite.zip',
+            'md5': '9ffb5e3adf28d4d6b787fa94bd59b975',
+        }  # small part of TESS dataset for test.
+        # `archive` is passed to every construction below so that each one
+        # points at the lite archive on its own, instead of relying on the
+        # first call having already fetched it.
+        tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                  feat_type='mfcc',
+                                                  n_mfcc=params,
+                                                  archive=archive)
+        idx = np.random.randint(0, 30)
+        elem = tess_dataset[idx]
+        # assertEqual reports both values on failure; assertTrue(a == b)
+        # only reports "False is not true"
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                  feat_type='spectrogram',
+                                                  n_fft=params,
+                                                  archive=archive)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params // 2 + 1)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        # NOTE(review): the two mel variants are pinned to mode="dev" rather
+        # than the parameterized `mode` -- kept as found, confirm intent.
+        tess_dataset = paddle.audio.datasets.TESS(mode="dev",
+                                                  feat_type='logmelspectrogram',
+                                                  n_mels=params,
+                                                  archive=archive)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        tess_dataset = paddle.audio.datasets.TESS(mode="dev",
+                                                  feat_type='melspectrogram',
+                                                  n_mels=params,
+                                                  archive=archive)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+    @parameterize(["dev", "train"], [40, 64])
+    def test_esc50_dataset(self, mode: str, params: int):
+        """
+        ESC50 dataset
+        Reference:
+            ESC: Dataset for Environmental Sound Classification
+            http://dx.doi.org/10.1145/2733373.2806390
+        """
+        archive = {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/ESC-50-master-lite.zip',
+            'md5': '1e9ba53265143df5b2804a743f2d1956',
+        }  # small part of ESC50 dataset for test.
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='raw',
+                                                    archive=archive)
+        idx = np.random.randint(0, 6)
+        elem = esc50_dataset[idx]
+        # raw clips are expected to hold 220500 samples
+        # (presumably 5 s at 44.1 kHz -- TODO confirm)
+        self.assertEqual(elem[0].shape[0], 220500)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='mfcc',
+                                                    n_mfcc=params,
+                                                    archive=archive)
+        idx = np.random.randint(0, 6)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='spectrogram',
+                                                    n_fft=params,
+                                                    archive=archive)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params // 2 + 1)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(
+            mode=mode,
+            feat_type='logmelspectrogram',
+            n_mels=params,
+            archive=archive)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='melspectrogram',
+                                                    n_mels=params,
+                                                    archive=archive)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 4b2128a96755f..648d7089e7e10 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -373,6 +373,8 @@ packages=['paddle',
           'paddle.audio',
           'paddle.audio.functional',
           'paddle.audio.features',
+          'paddle.audio.datasets',
+          'paddle.audio.backends',
           'paddle.text',
           'paddle.text.datasets',
           'paddle.incubate',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 9a280c691aaad..f3c5f708b8478 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -221,17 +221,47 @@ def process_module(m, attr="__all__"):
 
 def check_public_api():
     modulelist = [  #npqa
-        paddle, paddle.amp, paddle.nn, paddle.nn.functional,
-        paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn,
-        paddle.io,
paddle.jit, paddle.metric, paddle.distribution, - paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text, - paddle.utils, paddle.utils.download, paddle.utils.profiler, - paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision, - paddle.vision.datasets, paddle.vision.models, paddle.vision.transforms, - paddle.vision.ops, paddle.distributed, paddle.distributed.fleet, - paddle.distributed.fleet.utils, paddle.distributed.parallel, - paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd, - paddle.incubate, paddle.inference, paddle.onnx, paddle.device + paddle, + paddle.amp, + paddle.nn, + paddle.nn.functional, + paddle.nn.initializer, + paddle.nn.utils, + paddle.static, + paddle.static.nn, + paddle.io, + paddle.jit, + paddle.metric, + paddle.distribution, + paddle.optimizer, + paddle.optimizer.lr, + paddle.regularizer, + paddle.text, + paddle.utils, + paddle.utils.download, + paddle.utils.profiler, + paddle.utils.cpp_extension, + paddle.sysconfig, + paddle.vision, + paddle.vision.datasets, + paddle.vision.models, + paddle.vision.transforms, + paddle.vision.ops, + paddle.distributed, + paddle.distributed.fleet, + paddle.distributed.fleet.utils, + paddle.distributed.parallel, + paddle.distributed.utils, + paddle.callbacks, + paddle.hub, + paddle.autograd, + paddle.incubate, + paddle.inference, + paddle.onnx, + paddle.device, + paddle.audio, + paddle.audio.backends, + paddle.audio.datasets, ] apinum = 0