From ec5b27f573314bac0817cd77221bad8fcec82b67 Mon Sep 17 00:00:00 2001
From: YangZhou <56786796+SmileGoat@users.noreply.github.com>
Date: Thu, 20 Oct 2022 20:21:11 +0800
Subject: [PATCH] add paddle audio dataset && backend (#45939)

* add audio feature dataset

* fix coding style

* fix coding style2

* rm librosa

* rm voxceleb

* rm librosa in test

* add scipy fftpack

* add functional

* fix setup

* fix setup2

* rm colorlog

* refactor dataset __init__.py

* fix coverage

* fix librosa import error

* fix windows test

* fix windows ci

* rm datasets

* fix setup

* remove testdata

* add librosa in requirement

* add librosa in requirement2

* change librosa to 0.8.1

* update ci docker

* fix ci error

* fix ci error2

* fix ci coverage

* fix coverage

* fix coverage

* rm audio_base in test, notest,test=coverage

* fix copyright

* rm backend

* add dataset in __init__

* rm compliance&&add function test

* fix setup

* fix windows

* fix windows2

* fix test timeout

* add backend & datasets

* fix bugs

* fix ci time issue

* add dataset test

* rm test_audio_feature

* avoid windows issue, tmp

* note windows issue

* skip windows issue

* refactor dataset test

* add dataset.py

* fix dtype in layers.mfcc

* fix ci-static-check

* fix dtype in layers.mfcc && fix ci-static-check

* add relative accuracy

* modify API.spec

* skip cuda11.2 test

* skip cuda11.2 test2

* skip cuda11.2

* change dataset name

* fix format

* update api.spec

* update api.spec2

* fix coverage

* add dataset test

* rm download load dict

* rm download load dict in init

* update api.spec3

* fix dataset coverage

* fix coverage

* fix coverage2

* restore api.spec

* restore api.spec2

* fix api-spec 3

* fix api-spec 4

* fix api.spec

* fix api.spec6

* refactor init_backend

* fix typo

* change paddleaudio backend set

* fix get_current_audio_backend()

* fix format

* fix format2

* remove format in parameters

* fix format2

* add warning message in wave_backend && remove redundant audio util

* rm audio util in print_signatures

* fix format3

* add tess dataset license

* format warning

* add more info in warning msg

* add paddleaudio version check

* replace dataset esc50 with tess

* add tess dataset && rm numpy transform in dataset.py

* fix set audio backend bug

* fix equal error

* fix format && coverage error

* add api example

* fix format

* fix error

* fix typo

* add noqa in __init__

* fix backend doc example error

* rm seed in dataset

* update backend example

* fix typo

* fix typo

* fix example err

* fix typo

* fix ci dataset test

* fix example file

* try to fix ci

* clean dataset doc

* change get_current_audio_backend to get_current_backend

* replace paddle.audio.backends.info with paddle.audio.info, same with load, save

* fix ci error

* replace api in test_audio_backend

* fix save&&set_backend example
---
 paddle/fluid/API.spec                        |  10 +
 python/paddle/audio/__init__.py              |   8 +-
 python/paddle/audio/backends/__init__.py     |  25 ++
 python/paddle/audio/backends/backend.py      | 146 ++++++++++++
 python/paddle/audio/backends/init_backend.py | 185 +++++++++++++++
 python/paddle/audio/backends/wave_backend.py | 226 +++++++++++++++++++
 python/paddle/audio/datasets/__init__.py     |  18 ++
 python/paddle/audio/datasets/dataset.py      |  96 ++++++++
 python/paddle/audio/datasets/esc50.py        | 182 +++++++++++++++
 python/paddle/audio/datasets/tess.py         | 149 ++++++++++++
 python/paddle/tests/test_audio_backend.py    | 153 +++++++++++++
 python/paddle/tests/test_audio_datasets.py   | 123 ++++++++++
 python/setup.py.in                           |   2 +
 tools/print_signatures.py                    |  52 ++++-
 14 files changed, 1363 insertions(+), 12 deletions(-)
 create mode 100644 python/paddle/audio/backends/__init__.py
 create mode 100644 python/paddle/audio/backends/backend.py
 create mode 100644 python/paddle/audio/backends/init_backend.py
 create mode 100644 python/paddle/audio/backends/wave_backend.py
 create mode 100644 python/paddle/audio/datasets/__init__.py
 create mode 100644 python/paddle/audio/datasets/dataset.py
 create mode 100644 python/paddle/audio/datasets/esc50.py
 create mode 100644 python/paddle/audio/datasets/tess.py
 create mode 100644 python/paddle/tests/test_audio_backend.py
 create mode 100644 python/paddle/tests/test_audio_datasets.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8a2e65922a114..5771a0abd75b6 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -21,3 +21,13 @@ paddle.audio.functional.functional.mel_frequencies (ArgSpec(args=['n_mels', 'f_m
 paddle.audio.functional.functional.mel_to_hz (ArgSpec(args=['mel', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[float, paddle.Tensor], 'mel': typing.Union[float, paddle.Tensor], 'htk': <class 'bool'>}), ('document', 'e93b432d382f98c60d7c7599489e7072'))
 paddle.audio.functional.functional.power_to_db (ArgSpec(args=['spect', 'ref_value', 'amin', 'top_db'], varargs=None, varkw=None, defaults=(1.0, 1e-10, 80.0), kwonlyargs=[], kwonlydefaults=None, annotations={'return': <class 'paddle.Tensor'>, 'spect': <class 'paddle.Tensor'>, 'ref_value': <class 'float'>, 'amin': <class 'float'>, 'top_db': typing.Union[float, NoneType]}), ('document', '28bbb1973e8399e856bfaea0415cecb9'))
 paddle.audio.functional.window.get_window (ArgSpec(args=['window', 'win_length', 'fftbins', 'dtype'], varargs=None, varkw=None, defaults=(True, 'float64'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': <class 'paddle.Tensor'>, 'window': typing.Union[str, typing.Tuple[str, float]], 'win_length': <class 'int'>, 'fftbins': <class 'bool'>, 'dtype': <class 'str'>}), ('document', '2418d63da10c0cd5da9ecf0a88ddf783'))
+paddle.audio.backends (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e'))
+paddle.audio.backends.init_backend.get_current_audio_backend (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': <class 'str'>}), ('document', '3ff9fd62e8be1f3dc7e34afaf50e1645'))
+paddle.audio.backends.init_backend.list_available_backends (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.List[str]}), ('document', '8eba49f1b69f7ec7fa139a0714a2724e'))
+paddle.audio.backends.init_backend.set_backend (ArgSpec(args=['backend_name'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'backend_name': <class 'str'>}), ('document', '9680247dd97274d345dee415e2787527'))
+paddle.audio.backends.wave_backend.info (ArgSpec(args=['filepath', 'format'], varargs=None, varkw=None, defaults=(None,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': <class 'paddle.audio.backends.backend.AudioInfo'>, 'filepath': <class 'str'>, 'format': typing.Union[str, NoneType]}), ('document', 'e0ffd3accd942a9b0a4c08463a9f60f6'))
+paddle.audio.backends.wave_backend.load (ArgSpec(args=['filepath', 'frame_offset', 'num_frames', 'normalize', 'channels_first', 'format'], varargs=None, varkw=None, defaults=(0, -1, True, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Tuple[paddle.Tensor, int], 'filepath': typing.Union[str, pathlib.Path], 'frame_offset': <class 'int'>, 'num_frames': <class 'int'>, 'normalize': <class 'bool'>, 'channels_first': <class 'bool'>, 'format': typing.Union[str, NoneType]}), ('document', '4de50575ca516b4b7c7c82c7fdec808f'))
+paddle.audio.backends.wave_backend.save (ArgSpec(args=['filepath', 'src', 'sample_rate', 'channels_first', 'compression', 'format', 'encoding', 'bits_per_sample'], varargs=None, varkw=None, defaults=(True, None, None, None, None), kwonlyargs=[], kwonlydefaults=None, annotations={'filepath': <class 'str'>, 'src': <class 'paddle.Tensor'>, 'sample_rate': <class 'int'>, 'channels_first': <class 'bool'>, 'compression': typing.Union[float, NoneType], 'format': typing.Union[str, NoneType], 'encoding': typing.Union[str, NoneType], 'bits_per_sample': typing.Union[int, NoneType]}), ('document', '4c85cfcd29a0dcdfc32e74db8c0c3961'))
+paddle.audio.datasets (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e'))
+paddle.audio.datasets.TESS (ArgSpec(), ('document', '3605f3aa2191ede7ddbe594cd27bb067'))
+paddle.audio.datasets.TESS.meta_info (ArgSpec(), ('document', '60d548a6f71629c3b69bcda3a30d4819'))
diff --git a/python/paddle/audio/__init__.py b/python/paddle/audio/__init__.py
index aaf11b5b2c131..ee768ab6d029c 100644
--- a/python/paddle/audio/__init__.py
+++ b/python/paddle/audio/__init__.py
@@ -14,5 +14,11 @@
 
 from . import features
 from . import functional
+from . import datasets
+from . import backends
 
-__all__ = ["functional", "features"]
+from .backends.backend import info, load, save
+
+__all__ = [
+    "functional", "features", "datasets", "backends", "load", "info", "save"
+]
diff --git a/python/paddle/audio/backends/__init__.py b/python/paddle/audio/backends/__init__.py
new file mode 100644
index 0000000000000..ac19a14c69a01
--- /dev/null
+++ b/python/paddle/audio/backends/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import init_backend
+from .init_backend import get_current_backend  # noqa: F401
+from .init_backend import list_available_backends  # noqa: F401
+from .init_backend import set_backend
+
+init_backend._init_set_audio_backend()  # bind wave_backend's save/load/info as defaults
+
+__all__ = [
+    'get_current_backend',
+    'list_available_backends',
+    'set_backend',
+]
diff --git a/python/paddle/audio/backends/backend.py b/python/paddle/audio/backends/backend.py
new file mode 100644
index 0000000000000..fbfd11d20e0b5
--- /dev/null
+++ b/python/paddle/audio/backends/backend.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import paddle
+
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+
+class AudioInfo:
+    """Plain container for audio file metadata; returned by backend `info`."""
+
+    def __init__(self, sample_rate: int, num_samples: int, num_channels: int,
+                 bits_per_sample: int, encoding: str):
+        self.sample_rate = sample_rate  # frame rate reported by the file
+        self.num_samples = num_samples  # wave_backend passes the frame count
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding  # wave_backend always passes "PCM_S"
+
+
+def info(filepath: str) -> AudioInfo:
+    """Get signal information of input audio file.
+
+    Args:
+        filepath: audio path or file object.
+
+    Returns:
+        AudioInfo: info of the given audio.
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            base_dir = os.getcwd()
+            filepath = os.path.join(base_dir, "test.wav")
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+            wav_info = paddle.audio.info(filepath)
+    """
+    # Stub that carries the API doc; init_backend rebinds `backend.info`.
+    raise NotImplementedError("please set audio backend")
+
+
+def load(filepath: Union[str, Path],
+         frame_offset: int = 0,
+         num_frames: int = -1,
+         normalize: bool = True,
+         channels_first: bool = True) -> Tuple[paddle.Tensor, int]:
+    """Load audio data from file, starting from frame_offset and reading num_frames frames.
+
+    Args:
+        filepath: path of the audio file to load.
+        frame_offset: index of the first frame to read, from 0 to total frames.
+        num_frames: number of frames to read; -1 means the total frames.
+        normalize:
+            if True: return audio normalized to (-1, 1), dtype=float32
+            if False: return the raw, unscaled sample values
+        channels_first:
+            if True: return audio with shape (channels, time), else (time, channels)
+
+    Returns:
+        Tuple[paddle.Tensor, int]: (audio_content, sample rate)
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            base_dir = os.getcwd()
+            filepath = os.path.join(base_dir, "test.wav")
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+            wav_data_read, sr = paddle.audio.load(filepath)
+    """
+    # Stub that carries the API doc; init_backend rebinds `backend.load`.
+    raise NotImplementedError("please set audio backend")
+
+
+def save(
+    filepath: str,
+    src: paddle.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = 16,
+):
+    """
+    Save audio tensor to file.
+
+    Args:
+        filepath: path of the file to write.
+        src: the audio tensor.
+        sample_rate: the number of samples of audio per second.
+        channels_first: layout of src.
+            if True, the input tensor is (channels, time)
+            if False, the input tensor is (time, channels)
+        encoding: encoding format; wave_backend only supports PCM16 now.
+        bits_per_sample: bits per sample; wave_backend only supports 16 bits now.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            filepath = "./test.wav"
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+    """
+    # Stub that carries the API doc; init_backend rebinds `backend.save`.
+    raise NotImplementedError("please set audio backend")
diff --git a/python/paddle/audio/backends/init_backend.py b/python/paddle/audio/backends/init_backend.py
new file mode 100644
index 0000000000000..a066e4e23a64e
--- /dev/null
+++ b/python/paddle/audio/backends/init_backend.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import warnings
+from . import wave_backend
+from . import backend
+from typing import List
+
+import paddle
+
+
+def _check_version(version: str) -> bool:
+    """Return True if `version` is at least 1.0.2 (the minimum paddleaudio).
+
+    Missing components count as 0 ("1.1" -> 1.1.0) and a non-numeric suffix
+    of a component is ignored ("1.0.2rc1" -> 1.0.2).
+    """
+    parts = []
+    for piece in (version.split('.') + ['0', '0'])[:3]:
+        digits = piece[:len(piece) - len(piece.lstrip('0123456789'))]
+        parts.append(int(digits) if digits else 0)
+    return tuple(parts) >= (1, 0, 2)
+
+
+def list_available_backends() -> List[str]:
+    """List available backends: those of paddleaudio (if importable) plus the default backend.
+
+    Returns:
+        List[str]: The list of available backends; "wave_backend" is always the last entry.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            wav_path = "./test.wav"
+
+            current_backend = paddle.audio.backends.get_current_backend()
+            print(current_backend) # wave_backend, the default backend.
+            backends = paddle.audio.backends.list_available_backends()
+            # default backends is ['wave_backend']
+            # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2
+            if 'soundfile' in backends:
+                paddle.audio.backends.set_backend('soundfile')
+
+            paddle.audio.save(wav_path, waveform, sample_rate)
+
+    """
+    try:
+        import paddleaudio
+    except ImportError:
+        # paddleaudio is optional: warn and offer only the built-in backend.
+        warn_msg = (
+            "Failed importing {}. \n"
+            "only wave_backend(only can deal with PCM16 WAV) supportted.\n"
+            "if want soundfile_backend(more audio type suppported),\n"
+            "please manually installed (usually with `pip install \"{}>=1.0.2\"`). "
+        ).format("paddleaudio", "paddleaudio")
+        warnings.warn(warn_msg)
+        return ["wave_backend"]
+
+    version = paddleaudio.__version__
+    if not _check_version(version):
+        err_msg = (
+            "the version of paddleaudio installed is {},\n"
+            "please ensure the paddleaudio >= 1.0.2.").format(version)
+        raise ImportError(err_msg)
+    # Copy so the append below never mutates a list owned by paddleaudio.
+    backends = list(paddleaudio.backends.list_audio_backends())
+    backends.append("wave_backend")
+    return backends
+
+
+def get_current_backend() -> str:
+    """Return the name of the audio backend currently in use.
+
+    Returns:
+        str: the name reported by paddleaudio when paddle.audio.load is
+        bound to paddleaudio.load, otherwise "wave_backend".
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            wav_path = "./test.wav"
+
+            current_backend = paddle.audio.backends.get_current_backend()
+            print(current_backend) # wave_backend, the default backend.
+            backends = paddle.audio.backends.list_available_backends()
+            # default backends is ['wave_backend']
+            # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2
+
+            if 'soundfile' in backends:
+                paddle.audio.backends.set_backend('soundfile')
+
+            paddle.audio.save(wav_path, waveform, sample_rate)
+
+    """
+    # Without paddleaudio loaded, only the built-in backend can be active.
+    if "paddleaudio" not in sys.modules:
+        return "wave_backend"
+    import paddleaudio
+    selected = paddleaudio.backends.get_audio_backend()
+    # paddleaudio is in use only when paddle.audio.load is bound to its load.
+    return selected if paddle.audio.load == paddleaudio.load else "wave_backend"
+
+
+def set_backend(backend_name: str):
+    """Set the audio backend to one of the names from list_available_backends().
+
+    Args:
+        backend_name (str): name of the backend. "wave_backend" is the default; any other name (e.g. "soundfile") is handled by paddleaudio.
+
+    Returns:
+        None. NotImplementedError is raised if backend_name is not available.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            wav_path = "./test.wav"
+
+            current_backend = paddle.audio.backends.get_current_backend()
+            print(current_backend) # wave_backend, the default backend.
+            backends = paddle.audio.backends.list_available_backends()
+            # default backends is ['wave_backend']
+            # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2
+
+            if 'soundfile' in backends:
+                paddle.audio.backends.set_backend('soundfile')
+
+            paddle.audio.save(wav_path, waveform, sample_rate)
+
+    """
+    if backend_name not in list_available_backends():
+        raise NotImplementedError(f"unsupported backend: {backend_name}")
+
+    if backend_name == "wave_backend":
+        module = wave_backend
+    else:
+        import paddleaudio
+        paddleaudio.backends.set_audio_backend(backend_name)
+        module = paddleaudio
+    # Rebind the entry points on both the stub module and paddle.audio.
+    for func in ["save", "load", "info"]:
+        setattr(backend, func, getattr(module, func))
+        setattr(paddle.audio, func, getattr(module, func))
+
+
+def _init_set_audio_backend():
+    # Make wave_backend the default by copying its entry points onto `backend`.
+    for name in ("save", "load", "info"):
+        setattr(backend, name, getattr(wave_backend, name))
diff --git a/python/paddle/audio/backends/wave_backend.py b/python/paddle/audio/backends/wave_backend.py
new file mode 100644
index 0000000000000..66f2d48fe19a5
--- /dev/null
+++ b/python/paddle/audio/backends/wave_backend.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+import wave
+import numpy as np
+from pathlib import Path
+
+from typing import Optional, Tuple, Union
+from .backend import AudioInfo
+
+
+def _error_message():
+    # Hint text used when wave.open rejects a file (only PCM16 WAV is handled).
+    template = (
+        "only PCM16 WAV supportted. \n"
+        "if want support more other audio types, please "
+        "manually installed (usually with `pip install {}`). \n "
+        "and use paddle.audio.backends.set_backend('soundfile') to set audio backend"
+    )
+    return template.format("paddleaudio")
+
+
+def info(filepath: str) -> AudioInfo:
+    """Get signal information of input audio file.
+
+    Args:
+        filepath: audio path or file object; it is closed before returning.
+
+    Returns:
+        AudioInfo: info of the given audio; NotImplementedError if wave.open rejects it.
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            base_dir = os.getcwd()
+            filepath = os.path.join(base_dir, "test.wav")
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+            wav_info = paddle.audio.info(filepath)
+    """
+
+    # Accept either an already-open binary file object or a path to open.
+    file_obj = filepath if hasattr(filepath, 'read') else open(filepath, 'rb')
+
+    # The finally clause closes the handle on every path, including errors
+    # other than wave.Error (e.g. an EOFError on a truncated file).
+    try:
+        try:
+            file_ = wave.open(file_obj)
+        except wave.Error:
+            raise NotImplementedError(_error_message())
+        channels = file_.getnchannels()
+        sample_rate = file_.getframerate()
+        sample_frames = file_.getnframes()  # audio frame
+        bits_per_sample = file_.getsampwidth() * 8
+    finally:
+        file_obj.close()
+
+    # wave_backend only handles PCM WAV, so the encoding is fixed.
+    encoding = "PCM_S"
+    return AudioInfo(sample_rate, sample_frames, channels, bits_per_sample,
+                     encoding)
+
+
+def load(filepath: Union[str, Path],
+         frame_offset: int = 0,
+         num_frames: int = -1,
+         normalize: bool = True,
+         channels_first: bool = True) -> Tuple[paddle.Tensor, int]:
+    """Load audio data from file, starting from frame_offset and reading num_frames frames.
+
+    Args:
+        filepath: audio path or file object; it is closed before returning.
+        frame_offset: index of the first frame to return, from 0 to total frames.
+        num_frames: number of frames to return; -1 means all frames from frame_offset on.
+        normalize: if True, scale samples to [-1, 1); if False, keep raw values (float32 either way).
+        channels_first: if True, return shape (channels, time); otherwise (time, channels).
+
+    Returns:
+        Tuple[paddle.Tensor, int]: (audio_content, sample rate)
+
+    Raises:
+        NotImplementedError: if wave.open rejects the file (wave.Error).
+
+    Examples:
+        .. code-block:: python
+
+            import os
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            base_dir = os.getcwd()
+            filepath = os.path.join(base_dir, "test.wav")
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+            wav_data_read, sr = paddle.audio.load(filepath)
+    """
+    # Accept either an already-open binary file object or a path to open.
+    file_obj = filepath if hasattr(filepath, 'read') else open(filepath, 'rb')
+
+    # The finally clause closes the handle on every path, including errors.
+    try:
+        try:
+            file_ = wave.open(file_obj)
+        except wave.Error:
+            raise NotImplementedError(_error_message())
+        channels = file_.getnchannels()
+        sample_rate = file_.getframerate()
+        frames = file_.getnframes()  # audio frame
+        audio_content = file_.readframes(frames)
+    finally:
+        file_obj.close()
+
+    # default_subtype = "PCM_16", only support PCM16 WAV
+    audio_as_np16 = np.frombuffer(audio_content, dtype=np.int16)
+    audio_as_np32 = audio_as_np16.astype(np.float32)
+    if normalize:
+        # scale the int16 range to [-1, 1); dtype stays float32
+        audio_norm = audio_as_np32 / (2**15)
+    else:
+        # raw sample values, still stored as float32
+        audio_norm = audio_as_np32
+
+    waveform = np.reshape(audio_norm, (frames, channels))
+    # frame_offset applies even when num_frames is -1, which means
+    # "read to the end" (an open-ended slice).
+    stop = None if num_frames == -1 else frame_offset + num_frames
+    waveform = waveform[frame_offset:stop, :]
+
+    waveform = paddle.to_tensor(waveform)
+    if channels_first:
+        # (time, channels) -> (channels, time)
+        waveform = paddle.transpose(waveform, perm=[1, 0])
+    return waveform, sample_rate
+
+
+def save(
+    filepath: str,
+    src: paddle.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = 16,
+):
+    """
+    Save a 2D audio tensor to a 16-bit PCM WAV file.
+
+    Args:
+        filepath: path of the file to write.
+        src: the audio tensor; float32 data is scaled by 2**15, other dtypes are written as-is.
+        sample_rate: the number of samples of audio per second.
+        channels_first: layout of src.
+            if True, the input tensor is (channels, time)
+            if False, the input tensor is (time, channels)
+        encoding: audio encoding format; unused here, wave_backend only supports PCM16 now.
+        bits_per_sample: bits per sample; only 16 (or None, meaning 16) is supported now.
+
+    Returns:
+        None. ValueError is raised if bits_per_sample is neither None nor 16.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            sample_rate = 16000
+            wav_duration = 0.5
+            num_channels = 1
+            num_frames = sample_rate * wav_duration
+            wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
+            waveform = wav_data.tile([num_channels, 1])
+            filepath = "./test.wav"
+
+            paddle.audio.save(filepath, waveform, sample_rate)
+    """
+    assert src.ndim == 2, "Expected 2D tensor"
+
+    audio_numpy = src.numpy()
+
+    # change src shape to (time, channels)
+    if channels_first:
+        audio_numpy = np.transpose(audio_numpy)
+
+    channels = audio_numpy.shape[1]
+
+    # only support PCM16; None means "use the default", i.e. 16 bits
+    if bits_per_sample not in (None, 16):
+        raise ValueError("Invalid bits_per_sample, only supprt 16 bit")
+    sample_width = 2  # bytes per sample for 16-bit PCM
+
+    if src.dtype == paddle.float32:
+        # clip before the cast so +1.0 saturates instead of overflowing int16
+        scaled = np.clip(audio_numpy * (2**15), -(2**15), 2**15 - 1)
+        audio_numpy = scaled.astype("<h")
+    with wave.open(filepath, 'w') as f:
+        f.setnchannels(channels)
+        f.setsampwidth(sample_width)
+        f.setframerate(sample_rate)
+        f.writeframes(audio_numpy.tobytes())
diff --git a/python/paddle/audio/datasets/__init__.py b/python/paddle/audio/datasets/__init__.py
new file mode 100644
index 0000000000000..56d176ba4ef3a
--- /dev/null
+++ b/python/paddle/audio/datasets/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .esc50 import ESC50
+from .tess import TESS
+
+__all__ = ["ESC50", "TESS"]
diff --git a/python/paddle/audio/datasets/dataset.py b/python/paddle/audio/datasets/dataset.py
new file mode 100644
index 0000000000000..67fda01f3fde9
--- /dev/null
+++ b/python/paddle/audio/datasets/dataset.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import paddle
+
+from ..features import MelSpectrogram
+from ..features import Spectrogram
+from ..features import MFCC
+from ..features import LogMelSpectrogram
+
+feat_funcs = {  # supported ``feat_type`` names -> feature extractor class
+    'raw': None,  # no extractor: the loaded waveform itself is the feature
+    'melspectrogram': MelSpectrogram,
+    'mfcc': MFCC,
+    'logmelspectrogram': LogMelSpectrogram,
+    'spectrogram': Spectrogram
+}
+
+
+class AudioClassificationDataset(paddle.io.Dataset):
+    """
+    Base class of audio classification datasets.
+    """
+
+    def __init__(self,
+                 files: List[str],
+                 labels: List[int],
+                 feat_type: str = 'raw',
+                 sample_rate: int = None,
+                 **kwargs):
+        """
+        Args:
+            files (:obj:`List[str]`): A list of absolute paths of audio files.
+            labels (:obj:`List[int]`): Labels of the audio files, index-aligned with `files`.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                The feature type to extract from each audio file; must be a key of `feat_funcs`.
+            sample_rate (:obj:`int`, `optional`): Initial value of `self.sample_rate`.
+            **kwargs: Keyword arguments forwarded to the feature extractor constructor.
+        """
+        super().__init__()
+
+        if feat_type not in feat_funcs:
+            raise RuntimeError(
+                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
+            )
+
+        self.files, self.labels = files, labels
+        self.feat_type = feat_type
+        self.sample_rate = sample_rate
+        self.feat_config = kwargs  # extractor options, applied to every sample
+
+    def _get_data(self, input_file: str):
+        """Hook for subclasses to gather their data; not implemented here."""
+        raise NotImplementedError
+
+    def _convert_to_record(self, idx):
+        """Load sample `idx` and return a dict with its 'feat' and 'label'."""
+        path, label = self.files[idx], self.labels[idx]
+        waveform, sample_rate = paddle.audio.load(path)
+        self.sample_rate = sample_rate  # overwritten with the rate of this file
+        extractor_cls = feat_funcs[self.feat_type]
+
+        if len(waveform.shape) == 2:
+            waveform = waveform.squeeze(0)  # 1D input
+        waveform = paddle.to_tensor(waveform, dtype=paddle.float32)
+        if extractor_cls is None:
+            return {'feat': waveform, 'label': label}
+
+        batched = waveform.unsqueeze(0)  # (batch_size, T)
+        if self.feat_type != 'spectrogram':  # the only one built without `sr`
+            extractor = extractor_cls(sr=self.sample_rate, **self.feat_config)
+        else:
+            extractor = extractor_cls(**self.feat_config)
+        feat = extractor(batched).squeeze(0)
+        return {'feat': feat, 'label': label}
+
+    def __getitem__(self, idx):
+        """Return the `(feat, label)` pair of sample `idx`."""
+        record = self._convert_to_record(idx)
+        return record['feat'], record['label']
+
+    def __len__(self):
+        """Return the number of audio files."""
+        return len(self.files)
diff --git a/python/paddle/audio/datasets/esc50.py b/python/paddle/audio/datasets/esc50.py
new file mode 100644
index 0000000000000..f702fe518facb
--- /dev/null
+++ b/python/paddle/audio/datasets/esc50.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from paddle.utils import download
+from paddle.dataset.common import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['ESC50']
+
+
+class ESC50(AudioClassificationDataset):
+    """
+    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
+    suitable for benchmarking methods of environmental sound classification. The dataset
+    consists of 5-second-long recordings organized into 50 semantical classes (with
+    40 examples per class)
+
+    Reference:
+        ESC: Dataset for Environmental Sound Classification
+        http://dx.doi.org/10.1145/2733373.2806390
+
+    Args:
+       mode (str, optional): It identifies the dataset mode (train or dev). Default:train.
+       split (int, optional): It specifies the fold used as the dev dataset. Default:1.
+       feat_type (str, optional): It identifies the feature type that user wants to extract from an audio file. Default:raw.
+       archive(dict, optional): It tells where to download the audio archive (keys 'url' and 'md5'). Default:None.
+
+    Returns:
+        :ref:`api_paddle_io_Dataset`. An instance of ESC50 dataset.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            mode = 'dev'
+            esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='raw')
+            for idx in range(5):
+                audio, label = esc50_dataset[idx]
+                # do something with audio, label
+                print(audio.shape, label)
+                # [audio_data_length] , label_id
+
+            esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='mfcc',
+                                                    n_mfcc=40)
+            for idx in range(5):
+                audio, label = esc50_dataset[idx]
+                # do something with mfcc feature, label
+                print(audio.shape, label)
+                # [feature_dim, length] , label_id
+    """
+
+    archive = {
+        'url': 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
+        'md5': '7771e4b9d86d0945acce719c7a59305a',
+    }
+
+    label_list = [
+        # Animals
+        'Dog',
+        'Rooster',
+        'Pig',
+        'Cow',
+        'Frog',
+        'Cat',
+        'Hen',
+        'Insects (flying)',
+        'Sheep',
+        'Crow',
+        # Natural soundscapes & water sounds
+        'Rain',
+        'Sea waves',
+        'Crackling fire',
+        'Crickets',
+        'Chirping birds',
+        'Water drops',
+        'Wind',
+        'Pouring water',
+        'Toilet flush',
+        'Thunderstorm',
+        # Human, non-speech sounds
+        'Crying baby',
+        'Sneezing',
+        'Clapping',
+        'Breathing',
+        'Coughing',
+        'Footsteps',
+        'Laughing',
+        'Brushing teeth',
+        'Snoring',
+        'Drinking, sipping',
+        # Interior/domestic sounds
+        'Door knock',
+        'Mouse click',
+        'Keyboard typing',
+        'Door, wood creaks',
+        'Can opening',
+        'Washing machine',
+        'Vacuum cleaner',
+        'Clock alarm',
+        'Clock tick',
+        'Glass breaking',
+        # Exterior/urban noises
+        'Helicopter',
+        'Chainsaw',
+        'Siren',
+        'Car horn',
+        'Engine',
+        'Train',
+        'Church bells',
+        'Airplane',
+        'Fireworks',
+        'Hand saw',
+    ]
+    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO',
+        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
+    audio_path = os.path.join('ESC-50-master', 'audio')
+
+    def __init__(self,
+                 mode: str = 'train',
+                 split: int = 1,
+                 feat_type: str = 'raw',
+                 archive=None,
+                 **kwargs):
+        # A caller-supplied archive shadows the class-level default on this instance.
+        if archive is not None:
+            self.archive = archive
+        files, labels = self._get_data(mode, split)
+        super().__init__(files=files,
+                         labels=labels,
+                         feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self) -> List[collections.namedtuple]:
+        """Parse the metadata CSV (header row skipped) into `meta_info` records."""
+        meta_file = os.path.join(DATA_HOME, self.meta)
+        with open(meta_file, 'r') as rf:
+            rows = rf.readlines()[1:]
+        return [self.meta_info(*row.strip().split(',')) for row in rows]
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        """Download the archive if needed, then list the files/labels of `mode`."""
+        audio_dir = os.path.join(DATA_HOME, self.audio_path)
+        meta_file = os.path.join(DATA_HOME, self.meta)
+        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
+            download.get_path_from_url(self.archive['url'],
+                                       DATA_HOME,
+                                       self.archive['md5'],
+                                       decompress=True)
+
+        # Fold `split` is held out as the dev set: 'train' takes every other
+        # fold, and any other `mode` value takes only the held-out fold.
+        want_train = mode == 'train'
+        files = []
+        labels = []
+        for sample in self._get_meta_info():
+            filename, fold, target, _, _, _, _ = sample
+            in_dev_fold = int(fold) == split
+            if want_train != in_dev_fold:
+                files.append(os.path.join(audio_dir, filename))
+                labels.append(int(target))
+
+        return files, labels
diff --git a/python/paddle/audio/datasets/tess.py b/python/paddle/audio/datasets/tess.py
new file mode 100644
index 0000000000000..0f375aa2b0172
--- /dev/null
+++ b/python/paddle/audio/datasets/tess.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from paddle.utils import download
+from paddle.dataset.common import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['TESS']
+
+
+class TESS(AudioClassificationDataset):
+    """
+    TESS is a set of 200 target words spoken in the carrier phrase
+    "Say the word _____" by two actresses (aged 26 and 64 years) and
+    recordings were made of the set portraying each of seven emotions(anger,
+    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
+    There are 2800 stimuli in total.
+
+    Reference:
+        Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487
+        https://doi.org/10.5683/SP2/E8H2MF
+
+    Args:
+       mode (str, optional): It identifies the dataset mode (train or dev). Defaults to train.
+       n_folds (int, optional): Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. Defaults to 5.
+       split (int, optional): It specifies the fold used as the dev dataset. Defaults to 1.
+       feat_type (str, optional): It identifies the feature type that user wants to extract from an audio file. Defaults to raw.
+       archive(dict): It tells where to download the audio archive (keys 'url' and 'md5'). Defaults to None.
+
+    Returns:
+        :ref:`api_paddle_io_Dataset`. An instance of TESS dataset.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            mode = 'dev'
+            tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                    feat_type='raw')
+            for idx in range(5):
+                audio, label = tess_dataset[idx]
+                # do something with audio, label
+                print(audio.shape, label)
+                # [audio_data_length] , label_id
+
+            tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                    feat_type='mfcc',
+                                                    n_mfcc=40)
+            for idx in range(5):
+                audio, label = tess_dataset[idx]
+                # do something with mfcc feature, label
+                print(audio.shape, label)
+                # [feature_dim, num_frames] , label_id
+    """
+
+    archive = {
+        'url':
+        'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
+        'md5': '1465311b24d1de704c4c63e4ccc470c7',
+    }
+
+    label_list = [
+        'angry',
+        'disgust',
+        'fear',
+        'happy',
+        'neutral',
+        'ps',  # pleasant surprise
+        'sad',
+    ]
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('speaker', 'word', 'emotion'))
+    audio_path = 'TESS_Toronto_emotional_speech_set'
+
+    def __init__(self,
+                 mode='train',
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 archive=None,
+                 **kwargs):
+        """
+        Select the `mode` part of the fold partition; see the class docstring.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        if archive is not None:
+            self.archive = archive
+        files, labels = self._get_data(mode, n_folds, split)
+        super(TESS, self).__init__(files=files,
+                                   labels=labels,
+                                   feat_type=feat_type,
+                                   **kwargs)
+
+    def _get_meta_info(self, files) -> List[collections.namedtuple]:
+        """Parse `<speaker>_<word>_<emotion>` file names into `meta_info` records."""
+        ret = []
+        for file in files:
+            stem = os.path.splitext(os.path.basename(file))[0]  # drop extension
+            ret.append(self.meta_info(*stem.split('_')))
+        return ret
+
+    def _get_data(self, mode, n_folds, split) -> Tuple[List[str], List[int]]:
+        """Download the archive if needed, then list the files/labels of `mode`."""
+        data_dir = os.path.join(DATA_HOME, self.audio_path)
+        if not os.path.isdir(data_dir):
+            download.get_path_from_url(self.archive['url'],
+                                       DATA_HOME,
+                                       self.archive['md5'],
+                                       decompress=True)
+
+        # NOTE(review): os.walk order is filesystem-dependent, so fold membership
+        # may differ between machines; sorting `wav_files` would make it stable.
+        wav_files = []
+        for root, _, files in os.walk(data_dir):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+        meta_info = self._get_meta_info(wav_files)
+
+        files, labels = [], []
+        # max(1, ...): with fewer files than folds the quotient is 0 (zero divisor).
+        n_samples_per_fold = max(1, len(meta_info) // n_folds)
+        for idx, sample in enumerate(meta_info):
+            _, _, emotion = sample
+            target = self.label_list.index(emotion)
+            fold = idx // n_samples_per_fold + 1
+            # 'train' keeps every fold except `split`; other modes keep only it.
+            if (mode == 'train') != (fold == split):
+                files.append(wav_files[idx])
+                labels.append(target)
+
+        return files, labels
diff --git a/python/paddle/tests/test_audio_backend.py b/python/paddle/tests/test_audio_backend.py
new file mode 100644
index 0000000000000..79e793e2dc865
--- /dev/null
+++ b/python/paddle/tests/test_audio_backend.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import soundfile
+import numpy as np
+import os
+import paddle.audio
+
+
+class TestAudioBackends(unittest.TestCase):
+
+    def setUp(self):
+        self.initParmas()
+
+    def initParmas(self):
+        """Build the (1, 8000) float32 waveform used by test_backend."""
+        def get_wav_data(dtype: str, num_channels: int, num_frames: int):
+            dtype_ = getattr(paddle, dtype)
+            base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1
+            data = base.tile([num_channels, 1])
+            return data
+
+        self.duration = 0.5
+        self.num_channels = 1
+        self.sr = 16000
+        self.dtype = "float32"
+        self.window_size = 1024
+        waveform_tensor = get_wav_data(self.dtype,
+                                       self.num_channels,
+                                       num_frames=int(self.duration * self.sr))
+        # shape (1, 8000)
+        self.waveform = waveform_tensor.numpy()
+
+    def test_backend(self):
+        """Round-trip a waveform through save/info/load and switch backends."""
+        wave_wav_path = os.path.join(os.getcwd(), "wave_test.wav")
+        paddle.audio.save(wave_wav_path,
+                          paddle.to_tensor(self.waveform),
+                          self.sr,
+                          channels_first=True)
+
+        # assertEqual, not assertTrue(a, b): the latter only uses b as a message
+        wav_info = paddle.audio.info(wave_wav_path)
+        self.assertEqual(wav_info.sample_rate, self.sr)
+        self.assertEqual(wav_info.num_channels, self.num_channels)
+        self.assertEqual(wav_info.bits_per_sample, 16)
+
+        with open(wave_wav_path, 'rb') as file_:
+            wav_info = paddle.audio.info(file_)
+            self.assertEqual(wav_info.sample_rate, self.sr)
+            self.assertEqual(wav_info.num_channels, self.num_channels)
+            self.assertEqual(wav_info.bits_per_sample, 16)
+
+        # test backends(wave_backend) load & save
+        wav_data, sr = paddle.audio.load(wave_wav_path)
+        np.testing.assert_array_almost_equal(wav_data, self.waveform, decimal=4)
+        with soundfile.SoundFile(wave_wav_path, "r") as file_:
+            dtype = "float32"
+            frames = file_._prepare_read(0, None, -1)
+            waveform = file_.read(frames, dtype, always_2d=True)
+            waveform = waveform.T
+            np.testing.assert_array_almost_equal(wav_data, waveform)
+
+        with open(wave_wav_path, 'rb') as file_:
+            wav_data, sr = paddle.audio.load(file_,
+                                             normalize=False,
+                                             num_frames=10000)
+        with soundfile.SoundFile(wave_wav_path, "r") as file_:
+            dtype = "int16"
+            frames = file_._prepare_read(0, None, -1)
+            waveform = file_.read(frames, dtype, always_2d=True)
+            waveform = waveform.T
+            np.testing.assert_array_almost_equal(wav_data, waveform)
+
+        current_backend = paddle.audio.backends.get_current_backend()
+        self.assertIn(current_backend, ["wave_backend", "soundfile"])
+
+        paddle.audio.backends.set_backend("wave_backend")
+
+        backends = paddle.audio.backends.list_available_backends()
+        for backend in backends:
+            self.assertIn(backend, ["wave_backend", "soundfile"])
+
+        # Unknown backend name: NotImplementedError is tolerated, not required.
+        try:
+            paddle.audio.backends.set_backend("jfiji")
+        except NotImplementedError:
+            pass
+
+        try:
+            import paddleaudio
+            backends = paddle.audio.backends.list_available_backends()
+            for backend in backends:
+                self.assertIn(backend, ["wave_backend", "soundfile"])
+            current_backend = paddle.audio.backends.get_current_backend()
+            self.assertEqual(current_backend, "wave_backend")
+            paddleaudio.backends.set_audio_backend("soundfile")
+            paddle.audio.backends.set_backend("soundfile")
+            current_backend = paddle.audio.backends.get_current_backend()
+            self.assertEqual(current_backend, "soundfile")
+            wav_info = paddle.audio.info(wave_wav_path)
+            self.assertEqual(wav_info.sample_rate, self.sr)
+            self.assertEqual(wav_info.num_channels, self.num_channels)
+            self.assertEqual(wav_info.bits_per_sample, 16)
+            paddle.audio.backends.set_backend("wave_backend")
+        except ImportError:
+            pass
+
+        # Only 16-bit PCM is supported, so another bit depth must be rejected;
+        # assertRaises also fails the test when no error is raised at all.
+        with self.assertRaises(ValueError):
+            paddle.audio.save(wave_wav_path,
+                              paddle.to_tensor(self.waveform),
+                              self.sr,
+                              bits_per_sample=24,
+                              channels_first=True)
+
+        # save() asserts a 2D input, so the extra leading axis added here
+        # (making the tensor 3D) must trigger an AssertionError.
+        with self.assertRaises(AssertionError):
+            paddle.audio.save(wave_wav_path,
+                              paddle.to_tensor(self.waveform).unsqueeze(0),
+                              self.sr)
+
+        fake_data = np.array([0, 1, 2, 3, 4, 6], np.float32)
+        soundfile.write(wave_wav_path, fake_data, 1, subtype="DOUBLE")
+        try:
+            paddle.audio.info(wave_wav_path)
+        except NotImplementedError:
+            pass
+        try:
+            paddle.audio.load(wave_wav_path)
+        except NotImplementedError:
+            pass
+
+        if os.path.exists(wave_wav_path):
+            os.remove(wave_wav_path)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tests/test_audio_datasets.py b/python/paddle/tests/test_audio_datasets.py
new file mode 100644
index 0000000000000..59ba1d543bda6
--- /dev/null
+++ b/python/paddle/tests/test_audio_datasets.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import paddle
+import itertools
+from parameterized import parameterized
+
+
+def parameterize(*params):  # one test case per combination of the given lists
+    return parameterized.expand(list(itertools.product(*params)))
+
+
+class TestAudioDatasets(unittest.TestCase):
+
+    @parameterize(["dev", "train"], [40, 64])
+    def test_tess_dataset(self, mode: str, params: int):
+        """
+        TESS dataset: check the leading feature dimension and label range.
+        Reference:
+            Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487
+            https://doi.org/10.5683/SP2/E8H2MF
+        """
+        archive = {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set_lite.zip',
+            'md5': '9ffb5e3adf28d4d6b787fa94bd59b975',
+        }  # small part of TESS dataset for test.
+        tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                  feat_type='mfcc',
+                                                  n_mfcc=params,
+                                                  archive=archive)
+        idx = np.random.randint(0, 30)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        tess_dataset = paddle.audio.datasets.TESS(mode=mode,
+                                                  feat_type='spectrogram',
+                                                  n_fft=params)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params // 2 + 1)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        tess_dataset = paddle.audio.datasets.TESS(mode="dev",
+                                                  feat_type='logmelspectrogram',
+                                                  n_mels=params)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+        tess_dataset = paddle.audio.datasets.TESS(mode="dev",
+                                                  feat_type='melspectrogram',
+                                                  n_mels=params)
+        elem = tess_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 6)
+
+    @parameterize(["dev", "train"], [40, 64])
+    def test_esc50_dataset(self, mode: str, params: int):
+        """
+        ESC50 dataset: check the leading feature dimension and label range.
+        Reference:
+            ESC: Dataset for Environmental Sound Classification
+            http://dx.doi.org/10.1145/2733373.2806390
+        """
+        archive = {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/ESC-50-master-lite.zip',
+            'md5': '1e9ba53265143df5b2804a743f2d1956',
+        }  # small part of ESC50 dataset for test.
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='raw',
+                                                    archive=archive)
+        idx = np.random.randint(0, 6)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], 220500)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='mfcc',
+                                                    n_mfcc=params,
+                                                    archive=archive)
+        idx = np.random.randint(0, 6)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='spectrogram',
+                                                    n_fft=params)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params // 2 + 1)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(
+            mode=mode, feat_type='logmelspectrogram', n_mels=params)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+        esc50_dataset = paddle.audio.datasets.ESC50(mode=mode,
+                                                    feat_type='melspectrogram',
+                                                    n_mels=params)
+        elem = esc50_dataset[idx]
+        self.assertEqual(elem[0].shape[0], params)
+        self.assertTrue(0 <= elem[1] <= 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 4b2128a96755f..648d7089e7e10 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -373,6 +373,8 @@ packages=['paddle',
           'paddle.audio',
 	  'paddle.audio.functional',
 	  'paddle.audio.features',
+	  'paddle.audio.datasets',
+	  'paddle.audio.backends',
           'paddle.text',
           'paddle.text.datasets',
           'paddle.incubate',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 9a280c691aaad..f3c5f708b8478 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -221,17 +221,47 @@ def process_module(m, attr="__all__"):
 
 def check_public_api():
     modulelist = [  #npqa
-        paddle, paddle.amp, paddle.nn, paddle.nn.functional,
-        paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn,
-        paddle.io, paddle.jit, paddle.metric, paddle.distribution,
-        paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text,
-        paddle.utils, paddle.utils.download, paddle.utils.profiler,
-        paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision,
-        paddle.vision.datasets, paddle.vision.models, paddle.vision.transforms,
-        paddle.vision.ops, paddle.distributed, paddle.distributed.fleet,
-        paddle.distributed.fleet.utils, paddle.distributed.parallel,
-        paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd,
-        paddle.incubate, paddle.inference, paddle.onnx, paddle.device
+        paddle,
+        paddle.amp,
+        paddle.nn,
+        paddle.nn.functional,
+        paddle.nn.initializer,
+        paddle.nn.utils,
+        paddle.static,
+        paddle.static.nn,
+        paddle.io,
+        paddle.jit,
+        paddle.metric,
+        paddle.distribution,
+        paddle.optimizer,
+        paddle.optimizer.lr,
+        paddle.regularizer,
+        paddle.text,
+        paddle.utils,
+        paddle.utils.download,
+        paddle.utils.profiler,
+        paddle.utils.cpp_extension,
+        paddle.sysconfig,
+        paddle.vision,
+        paddle.vision.datasets,
+        paddle.vision.models,
+        paddle.vision.transforms,
+        paddle.vision.ops,
+        paddle.distributed,
+        paddle.distributed.fleet,
+        paddle.distributed.fleet.utils,
+        paddle.distributed.parallel,
+        paddle.distributed.utils,
+        paddle.callbacks,
+        paddle.hub,
+        paddle.autograd,
+        paddle.incubate,
+        paddle.inference,
+        paddle.onnx,
+        paddle.device,
+        paddle.audio,
+        paddle.audio.backends,
+        paddle.audio.datasets,
     ]
 
     apinum = 0