diff --git a/datasets/audiofolder/README.md b/datasets/audiofolder/README.md new file mode 100644 index 00000000000..7ab03192318 --- /dev/null +++ b/datasets/audiofolder/README.md @@ -0,0 +1,4 @@ + +### Contributions + +Thanks to [@polinaeterna](https://github.com/polinaeterna), [@nateraw](https://github.com/nateraw), [@lhoestq](https://github.com/lhoestq) and [@mariosasko](https://github.com/mariosasko) for adding this dataset. diff --git a/datasets/audiofolder/dummy/0.0.0/dummy_data.zip b/datasets/audiofolder/dummy/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..b8afccba2fa Binary files /dev/null and b/datasets/audiofolder/dummy/0.0.0/dummy_data.zip differ diff --git a/docs/source/audio_load.mdx b/docs/source/audio_load.mdx index 8bad47e1a78..2af31c32e9d 100644 --- a/docs/source/audio_load.mdx +++ b/docs/source/audio_load.mdx @@ -55,3 +55,99 @@ If you only want to load the underlying path to the audio dataset without decodi 'transcription': 'I would like to set up a joint account with my partner'} ``` +## AudioFolder + +You can also load a dataset with an `AudioFolder` dataset builder. It does not require writing a custom dataloader, making it useful for quickly loading audio data. + +## AudioFolder with metadata + +To link your audio files with metadata information, make sure your dataset has a `metadata.jsonl` file. Your dataset structure might look like: + +``` +folder/train/metadata.jsonl +folder/train/first_audio_file.mp3 +folder/train/second_audio_file.mp3 +folder/train/third_audio_file.mp3 +``` + +Your `metadata.jsonl` file must have a `file_name` column which links audio files with their metadata. An example `metadata.jsonl` file might look like: + +```python +{"file_name": "first_audio_file.mp3", "transcription": "znowu się duch z ciałem zrośnie w młodocianej wstaniesz wiosnie i możesz skutkiem tych leków umierać wstawać wiek wieków dalej tam były przestrogi jak siekać głowę jak nogi"} +{"file_name": "second_audio_file.mp3", "transcription": "już u źwierzyńca podwojów król zasiada przy nim książęta i panowie rada a gdzie wzniosły krążył ganek rycerze obok kochanek król skinął palcem zaczęto igrzysko"} +{"file_name": "third_audio_file.mp3", "transcription": "pewnie kędyś w obłędzie ubite minęły szlaki zaczekajmy dzień jaki poślemy szukać wszędzie dziś jutro pewnie będzie posłali wszędzie sługi czekali dzień i drugi gdy nic nie doczekali z płaczem chcą jechać dali"} +``` + +Load your audio dataset by specifying `audiofolder` and the directory containing your data in `data_dir`: + +```py +>>> from datasets import load_dataset + +>>> dataset = load_dataset("audiofolder", data_dir="/path/to/folder") +``` + +`AudioFolder` will load audio data and create a `transcription` column containing texts from `metadata.jsonl`: + +```py +>>> dataset["train"][0] +{'audio': + {'path': '/path/to/extracted/audio/first_audio_file.mp3', + 'array': array([ 0.00088501, 0.0012207 , 0.00131226, ..., -0.00045776, -0.00054932, -0.00054932], dtype=float32), + 'sampling_rate': 16000}, + 'transcription': 'znowu się duch z ciałem zrośnie w młodocianej wstaniesz wiosnie i możesz skutkiem tych leków umierać wstawać wiek wieków dalej tam były przestrogi jak siekać głowę jak nogi' +} +``` + +You can load remote datasets from their URLs with the `data_files` parameter: + +```py +>>> dataset = load_dataset("audiofolder", data_files="https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.01/v0.01_test.tar.gz") +``` + +## AudioFolder with labels + +If your data directory doesn't contain any 
metadata files, `AudioFolder` automatically adds a `label` column of [`~datasets.features.ClassLabel`] type, with labels based on the directory name.
+This can be useful for audio classification tasks.
+
+### Language identification
+
+Language identification datasets have audio recordings of speech in multiple languages:
+
+```
+folder/train/ar/0197_720_0207_190.mp3
+folder/train/ar/0179_830_0185_540.mp3
+
+folder/train/zh/0442_690_0454_380.mp3
+```
+
+As there are no metadata files, `AudioFolder` will create a `label` column with the language id based on the directory name:
+
+```
+>>> dataset = load_dataset("audiofolder", data_dir="/path/to/folder", drop_labels=False)
+>>> dataset["train"][0]
+{'audio':
+    {'path': '/path/to/extracted/audio/0197_720_0207_190.mp3',
+     'array': array([-3.6621094e-04, -6.1035156e-05, 6.1035156e-05, ..., -5.1879883e-04, -1.0070801e-03, -7.6293945e-04], dtype=float32),
+     'sampling_rate': 16000},
+ 'label': 0 # "ar"
+}
+
+>>> dataset["train"][-1]
+{'audio':
+    {'path': '/path/to/extracted/audio/0442_690_0454_380.mp3',
+     'array': array([1.8920898e-03, 9.4604492e-04, 1.9226074e-03, ..., 9.1552734e-05, 1.8310547e-04, 6.1035156e-05], dtype=float32),
+     'sampling_rate': 16000},
+ 'label': 99 # "zh"
+}
+```
+
+If you have metadata files inside your data directory but still want to infer labels from directory names, set `drop_labels=False` as defined in [`~datasets.packaged_modules.audiofolder.AudioFolderConfig`].
+
+<Tip>
+
+Alternatively, you can add a `label` column to your `metadata.jsonl` file.
+
+</Tip>
+
+If you have no metadata files and want to drop automatically created labels, set `drop_labels=True`. In this case, your dataset will contain only an `audio` column.
diff --git a/docs/source/image_load.mdx b/docs/source/image_load.mdx
index fdfc50a35be..7596a094bc8 100644
--- a/docs/source/image_load.mdx
+++ b/docs/source/image_load.mdx
@@ -47,7 +47,7 @@ If you only want to load the underlying path to the image dataset without decodi
 ## ImageFolder
-You can also load a dataset with a `ImageFolder` dataset builder. It does not require writing a custom dataloader, making it useful for quickly loading a dataset for certain vision tasks. Your image dataset structure should look like this:
+You can also load a dataset with an `ImageFolder` dataset builder. It does not require writing a custom dataloader, making it useful for quickly loading a dataset for certain vision tasks. 
Your image dataset structure should look like this: ``` folder/train/dog/golden_retriever.png diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx index 66508e3dbf1..bc478ffe777 100644 --- a/docs/source/package_reference/loading_methods.mdx +++ b/docs/source/package_reference/loading_methods.mdx @@ -68,3 +68,7 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t") ### Images [[autodoc]] datasets.packaged_modules.imagefolder.ImageFolderConfig + +### Audio + +[[autodoc]] datasets.packaged_modules.audiofolder.AudioFolderConfig diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index be5d1669b31..deabca1f35a 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -3,6 +3,7 @@ from hashlib import sha256 from typing import List +from .audiofolder import audiofolder from .csv import csv from .imagefolder import imagefolder from .json import json @@ -32,6 +33,7 @@ def _hash_python_lines(lines: List[str]) -> str: "parquet": (parquet.__name__, _hash_python_lines(inspect.getsource(parquet).splitlines())), "text": (text.__name__, _hash_python_lines(inspect.getsource(text).splitlines())), "imagefolder": (imagefolder.__name__, _hash_python_lines(inspect.getsource(imagefolder).splitlines())), + "audiofolder": (audiofolder.__name__, _hash_python_lines(inspect.getsource(audiofolder).splitlines())), } _EXTENSION_TO_MODULE = { @@ -42,7 +44,8 @@ def _hash_python_lines(lines: List[str]) -> str: "parquet": ("parquet", {}), "txt": ("text", {}), } -_EXTENSION_TO_MODULE.update({ext[1:]: ("imagefolder", {}) for ext in imagefolder.ImageFolder.IMAGE_EXTENSIONS}) -_EXTENSION_TO_MODULE.update({ext[1:].upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.IMAGE_EXTENSIONS}) - -_MODULE_SUPPORTS_METADATA = {"imagefolder"} +_EXTENSION_TO_MODULE.update({ext[1:]: ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext[1:].upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext[1:]: ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext[1:].upper(): ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) +_MODULE_SUPPORTS_METADATA = {"imagefolder", "audiofolder"} diff --git a/src/datasets/packaged_modules/audiofolder/__init__.py b/src/datasets/packaged_modules/audiofolder/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/packaged_modules/audiofolder/audiofolder.py b/src/datasets/packaged_modules/audiofolder/audiofolder.py new file mode 100644 index 00000000000..ab90d8de378 --- /dev/null +++ b/src/datasets/packaged_modules/audiofolder/audiofolder.py @@ -0,0 +1,66 @@ +from typing import List + +import datasets + +from ..folder_based_builder import folder_based_builder + + +logger = datasets.utils.logging.get_logger(__name__) + + +class AudioFolderConfig(folder_based_builder.FolderBasedBuilderConfig): + """Builder Config for AudioFolder.""" + + drop_labels: bool = None + drop_metadata: bool = None + + +class AudioFolder(folder_based_builder.FolderBasedBuilder): + BASE_FEATURE = datasets.Audio() + BASE_COLUMN_NAME = "audio" + BUILDER_CONFIG_CLASS = AudioFolderConfig + EXTENSIONS: List[str] # definition at the bottom of the script + + +# Obtained with: +# ``` +# import soundfile as sf +# +# AUDIO_EXTENSIONS = [f".{format.lower()}" for format in 
sf.available_formats().keys()]
+#
+# # .mp3 is currently decoded via `torchaudio`; .opus decoding is supported if the `libsndfile` version is >= 1.0.30:
+# AUDIO_EXTENSIONS.extend([".mp3", ".opus"])
+# ```
+# We intentionally do not run this code on launch because:
+# (1) Soundfile is an optional dependency, so importing it in the global namespace is not allowed
+# (2) To ensure the list of supported extensions is deterministic
+AUDIO_EXTENSIONS = [
+    ".aiff",
+    ".au",
+    ".avr",
+    ".caf",
+    ".flac",
+    ".htk",
+    ".svx",
+    ".mat4",
+    ".mat5",
+    ".mpc2k",
+    ".ogg",
+    ".paf",
+    ".pvf",
+    ".raw",
+    ".rf64",
+    ".sd2",
+    ".sds",
+    ".ircam",
+    ".voc",
+    ".w64",
+    ".wav",
+    ".nist",
+    ".wavex",
+    ".wve",
+    ".xi",
+    ".mp3",
+    ".opus",
+]
+AudioFolder.EXTENSIONS = AUDIO_EXTENSIONS
diff --git a/src/datasets/packaged_modules/folder_based_builder/__init__.py b/src/datasets/packaged_modules/folder_based_builder/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
new file mode 100644
index 00000000000..379019ae083
--- /dev/null
+++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -0,0 +1,391 @@
+import collections
+import itertools
+import os
+from dataclasses import dataclass
+from typing import Any, List, Optional, Tuple
+
+import pyarrow.compute as pc
+import pyarrow.json as paj
+
+import datasets
+
+
+logger = datasets.utils.logging.get_logger(__name__)
+
+
+if datasets.config.PYARROW_VERSION.major >= 7:
+
+    def pa_table_to_pylist(table):
+        return table.to_pylist()
+
+else:
+
+    def pa_table_to_pylist(table):
+        keys = table.column_names
+        values = table.to_pydict().values()
+        return [{k: v for k, v in zip(keys, row_values)} for row_values in zip(*values)]
+
+
+def count_path_segments(path):
+    cnt = 0
+    while True:
+        parts = os.path.split(path)
+        if parts[0] == path:
+            break
+        elif parts[1] == path:
+            break
+        else:
+            path = parts[0]
+            cnt += 1
+    return cnt
+
+
+@dataclass
+class FolderBasedBuilderConfig(datasets.BuilderConfig):
+    """BuilderConfig for FolderBasedBuilder."""
+
+    features: Optional[datasets.Features] = None
+    drop_labels: bool = None
+    drop_metadata: bool = None
+
+
+class FolderBasedBuilder(datasets.GeneratorBasedBuilder):
+    """
+    Base class for generic data loaders for image and audio data.
+
+    Abstract class attributes to be overridden by a child class:
+        BASE_FEATURE: feature object to decode data (e.g. datasets.Image(), datasets.Audio(), ...)
+        BASE_COLUMN_NAME: string key name of a base feature (e.g. "image", "audio", ...)
+ BUILDER_CONFIG_CLASS: builder config inherited from `folder_based_builder.FolderBasedBuilderConfig` + EXTENSIONS: list of allowed extensions (only files with these extensions and METADATA_FILENAME files + will be included in a dataset) + """ + + BASE_FEATURE: Any + BASE_COLUMN_NAME: str + BUILDER_CONFIG_CLASS: FolderBasedBuilderConfig + EXTENSIONS: List[str] + + SKIP_CHECKSUM_COMPUTATION_BY_DEFAULT: bool = True + METADATA_FILENAME: str = "metadata.jsonl" + + def _info(self): + return datasets.DatasetInfo(features=self.config.features) + + def _split_generators(self, dl_manager): + if not self.config.data_files: + raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") + + # Do an early pass if: + # * `drop_labels` is None (default) or False, to infer the class labels + # * `drop_metadata` is None (default) or False, to find the metadata files + do_analyze = not self.config.drop_labels or not self.config.drop_metadata + labels = set() + metadata_files = collections.defaultdict(set) + + def analyze(files_or_archives, downloaded_files_or_dirs, split): + if len(downloaded_files_or_dirs) == 0: + return + # The files are separated from the archives at this point, so check the first sample + # to see if it's a file or a directory and iterate accordingly + if os.path.isfile(downloaded_files_or_dirs[0]): + original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs + for original_file, downloaded_file in zip(original_files, downloaded_files): + original_file, downloaded_file = str(original_file), str(downloaded_file) + _, original_file_ext = os.path.splitext(original_file) + if original_file_ext.lower() in self.EXTENSIONS: + if not self.config.drop_labels: + labels.add(os.path.basename(os.path.dirname(original_file))) + elif os.path.basename(original_file) == self.METADATA_FILENAME: + metadata_files[split].add((original_file, downloaded_file)) + else: + original_file_name = os.path.basename(original_file) + logger.debug( + f"The file '{original_file_name}' was ignored: it is not an {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAME} either." + ) + else: + archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs + for archive, downloaded_dir in zip(archives, downloaded_dirs): + archive, downloaded_dir = str(archive), str(downloaded_dir) + for downloaded_dir_file in dl_manager.iter_files(downloaded_dir): + _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file) + if downloaded_dir_file_ext in self.EXTENSIONS: + if not self.config.drop_labels: + labels.add(os.path.basename(os.path.dirname(downloaded_dir_file))) + elif os.path.basename(downloaded_dir_file) == self.METADATA_FILENAME: + metadata_files[split].add((None, downloaded_dir_file)) + else: + archive_file_name = os.path.basename(archive) + original_file_name = os.path.basename(downloaded_dir_file) + logger.debug( + f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAME} either." 
+ ) + + data_files = self.config.data_files + splits = [] + for split_name, files in data_files.items(): + if isinstance(files, str): + files = [files] + files, archives = self._split_files_and_archives(files) + downloaded_files = dl_manager.download(files) + downloaded_dirs = dl_manager.download_and_extract(archives) + if do_analyze: # drop_metadata is None or False, drop_labels is None or False + logger.info(f"Searching for labels and/or metadata files in {split_name} data files...") + analyze(files, downloaded_files, split_name) + analyze(archives, downloaded_dirs, split_name) + + if metadata_files: + # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False + add_metadata = not (self.config.drop_metadata is True) + # if `metadata_files` are found, add labels only if + # `drop_labels` is set up to False explicitly (not-default behavior) + add_labels = self.config.drop_labels is False + else: + # if `metadata_files` are not found, don't add metadata + add_metadata = False + # if `metadata_files` are not found but `drop_labels` is None (default) or False, add them + add_labels = not (self.config.drop_labels is True) + + if add_labels: + logger.info("Adding the labels inferred from data directories to the dataset's features...") + if add_metadata: + logger.info("Adding metadata to the dataset...") + else: + add_labels, add_metadata, metadata_files = False, False, {} + + splits.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + "files": [(file, downloaded_file) for file, downloaded_file in zip(files, downloaded_files)] + + [(None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs], + "metadata_files": metadata_files, + "split_name": split_name, + "add_labels": add_labels, + "add_metadata": add_metadata, + }, + ) + ) + + if add_metadata: + # Verify that: + # * all metadata files have the same set of features + # * the `file_name` key is one of the metadata keys and is of type string + features_per_metadata_file: List[Tuple[str, datasets.Features]] = [] + for _, downloaded_metadata_file in itertools.chain.from_iterable(metadata_files.values()): + with open(downloaded_metadata_file, "rb") as f: + pa_metadata_table = paj.read_json(f) + features_per_metadata_file.append( + (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema)) + ) + for downloaded_metadata_file, metadata_features in features_per_metadata_file: + if metadata_features != features_per_metadata_file[0][1]: + raise ValueError( + f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}" + ) + metadata_features = features_per_metadata_file[0][1] + if "file_name" not in metadata_features: + raise ValueError("`file_name` must be present as dictionary key in metadata files") + if metadata_features["file_name"] != datasets.Value("string"): + raise ValueError("`file_name` key must be a string") + del metadata_features["file_name"] + else: + metadata_features = None + + # Normally, we would do this in _info, but we need to know the labels and/or metadata + # before building the features + if self.config.features is None: + if add_labels: + self.info.features = datasets.Features( + { + self.BASE_COLUMN_NAME: self.BASE_FEATURE, + "label": datasets.ClassLabel(names=sorted(labels)), + } + ) + else: + self.info.features = datasets.Features({self.BASE_COLUMN_NAME: self.BASE_FEATURE}) + + if add_metadata: + # Warn if there are 
duplicated keys in metadata compared to the existing features
+                # (`BASE_COLUMN_NAME`, optionally "label")
+                duplicated_keys = set(self.info.features) & set(metadata_features)
+                if duplicated_keys:
+                    logger.warning(
+                        f"Ignoring metadata columns {list(duplicated_keys)} as they are already present in "
+                        f"the features dictionary."
+                    )
+                # skip metadata duplicated keys
+                self.info.features.update(
+                    {
+                        feature: metadata_features[feature]
+                        for feature in metadata_features
+                        if feature not in duplicated_keys
+                    }
+                )
+
+        return splits
+
+    def _split_files_and_archives(self, data_files):
+        files, archives = [], []
+        for data_file in data_files:
+            _, data_file_ext = os.path.splitext(data_file)
+            if data_file_ext.lower() in self.EXTENSIONS:
+                files.append(data_file)
+            elif os.path.basename(data_file) == self.METADATA_FILENAME:
+                files.append(data_file)
+            else:
+                archives.append(data_file)
+        return files, archives
+
+    def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
+        split_metadata_files = metadata_files.get(split_name, [])
+        sample_empty_metadata = (
+            {k: None for k in self.info.features if k != self.BASE_COLUMN_NAME} if self.info.features else {}
+        )
+        last_checked_dir = None
+        metadata_dir = None
+        metadata_dict = None
+        downloaded_metadata_file = None
+
+        file_idx = 0
+        for original_file, downloaded_file_or_dir in files:
+            if original_file is not None:
+                _, original_file_ext = os.path.splitext(original_file)
+                if original_file_ext.lower() in self.EXTENSIONS:
+                    if add_metadata:
+                        # If the file has one of the supported extensions, and we've just entered a new directory,
+                        # find the nearest metadata file (by counting path segments) for the directory
+                        current_dir = os.path.dirname(original_file)
+                        if last_checked_dir is None or last_checked_dir != current_dir:
+                            last_checked_dir = current_dir
+                            metadata_file_candidates = [
+                                (
+                                    os.path.relpath(original_file, os.path.dirname(metadata_file_candidate)),
+                                    metadata_file_candidate,
+                                    downloaded_metadata_file,
+                                )
+                                for metadata_file_candidate, downloaded_metadata_file in split_metadata_files
+                                if metadata_file_candidate
+                                is not None  # ignore metadata_files that are inside archives
+                                and not os.path.relpath(
+                                    original_file, os.path.dirname(metadata_file_candidate)
+                                ).startswith("..")
+                            ]
+                            if metadata_file_candidates:
+                                _, metadata_file, downloaded_metadata_file = min(
+                                    metadata_file_candidates, key=lambda x: count_path_segments(x[0])
+                                )
+                                with open(downloaded_metadata_file, "rb") as f:
+                                    pa_metadata_table = paj.read_json(f)
+                                pa_file_name_array = pa_metadata_table["file_name"]
+                                pa_file_name_array = pc.replace_substring(
+                                    pa_file_name_array, pattern="\\", replacement="/"
+                                )
+                                pa_metadata_table = pa_metadata_table.drop(["file_name"])
+                                metadata_dir = os.path.dirname(metadata_file)
+                                metadata_dict = {
+                                    file_name: sample_metadata
+                                    for file_name, sample_metadata in zip(
+                                        pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
+                                    )
+                                }
+                            else:
+                                raise ValueError(
+                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
+                                )
+                        if metadata_dir is not None and downloaded_metadata_file is not None:
+                            file_relpath = os.path.relpath(original_file, metadata_dir)
+                            file_relpath = file_relpath.replace("\\", "/")
+                            if file_relpath not in metadata_dict:
+                                raise ValueError(
+                                    f"{self.BASE_COLUMN_NAME} at {file_relpath} doesn't have metadata in {downloaded_metadata_file}."
+ ) + sample_metadata = metadata_dict[file_relpath] + else: + raise ValueError( + f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." + ) + else: + sample_metadata = {} + if add_labels: + sample_label = {"label": os.path.basename(os.path.dirname(original_file))} + else: + sample_label = {} + yield file_idx, { + **sample_empty_metadata, + self.BASE_COLUMN_NAME: downloaded_file_or_dir, + **sample_metadata, + **sample_label, + } + file_idx += 1 + else: + for downloaded_dir_file in downloaded_file_or_dir: + _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file) + if downloaded_dir_file_ext.lower() in self.EXTENSIONS: + if add_metadata: + current_dir = os.path.dirname(downloaded_dir_file) + if last_checked_dir is None or last_checked_dir != current_dir: + last_checked_dir = current_dir + metadata_file_candidates = [ + ( + os.path.relpath( + downloaded_dir_file, os.path.dirname(downloaded_metadata_file) + ), + metadata_file_candidate, + downloaded_metadata_file, + ) + for metadata_file_candidate, downloaded_metadata_file in split_metadata_files + if metadata_file_candidate + is None # ignore metadata_files that are not inside archives + and not os.path.relpath( + downloaded_dir_file, os.path.dirname(downloaded_metadata_file) + ).startswith("..") + ] + if metadata_file_candidates: + _, metadata_file, downloaded_metadata_file = min( + metadata_file_candidates, key=lambda x: count_path_segments(x[0]) + ) + with open(downloaded_metadata_file, "rb") as f: + pa_metadata_table = paj.read_json(f) + pa_file_name_array = pa_metadata_table["file_name"] + pa_file_name_array = pc.replace_substring( + pa_file_name_array, pattern="\\", replacement="/" + ) + pa_metadata_table = pa_metadata_table.drop(["file_name"]) + metadata_dir = os.path.dirname(downloaded_metadata_file) + metadata_dict = { + file_name: sample_metadata + for file_name, sample_metadata in zip( + pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) + ) + } + else: + raise ValueError( + f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." + ) + if metadata_dir is not None and downloaded_metadata_file is not None: + downloaded_dir_file_relpath = os.path.relpath(downloaded_dir_file, metadata_dir) + downloaded_dir_file_relpath = downloaded_dir_file_relpath.replace("\\", "/") + if downloaded_dir_file_relpath not in metadata_dict: + raise ValueError( + f"{self.BASE_COLUMN_NAME} at {downloaded_dir_file_relpath} doesn't have metadata in {downloaded_metadata_file}." + ) + sample_metadata = metadata_dict[downloaded_dir_file_relpath] + else: + raise ValueError( + f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." 
+ ) + else: + sample_metadata = {} + if add_labels: + sample_label = {"label": os.path.basename(os.path.dirname(downloaded_dir_file))} + else: + sample_label = {} + yield file_idx, { + **sample_empty_metadata, + self.BASE_COLUMN_NAME: downloaded_dir_file, + **sample_metadata, + **sample_label, + } + file_idx += 1 diff --git a/src/datasets/packaged_modules/imagefolder/imagefolder.py b/src/datasets/packaged_modules/imagefolder/imagefolder.py index 1a7e1c9ad49..d4e25866750 100644 --- a/src/datasets/packaged_modules/imagefolder/imagefolder.py +++ b/src/datasets/packaged_modules/imagefolder/imagefolder.py @@ -1,373 +1,25 @@ -import collections -import itertools -import os -from dataclasses import dataclass -from typing import List, Optional, Tuple - -import pyarrow.compute as pc -import pyarrow.json as paj +from typing import List import datasets -from datasets.tasks import ImageClassification - - -logger = datasets.utils.logging.get_logger(__name__) - - -if datasets.config.PYARROW_VERSION.major >= 7: - - def pa_table_to_pylist(table): - return table.to_pylist() -else: +from ..folder_based_builder import folder_based_builder - def pa_table_to_pylist(table): - keys = table.column_names - values = table.to_pydict().values() - return [{k: v for k, v in zip(keys, row_values)} for row_values in zip(*values)] - -def count_path_segments(path): - cnt = 0 - while True: - parts = os.path.split(path) - if parts[0] == path: - break - elif parts[1] == path: - break - else: - path = parts[0] - cnt += 1 - return cnt +logger = datasets.utils.logging.get_logger(__name__) -@dataclass -class ImageFolderConfig(datasets.BuilderConfig): +class ImageFolderConfig(folder_based_builder.FolderBasedBuilderConfig): """BuilderConfig for ImageFolder.""" - features: Optional[datasets.Features] = None drop_labels: bool = None drop_metadata: bool = None -class ImageFolder(datasets.GeneratorBasedBuilder): +class ImageFolder(folder_based_builder.FolderBasedBuilder): + BASE_FEATURE = datasets.Image() + BASE_COLUMN_NAME = "image" BUILDER_CONFIG_CLASS = ImageFolderConfig - - IMAGE_EXTENSIONS: List[str] = [] # definition at the bottom of the script - SKIP_CHECKSUM_COMPUTATION_BY_DEFAULT = True - METADATA_FILENAME: str = "metadata.jsonl" - - def _info(self): - return datasets.DatasetInfo(features=self.config.features) - - def _split_generators(self, dl_manager): - if not self.config.data_files: - raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") - - # Do an early pass if: - # * `drop_labels` is None (default) or False, to infer the class labels - # * `drop_metadata` is None (default) or False, to find the metadata files - do_analyze = not self.config.drop_labels or not self.config.drop_metadata - labels = set() - metadata_files = collections.defaultdict(set) - - def analyze(files_or_archives, downloaded_files_or_dirs, split): - if len(downloaded_files_or_dirs) == 0: - return - # The files are separated from the archives at this point, so check the first sample - # to see if it's a file or a directory and iterate accordingly - if os.path.isfile(downloaded_files_or_dirs[0]): - original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs - for original_file, downloaded_file in zip(original_files, downloaded_files): - original_file, downloaded_file = str(original_file), str(downloaded_file) - _, original_file_ext = os.path.splitext(original_file) - if original_file_ext.lower() in self.IMAGE_EXTENSIONS: - if not self.config.drop_labels: - 
labels.add(os.path.basename(os.path.dirname(original_file))) - elif os.path.basename(original_file) == self.METADATA_FILENAME: - metadata_files[split].add((original_file, downloaded_file)) - else: - original_file_name = os.path.basename(original_file) - logger.debug( - f"The file '{original_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either." - ) - else: - archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs - for archive, downloaded_dir in zip(archives, downloaded_dirs): - archive, downloaded_dir = str(archive), str(downloaded_dir) - for downloaded_dir_file in dl_manager.iter_files(downloaded_dir): - _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file) - if downloaded_dir_file_ext in self.IMAGE_EXTENSIONS: - if not self.config.drop_labels: - labels.add(os.path.basename(os.path.dirname(downloaded_dir_file))) - elif os.path.basename(downloaded_dir_file) == self.METADATA_FILENAME: - metadata_files[split].add((None, downloaded_dir_file)) - else: - archive_file_name = os.path.basename(archive) - original_file_name = os.path.basename(downloaded_dir_file) - logger.debug( - f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either." - ) - - data_files = self.config.data_files - splits = [] - for split_name, files in data_files.items(): - if isinstance(files, str): - files = [files] - files, archives = self._split_files_and_archives(files) - downloaded_files = dl_manager.download(files) - downloaded_dirs = dl_manager.download_and_extract(archives) - if do_analyze: # drop_metadata is None or False, drop_labels is None or False - logger.info(f"Searching for labels and/or metadata files in {split_name} data files...") - analyze(files, downloaded_files, split_name) - analyze(archives, downloaded_dirs, split_name) - - if metadata_files: - # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False - add_metadata = not (self.config.drop_metadata is True) - # if `metadata_files` are found, add labels only if - # `drop_labels` is set up to False explicitly (not-default behavior) - add_labels = self.config.drop_labels is False - else: - # if `metadata_files` are not found, don't add metadata - add_metadata = False - # if `metadata_files` are not found but `drop_labels` is None (default) or False, add them - add_labels = not (self.config.drop_labels is True) - - if add_labels: - logger.info("Adding the labels inferred from data directories to the dataset's features...") - if add_metadata: - logger.info("Adding metadata to the dataset...") - else: - add_labels, add_metadata, metadata_files = False, False, {} - - splits.append( - datasets.SplitGenerator( - name=split_name, - gen_kwargs={ - "files": [(file, downloaded_file) for file, downloaded_file in zip(files, downloaded_files)] - + [(None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs], - "metadata_files": metadata_files, - "split_name": split_name, - "add_labels": add_labels, - "add_metadata": add_metadata, - }, - ) - ) - - if add_metadata: - # Verify that: - # * all metadata files have the same set of features - # * the `file_name` key is one of the metadata keys and is of type string - features_per_metadata_file: List[Tuple[str, datasets.Features]] = [] - for _, downloaded_metadata_file in itertools.chain.from_iterable(metadata_files.values()): - with open(downloaded_metadata_file, "rb") as f: - pa_metadata_table = paj.read_json(f) - 
features_per_metadata_file.append( - (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema)) - ) - for downloaded_metadata_file, metadata_features in features_per_metadata_file: - if metadata_features != features_per_metadata_file[0][1]: - raise ValueError( - f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}" - ) - metadata_features = features_per_metadata_file[0][1] - if "file_name" not in metadata_features: - raise ValueError("`file_name` must be present as dictionary key in metadata files") - if metadata_features["file_name"] != datasets.Value("string"): - raise ValueError("`file_name` key must be a string") - del metadata_features["file_name"] - else: - metadata_features = None - - # Normally, we would do this in _info, but we need to know the labels and/or metadata - # before building the features - if self.config.features is None: - if add_labels: - self.info.features = datasets.Features( - {"image": datasets.Image(), "label": datasets.ClassLabel(names=sorted(labels))} - ) - task_template = ImageClassification(image_column="image", label_column="label") - task_template = task_template.align_with_features(self.info.features) - self.info.task_templates = [task_template] - else: - self.info.features = datasets.Features({"image": datasets.Image()}) - - if add_metadata: - # Warn if there are duplicated keys in metadata compared to the existing features ("image", optionally "label") - duplicated_keys = set(self.info.features) & set(metadata_features) - if duplicated_keys: - logger.warning( - f"Ignoring metadata columns {list(duplicated_keys)} as they are already present in " - f"the features dictionary." - ) - # skip metadata duplicated keys - self.info.features.update( - { - feature: metadata_features[feature] - for feature in metadata_features - if feature not in duplicated_keys - } - ) - - return splits - - def _split_files_and_archives(self, data_files): - files, archives = [], [] - for data_file in data_files: - _, data_file_ext = os.path.splitext(data_file) - if data_file_ext.lower() in self.IMAGE_EXTENSIONS: - files.append(data_file) - elif os.path.basename(data_file) == self.METADATA_FILENAME: - files.append(data_file) - else: - archives.append(data_file) - return files, archives - - def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels): - split_metadata_files = metadata_files.get(split_name, []) - image_empty = {k: None for k in self.info.features if k != "image"} if self.info.features else {} - last_checked_dir = None - metadata_dir = None - metadata_dict = None - downloaded_metadata_file = None - - file_idx = 0 - for original_file, downloaded_file_or_dir in files: - if original_file is not None: - _, original_file_ext = os.path.splitext(original_file) - if original_file_ext.lower() in self.IMAGE_EXTENSIONS: - if add_metadata: - # If the file is an image, and we've just entered a new directory, - # find the nereast metadata file (by counting path segments) for the directory - current_dir = os.path.dirname(original_file) - if last_checked_dir is None or last_checked_dir != current_dir: - last_checked_dir = current_dir - metadata_file_candidates = [ - ( - os.path.relpath(original_file, os.path.dirname(metadata_file_candidate)), - metadata_file_candidate, - downloaded_metadata_file, - ) - for metadata_file_candidate, downloaded_metadata_file in split_metadata_files - if metadata_file_candidate - is not 
None # ignore metadata_files that are inside archives - and not os.path.relpath( - original_file, os.path.dirname(metadata_file_candidate) - ).startswith("..") - ] - if metadata_file_candidates: - _, metadata_file, downloaded_metadata_file = min( - metadata_file_candidates, key=lambda x: count_path_segments(x[0]) - ) - with open(downloaded_metadata_file, "rb") as f: - pa_metadata_table = paj.read_json(f) - pa_file_name_array = pa_metadata_table["file_name"] - pa_file_name_array = pc.replace_substring( - pa_file_name_array, pattern="\\", replacement="/" - ) - pa_metadata_table = pa_metadata_table.drop(["file_name"]) - metadata_dir = os.path.dirname(metadata_file) - metadata_dict = { - file_name: image_metadata - for file_name, image_metadata in zip( - pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) - ) - } - else: - raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." - ) - if metadata_dir is not None and downloaded_metadata_file is not None: - file_relpath = os.path.relpath(original_file, metadata_dir) - file_relpath = file_relpath.replace("\\", "/") - if file_relpath not in metadata_dict: - raise ValueError( - f"Image at {file_relpath} doesn't have metadata in {downloaded_metadata_file}." - ) - image_metadata = metadata_dict[file_relpath] - else: - raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." - ) - else: - image_metadata = {} - if add_labels: - image_label = {"label": os.path.basename(os.path.dirname(original_file))} - else: - image_label = {} - yield file_idx, {**image_empty, "image": downloaded_file_or_dir, **image_metadata, **image_label} - file_idx += 1 - else: - for downloaded_dir_file in downloaded_file_or_dir: - _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file) - if downloaded_dir_file_ext.lower() in self.IMAGE_EXTENSIONS: - if add_metadata: - current_dir = os.path.dirname(downloaded_dir_file) - if last_checked_dir is None or last_checked_dir != current_dir: - last_checked_dir = current_dir - metadata_file_candidates = [ - ( - os.path.relpath( - downloaded_dir_file, os.path.dirname(downloaded_metadata_file) - ), - metadata_file_candidate, - downloaded_metadata_file, - ) - for metadata_file_candidate, downloaded_metadata_file in split_metadata_files - if metadata_file_candidate - is None # ignore metadata_files that are not inside archives - and not os.path.relpath( - downloaded_dir_file, os.path.dirname(downloaded_metadata_file) - ).startswith("..") - ] - if metadata_file_candidates: - _, metadata_file, downloaded_metadata_file = min( - metadata_file_candidates, key=lambda x: count_path_segments(x[0]) - ) - with open(downloaded_metadata_file, "rb") as f: - pa_metadata_table = paj.read_json(f) - pa_file_name_array = pa_metadata_table["file_name"] - pa_file_name_array = pc.replace_substring( - pa_file_name_array, pattern="\\", replacement="/" - ) - pa_metadata_table = pa_metadata_table.drop(["file_name"]) - metadata_dir = os.path.dirname(downloaded_metadata_file) - metadata_dict = { - file_name: image_metadata - for file_name, image_metadata in zip( - pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) - ) - } - else: - raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." 
- ) - if metadata_dir is not None and downloaded_metadata_file is not None: - downloaded_dir_file_relpath = os.path.relpath(downloaded_dir_file, metadata_dir) - downloaded_dir_file_relpath = downloaded_dir_file_relpath.replace("\\", "/") - if downloaded_dir_file_relpath not in metadata_dict: - raise ValueError( - f"Image at {downloaded_dir_file_relpath} doesn't have metadata in {downloaded_metadata_file}." - ) - image_metadata = metadata_dict[downloaded_dir_file_relpath] - else: - raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." - ) - else: - image_metadata = {} - if add_labels: - image_label = {"label": os.path.basename(os.path.dirname(downloaded_dir_file))} - else: - image_label = {} - yield file_idx, { - **image_empty, - "image": downloaded_dir_file, - **image_metadata, - **image_label, - } - file_idx += 1 + EXTENSIONS: List[str] # definition at the bottom of the script # Obtained with: @@ -382,7 +34,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad # We intentionally do not run this code on launch because: # (1) Pillow is an optional dependency, so importing Pillow in global namespace is not allowed # (2) To ensure the list of supported extensions is deterministic -ImageFolder.IMAGE_EXTENSIONS = [ +IMAGE_EXTENSIONS = [ ".blp", ".bmp", ".dib", @@ -447,3 +99,4 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad ".xbm", ".xpm", ] +ImageFolder.EXTENSIONS = IMAGE_EXTENSIONS diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py index cd25f0153f5..26d7982b53b 100644 --- a/src/datasets/streaming.py +++ b/src/datasets/streaming.py @@ -52,7 +52,8 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str, Args: module_path: Path to the module to be extended. - use_auth_token: Whether to use authentication token. + use_auth_token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. + If True, will get token from `"~/.huggingface"`. """ module = importlib.import_module(module_path) @@ -100,8 +101,6 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"): Args: builder (:class:`DatasetBuilder`): Dataset builder instance. - use_auth_token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. - If True, will get token from `"~/.huggingface"`. 
""" # this extends the open and os.path.join functions for data streaming extend_module_for_streaming(builder.__module__, use_auth_token=builder.use_auth_token) @@ -112,3 +111,16 @@ def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"): internal_import_name = imports[1] internal_module_name = ".".join(builder.__module__.split(".")[:-1] + [internal_import_name]) extend_module_for_streaming(internal_module_name, use_auth_token=builder.use_auth_token) + + # builders can inherit from other builders that might use streaming functionality + # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation) + # but these parents builders are not patched automatically as they are not instantiated, so we patch them here + from .builder import DatasetBuilder + + parent_builder_modules = [ + cls.__module__ + for cls in type(builder).__mro__[1:] # make sure it's not the same module we've already patched + if issubclass(cls, DatasetBuilder) and cls.__module__ != DatasetBuilder.__module__ + ] # check it's not a standard builder from datasets.builder + for module in parent_builder_modules: + extend_module_for_streaming(module, use_auth_token=builder.use_auth_token) diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py index 46e0d07dfa7..a6502468549 100644 --- a/tests/fixtures/files.py +++ b/tests/fixtures/files.py @@ -472,6 +472,11 @@ def image_file(): return os.path.join("tests", "features", "data", "test_image_rgb.jpg") +@pytest.fixture(scope="session") +def audio_file(): + return os.path.join("tests", "features", "data", "test_audio_44100.wav") + + @pytest.fixture(scope="session") def zip_image_path(image_file, tmp_path_factory): path = tmp_path_factory.mktemp("data") / "dataset.img.zip" diff --git a/tests/packaged_modules/test_audiofolder.py b/tests/packaged_modules/test_audiofolder.py new file mode 100644 index 00000000000..25a0f8141fe --- /dev/null +++ b/tests/packaged_modules/test_audiofolder.py @@ -0,0 +1,444 @@ +import shutil +import textwrap + +import librosa +import numpy as np +import pytest +import soundfile as sf + +from datasets import Audio, ClassLabel, Features, Value +from datasets.data_files import DataFilesDict, get_data_patterns_locally +from datasets.download.streaming_download_manager import StreamingDownloadManager +from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder + +from ..utils import require_sndfile + + +@pytest.fixture +def cache_dir(tmp_path): + return str(tmp_path / "audiofolder_cache_dir") + + +@pytest.fixture +def data_files_with_labels_no_metadata(tmp_path, audio_file): + data_dir = tmp_path / "data_files_with_labels_no_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + subdir_class_0 = data_dir / "fr" + subdir_class_0.mkdir(parents=True, exist_ok=True) + # data dirs can be nested but audiofolder should care only about the last part of the path: + subdir_class_1 = data_dir / "subdir" / "uk" + subdir_class_1.mkdir(parents=True, exist_ok=True) + + audio_filename = subdir_class_0 / "audio_fr.wav" + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = subdir_class_1 / "audio_uk.wav" + shutil.copyfile(audio_file, audio_filename2) + + data_files_with_labels_no_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(str(data_dir)), str(data_dir) + ) + + return data_files_with_labels_no_metadata + + +@pytest.fixture +def audio_files_with_labels_and_duplicated_label_key_in_metadata(tmp_path, audio_file): + data_dir = tmp_path / 
"audio_files_with_labels_and_label_key_in_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + subdir_class_0 = data_dir / "fr" + subdir_class_0.mkdir(parents=True, exist_ok=True) + subdir_class_1 = data_dir / "uk" + subdir_class_1.mkdir(parents=True, exist_ok=True) + + audio_filename = subdir_class_0 / "audio_fr.wav" + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = subdir_class_1 / "audio_uk.wav" + shutil.copyfile(audio_file, audio_filename2) + + audio_metadata_filename = tmp_path / data_dir / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "fr/audio_fr.wav", "text": "Audio in French", "label": "Fr"} + {"file_name": "uk/audio_uk.wav", "text": "Audio in Ukrainian", "label": "Uk"} + """ + ) + with open(audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + + return str(audio_filename), str(audio_filename2), str(audio_metadata_filename) + + +@pytest.fixture +def audio_file_with_metadata(tmp_path, audio_file): + audio_filename = tmp_path / "audio_file.wav" + shutil.copyfile(audio_file, audio_filename) + audio_metadata_filename = tmp_path / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file.wav", "text": "Audio transcription"} + """ + ) + with open(audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + return str(audio_filename), str(audio_metadata_filename) + + +@pytest.fixture +def audio_files_with_metadata_that_misses_one_audio(tmp_path, audio_file): + audio_filename = tmp_path / "audio_file.wav" + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = tmp_path / "audio_file2.wav" + shutil.copyfile(audio_file, audio_filename2) + audio_metadata_filename = tmp_path / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file.wav", "text": "Audio transcription"} + """ + ) + with open(audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + return str(audio_filename), str(audio_filename2), str(audio_metadata_filename) + + +@pytest.fixture +def data_files_with_one_split_and_metadata(tmp_path, audio_file): + data_dir = tmp_path / "audiofolder_data_dir_with_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + subdir = data_dir / "subdir" + subdir.mkdir(parents=True, exist_ok=True) + + audio_filename = data_dir / "audio_file.wav" + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = data_dir / "audio_file2.wav" + shutil.copyfile(audio_file, audio_filename2) + audio_filename3 = subdir / "audio_file3.wav" # in subdir + shutil.copyfile(audio_file, audio_filename3) + + audio_metadata_filename = data_dir / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file.wav", "text": "First audio transcription"} + {"file_name": "audio_file2.wav", "text": "Second audio transcription"} + {"file_name": "subdir/audio_file3.wav", "text": "Third audio transcription (in subdir)"} + """ + ) + with open(audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + data_files_with_one_split_and_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(str(data_dir)), str(data_dir) + ) + assert len(data_files_with_one_split_and_metadata) == 1 + assert len(data_files_with_one_split_and_metadata["train"]) == 4 + return data_files_with_one_split_and_metadata + + +@pytest.fixture +def data_files_with_two_splits_and_metadata(tmp_path, audio_file): + data_dir = tmp_path / "audiofolder_data_dir_with_metadata" + data_dir.mkdir(parents=True, 
exist_ok=True) + train_dir = data_dir / "train" + train_dir.mkdir(parents=True, exist_ok=True) + test_dir = data_dir / "test" + test_dir.mkdir(parents=True, exist_ok=True) + + audio_filename = train_dir / "audio_file.wav" # train audio + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = train_dir / "audio_file2.wav" # train audio + shutil.copyfile(audio_file, audio_filename2) + audio_filename3 = test_dir / "audio_file3.wav" # test audio + shutil.copyfile(audio_file, audio_filename3) + + train_audio_metadata_filename = train_dir / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file.wav", "text": "First train audio transcription"} + {"file_name": "audio_file2.wav", "text": "Second train audio transcription"} + """ + ) + with open(train_audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + test_audio_metadata_filename = test_dir / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file3.wav", "text": "Test audio transcription"} + """ + ) + with open(test_audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + data_files_with_two_splits_and_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(str(data_dir)), str(data_dir) + ) + assert len(data_files_with_two_splits_and_metadata) == 2 + assert len(data_files_with_two_splits_and_metadata["train"]) == 3 + assert len(data_files_with_two_splits_and_metadata["test"]) == 2 + return data_files_with_two_splits_and_metadata + + +@pytest.fixture +def data_files_with_zip_archives(tmp_path, audio_file): + data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives" + data_dir.mkdir(parents=True, exist_ok=True) + archive_dir = data_dir / "archive" + archive_dir.mkdir(parents=True, exist_ok=True) + subdir = archive_dir / "subdir" + subdir.mkdir(parents=True, exist_ok=True) + + audio_filename = archive_dir / "audio_file.wav" + shutil.copyfile(audio_file, audio_filename) + audio_filename2 = subdir / "audio_file2.wav" # in subdir + # make sure they're two different audios + # Indeed we won't be able to compare the audio filenames, since the archive is not extracted in streaming mode + array, sampling_rate = librosa.load(str(audio_filename), sr=16000) # original sampling rate is 44100 + sf.write(str(audio_filename2), array, samplerate=16000) + + audio_metadata_filename = archive_dir / "metadata.jsonl" + audio_metadata = textwrap.dedent( + """\ + {"file_name": "audio_file.wav", "text": "First audio transcription"} + {"file_name": "subdir/audio_file2.wav", "text": "Second audio transcription (in subdir)"} + """ + ) + + with open(audio_metadata_filename, "w", encoding="utf-8") as f: + f.write(audio_metadata) + + shutil.make_archive(str(archive_dir), "zip", archive_dir) + shutil.rmtree(str(archive_dir)) + + data_files_with_zip_archives = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(str(data_dir)), str(data_dir) + ) + + assert len(data_files_with_zip_archives) == 1 + assert len(data_files_with_zip_archives["train"]) == 1 + return data_files_with_zip_archives + + +@require_sndfile +# check that labels are inferred correctly from dir names +def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir): + # there are no metadata.jsonl files in this test case + audiofolder = AudioFolder(data_files=data_files_with_labels_no_metadata, cache_dir=cache_dir, drop_labels=False) + audiofolder.download_and_prepare() + assert audiofolder.info.features == Features({"audio": 
Audio(), "label": ClassLabel(names=["fr", "uk"])}) + dataset = list(audiofolder.as_dataset()["train"]) + label_feature = audiofolder.info.features["label"] + + assert dataset[0]["label"] == label_feature._str2int["fr"] + assert dataset[1]["label"] == label_feature._str2int["uk"] + + +@require_sndfile +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +@pytest.mark.parametrize("drop_labels", [None, True, False]) +def test_generate_examples_duplicated_label_key( + audio_files_with_labels_and_duplicated_label_key_in_metadata, drop_metadata, drop_labels, cache_dir, caplog +): + fr_audio_file, uk_audio_file, audio_metadata_file = audio_files_with_labels_and_duplicated_label_key_in_metadata + audiofolder = AudioFolder( + drop_metadata=drop_metadata, + drop_labels=drop_labels, + data_files=[fr_audio_file, uk_audio_file, audio_metadata_file], + cache_dir=cache_dir, + ) + if drop_labels is False: + # infer labels from directories even if metadata files are found + audiofolder.download_and_prepare() + warning_in_logs = any("ignoring metadata columns" in record.msg.lower() for record in caplog.records) + assert warning_in_logs if drop_metadata is not True else not warning_in_logs + dataset = audiofolder.as_dataset()["train"] + assert audiofolder.info.features["label"] == ClassLabel(names=["fr", "uk"]) + assert all(example["label"] in audiofolder.info.features["label"]._str2int.values() for example in dataset) + else: + audiofolder.download_and_prepare() + dataset = audiofolder.as_dataset()["train"] + if drop_metadata is not True: + # labels are from metadata + assert audiofolder.info.features["label"] == Value("string") + assert all(example["label"] in ["Fr", "Uk"] for example in dataset) + else: + # drop both labels and metadata + assert audiofolder.info.features == Features({"audio": Audio()}) + assert all(example.keys() == {"audio"} for example in dataset) + + +@require_sndfile +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +@pytest.mark.parametrize("drop_labels", [None, True, False]) +def test_generate_examples_drop_labels(audio_file, drop_metadata, drop_labels): + audiofolder = AudioFolder(drop_metadata=drop_metadata, drop_labels=drop_labels, data_files={"train": [audio_file]}) + gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + # removing the labels explicitly requires drop_labels=True + assert gen_kwargs["add_labels"] is not bool(drop_labels) + assert gen_kwargs["add_metadata"] is False # metadata files is not present in this case + generator = audiofolder._generate_examples(**gen_kwargs) + if not drop_labels: + assert all( + example.keys() == {"audio", "label"} and all(val is not None for val in example.values()) + for _, example in generator + ) + else: + assert all( + example.keys() == {"audio"} and all(val is not None for val in example.values()) + for _, example in generator + ) + + +@require_sndfile +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +@pytest.mark.parametrize("drop_labels", [None, True, False]) +def test_generate_examples_drop_metadata(audio_file_with_metadata, drop_metadata, drop_labels): + audio_file, audio_metadata_file = audio_file_with_metadata + audiofolder = AudioFolder( + drop_metadata=drop_metadata, drop_labels=drop_labels, data_files={"train": [audio_file, audio_metadata_file]} + ) + gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + # since the dataset has metadata, removing the metadata explicitly requires drop_metadata=True + assert 
gen_kwargs["add_metadata"] is not bool(drop_metadata) + # since the dataset has metadata, adding the labels explicitly requires drop_labels=False + assert gen_kwargs["add_labels"] is (drop_labels is False) + generator = audiofolder._generate_examples(**gen_kwargs) + expected_columns = {"audio"} + if gen_kwargs["add_metadata"]: + expected_columns.add("text") + if gen_kwargs["add_labels"]: + expected_columns.add("label") + result = [example for _, example in generator] + assert len(result) == 1 + example = result[0] + assert example.keys() == expected_columns + for column in expected_columns: + assert example[column] is not None + + +@require_sndfile +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +def test_generate_examples_with_metadata_in_wrong_location(audio_file, audio_file_with_metadata, drop_metadata): + _, audio_metadata_file = audio_file_with_metadata + audiofolder = AudioFolder(drop_metadata=drop_metadata, data_files={"train": [audio_file, audio_metadata_file]}) + gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + generator = audiofolder._generate_examples(**gen_kwargs) + if not drop_metadata: + with pytest.raises(ValueError): + list(generator) + else: + assert all( + example.keys() == {"audio"} and all(val is not None for val in example.values()) + for _, example in generator + ) + + +@require_sndfile +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +def test_generate_examples_with_metadata_that_misses_one_audio( + audio_files_with_metadata_that_misses_one_audio, drop_metadata +): + audio_file, audio_file2, audio_metadata_file = audio_files_with_metadata_that_misses_one_audio + if not drop_metadata: + features = Features({"audio": Audio(), "text": Value("string")}) + else: + features = Features({"audio": Audio()}) + audiofolder = AudioFolder( + drop_metadata=drop_metadata, + features=features, + data_files={"train": [audio_file, audio_file2, audio_metadata_file]}, + ) + gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + generator = audiofolder._generate_examples(**gen_kwargs) + if not drop_metadata: + with pytest.raises(ValueError): + _ = list(generator) + else: + assert all( + example.keys() == {"audio"} and all(val is not None for val in example.values()) + for _, example in generator + ) + + +@require_sndfile +@pytest.mark.parametrize("streaming", [False, True]) +@pytest.mark.parametrize("n_splits", [1, 2]) +def test_data_files_with_metadata_and_splits( + streaming, cache_dir, n_splits, data_files_with_one_split_and_metadata, data_files_with_two_splits_and_metadata +): + data_files = data_files_with_one_split_and_metadata if n_splits == 1 else data_files_with_two_splits_and_metadata + audiofolder = AudioFolder(data_files=data_files, cache_dir=cache_dir) + audiofolder.download_and_prepare() + datasets = audiofolder.as_streaming_dataset() if streaming else audiofolder.as_dataset() + for split, data_files in data_files.items(): + expected_num_of_audios = len(data_files) - 1 # don't count the metadata file + assert split in datasets + dataset = list(datasets[split]) + assert len(dataset) == expected_num_of_audios + # make sure each sample has its own audio and metadata + assert len(set(example["audio"]["path"] for example in dataset)) == expected_num_of_audios + assert len(set(example["text"] for example in dataset)) == expected_num_of_audios + assert all(example["text"] is not None for example in dataset) + + +@require_sndfile +@pytest.mark.parametrize("streaming", [False, True]) 
+def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):
+    audiofolder = AudioFolder(data_files=data_files_with_zip_archives, cache_dir=cache_dir)
+    audiofolder.download_and_prepare()
+    datasets = audiofolder.as_streaming_dataset() if streaming else audiofolder.as_dataset()
+    for split, data_files in data_files_with_zip_archives.items():
+        num_of_archives = len(data_files)  # the metadata file is inside the archive
+        expected_num_of_audios = 2 * num_of_archives
+        assert split in datasets
+        dataset = list(datasets[split])
+        assert len(dataset) == expected_num_of_audios
+        # make sure each sample has its own audio (all arrays are different) and metadata
+        assert (
+            sum(np.array_equal(dataset[0]["audio"]["array"], example["audio"]["array"]) for example in dataset[1:])
+            == 0
+        )
+        assert len(set(example["text"] for example in dataset)) == expected_num_of_audios
+        assert all(example["text"] is not None for example in dataset)
+
+
+@require_sndfile
+def test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, audio_file):
+    data_dir = tmp_path / "data_dir_with_bad_metadata"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    shutil.copyfile(audio_file, data_dir / "audio_file.wav")
+    audio_metadata_filename = data_dir / "bad_metadata.jsonl"  # bad file
+    audio_metadata = textwrap.dedent(
+        """\
+        {"file_name": "audio_file.wav", "text": "Audio transcription"}
+        """
+    )
+    with open(audio_metadata_filename, "w", encoding="utf-8") as f:
+        f.write(audio_metadata)
+
+    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
+        get_data_patterns_locally(str(data_dir)), str(data_dir)
+    )
+    audiofolder = AudioFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)
+    audiofolder.download_and_prepare()
+    dataset = audiofolder.as_dataset(split="train")
+    # check that no metadata is loaded, since the metadata file doesn't have the expected name
+    assert "text" not in dataset.column_names
+
+
+@require_sndfile
+def test_data_files_with_wrong_audio_file_name_column_in_metadata_file(cache_dir, tmp_path, audio_file):
+    data_dir = tmp_path / "data_dir_with_bad_metadata"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    shutil.copyfile(audio_file, data_dir / "audio_file.wav")
+    audio_metadata_filename = data_dir / "metadata.jsonl"
+    audio_metadata = textwrap.dedent(  # with bad column "bad_file_name_column" instead of "file_name"
+        """\
+        {"bad_file_name_column": "audio_file.wav", "text": "Audio transcription"}
+        """
+    )
+    with open(audio_metadata_filename, "w", encoding="utf-8") as f:
+        f.write(audio_metadata)
+
+    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
+        get_data_patterns_locally(str(data_dir)), str(data_dir)
+    )
+    audiofolder = AudioFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)
+    with pytest.raises(ValueError) as exc_info:
+        audiofolder.download_and_prepare()
+    assert "`file_name` must be present" in str(exc_info.value)
diff --git a/tests/packaged_modules/test_folder_based_builder.py b/tests/packaged_modules/test_folder_based_builder.py
new file mode 100644
index 00000000000..51b5d46d8f9
--- /dev/null
+++ b/tests/packaged_modules/test_folder_based_builder.py
@@ -0,0 +1,435 @@
+import importlib
+import shutil
+import textwrap
+
+import pytest
+
+from datasets import ClassLabel, DownloadManager, Features, Value
+from datasets.data_files import DataFilesDict, get_data_patterns_locally
+from datasets.download.streaming_download_manager import StreamingDownloadManager
+from 
datasets.packaged_modules.folder_based_builder.folder_based_builder import ( + FolderBasedBuilder, + FolderBasedBuilderConfig, +) + + +class DummyFolderBasedBuilder(FolderBasedBuilder): + BASE_FEATURE = None + BASE_COLUMN_NAME = "base" + BUILDER_CONFIG_CLASS = FolderBasedBuilderConfig + EXTENSIONS = [".txt"] + + +@pytest.fixture +def cache_dir(tmp_path): + return str(tmp_path / "autofolder_cache_dir") + + +@pytest.fixture +def auto_text_file(text_file): + return str(text_file) + + +@pytest.fixture +def data_files_with_labels_no_metadata(tmp_path, auto_text_file): + data_dir = tmp_path / "data_files_with_labels_no_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + subdir_class_0 = data_dir / "class0" + subdir_class_0.mkdir(parents=True, exist_ok=True) + # data dirs can be nested but FolderBasedBuilder should care only about the last part of the path: + subdir_class_1 = data_dir / "subdir" / "class1" + subdir_class_1.mkdir(parents=True, exist_ok=True) + + filename = subdir_class_0 / "file0.txt" + shutil.copyfile(auto_text_file, filename) + filename2 = subdir_class_1 / "file1.txt" + shutil.copyfile(auto_text_file, filename2) + + data_files_with_labels_no_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(str(data_dir)), str(data_dir) + ) + + return data_files_with_labels_no_metadata + + +@pytest.fixture +def files_with_labels_and_duplicated_label_key_in_metadata(tmp_path, auto_text_file): + data_dir = tmp_path / "files_with_labels_and_label_key_in_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + subdir_class_0 = data_dir / "class0" + subdir_class_0.mkdir(parents=True, exist_ok=True) + subdir_class_1 = data_dir / "class1" + subdir_class_1.mkdir(parents=True, exist_ok=True) + + filename = subdir_class_0 / "file_class0.txt" + shutil.copyfile(auto_text_file, filename) + filename2 = subdir_class_1 / "file_class1.txt" + shutil.copyfile(auto_text_file, filename2) + + metadata_filename = tmp_path / data_dir / "metadata.jsonl" + metadata = textwrap.dedent( + """\ + {"file_name": "class0/file_class0.txt", "additional_feature": "First dummy file", "label": "CLASS_0"} + {"file_name": "class1/file_class1.txt", "additional_feature": "Second dummy file", "label": "CLASS_1"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + + return str(filename), str(filename2), str(metadata_filename) + + +@pytest.fixture +def file_with_metadata(tmp_path, text_file): + filename = tmp_path / "file.txt" + shutil.copyfile(text_file, filename) + metadata_filename = tmp_path / "metadata.jsonl" + metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + return str(filename), str(metadata_filename) + + +@pytest.fixture() +def files_with_metadata_that_misses_one_sample(tmp_path, auto_text_file): + filename = tmp_path / "file.txt" + shutil.copyfile(auto_text_file, filename) + filename2 = tmp_path / "file2.txt" + shutil.copyfile(auto_text_file, filename2) + metadata_filename = tmp_path / "metadata.jsonl" + metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + return str(filename), str(filename2), str(metadata_filename) + + +@pytest.fixture +def data_files_with_one_split_and_metadata(tmp_path, auto_text_file): + data_dir = tmp_path / "autofolder_data_dir_with_metadata_one_split" + 
data_dir.mkdir(parents=True, exist_ok=True) + subdir = data_dir / "subdir" + subdir.mkdir(parents=True, exist_ok=True) + + filename = data_dir / "file.txt" + shutil.copyfile(auto_text_file, filename) + filename2 = data_dir / "file2.txt" + shutil.copyfile(auto_text_file, filename2) + filename3 = subdir / "file3.txt" # in subdir + shutil.copyfile(auto_text_file, filename3) + + metadata_filename = data_dir / "metadata.jsonl" + metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Dummy file"} + {"file_name": "file2.txt", "additional_feature": "Second dummy file"} + {"file_name": "subdir/file3.txt", "additional_feature": "Third dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + data_files_with_one_split_and_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(data_dir), data_dir + ) + assert len(data_files_with_one_split_and_metadata) == 1 + assert len(data_files_with_one_split_and_metadata["train"]) == 4 + return data_files_with_one_split_and_metadata + + +@pytest.fixture +def data_files_with_two_splits_and_metadata(tmp_path, auto_text_file): + data_dir = tmp_path / "autofolder_data_dir_with_metadata_two_splits" + data_dir.mkdir(parents=True, exist_ok=True) + train_dir = data_dir / "train" + train_dir.mkdir(parents=True, exist_ok=True) + test_dir = data_dir / "test" + test_dir.mkdir(parents=True, exist_ok=True) + + filename = train_dir / "file.txt" # train + shutil.copyfile(auto_text_file, filename) + filename2 = train_dir / "file2.txt" # train + shutil.copyfile(auto_text_file, filename2) + filename3 = test_dir / "file3.txt" # test + shutil.copyfile(auto_text_file, filename3) + + train_metadata_filename = train_dir / "metadata.jsonl" + train_metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Train dummy file"} + {"file_name": "file2.txt", "additional_feature": "Second train dummy file"} + """ + ) + with open(train_metadata_filename, "w", encoding="utf-8") as f: + f.write(train_metadata) + test_metadata_filename = test_dir / "metadata.jsonl" + test_metadata = textwrap.dedent( + """\ + {"file_name": "file3.txt", "additional_feature": "Test dummy file"} + """ + ) + with open(test_metadata_filename, "w", encoding="utf-8") as f: + f.write(test_metadata) + data_files_with_two_splits_and_metadata = DataFilesDict.from_local_or_remote( + get_data_patterns_locally(data_dir), data_dir + ) + assert len(data_files_with_two_splits_and_metadata) == 2 + assert len(data_files_with_two_splits_and_metadata["train"]) == 3 + assert len(data_files_with_two_splits_and_metadata["test"]) == 2 + return data_files_with_two_splits_and_metadata + + +@pytest.fixture +def data_files_with_zip_archives(tmp_path, auto_text_file): + data_dir = tmp_path / "autofolder_data_dir_with_zip_archives" + data_dir.mkdir(parents=True, exist_ok=True) + archive_dir = data_dir / "archive" + archive_dir.mkdir(parents=True, exist_ok=True) + subdir = archive_dir / "subdir" + subdir.mkdir(parents=True, exist_ok=True) + + filename = archive_dir / "file.txt" + shutil.copyfile(auto_text_file, filename) + filename2 = subdir / "file2.txt" # in subdir + shutil.copyfile(auto_text_file, filename2) + + metadata_filename = archive_dir / "metadata.jsonl" + metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Dummy file"} + {"file_name": "subdir/file2.txt", "additional_feature": "Second dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + 
f.write(metadata)
+
+    shutil.make_archive(archive_dir, "zip", archive_dir)
+    shutil.rmtree(str(archive_dir))
+
+    data_files_with_zip_archives = DataFilesDict.from_local_or_remote(get_data_patterns_locally(data_dir), data_dir)
+
+    assert len(data_files_with_zip_archives) == 1
+    assert len(data_files_with_zip_archives["train"]) == 1
+    return data_files_with_zip_archives
+
+
+def test_inferring_labels_from_data_dirs(data_files_with_labels_no_metadata, cache_dir):
+    autofolder = DummyFolderBasedBuilder(
+        data_files=data_files_with_labels_no_metadata, cache_dir=cache_dir, drop_labels=False
+    )
+    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs
+    assert autofolder.info.features == Features({"base": None, "label": ClassLabel(names=["class0", "class1"])})
+    generator = autofolder._generate_examples(**gen_kwargs)
+    assert all(example["label"] in {"class0", "class1"} for _, example in generator)
+
+
+def test_default_autofolder_not_usable(data_files_with_labels_no_metadata, cache_dir):
+    # the builder would try to access non-existent attributes of the default `BuilderConfig` class
+    # as a custom one is not provided
+    with pytest.raises(AttributeError):
+        _ = FolderBasedBuilder(
+            data_files=data_files_with_labels_no_metadata,
+            cache_dir=cache_dir,
+        )
+
+
+# test that FolderBasedBuilder is extended for streaming when its child class is instantiated:
+# see line 115 in src/datasets/streaming.py
+def test_streaming_patched():
+    _ = DummyFolderBasedBuilder()
+    module = importlib.import_module(FolderBasedBuilder.__module__)
+    assert hasattr(module, "_patched_for_streaming")
+    assert module._patched_for_streaming
+
+
+@pytest.mark.parametrize("drop_metadata", [None, True, False])
+@pytest.mark.parametrize("drop_labels", [None, True, False])
+def test_prepare_generate_examples_duplicated_label_key(
+    files_with_labels_and_duplicated_label_key_in_metadata, drop_metadata, drop_labels, cache_dir, caplog
+):
+    class0_file, class1_file, metadata_file = files_with_labels_and_duplicated_label_key_in_metadata
+    autofolder = DummyFolderBasedBuilder(
+        data_files=[class0_file, class1_file, metadata_file],
+        cache_dir=cache_dir,
+        drop_metadata=drop_metadata,
+        drop_labels=drop_labels,
+    )
+    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs
+    generator = autofolder._generate_examples(**gen_kwargs)
+    if drop_labels is False:
+        # infer labels from directories even if metadata files are found
+        warning_in_logs = any("ignoring metadata columns" in record.msg.lower() for record in caplog.records)
+        assert warning_in_logs if drop_metadata is not True else not warning_in_logs
+        assert autofolder.info.features["label"] == ClassLabel(names=["class0", "class1"])
+        assert all(example["label"] in ["class0", "class1"] for _, example in generator)
+
+    else:
+        if drop_metadata is not True:
+            # labels are from metadata
+            assert autofolder.info.features["label"] == Value("string")
+            assert all(example["label"] in ["CLASS_0", "CLASS_1"] for _, example in generator)
+        else:
+            # drop both labels and metadata
+            assert autofolder.info.features == Features({"base": None})
+            assert all(example.keys() == {"base"} for _, example in generator)
+
+
+@pytest.mark.parametrize("drop_metadata", [None, True, False])
+@pytest.mark.parametrize("drop_labels", [None, True, False])
+def test_prepare_generate_examples_drop_labels(auto_text_file, drop_metadata, drop_labels):
+    autofolder = DummyFolderBasedBuilder(
+        data_files={"train": [auto_text_file]},
+        drop_metadata=drop_metadata,
+
drop_labels=drop_labels, + ) + gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + # removing the labels explicitly requires drop_labels=True + assert gen_kwargs["add_labels"] is not bool(drop_labels) + assert gen_kwargs["add_metadata"] is False + generator = autofolder._generate_examples(**gen_kwargs) + if not drop_labels: + assert all( + example.keys() == {"base", "label"} and all(val is not None for val in example.values()) + for _, example in generator + ) + else: + assert all( + example.keys() == {"base"} and all(val is not None for val in example.values()) for _, example in generator + ) + + +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +@pytest.mark.parametrize("drop_labels", [None, True, False]) +def test_prepare_generate_examples_drop_metadata(file_with_metadata, drop_metadata, drop_labels): + file, metadata_file = file_with_metadata + autofolder = DummyFolderBasedBuilder( + data_files=[file, metadata_file], + drop_metadata=drop_metadata, + drop_labels=drop_labels, + ) + gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + # since the dataset has metadata, removing the metadata explicitly requires drop_metadata=True + assert gen_kwargs["add_metadata"] is not bool(drop_metadata) + # since the dataset has metadata, adding the labels explicitly requires drop_labels=False + assert gen_kwargs["add_labels"] is (drop_labels is False) + generator = autofolder._generate_examples(**gen_kwargs) + expected_columns = {"base"} + if gen_kwargs["add_metadata"]: + expected_columns.add("additional_feature") + if gen_kwargs["add_labels"]: + expected_columns.add("label") + result = [example for _, example in generator] + assert len(result) == 1 + example = result[0] + assert example.keys() == expected_columns + for column in expected_columns: + assert example[column] is not None + + +@pytest.mark.parametrize("drop_metadata", [None, True, False]) +def test_prepare_generate_examples_with_metadata_that_misses_one_sample( + files_with_metadata_that_misses_one_sample, drop_metadata +): + file, file2, metadata_file = files_with_metadata_that_misses_one_sample + if not drop_metadata: + features = Features({"base": None, "additional_feature": Value("string")}) + else: + features = Features({"base": None}) + autofolder = DummyFolderBasedBuilder( + data_files=[file, file2, metadata_file], + drop_metadata=drop_metadata, + features=features, + ) + gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + generator = autofolder._generate_examples(**gen_kwargs) + if not drop_metadata: + with pytest.raises(ValueError): + list(generator) + else: + assert all( + example.keys() == {"base"} and all(val is not None for val in example.values()) for _, example in generator + ) + + +@pytest.mark.parametrize("streaming", [False, True]) +@pytest.mark.parametrize("n_splits", [1, 2]) +def test_data_files_with_metadata_and_splits( + streaming, cache_dir, n_splits, data_files_with_one_split_and_metadata, data_files_with_two_splits_and_metadata +): + data_files = data_files_with_one_split_and_metadata if n_splits == 1 else data_files_with_two_splits_and_metadata + autofolder = DummyFolderBasedBuilder( + data_files=data_files, + cache_dir=cache_dir, + ) + download_manager = StreamingDownloadManager() if streaming else DownloadManager() + generated_splits = autofolder._split_generators(download_manager) + for (split, files), generated_split in zip(data_files.items(), generated_splits): + assert split == 
generated_split.name + expected_num_of_examples = len(files) - 1 + generated_examples = list(autofolder._generate_examples(**generated_split.gen_kwargs)) + assert len(generated_examples) == expected_num_of_examples + assert len(set(example["base"] for _, example in generated_examples)) == expected_num_of_examples + assert len(set(example["additional_feature"] for _, example in generated_examples)) == expected_num_of_examples + assert all(example["additional_feature"] is not None for _, example in generated_examples) + + +@pytest.mark.parametrize("streaming", [False, True]) +def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives): + autofolder = DummyFolderBasedBuilder(data_files=data_files_with_zip_archives, cache_dir=cache_dir) + download_manager = StreamingDownloadManager() if streaming else DownloadManager() + generated_splits = autofolder._split_generators(download_manager) + for (split, files), generated_split in zip(data_files_with_zip_archives.items(), generated_splits): + assert split == generated_split.name + num_of_archives = len(files) + expected_num_of_examples = 2 * num_of_archives + generated_examples = list(autofolder._generate_examples(**generated_split.gen_kwargs)) + assert len(generated_examples) == expected_num_of_examples + assert len(set(example["base"] for _, example in generated_examples)) == expected_num_of_examples + assert len(set(example["additional_feature"] for _, example in generated_examples)) == expected_num_of_examples + assert all(example["additional_feature"] is not None for _, example in generated_examples) + + +def test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, auto_text_file): + data_dir = tmp_path / "data_dir_with_bad_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + shutil.copyfile(auto_text_file, data_dir / "file.txt") + metadata_filename = data_dir / "bad_metadata.jsonl" # bad file + metadata = textwrap.dedent( + """\ + {"file_name": "file.txt", "additional_feature": "Dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + + data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(get_data_patterns_locally(data_dir), data_dir) + autofolder = DummyFolderBasedBuilder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir) + gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + generator = autofolder._generate_examples(**gen_kwargs) + assert all("additional_feature" not in example for _, example in generator) + + +def test_data_files_with_wrong_file_name_column_in_metadata_file(cache_dir, tmp_path, auto_text_file): + data_dir = tmp_path / "data_dir_with_bad_metadata" + data_dir.mkdir(parents=True, exist_ok=True) + shutil.copyfile(auto_text_file, data_dir / "file.txt") + metadata_filename = data_dir / "metadata.jsonl" + metadata = textwrap.dedent( # with bad column "bad_file_name" instead of "file_name" + """\ + {"bad_file_name": "file.txt", "additional_feature": "Dummy file"} + """ + ) + with open(metadata_filename, "w", encoding="utf-8") as f: + f.write(metadata) + + data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(get_data_patterns_locally(data_dir), data_dir) + autofolder = DummyFolderBasedBuilder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir) + with pytest.raises(ValueError) as exc_info: + _ = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs + assert "`file_name` must be present" in str(exc_info.value) diff --git 
a/tests/packaged_modules/test_imagefolder.py b/tests/packaged_modules/test_imagefolder.py index 13fc1b9feac..1c1c4b6752a 100644 --- a/tests/packaged_modules/test_imagefolder.py +++ b/tests/packaged_modules/test_imagefolder.py @@ -8,7 +8,6 @@ from datasets.data_files import DataFilesDict, get_data_patterns_locally from datasets.download.streaming_download_manager import StreamingDownloadManager from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder -from datasets.streaming import extend_module_for_streaming from ..utils import require_pil @@ -376,8 +375,6 @@ def test_data_files_with_metadata_and_splits( @require_pil @pytest.mark.parametrize("streaming", [False, True]) def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives): - if streaming: - extend_module_for_streaming(ImageFolder.__module__) imagefolder = ImageFolder(data_files=data_files_with_zip_archives, cache_dir=cache_dir) imagefolder.download_and_prepare() datasets = imagefolder.as_streaming_dataset() if streaming else imagefolder.as_dataset() diff --git a/tests/test_dataset_common.py b/tests/test_dataset_common.py index 3712e94e2af..8875c2f76d2 100644 --- a/tests/test_dataset_common.py +++ b/tests/test_dataset_common.py @@ -82,6 +82,7 @@ def get_packaged_dataset_dummy_data_files(dataset_name, path_to_dummy_data): "csv": ".csv", "parquet": ".parquet", "imagefolder": "/", + "audiofolder": "/", } return { "train": os.path.join(path_to_dummy_data, "train" + extensions[dataset_name]),