From 01af9065dc342b09bdc859e0d18584cfef4f793d Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 30 Aug 2022 17:33:01 +0200 Subject: [PATCH] Minor adjustments --- src/datasets/data_files.py | 4 +-- .../folder_based_builder.py | 25 ++++++++++++------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 47196e765d8..942184d683d 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -80,10 +80,10 @@ class Url(str): DEFAULT_PATTERNS_ALL, ] METADATA_PATTERNS = [ - "metadata.jsonl", - "**/metadata.jsonl", "metadata.csv", "**/metadata.csv", + "metadata.jsonl", + "**/metadata.jsonl", ] # metadata file for ImageFolder and AudioFolder WILDCARD_CHARACTERS = "*[]" FILES_TO_IGNORE = ["README.md", "config.json", "dataset_infos.json", "dummy_data.zip", "dataset_dict.json"] diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 4705037b4fb..f7a09d7c05d 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -70,7 +70,7 @@ class FolderBasedBuilder(datasets.GeneratorBasedBuilder): EXTENSIONS: List[str] SKIP_CHECKSUM_COMPUTATION_BY_DEFAULT: bool = True - METADATA_FILENAMES: List[str] = ["metadata.jsonl", "metadata.csv"] + METADATA_FILENAMES: List[str] = ["metadata.csv", "metadata.jsonl"] def _info(self): return datasets.DatasetInfo(features=self.config.features) @@ -251,12 +251,12 @@ def _split_files_and_archives(self, data_files): def _read_metadata(self, metadata_file): metadata_file_ext = os.path.splitext(metadata_file)[1][1:] - if metadata_file_ext == "jsonl": - with open(metadata_file, "rb") as f: - return paj.read_json(f) - else: + if metadata_file_ext == "csv": # Use `pd.read_csv` (although slower) instead of `pyarrow.csv.read_csv` for reading CSV files for consistency with the CSV packaged module return pa.Table.from_pandas(pd.read_csv(metadata_file)) + else: + with open(metadata_file, "rb") as f: + return paj.read_json(f) def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels): split_metadata_files = metadata_files.get(split_name, []) @@ -268,6 +268,13 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad metadata_dict = None downloaded_metadata_file = None + if split_metadata_files: + metadata_ext = set( + os.path.splitext(downloaded_metadata_file)[1][1:] + for _, downloaded_metadata_file in split_metadata_files + ) + metadata_ext = metadata_ext.pop() + file_idx = 0 for original_file, downloaded_file_or_dir in files: if original_file is not None: @@ -311,7 +318,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad } else: raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." + f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." ) if metadata_dir is not None and downloaded_metadata_file is not None: file_relpath = os.path.relpath(original_file, metadata_dir) @@ -323,7 +330,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad sample_metadata = metadata_dict[file_relpath] else: raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." + f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." ) else: sample_metadata = {} @@ -380,7 +387,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad } else: raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." + f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." ) if metadata_dir is not None and downloaded_metadata_file is not None: downloaded_dir_file_relpath = os.path.relpath(downloaded_dir_file, metadata_dir) @@ -392,7 +399,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad sample_metadata = metadata_dict[downloaded_dir_file_relpath] else: raise ValueError( - f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." + f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." ) else: sample_metadata = {}