Minor adjustments

huggingface · github-actions · Aug 30, 2022 · 01af906
1 parent b642dd9
commit 01af906
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 11 deletions.
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
@@ -80,10 +80,10 @@ class Url(str):
     DEFAULT_PATTERNS_ALL,
 ]
 METADATA_PATTERNS = [
-    "metadata.jsonl",
-    "**/metadata.jsonl",
     "metadata.csv",
     "**/metadata.csv",
+    "metadata.jsonl",
+    "**/metadata.jsonl",
 ]  # metadata file for ImageFolder and AudioFolder
 WILDCARD_CHARACTERS = "*[]"
 FILES_TO_IGNORE = ["README.md", "config.json", "dataset_infos.json", "dummy_data.zip", "dataset_dict.json"]

diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -70,7 +70,7 @@ class FolderBasedBuilder(datasets.GeneratorBasedBuilder):
     EXTENSIONS: List[str]
 
     SKIP_CHECKSUM_COMPUTATION_BY_DEFAULT: bool = True
-    METADATA_FILENAMES: List[str] = ["metadata.jsonl", "metadata.csv"]
+    METADATA_FILENAMES: List[str] = ["metadata.csv", "metadata.jsonl"]
 
     def _info(self):
         return datasets.DatasetInfo(features=self.config.features)
@@ -251,12 +251,12 @@ def _split_files_and_archives(self, data_files):
 
     def _read_metadata(self, metadata_file):
         metadata_file_ext = os.path.splitext(metadata_file)[1][1:]
-        if metadata_file_ext == "jsonl":
-            with open(metadata_file, "rb") as f:
-                return paj.read_json(f)
-        else:
+        if metadata_file_ext == "csv":
             # Use `pd.read_csv` (although slower) instead of `pyarrow.csv.read_csv` for reading CSV files for consistency with the CSV packaged module
             return pa.Table.from_pandas(pd.read_csv(metadata_file))
+        else:
+            with open(metadata_file, "rb") as f:
+                return paj.read_json(f)
 
     def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
         split_metadata_files = metadata_files.get(split_name, [])
@@ -268,6 +268,13 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
         metadata_dict = None
         downloaded_metadata_file = None
 
+        if split_metadata_files:
+            metadata_ext = set(
+                os.path.splitext(downloaded_metadata_file)[1][1:]
+                for _, downloaded_metadata_file in split_metadata_files
+            )
+            metadata_ext = metadata_ext.pop()
+
         file_idx = 0
         for original_file, downloaded_file_or_dir in files:
             if original_file is not None:
@@ -311,7 +318,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                                 }
                             else:
                                 raise ValueError(
-                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
+                                    f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                                 )
                         if metadata_dir is not None and downloaded_metadata_file is not None:
                             file_relpath = os.path.relpath(original_file, metadata_dir)
@@ -323,7 +330,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                             sample_metadata = metadata_dict[file_relpath]
                         else:
                             raise ValueError(
-                                f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
+                                f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                             )
                     else:
                         sample_metadata = {}
@@ -380,7 +387,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                                     }
                                 else:
                                     raise ValueError(
-                                        f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
+                                        f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                                     )
                             if metadata_dir is not None and downloaded_metadata_file is not None:
                                 downloaded_dir_file_relpath = os.path.relpath(downloaded_dir_file, metadata_dir)
@@ -392,7 +399,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
                                 sample_metadata = metadata_dict[downloaded_dir_file_relpath]
                             else:
                                 raise ValueError(
-                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
+                                    f"One or several metadata.{metadata_ext} were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                                 )
                         else:
                             sample_metadata = {}