Commit 80f4fbc

Merge remote-tracking branch 'upstream/main' into load-github-datasets-from-hub

albertvillanova committed Sep 16, 2022
2 parents 14e322f + 51aef08
Showing 1,301 changed files with 58,172 additions and 28,503 deletions.
102 changes: 0 additions & 102 deletions .circleci/config.yml

This file was deleted.

81 changes: 0 additions & 81 deletions .circleci/deploy.sh

This file was deleted.

2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/add-dataset.md
@@ -14,4 +14,4 @@ assignees: ''
 - **Data:** *link to the Github repository or current dataset location*
 - **Motivation:** *what are some good reasons to have this dataset*

-Instructions to add a new dataset can be found [here](https://github.com/huggingface/datasets/blob/master/ADD_NEW_DATASET.md).
+Instructions to add a new dataset can be found [here](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
3 changes: 3 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -1,4 +1,7 @@
 contact_links:
+  - name: Datasets on the Hugging Face Hub
+    url: https://huggingface.co/datasets
+    about: Open a Pull request / Discussion related to a specific dataset on the Hugging Face Hub (PRs for datasets with no namespace still have to be on GitHub though)
   - name: Forum
     url: https://discuss.huggingface.co/c/datasets/10
     about: Please ask and answer questions here, and engage with other community members
27 changes: 27 additions & 0 deletions .github/ISSUE_TEMPLATE/dataset-viewer.yml
@@ -0,0 +1,27 @@
+name: Dataset Viewer Issue
+description: Issue related to the Dataset Viewer on huggingface.co/datasets
+title: "Dataset Viewer issue for [dataset name]"
+labels: ["dataset-viewer"]
+assignees:
+  - severo
+body:
+  - type: input
+    id: url
+    attributes:
+      label: Link
+      description: Link to the dataset viewer page
+      placeholder: ex. https://huggingface.co/datasets/glue/viewer/cola/test
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: Short description of the issue
+      placeholder: Tell us what the issue is and which error you get
+  - type: dropdown
+    id: owner
+    attributes:
+      label: Owner
+      description: Is it you who added this dataset?
+      options:
+        - "Yes"
+        - "No"
15 changes: 0 additions & 15 deletions .github/ISSUE_TEMPLATE/dataset_viewer.md

This file was deleted.

4 changes: 2 additions & 2 deletions .github/conda/meta.yaml
@@ -15,7 +15,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=5.0.0
+    - pyarrow >=6.0.0
     - python-xxhash
     - dill
     - pandas
@@ -32,7 +32,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=5.0.0
+    - pyarrow >=6.0.0
     - python-xxhash
     - dill
     - pandas
2 changes: 1 addition & 1 deletion .github/hub/requirements.txt
@@ -1,4 +1,4 @@
-GitPython==3.1.11
+GitPython==3.1.27
 python-dotenv==0.19.2
 requests==2.25.1
 tqdm==4.62.3
47 changes: 22 additions & 25 deletions .github/hub/update_hub_repositories.py
@@ -29,7 +29,9 @@
 HUB_CANONICAL_WHOAMI = HUB_ENDPOINT + "/api/whoami-v2"
 HUB_CANONICAL_CREATE_URL = HUB_ENDPOINT + "/api/repos/create"
 HUB_CANONICAL_INFO_URL = HUB_ENDPOINT + "/api/datasets/{dataset_name}"
-HUB_CANONICAL_DATASET_GIT_URL = HUB_ENDPOINT.replace("https://", "https://user:{token}@") + "/datasets/{dataset_name}.git"
+HUB_CANONICAL_DATASET_GIT_URL = (
+    HUB_ENDPOINT.replace("https://", "https://user:{token}@") + "/datasets/{dataset_name}.git"
+)
 HUB_API_GH_TO_HF = HUB_ENDPOINT + "/api/gh-to-hf/{github_username}"
 DATASETS_LIB_CATALOG_DIR_NAME = "datasets"
 DATASETS_LIB_COMMIT_URL = "https://github.com/huggingface/datasets/commit/{hexsha}"
@@ -102,17 +104,6 @@ def check_authorizations(user_info: dict):
         )


-def apply_hacks_for_moon_landing(dataset_repo_path: Path):
-    if (dataset_repo_path / "README.md").is_file():
-        with (dataset_repo_path / "README.md").open() as f:
-            readme_content = f.read()
-        if readme_content.count("---\n") > 1:
-            _, tags, content = readme_content.split("---\n", 2)
-            tags = tags.replace("\nlicense:", "\nlicenses:").replace(".", "-").replace("$", "%")
-            with (dataset_repo_path / "README.md").open("w") as f:
-                f.write("---\n".join(["", tags, content]))
-
-
 class update_main:
     def __init__(
         self,
@@ -136,7 +127,8 @@ def __call__(self, dataset_name: str) -> bool:
             logger.warning(f"[{dataset_name}] " + repr(e))
         if not canonical_dataset_path(dataset_name).is_dir():
             repo = Repo.clone_from(
-                canonical_dataset_git_url(dataset_name, self.token), to_path=canonical_dataset_path(dataset_name)
+                canonical_dataset_git_url(dataset_name, token=self.token),
+                to_path=canonical_dataset_path(dataset_name),
             )
         else:
             repo = Repo(canonical_dataset_path(dataset_name))
@@ -145,7 +137,11 @@ def __call__(self, dataset_name: str) -> bool:
         logs.append(repo.git.reset("--hard"))
         logs.append(repo.git.clean("-f", "-d"))
         logs.append(repo.git.checkout(CANONICAL_DATASET_REPO_MAIN_BRANCH))
-        logs.append(repo.remote().pull())
+        try:
+            logs.append(repo.remote().pull())
+        except Exception as e:
+            logs.append("pull failed !")
+            logs.append(repr(e))
         # Copy the changes and commit
         distutils.dir_util.copy_tree(
             str(src_canonical_dataset_path(datasets_lib_path, dataset_name)), str(canonical_dataset_path(dataset_name))
@@ -155,7 +151,6 @@ def __call__(self, dataset_name: str) -> bool:
                 (canonical_dataset_path(dataset_name) / filepath_to_delete).unlink()
             except Exception as e:
                 logger.warning(f"[{dataset_name}] Couldn't delete file at {filepath_to_delete}: {repr(e)}")
-        apply_hacks_for_moon_landing(canonical_dataset_path(dataset_name))
         logs.append(repo.git.add("."))
         if "Changes to be committed:" in repo.git.status():
             logs.append(repo.git.commit(*self.commit_args))
@@ -168,6 +163,7 @@ def __call__(self, dataset_name: str) -> bool:
                 logs.append(repo.git.tag(self.tag_name, f"-m Add tag from datasets {self.tag_name}"))
                 logs.append(repo.git.push("--tags"))
             except Exception as e:
+                logs.append("push failed !")
                 logs.append(repr(e))
         if "Your branch is up to date with" not in repo.git.status():
             logs.append(repo.git.status())
@@ -210,31 +206,29 @@ def __call__(self, dataset_name: str) -> bool:
         path
         for diff in datasets_lib_repo.index.diff(prev_commit)
         for path in [diff.a_path, diff.b_path]
-        if path.startswith(DATASETS_LIB_CATALOG_DIR_NAME)
-        and path.count("/") >= 2
+        if path.startswith(DATASETS_LIB_CATALOG_DIR_NAME) and path.count("/") >= 2
     ]

     changed_datasets_names_since_last_commit = {path.split("/")[1] for path in changed_files_since_last_commit}
     # ignore json, csv etc.
     changed_datasets_names_since_last_commit = {
-        dataset_name for dataset_name in changed_datasets_names_since_last_commit
+        dataset_name
+        for dataset_name in changed_datasets_names_since_last_commit
         if (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME / dataset_name / (dataset_name + ".py")).is_file()
     }

     deleted_files = {dataset_name: set() for dataset_name in changed_datasets_names_since_last_commit}
     for path in changed_files_since_last_commit:
         _, dataset_name, rel_path = path.split("/", 2)
-        if (
-            dataset_name in changed_datasets_names_since_last_commit
-            and not (datasets_lib_path / path).is_file()
-        ):
+        if dataset_name in changed_datasets_names_since_last_commit and not (datasets_lib_path / path).is_file():
             deleted_files[dataset_name].add(rel_path)

     dataset_names = sys.argv[1:]
     if dataset_names:
         if dataset_names[0] == "--all":
             dataset_names = sorted(
-                d.name for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
+                d.name
+                for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
                 if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
             )
         if dataset_names[0] == "--auto":
@@ -245,7 +239,8 @@ def __call__(self, dataset_name: str) -> bool:
             )
             dataset_names = sorted(d.name for d in (ROOT / HUB_DIR_NAME).glob("*") if d.is_dir())
             dataset_names = sorted(
-                d.name for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
+                d.name
+                for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
                 if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
             )
     else:
Expand All @@ -268,7 +263,9 @@ def __call__(self, dataset_name: str) -> bool:
),
dataset_names,
)
datasets_with_errors = [dataset_name for success, dataset_name in zip(successes, dataset_names) if not success]
datasets_with_errors = [
dataset_name for success, dataset_name in zip(successes, dataset_names) if not success
]
if datasets_with_errors:
raise UpdateFailed(
f"Those datasets couldn't be updated: {' '.join(datasets_with_errors)}\n"
Expand Down
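Taken together, the patched script's per-dataset flow is: clone or reuse a local checkout of the canonical repo, hard-reset and clean it, pull (now tolerating pull failures instead of aborting), overlay the files from the datasets catalog, then stage, commit, and push. The sketch below is a minimal illustration of that flow with GitPython, not the actual implementation; `HUB_GIT_URL`, `sync_dataset`, `src_path`, and `local_path` are hypothetical stand-ins for the script's `canonical_dataset_git_url`, `update_main.__call__`, and path helpers.

```python
# Minimal sketch of the sync loop, assuming GitPython is installed
# (pip install GitPython). All names here are hypothetical stand-ins,
# not the real script's API.
import shutil
from pathlib import Path

from git import Repo

HUB_GIT_URL = "https://user:{token}@huggingface.co/datasets/{name}.git"


def sync_dataset(name: str, token: str, src_path: Path, local_path: Path, branch: str = "main") -> bool:
    logs = []
    # Clone on the first run, reuse the existing checkout afterwards
    if not local_path.is_dir():
        repo = Repo.clone_from(HUB_GIT_URL.format(token=token, name=name), to_path=local_path)
    else:
        repo = Repo(local_path)
    # Start from a clean state before overlaying the new files
    logs.append(repo.git.reset("--hard"))
    logs.append(repo.git.clean("-f", "-d"))
    logs.append(repo.git.checkout(branch))
    try:
        logs.append(repo.remote().pull())
    except Exception as e:
        # Tolerate transient pull failures, as the patch above now does
        logs.append(f"pull failed: {e!r}")
    # Overlay the catalog copy of the dataset, then stage everything
    shutil.copytree(src_path, local_path, dirs_exist_ok=True)
    logs.append(repo.git.add("."))
    if "Changes to be committed:" in repo.git.status():
        logs.append(repo.git.commit("-m", f"Update {name}"))
        try:
            logs.append(repo.git.push())
        except Exception as e:
            logs.append(f"push failed: {e!r}")
            return False
    return True
```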

1 comment on commit 80f4fbc

@github-actions

PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.008575 / 0.011353 (-0.002777) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004120 / 0.011008 (-0.006888) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.030212 / 0.038508 (-0.008296) |
| read_batch_unformated after write_array2d | 0.036298 / 0.023109 (0.013189) |
| read_batch_unformated after write_flattened_sequence | 0.307666 / 0.275898 (0.031768) |
| read_batch_unformated after write_nested_sequence | 0.377267 / 0.323480 (0.053787) |
| read_col_formatted_as_numpy after write_array2d | 0.006258 / 0.007986 (-0.001728) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003696 / 0.004328 (-0.000633) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007446 / 0.004250 (0.003196) |
| read_col_unformated after write_array2d | 0.053678 / 0.037052 (0.016626) |
| read_col_unformated after write_flattened_sequence | 0.309355 / 0.258489 (0.050866) |
| read_col_unformated after write_nested_sequence | 0.350897 / 0.293841 (0.057056) |
| read_formatted_as_numpy after write_array2d | 0.032265 / 0.128546 (-0.096281) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009827 / 0.075646 (-0.065819) |
| read_formatted_as_numpy after write_nested_sequence | 0.259681 / 0.419271 (-0.159591) |
| read_unformated after write_array2d | 0.053305 / 0.043533 (0.009772) |
| read_unformated after write_flattened_sequence | 0.302543 / 0.255139 (0.047404) |
| read_unformated after write_nested_sequence | 0.316246 / 0.283200 (0.033047) |
| write_array2d | 0.111838 / 0.141683 (-0.029845) |
| write_flattened_sequence | 1.501336 / 1.452155 (0.049182) |
| write_nested_sequence | 1.545709 / 1.492716 (0.052993) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.288641 / 0.018006 (0.270635) |
| get_batch_of_1024_rows | 0.628912 / 0.000490 (0.628422) |
| get_first_row | 0.003202 / 0.000200 (0.003002) |
| get_last_row | 0.000159 / 0.000054 (0.000104) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.023733 / 0.037411 (-0.013678) |
| shard | 0.106245 / 0.014526 (0.091719) |
| shuffle | 0.115388 / 0.176557 (-0.061168) |
| sort | 0.163702 / 0.737135 (-0.573434) |
| train_test_split | 0.121241 / 0.296338 (-0.175098) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.392781 / 0.215209 (0.177572) |
| read 50000 | 3.915303 / 2.077655 (1.837648) |
| read_batch 50000 10 | 1.750749 / 1.504120 (0.246629) |
| read_batch 50000 100 | 1.564623 / 1.541195 (0.023429) |
| read_batch 50000 1000 | 1.684868 / 1.468490 (0.216378) |
| read_formatted numpy 5000 | 0.422391 / 4.584777 (-4.162386) |
| read_formatted pandas 5000 | 3.816375 / 3.745712 (0.070663) |
| read_formatted tensorflow 5000 | 2.091313 / 5.269862 (-3.178549) |
| read_formatted torch 5000 | 1.247689 / 4.565676 (-3.317988) |
| read_formatted_batch numpy 5000 10 | 0.051707 / 0.424275 (-0.372568) |
| read_formatted_batch numpy 5000 1000 | 0.011374 / 0.007607 (0.003767) |
| shuffled read 5000 | 0.502026 / 0.226044 (0.275982) |
| shuffled read 50000 | 5.012250 / 2.268929 (2.743322) |
| shuffled read_batch 50000 10 | 2.258016 / 55.444624 (-53.186608) |
| shuffled read_batch 50000 100 | 1.922069 / 6.876477 (-4.954408) |
| shuffled read_batch 50000 1000 | 2.154072 / 2.142072 (0.012000) |
| shuffled read_formatted numpy 5000 | 0.543573 / 4.805227 (-4.261654) |
| shuffled read_formatted_batch numpy 5000 10 | 0.120687 / 6.500664 (-6.379977) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062695 / 0.075469 (-0.012774) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.527544 / 1.841788 (-0.314244) |
| map fast-tokenizer batched | 14.666360 / 8.074308 (6.592052) |
| map identity | 25.325495 / 10.191392 (15.134103) |
| map identity batched | 0.920398 / 0.680424 (0.239974) |
| map no-op batched | 0.585767 / 0.534201 (0.051566) |
| map no-op batched numpy | 0.388743 / 0.579283 (-0.190540) |
| map no-op batched pandas | 0.443646 / 0.434364 (0.009282) |
| map no-op batched pytorch | 0.275864 / 0.540337 (-0.264474) |
| map no-op batched tensorflow | 0.289195 / 1.386936 (-1.097741) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.006786 / 0.011353 (-0.004567) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004187 / 0.011008 (-0.006821) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.028123 / 0.038508 (-0.010385) |
| read_batch_unformated after write_array2d | 0.037326 / 0.023109 (0.014216) |
| read_batch_unformated after write_flattened_sequence | 0.344320 / 0.275898 (0.068422) |
| read_batch_unformated after write_nested_sequence | 0.432991 / 0.323480 (0.109511) |
| read_col_formatted_as_numpy after write_array2d | 0.004460 / 0.007986 (-0.003526) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005872 / 0.004328 (0.001544) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.005110 / 0.004250 (0.000860) |
| read_col_unformated after write_array2d | 0.053204 / 0.037052 (0.016151) |
| read_col_unformated after write_flattened_sequence | 0.343643 / 0.258489 (0.085154) |
| read_col_unformated after write_nested_sequence | 0.390535 / 0.293841 (0.096694) |
| read_formatted_as_numpy after write_array2d | 0.030995 / 0.128546 (-0.097552) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009765 / 0.075646 (-0.065881) |
| read_formatted_as_numpy after write_nested_sequence | 0.258627 / 0.419271 (-0.160645) |
| read_unformated after write_array2d | 0.055471 / 0.043533 (0.011938) |
| read_unformated after write_flattened_sequence | 0.337905 / 0.255139 (0.082766) |
| read_unformated after write_nested_sequence | 0.379363 / 0.283200 (0.096163) |
| write_array2d | 0.115000 / 0.141683 (-0.026683) |
| write_flattened_sequence | 1.545361 / 1.452155 (0.093206) |
| write_nested_sequence | 1.533539 / 1.492716 (0.040823) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.306741 / 0.018006 (0.288735) |
| get_batch_of_1024_rows | 0.556095 / 0.000490 (0.555605) |
| get_first_row | 0.001073 / 0.000200 (0.000873) |
| get_last_row | 0.000119 / 0.000054 (0.000064) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.025147 / 0.037411 (-0.012265) |
| shard | 0.106211 / 0.014526 (0.091685) |
| shuffle | 0.115644 / 0.176557 (-0.060912) |
| sort | 0.168901 / 0.737135 (-0.568234) |
| train_test_split | 0.122370 / 0.296338 (-0.173968) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.423923 / 0.215209 (0.208714) |
| read 50000 | 4.217780 / 2.077655 (2.140125) |
| read_batch 50000 10 | 2.037572 / 1.504120 (0.533452) |
| read_batch 50000 100 | 1.856054 / 1.541195 (0.314859) |
| read_batch 50000 1000 | 1.984273 / 1.468490 (0.515783) |
| read_formatted numpy 5000 | 0.419432 / 4.584777 (-4.165345) |
| read_formatted pandas 5000 | 3.851239 / 3.745712 (0.105526) |
| read_formatted tensorflow 5000 | 2.071152 / 5.269862 (-3.198710) |
| read_formatted torch 5000 | 1.247571 / 4.565676 (-3.318106) |
| read_formatted_batch numpy 5000 10 | 0.052303 / 0.424275 (-0.371972) |
| read_formatted_batch numpy 5000 1000 | 0.011374 / 0.007607 (0.003767) |
| shuffled read 5000 | 0.528096 / 0.226044 (0.302052) |
| shuffled read 50000 | 5.282259 / 2.268929 (3.013331) |
| shuffled read_batch 50000 10 | 2.519953 / 55.444624 (-52.924672) |
| shuffled read_batch 50000 100 | 2.186548 / 6.876477 (-4.689929) |
| shuffled read_batch 50000 1000 | 2.399501 / 2.142072 (0.257429) |
| shuffled read_formatted numpy 5000 | 0.530341 / 4.805227 (-4.274886) |
| shuffled read_formatted_batch numpy 5000 10 | 0.119955 / 6.500664 (-6.380709) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062132 / 0.075469 (-0.013337) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.559076 / 1.841788 (-0.282711) |
| map fast-tokenizer batched | 15.272435 / 8.074308 (7.198127) |
| map identity | 25.668309 / 10.191392 (15.476917) |
| map identity batched | 0.958876 / 0.680424 (0.278453) |
| map no-op batched | 0.614031 / 0.534201 (0.079830) |
| map no-op batched numpy | 0.388004 / 0.579283 (-0.191279) |
| map no-op batched pandas | 0.435810 / 0.434364 (0.001447) |
| map no-op batched pytorch | 0.269921 / 0.540337 (-0.270417) |
| map no-op batched tensorflow | 0.276272 / 1.386936 (-1.110665) |

