Commit 80f4fbc

Merge remote-tracking branch 'upstream/main' into load-github-datasets-from-hub

albertvillanova committed Sep 16, 2022
2 parents 14e322f + 51aef08
Showing 1,301 changed files with 58,172 additions and 28,503 deletions.
102 changes: 0 additions & 102 deletions .circleci/config.yml

This file was deleted.

81 changes: 0 additions & 81 deletions .circleci/deploy.sh

This file was deleted.

2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/add-dataset.md
@@ -14,4 +14,4 @@ assignees: ''
 - **Data:** *link to the Github repository or current dataset location*
 - **Motivation:** *what are some good reasons to have this dataset*

-Instructions to add a new dataset can be found [here](https://github.com/huggingface/datasets/blob/master/ADD_NEW_DATASET.md).
+Instructions to add a new dataset can be found [here](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
3 changes: 3 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -1,4 +1,7 @@
 contact_links:
+  - name: Datasets on the Hugging Face Hub
+    url: https://huggingface.co/datasets
+    about: Open a Pull request / Discussion related to a specific dataset on the Hugging Face Hub (PRs for datasets with no namespace still have to be on GitHub though)
   - name: Forum
     url: https://discuss.huggingface.co/c/datasets/10
     about: Please ask and answer questions here, and engage with other community members
27 changes: 27 additions & 0 deletions .github/ISSUE_TEMPLATE/dataset-viewer.yml
@@ -0,0 +1,27 @@
+name: Dataset Viewer Issue
+description: Issue related to the Dataset Viewer on huggingface.co/datasets
+title: "Dataset Viewer issue for [dataset name]"
+labels: ["dataset-viewer"]
+assignees:
+  - severo
+body:
+  - type: input
+    id: url
+    attributes:
+      label: Link
+      description: Link to the dataset viewer page
+      placeholder: ex. https://huggingface.co/datasets/glue/viewer/cola/test
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: Short description of the issue
+      placeholder: Tell us what the issue is and which error you get
+  - type: dropdown
+    id: owner
+    attributes:
+      label: Owner
+      description: Is it you who added this dataset?
+      options:
+        - "Yes"
+        - "No"
15 changes: 0 additions & 15 deletions .github/ISSUE_TEMPLATE/dataset_viewer.md

This file was deleted.

4 changes: 2 additions & 2 deletions .github/conda/meta.yaml
@@ -15,7 +15,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=5.0.0
+    - pyarrow >=6.0.0
     - python-xxhash
     - dill
     - pandas
@@ -32,7 +32,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=5.0.0
+    - pyarrow >=6.0.0
     - python-xxhash
     - dill
     - pandas
2 changes: 1 addition & 1 deletion .github/hub/requirements.txt
@@ -1,4 +1,4 @@
-GitPython==3.1.11
+GitPython==3.1.27
 python-dotenv==0.19.2
 requests==2.25.1
 tqdm==4.62.3
47 changes: 22 additions & 25 deletions .github/hub/update_hub_repositories.py
@@ -29,7 +29,9 @@
 HUB_CANONICAL_WHOAMI = HUB_ENDPOINT + "/api/whoami-v2"
 HUB_CANONICAL_CREATE_URL = HUB_ENDPOINT + "/api/repos/create"
 HUB_CANONICAL_INFO_URL = HUB_ENDPOINT + "/api/datasets/{dataset_name}"
-HUB_CANONICAL_DATASET_GIT_URL = HUB_ENDPOINT.replace("https://", "https://user:{token}@") + "/datasets/{dataset_name}.git"
+HUB_CANONICAL_DATASET_GIT_URL = (
+    HUB_ENDPOINT.replace("https://", "https://user:{token}@") + "/datasets/{dataset_name}.git"
+)
 HUB_API_GH_TO_HF = HUB_ENDPOINT + "/api/gh-to-hf/{github_username}"
 DATASETS_LIB_CATALOG_DIR_NAME = "datasets"
 DATASETS_LIB_COMMIT_URL = "https://github.com/huggingface/datasets/commit/{hexsha}"
@@ -102,17 +104,6 @@ def check_authorizations(user_info: dict):
         )


-def apply_hacks_for_moon_landing(dataset_repo_path: Path):
-    if (dataset_repo_path / "README.md").is_file():
-        with (dataset_repo_path / "README.md").open() as f:
-            readme_content = f.read()
-        if readme_content.count("---\n") > 1:
-            _, tags, content = readme_content.split("---\n", 2)
-            tags = tags.replace("\nlicense:", "\nlicenses:").replace(".", "-").replace("$", "%")
-            with (dataset_repo_path / "README.md").open("w") as f:
-                f.write("---\n".join(["", tags, content]))
-
-
 class update_main:
     def __init__(
         self,
@@ -136,7 +127,8 @@ def __call__(self, dataset_name: str) -> bool:
             logger.warning(f"[{dataset_name}] " + repr(e))
         if not canonical_dataset_path(dataset_name).is_dir():
             repo = Repo.clone_from(
-                canonical_dataset_git_url(dataset_name, self.token), to_path=canonical_dataset_path(dataset_name)
+                canonical_dataset_git_url(dataset_name, token=self.token),
+                to_path=canonical_dataset_path(dataset_name),
             )
         else:
             repo = Repo(canonical_dataset_path(dataset_name))
@@ -145,7 +137,11 @@ def __call__(self, dataset_name: str) -> bool:
         logs.append(repo.git.reset("--hard"))
         logs.append(repo.git.clean("-f", "-d"))
         logs.append(repo.git.checkout(CANONICAL_DATASET_REPO_MAIN_BRANCH))
-        logs.append(repo.remote().pull())
+        try:
+            logs.append(repo.remote().pull())
+        except Exception as e:
+            logs.append("pull failed !")
+            logs.append(repr(e))
         # Copy the changes and commit
         distutils.dir_util.copy_tree(
             str(src_canonical_dataset_path(datasets_lib_path, dataset_name)), str(canonical_dataset_path(dataset_name))
@@ -155,7 +151,6 @@ def __call__(self, dataset_name: str) -> bool:
                 (canonical_dataset_path(dataset_name) / filepath_to_delete).unlink()
             except Exception as e:
                 logger.warning(f"[{dataset_name}] Couldn't delete file at {filepath_to_delete}: {repr(e)}")
-        apply_hacks_for_moon_landing(canonical_dataset_path(dataset_name))
         logs.append(repo.git.add("."))
         if "Changes to be committed:" in repo.git.status():
             logs.append(repo.git.commit(*self.commit_args))
@@ -168,6 +163,7 @@ def __call__(self, dataset_name: str) -> bool:
                 logs.append(repo.git.tag(self.tag_name, f"-m Add tag from datasets {self.tag_name}"))
                 logs.append(repo.git.push("--tags"))
             except Exception as e:
+                logs.append("push failed !")
                 logs.append(repr(e))
         if "Your branch is up to date with" not in repo.git.status():
             logs.append(repo.git.status())
@@ -210,31 +206,29 @@ def __call__(self, dataset_name: str) -> bool:
         path
         for diff in datasets_lib_repo.index.diff(prev_commit)
         for path in [diff.a_path, diff.b_path]
-        if path.startswith(DATASETS_LIB_CATALOG_DIR_NAME)
-        and path.count("/") >= 2
+        if path.startswith(DATASETS_LIB_CATALOG_DIR_NAME) and path.count("/") >= 2
     ]

     changed_datasets_names_since_last_commit = {path.split("/")[1] for path in changed_files_since_last_commit}
     # ignore json, csv etc.
     changed_datasets_names_since_last_commit = {
-        dataset_name for dataset_name in changed_datasets_names_since_last_commit
+        dataset_name
+        for dataset_name in changed_datasets_names_since_last_commit
         if (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME / dataset_name / (dataset_name + ".py")).is_file()
     }

     deleted_files = {dataset_name: set() for dataset_name in changed_datasets_names_since_last_commit}
     for path in changed_files_since_last_commit:
         _, dataset_name, rel_path = path.split("/", 2)
-        if (
-            dataset_name in changed_datasets_names_since_last_commit
-            and not (datasets_lib_path / path).is_file()
-        ):
+        if dataset_name in changed_datasets_names_since_last_commit and not (datasets_lib_path / path).is_file():
             deleted_files[dataset_name].add(rel_path)

     dataset_names = sys.argv[1:]
     if dataset_names:
         if dataset_names[0] == "--all":
             dataset_names = sorted(
-                d.name for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
+                d.name
+                for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
                 if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
             )
         if dataset_names[0] == "--auto":
@@ -245,7 +239,8 @@ def __call__(self, dataset_name: str) -> bool:
             )
             dataset_names = sorted(d.name for d in (ROOT / HUB_DIR_NAME).glob("*") if d.is_dir())
             dataset_names = sorted(
-                d.name for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
+                d.name
+                for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
                 if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
             )
     else:
Expand All @@ -268,7 +263,9 @@ def __call__(self, dataset_name: str) -> bool:
),
dataset_names,
)
datasets_with_errors = [dataset_name for success, dataset_name in zip(successes, dataset_names) if not success]
datasets_with_errors = [
dataset_name for success, dataset_name in zip(successes, dataset_names) if not success
]
if datasets_with_errors:
raise UpdateFailed(
f"Those datasets couldn't be updated: {' '.join(datasets_with_errors)}\n"
Expand Down
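Taken together, the patched script's per-dataset flow is: clone or reuse a local checkout of the canonical repo, hard-reset and clean it, pull (now tolerating pull failures instead of aborting), overlay the files from the datasets catalog, then stage, commit, and push. The sketch below is a minimal illustration of that flow with GitPython, not the actual implementation; `HUB_GIT_URL`, `sync_dataset`, `src_path`, and `local_path` are hypothetical stand-ins for the script's `canonical_dataset_git_url`, `update_main.__call__`, and path helpers.

```python
# Minimal sketch of the sync loop, assuming GitPython is installed
# (pip install GitPython). All names here are hypothetical stand-ins,
# not the real script's API.
import shutil
from pathlib import Path

from git import Repo

HUB_GIT_URL = "https://user:{token}@huggingface.co/datasets/{name}.git"


def sync_dataset(name: str, token: str, src_path: Path, local_path: Path, branch: str = "main") -> bool:
    logs = []
    # Clone on the first run, reuse the existing checkout afterwards
    if not local_path.is_dir():
        repo = Repo.clone_from(HUB_GIT_URL.format(token=token, name=name), to_path=local_path)
    else:
        repo = Repo(local_path)
    # Start from a clean state before overlaying the new files
    logs.append(repo.git.reset("--hard"))
    logs.append(repo.git.clean("-f", "-d"))
    logs.append(repo.git.checkout(branch))
    try:
        logs.append(repo.remote().pull())
    except Exception as e:
        # Tolerate transient pull failures, as the patch above now does
        logs.append(f"pull failed: {e!r}")
    # Overlay the catalog copy of the dataset, then stage everything
    shutil.copytree(src_path, local_path, dirs_exist_ok=True)
    logs.append(repo.git.add("."))
    if "Changes to be committed:" in repo.git.status():
        logs.append(repo.git.commit("-m", f"Update {name}"))
        try:
            logs.append(repo.git.push())
        except Exception as e:
            logs.append(f"push failed: {e!r}")
            return False
    return True
```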

1 comment on commit 80f4fbc

@github-actions

PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.008575 / 0.011353 (-0.002777) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004120 / 0.011008 (-0.006888) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.030212 / 0.038508 (-0.008296) |
| read_batch_unformated after write_array2d | 0.036298 / 0.023109 (0.013189) |
| read_batch_unformated after write_flattened_sequence | 0.307666 / 0.275898 (0.031768) |
| read_batch_unformated after write_nested_sequence | 0.377267 / 0.323480 (0.053787) |
| read_col_formatted_as_numpy after write_array2d | 0.006258 / 0.007986 (-0.001728) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003696 / 0.004328 (-0.000633) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007446 / 0.004250 (0.003196) |
| read_col_unformated after write_array2d | 0.053678 / 0.037052 (0.016626) |
| read_col_unformated after write_flattened_sequence | 0.309355 / 0.258489 (0.050866) |
| read_col_unformated after write_nested_sequence | 0.350897 / 0.293841 (0.057056) |
| read_formatted_as_numpy after write_array2d | 0.032265 / 0.128546 (-0.096281) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009827 / 0.075646 (-0.065819) |
| read_formatted_as_numpy after write_nested_sequence | 0.259681 / 0.419271 (-0.159591) |
| read_unformated after write_array2d | 0.053305 / 0.043533 (0.009772) |
| read_unformated after write_flattened_sequence | 0.302543 / 0.255139 (0.047404) |
| read_unformated after write_nested_sequence | 0.316246 / 0.283200 (0.033047) |
| write_array2d | 0.111838 / 0.141683 (-0.029845) |
| write_flattened_sequence | 1.501336 / 1.452155 (0.049182) |
| write_nested_sequence | 1.545709 / 1.492716 (0.052993) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.288641 / 0.018006 (0.270635) |
| get_batch_of_1024_rows | 0.628912 / 0.000490 (0.628422) |
| get_first_row | 0.003202 / 0.000200 (0.003002) |
| get_last_row | 0.000159 / 0.000054 (0.000104) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.023733 / 0.037411 (-0.013678) |
| shard | 0.106245 / 0.014526 (0.091719) |
| shuffle | 0.115388 / 0.176557 (-0.061168) |
| sort | 0.163702 / 0.737135 (-0.573434) |
| train_test_split | 0.121241 / 0.296338 (-0.175098) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.392781 / 0.215209 (0.177572) |
| read 50000 | 3.915303 / 2.077655 (1.837648) |
| read_batch 50000 10 | 1.750749 / 1.504120 (0.246629) |
| read_batch 50000 100 | 1.564623 / 1.541195 (0.023429) |
| read_batch 50000 1000 | 1.684868 / 1.468490 (0.216378) |
| read_formatted numpy 5000 | 0.422391 / 4.584777 (-4.162386) |
| read_formatted pandas 5000 | 3.816375 / 3.745712 (0.070663) |
| read_formatted tensorflow 5000 | 2.091313 / 5.269862 (-3.178549) |
| read_formatted torch 5000 | 1.247689 / 4.565676 (-3.317988) |
| read_formatted_batch numpy 5000 10 | 0.051707 / 0.424275 (-0.372568) |
| read_formatted_batch numpy 5000 1000 | 0.011374 / 0.007607 (0.003767) |
| shuffled read 5000 | 0.502026 / 0.226044 (0.275982) |
| shuffled read 50000 | 5.012250 / 2.268929 (2.743322) |
| shuffled read_batch 50000 10 | 2.258016 / 55.444624 (-53.186608) |
| shuffled read_batch 50000 100 | 1.922069 / 6.876477 (-4.954408) |
| shuffled read_batch 50000 1000 | 2.154072 / 2.142072 (0.012000) |
| shuffled read_formatted numpy 5000 | 0.543573 / 4.805227 (-4.261654) |
| shuffled read_formatted_batch numpy 5000 10 | 0.120687 / 6.500664 (-6.379977) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062695 / 0.075469 (-0.012774) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.527544 / 1.841788 (-0.314244) |
| map fast-tokenizer batched | 14.666360 / 8.074308 (6.592052) |
| map identity | 25.325495 / 10.191392 (15.134103) |
| map identity batched | 0.920398 / 0.680424 (0.239974) |
| map no-op batched | 0.585767 / 0.534201 (0.051566) |
| map no-op batched numpy | 0.388743 / 0.579283 (-0.190540) |
| map no-op batched pandas | 0.443646 / 0.434364 (0.009282) |
| map no-op batched pytorch | 0.275864 / 0.540337 (-0.264474) |
| map no-op batched tensorflow | 0.289195 / 1.386936 (-1.097741) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.006786 / 0.011353 (-0.004567) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004187 / 0.011008 (-0.006821) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.028123 / 0.038508 (-0.010385) |
| read_batch_unformated after write_array2d | 0.037326 / 0.023109 (0.014216) |
| read_batch_unformated after write_flattened_sequence | 0.344320 / 0.275898 (0.068422) |
| read_batch_unformated after write_nested_sequence | 0.432991 / 0.323480 (0.109511) |
| read_col_formatted_as_numpy after write_array2d | 0.004460 / 0.007986 (-0.003526) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005872 / 0.004328 (0.001544) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.005110 / 0.004250 (0.000860) |
| read_col_unformated after write_array2d | 0.053204 / 0.037052 (0.016151) |
| read_col_unformated after write_flattened_sequence | 0.343643 / 0.258489 (0.085154) |
| read_col_unformated after write_nested_sequence | 0.390535 / 0.293841 (0.096694) |
| read_formatted_as_numpy after write_array2d | 0.030995 / 0.128546 (-0.097552) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009765 / 0.075646 (-0.065881) |
| read_formatted_as_numpy after write_nested_sequence | 0.258627 / 0.419271 (-0.160645) |
| read_unformated after write_array2d | 0.055471 / 0.043533 (0.011938) |
| read_unformated after write_flattened_sequence | 0.337905 / 0.255139 (0.082766) |
| read_unformated after write_nested_sequence | 0.379363 / 0.283200 (0.096163) |
| write_array2d | 0.115000 / 0.141683 (-0.026683) |
| write_flattened_sequence | 1.545361 / 1.452155 (0.093206) |
| write_nested_sequence | 1.533539 / 1.492716 (0.040823) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.306741 / 0.018006 (0.288735) |
| get_batch_of_1024_rows | 0.556095 / 0.000490 (0.555605) |
| get_first_row | 0.001073 / 0.000200 (0.000873) |
| get_last_row | 0.000119 / 0.000054 (0.000064) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.025147 / 0.037411 (-0.012265) |
| shard | 0.106211 / 0.014526 (0.091685) |
| shuffle | 0.115644 / 0.176557 (-0.060912) |
| sort | 0.168901 / 0.737135 (-0.568234) |
| train_test_split | 0.122370 / 0.296338 (-0.173968) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.423923 / 0.215209 (0.208714) |
| read 50000 | 4.217780 / 2.077655 (2.140125) |
| read_batch 50000 10 | 2.037572 / 1.504120 (0.533452) |
| read_batch 50000 100 | 1.856054 / 1.541195 (0.314859) |
| read_batch 50000 1000 | 1.984273 / 1.468490 (0.515783) |
| read_formatted numpy 5000 | 0.419432 / 4.584777 (-4.165345) |
| read_formatted pandas 5000 | 3.851239 / 3.745712 (0.105526) |
| read_formatted tensorflow 5000 | 2.071152 / 5.269862 (-3.198710) |
| read_formatted torch 5000 | 1.247571 / 4.565676 (-3.318106) |
| read_formatted_batch numpy 5000 10 | 0.052303 / 0.424275 (-0.371972) |
| read_formatted_batch numpy 5000 1000 | 0.011374 / 0.007607 (0.003767) |
| shuffled read 5000 | 0.528096 / 0.226044 (0.302052) |
| shuffled read 50000 | 5.282259 / 2.268929 (3.013331) |
| shuffled read_batch 50000 10 | 2.519953 / 55.444624 (-52.924672) |
| shuffled read_batch 50000 100 | 2.186548 / 6.876477 (-4.689929) |
| shuffled read_batch 50000 1000 | 2.399501 / 2.142072 (0.257429) |
| shuffled read_formatted numpy 5000 | 0.530341 / 4.805227 (-4.274886) |
| shuffled read_formatted_batch numpy 5000 10 | 0.119955 / 6.500664 (-6.380709) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062132 / 0.075469 (-0.013337) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.559076 / 1.841788 (-0.282711) |
| map fast-tokenizer batched | 15.272435 / 8.074308 (7.198127) |
| map identity | 25.668309 / 10.191392 (15.476917) |
| map identity batched | 0.958876 / 0.680424 (0.278453) |
| map no-op batched | 0.614031 / 0.534201 (0.079830) |
| map no-op batched numpy | 0.388004 / 0.579283 (-0.191279) |
| map no-op batched pandas | 0.435810 / 0.434364 (0.001447) |
| map no-op batched pytorch | 0.269921 / 0.540337 (-0.270417) |
| map no-op batched tensorflow | 0.276272 / 1.386936 (-1.110665) |

