Merge branch 'main' of github.com:huggingface/datasets into drop-python36

mariosasko committed Jul 22, 2022
2 parents 4db3cf9 + 5088e95 commit c4b4cb6
Showing 37 changed files with 790 additions and 660 deletions.
19 changes: 4 additions & 15 deletions datasets/crd3/README.md
@@ -55,9 +55,6 @@ paperswithcode_id: crd3
- **Repository:** [CRD3 repository](https://github.com/RevanthRameshkumar/CRD3)
- **Paper:** [Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset](https://www.aclweb.org/anthology/2020.acl-main.459/)
- **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- - **Size of downloaded dataset files:** 279.93 MB
- - **Size of the generated dataset:** 4020.33 MB
- - **Total amount of disk used:** 4300.25 MB

### Dataset Summary

@@ -69,6 +66,7 @@ collaboration and spoken interaction.
and semantic ties to the previous dialogues.

### Supported Tasks and Leaderboards

`summarization`: The dataset can be used to train a model for abstractive summarization. A [fast abstractive summarization-RL](https://github.com/ChenRocks/fast_abs_rl) model was presented as a baseline, which achieves ROUGE-L-F1 of 25.18.
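
As a rough illustration of the metric only (not the paper's evaluation code), a ROUGE-L score like the one reported can be computed with the `evaluate` package; the two strings below are made-up placeholders:

```python
# Minimal ROUGE-L sketch with the `evaluate` package
# (pip install evaluate rouge_score); toy strings only.
import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["the party defeats the dragon after a long battle"],
    references=["after a long battle, the party slays the dragon"],
)
print(scores["rougeL"])  # ROUGE-L F-measure for the toy pair
```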

### Languages
@@ -79,13 +77,8 @@ The text in the dataset is in English, as spoken by actors on The Critical Role

### Data Instances

- #### default
-
- - **Size of downloaded dataset files:** 279.93 MB
- - **Size of the generated dataset:** 4020.33 MB
- - **Total amount of disk used:** 4300.25 MB

An example of 'train' looks as follows.

```
{
    "alignment_score": 3.679936647415161,
    ...
}
```
@@ -105,7 +98,6 @@ An example of 'train' looks as follows.

The data fields are the same among all splits.

- #### default
- `chunk`: a `string` feature.
- `chunk_id`: a `int32` feature.
- `turn_start`: a `int32` feature.
@@ -120,7 +112,7 @@ The data fields are the same among all splits.

| name | train |validation| test |
|-------|------:|---------:|------:|
-|default|26,232| 3,470|4,541|
+|default|38,969| 6,327|7,500|
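
The updated row can be sanity-checked with a quick (hypothetical) session, assuming the `datasets` library and network access:

```python
from datasets import load_dataset

# Load the dataset and compare split sizes against the table above.
dataset = load_dataset("crd3")
for split in ("train", "validation", "test"):
    print(split, dataset[split].num_rows)
# Expected per the updated row: train 38969, validation 6327, test 7500
```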

## Dataset Creation

@@ -180,19 +172,16 @@ This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License

### Citation Information

```bibtex
@inproceedings{
title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},
author = {Rameshkumar, Revanth and Bailey, Peter},
year = {2020},
publisher = {Association for Computational Linguistics},
conference = {ACL}
}
```


### Contributions

Thanks to [@thomwolf](https://github.com/thomwolf), [@lhoestq](https://github.com/lhoestq), [@mariamabarham](https://github.com/mariamabarham), [@lewtun](https://github.com/lewtun) for adding this dataset.
20 changes: 11 additions & 9 deletions datasets/crd3/crd3.py
@@ -45,11 +45,11 @@
and semantic ties to the previous dialogues.
"""

_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip"
_URL = "https://huggingface.co/datasets/crd3/resolve/72bffe55b4d5bf19b530d3e417447b3384ba3673/data/aligned%20data.zip"


def get_train_test_dev_files(files, test_split, train_split, dev_split):
-    test_files = dev_files = train_files = []
+    test_files, dev_files, train_files = [], [], []
    for file in files:
        filename = os.path.split(file)[1].split("_")[0]
        if filename in test_split:
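
A short aside on the one-line fix above: chained assignment binds several names to one object, which was the bug here. A minimal sketch (the filename "C2E001" is made up):

```python
# Buggy: all three names point at the SAME list object,
# so a file appended for one split shows up in all three.
test_files = dev_files = train_files = []
test_files.append("C2E001")
print(dev_files is test_files)  # True
print(dev_files)                # ['C2E001']

# Fixed (as in this commit): tuple unpacking creates three
# independent lists.
test_files, dev_files, train_files = [], [], []
test_files.append("C2E001")
print(dev_files is test_files)  # False
print(dev_files)                # []
```
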
@@ -88,20 +88,22 @@ def _info(self):
        )

    def _split_generators(self, dl_manager):
-        path = dl_manager.download_and_extract(_URL)
-        test_file = os.path.join(path, "CRD3-master", "data", "aligned data", "test_files")
-        train_file = os.path.join(path, "CRD3-master", "data", "aligned data", "train_files")
-        dev_file = os.path.join(path, "CRD3-master", "data", "aligned data", "val_files")
+        root = dl_manager.download_and_extract(_URL)
+        path = os.path.join(root, "aligned data")
+
+        test_file = os.path.join(path, "test_files")
+        train_file = os.path.join(path, "train_files")
+        dev_file = os.path.join(path, "val_files")
        with open(test_file, encoding="utf-8") as f:
            test_splits = [file.replace("\n", "") for file in f.readlines()]

        with open(train_file, encoding="utf-8") as f:
            train_splits = [file.replace("\n", "") for file in f.readlines()]
        with open(dev_file, encoding="utf-8") as f:
            dev_splits = [file.replace("\n", "") for file in f.readlines()]
-        c2 = "CRD3-master/data/aligned data/c=2"
-        c3 = "CRD3-master/data/aligned data/c=3"
-        c4 = "CRD3-master/data/aligned data/c=4"
+        c2 = "c=2"
+        c3 = "c=3"
+        c4 = "c=4"
        files = [os.path.join(path, c2, file) for file in sorted(os.listdir(os.path.join(path, c2)))]
        files.extend([os.path.join(path, c3, file) for file in sorted(os.listdir(os.path.join(path, c3)))])
        files.extend([os.path.join(path, c4, file) for file in sorted(os.listdir(os.path.join(path, c4)))])
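
As an aside, the three `files = ...` / `files.extend(...)` lines could be collapsed into a loop; a sketch only (`gather_aligned_files` is a hypothetical helper, not part of this commit):

```python
import os

def gather_aligned_files(path):
    """Collect aligned-data files from the c=2, c=3 and c=4 subdirectories."""
    files = []
    for c in ("c=2", "c=3", "c=4"):
        subdir = os.path.join(path, c)
        files.extend(os.path.join(subdir, name) for name in sorted(os.listdir(subdir)))
    return files
```
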
2 changes: 1 addition & 1 deletion datasets/crd3/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}, "number": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "crd3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}, "test": {"name": "test", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}, "validation": {"name": "validation", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 294222220, "checksum": "c77a937394f265735ba54b32a7a051f77a97d264c74b0535dee77ef9791815b5"}}, "download_size": 294222220, "post_processing_size": null, "dataset_size": 955682019, "size_in_bytes": 1249904239}}
{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turns": [{"names": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "utterances": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "number": {"dtype": "int32", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "crd3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 236605152, "num_examples": 38969, "dataset_name": "crd3"}, "test": {"name": "test", "num_bytes": 40269203, "num_examples": 7500, "dataset_name": "crd3"}, "validation": {"name": "validation", "num_bytes": 41543528, "num_examples": 6327, "dataset_name": "crd3"}}, "download_checksums": {"https://huggingface.co/datasets/crd3/resolve/72bffe55b4d5bf19b530d3e417447b3384ba3673/data/aligned%20data.zip": {"num_bytes": 117519820, "checksum": "c66bd9f7848bcd514a35c154edd2fc874f1a3076876d8bd7208bf3caf4b7fb0b"}}, "download_size": 117519820, "post_processing_size": null, "dataset_size": 318417883, "size_in_bytes": 435937703}}
Binary file modified datasets/crd3/dummy/0.0.0/dummy_data.zip
Binary file not shown.
3 changes: 2 additions & 1 deletion datasets/mlsum/README.md
@@ -20,12 +20,13 @@ source_datasets:
- extended|cnn_dailymail
- original
task_categories:
+- summarization
- translation
- text-classification
task_ids:
+- news-articles-summarization
- multi-class-classification
- multi-label-classification
-- summarization
- topic-classification
paperswithcode_id: mlsum
pretty_name: MLSUM

1 comment on commit c4b4cb6

@github-actions


PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---:|
| read_batch_formatted_as_numpy after write_array2d | 0.009117 / 0.011353 (-0.002236) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.002967 / 0.011008 (-0.008041) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.020847 / 0.038508 (-0.017661) |
| read_batch_unformated after write_array2d | 0.023432 / 0.023109 (0.000323) |
| read_batch_unformated after write_flattened_sequence | 0.256254 / 0.275898 (-0.019644) |
| read_batch_unformated after write_nested_sequence | 0.306597 / 0.323480 (-0.016883) |
| read_col_formatted_as_numpy after write_array2d | 0.004351 / 0.007986 (-0.003635) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003273 / 0.004328 (-0.001056) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.005047 / 0.004250 (0.000796) |
| read_col_unformated after write_array2d | 0.038329 / 0.037052 (0.001277) |
| read_col_unformated after write_flattened_sequence | 0.252400 / 0.258489 (-0.006089) |
| read_col_unformated after write_nested_sequence | 0.295854 / 0.293841 (0.002013) |
| read_formatted_as_numpy after write_array2d | 0.021129 / 0.128546 (-0.107417) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007022 / 0.075646 (-0.068624) |
| read_formatted_as_numpy after write_nested_sequence | 0.200530 / 0.419271 (-0.218742) |
| read_unformated after write_array2d | 0.036558 / 0.043533 (-0.006975) |
| read_unformated after write_flattened_sequence | 0.246199 / 0.255139 (-0.008940) |
| read_unformated after write_nested_sequence | 0.276380 / 0.283200 (-0.006819) |
| write_array2d | 0.074232 / 0.141683 (-0.067451) |
| write_flattened_sequence | 1.247616 / 1.452155 (-0.204538) |
| write_nested_sequence | 1.207528 / 1.492716 (-0.285188) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---:|
| get_batch_of_1024_random_rows | 0.162736 / 0.018006 (0.144730) |
| get_batch_of_1024_rows | 0.463255 / 0.000490 (0.462765) |
| get_first_row | 0.000554 / 0.000200 (0.000354) |
| get_last_row | 0.000056 / 0.000054 (0.000002) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---:|
| select | 0.016406 / 0.037411 (-0.021005) |
| shard | 0.073740 / 0.014526 (0.059214) |
| shuffle | 0.080011 / 0.176557 (-0.096546) |
| sort | 0.114384 / 0.737135 (-0.622751) |
| train_test_split | 0.081351 / 0.296338 (-0.214987) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---:|
| read 5000 | 0.329767 / 0.215209 (0.114558) |
| read 50000 | 3.295624 / 2.077655 (1.217969) |
| read_batch 50000 10 | 1.511803 / 1.504120 (0.007683) |
| read_batch 50000 100 | 1.377860 / 1.541195 (-0.163335) |
| read_batch 50000 1000 | 1.393836 / 1.468490 (-0.074654) |
| read_formatted numpy 5000 | 0.337875 / 4.584777 (-4.246902) |
| read_formatted pandas 5000 | 2.918990 / 3.745712 (-0.826722) |
| read_formatted tensorflow 5000 | 1.468587 / 5.269862 (-3.801275) |
| read_formatted torch 5000 | 0.891992 / 4.565676 (-3.673685) |
| read_formatted_batch numpy 5000 10 | 0.044716 / 0.424275 (-0.379560) |
| read_formatted_batch numpy 5000 1000 | 0.008215 / 0.007607 (0.000608) |
| shuffled read 5000 | 0.415988 / 0.226044 (0.189943) |
| shuffled read 50000 | 4.189235 / 2.268929 (1.920307) |
| shuffled read_batch 50000 10 | 1.855979 / 55.444624 (-53.588645) |
| shuffled read_batch 50000 100 | 1.597131 / 6.876477 (-5.279345) |
| shuffled read_batch 50000 1000 | 1.720794 / 2.142072 (-0.421279) |
| shuffled read_formatted numpy 5000 | 0.435706 / 4.805227 (-4.369521) |
| shuffled read_formatted_batch numpy 5000 10 | 0.094202 / 6.500664 (-6.406462) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.045561 / 0.075469 (-0.029908) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---:|
| filter | 1.109329 / 1.841788 (-0.732458) |
| map fast-tokenizer batched | 13.370726 / 8.074308 (5.296418) |
| map identity | 19.016395 / 10.191392 (8.825003) |
| map identity batched | 0.656265 / 0.680424 (-0.024159) |
| map no-op batched | 0.454665 / 0.534201 (-0.079536) |
| map no-op batched numpy | 0.266558 / 0.579283 (-0.312725) |
| map no-op batched pandas | 0.329818 / 0.434364 (-0.104546) |
| map no-op batched pytorch | 0.177836 / 0.540337 (-0.362501) |
| map no-op batched tensorflow | 0.202651 / 1.386936 (-1.184285) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---:|
| read_batch_formatted_as_numpy after write_array2d | 0.004431 / 0.011353 (-0.006922) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003015 / 0.011008 (-0.007993) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.025303 / 0.038508 (-0.013205) |
| read_batch_unformated after write_array2d | 0.028572 / 0.023109 (0.005463) |
| read_batch_unformated after write_flattened_sequence | 0.306509 / 0.275898 (0.030611) |
| read_batch_unformated after write_nested_sequence | 0.338898 / 0.323480 (0.015418) |
| read_col_formatted_as_numpy after write_array2d | 0.002716 / 0.007986 (-0.005270) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003309 / 0.004328 (-0.001019) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.003499 / 0.004250 (-0.000751) |
| read_col_unformated after write_array2d | 0.029487 / 0.037052 (-0.007566) |
| read_col_unformated after write_flattened_sequence | 0.309897 / 0.258489 (0.051408) |
| read_col_unformated after write_nested_sequence | 0.323312 / 0.293841 (0.029472) |
| read_formatted_as_numpy after write_array2d | 0.019675 / 0.128546 (-0.108871) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007029 / 0.075646 (-0.068618) |
| read_formatted_as_numpy after write_nested_sequence | 0.198127 / 0.419271 (-0.221144) |
| read_unformated after write_array2d | 0.034980 / 0.043533 (-0.008553) |
| read_unformated after write_flattened_sequence | 0.305895 / 0.255139 (0.050756) |
| read_unformated after write_nested_sequence | 0.311157 / 0.283200 (0.027958) |
| write_array2d | 0.073236 / 0.141683 (-0.068446) |
| write_flattened_sequence | 1.192255 / 1.452155 (-0.259900) |
| write_nested_sequence | 1.191005 / 1.492716 (-0.301712) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---:|
| get_batch_of_1024_random_rows | 0.173402 / 0.018006 (0.155395) |
| get_batch_of_1024_rows | 0.379643 / 0.000490 (0.379153) |
| get_first_row | 0.008050 / 0.000200 (0.007850) |
| get_last_row | 0.000315 / 0.000054 (0.000261) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---:|
| select | 0.015549 / 0.037411 (-0.021862) |
| shard | 0.079889 / 0.014526 (0.065363) |
| shuffle | 0.081794 / 0.176557 (-0.094763) |
| sort | 0.115633 / 0.737135 (-0.621502) |
| train_test_split | 0.088899 / 0.296338 (-0.207439) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---:|
| read 5000 | 0.333646 / 0.215209 (0.118437) |
| read 50000 | 3.258509 / 2.077655 (1.180854) |
| read_batch 50000 10 | 1.615450 / 1.504120 (0.111330) |
| read_batch 50000 100 | 1.493750 / 1.541195 (-0.047444) |
| read_batch 50000 1000 | 1.575404 / 1.468490 (0.106914) |
| read_formatted numpy 5000 | 0.344097 / 4.584777 (-4.240680) |
| read_formatted pandas 5000 | 2.994121 / 3.745712 (-0.751591) |
| read_formatted tensorflow 5000 | 1.499913 / 5.269862 (-3.769948) |
| read_formatted torch 5000 | 0.881479 / 4.565676 (-3.684197) |
| read_formatted_batch numpy 5000 10 | 0.047387 / 0.424275 (-0.376889) |
| read_formatted_batch numpy 5000 1000 | 0.007802 / 0.007607 (0.000195) |
| shuffled read 5000 | 0.409022 / 0.226044 (0.182978) |
| shuffled read 50000 | 4.115645 / 2.268929 (1.846716) |
| shuffled read_batch 50000 10 | 1.907671 / 55.444624 (-53.536953) |
| shuffled read_batch 50000 100 | 1.617787 / 6.876477 (-5.258689) |
| shuffled read_batch 50000 1000 | 1.767767 / 2.142072 (-0.374305) |
| shuffled read_formatted numpy 5000 | 0.423246 / 4.805227 (-4.381981) |
| shuffled read_formatted_batch numpy 5000 10 | 0.092868 / 6.500664 (-6.407797) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.054294 / 0.075469 (-0.021175) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---:|
| filter | 1.211968 / 1.841788 (-0.629819) |
| map fast-tokenizer batched | 14.518249 / 8.074308 (6.443941) |
| map identity | 20.593619 / 10.191392 (10.402227) |
| map identity batched | 0.702371 / 0.680424 (0.021947) |
| map no-op batched | 0.459742 / 0.534201 (-0.074459) |
| map no-op batched numpy | 0.285457 / 0.579283 (-0.293826) |
| map no-op batched pandas | 0.328908 / 0.434364 (-0.105456) |
| map no-op batched pytorch | 0.189993 / 0.540337 (-0.350344) |
| map no-op batched tensorflow | 0.211712 / 1.386936 (-1.175224) |
