Drop Python 3.6 support #4460

Merged · 24 commits · Jul 26, 2022
1 change: 0 additions & 1 deletion .github/hub/update_hub_repositories.py
@@ -1,4 +1,3 @@
import base64
import distutils.dir_util
import logging
import os
5 changes: 4 additions & 1 deletion .github/workflows/benchmarks.yaml
@@ -3,13 +3,16 @@ on: [push]
jobs:
run:
runs-on: [ubuntu-latest]
container: docker://dvcorg/cml-py3:latest
container: docker://dvcorg/cml:latest
steps:
- uses: actions/checkout@v2
- name: cml_run
env:
repo_token: ${{ secrets.GITHUB_TOKEN }}
run: |
# See https://github.com/actions/checkout/issues/760
git config --global --add safe.directory /__w/datasets/datasets

# Your ML workflow goes here

pip install --upgrade pip
10 changes: 2 additions & 8 deletions .github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.6"
python-version: "3.7"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
@@ -49,21 +49,15 @@ jobs:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Set up Python 3.6
if: ${{ matrix.os == 'ubuntu-latest' }}
uses: actions/setup-python@v4
with:
python-version: 3.6
- name: Set up Python 3.7
if: ${{ matrix.os == 'windows-latest' }}
uses: actions/setup-python@v4
with:
python-version: 3.7
- name: Upgrade pip
run: python -m pip install --upgrade pip
- name: Pin setuptools-scm
if: ${{ matrix.os == 'ubuntu-latest' }}
run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2"
run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2"
- name: Install dependencies
run: |
pip install .[tests]
4 changes: 2 additions & 2 deletions Makefile
@@ -3,14 +3,14 @@
# Check that source code meets quality standards

quality:
black --check --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
black --check --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
isort --check-only tests src benchmarks datasets/**/*.py metrics
flake8 tests src benchmarks datasets/**/*.py metrics

# Format source code automatically

style:
black --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
black --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
isort tests src benchmarks datasets/**/*.py metrics

# Run tests for the library
2 changes: 1 addition & 1 deletion additional-tests-requirements.txt
@@ -1,4 +1,4 @@
unbabel-comet>=1.0.0;python_version>'3.6'
unbabel-comet>=1.0.0
git+https://github.com/google-research/bleurt.git
git+https://github.com/ns-moosavi/coval.git
git+https://github.com/hendrycks/math.git
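
For context: `python_version>'3.6'` is a PEP 508 environment marker, and once the package's own floor is Python 3.7 the marker always evaluates to true, so it can simply be dropped. A minimal sketch of how such markers evaluate, using the `packaging` library this repo already depends on:

# Sketch only: evaluating the marker this requirements line used to carry.
from packaging.markers import Marker

marker = Marker("python_version > '3.6'")
print(marker.evaluate())  # True on any interpreter >= 3.7, hence redundant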
2 changes: 1 addition & 1 deletion docs/source/installation.md
@@ -1,6 +1,6 @@
# Installation

Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.6+**.
Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.7+**.

<Tip>

9 changes: 2 additions & 7 deletions setup.py
@@ -55,7 +55,6 @@
Then push the change with a message 'set dev version'
"""

import os

from setuptools import find_packages, setup

@@ -74,8 +73,6 @@
"requests>=2.19.0",
# progress bars in download and scripts
"tqdm>=4.62.1",
# dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'",
# for fast hashing
"xxhash",
# for better multiprocessing
@@ -105,7 +102,7 @@
BENCHMARKS_REQUIRE = [
"numpy==1.18.5",
"tensorflow==2.3.0",
"torch==1.6.0",
"torch==1.7.1",
"transformers==3.0.2",
]

@@ -165,8 +162,6 @@
"texttable>=1.6.3",
"Werkzeug>=1.0.1",
"six~=1.15.0",
# metadata validation
"importlib_resources;python_version<'3.7'",
]

TESTS_REQUIRE.extend(VISION_REQURE)
@@ -214,6 +209,7 @@
packages=find_packages("src"),
package_data={"datasets": ["py.typed", "scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"]},
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
python_requires=">=3.7.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
classifiers=[
@@ -224,7 +220,6 @@
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
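The new `python_requires=">=3.7.0"` line is what makes pip skip this release on older interpreters: pip resolves to the newest release whose `Requires-Python` metadata matches the running Python. A hedged sketch of inspecting that metadata after installation; `importlib.metadata` is standard library on Python 3.8+ (the `importlib_metadata` backport provides the same API on 3.7):

# Sketch, assuming an installed `datasets` distribution is present.
from importlib.metadata import metadata

print(metadata("datasets")["Requires-Python"])  # ">=3.7.0" after this PR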
8 changes: 8 additions & 0 deletions src/datasets/__init__.py
@@ -19,10 +19,17 @@

__version__ = "2.3.3.dev0"

import platform

import pyarrow
from packaging import version


if version.parse(platform.python_version()) < version.parse("3.7"):
raise ImportWarning(
"To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition."
)

if version.parse(pyarrow.__version__).major < 6:
raise ImportWarning(
"To use `datasets`, the module `pyarrow>=6.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n"
@@ -31,6 +38,7 @@

SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__

del platform
del pyarrow
del version

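The added guard fails fast at import time when the interpreter predates 3.7. For illustration only (not what the PR ships), the same floor check can be written with just the standard library, since `sys.version_info` compares tuple-wise:

import sys

# Equivalent check without third-party helpers: (3, 6, 15) < (3, 7) is True.
if sys.version_info < (3, 7):
    raise ImportWarning("To use `datasets`, Python>=3.7 is required.")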
2 changes: 1 addition & 1 deletion src/datasets/features/features.py
@@ -824,7 +824,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray,
def take(
self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None
) -> "PandasArrayExtensionArray":
indices: np.ndarray = np.asarray(indices, dtype=np.int)
indices: np.ndarray = np.asarray(indices, dtype=int)
if allow_fill:
fill_value = (
self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type)
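The `dtype=np.int` fix is independent of the 3.6 drop but worth flagging: `np.int` was only a deprecated alias for the builtin `int` (deprecated in NumPy 1.20, removed in 1.24), so the builtin is the portable spelling. A quick illustration:

import numpy as np

# `dtype=int` maps to the platform's default integer type and works on every
# NumPy release; `dtype=np.int` raises AttributeError on NumPy >= 1.24.
indices = np.asarray([0, 2, 5], dtype=int)
print(indices.dtype)  # typically int64 (int32 on Windows)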
1 change: 1 addition & 0 deletions src/datasets/packaged_modules/text/dataset_infos.json
@@ -0,0 +1 @@
{"bigscience": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "text", "config_name": "bigscience", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 938, "num_examples": 22, "dataset_name": "text"}}, "download_checksums": {"C:\\Users\\Mario\\Desktop\\bigscience\\biscience.txt": {"num_bytes": 892, "checksum": "1e1f85c9e2aefb6990dc6ec4a8805af1e5451ebecb7e9f50face10c83eed742e"}}, "download_size": 892, "post_processing_size": null, "dataset_size": 938, "size_in_bytes": 1830}}
14 changes: 0 additions & 14 deletions src/datasets/utils/py_utils.py
@@ -23,7 +23,6 @@
import os
import pickle
import re
import sys
import types
from contextlib import contextmanager
from io import BytesIO as StringIO
@@ -515,19 +514,6 @@ class Pickler(dill.Pickler):

dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy())

def save_global(self, obj, name=None):
if sys.version_info[:2] < (3, 7) and _CloudPickleTypeHintFix._is_parametrized_type_hint(
obj
): # noqa # pragma: no branch
# Parametrized typing constructs in Python < 3.7 are not compatible
# with type checks and ``isinstance`` semantics. For this reason,
# it is easier to detect them using a duck-typing-based check
# (``_is_parametrized_type_hint``) than to populate the Pickler's
# dispatch with type-specific savers.
_CloudPickleTypeHintFix._save_parametrized_type_hint(self, obj)
else:
dill.Pickler.save_global(self, obj, name=name)

def memoize(self, obj):
# don't memoize strings since two identical strings can have different python ids
if type(obj) != str:
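The deleted `save_global` branch existed because parametrized typing constructs (`Union[str, None]`, `List[int]`, ...) were not picklable on Python 3.6; CPython 3.7 made them picklable natively, so dill handles them without the cloudpickle-derived workaround. A small check under that assumption:

# Sketch: on Python >= 3.7 this round-trips without any custom dispatch.
from typing import Optional, Union

import dill  # the pickling backend this library's Pickler builds on

hint = Union[str, None]
assert dill.loads(dill.dumps(hint)) == Optional[str]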
51 changes: 15 additions & 36 deletions tests/commands/test_dummy_data.py
@@ -1,45 +1,24 @@
import os
from collections import namedtuple
from dataclasses import dataclass

from packaging import version

from datasets import config
from datasets.commands.dummy_data import DummyDataCommand


if config.PY_VERSION >= version.parse("3.7"):
DummyDataCommandArgs = namedtuple(
"DummyDataCommandArgs",
[
"path_to_dataset",
"auto_generate",
"n_lines",
"json_field",
"xml_tag",
"match_text_files",
"keep_uncompressed",
"cache_dir",
"encoding",
],
defaults=[False, 5, None, None, None, False, None, None],
)
else:

@dataclass
class DummyDataCommandArgs:
path_to_dataset: str
auto_generate: bool = False
n_lines: int = 5
json_field: str = None
xml_tag: str = None
match_text_files: str = None
keep_uncompressed: bool = False
cache_dir: str = None
encoding: str = None

def __iter__(self):
return iter(self.__dict__.values())
DummyDataCommandArgs = namedtuple(
"DummyDataCommandArgs",
[
"path_to_dataset",
"auto_generate",
"n_lines",
"json_field",
"xml_tag",
"match_text_files",
"keep_uncompressed",
"cache_dir",
"encoding",
],
defaults=[False, 5, None, None, None, False, None, None],
)


class MockDummyDataCommand(DummyDataCommand):
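The dataclass fallback removed here (and in tests/commands/test_test.py below) was only needed because `collections.namedtuple` gained its `defaults` parameter in Python 3.7; with 3.6 gone, the namedtuple form works unconditionally. A minimal sketch of the feature, with hypothetical field names:

from collections import namedtuple

# `defaults` fills the rightmost fields, so only `path` is required here.
Args = namedtuple("Args", ["path", "n_lines", "encoding"], defaults=[5, None])
print(Args("my_dataset"))  # Args(path='my_dataset', n_lines=5, encoding=None)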
50 changes: 15 additions & 35 deletions tests/commands/test_test.py
@@ -1,46 +1,26 @@
import json
import os
from collections import namedtuple
from dataclasses import dataclass

from packaging import version

from datasets import config
from datasets.commands.test import TestCommand


if config.PY_VERSION >= version.parse("3.7"):
_TestCommandArgs = namedtuple(
"_TestCommandArgs",
[
"dataset",
"name",
"cache_dir",
"data_dir",
"all_configs",
"save_infos",
"ignore_verifications",
"force_redownload",
"clear_cache",
],
defaults=[None, None, None, False, False, False, False, False],
)
else:

@dataclass
class _TestCommandArgs:
dataset: str
name: str = None
cache_dir: str = None
data_dir: str = None
all_configs: bool = False
save_infos: bool = False
ignore_verifications: bool = False
force_redownload: bool = False
clear_cache: bool = False

def __iter__(self):
return iter(self.__dict__.values())
_TestCommandArgs = namedtuple(
"_TestCommandArgs",
[
"dataset",
"name",
"cache_dir",
"data_dir",
"all_configs",
"save_infos",
"ignore_verifications",
"force_redownload",
"clear_cache",
],
defaults=[None, None, None, False, False, False, False, False],
)


def test_test_command(dataset_loading_script_dir):
2 changes: 1 addition & 1 deletion tests/test_arrow_dataset.py
@@ -3119,7 +3119,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param


@pytest.mark.skipif(
os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
) # TODO: find what's wrong with CircleCI / GitHub Actions
@require_s3

Review discussion on this change:

Member: I don't think you need this? The test currently passes on main on Linux.

Collaborator (author): It doesn't on Python 3.7. Notice the test_dummy_dataset_serialize_s3 failure in one of the previous test runs.

Member: OK. This test is almost never run, then. Maybe let's remove it completely in another PR, and make another one to make sure S3 export works as expected. Maybe we can use the mockfs fixture defined in #4724 instead.

Collaborator (author): This issue seems related to getmoto/moto#4750, but neither bumping moto to 3.0.0 (the CI runs forever) nor pinning responses to 0.16 (the serialization fails) helps, so I agree we can disable these tests for now and replace them in another PR with the mockfs fixture.
2 changes: 1 addition & 1 deletion tests/test_dataset_dict.py
@@ -665,7 +665,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path):


@pytest.mark.skipif(
os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
) # TODO: find what's wrong with CircleCI / GitHub Actions
@require_s3
15 changes: 0 additions & 15 deletions tests/test_fingerprint.py
@@ -226,21 +226,6 @@ def globalvars_mock2_side_effect(func, *args, **kwargs):
self.assertEqual(hash1, hash2)


class TypeHintDumpTest(TestCase):
def test_dump_type_hint(self):
from typing import Union

t1 = Union[str, None] # this type is not picklable in python 3.6
# let's check that we can pickle it anyway using our pickler, even in 3.6
hash1 = md5(datasets.utils.py_utils.dumps(t1)).hexdigest()
t2 = Union[str] # this type is picklable in python 3.6
hash2 = md5(datasets.utils.py_utils.dumps(t2)).hexdigest()
t3 = Union[str, None]
hash3 = md5(datasets.utils.py_utils.dumps(t3)).hexdigest()
self.assertEqual(hash1, hash3)
self.assertNotEqual(hash1, hash2)


class HashingTest(TestCase):
def test_hash_simple(self):
hash1 = Hasher.hash("hello")