Drop Python 3.6 support (#4460)

* Remove python 3.6 code
* Update requirements
* Style
* Update audio gh action
* Benchmarks fix attempt #1
* Benchmarks fix attempt no.2
* Use newer image
* Remove backticks
* Add suggested command to benchmark action
* Avoid some FutureWarnings and DeprecationWarnings
* Disable test
* Remove 3.6 pickling test
* CI test
* Use python 3.7 in ubuntu-latest
* Disable s3 test on Linux
* Remove weird json file
* Remove cloudpickle stuff
* Use lower torchaudio version
* Try to fix s3 errors
* Another attempt
* Disable test
huggingface · Jul 26, 2022 · 75e6b74 · 75e6b74 · github-actions · Jul 26, 2022
1 parent 10b1355
commit 75e6b74
Show file tree

Hide file tree

Showing 15 changed files with 55 additions and 168 deletions.
diff --git a/.github/hub/update_hub_repositories.py b/.github/hub/update_hub_repositories.py
@@ -1,4 +1,3 @@
-import base64
 import distutils.dir_util
 import logging
 import os

diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
@@ -3,13 +3,16 @@ on: [push]
 jobs:
   run:
     runs-on: [ubuntu-latest]
-    container: docker://dvcorg/cml-py3:latest
+    container: docker://dvcorg/cml:latest
     steps:
       - uses: actions/checkout@v2
       - name: cml_run
         env:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
         run: |
+          # See https://github.com/actions/checkout/issues/760
+          git config --global --add safe.directory /__w/datasets/datasets
+
           # Your ML workflow goes here
 
           pip install --upgrade pip

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.6"
+          python-version: "3.7"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -49,21 +49,15 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-      - name: Set up Python 3.6
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.6
       - name: Set up Python 3.7
-        if: ${{ matrix.os == 'windows-latest' }}
         uses: actions/setup-python@v4
         with:
           python-version: 3.7
       - name: Upgrade pip
         run: python -m pip install --upgrade pip
       - name: Pin setuptools-scm
         if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2"
+        run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2"
       - name: Install dependencies
         run: |
           pip install .[tests]

diff --git a/Makefile b/Makefile
@@ -3,14 +3,14 @@
 # Check that source code meets quality standards
 
 quality:
-	black --check --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
+	black --check --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
 	isort --check-only tests src benchmarks datasets/**/*.py metrics
 	flake8 tests src benchmarks datasets/**/*.py metrics
 
 # Format source code automatically
 
 style:
-	black --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics
+	black --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics
 	isort tests src benchmarks datasets/**/*.py metrics
 
 # Run tests for the library

diff --git a/additional-tests-requirements.txt b/additional-tests-requirements.txt
@@ -1,4 +1,4 @@
-unbabel-comet>=1.0.0;python_version>'3.6'
+unbabel-comet>=1.0.0
 git+https://github.com/google-research/bleurt.git
 git+https://github.com/ns-moosavi/coval.git
 git+https://github.com/hendrycks/math.git
diff --git a/docs/source/installation.md b/docs/source/installation.md
@@ -1,6 +1,6 @@
 # Installation
 
-Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.6+**.
+Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.7+**.
 
 <Tip>
 

diff --git a/setup.py b/setup.py
@@ -55,7 +55,6 @@
    Then push the change with a message 'set dev version'
 """
 
-import os
 
 from setuptools import find_packages, setup
 
@@ -74,8 +73,6 @@
     "requests>=2.19.0",
     # progress bars in download and scripts
     "tqdm>=4.62.1",
-    # dataclasses for Python versions that don't have it
-    "dataclasses;python_version<'3.7'",
     # for fast hashing
     "xxhash",
     # for better multiprocessing
@@ -105,7 +102,7 @@
 BENCHMARKS_REQUIRE = [
     "numpy==1.18.5",
     "tensorflow==2.3.0",
-    "torch==1.6.0",
+    "torch==1.7.1",
     "transformers==3.0.2",
 ]
 
@@ -128,7 +125,7 @@
     "s3fs>=2021.11.1",  # aligned with fsspec[http]>=2021.11.1
     "tensorflow>=2.3,!=2.6.0,!=2.6.1",
     "torch",
-    "torchaudio",
+    "torchaudio<0.12.0",
     "soundfile",
     "transformers",
     # datasets dependencies
@@ -165,8 +162,6 @@
     "texttable>=1.6.3",
     "Werkzeug>=1.0.1",
     "six~=1.15.0",
-    # metadata validation
-    "importlib_resources;python_version<'3.7'",
 ]
 
 TESTS_REQUIRE.extend(VISION_REQURE)
@@ -214,6 +209,7 @@
     packages=find_packages("src"),
     package_data={"datasets": ["py.typed", "scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"]},
     entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
+    python_requires=">=3.7.0",
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
     classifiers=[
@@ -224,7 +220,6 @@
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",

diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
@@ -19,10 +19,17 @@
 
 __version__ = "2.4.1.dev0"
 
+import platform
+
 import pyarrow
 from packaging import version
 
 
+if version.parse(platform.python_version()) < version.parse("3.7"):
+    raise ImportWarning(
+        "To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition."
+    )
+
 if version.parse(pyarrow.__version__).major < 6:
     raise ImportWarning(
         "To use `datasets`, the module `pyarrow>=6.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n"
@@ -31,6 +38,7 @@
 
 SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
 
+del platform
 del pyarrow
 del version
 

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
@@ -824,7 +824,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray,
     def take(
         self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None
     ) -> "PandasArrayExtensionArray":
-        indices: np.ndarray = np.asarray(indices, dtype=np.int)
+        indices: np.ndarray = np.asarray(indices, dtype=int)
         if allow_fill:
             fill_value = (
                 self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type)

diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
@@ -22,17 +22,15 @@
 import functools
 import itertools
 import os
-import pickle
 import re
-import sys
 import types
 from contextlib import contextmanager
 from dataclasses import fields, is_dataclass
 from io import BytesIO as StringIO
 from multiprocessing import Pool, RLock
 from shutil import disk_usage
 from types import CodeType, FunctionType
-from typing import Callable, ClassVar, Dict, Generic, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
 import dill
@@ -552,19 +550,6 @@ class Pickler(dill.Pickler):
 
     dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy())
 
-    def save_global(self, obj, name=None):
-        if sys.version_info[:2] < (3, 7) and _CloudPickleTypeHintFix._is_parametrized_type_hint(
-            obj
-        ):  # noqa  # pragma: no branch
-            # Parametrized typing constructs in Python < 3.7 are not compatible
-            # with type checks and ``isinstance`` semantics. For this reason,
-            # it is easier to detect them using a duck-typing-based check
-            # (``_is_parametrized_type_hint``) than to populate the Pickler's
-            # dispatch with type-specific savers.
-            _CloudPickleTypeHintFix._save_parametrized_type_hint(self, obj)
-        else:
-            dill.Pickler.save_global(self, obj, name=name)
-
     def memoize(self, obj):
         # don't memoize strings since two identical strings can have different python ids
         if type(obj) != str:
@@ -610,47 +595,6 @@ def proxy(func):
     return proxy
 
 
-class _CloudPickleTypeHintFix:
-    """
-    Type hints can't be properly pickled in python < 3.7
-    CloudPickle provided a way to make it work in older versions.
-    This class provide utilities to fix pickling of type hints in older versions.
-    from https://github.com/cloudpipe/cloudpickle/pull/318/files
-    """
-
-    def _is_parametrized_type_hint(obj):
-        # This is very cheap but might generate false positives.
-        origin = getattr(obj, "__origin__", None)  # typing Constructs
-        values = getattr(obj, "__values__", None)  # typing_extensions.Literal
-        type_ = getattr(obj, "__type__", None)  # typing_extensions.Final
-        return origin is not None or values is not None or type_ is not None
-
-    def _create_parametrized_type_hint(origin, args):
-        return origin[args]
-
-    def _save_parametrized_type_hint(pickler, obj):
-        # The distorted type check sematic for typing construct becomes:
-        # ``type(obj) is type(TypeHint)``, which means "obj is a
-        # parametrized TypeHint"
-        if type(obj) is type(Literal):  # pragma: no branch
-            initargs = (Literal, obj.__values__)
-        elif type(obj) is type(Final):  # pragma: no branch
-            initargs = (Final, obj.__type__)
-        elif type(obj) is type(ClassVar):
-            initargs = (ClassVar, obj.__type__)
-        elif type(obj) in [type(Union), type(Tuple), type(Generic)]:
-            initargs = (obj.__origin__, obj.__args__)
-        elif type(obj) is type(Callable):
-            args = obj.__args__
-            if args[0] is Ellipsis:
-                initargs = (obj.__origin__, args)
-            else:
-                initargs = (obj.__origin__, (list(args[:-1]), args[-1]))
-        else:  # pragma: no cover
-            raise pickle.PicklingError(f"Datasets pickle Error: Unknown type {type(obj)}")
-        pickler.save_reduce(_CloudPickleTypeHintFix._create_parametrized_type_hint, initargs, obj=obj)
-
-
 @pklregister(CodeType)
 def _save_code(pickler, obj):
     """

diff --git a/tests/commands/test_dummy_data.py b/tests/commands/test_dummy_data.py
@@ -1,45 +1,24 @@
 import os
 from collections import namedtuple
-from dataclasses import dataclass
 
-from packaging import version
-
-from datasets import config
 from datasets.commands.dummy_data import DummyDataCommand
 
 
-if config.PY_VERSION >= version.parse("3.7"):
-    DummyDataCommandArgs = namedtuple(
-        "DummyDataCommandArgs",
-        [
-            "path_to_dataset",
-            "auto_generate",
-            "n_lines",
-            "json_field",
-            "xml_tag",
-            "match_text_files",
-            "keep_uncompressed",
-            "cache_dir",
-            "encoding",
-        ],
-        defaults=[False, 5, None, None, None, False, None, None],
-    )
-else:
-
-    @dataclass
-    class DummyDataCommandArgs:
-        path_to_dataset: str
-        auto_generate: bool = False
-        n_lines: int = 5
-        json_field: str = None
-        xml_tag: str = None
-        match_text_files: str = None
-        keep_uncompressed: bool = False
-        cache_dir: str = None
-        encoding: str = None
-
-        def __iter__(self):
-            return iter(self.__dict__.values())
+DummyDataCommandArgs = namedtuple(
+    "DummyDataCommandArgs",
+    [
+        "path_to_dataset",
+        "auto_generate",
+        "n_lines",
+        "json_field",
+        "xml_tag",
+        "match_text_files",
+        "keep_uncompressed",
+        "cache_dir",
+        "encoding",
+    ],
+    defaults=[False, 5, None, None, None, False, None, None],
+)
 
 
 class MockDummyDataCommand(DummyDataCommand):

diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py
@@ -1,46 +1,26 @@
 import json
 import os
 from collections import namedtuple
-from dataclasses import dataclass
-
-from packaging import version
 
 from datasets import config
 from datasets.commands.test import TestCommand
 
 
-if config.PY_VERSION >= version.parse("3.7"):
-    _TestCommandArgs = namedtuple(
-        "_TestCommandArgs",
-        [
-            "dataset",
-            "name",
-            "cache_dir",
-            "data_dir",
-            "all_configs",
-            "save_infos",
-            "ignore_verifications",
-            "force_redownload",
-            "clear_cache",
-        ],
-        defaults=[None, None, None, False, False, False, False, False],
-    )
-else:
-
-    @dataclass
-    class _TestCommandArgs:
-        dataset: str
-        name: str = None
-        cache_dir: str = None
-        data_dir: str = None
-        all_configs: bool = False
-        save_infos: bool = False
-        ignore_verifications: bool = False
-        force_redownload: bool = False
-        clear_cache: bool = False
-
-        def __iter__(self):
-            return iter(self.__dict__.values())
+_TestCommandArgs = namedtuple(
+    "_TestCommandArgs",
+    [
+        "dataset",
+        "name",
+        "cache_dir",
+        "data_dir",
+        "all_configs",
+        "save_infos",
+        "ignore_verifications",
+        "force_redownload",
+        "clear_cache",
+    ],
+    defaults=[None, None, None, False, False, False, False, False],
+)
 
 
 def test_test_command(dataset_loading_script_dir):

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
@@ -3118,7 +3118,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param
 
 
 @pytest.mark.skipif(
-    os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
+    os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"),
     reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"',
 )  # TODO: find what's wrong with CircleCI / GitHub Actions
 @require_s3