diff --git a/doc/conftest.py b/doc/conftest.py index d0f49ac087477..719c46fccf2d2 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -126,8 +126,6 @@ def pytest_runtest_setup(item): setup_working_with_text_data() elif fname.endswith("modules/compose.rst") or is_index: setup_compose() - elif IS_PYPY and fname.endswith("modules/feature_extraction.rst"): - raise SkipTest("FeatureHasher is not compatible with PyPy") elif fname.endswith("datasets/loading_other_datasets.rst"): setup_loading_other_datasets() elif fname.endswith("modules/impute.rst"): diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index cccf7546d6f19..4555c84a12619 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -474,6 +474,9 @@ Changelog :mod:`sklearn.feature_extraction` ................................. +- |Feature| :class:`feature_extraction.FeatureHasher` now supports PyPy. + :pr:`23023` by `Thomas Fan`_. + - |Fix| :class:`feature_extraction.FeatureHasher` now validates input parameters in `transform` instead of `__init__`. :pr:`21573` by :user:`Hannah Bohle ` and :user:`Maren Westermann `. diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 821b9c2044dbc..27ac720cbfe2e 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -118,17 +118,8 @@ def pytest_collection_modifyitems(config, items): dataset_fetchers[name]() for item in items: - # FeatureHasher is not compatible with PyPy - if ( - item.name.endswith(("_hash.FeatureHasher", "text.HashingVectorizer")) - and platform.python_implementation() == "PyPy" - ): - marker = pytest.mark.skip( - reason="FeatureHasher is not compatible with PyPy" - ) - item.add_marker(marker) # Known failure on with GradientBoostingClassifier on ARM64 - elif ( + if ( item.name.endswith("GradientBoostingClassifier") and platform.machine() == "aarch64" ): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index d6c4c95e540b5..dcb19d9acded8 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -6,19 +6,8 @@ import numpy as np import scipy.sparse as sp -from ..utils import IS_PYPY from ..base import BaseEstimator, TransformerMixin - -if not IS_PYPY: - from ._hashing_fast import transform as _hashing_transform -else: - - def _hashing_transform(*args, **kwargs): - raise NotImplementedError( - "FeatureHasher is not compatible with PyPy (see " - "https://github.com/scikit-learn/scikit-learn/issues/11540 " - "for the status updates)." - ) +from ._hashing_fast import transform as _hashing_transform def _iteritems(d): diff --git a/sklearn/feature_extraction/_hashing_fast.pyx b/sklearn/feature_extraction/_hashing_fast.pyx index 722538fe166d3..4de6df5975790 100644 --- a/sklearn/feature_extraction/_hashing_fast.pyx +++ b/sklearn/feature_extraction/_hashing_fast.pyx @@ -3,13 +3,15 @@ import sys import array -from cpython cimport array cimport cython from libc.stdlib cimport abs +from libcpp.vector cimport vector + cimport numpy as np import numpy as np - +from ..utils._typedefs cimport INT32TYPE_t, INT64TYPE_t from ..utils.murmurhash cimport murmurhash3_bytes_s32 +from ..utils._vector_sentinel cimport vector_to_nd_array np.import_array() @@ -25,19 +27,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype, For constructing a scipy.sparse.csr_matrix. """ - assert n_features > 0 - - cdef np.int32_t h + cdef INT32TYPE_t h cdef double value - cdef array.array indices - cdef array.array indptr - indices = array.array("i") - indices_array_dtype = "q" - indices_np_dtype = np.longlong - - - indptr = array.array(indices_array_dtype, [0]) + cdef vector[INT32TYPE_t] indices + cdef vector[INT64TYPE_t] indptr + indptr.push_back(0) # Since Python array does not understand Numpy dtypes, we grow the indices # and values arrays ourselves. Use a Py_ssize_t capacity for safety. @@ -65,13 +60,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype, h = murmurhash3_bytes_s32(f, seed) - array.resize_smart(indices, len(indices) + 1) if h == - 2147483648: # abs(-2**31) is undefined behavior because h is a `np.int32` # The following is defined such that it is equal to: abs(-2**31) % n_features - indices[len(indices) - 1] = (2147483647 - (n_features - 1)) % n_features + indices.push_back((2147483647 - (n_features - 1)) % n_features) else: - indices[len(indices) - 1] = abs(h) % n_features + indices.push_back(abs(h) % n_features) # improve inner product preservation in the hashed space if alternate_sign: value *= (h >= 0) * 2 - 1 @@ -84,16 +78,15 @@ def transform(raw_X, Py_ssize_t n_features, dtype, # references to the arrays due to Cython's error checking values = np.resize(values, capacity) - array.resize_smart(indptr, len(indptr) + 1) - indptr[len(indptr) - 1] = size + indptr.push_back(size) - indices_a = np.frombuffer(indices, dtype=np.int32) - indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype) + indicies_array = vector_to_nd_array(&indices) + indptr_array = vector_to_nd_array(&indptr) - if indptr[len(indptr) - 1] > np.iinfo(np.int32).max: # = 2**31 - 1 + if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max: # = 2**31 - 1 # both indices and indptr have the same dtype in CSR arrays - indices_a = indices_a.astype(np.int64, copy=False) + indicies_array = indicies_array.astype(np.int64, copy=False) else: - indptr_a = indptr_a.astype(np.int32, copy=False) + indptr_array = indptr_array.astype(np.int32, copy=False) - return (indices_a, indptr_a, values[:size]) + return (indicies_array, indptr_array, values[:size]) diff --git a/sklearn/feature_extraction/setup.py b/sklearn/feature_extraction/setup.py index c475e9d84f13f..a7f2ff0f9dcee 100644 --- a/sklearn/feature_extraction/setup.py +++ b/sklearn/feature_extraction/setup.py @@ -1,5 +1,4 @@ import os -import platform def configuration(parent_package="", top_path=None): @@ -11,13 +10,13 @@ def configuration(parent_package="", top_path=None): if os.name == "posix": libraries.append("m") - if platform.python_implementation() != "PyPy": - config.add_extension( - "_hashing_fast", - sources=["_hashing_fast.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) + config.add_extension( + "_hashing_fast", + sources=["_hashing_fast.pyx"], + include_dirs=[numpy.get_include()], + language="c++", + libraries=libraries, + ) config.add_subpackage("tests") return config diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 7eff8d1ebf49b..79ec2922e16d8 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -3,9 +3,7 @@ import pytest from sklearn.feature_extraction import FeatureHasher -from sklearn.utils._testing import fails_if_pypy - -pytestmark = fails_if_pypy +from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform def test_feature_hasher_dicts(): @@ -47,9 +45,6 @@ def test_feature_hasher_strings(): def test_hashing_transform_seed(): # check the influence of the seed when computing the hashes - # import is here to avoid importing on pypy - from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform - raw_X = [ ["foo", "bar", "baz", "foo".encode("ascii")], ["bar".encode("ascii"), "baz", "quux"], diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 83ff96428a257..aa056e92b3d12 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1210,16 +1210,6 @@ def is_abstract(c): classes = [ (name, est_cls) for name, est_cls in classes if not name.startswith("_") ] - - # TODO: Remove when FeatureHasher is implemented in PYPY - # Skips FeatureHasher for PYPY - if IS_PYPY and "feature_extraction" in modname: - classes = [ - (name, est_cls) - for name, est_cls in classes - if name == "FeatureHasher" - ] - all_classes.extend(classes) all_classes = set(all_classes) diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index c98a741efe25a..05b3f4e189f9f 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -7,7 +7,11 @@ ctypedef np.float64_t DTYPE_t # WARNING: should match DTYPE in typedefs.pyx cdef enum: DTYPECODE = np.NPY_FLOAT64 ITYPECODE = np.NPY_INTP + INT32TYPECODE = np.NPY_INT32 + INT64TYPECODE = np.NPY_INT64 # Index/integer type. # WARNING: ITYPE_t must be a signed integer type or you will have a bad time! ctypedef np.intp_t ITYPE_t # WARNING: should match ITYPE in typedefs.pyx +ctypedef np.int32_t INT32TYPE_t # WARNING: should match INT32TYPE in typedefs.pyx +ctypedef np.int64_t INT64TYPE_t # WARNING: should match INT32TYPE in typedefs.pyx diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 789afb4997dd1..96e0fcb38337c 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -14,6 +14,8 @@ np.import_array() #cdef ITYPE_t[:] idummy_view = &idummy #ITYPE = np.asarray(idummy_view).dtype ITYPE = np.intp # WARNING: this should match ITYPE_t in typedefs.pxd +INT32TYPE = np.int32 # WARNING: should match INT32TYPE_t in typedefs.pyx +INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd #cdef DTYPE_t ddummy #cdef DTYPE_t[:] ddummy_view = &ddummy diff --git a/sklearn/utils/_vector_sentinel.pxd b/sklearn/utils/_vector_sentinel.pxd index 61c7200fdf7f5..5fa0f6ad8d00a 100644 --- a/sklearn/utils/_vector_sentinel.pxd +++ b/sklearn/utils/_vector_sentinel.pxd @@ -1,10 +1,12 @@ cimport numpy as np from libcpp.vector cimport vector -from ..utils._typedefs cimport ITYPE_t, DTYPE_t +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, INT32TYPE_t, INT64TYPE_t ctypedef fused vector_typed: vector[DTYPE_t] vector[ITYPE_t] + vector[INT32TYPE_t] + vector[INT64TYPE_t] cdef np.ndarray vector_to_nd_array(vector_typed * vect_ptr) diff --git a/sklearn/utils/_vector_sentinel.pyx b/sklearn/utils/_vector_sentinel.pyx index c742ad6c9d9a9..0938ada0f56c1 100644 --- a/sklearn/utils/_vector_sentinel.pyx +++ b/sklearn/utils/_vector_sentinel.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference as deref from cpython.ref cimport Py_INCREF cimport numpy as np -from ._typedefs cimport DTYPECODE, ITYPECODE +from ._typedefs cimport DTYPECODE, ITYPECODE, INT32TYPECODE, INT64TYPECODE np.import_array() @@ -10,6 +10,10 @@ np.import_array() cdef StdVectorSentinel _create_sentinel(vector_typed * vect_ptr): if vector_typed is vector[DTYPE_t]: return StdVectorSentinelFloat64.create_for(vect_ptr) + elif vector_typed is vector[INT32TYPE_t]: + return StdVectorSentinelInt32.create_for(vect_ptr) + elif vector_typed is vector[INT64TYPE_t]: + return StdVectorSentinelInt64.create_for(vect_ptr) else: return StdVectorSentinelIntP.create_for(vect_ptr) @@ -64,6 +68,42 @@ cdef class StdVectorSentinelIntP(StdVectorSentinel): return ITYPECODE +cdef class StdVectorSentinelInt32(StdVectorSentinel): + cdef vector[INT32TYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[INT32TYPE_t] * vect_ptr): + # This initializes the object directly without calling __init__ + # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa + cdef StdVectorSentinelInt32 sentinel = StdVectorSentinelInt32.__new__(StdVectorSentinelInt32) + sentinel.vec.swap(deref(vect_ptr)) + return sentinel + + cdef void* get_data(self): + return self.vec.data() + + cdef int get_typenum(self): + return INT32TYPECODE + + +cdef class StdVectorSentinelInt64(StdVectorSentinel): + cdef vector[INT64TYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[INT64TYPE_t] * vect_ptr): + # This initializes the object directly without calling __init__ + # See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa + cdef StdVectorSentinelInt64 sentinel = StdVectorSentinelInt64.__new__(StdVectorSentinelInt64) + sentinel.vec.swap(deref(vect_ptr)) + return sentinel + + cdef void* get_data(self): + return self.vec.data() + + cdef int get_typenum(self): + return INT64TYPECODE + + cdef np.ndarray vector_to_nd_array(vector_typed * vect_ptr): cdef: np.npy_intp size = deref(vect_ptr).size()