Skip to content

Commit

Permalink
Merge pull request #39 from enricogandini/pandas_output
Browse files Browse the repository at this point in the history
Pandas output
  • Loading branch information
EBjerrum committed Mar 18, 2024
2 parents ba3d190 + 0027c13 commit 077c022
Show file tree
Hide file tree
Showing 12 changed files with 5,184 additions and 25 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/run_pytests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ jobs:
python-version: ["3.10"]
include:
# test python version compatibility on linux only
# Versions are quoted so YAML keeps them as strings: a bare 3.10 is parsed
# as the float 3.1 and would request the wrong interpreter.
- os: ubuntu-latest
  python-version: "3.12"
- os: ubuntu-latest
  python-version: "3.11"
- os: ubuntu-latest
  python-version: "3.10"
- os: ubuntu-latest
  python-version: "3.9"
- os: ubuntu-latest
Expand Down
5,004 changes: 5,004 additions & 0 deletions notebooks/10_pipeline_pandas_output.ipynb

Large diffs are not rendered by default.

11 changes: 3 additions & 8 deletions scikit_mol/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from scikit_mol.core import check_transform_input
from scikit_mol.core import check_transform_input, feature_names_default_mol ,DEFAULT_MOL_COLUMN_NAME


class SmilesToMolTransformer(BaseEstimator, TransformerMixin):
Expand All @@ -15,12 +15,9 @@ def __init__(self, parallel: Union[bool, int] = False):
self.parallel = parallel
self.start_method = None #TODO implement handling of start_method

@feature_names_default_mol
def get_feature_names_out(self, input_features=None):
    """Return the names of the output column(s) holding the molecules.

    The names are produced by the ``feature_names_default_mol`` decorator,
    whose wrapper builds them from ``DEFAULT_MOL_COLUMN_NAME`` and does not
    call this body. The body is therefore only a placeholder; the former
    inline copy of the prefixing logic (with a hard-coded prefix) and the
    unreachable statement after it have been removed.
    """
    return input_features

def fit(self, X=None, y=None):
"""Included for scikit-learn compatibility, does nothing"""
Expand Down Expand Up @@ -53,9 +50,7 @@ def transform(self, X_smiles_list, y=None):
n_chunks = n_processes*2 if n_processes is not None else multiprocessing.cpu_count()*2 #TODO, tune the number of chunks per child process
with get_context(self.start_method).Pool(processes=n_processes) as pool:
x_chunks = np.array_split(X_smiles_list, n_chunks)
#x_chunks = [x.reshape(-1, 1) for x in x_chunks] Why the reshape? it doesn't exist on things like e.g. Pandas Arrays or Series
arrays = pool.map(self._transform, x_chunks) #is the helper function a safer way of handling the picklind and child process communication

arr = np.concatenate(arrays)
return arr

Expand Down
20 changes: 20 additions & 0 deletions scikit_mol/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@

import numpy as np
import pandas as pd
from packaging.version import Version

SKLEARN_VERSION_PANDAS_OUT = Version("1.2")

DEFAULT_MOL_COLUMN_NAME = "ROMol"


def _validate_transform_input(X):
"""Validate and adapt the input of the _transform method"""
Expand Down Expand Up @@ -40,4 +46,18 @@ def wrapper(obj, X):
# must be changed depending on the initial type of X, do it here.
return result

return wrapper

def feature_names_default_mol(method):
    """Decorator that replaces a method with the default mol feature naming.

    The decorated ``method`` only lends its metadata (via
    ``functools.wraps``); the wrapper never calls it.

    The wrapper returns a numpy array of names:

    * ``[DEFAULT_MOL_COLUMN_NAME]`` when ``input_features`` is ``None``;
    * otherwise one ``"<DEFAULT_MOL_COLUMN_NAME>_<name>"`` entry per element
      of ``input_features``.
    """
    @functools.wraps(method)
    def wrapper(obj, input_features=None):
        base = DEFAULT_MOL_COLUMN_NAME
        if input_features is None:
            return np.array([base])
        names = []
        for name in input_features:
            names.append(f'{base}_{name}')
        return np.array(names)

    return wrapper
17 changes: 16 additions & 1 deletion scikit_mol/fingerprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,26 @@ def _get_column_prefix(self) -> str:
def _get_n_digits_column_suffix(self) -> int:
return len(str(self.nBits))

def get_feature_names_out(self, input_features=None):
def get_display_feature_names_out(self, input_features=None):
    """Get equal-length feature names intended for display.

    Each name is ``"<prefix>_<index>"`` where the 1-based bit index is
    left-padded with zeros to the width given by
    ``_get_n_digits_column_suffix``, so all names have the same length.
    ``input_features`` is accepted but not used.
    """
    stem = self._get_column_prefix()
    width = self._get_n_digits_column_suffix()
    labels = []
    for bit in range(1, self.nBits + 1):
        padded = str(bit).zfill(width)
        labels.append(f"{stem}_{padded}")
    return np.array(labels)

def get_feature_names_out(self, input_features=None):
    """Get feature names for fingerprint transformers.

    Used by the scikit-learn set_output API to name the columns of the
    transformed dataframe. Names are ``"<prefix>_<index>"`` with a 1-based,
    unpadded bit index; ``input_features`` is accepted but not used.
    """
    stem = self._get_column_prefix()
    labels = []
    for bit in range(1, self.nBits + 1):
        labels.append(f"{stem}_{bit}")
    return np.array(labels)

@abstractmethod
def _mol2fp(self, mol):
"""Generate descriptor from mol
Expand Down
8 changes: 6 additions & 2 deletions scikit_mol/standardizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rdkit.rdBase import BlockLogs
import numpy as np

from scikit_mol.core import check_transform_input
from scikit_mol.core import check_transform_input, feature_names_default_mol


class Standardizer(BaseEstimator, TransformerMixin):
Expand Down Expand Up @@ -42,7 +42,11 @@ def _transform(self, X):

del block # Release logging block to previous state
return np.array(arr).reshape(-1,1)


@feature_names_default_mol
def get_feature_names_out(self, input_features=None):
    """Return the names of the output column(s) holding the molecules.

    The ``feature_names_default_mol`` decorator supplies the actual names;
    its wrapper does not call this body, which is only a placeholder.
    """
    return input_features

@check_transform_input
def transform(self, X, y=None):
if not self.parallel:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ install_requires =
numpy
pandas
scikit-learn
packaging

[options.packages.find]
exclude =
Expand Down
Binary file not shown.
78 changes: 73 additions & 5 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
import os
from pathlib import Path
import pytest
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from packaging.version import Version
import sklearn
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from scikit_mol.fingerprints import MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, \
TopologicalTorsionFingerprintTransformer, MorganFingerprintTransformer, SECFingerprintTransformer, \
MHFingerprintTransformer, AvalonFingerprintTransformer
from scikit_mol.descriptors import MolecularDescriptorTransformer
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.standardizer import Standardizer
from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT, DEFAULT_MOL_COLUMN_NAME

#TODO these should really go into the conftest.py, so that they are automatically imported in the tests

Expand All @@ -24,9 +37,18 @@ def smiles_list():
lambda x: x,
lambda x: np.array(x),
lambda x: np.array(x).reshape(-1, 1),
lambda x: pd.Series(x),
lambda x: pd.DataFrame({"hello": x}),
]
# Names given to the pandas containers built below; ``None`` exercises the
# unnamed Series / default-column DataFrame case.
_names_to_test = ["molecule", "mol", "smiles", DEFAULT_MOL_COLUMN_NAME, "hello", None]
for name in _names_to_test:
    # ``name=name`` binds the current loop value into each lambda, so every
    # creator keeps its own name instead of the last one of the loop.
    _CONTAINER_CREATORS.extend(
        (
            lambda x, name=name: pd.Series(x, name=name),
            lambda x, name=name: pd.DataFrame({name: x}) if name else pd.DataFrame(x),
        )
    )

@pytest.fixture(params=[container(_CANONICAL_SMILES_LIST) for container in _CONTAINER_CREATORS]
)
Expand Down Expand Up @@ -64,8 +86,54 @@ def chiral_mols_list(chiral_smiles_list):
def fingerprint(mols_list):
return rdMolDescriptors.GetHashedMorganFingerprint(mols_list[0],2,nBits=1000)

# Test data files are resolved relative to this file, inside its "data" directory.
_DIR_DATA = Path(__file__).parent / "data"
# CSV read by the SLC6A4_subset fixture.
_FILE_SLC6A4 = _DIR_DATA / "SLC6A4_active_excapedb_subset.csv"
# Gzipped CSV read by the SLC6A4_subset_with_cddd fixture (indexed by "Ambit_InchiKey").
_FILE_SLC6A4_WITH_CDDD = _DIR_DATA / "CDDD_SLC6A4_active_excapedb_subset.csv.gz"

@pytest.fixture
def SLC6A4_subset():
    """Load the SLC6A4 subset CSV (``_FILE_SLC6A4``) as a pandas DataFrame.

    The file is read once through the pathlib constant; the previous
    ``os.path``-based load of the same file, which left the second load and
    return unreachable, has been dropped.
    """
    return pd.read_csv(_FILE_SLC6A4)

@pytest.fixture
def SLC6A4_subset_with_cddd(SLC6A4_subset):
    """Return the SLC6A4 subset joined with the columns of the CDDD file.

    A copy of the subset is de-duplicated on ``Ambit_InchiKey`` and
    inner-merged with the CDDD table (indexed by ``Ambit_InchiKey``).
    ``validate="one_to_one"`` makes pandas raise if the key is not unique on
    both sides.
    """
    unique_rows = SLC6A4_subset.copy().drop_duplicates(subset="Ambit_InchiKey")
    cddd_table = pd.read_csv(_FILE_SLC6A4_WITH_CDDD, index_col="Ambit_InchiKey")
    return unique_rows.merge(
        cddd_table,
        left_on="Ambit_InchiKey",
        right_index=True,
        how="inner",
        validate="one_to_one",
    )

# Marker that skips a test when the installed scikit-learn is older than
# SKLEARN_VERSION_PANDAS_OUT.
skip_pandas_output_test = pytest.mark.skipif(Version(sklearn.__version__) < SKLEARN_VERSION_PANDAS_OUT, reason=f"requires scikit-learn {SKLEARN_VERSION_PANDAS_OUT} or higher")

# Featurizer classes used to parametrize the ``featurizer`` fixture.
_FEATURIZER_CLASSES = [
MACCSKeysFingerprintTransformer,
RDKitFingerprintTransformer,
AtomPairFingerprintTransformer,
TopologicalTorsionFingerprintTransformer,
MorganFingerprintTransformer,
SECFingerprintTransformer,
MHFingerprintTransformer,
AvalonFingerprintTransformer,
MolecularDescriptorTransformer,
]
@pytest.fixture(params=_FEATURIZER_CLASSES)
def featurizer(request):
    """Instantiate, with default arguments, the featurizer class of the current param."""
    featurizer_class = request.param
    return featurizer_class()

@pytest.fixture
def combined_transformer(featurizer):
    """Build a column transformer that featurizes SMILES and keeps CDDD columns.

    Columns matching ``"SMILES"`` go through SmilesToMolTransformer ->
    Standardizer -> ``featurizer``; columns matching ``^cddd_\\d+$`` go
    through a pass-through pipeline; all other columns are dropped.
    """
    smiles_branch = make_pipeline(SmilesToMolTransformer(), Standardizer(), featurizer)
    # A pipeline that just passes the input data, used to preserve the
    # CDDD features and hand them to downstream steps.
    passthrough_branch = make_pipeline(FunctionTransformer())
    smiles_columns = make_column_selector(pattern="SMILES")
    cddd_columns = make_column_selector(pattern=r"^cddd_\d+$")
    return make_column_transformer(
        (smiles_branch, smiles_columns),
        (passthrough_branch, cddd_columns),
        remainder="drop",
    )
8 changes: 6 additions & 2 deletions tests/test_desctransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors
import sklearn
from packaging.version import Version
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.descriptors import MolecularDescriptorTransformer
from fixtures import mols_list, smiles_list, mols_container, smiles_container
from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT
from fixtures import mols_list, smiles_list, mols_container, smiles_container, skip_pandas_output_test
from sklearn import clone
from sklearn.pipeline import Pipeline
import joblib



@pytest.fixture
def default_descriptor_transformer():
return MolecularDescriptorTransformer()
Expand Down Expand Up @@ -74,13 +76,15 @@ def test_descriptor_transformer_parallel(mols_list, default_descriptor_transform
assert(len(features2[0]) == len(Descriptors._descList))


@skip_pandas_output_test
def test_descriptor_transformer_pandas_output(mols_container, default_descriptor_transformer, selected_descriptor_transformer, pandas_output):
for transformer in [default_descriptor_transformer, selected_descriptor_transformer]:
features = transformer.transform(mols_container)
assert isinstance(features, pd.DataFrame)
assert features.shape[0] == len(mols_container)
assert features.columns.tolist() == transformer.selected_descriptors

@skip_pandas_output_test
def test_descriptor_transformer_pandas_output_pipeline(smiles_container, default_descriptor_transformer, pandas_output):
pipeline = Pipeline([("s2m", SmilesToMolTransformer()), ("desc", default_descriptor_transformer)])
features = pipeline.fit_transform(smiles_container)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_smilestomol.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import pytest
import numpy as np
import pandas as pd
from packaging.version import Version
from sklearn import clone
from rdkit import Chem
import sklearn
from scikit_mol.conversions import SmilesToMolTransformer
from fixtures import smiles_list, invalid_smiles_list, smiles_container
from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT, DEFAULT_MOL_COLUMN_NAME
from fixtures import smiles_list, invalid_smiles_list, smiles_container, skip_pandas_output_test


@pytest.fixture
Expand Down Expand Up @@ -39,8 +42,9 @@ def test_descriptor_transformer_parallel(smiles_container, smilestomol_transform
expected_smiles = smiles_container
assert all([ a == b for a, b in zip(expected_smiles, [Chem.MolToSmiles(mol) for mol in mol_list.flatten()])])

@skip_pandas_output_test
def test_pandas_output(smiles_container, smilestomol_transformer, pandas_output):
mols = smilestomol_transformer.transform(smiles_container)
assert isinstance(mols, pd.DataFrame)
assert mols.shape[0] == len(smiles_container)
assert mols.columns.tolist() == ["ROMol"]
assert mols.columns.tolist() == [DEFAULT_MOL_COLUMN_NAME]
48 changes: 43 additions & 5 deletions tests/test_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@

import pytest
import pandas as pd
from packaging.version import Version
import sklearn
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.fingerprints import MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, \
from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT
from scikit_mol.fingerprints import FpsTransformer, MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, \
TopologicalTorsionFingerprintTransformer, MorganFingerprintTransformer, SECFingerprintTransformer, \
MHFingerprintTransformer, AvalonFingerprintTransformer
from scikit_mol.descriptors import MolecularDescriptorTransformer


from fixtures import SLC6A4_subset
from fixtures import SLC6A4_subset, SLC6A4_subset_with_cddd, skip_pandas_output_test, mols_container, featurizer, combined_transformer

def test_transformer(SLC6A4_subset):
# load some toy data for quick testing on a small number of samples
Expand Down Expand Up @@ -61,6 +64,7 @@ def test_transformer(SLC6A4_subset):
assert len(failed_FP) == 0, f"the following FP have failed {failed_FP}"


@skip_pandas_output_test
def test_transformer_pandas_output(SLC6A4_subset, pandas_output):
# load some toy data for quick testing on a small number of samples
X_smiles = SLC6A4_subset.SMILES
Expand Down Expand Up @@ -103,8 +107,42 @@ def test_transformer_pandas_output(SLC6A4_subset, pandas_output):
# overall result
assert len(failed_FP) == 0, f"the following FP have failed pandas transformation {failed_FP}"



@skip_pandas_output_test
def test_pandas_out_same_values(featurizer, mols_container):
    """Check that pandas output carries the same values as the default output.

    Two clones of ``featurizer`` are configured with ``transform="default"``
    and ``transform="pandas"`` and run on the same molecules. Types and
    shapes must match, and the values must be equal. For
    MolecularDescriptorTransformer, NaNs must sit at the same positions and
    all remaining values must be equal.
    """
    featurizer_default = sklearn.base.clone(featurizer)
    featurizer_default.set_output(transform="default")
    featurizer_pandas = sklearn.base.clone(featurizer)
    featurizer_pandas.set_output(transform="pandas")
    result_default = featurizer_default.fit_transform(mols_container)
    result_pandas = featurizer_pandas.fit_transform(mols_container)
    assert isinstance(result_default, np.ndarray)
    assert isinstance(result_pandas, pd.DataFrame)
    assert result_default.shape == result_pandas.shape
    values_pandas = result_pandas.values
    featurizer_class_with_nan = MolecularDescriptorTransformer
    if isinstance(featurizer, featurizer_class_with_nan):
        nan_default = pd.isna(result_default)
        assert (nan_default == pd.isna(values_pandas)).all(), "NaN values are not in the same positions in the default and pandas output"
        # NaN never compares equal to itself, so compare the remaining
        # positions only. Previously the NaN-replaced arrays were computed
        # but never compared, leaving descriptor values unchecked.
        comparable = ~nan_default
        assert (result_default[comparable] == values_pandas[comparable]).all(), "Non-NaN values differ between the default and pandas output"
    else:
        assert (result_default == values_pandas).all()

@skip_pandas_output_test
def test_combined_transformer_pandas_out(combined_transformer, SLC6A4_subset_with_cddd, pandas_output):
    """Check the shape of the DataFrame produced by the combined transformer.

    The result must have one row per input row and as many columns as the
    CDDD columns of the input plus the features of the scikit-mol
    featurizer (``nBits`` for fingerprints, ``len(desc_list)`` for
    descriptors).
    """
    frame = SLC6A4_subset_with_cddd
    result = combined_transformer.fit_transform(frame)
    assert isinstance(result, pd.DataFrame)
    assert result.shape[0] == frame.shape[0]

    n_cddd_features = frame.columns.str.match(r"^cddd_\d+$").sum()

    # Last step of the first sub-pipeline, looked up under "pipeline-1".
    featurizer_skmol = combined_transformer.named_transformers_["pipeline-1"][-1]
    if isinstance(featurizer_skmol, FpsTransformer):
        n_skmol_features = featurizer_skmol.nBits
    elif isinstance(featurizer_skmol, MolecularDescriptorTransformer):
        n_skmol_features = len(featurizer_skmol.desc_list)
    else:
        raise ValueError(f"Unexpected featurizer type {type(featurizer_skmol)}")

    assert result.shape[1] == n_cddd_features + n_skmol_features



0 comments on commit 077c022

Please sign in to comment.