
[v2]: CI Overhaul (and make v2 actually _pass_ the CI) #714

Merged · 18 commits · Apr 1, 2024
4 changes: 2 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
@@ -4,5 +4,5 @@ max-line-length = 100
max-complexity = 18
per-file-ignores =
__init__.py: F401
chemprop/v2/nn/predictors.py: F405
chemprop/v2/metrics.py: F405
chemprop/nn/predictors.py: F405
chemprop/nn/metrics.py: F405
114 changes: 114 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,114 @@
# ci.yml
#
# Continuous Integration for Chemprop - checks build, code formatting, and runs tests for all
# proposed changes and on a regular schedule
#
# Note: this file contains extensive inline documentation to aid with knowledge transfer.

name: Continuous Integration

on:
# run on pushes/pull requests to/against main or v2/dev
push:
branches: [main]
pull_request:
branches: [main, v2/dev]
# run this in the morning on weekdays to catch dependency issues
schedule:
- cron: "0 8 * * 1-5"
# allow manual runs
workflow_dispatch:

# cancel previously running tests if new commits are made
# https://docs.github.com/en/actions/examples/using-concurrency-expressions-and-a-test-matrix
concurrency:
group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
build:
name: Check Build
runs-on: ubuntu-latest
steps:
# clone the repo, attempt to build
- uses: actions/checkout@v4
- run: python -m pip install build
- run: python -m build .
Contributor:
I appreciate all the inline documentation because I'm not familiar with most of this. What is the difference between building the repo and running tests? I would think that you need to build the repo before you can run tests, but I don't see a build step in "Execute Tests".

Member Author:
The 'build' step refers to the literal python build command. If it fails we couldn't build the Python package at the end or install Chemprop to run the tests.

The test running doesn't use build but just installs the package. We could just go straight to running the tests, but it's kinda nice to have one expected point of failure for each step in the CI (like, the build only fails in one place, the tests only fail in one place, etc).


lint:
name: Check Formatting
needs: build
runs-on: ubuntu-latest
steps:
# clone the repo, run black and flake8 on it
- uses: actions/checkout@v4
- run: python -m pip install black==23.* flake8
- run: black --check .
- run: flake8 .

test:
name: Execute Tests
needs: lint
runs-on: ${{ matrix.os }}
defaults:
run:
# run with a login shell (so that the conda environment is activated)
# and echo the commands we run as we do them (for debugging purposes)
shell: bash -el {0}
strategy:
# if one platform/python version fails, continue testing the others
fail-fast: false
matrix:
# test on all platforms with both supported versions of Python
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: [3.11, 3.12]
steps:
- uses: actions/checkout@v4
# use a version of the conda virtual environment manager to set up an
# isolated environment with the Python version we want
- uses: mamba-org/setup-micromamba@main
with:
environment-name: temp
condarc: |
channels:
- defaults
- conda-forge
channel_priority: flexible
create-args: |
python=${{ matrix.python-version }}
- name: Install dependencies
shell: bash -l {0}
run: |
python -m pip install nbmake
python -m pip install ".[dev,docs,test]"
- name: Test with pytest
shell: bash -l {0}
run: |
pytest -v tests
- name: Test notebooks
shell: bash -l {0}
run: |
pytest --nbmake examples/training.ipynb
pytest --nbmake examples/predicting.ipynb
pytest --nbmake examples/convert_v1_to_v2.ipynb
pytest --nbmake examples/training_regression_multicomponent.ipynb
pytest --nbmake examples/predicting_regression_multicomponent.ipynb

# GitHub allows you to set checks as required before PRs can be merged, which is annoying to do
# for matrix jobs like the one above which have six actual jobs. Instead we have a summary job
# like this, which we mark as required.
# Copied in part from:
# https://github.com/orgs/community/discussions/26822#discussioncomment-3305794
ci-report-status:
name: report CI status
needs: test
runs-on: ubuntu-latest
steps:
- run: |
result="${{ needs.test.result }}"
if test $result == "success"; then
exit 0
else
exit 1
fi

62 changes: 0 additions & 62 deletions .github/workflows/tests.yml

This file was deleted.

5 changes: 0 additions & 5 deletions .readthedocs.yml
@@ -11,11 +11,6 @@ build:
tools:
python: "3.11"
jobs:
# We need to install torch and torch-scatter separately as a pre-install step
# due to an issue with torch-scatter
pre_install:
- python -m pip install --upgrade --no-cache-dir torch
- python -m pip install --upgrade --no-cache-dir torch-scatter
post_install:
- python -m pip install --upgrade --upgrade-strategy only-if-needed --no-cache-dir ".[docs]"

6 changes: 2 additions & 4 deletions Dockerfile
@@ -36,13 +36,11 @@ RUN conda create --name chemprop_env python=3.11* && \
# in a Dockerfile build script)
SHELL ["conda", "run", "--no-capture-output", "-n", "chemprop_env", "/bin/bash", "-c"]

# Follow the installation instructions (but with fixed versions, for stability of this image) then clear the cache
# Follow the installation instructions then clear the cache
ADD chemprop chemprop
ENV PYTHONPATH /opt/chemprop
ADD LICENSE.txt pyproject.toml README.md .
RUN python -m pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cpu && \
python -m pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cpu.html && \
python -m pip install . && \
RUN python -m pip install . && \
python -m pip cache purge

# when running this image, open an interactive bash terminal inside the conda environment
2 changes: 2 additions & 0 deletions chemprop/__init__.py
@@ -1,3 +1,5 @@
from . import data, featurizers, models, nn, utils, conf, exceptions, schedulers

__all__ = ["data", "featurizers", "models", "nn", "utils", "conf", "exceptions", "schedulers"]

__version__ = "2.0.0-rc.1"
5 changes: 3 additions & 2 deletions chemprop/cli/common.py
@@ -34,8 +34,9 @@ def add_common_args(parser: ArgumentParser) -> ArgumentParser:
"-n",
"--num-workers",
type=int,
default=8,
help="Number of workers for the parallel data loading (0 means sequential).",
default=0,
help="""Number of workers for parallel data loading (0 means sequential).
Warning: setting num_workers>0 can cause hangs on Windows and MacOS.""",
)
dataloader_args.add_argument("-b", "--batch-size", type=int, default=64, help="Batch size.")

1 change: 1 addition & 0 deletions chemprop/cli/main.py
@@ -49,6 +49,7 @@ def construct_parser():

return parser


def main():
parser = construct_parser()
args = parser.parse_args()
3 changes: 1 addition & 2 deletions chemprop/cli/train.py
@@ -5,7 +5,6 @@
import json
from copy import deepcopy
import pandas as pd
from rdkit import Chem

from lightning import pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger
@@ -439,7 +438,7 @@ def add_train_args(parser: ArgumentParser) -> ArgumentParser:
default=0,
help="Random seed to use when splitting data into train/val/test sets. When :code:`num_folds > 1`, the first fold uses this seed and all subsequent folds add 1 to the seed. Also used for shuffling data in :code:`MolGraphDataLoader` when :code:`shuffle` is True.",
)

parser.add_argument(
"--pytorch-seed",
type=int,
21 changes: 20 additions & 1 deletion chemprop/cli/utils/__init__.py
@@ -2,4 +2,23 @@
from .actions import LookupAction
from .command import Subcommand
from .parsing import build_data_from_files, make_datapoints, make_dataset, get_column_names
from .utils import *
from .utils import pop_attr, _pop_attr, _pop_attr_d, validate_loss_function

__all__ = [
"bounded",
"LookupAction",
"Subcommand",
"build_data_from_files",
"make_datapoints",
"make_dataset",
"get_column_names",
"actions",
"args",
"command",
"parsing",
"utils",
"pop_attr",
"_pop_attr",
"_pop_attr_d",
"validate_loss_function",
]
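This commit replaces the wildcard `from .utils import *` with explicit imports plus an `__all__` list. As a quick standalone illustration of why that matters (the module and function names here are hypothetical, not Chemprop's), `__all__` controls exactly which names a star-import exposes, so downstream `import *` users can no longer pick up private helpers by accident:

```python
# Sketch: build a throwaway module in memory and show that `import *`
# only pulls in the names listed in __all__ (hypothetical names).
import sys
import types

mod = types.ModuleType("demo_utils")
exec(
    "def pop_attr(o, name): return o.__dict__.pop(name)\n"
    "def helper(): pass\n"
    "__all__ = ['pop_attr']\n",
    mod.__dict__,
)
sys.modules["demo_utils"] = mod

ns = {}
exec("from demo_utils import *", ns)
# helper() was defined in demo_utils, but __all__ excludes it:
print(sorted(k for k in ns if not k.startswith("__")))  # ['pop_attr']
```

Without `__all__`, a star-import would also have picked up `helper` (and anything else not underscore-prefixed), which is exactly the leakage the explicit lists in this PR prevent.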
19 changes: 19 additions & 0 deletions chemprop/data/__init__.py
@@ -10,3 +10,22 @@
)
from .samplers import ClassBalanceSampler, SeededSampler
from .splitting import split_component, SplitType

__all__ = [
"BatchMolGraph",
"TrainingBatch",
"collate_batch",
"collate_multicomponent",
"MolGraphDataLoader",
"MoleculeDatapoint",
"ReactionDatapoint",
"MoleculeDataset",
"ReactionDataset",
"Datum",
"MulticomponentDataset",
"MolGraphDataset",
"ClassBalanceSampler",
"SeededSampler",
"split_component",
"SplitType",
]
2 changes: 1 addition & 1 deletion chemprop/data/datapoints.py
@@ -73,7 +73,7 @@ class MoleculeDatapoint(_DatapointMixin, _MoleculeDatapointMixin):
concatenated to bond-level features *before* message passing"""
V_d: np.ndarray | None = None
"""A numpy array of shape ``V x d_vd``, where ``V`` is the number of atoms in the molecule, and
``d_vd`` is the number of additional descriptors that will be concatenated to atom-level
``d_vd`` is the number of additional descriptors that will be concatenated to atom-level
descriptors *after* message passing"""

def __post_init__(self, mfs: list[MoleculeFeaturizer] | None):
8 changes: 4 additions & 4 deletions chemprop/data/datasets.py
@@ -250,13 +250,13 @@ def normalize_inputs(

match key:
case "X_d":
X = None if np.all(self.X_d == None) else self.X_d
X = None if all(i is None for i in self.X_d) else self.X_d
case "V_f":
X = None if np.all(self.V_fs == None) else np.concatenate(self.V_fs, axis=0)
X = None if all(i is None for i in self.V_fs) else np.concatenate(self.V_fs, axis=0)
case "E_f":
X = None if np.all(self.E_fs == None) else np.concatenate(self.E_fs, axis=0)
X = None if all(i is None for i in self.E_fs) else np.concatenate(self.E_fs, axis=0)
case "V_d":
X = None if np.all(self.V_ds == None) else np.concatenate(self.V_ds, axis=0)
X = None if all(i is None for i in self.V_ds) else np.concatenate(self.V_ds, axis=0)
case None:
return [self.normalize_inputs(k, scaler) for k in VALID_KEYS - {None}]
case _:
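The change above swaps `np.all(self.X_d == None)` for an explicit `all(i is None for i in ...)`. A minimal standalone sketch of the difference (illustrative variable names, not Chemprop's actual data): once the list holds NumPy arrays, `== None` triggers elementwise comparisons rather than identity checks, while the generator form stays unambiguous:

```python
import numpy as np

# Mimic the dataset fields: per-datapoint extra descriptors, possibly absent.
X_d_absent = [None, None, None]
X_d_present = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]

def none_only(values):
    # Identity check per element. Safe for lists mixing arrays and None,
    # unlike np.all(values == None), which broadcasts `== None` over any
    # array elements instead of asking "is this entry missing?".
    return all(v is None for v in values)

print(none_only(X_d_absent))   # True
print(none_only(X_d_present))  # False
```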
1 change: 0 additions & 1 deletion chemprop/data/splitting.py
@@ -3,7 +3,6 @@
import logging
from enum import auto
from typing import Sequence
from random import Random
import numpy as np
from astartes import train_test_split, train_val_test_split
from astartes.molecules import train_test_split_molecules, train_val_test_split_molecules
41 changes: 39 additions & 2 deletions chemprop/featurizers/__init__.py
@@ -1,4 +1,41 @@
from .atom import MultiHotAtomFeaturizer, AtomFeaturizer
from .bond import MultiHotBondFeaturizer, BondFeaturizer
from .molgraph import *
from .molecule import *
from .molgraph import (
MolGraph,
MoleculeMolGraphFeaturizer,
SimpleMoleculeMolGraphFeaturizer,
RxnMolGraphFeaturizer,
CondensedGraphOfReactionFeaturizer,
CGRFeaturizer,
RxnMode,
)
from .molecule import (
MoleculeFeaturizer,
MorganFeaturizerMixin,
BinaryFeaturizerMixin,
CountFeaturizerMixin,
MorganBinaryFeaturizer,
MorganCountFeaturizer,
MoleculeFeaturizerRegistry,
)

__all__ = [
"MultiHotAtomFeaturizer",
"AtomFeaturizer",
"MultiHotBondFeaturizer",
"BondFeaturizer",
"MolGraph",
"MoleculeMolGraphFeaturizer",
"SimpleMoleculeMolGraphFeaturizer",
"RxnMolGraphFeaturizer",
"CondensedGraphOfReactionFeaturizer",
"CGRFeaturizer",
"RxnMode",
"MoleculeFeaturizer",
"MoleculeFeaturizerRegistry",
"MorganFeaturizerMixin",
"BinaryFeaturizerMixin",
"CountFeaturizerMixin",
"MorganBinaryFeaturizer",
"MorganCountFeaturizer",
]
2 changes: 1 addition & 1 deletion chemprop/featurizers/molecule.py
@@ -47,7 +47,7 @@ def __call__(self, mol: Chem.Mol) -> np.ndarray:


@MoleculeFeaturizerRegistry("morgan_binary")
class MorganBinaryFeaturzer(MorganFeaturizerMixin, BinaryFeaturizerMixin, MoleculeFeaturizer):
class MorganBinaryFeaturizer(MorganFeaturizerMixin, BinaryFeaturizerMixin, MoleculeFeaturizer):
pass


Expand Down