diff --git a/ml_project/Makefile b/ml_project/Makefile index 5aa376b..de6a2f6 100644 --- a/ml_project/Makefile +++ b/ml_project/Makefile @@ -23,7 +23,7 @@ endif ## Install Python Dependencies requirements: test_environment $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel - $(PYTHON_INTERPRETER) -m pip install -r requirements.txt + $(PYTHON_INTERPRETER) -m pip install -r dev_requirements.txt ## Build EDA report eda_report: requirements diff --git a/ml_project/config/experiment_1/train_config.yaml b/ml_project/config/experiment_1/train_config.yaml index d396f5c..b247c2e 100644 --- a/ml_project/config/experiment_1/train_config.yaml +++ b/ml_project/config/experiment_1/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model_experiment_1.pkl pipeline_save_path: models/pipeline_experiment_1.pkl +metadata_save_path: models/metadata_experiment_1.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/config/experiment_2/train_config.yaml b/ml_project/config/experiment_2/train_config.yaml index 615016a..54983b4 100644 --- a/ml_project/config/experiment_2/train_config.yaml +++ b/ml_project/config/experiment_2/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model_experiment_2.pkl pipeline_save_path: models/pipeline_experiment_2.pkl +metadata_save_path: models/metadata_experiment_2.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/config/train_config.yaml b/ml_project/config/train_config.yaml index eea124d..8030c70 100644 --- a/ml_project/config/train_config.yaml +++ b/ml_project/config/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model.pkl pipeline_save_path: models/pipeline.pkl +metadata_save_path: models/metadata.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/dev_requirements.txt b/ml_project/dev_requirements.txt new file mode 100644 index 0000000..93c9460 --- /dev/null +++ b/ml_project/dev_requirements.txt @@ -0,0 +1,4 @@ +# local package +-e . 
+ +-r requirements.txt diff --git a/ml_project/heart_disease/entities/pipeline_config.py b/ml_project/heart_disease/entities/pipeline_config.py index 5b0e95c..2862d28 100644 --- a/ml_project/heart_disease/entities/pipeline_config.py +++ b/ml_project/heart_disease/entities/pipeline_config.py @@ -15,6 +15,7 @@ class TrainingConfig: evaluation_config: EvaluateModelConfig = field(default_factory=lambda: EvaluateModelConfig) model_save_path: str = omegaconf.MISSING pipeline_save_path: str = omegaconf.MISSING + metadata_save_path: str = omegaconf.MISSING @dataclass diff --git a/ml_project/heart_disease/features/build_features.py b/ml_project/heart_disease/features/build_features.py index 52de418..f286d5f 100644 --- a/ml_project/heart_disease/features/build_features.py +++ b/ml_project/heart_disease/features/build_features.py @@ -1,5 +1,4 @@ -import pickle -from typing import List +from typing import List, Dict import numpy as np import pandas as pd @@ -12,6 +11,7 @@ from sklearn.random_projection import SparseRandomProjection from heart_disease.entities.feature_config import FeatureConfig +from heart_disease.utils import serialize_object, deserialize_object class StatisticalFeaturesExtractor(TransformerMixin): @@ -95,11 +95,22 @@ def extract_target(df: pd.DataFrame, config: FeatureConfig) -> pd.Series: return target +def extract_raw_features(df: pd.DataFrame, config: FeatureConfig) -> pd.DataFrame: + return df[config.raw_features.numeric_features + config.raw_features.categorical_features] + + def serialize_pipeline(pipeline: Pipeline, path: str): - with open(path, "wb") as f: - pickle.dump(pipeline, f) + serialize_object(pipeline, path) def deserialize_pipeline(path: str) -> Pipeline: - with open(path, "rb") as f: - return pickle.load(f) + return deserialize_object(path) + + +def serialize_metadata(df: pd.DataFrame, config: FeatureConfig, path: str): + all_features = config.raw_features.numeric_features + config.raw_features.categorical_features + return serialize_object(df[all_features].dtypes.to_dict(), path) + + +def deserialize_metadata(path: str) -> Dict[str, np.dtype]: + return deserialize_object(path) diff --git a/ml_project/heart_disease/models/model.py b/ml_project/heart_disease/models/model.py index 64048f1..5c8a30d 100644 --- a/ml_project/heart_disease/models/model.py +++ b/ml_project/heart_disease/models/model.py @@ -1,4 +1,3 @@ -import pickle from typing import Union, Dict, List import numpy as np @@ -7,6 +6,7 @@ from sklearn.metrics import get_scorer from heart_disease.entities.model_config import TrainModelConfig, ModelType +from heart_disease.utils import deserialize_object, serialize_object Classifier = Union[RandomForestClassifier, ExtraTreesClassifier] @@ -41,10 +41,8 @@ def save_metrics(metrics: Dict[str, float], path: str): def serialize_model(model: Classifier, path: str): - with open(path, "wb") as f: - pickle.dump(model, f) + serialize_object(model, path) def deserialize_model(path: str) -> Classifier: - with open(path, "rb") as f: - return pickle.load(f) + return deserialize_object(path) diff --git a/ml_project/heart_disease/models/train_model.py b/ml_project/heart_disease/models/train_model.py index a137b2f..6a9a4da 100644 --- a/ml_project/heart_disease/models/train_model.py +++ b/ml_project/heart_disease/models/train_model.py @@ -7,7 +7,8 @@ from heart_disease.data.make_dataset import load_datasets from heart_disease.entities.pipeline_config import TrainingConfig -from heart_disease.features.build_features import build_feature_pipeline, extract_target, 
serialize_pipeline +from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline, \ + serialize_metadata, extract_raw_features from heart_disease.models.model import train_model, evaluate_model, serialize_model, save_metrics log = logging.getLogger(__name__) @@ -27,19 +28,21 @@ def train_pipeline(cfg: TrainingConfig): log.info("Building features...") feature_pipeline = build_feature_pipeline(cfg.feature_config) - feature_pipeline.fit(train_data) - train_features = feature_pipeline.transform(train_data) - val_features = feature_pipeline.transform(val_data) + raw_train_features = extract_raw_features(train_data, cfg.feature_config) + raw_val_features = extract_raw_features(val_data, cfg.feature_config) + feature_pipeline.fit(raw_train_features) + train_features = feature_pipeline.transform(raw_train_features) + val_features = feature_pipeline.transform(raw_val_features) train_target = extract_target(train_data, cfg.feature_config) val_target = extract_target(val_data, cfg.feature_config) log.info("Features built") log.info(f"Training model {cfg.model_config.model.value}...") - model = train_model(train_features, train_target, cfg.model_config) + model = train_model(train_features, train_target.values, cfg.model_config) log.info("Model trained") log.info("Evaluating model...") - metrics = evaluate_model(model, val_features, val_target, cfg.evaluation_config.metrics) + metrics = evaluate_model(model, val_features, val_target.values, cfg.evaluation_config.metrics) save_metrics(metrics, to_absolute_path(cfg.evaluation_config.metric_file_path)) log.info("Model evaluated:") for metric, value in metrics.items(): @@ -48,6 +51,7 @@ def train_pipeline(cfg: TrainingConfig): log.info("Serializing...") serialize_model(model, to_absolute_path(cfg.model_save_path)) serialize_pipeline(feature_pipeline, to_absolute_path(cfg.pipeline_save_path)) + serialize_metadata(train_data, cfg.feature_config, to_absolute_path(cfg.metadata_save_path)) log.info("Model and pipeline serialized") diff --git a/ml_project/heart_disease/utils.py b/ml_project/heart_disease/utils.py new file mode 100644 index 0000000..a2fd51f --- /dev/null +++ b/ml_project/heart_disease/utils.py @@ -0,0 +1,14 @@ +import cloudpickle +from typing import Any + +cloudpickle.register_deep_serialization("heart_disease") + + +def serialize_object(obj: Any, path: str): + with open(path, "wb") as f: + cloudpickle.dump(obj, f) + + +def deserialize_object(path: str) -> Any: + with open(path, "rb") as f: + return cloudpickle.load(f) diff --git a/ml_project/requirements.txt b/ml_project/requirements.txt index 960b888..05c6058 100644 --- a/ml_project/requirements.txt +++ b/ml_project/requirements.txt @@ -1,6 +1,3 @@ -# local package --e . 
- # external requirements click==7.1.2 coverage==5.5 @@ -12,3 +9,5 @@ seaborn==0.11.1 scikit-learn==0.24.1 hydra-core==1.0.6 pytest==6.2.3 +python-dotenv==0.17.1 +git+git://github.com/polikutinevgeny/cloudpickle.git@206-deep-serialization diff --git a/ml_project/setup.py b/ml_project/setup.py index 0c16588..6c87d12 100644 --- a/ml_project/setup.py +++ b/ml_project/setup.py @@ -1,4 +1,13 @@ from setuptools import find_packages, setup +import pathlib +import pkg_resources + +with pathlib.Path('requirements.txt').open() as requirements_txt: + install_requires = [ + str(requirement) + for requirement + in pkg_resources.parse_requirements(requirements_txt) + ] setup( name='heart_disease', @@ -7,4 +16,5 @@ description='Predicting heart disease', author='Evgenii Polikutin', license='MIT', + install_requires=install_requires ) diff --git a/ml_project/tests/conftest.py b/ml_project/tests/conftest.py index 5206648..1427de9 100644 --- a/ml_project/tests/conftest.py +++ b/ml_project/tests/conftest.py @@ -1,6 +1,5 @@ from collections import OrderedDict -from pathlib import Path -from typing import List, Union, Dict, Callable +from typing import List, Union, Dict, Callable, Tuple import numpy as np import pandas as pd @@ -8,9 +7,14 @@ from numpy.random import Generator, PCG64 from heart_disease.data.make_dataset import read_data +from heart_disease.entities.data_loading_config import DataLoadingConfig from heart_disease.entities.feature_config import FeatureConfig, RandomProjectionFeaturesConfig, \ StatisticalFeaturesConfig, KMeansFeaturesConfig, PolynomialFeaturesConfig, RawFeaturesConfig -from heart_disease.features.build_features import build_feature_pipeline +from heart_disease.entities.model_config import TrainModelConfig, EvaluateModelConfig, ModelType +from heart_disease.entities.pipeline_config import TrainingConfig +from heart_disease.entities.splitting_config import SplittingConfig +from heart_disease.features.build_features import build_feature_pipeline, extract_raw_features +from heart_disease.models.train_model import train_pipeline def get_row_generators(rng: Generator) -> Dict[str, Callable]: @@ -32,17 +36,17 @@ def get_row_generators(rng: Generator) -> Dict[str, Callable]: } -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def categorical_features() -> List[str]: return ["thal", "ca", "slope", "exang", "restecg", "fbs", "cp", "sex"] -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def numerical_features() -> List[str]: return ["age", "trestbps", "chol", "thalach", "oldpeak"] -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def target_column() -> str: return "target" @@ -54,31 +58,48 @@ def generate_random_row(row_generators: Dict[str, Callable]) -> Dict[str, Union[ return row -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def dataset_filename() -> str: return "data.csv" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def dataset_size() -> int: return 200 -@pytest.fixture(scope='function') -def dataset_file(tmp_path: Path, dataset_filename: str, dataset_size: int) -> str: +@pytest.fixture(scope="session") +def dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int) -> str: + path = tmp_path_factory.mktemp("path") rng = Generator(PCG64(12345)) data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) - dataset_path = tmp_path / dataset_filename + dataset_path = path / dataset_filename data.to_csv(dataset_path, index=False) 
return str(dataset_path) -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") +def test_dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int, target_column: str) -> str: + path = tmp_path_factory.mktemp("path") + rng = Generator(PCG64(12345)) + data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) + data.drop(columns=[target_column, ], inplace=True) + dataset_path = path / dataset_filename + data.to_csv(dataset_path, index=False) + return str(dataset_path) + + +@pytest.fixture(scope="session") def dataset(dataset_file: str) -> pd.DataFrame: return read_data(dataset_file) -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") +def test_dataset(test_dataset_file: str) -> pd.DataFrame: + return read_data(test_dataset_file) + + +@pytest.fixture(scope="session") def features( dataset: pd.DataFrame, categorical_features: List[str], @@ -92,16 +113,16 @@ def features( config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, polynomial_degree, projection_features, statistics) pipeline = build_feature_pipeline(config) - transformed_features = pipeline.fit_transform(dataset) + transformed_features = pipeline.fit_transform(extract_raw_features(dataset, config)) return transformed_features -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") def target(dataset: pd.DataFrame, target_column: str) -> np.ndarray: return dataset[target_column].values -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def statistics() -> OrderedDict[str, Callable]: return OrderedDict(sum=np.sum, var=lambda x, **kwargs: np.var(x, ddof=1, **kwargs), median=np.median, mean=np.mean, std=lambda x, **kwargs: np.std(x, ddof=1, **kwargs), max=np.max, min=np.min) @@ -132,6 +153,71 @@ def get_feature_config( return config -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def metrics() -> List[str]: return ["accuracy", "f1", "precision", "recall"] + + +@pytest.fixture(scope="session") +def metrics_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "metrics.yaml") + + +@pytest.fixture(scope="session") +def model_save_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "model.pkl") + + +@pytest.fixture(scope="session") +def pipeline_save_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "pipeline.pkl") + + +@pytest.fixture(scope="session") +def metadata_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "metadata.pkl") + + +@pytest.fixture(scope="session") +def train_artifacts( + categorical_features: List[str], + dataset_file: str, + metrics: List[str], + numerical_features: List[str], + statistics: OrderedDict[str, Callable], + model_save_path: str, + pipeline_save_path: str, + metrics_path: str, + metadata_path: str, + target_column: str +) -> Tuple[str, str, str, str]: + projection_features = 5 + polynomial_degree = 2 + n_clusters = 2 + feature_config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, + polynomial_degree, + projection_features, statistics) + config = TrainingConfig( + data_load_config=DataLoadingConfig( + split_config=SplittingConfig( + random_state=42, + val_size=0.2 + ), + data_path=dataset_file + ), + feature_config=feature_config, + model_config=TrainModelConfig( + model=ModelType.random_forest, + random_state=42, + params=dict(n_estimators=55) + ), + 
evaluation_config=EvaluateModelConfig( + metrics=metrics, + metric_file_path=metrics_path + ), + pipeline_save_path=pipeline_save_path, + model_save_path=model_save_path, + metadata_save_path=metadata_path + ) + train_pipeline(config) + return metrics_path, model_save_path, pipeline_save_path, metadata_path diff --git a/ml_project/tests/features/test_build_features.py b/ml_project/tests/features/test_build_features.py index 3679602..442df2f 100644 --- a/ml_project/tests/features/test_build_features.py +++ b/ml_project/tests/features/test_build_features.py @@ -9,7 +9,7 @@ from heart_disease.entities.feature_config import FeatureConfig, RandomProjectionFeaturesConfig, \ KMeansFeaturesConfig, StatisticalFeaturesConfig, PolynomialFeaturesConfig from heart_disease.features.build_features import \ - StatisticalFeaturesExtractor, KMeansFeaturesExtractor + StatisticalFeaturesExtractor, KMeansFeaturesExtractor, serialize_metadata, deserialize_metadata from heart_disease.features.build_features import \ build_categorical_feature_pipeline, \ build_numerical_feature_pipeline, \ @@ -132,3 +132,24 @@ def test_serialize_pipeline( serialize_pipeline(pipeline, filename) loaded_pipeline = deserialize_pipeline(filename) assert np.all(pipeline.transform(dataset) == loaded_pipeline.transform(dataset)) + + +def test_serialize_metadata( + dataset: pd.DataFrame, + numerical_features: List[str], + categorical_features: List[str], + tmp_path: Path, + statistics: OrderedDict[str, Callable], + target_column: str +): + filename = str(tmp_path / "metadata.pkl") + projection_features = 5 + polynomial_degree = 2 + n_clusters = 2 + config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, polynomial_degree, + projection_features, statistics) + serialize_metadata(dataset, config, filename) + metadata = deserialize_metadata(filename) + for k, v in metadata.items(): + assert dataset[k].dtype == v + assert set(metadata.keys()) == set(numerical_features + categorical_features) diff --git a/ml_project/tests/models/test_models.py b/ml_project/tests/models/test_models.py index 699b738..650b9a6 100644 --- a/ml_project/tests/models/test_models.py +++ b/ml_project/tests/models/test_models.py @@ -21,7 +21,6 @@ def test_predict(features: np.ndarray, target: np.ndarray): config = TrainModelConfig(model=ModelType.random_forest, random_state=42, params=dict(n_estimators=50)) model = train_model(features, target, config) predicted = predict_model(model, features) - print(predicted.shape, target.shape) assert predicted.shape[0] == target.shape[0] diff --git a/ml_project/tests/models/test_pipelines.py b/ml_project/tests/models/test_pipelines.py index 16b6c9e..52bf651 100644 --- a/ml_project/tests/models/test_pipelines.py +++ b/ml_project/tests/models/test_pipelines.py @@ -1,75 +1,20 @@ from pathlib import Path -from typing import List, Callable, OrderedDict +from typing import Tuple import numpy as np import yaml -from heart_disease.entities.data_loading_config import DataLoadingConfig -from heart_disease.entities.model_config import TrainModelConfig, EvaluateModelConfig, ModelType -from heart_disease.entities.pipeline_config import TrainingConfig, PredictConfig -from heart_disease.entities.splitting_config import SplittingConfig +from heart_disease.entities.pipeline_config import PredictConfig from heart_disease.models.predict_model import predict -from heart_disease.models.train_model import train_pipeline -from tests.conftest import get_feature_config - - -def train_model( - categorical_features: 
List[str], - dataset_file: str, - metrics: List[str], - numerical_features: List[str], - statistics: OrderedDict[str, Callable], - target_column: str, - tmpdir: Path -): - model_save_path = str(tmpdir / "model.pkl") - pipeline_save_path = str(tmpdir / "pipeline.pkl") - metrics_path = str(tmpdir / "metrics.yaml") - projection_features = 5 - polynomial_degree = 2 - n_clusters = 2 - feature_config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, - polynomial_degree, - projection_features, statistics) - config = TrainingConfig( - data_load_config=DataLoadingConfig( - split_config=SplittingConfig( - random_state=42, - val_size=0.2 - ), - data_path=dataset_file - ), - feature_config=feature_config, - model_config=TrainModelConfig( - model=ModelType.random_forest, - random_state=42, - params=dict(n_estimators=55) - ), - evaluation_config=EvaluateModelConfig( - metrics=metrics, - metric_file_path=metrics_path - ), - pipeline_save_path=pipeline_save_path, - model_save_path=model_save_path, - ) - train_pipeline(config) - return metrics_path, model_save_path, pipeline_save_path def test_train_pipeline( - tmpdir: Path, - dataset_file: str, - categorical_features: List[str], - numerical_features: List[str], - target_column: str, - metrics: List[str], - statistics: OrderedDict[str, Callable] + train_artifacts: Tuple[str, str, str, str] ): - metrics_path, model_save_path, pipeline_save_path = train_model(categorical_features, dataset_file, metrics, - numerical_features, statistics, target_column, - tmpdir) + metrics_path, model_save_path, pipeline_save_path, metadata_path = train_artifacts assert Path(model_save_path).exists() assert Path(pipeline_save_path).exists() + assert Path(metadata_path).exists() assert Path(metrics_path).exists() with open(metrics_path, "r") as f: metric_values = yaml.safe_load(f) @@ -81,22 +26,16 @@ def test_train_pipeline( def test_predict_pipeline( tmpdir: Path, - dataset_file: str, + train_artifacts: Tuple[str, str, str, str], + test_dataset_file: str, dataset_size: int, - categorical_features: List[str], - numerical_features: List[str], - target_column: str, - metrics: List[str], - statistics: OrderedDict[str, Callable] ): + metrics_path, model_save_path, pipeline_save_path, metadata_path = train_artifacts output = str(tmpdir / "output.txt") - _, model_save_path, pipeline_save_path = train_model(categorical_features, dataset_file, metrics, - numerical_features, statistics, target_column, - tmpdir) config = PredictConfig( model_load_path=model_save_path, pipeline_load_path=pipeline_save_path, - data_path=dataset_file, + data_path=test_dataset_file, output_path=output ) predict(config) diff --git a/ml_project/tests/test_utils.py b/ml_project/tests/test_utils.py new file mode 100644 index 0000000..b9eeefe --- /dev/null +++ b/ml_project/tests/test_utils.py @@ -0,0 +1,11 @@ +from pathlib import Path + +from heart_disease.utils import serialize_object, deserialize_object + + +def test_serialize_object(tmp_path: Path): + path = str(tmp_path / "path.pkl") + objects = [1, 2.0, "test", {1, 2, "test2"}, {"hello": "there", "general": 42}, ["one", 2, 3.0]] + for obj in objects: + serialize_object(obj, path) + assert deserialize_object(path) == obj diff --git a/online_inference/.env b/online_inference/.env new file mode 100644 index 0000000..b7f27c9 --- /dev/null +++ b/online_inference/.env @@ -0,0 +1,3 @@ +model_path=model.pkl +pipeline_path=pipeline.pkl +metadata_path=metadata.pkl diff --git a/online_inference/Dockerfile 
b/online_inference/Dockerfile new file mode 100644 index 0000000..81782a8 --- /dev/null +++ b/online_inference/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.9.2-buster + +COPY requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +COPY api api + +COPY model.pkl model.pkl +COPY pipeline.pkl pipeline.pkl +COPY metadata.pkl metadata.pkl + +WORKDIR . + +ENV model_path="/model.pkl" +ENV pipeline_path="/pipeline.pkl" +ENV metadata_path="/metadata.pkl" + +CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/online_inference/README.md b/online_inference/README.md new file mode 100644 index 0000000..2f6aaa8 --- /dev/null +++ b/online_inference/README.md @@ -0,0 +1,23 @@ +Building the image +----------------- +First train the model and put the artifacts next to the Dockerfile, then: +```shell +docker build . -t ml_project:latest +``` + +Publishing the image +----------------- +```shell +docker tag ml_project:latest polikutinevgeny/ml_project:latest +docker push polikutinevgeny/ml_project:latest +``` + +Running the image +------------- +```shell +docker pull polikutinevgeny/ml_project:latest +docker run -p 8000:80 polikutinevgeny/ml_project:latest +``` + +Send a test request with the script: +`python -m make_request` diff --git a/online_inference/api/__init__.py b/online_inference/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/online_inference/api/api.py b/online_inference/api/api.py new file mode 100644 index 0000000..2e3e19d --- /dev/null +++ b/online_inference/api/api.py @@ -0,0 +1,71 @@ +import logging +from pathlib import Path +from typing import List, Dict, Any + +import cloudpickle +import numpy as np +import pandas as pd +from dotenv import load_dotenv +from fastapi import FastAPI, HTTPException, Request + +from .schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings + +VALIDATION_ERROR_STATUS_CODE = 400 + +logger = logging.getLogger(__name__) + +load_dotenv() +settings = Settings() +app = FastAPI( + title="Heart disease prediction", +) + + +def deserialize_object(path: Path) -> Any: + with open(path, "rb") as f: + return cloudpickle.load(f) + + +@app.on_event("startup") +def load_artifacts(): + logger.info("Loading artifacts...") + app.state.metadata = deserialize_object(settings.metadata_path) + app.state.pipeline = deserialize_object(settings.pipeline_path) + app.state.model = deserialize_object(settings.model_path) + logger.info("Artifacts loaded") + + +def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) -> pd.DataFrame: + try: + data = pd.DataFrame(params.features, columns=params.columns) + except ValueError: + error_msg = "Failed to construct DataFrame from passed data" + logger.exception(error_msg) + raise_validation_error(error_msg) + for key, dtype in metadata.items(): + if key not in data.columns: + error_msg = f"Column {key} not found in data" + logger.error(error_msg) + raise_validation_error(error_msg) + if data[key].dtype != dtype: + try: + data[key] = data[key].astype(dtype) + except ValueError: + error_msg = f"Failed to cast column {key} to dtype {dtype}" + logger.exception(error_msg) + raise_validation_error(error_msg) + return data[list(metadata.keys())] + + +def raise_validation_error(error_msg): + raise HTTPException(status_code=VALIDATION_ERROR_STATUS_CODE, detail=error_msg) + + +@app.post("/predict", response_model=List[HeartDiseaseResponseModel]) +def predict(request: Request, params: HeartDiseaseModel): + data = rebuild_dataframe(params, app.state.metadata) + processed_features =
request.app.state.pipeline.transform(data) + predictions = request.app.state.model.predict(processed_features) + return [ + HeartDiseaseResponseModel(id=id_, has_disease=pred == 1) for id_, pred in zip(params.ids, predictions) + ] diff --git a/online_inference/api/schemas.py b/online_inference/api/schemas.py new file mode 100644 index 0000000..e829017 --- /dev/null +++ b/online_inference/api/schemas.py @@ -0,0 +1,21 @@ +from pathlib import Path +from typing import Union, List + +from pydantic import BaseModel, BaseSettings + + +class HeartDiseaseModel(BaseModel): + ids: List[int] + features: List[List[Union[int, float]]] + columns: List[str] + + +class HeartDiseaseResponseModel(BaseModel): + id: int + has_disease: bool + + +class Settings(BaseSettings): + model_path: Path + pipeline_path: Path + metadata_path: Path diff --git a/online_inference/make_request.py b/online_inference/make_request.py new file mode 100644 index 0000000..899e114 --- /dev/null +++ b/online_inference/make_request.py @@ -0,0 +1,17 @@ +import requests +import pandas as pd + +DATA_PATH = "subset.csv" + +if __name__ == '__main__': + data = pd.read_csv(DATA_PATH) + response = requests.post( + "http://localhost:8000/predict", + json={ + "ids": list(range(data.shape[0])), + "features": data.values.tolist(), + "columns": data.columns.tolist() + }, + ) + print(response.status_code) + print(response.json()) diff --git a/online_inference/requirements.txt b/online_inference/requirements.txt new file mode 100644 index 0000000..f6e6362 --- /dev/null +++ b/online_inference/requirements.txt @@ -0,0 +1,8 @@ +pandas==1.2.4 +pytest==6.2.3 +uvicorn[standard]==0.13.4 +fastapi==0.64.0 +python-dotenv==0.17.1 +scikit-learn==0.24.1 +omegaconf==2.0.6 +git+git://github.com/polikutinevgeny/cloudpickle.git@206-deep-serialization diff --git a/online_inference/subset.csv b/online_inference/subset.csv new file mode 100644 index 0000000..a9457ec --- /dev/null +++ b/online_inference/subset.csv @@ -0,0 +1,10 @@ +age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target +63,1,3,145,233,1,0,150,0,2.3,0,0,1,1 +37,1,2,130,250,0,1,187,0,3.5,0,0,2,1 +41,0,1,130,204,0,0,172,0,1.4,2,0,2,1 +56,1,1,120,236,0,1,178,0,0.8,2,0,2,1 +57,0,0,120,354,0,1,163,1,0.6,2,0,2,1 +57,1,0,140,192,0,1,148,0,0.4,1,0,1,1 +56,0,1,140,294,0,0,153,0,1.3,1,0,2,1 +44,1,1,120,263,0,1,173,0,0,2,0,3,1 +52,1,2,172,199,1,1,162,0,0.5,2,0,3,1 diff --git a/online_inference/tests/__init__.py b/online_inference/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/online_inference/tests/conftest.py b/online_inference/tests/conftest.py new file mode 100644 index 0000000..474a811 --- /dev/null +++ b/online_inference/tests/conftest.py @@ -0,0 +1,53 @@ +from typing import Union, Dict, Callable, List + +import pandas as pd +import pytest +from numpy.random import Generator, PCG64 + + +def get_row_generators(rng: Generator) -> Dict[str, Callable]: + return { + "age": lambda: rng.normal(54, 9), + "trestbps": lambda: rng.normal(131, 18), + "chol": lambda: rng.normal(246, 52), + "thalach": lambda: rng.normal(150, 23), + "oldpeak": lambda: rng.uniform(0, 6.2), + "thal": lambda: rng.integers(0, 4), + "ca": lambda: rng.integers(0, 5), + "slope": lambda: rng.integers(0, 3), + "exang": lambda: rng.integers(0, 2), + "restecg": lambda: rng.integers(0, 3), + "fbs": lambda: rng.integers(0, 2), + "cp": lambda: rng.integers(0, 4), + "sex": lambda: rng.integers(0, 2), + } + + +@pytest.fixture(scope="session") +def categorical_features() -> List[str]: + return ["thal", 
"ca", "slope", "exang", "restecg", "fbs", "cp", "sex"] + + +def generate_random_row(row_generators: Dict[str, Callable]) -> Dict[str, Union[int, float]]: + row = {} + for key, generator in row_generators.items(): + row[key] = generator() + return row + + +@pytest.fixture(scope="session") +def dataset_filename() -> str: + return "data.csv" + + +@pytest.fixture(scope="session") +def dataset_size() -> int: + return 200 + + +@pytest.fixture(scope="session") +def test_dataset(tmp_path_factory, dataset_filename: str, dataset_size: int) -> pd.DataFrame: + path = tmp_path_factory.mktemp("path") + rng = Generator(PCG64(12345)) + data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) + return data diff --git a/online_inference/tests/test_api.py b/online_inference/tests/test_api.py new file mode 100644 index 0000000..633055f --- /dev/null +++ b/online_inference/tests/test_api.py @@ -0,0 +1,66 @@ +from typing import Tuple, List + +import pandas as pd +import pytest +from dotenv import load_dotenv +from fastapi.testclient import TestClient +from pydantic import parse_obj_as + +from api.api import app +from api.schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings + +test_client = TestClient(app) + + +def test_predict(test_dataset: pd.DataFrame): + ids = list(range(test_dataset.shape[0])) + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=test_dataset.columns.tolist() + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 200 + preds = parse_obj_as(List[HeartDiseaseResponseModel], response.json()) + assert len(preds) == test_dataset.shape[0] + assert set([i.id for i in preds]) == set(ids) + + +def test_predict_wrong_shape(test_dataset: pd.DataFrame): + ids = list(range(test_dataset.shape[0])) + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=test_dataset.columns.tolist()[:-1] + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400 + + +def test_predict_wrong_column(test_dataset: pd.DataFrame): + ids = list(range(test_dataset.shape[0])) + columns = test_dataset.columns.tolist()[:-1] + ["obviously_extra_column"] + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=columns + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400 + + +def test_predict_wrong_dtype(test_dataset: pd.DataFrame, categorical_features: List[str]): + dataset_copy = test_dataset.copy(deep=True) + ids = list(range(dataset_copy.shape[0])) + dataset_copy[categorical_features[0]] = float('nan') + request = HeartDiseaseModel( + ids=ids, + features=dataset_copy.values.tolist(), + columns=dataset_copy.columns.tolist() + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400