From 00f42598e5bd8210237f56a5aa48a9d7aec4dfc4 Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 12:01:03 +1000 Subject: [PATCH 01/14] Add metadata saving --- .../config/experiment_1/train_config.yaml | 1 + .../config/experiment_2/train_config.yaml | 1 + ml_project/config/train_config.yaml | 1 + .../heart_disease/entities/pipeline_config.py | 1 + .../heart_disease/features/build_features.py | 19 ++++++++---- ml_project/heart_disease/models/model.py | 8 ++--- .../heart_disease/models/train_model.py | 8 +++-- ml_project/heart_disease/utils.py | 12 ++++++++ .../tests/features/test_build_features.py | 23 ++++++++++++++- ml_project/tests/models/test_pipelines.py | 29 ++++++++++++++----- ml_project/tests/test_utils.py | 11 +++++++ 11 files changed, 92 insertions(+), 22 deletions(-) create mode 100644 ml_project/heart_disease/utils.py create mode 100644 ml_project/tests/test_utils.py diff --git a/ml_project/config/experiment_1/train_config.yaml b/ml_project/config/experiment_1/train_config.yaml index d396f5c..b247c2e 100644 --- a/ml_project/config/experiment_1/train_config.yaml +++ b/ml_project/config/experiment_1/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model_experiment_1.pkl pipeline_save_path: models/pipeline_experiment_1.pkl +metadata_save_path: models/metadata_experiment_1.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/config/experiment_2/train_config.yaml b/ml_project/config/experiment_2/train_config.yaml index 615016a..54983b4 100644 --- a/ml_project/config/experiment_2/train_config.yaml +++ b/ml_project/config/experiment_2/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model_experiment_2.pkl pipeline_save_path: models/pipeline_experiment_2.pkl +metadata_save_path: models/metadata_experiment_2.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/config/train_config.yaml b/ml_project/config/train_config.yaml index eea124d..8030c70 100644 --- a/ml_project/config/train_config.yaml +++ b/ml_project/config/train_config.yaml @@ -1,5 +1,6 @@ model_save_path: models/model.pkl pipeline_save_path: models/pipeline.pkl +metadata_save_path: models/metadata.pkl data_load_config: data_path: data/heart.csv split_config: diff --git a/ml_project/heart_disease/entities/pipeline_config.py b/ml_project/heart_disease/entities/pipeline_config.py index 5b0e95c..2862d28 100644 --- a/ml_project/heart_disease/entities/pipeline_config.py +++ b/ml_project/heart_disease/entities/pipeline_config.py @@ -15,6 +15,7 @@ class TrainingConfig: evaluation_config: EvaluateModelConfig = field(default_factory=lambda: EvaluateModelConfig) model_save_path: str = omegaconf.MISSING pipeline_save_path: str = omegaconf.MISSING + metadata_save_path: str = omegaconf.MISSING @dataclass diff --git a/ml_project/heart_disease/features/build_features.py b/ml_project/heart_disease/features/build_features.py index 52de418..47903e0 100644 --- a/ml_project/heart_disease/features/build_features.py +++ b/ml_project/heart_disease/features/build_features.py @@ -1,5 +1,4 @@ -import pickle -from typing import List +from typing import List, Dict import numpy as np import pandas as pd @@ -12,6 +11,7 @@ from sklearn.random_projection import SparseRandomProjection from heart_disease.entities.feature_config import FeatureConfig +from heart_disease.utils import serialize_object, deserialize_object class StatisticalFeaturesExtractor(TransformerMixin): @@ -96,10 +96,17 @@ def extract_target(df: pd.DataFrame, config: FeatureConfig) -> 
pd.Series: def serialize_pipeline(pipeline: Pipeline, path: str): - with open(path, "wb") as f: - pickle.dump(pipeline, f) + serialize_object(pipeline, path) def deserialize_pipeline(path: str) -> Pipeline: - with open(path, "rb") as f: - return pickle.load(f) + return deserialize_object(path) + + +def serialize_metadata(df: pd.DataFrame, config: FeatureConfig, path: str): + all_features = config.raw_features.numeric_features + config.raw_features.categorical_features + return serialize_object(df[all_features].dtypes.to_dict(), path) + + +def deserialize_metadata(path: str) -> Dict[str, np.dtype]: + return deserialize_object(path) diff --git a/ml_project/heart_disease/models/model.py b/ml_project/heart_disease/models/model.py index 64048f1..5c8a30d 100644 --- a/ml_project/heart_disease/models/model.py +++ b/ml_project/heart_disease/models/model.py @@ -1,4 +1,3 @@ -import pickle from typing import Union, Dict, List import numpy as np @@ -7,6 +6,7 @@ from sklearn.metrics import get_scorer from heart_disease.entities.model_config import TrainModelConfig, ModelType +from heart_disease.utils import deserialize_object, serialize_object Classifier = Union[RandomForestClassifier, ExtraTreesClassifier] @@ -41,10 +41,8 @@ def save_metrics(metrics: Dict[str, float], path: str): def serialize_model(model: Classifier, path: str): - with open(path, "wb") as f: - pickle.dump(model, f) + serialize_object(model, path) def deserialize_model(path: str) -> Classifier: - with open(path, "rb") as f: - return pickle.load(f) + return deserialize_object(path) diff --git a/ml_project/heart_disease/models/train_model.py b/ml_project/heart_disease/models/train_model.py index a137b2f..4ad56b9 100644 --- a/ml_project/heart_disease/models/train_model.py +++ b/ml_project/heart_disease/models/train_model.py @@ -7,7 +7,8 @@ from heart_disease.data.make_dataset import load_datasets from heart_disease.entities.pipeline_config import TrainingConfig -from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline +from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline, \ + serialize_metadata from heart_disease.models.model import train_model, evaluate_model, serialize_model, save_metrics log = logging.getLogger(__name__) @@ -35,11 +36,11 @@ def train_pipeline(cfg: TrainingConfig): log.info("Features built") log.info(f"Training model {cfg.model_config.model.value}...") - model = train_model(train_features, train_target, cfg.model_config) + model = train_model(train_features, train_target.values, cfg.model_config) log.info("Model trained") log.info("Evaluating model...") - metrics = evaluate_model(model, val_features, val_target, cfg.evaluation_config.metrics) + metrics = evaluate_model(model, val_features, val_target.values, cfg.evaluation_config.metrics) save_metrics(metrics, to_absolute_path(cfg.evaluation_config.metric_file_path)) log.info("Model evaluated:") for metric, value in metrics.items(): @@ -48,6 +49,7 @@ def train_pipeline(cfg: TrainingConfig): log.info("Serializing...") serialize_model(model, to_absolute_path(cfg.model_save_path)) serialize_pipeline(feature_pipeline, to_absolute_path(cfg.pipeline_save_path)) + serialize_metadata(train_data, cfg.feature_config, to_absolute_path(cfg.metadata_save_path)) log.info("Model and pipeline serialized") diff --git a/ml_project/heart_disease/utils.py b/ml_project/heart_disease/utils.py new file mode 100644 index 0000000..144b839 --- /dev/null +++ 
b/ml_project/heart_disease/utils.py @@ -0,0 +1,12 @@ +import pickle +from typing import Any + + +def serialize_object(obj: Any, path: str): + with open(path, "wb") as f: + pickle.dump(obj, f) + + +def deserialize_object(path: str) -> Any: + with open(path, "rb") as f: + return pickle.load(f) diff --git a/ml_project/tests/features/test_build_features.py b/ml_project/tests/features/test_build_features.py index 3679602..442df2f 100644 --- a/ml_project/tests/features/test_build_features.py +++ b/ml_project/tests/features/test_build_features.py @@ -9,7 +9,7 @@ from heart_disease.entities.feature_config import FeatureConfig, RandomProjectionFeaturesConfig, \ KMeansFeaturesConfig, StatisticalFeaturesConfig, PolynomialFeaturesConfig from heart_disease.features.build_features import \ - StatisticalFeaturesExtractor, KMeansFeaturesExtractor + StatisticalFeaturesExtractor, KMeansFeaturesExtractor, serialize_metadata, deserialize_metadata from heart_disease.features.build_features import \ build_categorical_feature_pipeline, \ build_numerical_feature_pipeline, \ @@ -132,3 +132,24 @@ def test_serialize_pipeline( serialize_pipeline(pipeline, filename) loaded_pipeline = deserialize_pipeline(filename) assert np.all(pipeline.transform(dataset) == loaded_pipeline.transform(dataset)) + + +def test_serialize_metadata( + dataset: pd.DataFrame, + numerical_features: List[str], + categorical_features: List[str], + tmp_path: Path, + statistics: OrderedDict[str, Callable], + target_column: str +): + filename = str(tmp_path / "metadata.pkl") + projection_features = 5 + polynomial_degree = 2 + n_clusters = 2 + config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, polynomial_degree, + projection_features, statistics) + serialize_metadata(dataset, config, filename) + metadata = deserialize_metadata(filename) + for k, v in metadata.items(): + assert dataset[k].dtype == v + assert set(metadata.keys()) == set(numerical_features + categorical_features) diff --git a/ml_project/tests/models/test_pipelines.py b/ml_project/tests/models/test_pipelines.py index 16b6c9e..2bb24d3 100644 --- a/ml_project/tests/models/test_pipelines.py +++ b/ml_project/tests/models/test_pipelines.py @@ -25,6 +25,7 @@ def train_model( model_save_path = str(tmpdir / "model.pkl") pipeline_save_path = str(tmpdir / "pipeline.pkl") metrics_path = str(tmpdir / "metrics.yaml") + metadata_path = str(tmpdir / "metadata.pkl") projection_features = 5 polynomial_degree = 2 n_clusters = 2 @@ -51,9 +52,10 @@ def train_model( ), pipeline_save_path=pipeline_save_path, model_save_path=model_save_path, + metadata_save_path=metadata_path ) train_pipeline(config) - return metrics_path, model_save_path, pipeline_save_path + return metrics_path, model_save_path, pipeline_save_path, metadata_path def test_train_pipeline( @@ -65,11 +67,18 @@ def test_train_pipeline( metrics: List[str], statistics: OrderedDict[str, Callable] ): - metrics_path, model_save_path, pipeline_save_path = train_model(categorical_features, dataset_file, metrics, - numerical_features, statistics, target_column, - tmpdir) + metrics_path, model_save_path, pipeline_save_path, metadata_path = train_model( + categorical_features, + dataset_file, + metrics, + numerical_features, + statistics, + target_column, + tmpdir + ) assert Path(model_save_path).exists() assert Path(pipeline_save_path).exists() + assert Path(metadata_path).exists() assert Path(metrics_path).exists() with open(metrics_path, "r") as f: metric_values = yaml.safe_load(f) @@ -90,9 +99,15 @@ def 
test_predict_pipeline( statistics: OrderedDict[str, Callable] ): output = str(tmpdir / "output.txt") - _, model_save_path, pipeline_save_path = train_model(categorical_features, dataset_file, metrics, - numerical_features, statistics, target_column, - tmpdir) + _, model_save_path, pipeline_save_path, _ = train_model( + categorical_features, + dataset_file, + metrics, + numerical_features, + statistics, + target_column, + tmpdir + ) config = PredictConfig( model_load_path=model_save_path, pipeline_load_path=pipeline_save_path, diff --git a/ml_project/tests/test_utils.py b/ml_project/tests/test_utils.py new file mode 100644 index 0000000..b9eeefe --- /dev/null +++ b/ml_project/tests/test_utils.py @@ -0,0 +1,11 @@ +from pathlib import Path + +from heart_disease.utils import serialize_object, deserialize_object + + +def test_serialize_object(tmp_path: Path): + path = str(tmp_path / "path.pkl") + objects = [1, 2.0, "test", {1, 2, "test2"}, {"hello": "there", "general": 42}, ["one", 2, 3.0]] + for obj in objects: + serialize_object(obj, path) + assert deserialize_object(path) == obj From b7fd1b5e3aa9a1f5ca0ccf85b9f0475bcbb12a9b Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 15:41:01 +1000 Subject: [PATCH 02/14] Add API for online predictions --- .../heart_disease/features/build_features.py | 4 + .../heart_disease/models/train_model.py | 10 +- ml_project/online_inference/__init__.py | 0 ml_project/online_inference/api.py | 58 +++++++++ ml_project/online_inference/schemas.py | 21 +++ ml_project/requirements.txt | 3 + ml_project/tests/conftest.py | 120 +++++++++++++++--- ml_project/tests/models/test_models.py | 1 - ml_project/tests/models/test_pipelines.py | 92 ++------------ ml_project/tests/online_inference/__init__.py | 0 ml_project/tests/online_inference/test_api.py | 84 ++++++++++++ 11 files changed, 287 insertions(+), 106 deletions(-) create mode 100644 ml_project/online_inference/__init__.py create mode 100644 ml_project/online_inference/api.py create mode 100644 ml_project/online_inference/schemas.py create mode 100644 ml_project/tests/online_inference/__init__.py create mode 100644 ml_project/tests/online_inference/test_api.py diff --git a/ml_project/heart_disease/features/build_features.py b/ml_project/heart_disease/features/build_features.py index 47903e0..f286d5f 100644 --- a/ml_project/heart_disease/features/build_features.py +++ b/ml_project/heart_disease/features/build_features.py @@ -95,6 +95,10 @@ def extract_target(df: pd.DataFrame, config: FeatureConfig) -> pd.Series: return target +def extract_raw_features(df: pd.DataFrame, config: FeatureConfig) -> pd.DataFrame: + return df[config.raw_features.numeric_features + config.raw_features.categorical_features] + + def serialize_pipeline(pipeline: Pipeline, path: str): serialize_object(pipeline, path) diff --git a/ml_project/heart_disease/models/train_model.py b/ml_project/heart_disease/models/train_model.py index 4ad56b9..6a9a4da 100644 --- a/ml_project/heart_disease/models/train_model.py +++ b/ml_project/heart_disease/models/train_model.py @@ -8,7 +8,7 @@ from heart_disease.data.make_dataset import load_datasets from heart_disease.entities.pipeline_config import TrainingConfig from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline, \ - serialize_metadata + serialize_metadata, extract_raw_features from heart_disease.models.model import train_model, evaluate_model, serialize_model, save_metrics log = logging.getLogger(__name__) @@ -28,9 +28,11 @@ def 
train_pipeline(cfg: TrainingConfig): log.info("Building features...") feature_pipeline = build_feature_pipeline(cfg.feature_config) - feature_pipeline.fit(train_data) - train_features = feature_pipeline.transform(train_data) - val_features = feature_pipeline.transform(val_data) + raw_train_features = extract_raw_features(train_data, cfg.feature_config) + raw_val_features = extract_raw_features(val_data, cfg.feature_config) + feature_pipeline.fit(raw_train_features) + train_features = feature_pipeline.transform(raw_train_features) + val_features = feature_pipeline.transform(raw_val_features) train_target = extract_target(train_data, cfg.feature_config) val_target = extract_target(val_data, cfg.feature_config) log.info("Features built") diff --git a/ml_project/online_inference/__init__.py b/ml_project/online_inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml_project/online_inference/api.py b/ml_project/online_inference/api.py new file mode 100644 index 0000000..906ad35 --- /dev/null +++ b/ml_project/online_inference/api.py @@ -0,0 +1,58 @@ +import logging +from typing import List, Dict + +import numpy as np +import pandas as pd +from dotenv import load_dotenv +from fastapi import FastAPI, HTTPException, Request + +from heart_disease.features.build_features import deserialize_metadata, deserialize_pipeline +from heart_disease.models.model import deserialize_model +from online_inference.schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings + +logger = logging.getLogger(__name__) + +load_dotenv() +settings = Settings() +app = FastAPI( + title="Heart disease prediction", +) + + +@app.on_event("startup") +def load_artifacts(): + app.state.metadata = deserialize_metadata(str(settings.metadata_path)) + app.state.pipeline = deserialize_pipeline(str(settings.pipeline_path)) + app.state.model = deserialize_model(str(settings.model_path)) + + +def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) -> pd.DataFrame: + try: + data = pd.DataFrame(params.features, columns=params.columns) + except ValueError: + error_msg = "Failed to construct DataFrame from passed data" + logger.exception(error_msg) + raise HTTPException(status_code=400, detail=error_msg) + for key, dtype in metadata.items(): + if key not in data.columns: + error_msg = f"Column {key} not found in data" + logger.error(error_msg) + raise HTTPException(status_code=400, detail=error_msg) + if data[key].dtype != dtype: + try: + data[key] = data[key].astype(dtype) + except ValueError: + error_msg = f"Failed to cast column {key} to dtype {dtype}" + logger.exception(error_msg) + raise HTTPException(status_code=400, detail=error_msg) + return data + + +@app.post("/predict", response_model=List[HeartDiseaseResponseModel]) +def predict(request: Request, params: HeartDiseaseModel): + data = rebuild_dataframe(params, app.state.metadata) + processed_features = request.app.state.pipeline.transform(data) + predictions = request.app.state.model.predict(processed_features) + return [ + HeartDiseaseResponseModel(id=id_, has_disease=pred == 1) for id_, pred in zip(params.ids, predictions) + ] diff --git a/ml_project/online_inference/schemas.py b/ml_project/online_inference/schemas.py new file mode 100644 index 0000000..fa15805 --- /dev/null +++ b/ml_project/online_inference/schemas.py @@ -0,0 +1,21 @@ +from typing import Dict, Union, List +from pathlib import Path + +from pydantic import BaseModel, BaseSettings + + +class HeartDiseaseModel(BaseModel): + ids: List[int] + features: 
List[List[Union[int, float]]] + columns: List[str] + + +class HeartDiseaseResponseModel(BaseModel): + id: int + has_disease: bool + + +class Settings(BaseSettings): + model_path: Path + pipeline_path: Path + metadata_path: Path diff --git a/ml_project/requirements.txt b/ml_project/requirements.txt index 960b888..827181c 100644 --- a/ml_project/requirements.txt +++ b/ml_project/requirements.txt @@ -12,3 +12,6 @@ seaborn==0.11.1 scikit-learn==0.24.1 hydra-core==1.0.6 pytest==6.2.3 +uvicorn[standard]==0.13.4 +fastapi==0.64.0 +python-dotenv==0.17.1 diff --git a/ml_project/tests/conftest.py b/ml_project/tests/conftest.py index 5206648..1427de9 100644 --- a/ml_project/tests/conftest.py +++ b/ml_project/tests/conftest.py @@ -1,6 +1,5 @@ from collections import OrderedDict -from pathlib import Path -from typing import List, Union, Dict, Callable +from typing import List, Union, Dict, Callable, Tuple import numpy as np import pandas as pd @@ -8,9 +7,14 @@ from numpy.random import Generator, PCG64 from heart_disease.data.make_dataset import read_data +from heart_disease.entities.data_loading_config import DataLoadingConfig from heart_disease.entities.feature_config import FeatureConfig, RandomProjectionFeaturesConfig, \ StatisticalFeaturesConfig, KMeansFeaturesConfig, PolynomialFeaturesConfig, RawFeaturesConfig -from heart_disease.features.build_features import build_feature_pipeline +from heart_disease.entities.model_config import TrainModelConfig, EvaluateModelConfig, ModelType +from heart_disease.entities.pipeline_config import TrainingConfig +from heart_disease.entities.splitting_config import SplittingConfig +from heart_disease.features.build_features import build_feature_pipeline, extract_raw_features +from heart_disease.models.train_model import train_pipeline def get_row_generators(rng: Generator) -> Dict[str, Callable]: @@ -32,17 +36,17 @@ def get_row_generators(rng: Generator) -> Dict[str, Callable]: } -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def categorical_features() -> List[str]: return ["thal", "ca", "slope", "exang", "restecg", "fbs", "cp", "sex"] -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def numerical_features() -> List[str]: return ["age", "trestbps", "chol", "thalach", "oldpeak"] -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def target_column() -> str: return "target" @@ -54,31 +58,48 @@ def generate_random_row(row_generators: Dict[str, Callable]) -> Dict[str, Union[ return row -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def dataset_filename() -> str: return "data.csv" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def dataset_size() -> int: return 200 -@pytest.fixture(scope='function') -def dataset_file(tmp_path: Path, dataset_filename: str, dataset_size: int) -> str: +@pytest.fixture(scope="session") +def dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int) -> str: + path = tmp_path_factory.mktemp("path") rng = Generator(PCG64(12345)) data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) - dataset_path = tmp_path / dataset_filename + dataset_path = path / dataset_filename data.to_csv(dataset_path, index=False) return str(dataset_path) -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") +def test_dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int, target_column: str) -> str: + path = tmp_path_factory.mktemp("path") + rng = Generator(PCG64(12345)) + 
data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) + data.drop(columns=[target_column, ], inplace=True) + dataset_path = path / dataset_filename + data.to_csv(dataset_path, index=False) + return str(dataset_path) + + +@pytest.fixture(scope="session") def dataset(dataset_file: str) -> pd.DataFrame: return read_data(dataset_file) -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") +def test_dataset(test_dataset_file: str) -> pd.DataFrame: + return read_data(test_dataset_file) + + +@pytest.fixture(scope="session") def features( dataset: pd.DataFrame, categorical_features: List[str], @@ -92,16 +113,16 @@ def features( config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, polynomial_degree, projection_features, statistics) pipeline = build_feature_pipeline(config) - transformed_features = pipeline.fit_transform(dataset) + transformed_features = pipeline.fit_transform(extract_raw_features(dataset, config)) return transformed_features -@pytest.fixture(scope='function') +@pytest.fixture(scope="session") def target(dataset: pd.DataFrame, target_column: str) -> np.ndarray: return dataset[target_column].values -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def statistics() -> OrderedDict[str, Callable]: return OrderedDict(sum=np.sum, var=lambda x, **kwargs: np.var(x, ddof=1, **kwargs), median=np.median, mean=np.mean, std=lambda x, **kwargs: np.std(x, ddof=1, **kwargs), max=np.max, min=np.min) @@ -132,6 +153,71 @@ def get_feature_config( return config -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def metrics() -> List[str]: return ["accuracy", "f1", "precision", "recall"] + + +@pytest.fixture(scope="session") +def metrics_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "metrics.yaml") + + +@pytest.fixture(scope="session") +def model_save_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "model.pkl") + + +@pytest.fixture(scope="session") +def pipeline_save_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "pipeline.pkl") + + +@pytest.fixture(scope="session") +def metadata_path(tmp_path_factory) -> str: + return str(tmp_path_factory.mktemp("path") / "metadata.pkl") + + +@pytest.fixture(scope="session") +def train_artifacts( + categorical_features: List[str], + dataset_file: str, + metrics: List[str], + numerical_features: List[str], + statistics: OrderedDict[str, Callable], + model_save_path: str, + pipeline_save_path: str, + metrics_path: str, + metadata_path: str, + target_column: str +) -> Tuple[str, str, str, str]: + projection_features = 5 + polynomial_degree = 2 + n_clusters = 2 + feature_config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, + polynomial_degree, + projection_features, statistics) + config = TrainingConfig( + data_load_config=DataLoadingConfig( + split_config=SplittingConfig( + random_state=42, + val_size=0.2 + ), + data_path=dataset_file + ), + feature_config=feature_config, + model_config=TrainModelConfig( + model=ModelType.random_forest, + random_state=42, + params=dict(n_estimators=55) + ), + evaluation_config=EvaluateModelConfig( + metrics=metrics, + metric_file_path=metrics_path + ), + pipeline_save_path=pipeline_save_path, + model_save_path=model_save_path, + metadata_save_path=metadata_path + ) + train_pipeline(config) + return metrics_path, model_save_path, pipeline_save_path, 
metadata_path diff --git a/ml_project/tests/models/test_models.py b/ml_project/tests/models/test_models.py index 699b738..650b9a6 100644 --- a/ml_project/tests/models/test_models.py +++ b/ml_project/tests/models/test_models.py @@ -21,7 +21,6 @@ def test_predict(features: np.ndarray, target: np.ndarray): config = TrainModelConfig(model=ModelType.random_forest, random_state=42, params=dict(n_estimators=50)) model = train_model(features, target, config) predicted = predict_model(model, features) - print(predicted.shape, target.shape) assert predicted.shape[0] == target.shape[0] diff --git a/ml_project/tests/models/test_pipelines.py b/ml_project/tests/models/test_pipelines.py index 2bb24d3..52bf651 100644 --- a/ml_project/tests/models/test_pipelines.py +++ b/ml_project/tests/models/test_pipelines.py @@ -1,81 +1,17 @@ from pathlib import Path -from typing import List, Callable, OrderedDict +from typing import Tuple import numpy as np import yaml -from heart_disease.entities.data_loading_config import DataLoadingConfig -from heart_disease.entities.model_config import TrainModelConfig, EvaluateModelConfig, ModelType -from heart_disease.entities.pipeline_config import TrainingConfig, PredictConfig -from heart_disease.entities.splitting_config import SplittingConfig +from heart_disease.entities.pipeline_config import PredictConfig from heart_disease.models.predict_model import predict -from heart_disease.models.train_model import train_pipeline -from tests.conftest import get_feature_config - - -def train_model( - categorical_features: List[str], - dataset_file: str, - metrics: List[str], - numerical_features: List[str], - statistics: OrderedDict[str, Callable], - target_column: str, - tmpdir: Path -): - model_save_path = str(tmpdir / "model.pkl") - pipeline_save_path = str(tmpdir / "pipeline.pkl") - metrics_path = str(tmpdir / "metrics.yaml") - metadata_path = str(tmpdir / "metadata.pkl") - projection_features = 5 - polynomial_degree = 2 - n_clusters = 2 - feature_config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, - polynomial_degree, - projection_features, statistics) - config = TrainingConfig( - data_load_config=DataLoadingConfig( - split_config=SplittingConfig( - random_state=42, - val_size=0.2 - ), - data_path=dataset_file - ), - feature_config=feature_config, - model_config=TrainModelConfig( - model=ModelType.random_forest, - random_state=42, - params=dict(n_estimators=55) - ), - evaluation_config=EvaluateModelConfig( - metrics=metrics, - metric_file_path=metrics_path - ), - pipeline_save_path=pipeline_save_path, - model_save_path=model_save_path, - metadata_save_path=metadata_path - ) - train_pipeline(config) - return metrics_path, model_save_path, pipeline_save_path, metadata_path def test_train_pipeline( - tmpdir: Path, - dataset_file: str, - categorical_features: List[str], - numerical_features: List[str], - target_column: str, - metrics: List[str], - statistics: OrderedDict[str, Callable] + train_artifacts: Tuple[str, str, str, str] ): - metrics_path, model_save_path, pipeline_save_path, metadata_path = train_model( - categorical_features, - dataset_file, - metrics, - numerical_features, - statistics, - target_column, - tmpdir - ) + metrics_path, model_save_path, pipeline_save_path, metadata_path = train_artifacts assert Path(model_save_path).exists() assert Path(pipeline_save_path).exists() assert Path(metadata_path).exists() @@ -90,28 +26,16 @@ def test_train_pipeline( def test_predict_pipeline( tmpdir: Path, - dataset_file: str, + 
train_artifacts: Tuple[str, str, str, str], + test_dataset_file: str, dataset_size: int, - categorical_features: List[str], - numerical_features: List[str], - target_column: str, - metrics: List[str], - statistics: OrderedDict[str, Callable] ): + metrics_path, model_save_path, pipeline_save_path, metadata_path = train_artifacts output = str(tmpdir / "output.txt") - _, model_save_path, pipeline_save_path, _ = train_model( - categorical_features, - dataset_file, - metrics, - numerical_features, - statistics, - target_column, - tmpdir - ) config = PredictConfig( model_load_path=model_save_path, pipeline_load_path=pipeline_save_path, - data_path=dataset_file, + data_path=test_dataset_file, output_path=output ) predict(config) diff --git a/ml_project/tests/online_inference/__init__.py b/ml_project/tests/online_inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml_project/tests/online_inference/test_api.py b/ml_project/tests/online_inference/test_api.py new file mode 100644 index 0000000..8449ecc --- /dev/null +++ b/ml_project/tests/online_inference/test_api.py @@ -0,0 +1,84 @@ +from typing import Tuple, List + +import pandas as pd +import pytest +from dotenv import load_dotenv +from fastapi.testclient import TestClient +from pydantic import parse_obj_as + +from online_inference.schemas import HeartDiseaseModel, HeartDiseaseResponseModel + + +@pytest.fixture(scope="session") +def test_client(tmp_path_factory, train_artifacts: Tuple[str, str, str, str]): + # Alas, FastAPI currently cannot use DI in events. + # Setting up .env config manually + # https://github.com/tiangolo/fastapi/issues/2057 + + path = tmp_path_factory.mktemp("path") + + _, model_save_path, pipeline_save_path, metadata_path = train_artifacts + new_env = path / ".env" + with open(new_env, "w") as f: + print(f"model_path={model_save_path}", file=f) + print(f"pipeline_path={pipeline_save_path}", file=f) + print(f"metadata_path={metadata_path}", file=f) + load_dotenv(dotenv_path=new_env) + + from online_inference.api import app + client = TestClient(app) + return client + + +def test_predict(test_dataset: pd.DataFrame, test_client: TestClient): + ids = list(range(test_dataset.shape[0])) + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=test_dataset.columns.tolist() + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 200 + preds = parse_obj_as(List[HeartDiseaseResponseModel], response.json()) + assert len(preds) == test_dataset.shape[0] + assert set([i.id for i in preds]) == set(ids) + + +def test_predict_wrong_shape(test_dataset: pd.DataFrame, test_client: TestClient): + ids = list(range(test_dataset.shape[0])) + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=test_dataset.columns.tolist()[:-1] + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400 + + +def test_predict_wrong_column(test_dataset: pd.DataFrame, test_client: TestClient): + ids = list(range(test_dataset.shape[0])) + columns = test_dataset.columns.tolist()[:-1] + ["obviously_extra_column"] + request = HeartDiseaseModel( + ids=ids, + features=test_dataset.values.tolist(), + columns=columns + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400 + + +def test_predict_wrong_dtype(test_dataset: pd.DataFrame, test_client: TestClient, 
categorical_features: List[str]): + dataset_copy = test_dataset.copy(deep=True) + ids = list(range(dataset_copy.shape[0])) + dataset_copy[categorical_features[0]] = float('nan') + request = HeartDiseaseModel( + ids=ids, + features=dataset_copy.values.tolist(), + columns=dataset_copy.columns.tolist() + ) + with test_client as client: + response = client.post("/predict", data=request.json()) + assert response.status_code == 400 From d36a68b3af246a84330ac8ffeb2e71d70a1660bd Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 15:51:58 +1000 Subject: [PATCH 03/14] Ignore extra columns --- ml_project/online_inference/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml_project/online_inference/api.py b/ml_project/online_inference/api.py index 906ad35..dc95111 100644 --- a/ml_project/online_inference/api.py +++ b/ml_project/online_inference/api.py @@ -45,7 +45,7 @@ def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) error_msg = f"Failed to cast column {key} to dtype {dtype}" logger.exception(error_msg) raise HTTPException(status_code=400, detail=error_msg) - return data + return data[list(metadata.keys())] @app.post("/predict", response_model=List[HeartDiseaseResponseModel]) From 6e8769cd30c94c598aa82a1699134307fb3eb198 Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 15:52:09 +1000 Subject: [PATCH 04/14] Add request script --- ml_project/online_inference/make_request.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 ml_project/online_inference/make_request.py diff --git a/ml_project/online_inference/make_request.py b/ml_project/online_inference/make_request.py new file mode 100644 index 0000000..bbd7753 --- /dev/null +++ b/ml_project/online_inference/make_request.py @@ -0,0 +1,18 @@ +import requests + +from heart_disease.data.make_dataset import read_data + +DATA_PATH = "data/heart.csv" + +if __name__ == '__main__': + data = read_data(DATA_PATH) + response = requests.post( + "http://localhost:8000/predict", + json={ + "ids": list(range(data.shape[0])), + "features": data.values.tolist(), + "columns": data.columns.tolist() + }, + ) + print(response.status_code) + print(response.json()) From f6d426d7c5f3f7e77dc5c4b8a1f4bba02f76abdf Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 16:28:17 +1000 Subject: [PATCH 05/14] Separate requirements.txt --- ml_project/Makefile | 2 +- ml_project/dev_requirements.txt | 4 ++++ ml_project/requirements.txt | 3 --- 3 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 ml_project/dev_requirements.txt diff --git a/ml_project/Makefile b/ml_project/Makefile index 5aa376b..de6a2f6 100644 --- a/ml_project/Makefile +++ b/ml_project/Makefile @@ -23,7 +23,7 @@ endif ## Install Python Dependencies requirements: test_environment $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel - $(PYTHON_INTERPRETER) -m pip install -r requirements.txt + $(PYTHON_INTERPRETER) -m pip install -r dev_requirements.txt ## Build EDA report eda_report: requirements diff --git a/ml_project/dev_requirements.txt b/ml_project/dev_requirements.txt new file mode 100644 index 0000000..93c9460 --- /dev/null +++ b/ml_project/dev_requirements.txt @@ -0,0 +1,4 @@ +# local package +-e . + +-r requirements.txt diff --git a/ml_project/requirements.txt b/ml_project/requirements.txt index 827181c..6211586 100644 --- a/ml_project/requirements.txt +++ b/ml_project/requirements.txt @@ -1,6 +1,3 @@ -# local package --e . 
- # external requirements click==7.1.2 coverage==5.5 From 06d4af509afaf455c762ae834cee065016390983 Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 16:28:40 +1000 Subject: [PATCH 06/14] Add Dockerfile and instructions to run --- ml_project/Dockerfile | 19 +++++++++++++++++++ ml_project/README.md | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 ml_project/Dockerfile diff --git a/ml_project/Dockerfile b/ml_project/Dockerfile new file mode 100644 index 0000000..176971a --- /dev/null +++ b/ml_project/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.9.2-buster + +COPY requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +COPY heart_disease heart_disease +COPY online_inference online_inference + +COPY models/model.pkl model.pkl +COPY models/pipeline.pkl pipeline.pkl +COPY models/metadata.pkl metadata.pkl + +WORKDIR . + +ENV model_path="/model.pkl" +ENV pipeline_path="/pipeline.pkl" +ENV metadata_path="/metadata.pkl" + +CMD ["uvicorn", "online_inference.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/ml_project/README.md b/ml_project/README.md index 535f65e..9cae78a 100644 --- a/ml_project/README.md +++ b/ml_project/README.md @@ -39,3 +39,25 @@ make requirements Пути к файлам можно указывать через файлы конфигов, либо через параметры командной строки (e.g. `python -m heart_disease.models.predict_model data_path="data/new_data.csv"`) + + +Сборка образа +----------------- +```shell +python -m heart_disease.models.train_model +docker build . -t ml_project:latest +``` + +Публикация образа +----------------- +```shell +docker tag ml_project:latest polikutinevgeny/ml_project:latest +docker push polikutinevgeny/ml_project:latest +``` + +Запуск образа +------------- +```shell +docker pull polikutinevgeny/ml_project:latest +docker run -p 8000:80 polikutinevgeny/ml_project:latest +``` From 9539dbb002a22a0d11f641e65d30ff9ba88b6f41 Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sun, 9 May 2021 16:29:27 +1000 Subject: [PATCH 07/14] Add request instructions --- ml_project/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ml_project/README.md b/ml_project/README.md index 9cae78a..000c736 100644 --- a/ml_project/README.md +++ b/ml_project/README.md @@ -61,3 +61,6 @@ docker push polikutinevgeny/ml_project:latest docker pull polikutinevgeny/ml_project:latest docker run -p 8000:80 polikutinevgeny/ml_project:latest ``` + +Протыкать скриптом: +`python -m online_inference.make_request` From c3ffef227dd3e715537a9002df157d9fd55e46ea Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Sat, 22 May 2021 12:32:25 +1000 Subject: [PATCH 08/14] Move online inference into a "separate" package. 
--- Dockerfile | 28 +++++++++++++++++++ README.md | 24 ++++++++++++++++ ml_project/tests/conftest.py => conftest.py | 0 ml_project/Dockerfile | 19 ------------- ml_project/README.md | 25 ----------------- ml_project/{online_inference => }/__init__.py | 0 ml_project/setup.py | 10 +++++++ .../tests/features/test_build_features.py | 2 +- .../__init__.py | 0 online_inference/api/__init__.py | 0 .../api}/api.py | 19 ++++++++----- .../api}/make_request.py | 7 ++--- .../api}/schemas.py | 0 online_inference/requirements.txt | 6 ++++ online_inference/tests/__init__.py | 0 .../tests}/test_api.py | 4 +-- 16 files changed, 86 insertions(+), 58 deletions(-) create mode 100644 Dockerfile rename ml_project/tests/conftest.py => conftest.py (100%) delete mode 100644 ml_project/Dockerfile rename ml_project/{online_inference => }/__init__.py (100%) rename {ml_project/tests/online_inference => online_inference}/__init__.py (100%) create mode 100644 online_inference/api/__init__.py rename {ml_project/online_inference => online_inference/api}/api.py (78%) rename {ml_project/online_inference => online_inference/api}/make_request.py (74%) rename {ml_project/online_inference => online_inference/api}/schemas.py (100%) create mode 100644 online_inference/requirements.txt create mode 100644 online_inference/tests/__init__.py rename {ml_project/tests/online_inference => online_inference/tests}/test_api.py (95%) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3b496ac --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.9.2-buster AS build + +COPY ml_project ml_project +WORKDIR /ml_project +RUN pip install --upgrade build +RUN python -m build + +FROM python:3.9.2-buster + +COPY online_inference/requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +COPY --from=build /ml_project/dist dist +RUN pip install dist/*.whl && rm -rf dist + +COPY online_inference/api api + +COPY ml_project/models/model.pkl model.pkl +COPY ml_project/models/pipeline.pkl pipeline.pkl +COPY ml_project/models/metadata.pkl metadata.pkl + +WORKDIR . + +ENV model_path="/model.pkl" +ENV pipeline_path="/pipeline.pkl" +ENV metadata_path="/metadata.pkl" + +CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/README.md b/README.md index 9224c6f..9d1129e 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,27 @@ Репозиторий для курса "Машинное обучение в продакшене" MADE. [Профиль](https://data.mail.ru/profile/e.polikutin/) + +Сборка образа +----------------- +```shell +python -m heart_disease.models.train_model +docker build . 
-t ml_project:latest +``` + +Публикация образа +----------------- +```shell +docker tag ml_project:latest polikutinevgeny/ml_project:latest +docker push polikutinevgeny/ml_project:latest +``` + +Запуск образа +------------- +```shell +docker pull polikutinevgeny/ml_project:latest +docker run -p 8000:80 polikutinevgeny/ml_project:latest +``` + +Протыкать скриптом: +`python -m online_inference.api.make_request` diff --git a/ml_project/tests/conftest.py b/conftest.py similarity index 100% rename from ml_project/tests/conftest.py rename to conftest.py diff --git a/ml_project/Dockerfile b/ml_project/Dockerfile deleted file mode 100644 index 176971a..0000000 --- a/ml_project/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM python:3.9.2-buster - -COPY requirements.txt ./requirements.txt -RUN pip install -r requirements.txt - -COPY heart_disease heart_disease -COPY online_inference online_inference - -COPY models/model.pkl model.pkl -COPY models/pipeline.pkl pipeline.pkl -COPY models/metadata.pkl metadata.pkl - -WORKDIR . - -ENV model_path="/model.pkl" -ENV pipeline_path="/pipeline.pkl" -ENV metadata_path="/metadata.pkl" - -CMD ["uvicorn", "online_inference.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/ml_project/README.md b/ml_project/README.md index 000c736..535f65e 100644 --- a/ml_project/README.md +++ b/ml_project/README.md @@ -39,28 +39,3 @@ make requirements Пути к файлам можно указывать через файлы конфигов, либо через параметры командной строки (e.g. `python -m heart_disease.models.predict_model data_path="data/new_data.csv"`) - - -Сборка образа ------------------ -```shell -python -m heart_disease.models.train_model -docker build . -t ml_project:latest -``` - -Публикация образа ------------------ -```shell -docker tag ml_project:latest polikutinevgeny/ml_project:latest -docker push polikutinevgeny/ml_project:latest -``` - -Запуск образа -------------- -```shell -docker pull polikutinevgeny/ml_project:latest -docker run -p 8000:80 polikutinevgeny/ml_project:latest -``` - -Протыкать скриптом: -`python -m online_inference.make_request` diff --git a/ml_project/online_inference/__init__.py b/ml_project/__init__.py similarity index 100% rename from ml_project/online_inference/__init__.py rename to ml_project/__init__.py diff --git a/ml_project/setup.py b/ml_project/setup.py index 0c16588..6c87d12 100644 --- a/ml_project/setup.py +++ b/ml_project/setup.py @@ -1,4 +1,13 @@ from setuptools import find_packages, setup +import pathlib +import pkg_resources + +with pathlib.Path('requirements.txt').open() as requirements_txt: + install_requires = [ + str(requirement) + for requirement + in pkg_resources.parse_requirements(requirements_txt) + ] setup( name='heart_disease', @@ -7,4 +16,5 @@ description='Predicting heart disease', author='Evgenii Polikutin', license='MIT', + install_requires=install_requires ) diff --git a/ml_project/tests/features/test_build_features.py b/ml_project/tests/features/test_build_features.py index 442df2f..7849a6c 100644 --- a/ml_project/tests/features/test_build_features.py +++ b/ml_project/tests/features/test_build_features.py @@ -17,7 +17,7 @@ serialize_pipeline, \ deserialize_pipeline, \ extract_target -from tests.conftest import get_feature_config +from conftest import get_feature_config EPS = 1e-6 diff --git a/ml_project/tests/online_inference/__init__.py b/online_inference/__init__.py similarity index 100% rename from ml_project/tests/online_inference/__init__.py rename to online_inference/__init__.py diff --git a/online_inference/api/__init__.py 
b/online_inference/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml_project/online_inference/api.py b/online_inference/api/api.py similarity index 78% rename from ml_project/online_inference/api.py rename to online_inference/api/api.py index dc95111..e440f50 100644 --- a/ml_project/online_inference/api.py +++ b/online_inference/api/api.py @@ -1,14 +1,14 @@ import logging -from typing import List, Dict +import pickle +from pathlib import Path +from typing import List, Dict, Any import numpy as np import pandas as pd from dotenv import load_dotenv from fastapi import FastAPI, HTTPException, Request -from heart_disease.features.build_features import deserialize_metadata, deserialize_pipeline -from heart_disease.models.model import deserialize_model -from online_inference.schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings +from .schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings logger = logging.getLogger(__name__) @@ -19,11 +19,16 @@ ) +def deserialize_object(path: Path) -> Any: + with open(path, "rb") as f: + return pickle.load(f) + + @app.on_event("startup") def load_artifacts(): - app.state.metadata = deserialize_metadata(str(settings.metadata_path)) - app.state.pipeline = deserialize_pipeline(str(settings.pipeline_path)) - app.state.model = deserialize_model(str(settings.model_path)) + app.state.metadata = deserialize_object(settings.metadata_path) + app.state.pipeline = deserialize_object(settings.pipeline_path) + app.state.model = deserialize_object(settings.model_path) def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) -> pd.DataFrame: diff --git a/ml_project/online_inference/make_request.py b/online_inference/api/make_request.py similarity index 74% rename from ml_project/online_inference/make_request.py rename to online_inference/api/make_request.py index bbd7753..4684e4a 100644 --- a/ml_project/online_inference/make_request.py +++ b/online_inference/api/make_request.py @@ -1,11 +1,10 @@ import requests +import pandas as pd -from heart_disease.data.make_dataset import read_data - -DATA_PATH = "data/heart.csv" +DATA_PATH = "ml_project/data/heart.csv" if __name__ == '__main__': - data = read_data(DATA_PATH) + data = pd.read_csv(DATA_PATH) response = requests.post( "http://localhost:8000/predict", json={ diff --git a/ml_project/online_inference/schemas.py b/online_inference/api/schemas.py similarity index 100% rename from ml_project/online_inference/schemas.py rename to online_inference/api/schemas.py diff --git a/online_inference/requirements.txt b/online_inference/requirements.txt new file mode 100644 index 0000000..c54630a --- /dev/null +++ b/online_inference/requirements.txt @@ -0,0 +1,6 @@ +pandas==1.2.4 +scikit-learn==0.24.1 +pytest==6.2.3 +uvicorn[standard]==0.13.4 +fastapi==0.64.0 +python-dotenv==0.17.1 diff --git a/online_inference/tests/__init__.py b/online_inference/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml_project/tests/online_inference/test_api.py b/online_inference/tests/test_api.py similarity index 95% rename from ml_project/tests/online_inference/test_api.py rename to online_inference/tests/test_api.py index 8449ecc..d82b20e 100644 --- a/ml_project/tests/online_inference/test_api.py +++ b/online_inference/tests/test_api.py @@ -6,7 +6,7 @@ from fastapi.testclient import TestClient from pydantic import parse_obj_as -from online_inference.schemas import HeartDiseaseModel, HeartDiseaseResponseModel +from online_inference.api.schemas import 
HeartDiseaseModel, HeartDiseaseResponseModel @pytest.fixture(scope="session") @@ -25,7 +25,7 @@ def test_client(tmp_path_factory, train_artifacts: Tuple[str, str, str, str]): print(f"metadata_path={metadata_path}", file=f) load_dotenv(dotenv_path=new_env) - from online_inference.api import app + from online_inference.api.api import app client = TestClient(app) return client From 2c725730cf28da82ce3ee22e12b2b0c034a9d8ab Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Tue, 25 May 2021 20:46:39 +1000 Subject: [PATCH 09/14] Remove dependency on heart_disease package in online inference --- Dockerfile | 28 ---------- ml_project/__init__.py | 0 ml_project/heart_disease/utils.py | 8 +-- ml_project/requirements.txt | 3 +- conftest.py => ml_project/tests/conftest.py | 0 .../tests/features/test_build_features.py | 2 +- online_inference/.env | 3 ++ online_inference/Dockerfile | 18 +++++++ README.md => online_inference/README.md | 4 +- online_inference/__init__.py | 0 online_inference/api/api.py | 4 +- online_inference/{api => }/make_request.py | 2 +- online_inference/requirements.txt | 4 +- online_inference/tests/conftest.py | 53 +++++++++++++++++++ online_inference/tests/test_api.py | 32 +++-------- 15 files changed, 96 insertions(+), 65 deletions(-) delete mode 100644 Dockerfile delete mode 100644 ml_project/__init__.py rename conftest.py => ml_project/tests/conftest.py (100%) create mode 100644 online_inference/.env create mode 100644 online_inference/Dockerfile rename README.md => online_inference/README.md (85%) delete mode 100644 online_inference/__init__.py rename online_inference/{api => }/make_request.py (90%) create mode 100644 online_inference/tests/conftest.py diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 3b496ac..0000000 --- a/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM python:3.9.2-buster AS build - -COPY ml_project ml_project -WORKDIR /ml_project -RUN pip install --upgrade build -RUN python -m build - -FROM python:3.9.2-buster - -COPY online_inference/requirements.txt ./requirements.txt -RUN pip install -r requirements.txt - -COPY --from=build /ml_project/dist dist -RUN pip install dist/*.whl && rm -rf dist - -COPY online_inference/api api - -COPY ml_project/models/model.pkl model.pkl -COPY ml_project/models/pipeline.pkl pipeline.pkl -COPY ml_project/models/metadata.pkl metadata.pkl - -WORKDIR . 
- -ENV model_path="/model.pkl" -ENV pipeline_path="/pipeline.pkl" -ENV metadata_path="/metadata.pkl" - -CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/ml_project/__init__.py b/ml_project/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ml_project/heart_disease/utils.py b/ml_project/heart_disease/utils.py index 144b839..a2fd51f 100644 --- a/ml_project/heart_disease/utils.py +++ b/ml_project/heart_disease/utils.py @@ -1,12 +1,14 @@ -import pickle +import cloudpickle from typing import Any +cloudpickle.register_deep_serialization("heart_disease") + def serialize_object(obj: Any, path: str): with open(path, "wb") as f: - pickle.dump(obj, f) + cloudpickle.dump(obj, f) def deserialize_object(path: str) -> Any: with open(path, "rb") as f: - return pickle.load(f) + return cloudpickle.load(f) diff --git a/ml_project/requirements.txt b/ml_project/requirements.txt index 6211586..05c6058 100644 --- a/ml_project/requirements.txt +++ b/ml_project/requirements.txt @@ -9,6 +9,5 @@ seaborn==0.11.1 scikit-learn==0.24.1 hydra-core==1.0.6 pytest==6.2.3 -uvicorn[standard]==0.13.4 -fastapi==0.64.0 python-dotenv==0.17.1 +git+git://github.com/polikutinevgeny/cloudpickle.git@206-deep-serialization diff --git a/conftest.py b/ml_project/tests/conftest.py similarity index 100% rename from conftest.py rename to ml_project/tests/conftest.py diff --git a/ml_project/tests/features/test_build_features.py b/ml_project/tests/features/test_build_features.py index 7849a6c..442df2f 100644 --- a/ml_project/tests/features/test_build_features.py +++ b/ml_project/tests/features/test_build_features.py @@ -17,7 +17,7 @@ serialize_pipeline, \ deserialize_pipeline, \ extract_target -from conftest import get_feature_config +from tests.conftest import get_feature_config EPS = 1e-6 diff --git a/online_inference/.env b/online_inference/.env new file mode 100644 index 0000000..b7f27c9 --- /dev/null +++ b/online_inference/.env @@ -0,0 +1,3 @@ +model_path=model.pkl +pipeline_path=pipeline.pkl +metadata_path=metadata.pkl diff --git a/online_inference/Dockerfile b/online_inference/Dockerfile new file mode 100644 index 0000000..81782a8 --- /dev/null +++ b/online_inference/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.9.2-buster + +COPY requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +COPY api api + +COPY model.pkl model.pkl +COPY pipeline.pkl pipeline.pkl +COPY metadata.pkl metadata.pkl + +WORKDIR . + +ENV model_path="/model.pkl" +ENV pipeline_path="/pipeline.pkl" +ENV metadata_path="/metadata.pkl" + +CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/README.md b/online_inference/README.md similarity index 85% rename from README.md rename to online_inference/README.md index 9d1129e..a5c01a4 100644 --- a/README.md +++ b/online_inference/README.md @@ -6,8 +6,8 @@ Сборка образа ----------------- +Сначала обучить и положить артефакты рядом, затем: ```shell -python -m heart_disease.models.train_model docker build . 
-t ml_project:latest ``` @@ -26,4 +26,4 @@ docker run -p 8000:80 polikutinevgeny/ml_project:latest ``` Протыкать скриптом: -`python -m online_inference.api.make_request` +`python -m make_request` diff --git a/online_inference/__init__.py b/online_inference/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/online_inference/api/api.py b/online_inference/api/api.py index e440f50..6296d5e 100644 --- a/online_inference/api/api.py +++ b/online_inference/api/api.py @@ -1,5 +1,5 @@ import logging -import pickle +import cloudpickle from pathlib import Path from typing import List, Dict, Any @@ -21,7 +21,7 @@ def deserialize_object(path: Path) -> Any: with open(path, "rb") as f: - return pickle.load(f) + return cloudpickle.load(f) @app.on_event("startup") diff --git a/online_inference/api/make_request.py b/online_inference/make_request.py similarity index 90% rename from online_inference/api/make_request.py rename to online_inference/make_request.py index 4684e4a..8e41f81 100644 --- a/online_inference/api/make_request.py +++ b/online_inference/make_request.py @@ -1,7 +1,7 @@ import requests import pandas as pd -DATA_PATH = "ml_project/data/heart.csv" +DATA_PATH = "../ml_project/data/heart.csv" if __name__ == '__main__': data = pd.read_csv(DATA_PATH) diff --git a/online_inference/requirements.txt b/online_inference/requirements.txt index c54630a..f6e6362 100644 --- a/online_inference/requirements.txt +++ b/online_inference/requirements.txt @@ -1,6 +1,8 @@ pandas==1.2.4 -scikit-learn==0.24.1 pytest==6.2.3 uvicorn[standard]==0.13.4 fastapi==0.64.0 python-dotenv==0.17.1 +scikit-learn==0.24.1 +omegaconf==2.0.6 +git+git://github.com/polikutinevgeny/cloudpickle.git@206-deep-serialization diff --git a/online_inference/tests/conftest.py b/online_inference/tests/conftest.py new file mode 100644 index 0000000..474a811 --- /dev/null +++ b/online_inference/tests/conftest.py @@ -0,0 +1,53 @@ +from typing import Union, Dict, Callable, List + +import pandas as pd +import pytest +from numpy.random import Generator, PCG64 + + +def get_row_generators(rng: Generator) -> Dict[str, Callable]: + return { + "age": lambda: rng.normal(54, 9), + "trestbps": lambda: rng.normal(131, 18), + "chol": lambda: rng.normal(246, 52), + "thalach": lambda: rng.normal(150, 23), + "oldpeak": lambda: rng.uniform(0, 6.2), + "thal": lambda: rng.integers(0, 4), + "ca": lambda: rng.integers(0, 5), + "slope": lambda: rng.integers(0, 3), + "exang": lambda: rng.integers(0, 2), + "restecg": lambda: rng.integers(0, 3), + "fbs": lambda: rng.integers(0, 2), + "cp": lambda: rng.integers(0, 4), + "sex": lambda: rng.integers(0, 2), + } + + +@pytest.fixture(scope="session") +def categorical_features() -> List[str]: + return ["thal", "ca", "slope", "exang", "restecg", "fbs", "cp", "sex"] + + +def generate_random_row(row_generators: Dict[str, Callable]) -> Dict[str, Union[int, float]]: + row = {} + for key, generator in row_generators.items(): + row[key] = generator() + return row + + +@pytest.fixture(scope="session") +def dataset_filename() -> str: + return "data.csv" + + +@pytest.fixture(scope="session") +def dataset_size() -> int: + return 200 + + +@pytest.fixture(scope="session") +def test_dataset(tmp_path_factory, dataset_filename: str, dataset_size: int) -> pd.DataFrame: + path = tmp_path_factory.mktemp("path") + rng = Generator(PCG64(12345)) + data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)]) + return data diff --git a/online_inference/tests/test_api.py 
b/online_inference/tests/test_api.py index d82b20e..633055f 100644 --- a/online_inference/tests/test_api.py +++ b/online_inference/tests/test_api.py @@ -6,31 +6,13 @@ from fastapi.testclient import TestClient from pydantic import parse_obj_as -from online_inference.api.schemas import HeartDiseaseModel, HeartDiseaseResponseModel +from api.api import app +from api.schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings +test_client = TestClient(app) -@pytest.fixture(scope="session") -def test_client(tmp_path_factory, train_artifacts: Tuple[str, str, str, str]): - # Alas, FastAPI currently cannot use DI in events. - # Setting up .env config manually - # https://github.com/tiangolo/fastapi/issues/2057 - path = tmp_path_factory.mktemp("path") - - _, model_save_path, pipeline_save_path, metadata_path = train_artifacts - new_env = path / ".env" - with open(new_env, "w") as f: - print(f"model_path={model_save_path}", file=f) - print(f"pipeline_path={pipeline_save_path}", file=f) - print(f"metadata_path={metadata_path}", file=f) - load_dotenv(dotenv_path=new_env) - - from online_inference.api.api import app - client = TestClient(app) - return client - - -def test_predict(test_dataset: pd.DataFrame, test_client: TestClient): +def test_predict(test_dataset: pd.DataFrame): ids = list(range(test_dataset.shape[0])) request = HeartDiseaseModel( ids=ids, @@ -45,7 +27,7 @@ def test_predict(test_dataset: pd.DataFrame, test_client: TestClient): assert set([i.id for i in preds]) == set(ids) -def test_predict_wrong_shape(test_dataset: pd.DataFrame, test_client: TestClient): +def test_predict_wrong_shape(test_dataset: pd.DataFrame): ids = list(range(test_dataset.shape[0])) request = HeartDiseaseModel( ids=ids, @@ -57,7 +39,7 @@ def test_predict_wrong_shape(test_dataset: pd.DataFrame, test_client: TestClient assert response.status_code == 400 -def test_predict_wrong_column(test_dataset: pd.DataFrame, test_client: TestClient): +def test_predict_wrong_column(test_dataset: pd.DataFrame): ids = list(range(test_dataset.shape[0])) columns = test_dataset.columns.tolist()[:-1] + ["obviously_extra_column"] request = HeartDiseaseModel( @@ -70,7 +52,7 @@ def test_predict_wrong_column(test_dataset: pd.DataFrame, test_client: TestClien assert response.status_code == 400 -def test_predict_wrong_dtype(test_dataset: pd.DataFrame, test_client: TestClient, categorical_features: List[str]): +def test_predict_wrong_dtype(test_dataset: pd.DataFrame, categorical_features: List[str]): dataset_copy = test_dataset.copy(deep=True) ids = list(range(dataset_copy.shape[0])) dataset_copy[categorical_features[0]] = float('nan') From f2c2fb05c724340766a374388a01e5949e95ba73 Mon Sep 17 00:00:00 2001 From: Evgenii Polikutin Date: Tue, 25 May 2021 20:48:48 +1000 Subject: [PATCH 10/14] Cleanup formatting --- online_inference/api/api.py | 2 +- online_inference/api/schemas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/online_inference/api/api.py b/online_inference/api/api.py index 6296d5e..9383a51 100644 --- a/online_inference/api/api.py +++ b/online_inference/api/api.py @@ -1,8 +1,8 @@ import logging -import cloudpickle from pathlib import Path from typing import List, Dict, Any +import cloudpickle import numpy as np import pandas as pd from dotenv import load_dotenv diff --git a/online_inference/api/schemas.py b/online_inference/api/schemas.py index fa15805..e829017 100644 --- a/online_inference/api/schemas.py +++ b/online_inference/api/schemas.py @@ -1,5 +1,5 @@ -from typing import Dict, Union, List from 
From 779a1ba26e454018a36f03ad370e71474adda42b Mon Sep 17 00:00:00 2001
From: Evgenii Polikutin
Date: Tue, 25 May 2021 21:03:15 +1000
Subject: [PATCH 11/14] Bring README back

---
 README.md                  | 5 +++++
 online_inference/README.md | 6 ------
 2 files changed, 5 insertions(+), 6 deletions(-)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9224c6f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# Machine Learning in Production
+
+Repository for the MADE course "Machine Learning in Production".
+
+[Profile](https://data.mail.ru/profile/e.polikutin/)
diff --git a/online_inference/README.md b/online_inference/README.md
index a5c01a4..2f6aaa8 100644
--- a/online_inference/README.md
+++ b/online_inference/README.md
@@ -1,9 +1,3 @@
-# Machine Learning in Production
-
-Repository for the MADE course "Machine Learning in Production".
-
-[Profile](https://data.mail.ru/profile/e.polikutin/)
-
 Building the image
 -----------------
 First train the model and put the artifacts next to it, then:

From 7f2b484b367e63f788e5e7ca7e2df2e670fcd0da Mon Sep 17 00:00:00 2001
From: Evgenii Polikutin
Date: Tue, 25 May 2021 21:10:05 +1000
Subject: [PATCH 12/14] Update make_request.py to not depend on data from ml_project

---
 online_inference/make_request.py |  2 +-
 online_inference/subset.csv      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 online_inference/subset.csv

diff --git a/online_inference/make_request.py b/online_inference/make_request.py
index 8e41f81..899e114 100644
--- a/online_inference/make_request.py
+++ b/online_inference/make_request.py
@@ -1,7 +1,7 @@
 import requests
 import pandas as pd
 
-DATA_PATH = "../ml_project/data/heart.csv"
+DATA_PATH = "subset.csv"
 
 if __name__ == '__main__':
     data = pd.read_csv(DATA_PATH)
diff --git a/online_inference/subset.csv b/online_inference/subset.csv
new file mode 100644
index 0000000..a9457ec
--- /dev/null
+++ b/online_inference/subset.csv
@@ -0,0 +1,10 @@
+age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
+63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
+37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
+41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
+56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
+57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
+57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
+56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
+44,1,1,120,263,0,1,173,0,0,2,0,3,1
+52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
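With subset.csv checked in next to the script, make_request.py no longer depends on the training repository at all. Only the first lines of the script appear in the patches; the remainder presumably posts the rows to the /predict endpoint. A sketch of that flow is below — the payload field names (ids, data, features) and the URL are assumptions taken from the tests and from the README's `docker run -p 8000:80` line, not from the script itself:

```python
# Hypothetical sketch of the rest of make_request.py; only its first lines are
# shown in the patches. Payload field names and the URL are assumptions.
import pandas as pd
import requests

DATA_PATH = "subset.csv"

if __name__ == '__main__':
    data = pd.read_csv(DATA_PATH).drop(columns=["target"])  # target is not a model input
    response = requests.post(
        "http://localhost:8000/predict",  # port 8000 per the README's docker run example
        json={
            "ids": list(range(data.shape[0])),
            "data": data.values.tolist(),
            "features": data.columns.tolist(),
        },
    )
    print(response.status_code)
    print(response.json())
```
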
From bfbf8a1fe853459a3c6d2e39f88a2ca01cf36178 Mon Sep 17 00:00:00 2001
From: Evgenii Polikutin
Date: Tue, 25 May 2021 21:15:24 +1000
Subject: [PATCH 13/14] Extract constant

---
 online_inference/api/api.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/online_inference/api/api.py b/online_inference/api/api.py
index 9383a51..4ef6367 100644
--- a/online_inference/api/api.py
+++ b/online_inference/api/api.py
@@ -10,6 +10,8 @@
 
 from .schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings
 
+VALIDATION_ERROR_STATUS_CODE = 400
+
 logger = logging.getLogger(__name__)
 
 load_dotenv()
@@ -37,22 +39,26 @@
     except ValueError:
         error_msg = "Failed to construct DataFrame from passed data"
         logger.exception(error_msg)
-        raise HTTPException(status_code=400, detail=error_msg)
+        raise_validation_error(error_msg)
     for key, dtype in metadata.items():
         if key not in data.columns:
             error_msg = f"Column {key} not found in data"
             logger.error(error_msg)
-            raise HTTPException(status_code=400, detail=error_msg)
+            raise_validation_error(error_msg)
         if data[key].dtype != dtype:
             try:
                 data[key] = data[key].astype(dtype)
             except ValueError:
                 error_msg = f"Failed to cast column {key} to dtype {dtype}"
                 logger.exception(error_msg)
-                raise HTTPException(status_code=400, detail=error_msg)
+                raise_validation_error(error_msg)
     return data[list(metadata.keys())]
 
 
+def raise_validation_error(error_msg):
+    raise HTTPException(status_code=VALIDATION_ERROR_STATUS_CODE, detail=error_msg)
+
+
 @app.post("/predict", response_model=List[HeartDiseaseResponseModel])
 def predict(request: Request, params: HeartDiseaseModel):
     data = rebuild_dataframe(params, app.state.metadata)

From c3b57be16a17d6cf2354050d3780c294086fa2e4 Mon Sep 17 00:00:00 2001
From: Evgenii Polikutin
Date: Tue, 25 May 2021 21:18:36 +1000
Subject: [PATCH 14/14] Add startup logging

---
 online_inference/api/api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/online_inference/api/api.py b/online_inference/api/api.py
index 4ef6367..2e3e19d 100644
--- a/online_inference/api/api.py
+++ b/online_inference/api/api.py
@@ -28,9 +28,11 @@
 
 @app.on_event("startup")
 def load_artifacts():
+    logger.info("Loading artifacts...")
     app.state.metadata = deserialize_object(settings.metadata_path)
     app.state.pipeline = deserialize_object(settings.pipeline_path)
     app.state.model = deserialize_object(settings.model_path)
+    logger.info("Artifacts loaded")
 
 
 def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) -> pd.DataFrame:
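The series ends with the startup hook populating `app.state`, while only the first line of the `/predict` handler ever appears in the diffs. For orientation, here is a sketch of how the loaded artifacts would plausibly finish that handler — the transform/predict calls and the second response field are assumptions, not code from the repository:

```python
# Hypothetical completion of the /predict handler; only its first line is shown
# in the patches. The pipeline.transform / model.predict calls and the `target`
# response field are assumptions based on the training code in ml_project.
@app.post("/predict", response_model=List[HeartDiseaseResponseModel])
def predict(request: Request, params: HeartDiseaseModel):
    data = rebuild_dataframe(params, app.state.metadata)   # shown in the patch
    features = app.state.pipeline.transform(data)          # feature pipeline loaded at startup
    predictions = app.state.model.predict(features)        # classifier loaded at startup
    return [
        HeartDiseaseResponseModel(id=i, target=int(p))     # `target` field name is a guess
        for i, p in zip(params.ids, predictions)
    ]
```
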