Homework No. 2 #3

Merged
merged 14 commits into from Jun 6, 2021
19 changes: 19 additions & 0 deletions ml_project/Dockerfile
@@ -0,0 +1,19 @@
FROM python:3.9.2-buster

COPY requirements.txt ./requirements.txt
RUN pip install -r requirements.txt

COPY heart_disease heart_disease
COPY online_inference online_inference

COPY models/model.pkl model.pkl
COPY models/pipeline.pkl pipeline.pkl
COPY models/metadata.pkl metadata.pkl

WORKDIR .

ENV model_path="/model.pkl"
ENV pipeline_path="/pipeline.pkl"
ENV metadata_path="/metadata.pkl"

CMD ["uvicorn", "online_inference.api:app", "--host", "0.0.0.0", "--port", "80"]
2 changes: 1 addition & 1 deletion ml_project/Makefile
@@ -23,7 +23,7 @@ endif
## Install Python Dependencies
requirements: test_environment
$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
$(PYTHON_INTERPRETER) -m pip install -r dev_requirements.txt

## Build EDA report
eda_report: requirements
25 changes: 25 additions & 0 deletions ml_project/README.md
@@ -39,3 +39,28 @@ make requirements

File paths can be set either in config files or via
command-line parameters (e.g. `python -m heart_disease.models.predict_model data_path="data/new_data.csv"`)


Building the image
-----------------
```shell
python -m heart_disease.models.train_model
docker build . -t ml_project:latest
```

Publishing the image
-----------------
```shell
docker tag ml_project:latest polikutinevgeny/ml_project:latest
docker push polikutinevgeny/ml_project:latest
```

Running the image
-------------
```shell
docker pull polikutinevgeny/ml_project:latest
docker run -p 8000:80 polikutinevgeny/ml_project:latest
```

Probe the service with the script:
`python -m online_inference.make_request`
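`make_request.py` (included below) posts the whole `heart.csv`. For a quick manual check, a minimal request might look like the sketch below; the column names are the standard UCI heart-dataset ones and are an assumption here, since the service validates the payload against the metadata saved at training time and rejects requests missing any training column.

```python
# Hand-rolled request sketch; column names assumed from the UCI heart dataset.
import requests

payload = {
    "ids": [0],
    "features": [[63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]],
    "columns": ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                "thalach", "exang", "oldpeak", "slope", "ca", "thal"],
}
response = requests.post("http://localhost:8000/predict", json=payload)
print(response.status_code, response.json())
```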
1 change: 1 addition & 0 deletions ml_project/config/experiment_1/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model_experiment_1.pkl
pipeline_save_path: models/pipeline_experiment_1.pkl
metadata_save_path: models/metadata_experiment_1.pkl
data_load_config:
data_path: data/heart.csv
split_config:
1 change: 1 addition & 0 deletions ml_project/config/experiment_2/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model_experiment_2.pkl
pipeline_save_path: models/pipeline_experiment_2.pkl
metadata_save_path: models/metadata_experiment_2.pkl
data_load_config:
data_path: data/heart.csv
split_config:
1 change: 1 addition & 0 deletions ml_project/config/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model.pkl
pipeline_save_path: models/pipeline.pkl
metadata_save_path: models/metadata.pkl
data_load_config:
data_path: data/heart.csv
split_config:
4 changes: 4 additions & 0 deletions ml_project/dev_requirements.txt
@@ -0,0 +1,4 @@
# local package
-e .

-r requirements.txt
1 change: 1 addition & 0 deletions ml_project/heart_disease/entities/pipeline_config.py
@@ -15,6 +15,7 @@ class TrainingConfig:
evaluation_config: EvaluateModelConfig = field(default_factory=lambda: EvaluateModelConfig)
model_save_path: str = omegaconf.MISSING
pipeline_save_path: str = omegaconf.MISSING
metadata_save_path: str = omegaconf.MISSING


@dataclass
23 changes: 17 additions & 6 deletions ml_project/heart_disease/features/build_features.py
@@ -1,5 +1,4 @@
import pickle
from typing import List
from typing import List, Dict

import numpy as np
import pandas as pd
@@ -12,6 +11,7 @@
from sklearn.random_projection import SparseRandomProjection

from heart_disease.entities.feature_config import FeatureConfig
from heart_disease.utils import serialize_object, deserialize_object


class StatisticalFeaturesExtractor(TransformerMixin):
@@ -95,11 +95,22 @@ def extract_target(df: pd.DataFrame, config: FeatureConfig) -> pd.Series:
return target


def extract_raw_features(df: pd.DataFrame, config: FeatureConfig) -> pd.DataFrame:
return df[config.raw_features.numeric_features + config.raw_features.categorical_features]


def serialize_pipeline(pipeline: Pipeline, path: str):
with open(path, "wb") as f:
pickle.dump(pipeline, f)
serialize_object(pipeline, path)


def deserialize_pipeline(path: str) -> Pipeline:
with open(path, "rb") as f:
return pickle.load(f)
return deserialize_object(path)


def serialize_metadata(df: pd.DataFrame, config: FeatureConfig, path: str):
all_features = config.raw_features.numeric_features + config.raw_features.categorical_features
return serialize_object(df[all_features].dtypes.to_dict(), path)


def deserialize_metadata(path: str) -> Dict[str, np.dtype]:
return deserialize_object(path)
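What `serialize_metadata` persists is just the raw-feature dtypes, so the API can later re-validate incoming JSON against the training schema. A tiny illustration with made-up data:

```python
# Illustration with assumed data: the metadata is a column -> dtype mapping.
import pandas as pd

df = pd.DataFrame({"age": [63, 37], "oldpeak": [2.3, 3.5]})
print(df.dtypes.to_dict())  # {'age': dtype('int64'), 'oldpeak': dtype('float64')}
```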
8 changes: 3 additions & 5 deletions ml_project/heart_disease/models/model.py
@@ -1,4 +1,3 @@
import pickle
from typing import Union, Dict, List

import numpy as np
@@ -7,6 +6,7 @@
from sklearn.metrics import get_scorer

from heart_disease.entities.model_config import TrainModelConfig, ModelType
from heart_disease.utils import deserialize_object, serialize_object

Classifier = Union[RandomForestClassifier, ExtraTreesClassifier]

@@ -41,10 +41,8 @@ def save_metrics(metrics: Dict[str, float], path: str):


def serialize_model(model: Classifier, path: str):
with open(path, "wb") as f:
pickle.dump(model, f)
serialize_object(model, path)


def deserialize_model(path: str) -> Classifier:
with open(path, "rb") as f:
return pickle.load(f)
return deserialize_object(path)
16 changes: 10 additions & 6 deletions ml_project/heart_disease/models/train_model.py
@@ -7,7 +7,8 @@

from heart_disease.data.make_dataset import load_datasets
from heart_disease.entities.pipeline_config import TrainingConfig
from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline
from heart_disease.features.build_features import build_feature_pipeline, extract_target, serialize_pipeline, \
serialize_metadata, extract_raw_features
from heart_disease.models.model import train_model, evaluate_model, serialize_model, save_metrics

log = logging.getLogger(__name__)
@@ -27,19 +28,21 @@ def train_pipeline(cfg: TrainingConfig):

log.info("Building features...")
feature_pipeline = build_feature_pipeline(cfg.feature_config)
feature_pipeline.fit(train_data)
train_features = feature_pipeline.transform(train_data)
val_features = feature_pipeline.transform(val_data)
raw_train_features = extract_raw_features(train_data, cfg.feature_config)
raw_val_features = extract_raw_features(val_data, cfg.feature_config)
feature_pipeline.fit(raw_train_features)
train_features = feature_pipeline.transform(raw_train_features)
val_features = feature_pipeline.transform(raw_val_features)
train_target = extract_target(train_data, cfg.feature_config)
val_target = extract_target(val_data, cfg.feature_config)
log.info("Features built")

log.info(f"Training model {cfg.model_config.model.value}...")
model = train_model(train_features, train_target, cfg.model_config)
model = train_model(train_features, train_target.values, cfg.model_config)
log.info("Model trained")

log.info("Evaluating model...")
metrics = evaluate_model(model, val_features, val_target, cfg.evaluation_config.metrics)
metrics = evaluate_model(model, val_features, val_target.values, cfg.evaluation_config.metrics)
save_metrics(metrics, to_absolute_path(cfg.evaluation_config.metric_file_path))
log.info("Model evaluated:")
for metric, value in metrics.items():
@@ -48,6 +51,7 @@ def train_pipeline(cfg: TrainingConfig):
log.info("Serializing...")
serialize_model(model, to_absolute_path(cfg.model_save_path))
serialize_pipeline(feature_pipeline, to_absolute_path(cfg.pipeline_save_path))
serialize_metadata(train_data, cfg.feature_config, to_absolute_path(cfg.metadata_save_path))
log.info("Model and pipeline serialized")


12 changes: 12 additions & 0 deletions ml_project/heart_disease/utils.py
@@ -0,0 +1,12 @@
import pickle
from typing import Any


def serialize_object(obj: Any, path: str):
with open(path, "wb") as f:
pickle.dump(obj, f)


def deserialize_object(path: str) -> Any:
with open(path, "rb") as f:
return pickle.load(f)
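A round-trip sketch of the two helpers above (the path is arbitrary, and the helpers are assumed to be in scope):

```python
# Round-trip: any picklable object survives serialize/deserialize.
serialize_object({"answer": 42}, "/tmp/obj.pkl")
assert deserialize_object("/tmp/obj.pkl") == {"answer": 42}
```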
Empty file.
58 changes: 58 additions & 0 deletions ml_project/online_inference/api.py
@@ -0,0 +1,58 @@
import logging
from typing import List, Dict

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request

from heart_disease.features.build_features import deserialize_metadata, deserialize_pipeline
from heart_disease.models.model import deserialize_model
from online_inference.schemas import HeartDiseaseModel, HeartDiseaseResponseModel, Settings

logger = logging.getLogger(__name__)

load_dotenv()
settings = Settings()
app = FastAPI(
title="Heart disease prediction",
)


@app.on_event("startup")
def load_artifacts():
app.state.metadata = deserialize_metadata(str(settings.metadata_path))
app.state.pipeline = deserialize_pipeline(str(settings.pipeline_path))
app.state.model = deserialize_model(str(settings.model_path))


def rebuild_dataframe(params: HeartDiseaseModel, metadata: Dict[str, np.dtype]) -> pd.DataFrame:
try:
data = pd.DataFrame(params.features, columns=params.columns)
except ValueError:
error_msg = "Failed to construct DataFrame from passed data"
logger.exception(error_msg)
raise HTTPException(status_code=400, detail=error_msg)
for key, dtype in metadata.items():
if key not in data.columns:
error_msg = f"Column {key} not found in data"
logger.error(error_msg)
raise HTTPException(status_code=400, detail=error_msg)
if data[key].dtype != dtype:
try:
data[key] = data[key].astype(dtype)
except ValueError:
error_msg = f"Failed to cast column {key} to dtype {dtype}"
logger.exception(error_msg)
raise HTTPException(status_code=400, detail=error_msg)
return data[list(metadata.keys())]


@app.post("/predict", response_model=List[HeartDiseaseResponseModel])
def predict(request: Request, params: HeartDiseaseModel):
data = rebuild_dataframe(params, app.state.metadata)
processed_features = request.app.state.pipeline.transform(data)
predictions = request.app.state.model.predict(processed_features)
return [
HeartDiseaseResponseModel(id=id_, has_disease=pred == 1) for id_, pred in zip(params.ids, predictions)
]
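The dtype check in `rebuild_dataframe` is a plain `astype` attempt; roughly, the failure path (values assumed) behaves like this sketch:

```python
# Sketch of the failure path in rebuild_dataframe (values assumed).
import numpy as np
import pandas as pd

data = pd.DataFrame({"age": ["sixty"]})  # client sent a non-numeric value
try:
    data["age"] = data["age"].astype(np.dtype("int64"))
except ValueError:
    print("-> HTTP 400: Failed to cast column age to dtype int64")
```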
18 changes: 18 additions & 0 deletions ml_project/online_inference/make_request.py
@@ -0,0 +1,18 @@
import requests

from heart_disease.data.make_dataset import read_data

DATA_PATH = "data/heart.csv"

if __name__ == '__main__':
data = read_data(DATA_PATH)
response = requests.post(
"http://localhost:8000/predict",
json={
"ids": list(range(data.shape[0])),
"features": data.values.tolist(),
"columns": data.columns.tolist()
},
)
print(response.status_code)
print(response.json())
21 changes: 21 additions & 0 deletions ml_project/online_inference/schemas.py
@@ -0,0 +1,21 @@
from typing import Dict, Union, List
from pathlib import Path

from pydantic import BaseModel, BaseSettings


class HeartDiseaseModel(BaseModel):
ids: List[int]
features: List[List[Union[int, float]]]
columns: List[str]


class HeartDiseaseResponseModel(BaseModel):
id: int
has_disease: bool


class Settings(BaseSettings):
model_path: Path
pipeline_path: Path
metadata_path: Path
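Outside Docker, the `load_dotenv()` call in `api.py` means the three paths can also come from a local `.env` file; a sketch using the default training output paths from `config/train_config.yaml`:

```
model_path=models/model.pkl
pipeline_path=models/pipeline.pkl
metadata_path=models/metadata.pkl
```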
6 changes: 3 additions & 3 deletions ml_project/requirements.txt
@@ -1,6 +1,3 @@
# local package
-e .

# external requirements
click==7.1.2
coverage==5.5
@@ -12,3 +9,6 @@ seaborn==0.11.1
scikit-learn==0.24.1
hydra-core==1.0.6
pytest==6.2.3
uvicorn[standard]==0.13.4
fastapi==0.64.0
python-dotenv==0.17.1