From 166009fa6d3b71d0dfad5575fcafd2aca4ad3360 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Tue, 25 Oct 2022 18:24:14 -0700 Subject: [PATCH] feat: from_sample Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> --- src/bentoml/_internal/io_descriptors/base.py | 18 +++-- src/bentoml/_internal/io_descriptors/file.py | 24 +++++-- src/bentoml/_internal/io_descriptors/image.py | 59 +++++++++++---- src/bentoml/_internal/io_descriptors/json.py | 42 ++++++++++- .../_internal/io_descriptors/multipart.py | 11 ++- src/bentoml/_internal/io_descriptors/numpy.py | 28 ++++---- .../_internal/io_descriptors/pandas.py | 71 ++++++++++++++----- src/bentoml/_internal/io_descriptors/text.py | 6 ++ .../service/openapi/specification.py | 2 +- 9 files changed, 194 insertions(+), 67 deletions(-) diff --git a/src/bentoml/_internal/io_descriptors/base.py b/src/bentoml/_internal/io_descriptors/base.py index b6f86ca6720..4bbd74fa8d9 100644 --- a/src/bentoml/_internal/io_descriptors/base.py +++ b/src/bentoml/_internal/io_descriptors/base.py @@ -53,7 +53,7 @@ class IODescriptor(ABC, t.Generic[IOType]): _mime_type: str _rpc_content_type: str = "application/grpc" _proto_fields: tuple[ProtoField] - _sample_input: IOType | None = None + _sample: IOType | None = None descriptor_id: str | None def __init_subclass__(cls, *, descriptor_id: str | None = None): @@ -66,12 +66,12 @@ def __init_subclass__(cls, *, descriptor_id: str | None = None): cls.descriptor_id = descriptor_id @property - def sample_input(self) -> IOType | None: - return self._sample_input + def sample(self) -> IOType | None: + return self._sample - @sample_input.setter - def sample_input(self, value: IOType) -> None: - self._sample_input = value + @sample.setter + def sample(self, value: IOType) -> None: + self._sample = value @abstractmethod def to_spec(self) -> dict[str, t.Any]: @@ -93,6 +93,10 @@ def input_type(self) -> InputType: def openapi_schema(self) -> Schema | Reference: raise NotImplementedError + def openapi_example(self) -> t.Any: + if self.sample is not None: + return self.sample + @abstractmethod def openapi_components(self) -> dict[str, t.Any] | None: raise NotImplementedError @@ -125,5 +129,5 @@ async def to_proto(self, obj: IOType) -> t.Any: @classmethod @abstractmethod - def from_sample(cls, sample_input: IOType, **kwargs: t.Any) -> Self: + def from_sample(cls, sample: IOType, **kwargs: t.Any) -> Self: ... diff --git a/src/bentoml/_internal/io_descriptors/file.py b/src/bentoml/_internal/io_descriptors/file.py index a0c45a8bf0b..edd175b4124 100644 --- a/src/bentoml/_internal/io_descriptors/file.py +++ b/src/bentoml/_internal/io_descriptors/file.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import os import typing as t import logging from typing import TYPE_CHECKING @@ -122,12 +123,23 @@ def __new__(cls, kind: FileKind = "binaryio", mime_type: str | None = None) -> F return res @classmethod - def from_sample(cls, sample_input: FileType, kind: FileKind = "binaryio") -> Self: + def from_sample(cls, sample: FileType | str, kind: FileKind = "binaryio") -> Self: import filetype - mime_type: str | None = filetype.guess_mime(sample_input) + mime_type: str | None = filetype.guess_mime(sample) + kls = cls(kind=kind, mime_type=mime_type) - kls.sample_input = sample_input + + if isinstance(sample, FileLike): + kls.sample = sample + elif isinstance(sample, t.IO): + kls.sample = FileLike[bytes](sample, "") + elif isinstance(sample, str) and os.path.exists(sample): + with open(sample, "rb") as f: + kls.sample = FileLike[bytes](f, "") + else: + raise InvalidArgument(f"Unknown sample type: '{sample}'") + return kls @classmethod @@ -196,7 +208,7 @@ async def to_proto(self, obj: FileType) -> pb.File: async def from_proto(self, field: pb.File | bytes) -> FileLike[bytes]: raise NotImplementedError - async def from_http_request(self, request: Request) -> t.IO[bytes]: + async def from_http_request(self, request: Request) -> FileLike[bytes]: raise NotImplementedError def to_spec(self) -> dict[str, t.Any]: @@ -213,7 +225,7 @@ def to_spec(self) -> dict[str, t.Any]: }, } - async def from_http_request(self, request: Request) -> t.IO[bytes]: + async def from_http_request(self, request: Request) -> FileLike[bytes]: content_type, _ = parse_options_header(request.headers["content-type"]) if content_type.decode("utf-8") == "multipart/form-data": form = await request.form() @@ -235,7 +247,7 @@ async def from_http_request(self, request: Request) -> t.IO[bytes]: return res # type: ignore if content_type.decode("utf-8") == self._mime_type: body = await request.body() - return t.cast(t.IO[bytes], FileLike(io.BytesIO(body), "")) + return FileLike[bytes](io.BytesIO(body), "") raise BentoMLException( f"File should have Content-Type '{self._mime_type}' or 'multipart/form-data', got {content_type} instead" ) diff --git a/src/bentoml/_internal/io_descriptors/image.py b/src/bentoml/_internal/io_descriptors/image.py index 8d57c0e3966..7f37c25acbb 100644 --- a/src/bentoml/_internal/io_descriptors/image.py +++ b/src/bentoml/_internal/io_descriptors/image.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import os import typing as t import functools from typing import TYPE_CHECKING @@ -22,6 +23,8 @@ from ..service.openapi.specification import Schema from ..service.openapi.specification import MediaType +PIL_EXC_MSG = "'Pillow' is required to use the Image IO descriptor. Install with 'pip install bentoml[io-image]'." + if TYPE_CHECKING: from types import UnionType @@ -30,7 +33,6 @@ from typing_extensions import Self from bentoml.grpc.v1alpha1 import service_pb2 as pb - from .base import OpenAPIResponse from .. import external_typing as ext from .base import OpenAPIResponse @@ -44,9 +46,8 @@ # NOTE: pillow-simd only benefits users who want to do preprocessing # TODO: add options for users to choose between simd and native mode - _exc = "'Pillow' is required to use the Image IO descriptor. Install it with: 'pip install -U Pillow'." - PIL = LazyLoader("PIL", globals(), "PIL", exc_msg=_exc) - PIL.Image = LazyLoader("PIL.Image", globals(), "PIL.Image", exc_msg=_exc) + PIL = LazyLoader("PIL", globals(), "PIL", exc_msg=PIL_EXC_MSG) + PIL.Image = LazyLoader("PIL.Image", globals(), "PIL.Image", exc_msg=PIL_EXC_MSG) pb, _ = import_generated_stubs() @@ -59,10 +60,7 @@ DEFAULT_PIL_MODE = "RGB" -PIL_WRITE_ONLY_FORMATS = { - "PALM", - "PDF", -} +PIL_WRITE_ONLY_FORMATS = {"PALM", "PDF"} READABLE_MIMES: set[str] = None # type: ignore (lazy constant) MIME_EXT_MAPPING: dict[str, str] = None # type: ignore (lazy constant) @@ -75,9 +73,7 @@ def initialize_pillow(): try: import PIL.Image except ImportError: - raise InternalServerError( - f"'Pillow' is required to use {__name__}. Install Pillow with 'pip install bentoml[io-image]'" - ) + raise InternalServerError(PIL_EXC_MSG) PIL.Image.init() MIME_EXT_MAPPING = {v: k for k, v in PIL.Image.MIME.items()} # type: ignore (lazy constant) @@ -214,6 +210,41 @@ def __init__( self._pilmode: _Mode | None = pilmode self._format: str = MIME_EXT_MAPPING[self._mime_type] + @classmethod + def from_sample( + cls, + sample: ImageType | str, + pilmode: _Mode | None = DEFAULT_PIL_MODE, + *, + allowed_mime_types: t.Iterable[str] | None = None, + ) -> Self: + from filetype.match import image_match + + img_type = image_match(sample) + if img_type is None: + raise InvalidArgument(f"{sample} is not a valid image file type.") + + kls = cls( + mime_type=img_type.mime, + pilmode=pilmode, + allowed_mime_types=allowed_mime_types, + ) + + if isinstance(sample, str) and os.path.exists(sample): + try: + with open(sample, "rb") as f: + kls.sample = PIL.Image.open(f) + except PIL.UnidentifiedImageError as err: + raise BadInput(f"Failed to parse sample image file: {err}") from None + elif LazyType["ext.NpNDArray"]("numpy.ndarray").isinstance(sample): + kls.sample = PIL.Image.fromarray(sample, mode=pilmode) + elif LazyType["PIL.Image.Image"]("PIL.Image.Image").isinstance(sample): + kls.sample = sample + else: + raise InvalidArgument(f"Unknown sample type: '{sample}'") + + return kls + def to_spec(self) -> dict[str, t.Any]: return { "id": self.descriptor_id, @@ -318,15 +349,15 @@ async def from_http_request(self, request: Request) -> ImageType: try: return PIL.Image.open(io.BytesIO(bytes_)) - except PIL.UnidentifiedImageError: # type: ignore (bad pillow types) - raise BadInput("Failed to parse uploaded image file") from None + except PIL.UnidentifiedImageError as err: + raise BadInput(f"Failed to parse uploaded image file: {err}") from None async def to_http_response( self, obj: ImageType, ctx: Context | None = None ) -> Response: if LazyType["ext.NpNDArray"]("numpy.ndarray").isinstance(obj): image = PIL.Image.fromarray(obj, mode=self._pilmode) - elif LazyType[PIL.Image.Image]("PIL.Image.Image").isinstance(obj): + elif LazyType["PIL.Image.Image"]("PIL.Image.Image").isinstance(obj): image = obj else: raise BadInput( diff --git a/src/bentoml/_internal/io_descriptors/json.py b/src/bentoml/_internal/io_descriptors/json.py index ba8619b49a8..c5b33e7cf75 100644 --- a/src/bentoml/_internal/io_descriptors/json.py +++ b/src/bentoml/_internal/io_descriptors/json.py @@ -23,6 +23,8 @@ from ..service.openapi.specification import Schema from ..service.openapi.specification import MediaType +EXC_MSG = "'pydantic' must be installed to use 'pydantic_model'. Install with 'pip install bentoml[io-json]'." + if TYPE_CHECKING: from types import UnionType @@ -36,9 +38,8 @@ from ..context import InferenceApiContext as Context else: - _exc_msg = "'pydantic' must be installed to use 'pydantic_model'. Install with 'pip install pydantic'." - pydantic = LazyLoader("pydantic", globals(), "pydantic", exc_msg=_exc_msg) - schema = LazyLoader("schema", globals(), "pydantic.schema", exc_msg=_exc_msg) + pydantic = LazyLoader("pydantic", globals(), "pydantic", exc_msg=EXC_MSG) + schema = LazyLoader("schema", globals(), "pydantic.schema", exc_msg=EXC_MSG) # lazy load our proto generated. struct_pb2 = LazyLoader("struct_pb2", globals(), "google.protobuf.struct_pb2") # lazy load numpy for processing ndarray. @@ -200,6 +201,22 @@ def __init__( "'validate_json' option from 'bentoml.io.JSON' has been deprecated. Use a Pydantic model to specify validation options instead." ) + @classmethod + def from_sample( + cls, + sample: JSONType, + *, + json_encoder: t.Type[json.JSONEncoder] = DefaultJsonEncoder, + ) -> Self: + pydantic_model: t.Type[pydantic.BaseModel] | None = None + if LazyType["pydantic.BaseModel"]("pydantic.BaseModel").isinstance(sample): + pydantic_model = sample.__class__ + + kls = cls(pydantic_model=pydantic_model, json_encoder=json_encoder) + + kls.sample = sample + return kls + def to_spec(self) -> dict[str, t.Any]: return { "id": self.descriptor_id, @@ -250,6 +267,25 @@ def openapi_components(self) -> dict[str, t.Any] | None: return {"schemas": pydantic_components_schema(self._pydantic_model)} + def openapi_example(self) -> t.Any: + if self.sample is not None: + if LazyType["pydantic.BaseModel"]("pydantic.BaseModel").isinstance( + self.sample + ): + return self.sample.dict() + elif isinstance(self.sample, str): + return json.dumps( + self.sample, + cls=self._json_encoder, + ensure_ascii=False, + allow_nan=False, + indent=None, + separators=(",", ":"), + ) + elif isinstance(self.sample, dict): + return self.sample + return + def openapi_request_body(self) -> dict[str, t.Any]: return { "content": {self._mime_type: MediaType(schema=self.openapi_schema())}, diff --git a/src/bentoml/_internal/io_descriptors/multipart.py b/src/bentoml/_internal/io_descriptors/multipart.py index 9f90bdc3fec..985ea281c29 100644 --- a/src/bentoml/_internal/io_descriptors/multipart.py +++ b/src/bentoml/_internal/io_descriptors/multipart.py @@ -16,15 +16,16 @@ from ..utils.formparser import populate_multipart_requests from ..utils.formparser import concat_to_multipart_response from ..service.openapi.specification import Schema -from ..service.openapi.specification import Response as OpenAPIResponse from ..service.openapi.specification import MediaType -from ..service.openapi.specification import RequestBody if TYPE_CHECKING: from types import UnionType + from typing_extensions import Self + from bentoml.grpc.v1alpha1 import service_pb2 as pb + from .base import OpenAPIResponse from ..types import LazyType from ..context import InferenceApiContext as Context else: @@ -174,6 +175,10 @@ def __init__(self, **inputs: IODescriptor[t.Any]): def __repr__(self) -> str: return f"Multipart({','.join([f'{k}={v}' for k,v in zip(self._inputs, map(repr, self._inputs.values()))])})" + @classmethod + def from_sample(cls, sample: dict[str, t.Any]) -> Self: + pass + def input_type( self, ) -> dict[str, t.Type[t.Any] | UnionType | LazyType[t.Any]]: @@ -217,7 +222,7 @@ def openapi_schema(self) -> Schema: def openapi_components(self) -> dict[str, t.Any] | None: pass - def openapi_request_body(self) -> RequestBody: + def openapi_request_body(self) -> dict[str, t.Any]: return { "content": {self._mime_type: MediaType(schema=self.openapi_schema())}, "required": True, diff --git a/src/bentoml/_internal/io_descriptors/numpy.py b/src/bentoml/_internal/io_descriptors/numpy.py index 2e5927e95b9..0716bb2247f 100644 --- a/src/bentoml/_internal/io_descriptors/numpy.py +++ b/src/bentoml/_internal/io_descriptors/numpy.py @@ -280,13 +280,11 @@ def openapi_components(self) -> dict[str, t.Any] | None: pass def openapi_example(self) -> t.Any: - if self.sample_input is not None: - if isinstance(self.sample_input, np.generic): - raise BadInput( - "NumpyNdarray: sample_input must be a numpy array." - ) from None - return self.sample_input.tolist() - return + if self.sample is not None: + if isinstance(self.sample, np.generic): + raise BadInput("NumpyNdarray: sample must be a numpy array.") from None + # NOTE: we only need to + return self.sample.ravel().tolist() def openapi_request_body(self) -> dict[str, t.Any]: return { @@ -394,7 +392,7 @@ async def to_http_response(self, obj: ext.NpNDArray, ctx: Context | None = None) @classmethod def from_sample( cls, - sample_input: ext.NpNDArray, + sample: ext.NpNDArray, enforce_dtype: bool = True, enforce_shape: bool = True, ) -> Self: @@ -402,7 +400,7 @@ def from_sample( Create a :obj:`NumpyNdarray` IO Descriptor from given inputs. Args: - sample_input: Given sample ``np.ndarray`` data + sample: Given sample ``np.ndarray`` data enforce_dtype: Enforce a certain data type. :code:`dtype` must be specified at function signature. If you don't want to enforce a specific dtype then change :code:`enforce_dtype=False`. @@ -436,20 +434,20 @@ def from_sample( async def predict(input: NDArray[np.int16]) -> NDArray[Any]: return await runner.async_run(input) """ - if isinstance(sample_input, np.generic): + if isinstance(sample, np.generic): raise BentoMLException( "'NumpyNdarray.from_sample()' expects a 'numpy.array', not 'numpy.generic'." ) from None - inst = cls( - dtype=sample_input.dtype, - shape=sample_input.shape, + kls = cls( + dtype=sample.dtype, + shape=sample.shape, enforce_dtype=enforce_dtype, enforce_shape=enforce_shape, ) - inst.sample_input = sample_input + kls.sample = sample - return inst + return kls async def from_proto(self, field: pb.NDArray | bytes) -> ext.NpNDArray: """ diff --git a/src/bentoml/_internal/io_descriptors/pandas.py b/src/bentoml/_internal/io_descriptors/pandas.py index 90e44ec69a9..b41e66d9bd3 100644 --- a/src/bentoml/_internal/io_descriptors/pandas.py +++ b/src/bentoml/_internal/io_descriptors/pandas.py @@ -24,6 +24,7 @@ from ..service.openapi.specification import Schema from ..service.openapi.specification import MediaType +EXC_MSG = "pandas' is required to use PandasDataFrame or PandasSeries. Install with 'pip install bentoml[io-pandas]'" if TYPE_CHECKING: import numpy as np import pandas as pd @@ -32,19 +33,14 @@ from bentoml.grpc.v1alpha1 import service_pb2 as pb from .. import external_typing as ext + from .base import OpenAPIResponse from ..context import InferenceApiContext as Context else: from bentoml.grpc.utils import import_generated_stubs pb, _ = import_generated_stubs() - np = LazyLoader("np", globals(), "numpy") - pd = LazyLoader( - "pd", - globals(), - "pandas", - exc_msg='pandas" is required to use PandasDataFrame or PandasSeries. Install with "pip install bentoml[io-pandas]"', - ) + pd = LazyLoader("pd", globals(), "pandas", exc_msg=EXC_MSG) logger = logging.getLogger(__name__) @@ -80,7 +76,8 @@ def _openapi_types(item: str) -> str: # pragma: no cover def _dataframe_openapi_schema( - dtype: bool | ext.PdDTypeArg | None, orient: ext.DataFrameOrient = None + dtype: bool | ext.PdDTypeArg | None, + orient: ext.DataFrameOrient = None, ) -> Schema: # pragma: no cover if isinstance(dtype, dict): if orient == "records": @@ -154,6 +151,8 @@ def __str__(self) -> str: return "parquet" elif self == SerializationFormat.CSV: return "csv" + else: + raise ValueError(f"Unknown serialization format: {self}") def _infer_serialization_format_from_request( @@ -323,7 +322,7 @@ def __init__( enforce_shape: bool = False, default_format: t.Literal["json", "parquet", "csv"] = "json", ): - self._orient = orient + self._orient: ext.DataFrameOrient = orient self._columns = columns self._apply_column_names = apply_column_names # TODO: convert dtype to numpy dtype @@ -363,6 +362,14 @@ def _convert_dtype( return None def to_spec(self) -> dict[str, t.Any]: + # TODO: support extension dtypes + dtype = None + if self._dtype is not None: + if isinstance(self._dtype, bool): + dtype = self._dtype + else: + dtype = self._dtype.name + return { "id": self.descriptor_id, "args": { @@ -392,6 +399,11 @@ def openapi_schema(self) -> Schema: def openapi_components(self) -> dict[str, t.Any] | None: pass + def openapi_example(self) -> t.Any: + if self.sample is not None: + return t.cast("dict[str, t.Any]", self.sample.to_dict()) + return + def openapi_request_body(self) -> dict[str, t.Any]: return { "content": {self._mime_type: MediaType(schema=self.openapi_schema())}, @@ -493,18 +505,18 @@ async def to_http_response( @classmethod def from_sample( cls, - sample_input: ext.PdDataFrame, + sample: ext.PdDataFrame, orient: ext.DataFrameOrient = "records", apply_column_names: bool = True, enforce_shape: bool = True, enforce_dtype: bool = True, default_format: t.Literal["json", "parquet", "csv"] = "json", - ) -> PandasDataFrame: + ) -> Self: """ Create a :obj:`PandasDataFrame` IO Descriptor from given inputs. Args: - sample_input: Given sample ``pd.DataFrame`` data + sample: Given sample ``pd.DataFrame`` data orient: Indication of expected JSON string format. Compatible JSON strings can be produced by :func:`pandas.io.json.to_json()` with a corresponding orient value. Possible orients are: @@ -547,19 +559,19 @@ def from_sample( @svc.api(input=input_spec, output=PandasDataFrame()) def predict(inputs: pd.DataFrame) -> pd.DataFrame: ... """ - inst = cls( + kls = cls( orient=orient, enforce_shape=enforce_shape, - shape=sample_input.shape, + shape=sample.shape, apply_column_names=apply_column_names, - columns=[str(x) for x in list(sample_input.columns)], + columns=[str(x) for x in list(sample.columns)], enforce_dtype=enforce_dtype, dtype=True, # set to True to infer from given input default_format=default_format, ) - inst.sample_input = sample_input + kls.sample = sample - return inst + return kls def validate_dataframe( self, dataframe: ext.PdDataFrame, exception_cls: t.Type[Exception] = BadInput @@ -796,7 +808,7 @@ def __init__( shape: tuple[int, ...] | None = None, enforce_shape: bool = False, ): - self._orient = orient + self._orient: ext.SeriesOrient = orient self._dtype = dtype self._enforce_dtype = enforce_dtype self._shape = shape @@ -842,6 +854,24 @@ def to_spec(self) -> dict[str, t.Any]: }, } + @classmethod + def from_sample( + cls, + sample: ext.PdSeries, + orient: ext.SeriesOrient = "records", + enforce_shape: bool = True, + enforce_dtype: bool = True, + ) -> Self: + kls = cls( + orient=orient, + dtype=sample.dtype, + enforce_dtype=enforce_dtype, + shape=sample.shape, + enforce_shape=enforce_shape, + ) + kls.sample = sample + return kls + @classmethod def from_spec(cls, spec: dict[str, t.Any]) -> Self: if "args" not in spec: @@ -855,6 +885,11 @@ def openapi_schema(self) -> Schema: def openapi_components(self) -> dict[str, t.Any] | None: pass + def openapi_example(self) -> t.Any: + if self.sample is not None: + return t.cast("dict[str, t.Any]", self.sample.to_dict()) + return + def openapi_request_body(self) -> dict[str, t.Any]: return { "content": {self._mime_type: MediaType(schema=self.openapi_schema())}, diff --git a/src/bentoml/_internal/io_descriptors/text.py b/src/bentoml/_internal/io_descriptors/text.py index e5d0873cf13..6eb41e4f386 100644 --- a/src/bentoml/_internal/io_descriptors/text.py +++ b/src/bentoml/_internal/io_descriptors/text.py @@ -99,6 +99,12 @@ def __init__(self, *args: t.Any, **kwargs: t.Any): f"'{self.__class__.__name__}' is not designed to take any args or kwargs during initialization." ) from None + @classmethod + def from_sample(cls, sample: str) -> Self: + kls = cls() + kls.sample = sample + return kls + def input_type(self) -> t.Type[str]: return str diff --git a/src/bentoml/_internal/service/openapi/specification.py b/src/bentoml/_internal/service/openapi/specification.py index a88763b1275..6f0ea7e6dc8 100644 --- a/src/bentoml/_internal/service/openapi/specification.py +++ b/src/bentoml/_internal/service/openapi/specification.py @@ -104,7 +104,7 @@ class Schema: anyOf: t.Optional[t.List[Schema]] = None not_: t.Optional[Schema] = None items: t.Optional[t.Union[Schema, t.List[Schema]]] = None - properties: t.Optional[t.Dict[str, Schema]] = None + properties: t.Optional[t.Dict[str, Schema | Reference]] = None additionalProperties: t.Optional[t.Union[Schema, Reference, bool]] = None description: t.Optional[str] = None format: t.Optional[str] = None