From 7cba0ac0fb4e3845b0e969ba7d9a02147cd452e6 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:43:03 -0700 Subject: [PATCH] grpc: serialization process (#2804) * chore: renaming for better coherency Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * refactor: cleanup files Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * chore: fix formats Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * chore: refactor exception to interceptor Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * chore: cleanup some impl Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> * chore: cleanup note: codec are very much broken rn Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com> --- .gitignore | 2 + MANIFEST.in | 1 + .../_internal/bento/build_dev_bentoml_whl.py | 5 +- bentoml/_internal/io_descriptors/base.py | 10 +- bentoml/_internal/io_descriptors/numpy.py | 57 +-- bentoml/_internal/server/__init__.py | 6 +- ...v_api_server.py => grpc_dev_api_server.py} | 0 bentoml/_internal/server/cli/http/__init__.py | 0 .../api_server.py => rest_api_server.py} | 2 +- ...v_api_server.py => rest_dev_api_server.py} | 0 .../server/grpc/interceptors/__init__.py | 230 +++++++++++ bentoml/_internal/server/grpc/server.py | 2 - bentoml/_internal/server/grpc/servicer.py | 16 +- bentoml/_internal/server/grpc/types.py | 98 +++++ bentoml/_internal/server/grpc/utils.py | 66 ---- bentoml/_internal/server/grpc_app.py | 13 +- bentoml/_internal/service/service.py | 6 + bentoml/_internal/utils/grpc/__init__.py | 58 +++ bentoml/_internal/utils/grpc/codec.py | 369 ++++++++++++++++++ bentoml/_internal/utils/grpc/serializer.py | 33 +- .../server/cli/grpc => grpc/v1}/__init__.py | 0 bentoml/grpc/v1/service.proto | 196 ++++++++-- bentoml/grpc/v1/service_pb2.pyi | 201 ---------- bentoml/grpc/v1/service_pb2_grpc.pyi | 69 ---- 
bentoml/grpc/v1/service_test.proto | 12 +- bentoml/grpc/v1/service_test_pb2.pyi | 114 ------ bentoml/grpc/v1/service_test_pb2_grpc.pyi | 69 ---- bentoml/grpc/v1/struct.proto | 116 +++--- bentoml/grpc/v1/struct_pb2.pyi | 284 -------------- bentoml/grpc/v1/types.proto | 61 --- bentoml/grpc/v1/types_pb2.pyi | 119 ------ bentoml/testing/server.py | 2 +- pyproject.toml | 1 + setup.cfg | 3 + setup.py | 21 +- 35 files changed, 1066 insertions(+), 1176 deletions(-) rename bentoml/_internal/server/cli/{grpc/dev_api_server.py => grpc_dev_api_server.py} (100%) delete mode 100644 bentoml/_internal/server/cli/http/__init__.py rename bentoml/_internal/server/cli/{http/api_server.py => rest_api_server.py} (98%) rename bentoml/_internal/server/cli/{http/dev_api_server.py => rest_dev_api_server.py} (100%) create mode 100644 bentoml/_internal/server/grpc/types.py delete mode 100644 bentoml/_internal/server/grpc/utils.py create mode 100644 bentoml/_internal/utils/grpc/codec.py rename bentoml/{_internal/server/cli/grpc => grpc/v1}/__init__.py (100%) delete mode 100644 bentoml/grpc/v1/service_pb2.pyi delete mode 100644 bentoml/grpc/v1/service_pb2_grpc.pyi delete mode 100644 bentoml/grpc/v1/service_test_pb2.pyi delete mode 100644 bentoml/grpc/v1/service_test_pb2_grpc.pyi delete mode 100644 bentoml/grpc/v1/struct_pb2.pyi delete mode 100644 bentoml/grpc/v1/types.proto delete mode 100644 bentoml/grpc/v1/types_pb2.pyi diff --git a/.gitignore b/.gitignore index 85e2e8aaa1d..38df946fcb1 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,5 @@ catboost_info # generated stub that is included in distribution *_pb2*.py +# also ignore generated stubs for typing +bentoml/grpc/**/*.pyi diff --git a/MANIFEST.in b/MANIFEST.in index 979de6e61c0..7b5d7154047 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,6 +5,7 @@ exclude CONTRIBUTING.md GOVERNANCE.md CODE_OF_CONDUCT.md DEVELOPMENT.md SECURITY exclude Makefile MANIFEST.in exclude *.yml exclude .git* +exclude bentoml/grpc/buf.yaml # Directories to 
exclude in PyPI package prune requirements diff --git a/bentoml/_internal/bento/build_dev_bentoml_whl.py b/bentoml/_internal/bento/build_dev_bentoml_whl.py index a4fd0e89864..ea03e6eb02f 100644 --- a/bentoml/_internal/bento/build_dev_bentoml_whl.py +++ b/bentoml/_internal/bento/build_dev_bentoml_whl.py @@ -43,15 +43,14 @@ def build_bentoml_editable_wheel(target_path: str) -> None: "BentoML is installed in `editable` mode; building BentoML distribution with the local BentoML code base. The built wheel file will be included in the target bento." ) try: - import build.env - from build import ProjectBuilder + from build.env import IsolatedEnvBuilder except ModuleNotFoundError: raise BentoMLException( f"Environment variable {BENTOML_DEV_BUILD}=True detected, which requires the `pypa/build` package. Make sure to install all dev dependencies via `pip install -r requirements/dev-requirements.txt` and try again." ) - with tempfile.TemporaryDirectory() as dist_dir, build.env.IsolatedEnvBuilder() as env: + with tempfile.TemporaryDirectory() as dist_dir, IsolatedEnvBuilder() as env: builder = ProjectBuilder(os.path.dirname(pyproject)) builder.python_executable = env.executable builder.scripts_dir = env.scripts_dir diff --git a/bentoml/_internal/io_descriptors/base.py b/bentoml/_internal/io_descriptors/base.py index 8385b21d9aa..09645c55e58 100644 --- a/bentoml/_internal/io_descriptors/base.py +++ b/bentoml/_internal/io_descriptors/base.py @@ -11,12 +11,12 @@ from starlette.requests import Request from starlette.responses import Response - from bentoml.grpc.v1.service_pb2 import CallRequest - from bentoml.grpc.v1.service_pb2 import CallResponse + from bentoml.grpc.v1.service_pb2 import InferenceRequest + from bentoml.grpc.v1.service_pb2 import InferenceResponse from ..types import LazyType from ..context import InferenceApiContext as Context - from ..server.grpc.utils import BentoServicerContext + from ..server.grpc.types import BentoServicerContext InputType = ( UnionType @@ 
-86,12 +86,12 @@ def generate_protobuf(self): @abstractmethod async def from_grpc_request( - self, request: CallRequest, context: BentoServicerContext + self, request: InferenceRequest, context: BentoServicerContext ) -> IOPyObj: ... @abstractmethod async def to_grpc_response( self, obj: IOPyObj, context: BentoServicerContext - ) -> CallResponse: + ) -> InferenceResponse: ... diff --git a/bentoml/_internal/io_descriptors/numpy.py b/bentoml/_internal/io_descriptors/numpy.py index 03265c729c6..dc011c3f9d5 100644 --- a/bentoml/_internal/io_descriptors/numpy.py +++ b/bentoml/_internal/io_descriptors/numpy.py @@ -18,13 +18,13 @@ from ...exceptions import InternalServerError if TYPE_CHECKING: - import grpc import numpy as np from bentoml.grpc.v1 import service_pb2 from .. import external_typing as ext from ..context import InferenceApiContext as Context + from ..server.grpc.types import BentoServicerContext else: np = LazyLoader("np", globals(), "numpy") @@ -251,36 +251,8 @@ async def to_http_response(self, obj: ext.NpNDArray, ctx: Context | None = None) else: return Response(json.dumps(obj.tolist()), media_type=MIME_TYPE_JSON) - def proto_to_arr(self, proto_arr): - """ - Convert given protobuf array to python list - """ - from google.protobuf.duration_pb2 import Duration - from google.protobuf.timestamp_pb2 import Timestamp - - from bentoml.grpc import service_pb2 - - array_type = self.WhichArray(proto_arr) - if not array_type: - raise ValueError("Provided array is either empty or invalid.") - - return_arr = [i for i in getattr(proto_arr, array_type)] - - if array_type == "timestamp_value": - return_arr = [Timestamp.ToDatetime(dt) for dt in return_arr] - elif array_type == "duration_value": - return_arr = [Duration.ToTimedelta(td) for td in return_arr] - - for i, item in enumerate(return_arr): - if isinstance(item, service_pb2.Array): - return_arr[i] = self.proto_to_arr(item) - elif isinstance(item, service_pb2.Tuple): - return_arr[i] = self.handle_tuple(item) - - 
return return_arr - async def from_grpc_request( - self, request: service_pb2.Request, context: grpc.ServicerContext + self, request: service_pb2.InferenceRequest, context: BentoServicerContext ) -> ext.NpNDArray: """ Process incoming protobuf request and convert it to `numpy.ndarray` @@ -292,29 +264,28 @@ async def from_grpc_request( a `numpy.ndarray` object. This can then be used inside users defined logics. """ - res: "ext.NpNDArray" - try: - res = np.array(self.proto_to_arr(request), dtype=self._dtype) - except ValueError: - res = np.array(self.proto_to_arr(request)) - res = self._verify_ndarray(res, BadInput) - return res + from bentoml.grpc.v1 import struct_pb2 + + from ..utils.grpc import proto_to_dict + + logger.info([f for f in struct_pb2.ContentsProto.DESCRIPTOR.fields]) + contents = proto_to_dict(request.contents) + return np.frombuffer(contents) async def to_grpc_response( - self, obj: ext.NpNDArray, context: grpc.ServicerContext - ) -> service_pb2.Response: + self, obj: ext.NpNDArray, context: BentoServicerContext + ) -> service_pb2.InferenceResponse: """ Process given objects and convert it to grpc protobuf response. 
Args: - obj (`np.ndarray`): - `np.ndarray` that will be serialized to protobuf + obj: `np.ndarray` that will be serialized to protobuf + context: grpc.aio.ServicerContext from grpc.aio.Server Returns: `io_descriptor_pb2.Array`: Protobuf representation of given `np.ndarray` """ - obj = self._verify_ndarray(obj, InternalServerError) - return self.arr_to_proto(obj) + pass def generate_protobuf(self): pass diff --git a/bentoml/_internal/server/__init__.py b/bentoml/_internal/server/__init__.py index 704681bc93e..276cb062f2e 100644 --- a/bentoml/_internal/server/__init__.py +++ b/bentoml/_internal/server/__init__.py @@ -29,9 +29,9 @@ SCRIPT_RUNNER = "bentoml._internal.server.cli.runner" -SCRIPT_API_SERVER = "bentoml._internal.server.cli.http.api_server" -SCRIPT_DEV_API_SERVER = "bentoml._internal.server.cli.http.dev_api_server" -SCRIPT_GRPC_DEV_API_SERVER = "bentoml._internal.server.cli.grpc.dev_api_server" +SCRIPT_API_SERVER = "bentoml._internal.server.cli.rest_api_server" +SCRIPT_DEV_API_SERVER = "bentoml._internal.server.cli.rest_dev_api_server" +SCRIPT_GRPC_DEV_API_SERVER = "bentoml._internal.server.cli.grpc_dev_api_server" MAX_AF_UNIX_PATH_LENGTH = 103 diff --git a/bentoml/_internal/server/cli/grpc/dev_api_server.py b/bentoml/_internal/server/cli/grpc_dev_api_server.py similarity index 100% rename from bentoml/_internal/server/cli/grpc/dev_api_server.py rename to bentoml/_internal/server/cli/grpc_dev_api_server.py diff --git a/bentoml/_internal/server/cli/http/__init__.py b/bentoml/_internal/server/cli/http/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/bentoml/_internal/server/cli/http/api_server.py b/bentoml/_internal/server/cli/rest_api_server.py similarity index 98% rename from bentoml/_internal/server/cli/http/api_server.py rename to bentoml/_internal/server/cli/rest_api_server.py index 1e891dbc43a..809d2d48e4c 100644 --- a/bentoml/_internal/server/cli/http/api_server.py +++ b/bentoml/_internal/server/cli/rest_api_server.py @@ 
-92,7 +92,7 @@ def main( watcher = Watcher( name="bento_api_server", cmd=sys.executable, - args=["-m", "bentoml._internal.server.cli.http.api_server"] + args=["-m", "bentoml._internal.server.cli.rest_api_server"] + unparse_click_params(params, ctx.command.params, factory=str), copy_env=True, numprocesses=1, diff --git a/bentoml/_internal/server/cli/http/dev_api_server.py b/bentoml/_internal/server/cli/rest_dev_api_server.py similarity index 100% rename from bentoml/_internal/server/cli/http/dev_api_server.py rename to bentoml/_internal/server/cli/rest_dev_api_server.py diff --git a/bentoml/_internal/server/grpc/interceptors/__init__.py b/bentoml/_internal/server/grpc/interceptors/__init__.py index e69de29bb2d..15008d6b1c1 100644 --- a/bentoml/_internal/server/grpc/interceptors/__init__.py +++ b/bentoml/_internal/server/grpc/interceptors/__init__.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import sys +import typing as t +import asyncio +import logging +from abc import ABCMeta +from abc import abstractmethod +from typing import TYPE_CHECKING + +import grpc +from grpc import aio + +from bentoml.exceptions import BentoMLException + +from ....utils.grpc import grpc_status_code +from ....utils.grpc import get_factory_and_method + +if TYPE_CHECKING: + from ..types import RequestType + from ..types import ResponseType + from ..types import RpcMethodHandler + from ..types import HandlerCallDetails + from ..types import BentoServicerContext + +AsyncClientInterceptorReturn = type( + "AsyncClientInterceptorReturn", (aio.Call, grpc.Future), {} +) + +logger = logging.getLogger(__name__) + + +# TODO: test cases. +class AsyncClientInterceptor(aio.UnaryUnaryClientInterceptor, metaclass=ABCMeta): + """ + Base class for BentoService client-side interceptors. + To implement, subclass this class and override ``intercept`` method. 
+ """ + + @abstractmethod + async def intercept( + self, + continuation: t.Callable[ + [aio.ClientCallDetails, RequestType], + t.Awaitable[AsyncClientInterceptorReturn], + ], + client_call_details: aio.ClientCallDetails, + request: RequestType, + ) -> AsyncClientInterceptorReturn: + """ + Override this method to implement an async client interceptor. + This method is currently only support unary RPCs. + + The interceptor implementation should call ``method`` via ``grpc.aio.ClientCallDetails`` and + the ``request`` object as parameters. + + The `request` parameter may be type checked to determine if this is a singluar request + for unary RPCs or an iterator for client-streaming or client-server streaming RPCs. + + Args: + continuation: A coroutine that proceeds with the invocation by + executing the next interceptor in the chain or invoking the + actual RPC on the underlying Channel. It is the interceptor's + responsibility to call it if it decides to move the RPC forward. + The interceptor can use + ``call = await continuation(client_call_details, request)`` + to continue with the RPC. `continuation` returns the call to the + RPC. + client_call_details: A ClientCallDetails object describing the outgoing RPC. + request: The request value for the RPC. + + Returns: + The type returns should match the type of return value received by calling ``method``. + This is an object of both ``grpc.aio.Call`` for the RPC and a ``grpc.Future``. + The actual result from the RPC can be got by calling ``.result()`` on the value returned from ``method``. 
+ """ + return await continuation(client_call_details, request) + + async def intercept_unary_unary( + self, + continuation: t.Callable[ + [aio.ClientCallDetails, ResponseType], + aio.UnaryUnaryCall[RequestType, ResponseType], + ], + client_call_details: aio.ClientCallDetails, + request: t.Any, + ) -> aio.UnaryUnaryCall[RequestType, ResponseType] | ResponseType: + """Implementation of grpc.UnaryUnaryClientInterceptor.""" + return await self.intercept(continuation, client_call_details, request) # type: ignore (unable to infer from mro) + + +# Modification from https://github.com/d5h-foss/grpc-interceptor +# added better typing that fits with BentoService signatures. +class AsyncServerInterceptor(aio.ServerInterceptor, metaclass=ABCMeta): + """ + Base class for BentoService server-side interceptors. + + To implement, subclass this class and override ``intercept`` method. + """ + + @abstractmethod + async def intercept( + self, + method: t.Callable[ + [RequestType, BentoServicerContext], t.Awaitable[ResponseType] + ], + request: RequestType, + context: BentoServicerContext, + method_name: str, + ) -> t.Any: + response_or_iterator = method(request, context) + if hasattr(response_or_iterator, "__aiter__"): + return response_or_iterator + else: + return await response_or_iterator + + # Implementation of grpc.ServerInterceptor, do not override. + async def intercept_service( + self, + continuation: t.Callable[[HandlerCallDetails], t.Awaitable[RpcMethodHandler]], + handler_call_details: HandlerCallDetails, + ) -> RpcMethodHandler: + """Implementation of grpc.aio.ServerInterceptor. + This is not part of the grpc_interceptor.AsyncServerInterceptor API, but must + have a public name. Do not override it, unless you know what you're doing. 
+ """ + next_handler = await continuation(handler_call_details) + handler_factory, next_handler_method = get_factory_and_method(next_handler) + + if next_handler.response_streaming: + + async def invoke_intercept_method( # type: ignore (function redefinition) + request: RequestType, context: BentoServicerContext + ) -> t.AsyncGenerator[ResponseType, None]: + method_name = handler_call_details.method + coroutine_or_asyncgen = self.intercept( + next_handler_method, request, context, method_name + ) + + # Async server streaming handlers return async_generator, because they + # use the async def + yield syntax. However, this is NOT a coroutine + # and hence is not awaitable. This can be a problem if the interceptor + # ignores the individual streaming response items and simply returns the + # result of method(request, context). In that case the interceptor IS a + # coroutine, and hence should be awaited. In both cases, we need + # something we can iterate over so that THIS function is an + # async_generator like the actual RPC method. + if asyncio.iscoroutine(coroutine_or_asyncgen): + asyncgen_or_none = await coroutine_or_asyncgen + # If a handler is using the read/write API, it will return None. 
+ if not asyncgen_or_none: + return + asyncgen = asyncgen_or_none + else: + asyncgen = coroutine_or_asyncgen + + async for r in asyncgen: + yield r + + else: + + async def invoke_intercept_method( + request: RequestType, context: BentoServicerContext + ) -> t.Awaitable[ResponseType]: + method_name = handler_call_details.method + return await self.intercept( + next_handler_method, + request, + context, + method_name, + ) + + return handler_factory( + invoke_intercept_method, + request_deserializer=next_handler.request_deserializer, + response_serializer=next_handler.response_serializer, + ) + + +class ExceptionHandlerInterceptor(AsyncServerInterceptor): + """An async interceptor that handles exceptions raised via BentoService.""" + + async def handle_exception( + self, + ex: BentoMLException, + context: BentoServicerContext, + method_name: str, + ) -> None: + """Handle an exception raised by a method. + + Args: + ex: The exception raised by the method. + context: The context of the RPC. + method_name: The name of the method. 
+ """ + logger.error( + f"Error while invoking {method_name}: {ex}", exc_info=sys.exc_info() + ) + await context.abort(code=grpc_status_code(ex), details=str(ex)) + + async def generate_responses( + self, + context: BentoServicerContext, + method_name: str, + response_iterator: t.AsyncIterable[ResponseType], + ) -> t.AsyncGenerator[t.Any, None]: + """Yield all the responses, but check for errors along the way.""" + try: + async for r in response_iterator: + yield r + except BentoMLException as ex: + await self.handle_exception(ex, context, method_name) + + async def intercept( + self, + method: t.Callable[ + [RequestType, BentoServicerContext], t.Awaitable[ResponseType] + ], + request: RequestType, + context: BentoServicerContext, + method_name: str, + ) -> t.Any: + try: + response_or_iterator = method(request, context) + if not hasattr(response_or_iterator, "__aiter__"): + return await response_or_iterator + except BentoMLException as ex: + await self.handle_exception(ex, context, method_name) + + return self.generate_responses(context, method_name, response_or_iterator) # type: ignore (unknown variable warning) diff --git a/bentoml/_internal/server/grpc/server.py b/bentoml/_internal/server/grpc/server.py index 5aacb3556f4..1a4be0ed102 100644 --- a/bentoml/_internal/server/grpc/server.py +++ b/bentoml/_internal/server/grpc/server.py @@ -62,7 +62,6 @@ async def startup(self) -> None: handler() await self.server.start() - logger.debug("GRPC server started.") async def shutdown(self): # Running on_startup callback. 
@@ -73,7 +72,6 @@ async def shutdown(self): handler() await self.server.stop(grace=self._grace_period) - logger.debug("GRPC server stopped.") async def wait_for_termination(self, timeout: int | None = None) -> bool: return await self.server.wait_for_termination(timeout=timeout) diff --git a/bentoml/_internal/server/grpc/servicer.py b/bentoml/_internal/server/grpc/servicer.py index 328429c43e6..6cf61e77178 100644 --- a/bentoml/_internal/server/grpc/servicer.py +++ b/bentoml/_internal/server/grpc/servicer.py @@ -9,26 +9,28 @@ from bentoml.exceptions import MissingDependencyException from bentoml._internal.service.service import Service -from .utils import handle_grpc_error from ...configuration import get_debug_mode if TYPE_CHECKING: - from .utils import BentoServicerContext + from .types import BentoServicerContext def register_bento_servicer(service: Service, server: aio.Server) -> None: + """ + This is the actual implementation of BentoServicer. + Main inference entrypoint will be invoked via /bentoml.grpc..BentoService/Inference + """ from bentoml.grpc.v1 import service_pb2 as _service_pb2 from bentoml.grpc.v1 import service_pb2_grpc as _service_pb2_grpc class BentoServiceServicer(_service_pb2_grpc.BentoServiceServicer): """An asyncio implementation of BentoService servicer.""" - @handle_grpc_error - async def Call( + async def Inference( # type: ignore (no async types) self, - request: _service_pb2.CallRequest, + request: _service_pb2.InferenceRequest, context: BentoServicerContext, - ) -> _service_pb2.CallResponse | None: + ) -> _service_pb2.InferenceResponse | None: if request.api_name not in service.apis: raise UnprocessableEntity( f"given 'api_name' is not defined in {service.name}", @@ -46,7 +48,7 @@ async def Call( response = await api.output.to_grpc_response(output, context) return response - _service_pb2_grpc.add_BentoServiceServicer_to_server(BentoServiceServicer(), server) + _service_pb2_grpc.add_BentoServiceServicer_to_server(BentoServiceServicer(), 
server) # type: ignore (lack of asyncio types) async def register_health_servicer(server: aio.Server) -> None: diff --git a/bentoml/_internal/server/grpc/types.py b/bentoml/_internal/server/grpc/types.py new file mode 100644 index 00000000000..4a77a1760af --- /dev/null +++ b/bentoml/_internal/server/grpc/types.py @@ -0,0 +1,98 @@ +""" +Specific types for BentoService gRPC server. +""" +from __future__ import annotations + +from typing import Tuple +from typing import TypeVar +from typing import Callable +from typing import Optional +from typing import Awaitable +from typing import NamedTuple +from typing import TYPE_CHECKING + +import grpc + + +class HandlerCallDetails( + NamedTuple("HandlerCallDetails", method=str, invocation_metadata=Tuple[str, bytes]), + grpc.HandlerCallDetails, +): + """Describes an RPC that has just arrived for service. + + Attributes: + method: The method name of the RPC. + invocation_metadata: A sequence of metadatum, a key-value pair included in the HTTP header. 
+ An example is: ``('binary-metadata-bin', b'\\x00\\xFF')`` + """ + + method: str + invocation_metadata: Tuple[str, bytes] + + +if TYPE_CHECKING: + from typing import Protocol + + from grpc import aio + + from bentoml.grpc.v1.service_pb2 import InferenceRequest + from bentoml.grpc.v1.service_pb2 import InferenceResponse + from bentoml.grpc.v1.service_pb2 import ServerLiveRequest + from bentoml.grpc.v1.service_pb2 import ServerLiveResponse + from bentoml.grpc.v1.service_pb2 import ServerReadyRequest + from bentoml.grpc.v1.service_pb2 import ServerReadyResponse + from bentoml.grpc.v1.service_pb2_grpc import BentoServiceServicer + + P = TypeVar("P", contravariant=True) + + ResponseType = InferenceResponse | ServerLiveResponse | ServerReadyResponse + RequestType = InferenceRequest | ServerLiveRequest | ServerReadyRequest + BentoServicerContext = aio.ServicerContext[ResponseType, RequestType] + + RequestDeserializerFn = Optional[Callable[[RequestType], object]] + ResponseSerializerFn = Optional[Callable[[bytes], ResponseType]] + + class AsyncHandlerProtocol(Protocol[P]): + def __call__( + self, + behaviour: Callable[[RequestType, BentoServicerContext], Awaitable[P]], + request_deserializer: RequestDeserializerFn = None, + response_serializer: ResponseSerializerFn = None, + ) -> grpc.RpcMethodHandler: + ... 
+ + AsyncHandlerMethod = AsyncHandlerProtocol[ResponseType] + + class RpcMethodHandler( + NamedTuple( + "RpcMethodHandler", + request_streaming=bool, + response_streaming=bool, + request_deserializer=RequestDeserializerFn, + response_serializer=ResponseSerializerFn, + unary_unary=Optional[AsyncHandlerMethod], + unary_stream=Optional[AsyncHandlerMethod], + stream_unary=Optional[AsyncHandlerMethod], + stream_stream=Optional[AsyncHandlerMethod], + ), + grpc.RpcMethodHandler, + ): + """An implementation of a single RPC method.""" + + request_streaming: bool + response_streaming: bool + request_deserializer: RequestDeserializerFn + response_serializer: ResponseSerializerFn + unary_unary: Optional[AsyncHandlerMethod] + unary_stream: Optional[AsyncHandlerMethod] + stream_unary: Optional[AsyncHandlerMethod] + stream_stream: Optional[AsyncHandlerMethod] + + __all__ = [ + "RequestType", + "ResponseType", + "BentoServicerContext", + "BentoServiceServicer", + "HandlerCallDetails", + "RpcMethodHandler", + ] diff --git a/bentoml/_internal/server/grpc/utils.py b/bentoml/_internal/server/grpc/utils.py deleted file mode 100644 index 4d6afb5cc4e..00000000000 --- a/bentoml/_internal/server/grpc/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import annotations - -import typing as t -import asyncio -import logging -from http import HTTPStatus -from typing import TYPE_CHECKING -from functools import wraps - -import grpc - -from bentoml.exceptions import BentoMLException - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from grpc import aio - - from bentoml.grpc.v1.service_pb2 import CallRequest - from bentoml.grpc.v1.service_pb2 import CallResponse - from bentoml.grpc.v1.service_pb2 import ServerLiveRequest - from bentoml.grpc.v1.service_pb2 import ServerLiveResponse - from bentoml.grpc.v1.service_pb2 import ServerReadyRequest - from bentoml.grpc.v1.service_pb2 import ServerReadyResponse - from bentoml.grpc.v1.service_pb2_grpc import BentoServiceServicer - - 
ResponseType = CallResponse | ServerLiveResponse | ServerReadyResponse - RequestType = CallRequest | ServerLiveRequest | ServerReadyRequest - BentoServicerContext = aio.ServicerContext[ResponseType, RequestType] - -_STATUS_CODE_MAPPING = { - HTTPStatus.BAD_REQUEST: grpc.StatusCode.INVALID_ARGUMENT, - HTTPStatus.INTERNAL_SERVER_ERROR: grpc.StatusCode.INTERNAL, - HTTPStatus.NOT_FOUND: grpc.StatusCode.NOT_FOUND, - HTTPStatus.UNPROCESSABLE_ENTITY: grpc.StatusCode.FAILED_PRECONDITION, -} - - -def grpc_status_code(err: BentoMLException) -> grpc.StatusCode: - """ - Convert BentoMLException.error_code to grpc.StatusCode. - """ - return _STATUS_CODE_MAPPING.get(err.error_code, grpc.StatusCode.UNKNOWN) - - -def handle_grpc_error(fn: t.Callable[..., t.Any]) -> t.Any: - """ - Decorator to handle grpc error. - """ - - @wraps(fn) - async def wrapper( - self: BentoServiceServicer, - request: RequestType, - context: BentoServicerContext, - ) -> ResponseType | None: - try: - if asyncio.iscoroutinefunction(fn): - return await fn(self, request, context) - else: - return fn(self, request, context) - except BentoMLException as e: - logger.error(e) - await context.abort(code=grpc_status_code(e), details=str(e)) - - return wrapper diff --git a/bentoml/_internal/server/grpc_app.py b/bentoml/_internal/server/grpc_app.py index 8c428851486..a224eba4f0d 100644 --- a/bentoml/_internal/server/grpc_app.py +++ b/bentoml/_internal/server/grpc_app.py @@ -53,7 +53,7 @@ def on_startup(self) -> OnStartup: on_startup: OnStartup = [ self.mark_as_ready, - self.bento_service.on_asgi_app_startup, + self.bento_service.on_grpc_server_startup, ] if BentoMLContainer.development_mode.get(): for runner in self.bento_service.runners: @@ -77,7 +77,7 @@ def on_startup(self) -> OnStartup: @property def on_shutdown(self) -> list[t.Callable[[], None]]: - on_shutdown = [self.bento_service.on_asgi_app_shutdown] + on_shutdown = [self.bento_service.on_grpc_server_shutdown] for runner in self.bento_service.runners: 
on_shutdown.append(runner.destroy) @@ -110,10 +110,13 @@ def options( @property def interceptors(self) -> list[aio.ServerInterceptor]: - interceptors: list[aio.ServerInterceptor] = [] + from .grpc.interceptors import ExceptionHandlerInterceptor + + # TODO: add access log, tracing, prometheus interceptors. + interceptors: list[aio.ServerInterceptor] = [ExceptionHandlerInterceptor()] + + # add users-defined interceptors. interceptors.extend( [interceptor() for interceptor in self.bento_service.interceptors] ) - - # TODO: add access log, tracing, prometheus interceptors. return interceptors diff --git a/bentoml/_internal/service/service.py b/bentoml/_internal/service/service.py index a1abe793196..24f8b439e64 100644 --- a/bentoml/_internal/service/service.py +++ b/bentoml/_internal/service/service.py @@ -210,6 +210,12 @@ def on_asgi_app_startup(self) -> None: def on_asgi_app_shutdown(self) -> None: pass + def on_grpc_server_startup(self) -> None: + pass + + def on_grpc_server_shutdown(self) -> None: + pass + @property def grpc_server(self) -> GRPCServer: from ..server.grpc_app import GRPCAppFactory diff --git a/bentoml/_internal/utils/grpc/__init__.py b/bentoml/_internal/utils/grpc/__init__.py index d231dd8f6c2..a5c940717e4 100644 --- a/bentoml/_internal/utils/grpc/__init__.py +++ b/bentoml/_internal/utils/grpc/__init__.py @@ -1,8 +1,48 @@ from __future__ import annotations import enum +import typing as t +import logging +from http import HTTPStatus +from typing import TYPE_CHECKING from dataclasses import dataclass +import grpc + +from bentoml.exceptions import BentoMLException + +from .serializer import proto_to_dict + +if TYPE_CHECKING: + from ...server.grpc.types import RequestType + from ...server.grpc.types import ResponseType + from ...server.grpc.types import RpcMethodHandler + from ...server.grpc.types import BentoServicerContext + +__all__ = [ + "grpc_status_code", + "parse_method_name", + "get_method_type", + "get_factory_and_method", + "proto_to_dict", +] + 
+logger = logging.getLogger(__name__) + +_STATUS_CODE_MAPPING = { + HTTPStatus.BAD_REQUEST: grpc.StatusCode.INVALID_ARGUMENT, + HTTPStatus.INTERNAL_SERVER_ERROR: grpc.StatusCode.INTERNAL, + HTTPStatus.NOT_FOUND: grpc.StatusCode.NOT_FOUND, + HTTPStatus.UNPROCESSABLE_ENTITY: grpc.StatusCode.FAILED_PRECONDITION, +} + + +def grpc_status_code(err: BentoMLException) -> grpc.StatusCode: + """ + Convert BentoMLException.error_code to grpc.StatusCode. + """ + return _STATUS_CODE_MAPPING.get(err.error_code, grpc.StatusCode.UNKNOWN) + class RpcMethodType(str, enum.Enum): UNARY = "UNARY" @@ -59,3 +99,21 @@ def get_method_type(request_streaming: bool, response_streaming: bool) -> str: return RpcMethodType.BIDI_STREAMING else: return RpcMethodType.UNKNOWN + + +def get_factory_and_method( + rpc_handler: RpcMethodHandler, +) -> tuple[ + t.Callable[..., t.Any], + t.Callable[[RequestType, BentoServicerContext], t.Awaitable[ResponseType]], +]: + if rpc_handler.unary_unary: + return grpc.unary_unary_rpc_method_handler, rpc_handler.unary_unary + elif rpc_handler.unary_stream: + return grpc.unary_stream_rpc_method_handler, rpc_handler.unary_stream + elif rpc_handler.stream_unary: + return grpc.stream_unary_rpc_method_handler, rpc_handler.stream_unary + elif rpc_handler.stream_stream: + return grpc.stream_stream_rpc_method_handler, rpc_handler.stream_stream + else: + raise BentoMLException(f"RPC method handler {rpc_handler} does not exist.") diff --git a/bentoml/_internal/utils/grpc/codec.py b/bentoml/_internal/utils/grpc/codec.py new file mode 100644 index 00000000000..44a9fc02c9f --- /dev/null +++ b/bentoml/_internal/utils/grpc/codec.py @@ -0,0 +1,369 @@ +from __future__ import annotations + +import typing as t +from abc import ABCMeta +from abc import abstractmethod +from http import HTTPStatus +from typing import TYPE_CHECKING +from functools import partial +from collections import namedtuple + +from google.protobuf.message import Message + +from bentoml.exceptions import 
BentoMLException + +from ..lazy_loader import LazyLoader + +if TYPE_CHECKING: + from bentoml.grpc.v1 import struct_pb2 + + CoderCallback = list[ + tuple[ + t.Callable[[t.Any], bool], + t.Callable[[t.Any], bytes] | t.Callable[[bytes], t.Any], + ] + ] +else: + struct_pb2 = LazyLoader("struct_pb2", globals(), "bentoml.grpc.v1.struct_pb2") + + +F = t.Callable[..., t.Any] +P = t.TypeVar("P") + +__all__ = ["can_encode", "encode_structure", "decode_proto", "register_codec"] + + +def register_codec(cdc: t.Type[Codec[t.Any]]) -> None: + # register a codec. + _codecs.append(cdc()) + + +def _get_encoders() -> CoderCallback: + return [(c.can_encode, c.do_encode) for c in _codecs] + + +def _get_decoders() -> CoderCallback: + return [(c.can_decode, c.do_decode) for c in _codecs] + + +def _map_structure(obj: t.Any, coders: CoderCallback) -> Message: + for can, do in coders: + if can(obj): + fn = partial(_map_structure, coders=coders) + return do(obj, fn) + raise EncodeError(f"No codec found for {str(obj)} of type {type(obj)}") + + +def can_encode(nested: t.Any) -> bool: + # Determines whether a nested structure can be encoded into a proto. + try: + encode_structure(nested) + except EncodeError: + return False + return True + + +def encode_structure(nested: t.Any) -> Message: + # encode given nested object to result protos. + return _map_structure(nested, _get_encoders()) + + +def decode_proto(proto: Message) -> t.Any: + # decode given proto to nested object. + return _map_structure(proto, _get_decoders()) + + +class EncodeError(BentoMLException): + """Error raised when a codec cannot encode an object or not implemented.""" + + error_code = HTTPStatus.NOT_IMPLEMENTED + + +class Codec(t.Generic[P], metaclass=ABCMeta): + @abstractmethod + def can_encode(self, obj: t.Any) -> bool: + """ + Check if the codec can encode the given object. + """ + + @abstractmethod + def do_encode(self, obj: P, encode_fn: F) -> struct_pb2.ContentsProto: + """ + Encode the given object. 
+ """ + + def can_decode(self, value: Message) -> bool: + """ + Check if the codec can decode the given bytes. + """ + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> P: + """ + Decode the given bytes. + """ + + +class _NoneCodec(Codec[None]): + def can_encode(self, obj: t.Any) -> bool: + return obj is None + + def do_encode(self, obj: None, encode_fn: F) -> struct_pb2.ContentsProto: + del obj, encode_fn + value = struct_pb2.ContentsProto() + value.none.CopyFrom(struct_pb2.NoneValue()) + return value + + def can_decode(self, value: Message) -> bool: + return value.HasField("none") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> None: + del value, decode_fn + return None + + +class _Float64Codec(Codec[float]): + def can_encode(self, obj: t.Any) -> bool: + return isinstance(obj, float) + + def do_encode(self, obj: float, encode_fn: F) -> struct_pb2.ContentsProto: + del encode_fn + val = struct_pb2.ContentsProto() + val.float64 = obj + return val + + def can_decode(self, value: Message) -> bool: + return value.HasField("float64") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> float: + del decode_fn + return float(value.float64) + + +class _Int64Codec(Codec[int]): + def can_encode(self, obj: t.Any) -> bool: + return not isinstance(obj, bool) and isinstance(obj, int) + + def do_encode(self, obj: int, encode_fn: F) -> struct_pb2.ContentsProto: + del encode_fn + value = struct_pb2.ContentsProto() + value.int64 = obj + return value + + def can_decode(self, value: Message) -> bool: + return value.HasField("int64") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> int: + del decode_fn + return int(value.int64) + + +class _StringCodec(Codec[str]): + def can_encode(self, obj: t.Any) -> bool: + return isinstance(obj, str) + + def do_encode(self, obj: str, encode_fn: F) -> struct_pb2.ContentsProto: + del encode_fn + val = struct_pb2.ContentsProto() + val.string = obj + return val 
+ + def can_decode(self, value: Message) -> bool: + return value.HasField("string") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> str: + del decode_fn + return str(value.string) + + +class _BoolCodec(Codec[bool]): + def can_encode(self, obj: t.Any) -> bool: + return isinstance(obj, bool) + + def do_encode(self, obj: bool, encode_fn: F) -> struct_pb2.ContentsProto: + del encode_fn + val = struct_pb2.ContentsProto() + val.bool = obj + return val + + def can_decode(self, value: Message) -> bool: + return value.HasField("bool") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> bool: + del decode_fn + return value.bool + + +class _ShapeCodec(Codec[t.Tuple[t.List[t.Tuple[int, str]], bool]]): + def can_encode(self, obj: t.Any) -> bool: + return _is_tuple(obj) + + def do_encode( + self, obj: t.Tuple[t.List[t.Tuple[int, str]], bool], encode_fn: F + ) -> struct_pb2.ContentsProto: + return struct_pb2.ContentsProto() + + def can_decode(self, value: Message) -> bool: + return value.HasField("shape") + + def do_decode( + self, value: struct_pb2.ContentsProto, decode_fn: F + ) -> t.Tuple[t.List[t.Tuple[int, str]], bool]: + return value + + +class _ListCodec(Codec[t.List[t.Any]]): + def can_encode(self, obj: t.Any) -> bool: + return isinstance(obj, list) + + def do_encode(self, obj: list[t.Any], encode_fn: F) -> struct_pb2.ContentsProto: + encoded = struct_pb2.ContentsProto() + encoded.list.CopyFrom(struct_pb2.ListValue()) + for element in obj: + encoded.list.values.add().CopyFrom(encode_fn(element)) + return encoded + + def can_decode(self, value: Message) -> bool: + return value.HasField("list") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> list[t.Any]: + return [decode_fn(element) for element in value.list.values] + + +class _TupleCodec(Codec[t.Tuple[t.Any, ...]]): + def can_encode(self, obj: t.Any) -> bool: + return _is_tuple(obj) + + def do_encode( + self, obj: tuple[t.Any, ...], encode_fn: F + ) 
-> struct_pb2.ContentsProto: + encoded = struct_pb2.ContentsProto() + encoded.tuple.CopyFrom(struct_pb2.TupleValue()) + for element in obj: + encoded.tuple.values.add().CopyFrom(encode_fn(element)) + return encoded + + def can_decode(self, value: Message) -> bool: + return value.HasField("tuple") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> tuple[t.Any]: + return tuple(decode_fn(element) for element in value.tuple.values) + + +def _is_tuple(obj: t.Any) -> bool: + return not _is_named_tuple(obj) and isinstance(obj, tuple) + + +def _is_named_tuple(instance: type) -> bool: + # Returns True iff `instance` is a `namedtuple`. + if not isinstance(instance, tuple): + return False + return ( + hasattr(instance, "_fields") + and isinstance(instance._fields, t.Sequence) + and all(isinstance(f, str) for f in instance._fields) # type: ignore + ) + + +class _DictCodec(Codec[t.Dict[str, t.Any]]): + def can_encode(self, obj: t.Any) -> bool: + return isinstance(obj, dict) + + def do_encode( + self, obj: dict[str, t.Any], encode_fn: F + ) -> struct_pb2.ContentsProto: + encoded = struct_pb2.ContentsProto() + encoded.dict.CopyFrom(struct_pb2.DictValue()) + for key, value in obj.items(): + encoded.dict.fields[key].CopyFrom(encode_fn(value)) + return encoded + + def can_decode(self, value: Message) -> bool: + return value.HasField("dict") + + def do_decode( + self, value: struct_pb2.ContentsProto, decode_fn: F + ) -> dict[str, t.Any]: + return {key: decode_fn(val) for key, val in value.dict.fields.items()} + + +class _NamedTupleCodec(Codec[t.NamedTuple]): + def can_encode(self, obj: t.Any) -> bool: + return _is_named_tuple(obj) + + def do_encode(self, obj: t.NamedTuple, encode_fn: F) -> struct_pb2.ContentsProto: + encoded = struct_pb2.ContentsProto() + encoded.namedtuple.CopyFrom(struct_pb2.NamedTupleValue()) + encoded.namedtuple.name = obj.__class__.__name__ + for key in obj._fields: + pair = encoded.namedtuple.values.add() + pair.key = key + 
pair.value.CopyFrom(encode_fn(obj._asdict()[key])) + return encoded + + def can_decode(self, value: Message) -> bool: + return value.HasField("namedtuple") + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> t.NamedTuple: + kv_pairs = value.namedtuple.values + items = [(pair.key, decode_fn(pair.value)) for pair in kv_pairs] + # TODO: makes this type safe by using t.NamedTuple + namedtuple_klass = namedtuple( # type: ignore (no type support yet) + value.namedtuple.name, list(map(lambda x: x[0], items)) + ) + return namedtuple_klass(**dict(items)) + + +class _DataFrameCodec(Codec[t.List[str]]): + _df_type: t.Literal["dataframe_columns", "dataframe_indices"] + + def __new__(cls): + assert cls._df_type in ["dataframe_columns", "dataframe_indices"] + if cls._df_type == "dataframe_columns": + res = object.__new__(_DataFrameColumnsCodec) + elif cls._df_type == "dataframe_indices": + res = object.__new__(_DataFrameIndicesCodec) + else: + raise EncodeError("Unsupported DataFrame type.") + return res + + def can_encode(self, obj: t.Iterable[t.Any]) -> bool: + return isinstance(obj, list) and all(isinstance(x, str) for x in obj) + + def do_encode(self, obj: list[t.Any], encode_fn: F) -> struct_pb2.ContentsProto: + del encode_fn + encoded = struct_pb2.ContentsProto() + df = getattr(encoded, self._df_type) + df.CopyFrom(struct_pb2.ListValue()) + for element in obj: + df.values.add().CopyFrom(str(element)) + return encoded + + def can_decode(self, value: Message) -> bool: + return value.HasField(self._df_type) + + def do_decode(self, value: struct_pb2.ContentsProto, decode_fn: F) -> list[str]: + del decode_fn + return [str(element) for element in getattr(value, self._df_type).values] + + +class _DataFrameColumnsCodec(_DataFrameCodec): + _df_type = "dataframe_columns" + + +class _DataFrameIndicesCodec(_DataFrameCodec): + _df_type = "dataframe_indices" + + +_codecs: list[Codec[t.Any]] = [ + _NoneCodec(), + _Float64Codec(), + _Int64Codec(), + _StringCodec(), + 
_BoolCodec(), + _ListCodec(), + _TupleCodec(), + _DictCodec(), + _NamedTupleCodec(), + _DataFrameColumnsCodec(), + _DataFrameIndicesCodec(), +] diff --git a/bentoml/_internal/utils/grpc/serializer.py b/bentoml/_internal/utils/grpc/serializer.py index 9e78c1558e1..ba8d37932f2 100644 --- a/bentoml/_internal/utils/grpc/serializer.py +++ b/bentoml/_internal/utils/grpc/serializer.py @@ -1,25 +1,16 @@ from __future__ import annotations -import builtins +import typing as t +from typing import TYPE_CHECKING -from google.protobuf.descriptor import FieldDescriptor +if TYPE_CHECKING: + from google.protobuf.message import Message -# defines a descriptor type to python native type. -TYPE_CALLABLE_MAP: dict[int, type] = { - FieldDescriptor.TYPE_DOUBLE: builtins.float, - FieldDescriptor.TYPE_FLOAT: builtins.float, - FieldDescriptor.TYPE_INT32: builtins.int, - FieldDescriptor.TYPE_INT64: builtins.int, - FieldDescriptor.TYPE_UINT32: builtins.int, - FieldDescriptor.TYPE_UINT64: builtins.int, - FieldDescriptor.TYPE_SINT32: builtins.int, - FieldDescriptor.TYPE_SINT64: builtins.int, - FieldDescriptor.TYPE_FIXED32: builtins.int, - FieldDescriptor.TYPE_FIXED64: builtins.int, - FieldDescriptor.TYPE_SFIXED32: builtins.int, - FieldDescriptor.TYPE_SFIXED64: builtins.int, - FieldDescriptor.TYPE_BOOL: builtins.bool, - FieldDescriptor.TYPE_STRING: builtins.str, - FieldDescriptor.TYPE_BYTES: builtins.bytes, - FieldDescriptor.TYPE_ENUM: builtins.int, -} + +def proto_to_dict(msg: Message, **kwargs: t.Any) -> dict[str, t.Any]: + from google.protobuf.json_format import MessageToDict + + if "preserving_proto_field_name" not in kwargs: + kwargs.setdefault("preserving_proto_field_name", True) + + return MessageToDict(msg, **kwargs) diff --git a/bentoml/_internal/server/cli/grpc/__init__.py b/bentoml/grpc/v1/__init__.py similarity index 100% rename from bentoml/_internal/server/cli/grpc/__init__.py rename to bentoml/grpc/v1/__init__.py diff --git a/bentoml/grpc/v1/service.proto 
b/bentoml/grpc/v1/service.proto index fd34a2bfe05..86ffaa6ba6f 100644 --- a/bentoml/grpc/v1/service.proto +++ b/bentoml/grpc/v1/service.proto @@ -3,6 +3,9 @@ syntax = "proto3"; package bentoml.grpc.v1; import "bentoml/grpc/v1/struct.proto"; +import "google/protobuf/any.proto"; +import "google/rpc/error_details.proto"; +import "google/rpc/status.proto"; // cc_enable_arenas pre-allocate memory for given message to improve speed. (C++ only) option cc_enable_arenas = true; @@ -22,11 +25,8 @@ service BentoService { // Check server readiness rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} - // Call handles unary API. - rpc Call(CallRequest) returns (CallResponse) {} - - // CallStream handles streaming API. - rpc CallStream(stream CallStreamRequest) returns (stream CallStreamResponse) {} + // Inference handles unary API. + rpc Inference(InferenceRequest) returns (InferenceResponse) {} } // request for ServerLive that takes no arguments. @@ -45,31 +45,175 @@ message ServerReadyResponse { bool ready = 1; } -// Request for Call. -message CallRequest { +// Request for Inference. +message InferenceRequest { // a given API route the rpc request is sent to. - string api_name = 1; + string api_name = 14; + string api_version = 15; // representation of the input value. - bentoml.grpc.v1.StructuredValue contents = 2; -} - -// Response from Call. -message CallResponse { - // representation of the output value. - bentoml.grpc.v1.StructuredValue contents = 1; + oneof contents { + // Serialized bytes contents. + bytes bytes_contents = 1; + + // DT_HALF, DT_BFLOAT16 + int32 half_contents = 10; + + // DT_FLOAT. + float float_contents = 2; + + // DT_DOUBLE. + double double_contents = 3; + + // DT_INT32, DT_INT16, DT_UINT16, DT_INT8, DT_UINT8. + int32 int_contents = 4; + + // DT_STRING + bytes string_contents = 5; + + // DT_COMPLEX64. 
+    float scomplex_contents = 6;
+
+    // DT_INT64
+    int64 int64_contents = 7;
+
+    // DT_BOOL
+    bool bool_contents = 8;
+
+    // DT_COMPLEX128
+    double dcomplex_contents = 9;
+
+    // DT_UINT32
+    uint32 uint32_contents = 11;
+
+    // DT_UINT64
+    uint64 uint64_contents = 12;
+
+    // Represents arbitrary Python data structures.
+    NoneValue none_contents = 50;
+    ListValue list_contents = 51;
+    TupleValue tuple_contents = 52;
+    DictValue dict_contents = 53;
+    NamedTupleValue namedtuple_contents = 54;
+
+    google.protobuf.Any any_contents = 55;
+  }
+
+  // The data contained in an input can be represented in
+  // "raw" bytes form or in the repeated type that matches the
+  // data type.
+  // Using the "raw" bytes form will typically allow higher performance due to the way protobuf
+  // allocation and reuse interacts with GRPC.
+  // For example, see https://github.com/grpc/grpc/issues/23231.
+  //
+  // To use the raw representation 'raw_bytes_contents' must be
+  // initialized with data for each tensor in the same order as
+  // 'inputs'. For each tensor, the size of this content must
+  // match what is expected by the tensor's shape and data
+  // type. The raw data must be the flattened, one-dimensional,
+  // row-major order of the tensor elements without any stride
+  // or padding between the elements.
+  //
+  // Note that the FP16 and BF16 data
+  // types must be represented as raw content as there is no
+  // specific data type for a 16-bit float type.
+  //
+  // If this field is specified then contents must not be specified for any input tensor.
+  repeated bytes raw_bytes_contents = 13;
+
+  // dataframe_columns and dataframe_indices are used
+  // in conjunction with contents to represent a dataframe.
+  // Recommendation: for better performance, use raw_bytes_contents in
+  // conjunction with the below fields.
+  repeated string dataframe_columns = 101;
+  repeated string dataframe_indices = 102;
 }
 
-// Request message for CallStream.
-message CallStreamRequest { - CallRequest stream_inputs = 1; -} - -// Response message for CallStream. -message CallStreamResponse { - // message describing the error. Empty message signals the call was successful without error. - string error_message = 1; - +// Response from Inference. +message InferenceResponse { // representation of the output value. - CallResponse stream_outputs = 2; + // representation of the input value. + oneof contents { + // Serialized bytes contents. + bytes bytes_contents = 1; + + // DT_HALF, DT_BFLOAT16 + int32 half_contents = 10; + + // DT_FLOAT. + float float_contents = 2; + + // DT_DOUBLE. + double double_contents = 3; + + // DT_INT32, DT_INT16, DT_UINT16, DT_INT8, DT_UINT8. + int32 int_contents = 4; + + // DT_STRING + bytes string_contents = 5; + + // DT_COMPLEX64. + float scomplex_contents = 6; + + // DT_INT64 + int64 int64_contents = 7; + + // DT_BOOL + bool bool_contents = 8; + + // DT_COMPLEX128 + double dcomplex_contents = 9; + + // DT_UINT32 + uint32 uint32_contents = 11; + + // DT_UINT64 + uint64 uint64_contents = 12; + + // Represents arbitrary Python data structures. + NoneValue none_contents = 50; + ListValue list_contents = 51; + TupleValue tuple_contents = 52; + DictValue dict_contents = 53; + NamedTupleValue namedtuple_contents = 54; + + google.protobuf.Any any_contents = 55; + } + + // The data contained in an output can be represented in + // "raw" bytes form or in the repeated type that matches the + // data type. + // Using the "raw" bytes form will typically allow higher performance due to the way protobuf + // allocation and reuse interacts with GRPC. + // For example, see https://github.com/grpc/grpc/issues/23231. + // + // To use the raw representation 'raw_input_contents' must be + // initialized with data for each tensor in the same order as + // 'inputs'. For each tensor, the size of this content must + // match what is expected by the tensor's shape and data + // type. 
The raw data must be the flattened, one-dimensional, + // row-major order of the tensor elements without any stride + // or padding between the elements. + // + // Note that the FP16 and BF16 data + // types must be represented as raw content as there is no + // specific data type for a 16-bit float type. + // + // If this field is specified then contents must not be specified for any input tensor. + repeated bytes raw_bytes_contents = 13; + + // Sends a rpc status back to the client. + google.rpc.Status status = 14; + + // the response should also include an error message type + oneof errors { + google.rpc.RetryInfo retry_info = 100; + google.rpc.DebugInfo debug_info = 101; + google.rpc.QuotaFailure quota_failure = 102; + google.rpc.ErrorInfo error_info = 103; + google.rpc.PreconditionFailure precondition_failure = 104; + google.rpc.BadRequest bad_request = 105; + google.rpc.RequestInfo request_info = 106; + google.rpc.LocalizedMessage localized_message = 107; + } } diff --git a/bentoml/grpc/v1/service_pb2.pyi b/bentoml/grpc/v1/service_pb2.pyi deleted file mode 100644 index 5f550d2962d..00000000000 --- a/bentoml/grpc/v1/service_pb2.pyi +++ /dev/null @@ -1,201 +0,0 @@ -""" -@generated by mypy-protobuf. Do not edit manually! -isort:skip_file -""" -import abc -import bentoml.grpc.v1.struct_pb2 -import builtins -import concurrent.futures -import google.protobuf.descriptor -import google.protobuf.message -import google.protobuf.service -import typing -import typing_extensions - -DESCRIPTOR: google.protobuf.descriptor.FileDescriptor - -class ServerLiveRequest(google.protobuf.message.Message): - """request for ServerLive that takes no arguments.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - def __init__(self, - ) -> None: ... 
-global___ServerLiveRequest = ServerLiveRequest - -class ServerLiveResponse(google.protobuf.message.Message): - """response for ServerLive returns a boolean determine server's liveliness.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - LIVE_FIELD_NUMBER: builtins.int - live: builtins.bool - def __init__(self, - *, - live: builtins.bool = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["live",b"live"]) -> None: ... -global___ServerLiveResponse = ServerLiveResponse - -class ServerReadyRequest(google.protobuf.message.Message): - """request for ServerReady that takes no arguments.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - def __init__(self, - ) -> None: ... -global___ServerReadyRequest = ServerReadyRequest - -class ServerReadyResponse(google.protobuf.message.Message): - """response for ServerReady returns a boolean determine server's readiness.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - READY_FIELD_NUMBER: builtins.int - ready: builtins.bool - def __init__(self, - *, - ready: builtins.bool = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["ready",b"ready"]) -> None: ... -global___ServerReadyResponse = ServerReadyResponse - -class CallRequest(google.protobuf.message.Message): - """Request for Call.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - API_NAME_FIELD_NUMBER: builtins.int - CONTENTS_FIELD_NUMBER: builtins.int - api_name: typing.Text - """a given API route the rpc request is sent to.""" - - @property - def contents(self) -> bentoml.grpc.v1.struct_pb2.StructuredValue: - """representation of the input value.""" - pass - def __init__(self, - *, - api_name: typing.Text = ..., - contents: typing.Optional[bentoml.grpc.v1.struct_pb2.StructuredValue] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["contents",b"contents"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["api_name",b"api_name","contents",b"contents"]) -> None: ... -global___CallRequest = CallRequest - -class CallResponse(google.protobuf.message.Message): - """Response from Call.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - CONTENTS_FIELD_NUMBER: builtins.int - @property - def contents(self) -> bentoml.grpc.v1.struct_pb2.StructuredValue: - """representation of the output value.""" - pass - def __init__(self, - *, - contents: typing.Optional[bentoml.grpc.v1.struct_pb2.StructuredValue] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["contents",b"contents"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["contents",b"contents"]) -> None: ... -global___CallResponse = CallResponse - -class CallStreamRequest(google.protobuf.message.Message): - """Request message for CallStream.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - STREAM_INPUTS_FIELD_NUMBER: builtins.int - @property - def stream_inputs(self) -> global___CallRequest: ... - def __init__(self, - *, - stream_inputs: typing.Optional[global___CallRequest] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["stream_inputs",b"stream_inputs"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["stream_inputs",b"stream_inputs"]) -> None: ... -global___CallStreamRequest = CallStreamRequest - -class CallStreamResponse(google.protobuf.message.Message): - """Response message for CallStream.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - ERROR_MESSAGE_FIELD_NUMBER: builtins.int - STREAM_OUTPUTS_FIELD_NUMBER: builtins.int - error_message: typing.Text - """message describing the error. 
Empty message signals the call was successful without error.""" - - @property - def stream_outputs(self) -> global___CallResponse: - """representation of the output value.""" - pass - def __init__(self, - *, - error_message: typing.Text = ..., - stream_outputs: typing.Optional[global___CallResponse] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["stream_outputs",b"stream_outputs"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["error_message",b"error_message","stream_outputs",b"stream_outputs"]) -> None: ... -global___CallStreamResponse = CallStreamResponse - -class BentoService(google.protobuf.service.Service, metaclass=abc.ABCMeta): - """a gRPC BentoServer.""" - DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor - @abc.abstractmethod - def ServerLive( - inst: BentoService, - rpc_controller: google.protobuf.service.RpcController, - request: global___ServerLiveRequest, - callback: typing.Optional[typing.Callable[[global___ServerLiveResponse], None]], - ) -> concurrent.futures.Future[global___ServerLiveResponse]: - """Check server liveliness.""" - pass - @abc.abstractmethod - def ServerReady( - inst: BentoService, - rpc_controller: google.protobuf.service.RpcController, - request: global___ServerReadyRequest, - callback: typing.Optional[typing.Callable[[global___ServerReadyResponse], None]], - ) -> concurrent.futures.Future[global___ServerReadyResponse]: - """Check server readiness""" - pass - @abc.abstractmethod - def Call( - inst: BentoService, - rpc_controller: google.protobuf.service.RpcController, - request: global___CallRequest, - callback: typing.Optional[typing.Callable[[global___CallResponse], None]], - ) -> concurrent.futures.Future[global___CallResponse]: - """Call handles unary API.""" - pass - @abc.abstractmethod - def CallStream( - inst: BentoService, - rpc_controller: google.protobuf.service.RpcController, - request: global___CallStreamRequest, - callback: 
typing.Optional[typing.Callable[[global___CallStreamResponse], None]], - ) -> concurrent.futures.Future[global___CallStreamResponse]: - """CallStream handles streaming API.""" - pass -class BentoService_Stub(BentoService): - """a gRPC BentoServer.""" - def __init__(self, rpc_channel: google.protobuf.service.RpcChannel) -> None: ... - DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor - def ServerLive( - inst: BentoService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___ServerLiveRequest, - callback: typing.Optional[typing.Callable[[global___ServerLiveResponse], None]] = None, - ) -> concurrent.futures.Future[global___ServerLiveResponse]: - """Check server liveliness.""" - pass - def ServerReady( - inst: BentoService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___ServerReadyRequest, - callback: typing.Optional[typing.Callable[[global___ServerReadyResponse], None]] = None, - ) -> concurrent.futures.Future[global___ServerReadyResponse]: - """Check server readiness""" - pass - def Call( - inst: BentoService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___CallRequest, - callback: typing.Optional[typing.Callable[[global___CallResponse], None]] = None, - ) -> concurrent.futures.Future[global___CallResponse]: - """Call handles unary API.""" - pass - def CallStream( - inst: BentoService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___CallStreamRequest, - callback: typing.Optional[typing.Callable[[global___CallStreamResponse], None]] = None, - ) -> concurrent.futures.Future[global___CallStreamResponse]: - """CallStream handles streaming API.""" - pass \ No newline at end of file diff --git a/bentoml/grpc/v1/service_pb2_grpc.pyi b/bentoml/grpc/v1/service_pb2_grpc.pyi deleted file mode 100644 index 659c3d587a2..00000000000 --- a/bentoml/grpc/v1/service_pb2_grpc.pyi +++ /dev/null @@ -1,69 +0,0 @@ -""" -@generated by mypy-protobuf. 
Do not edit manually! -isort:skip_file -""" -import abc -import bentoml.grpc.v1.service_pb2 -import grpc -import typing - -class BentoServiceStub: - """a gRPC BentoServer.""" - def __init__(self, channel: grpc.Channel) -> None: ... - ServerLive: grpc.UnaryUnaryMultiCallable[ - bentoml.grpc.v1.service_pb2.ServerLiveRequest, - bentoml.grpc.v1.service_pb2.ServerLiveResponse] - """Check server liveliness.""" - - ServerReady: grpc.UnaryUnaryMultiCallable[ - bentoml.grpc.v1.service_pb2.ServerReadyRequest, - bentoml.grpc.v1.service_pb2.ServerReadyResponse] - """Check server readiness""" - - Call: grpc.UnaryUnaryMultiCallable[ - bentoml.grpc.v1.service_pb2.CallRequest, - bentoml.grpc.v1.service_pb2.CallResponse] - """Call handles unary API.""" - - CallStream: grpc.StreamStreamMultiCallable[ - bentoml.grpc.v1.service_pb2.CallStreamRequest, - bentoml.grpc.v1.service_pb2.CallStreamResponse] - """CallStream handles streaming API.""" - - -class BentoServiceServicer(metaclass=abc.ABCMeta): - """a gRPC BentoServer.""" - @abc.abstractmethod - def ServerLive(self, - request: bentoml.grpc.v1.service_pb2.ServerLiveRequest, - context: grpc.ServicerContext, - ) -> bentoml.grpc.v1.service_pb2.ServerLiveResponse: - """Check server liveliness.""" - pass - - @abc.abstractmethod - def ServerReady(self, - request: bentoml.grpc.v1.service_pb2.ServerReadyRequest, - context: grpc.ServicerContext, - ) -> bentoml.grpc.v1.service_pb2.ServerReadyResponse: - """Check server readiness""" - pass - - @abc.abstractmethod - def Call(self, - request: bentoml.grpc.v1.service_pb2.CallRequest, - context: grpc.ServicerContext, - ) -> bentoml.grpc.v1.service_pb2.CallResponse: - """Call handles unary API.""" - pass - - @abc.abstractmethod - def CallStream(self, - request_iterator: typing.Iterator[bentoml.grpc.v1.service_pb2.CallStreamRequest], - context: grpc.ServicerContext, - ) -> typing.Iterator[bentoml.grpc.v1.service_pb2.CallStreamResponse]: - """CallStream handles streaming API.""" - pass - - -def 
add_BentoServiceServicer_to_server(servicer: BentoServiceServicer, server: grpc.Server) -> None: ... diff --git a/bentoml/grpc/v1/service_test.proto b/bentoml/grpc/v1/service_test.proto index 12c11e2ee6d..2674e2035da 100644 --- a/bentoml/grpc/v1/service_test.proto +++ b/bentoml/grpc/v1/service_test.proto @@ -9,23 +9,23 @@ option optimize_for = SPEED; option py_generic_services = true; // Represents a request for TestService. -message TestRequest { +message ExecuteRequest { string input = 1; } // Represents a response from TestService. -message TestResponse { +message ExecuteResponse { string output = 1; } // Use for testing interceptors per RPC call. service TestService { // Unary API - rpc Execute(TestRequest) returns (TestResponse); + rpc Execute(ExecuteRequest) returns (ExecuteResponse); // Client-streaming API - rpc ClientStreamExecute(stream TestRequest) returns (TestResponse); + rpc ClientStreamExecute(stream ExecuteRequest) returns (ExecuteResponse); // Server-streaming API - rpc ServerStreamExecute(TestRequest) returns (stream TestResponse); + rpc ServerStreamExecute(ExecuteRequest) returns (stream ExecuteResponse); // Bidirectional streaming API - rpc BidiStreamExecute(stream TestRequest) returns (stream TestResponse); + rpc BidiStreamExecute(stream ExecuteRequest) returns (stream ExecuteResponse); } diff --git a/bentoml/grpc/v1/service_test_pb2.pyi b/bentoml/grpc/v1/service_test_pb2.pyi deleted file mode 100644 index 54190501b7c..00000000000 --- a/bentoml/grpc/v1/service_test_pb2.pyi +++ /dev/null @@ -1,114 +0,0 @@ -""" -@generated by mypy-protobuf. Do not edit manually! 
-isort:skip_file -""" -import abc -import builtins -import concurrent.futures -import google.protobuf.descriptor -import google.protobuf.message -import google.protobuf.service -import typing -import typing_extensions - -DESCRIPTOR: google.protobuf.descriptor.FileDescriptor - -class TestRequest(google.protobuf.message.Message): - """Represents a request for TestService.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - INPUT_FIELD_NUMBER: builtins.int - input: typing.Text - def __init__(self, - *, - input: typing.Text = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["input",b"input"]) -> None: ... -global___TestRequest = TestRequest - -class TestResponse(google.protobuf.message.Message): - """Represents a response from TestService.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - OUTPUT_FIELD_NUMBER: builtins.int - output: typing.Text - def __init__(self, - *, - output: typing.Text = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["output",b"output"]) -> None: ... 
-global___TestResponse = TestResponse - -class TestService(google.protobuf.service.Service, metaclass=abc.ABCMeta): - """Use for testing interceptors per RPC call.""" - DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor - @abc.abstractmethod - def Execute( - inst: TestService, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]], - ) -> concurrent.futures.Future[global___TestResponse]: - """Unary API""" - pass - @abc.abstractmethod - def ClientStreamExecute( - inst: TestService, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]], - ) -> concurrent.futures.Future[global___TestResponse]: - """Client-streaming API""" - pass - @abc.abstractmethod - def ServerStreamExecute( - inst: TestService, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]], - ) -> concurrent.futures.Future[global___TestResponse]: - """Server-streaming API""" - pass - @abc.abstractmethod - def BidiStreamExecute( - inst: TestService, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]], - ) -> concurrent.futures.Future[global___TestResponse]: - """Bidirectional streaming API""" - pass -class TestService_Stub(TestService): - """Use for testing interceptors per RPC call.""" - def __init__(self, rpc_channel: google.protobuf.service.RpcChannel) -> None: ... 
- DESCRIPTOR: google.protobuf.descriptor.ServiceDescriptor - def Execute( - inst: TestService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]] = None, - ) -> concurrent.futures.Future[global___TestResponse]: - """Unary API""" - pass - def ClientStreamExecute( - inst: TestService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]] = None, - ) -> concurrent.futures.Future[global___TestResponse]: - """Client-streaming API""" - pass - def ServerStreamExecute( - inst: TestService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]] = None, - ) -> concurrent.futures.Future[global___TestResponse]: - """Server-streaming API""" - pass - def BidiStreamExecute( - inst: TestService_Stub, - rpc_controller: google.protobuf.service.RpcController, - request: global___TestRequest, - callback: typing.Optional[typing.Callable[[global___TestResponse], None]] = None, - ) -> concurrent.futures.Future[global___TestResponse]: - """Bidirectional streaming API""" - pass \ No newline at end of file diff --git a/bentoml/grpc/v1/service_test_pb2_grpc.pyi b/bentoml/grpc/v1/service_test_pb2_grpc.pyi deleted file mode 100644 index d2249871b34..00000000000 --- a/bentoml/grpc/v1/service_test_pb2_grpc.pyi +++ /dev/null @@ -1,69 +0,0 @@ -""" -@generated by mypy-protobuf. Do not edit manually! -isort:skip_file -""" -import abc -import bentoml.grpc.v1.service_test_pb2 -import grpc -import typing - -class TestServiceStub: - """Use for testing interceptors per RPC call.""" - def __init__(self, channel: grpc.Channel) -> None: ... 
- Execute: grpc.UnaryUnaryMultiCallable[ - bentoml.grpc.v1.service_test_pb2.TestRequest, - bentoml.grpc.v1.service_test_pb2.TestResponse] - """Unary API""" - - ClientStreamExecute: grpc.StreamUnaryMultiCallable[ - bentoml.grpc.v1.service_test_pb2.TestRequest, - bentoml.grpc.v1.service_test_pb2.TestResponse] - """Client-streaming API""" - - ServerStreamExecute: grpc.UnaryStreamMultiCallable[ - bentoml.grpc.v1.service_test_pb2.TestRequest, - bentoml.grpc.v1.service_test_pb2.TestResponse] - """Server-streaming API""" - - BidiStreamExecute: grpc.StreamStreamMultiCallable[ - bentoml.grpc.v1.service_test_pb2.TestRequest, - bentoml.grpc.v1.service_test_pb2.TestResponse] - """Bidirectional streaming API""" - - -class TestServiceServicer(metaclass=abc.ABCMeta): - """Use for testing interceptors per RPC call.""" - @abc.abstractmethod - def Execute(self, - request: bentoml.grpc.v1.service_test_pb2.TestRequest, - context: grpc.ServicerContext, - ) -> bentoml.grpc.v1.service_test_pb2.TestResponse: - """Unary API""" - pass - - @abc.abstractmethod - def ClientStreamExecute(self, - request_iterator: typing.Iterator[bentoml.grpc.v1.service_test_pb2.TestRequest], - context: grpc.ServicerContext, - ) -> bentoml.grpc.v1.service_test_pb2.TestResponse: - """Client-streaming API""" - pass - - @abc.abstractmethod - def ServerStreamExecute(self, - request: bentoml.grpc.v1.service_test_pb2.TestRequest, - context: grpc.ServicerContext, - ) -> typing.Iterator[bentoml.grpc.v1.service_test_pb2.TestResponse]: - """Server-streaming API""" - pass - - @abc.abstractmethod - def BidiStreamExecute(self, - request_iterator: typing.Iterator[bentoml.grpc.v1.service_test_pb2.TestRequest], - context: grpc.ServicerContext, - ) -> typing.Iterator[bentoml.grpc.v1.service_test_pb2.TestResponse]: - """Bidirectional streaming API""" - pass - - -def add_TestServiceServicer_to_server(servicer: TestServiceServicer, server: grpc.Server) -> None: ... 
diff --git a/bentoml/grpc/v1/struct.proto b/bentoml/grpc/v1/struct.proto index 225cc2aab5a..95e7330d41f 100644 --- a/bentoml/grpc/v1/struct.proto +++ b/bentoml/grpc/v1/struct.proto @@ -2,7 +2,7 @@ syntax = "proto3"; package bentoml.grpc.v1; -import "bentoml/grpc/v1/types.proto"; +import "google/protobuf/any.proto"; // cc_enable_arenas pre-allocate memory for given message to improve speed. (C++ only) option cc_enable_arenas = true; @@ -13,101 +13,78 @@ option java_outer_classname = "StructProto"; option java_package = "com.bentoml.grpc.v1"; option objc_class_prefix = "STCT"; -// Protocol buffer representing contents of a given rpc call. -// Partial verbatim sources of tensorflow/core/framework/tensor.proto. -// -// One key different from the original is that since we are using proto3, -// repeated values are packed by default. -message Value { - // dtype represents the data type of given contents. This would help us - // to use the correct serialization protocol on the server side if needed. - bentoml.grpc.v1.DataType dtype = 1; - - // Only one of the representations below is set, one of "contents" and - // the "xxx_value" attributes. We are not using oneof because as oneofs cannot - // contain repeated fields it would require another extra set of messages. - - // Serialized raw contents. This representation - // can be used for all tensor types. The purpose of this representation is to +// Representing contents of a given rpc call. +message ContentsProto { + // Serialized bytes contents. + // The purpose of this representation is to // reduce serialization overhead during RPC call by avoiding serialization of // many repeated small items. - bytes contents = 2; + repeated bytes bytes_contents = 1; - // Type specific representations that make it easy to create tensor protos in - // all languages. Only the representation corresponding to "dtype" can - // be set. The values hold the flattened representation of the inputs in - // row major order. 
+ // Type specific representations that make it easy to create protos in + // all languages. The values hold the flattened representation of the inputs in row major order. + // + // Note that for quantized types, one should use ``bytes_contents`` and provide dtype at IO descriptor level. // DT_HALF, DT_BFLOAT16. Note that since protobuf has no int16 type, we'll // have some pointless zero padding for each value here. - repeated int32 half_value = 11; + repeated int32 half_contents = 10 [packed = true]; // DT_FLOAT. - repeated float float_value = 3; + repeated float float_contents = 2 [packed = true]; // DT_DOUBLE. - repeated double double_value = 4; + repeated double double_contents = 3 [packed = true]; // DT_INT32, DT_INT16, DT_UINT16, DT_INT8, DT_UINT8. - repeated int32 int_value = 5; + repeated int32 int_contents = 4 [packed = true]; // DT_STRING - repeated bytes string_value = 6; + repeated bytes string_contents = 5; - // DT_COMPLEX64. scomplex_value(2*i) and scomplex_value(2*i+1) are real + // DT_COMPLEX64. scomplex_contents(2*i) and scomplex_contents(2*i+1) are real // and imaginary parts of i-th single precision complex. - repeated float scomplex_value = 7; + repeated float scomplex_contents = 6 [packed = true]; // DT_INT64 - repeated int64 int64_value = 8; + repeated int64 int64_contents = 7 [packed = true]; // DT_BOOL - repeated bool bool_value = 9; + repeated bool bool_contents = 8 [packed = true]; - // DT_COMPLEX128. dcomplex_value(2*i) and dcomplex_value(2*i+1) are real + // DT_COMPLEX128. dcomplex_contents(2*i) and dcomplex_contents(2*i+1) are real // and imaginary parts of i-th double precision complex. 
- repeated double dcomplex_value = 10; + repeated double dcomplex_contents = 9 [packed = true]; // DT_UINT32 - repeated uint32 uint32_value = 12; + repeated uint32 uint32_contents = 11 [packed = true]; // DT_UINT64 - repeated uint64 uint64_value = 13; -} + repeated uint64 uint64_contents = 12 [packed = true]; -// StructuredValue represents a dynamic typed value representing various -// data structures that are inspired by Python's data structures. -// Partial verbatim sources of tensorflow/core/protobuf/struct.proto. -// -// [16, 100] will be reserved for future use. Users are free to use from [3, 15] -message StructuredValue { - oneof kind { - // Represents a None value. - NoneValue none_value = 1; + // This section represents a dynamic typed value representing various + // data structures that are inspired by Python's data structures. - // Represents any value that can be represented as a payload contents. - Value any_value = 2; + // Represents a None value. + repeated NoneValue none_contents = 50; - // Represents a list of `Value`. - ListValue list_value = 51; + // Represents a list of `ContentsProto`. + repeated ListValue list_contents = 51; - // Represents a tuple of `Value`. - TupleValue tuple_value = 52; + // Represents a tuple of `ContentsProto`. + repeated TupleValue tuple_contents = 52; - // Represents a dict `Value`. - DictValue dict_value = 53; + // Represents a dict `ContentsProto`. + repeated DictValue dict_contents = 53; - // Represents Python's namedtuple. - NamedTupleValue namedtuple_value = 54; - } + // Represents Python's namedtuple. + repeated NamedTupleValue namedtuple_contents = 54; - // dataframes_columns and dataframe_indices are used - // in conjunction with contents to represent a dataframe. - repeated string dataframe_columns = 101; - repeated string dataframe_indices = 102; + // To represent any type contents. + repeated google.protobuf.Any any_contents = 55; // We want to reserve these for future uses. 
- reserved 16 to 50, 55 to 100; + reserved 56 to 100; } // represents None type. @@ -115,28 +92,31 @@ message NoneValue {} // Represents a Python list. message ListValue { - repeated StructuredValue values = 1; + repeated ContentsProto values = 1; } // Represents a Python tuple. message TupleValue { - repeated StructuredValue values = 1; + repeated ContentsProto values = 1; } -// Represents a Python dict with key and value. -// This key is analogous with Value.string_value +// Represents a Python dict with key and value. +// This key is analogous with ContentsProto.string_contents. +// Note that map<> does not preserve order from given message. message DictValue { - map fields = 1; + map fields = 1; } // Represents a (key, value) pair. message PairValue { string key = 1; - StructuredValue value = 2; + ContentsProto value = 2; } // Represents a namedtuple. +// One key different from DictValue is that NamedTupleValue reserved +// the order of key value map, whereas map doesn't. message NamedTupleValue { - string type_name = 1; + string name = 1; repeated PairValue values = 2; } diff --git a/bentoml/grpc/v1/struct_pb2.pyi b/bentoml/grpc/v1/struct_pb2.pyi deleted file mode 100644 index ce030f269be..00000000000 --- a/bentoml/grpc/v1/struct_pb2.pyi +++ /dev/null @@ -1,284 +0,0 @@ -""" -@generated by mypy-protobuf. Do not edit manually! -isort:skip_file -""" -import bentoml.grpc.v1.types_pb2 -import builtins -import google.protobuf.descriptor -import google.protobuf.internal.containers -import google.protobuf.message -import typing -import typing_extensions - -DESCRIPTOR: google.protobuf.descriptor.FileDescriptor - -class Value(google.protobuf.message.Message): - """Protocol buffer representing contents of a given rpc call. - Partial verbatim sources of tensorflow/core/framework/tensor.proto. - - One key different from the original is that since we are using proto3, - repeated values are packed by default. 
- """ - DESCRIPTOR: google.protobuf.descriptor.Descriptor - DTYPE_FIELD_NUMBER: builtins.int - CONTENTS_FIELD_NUMBER: builtins.int - HALF_VALUE_FIELD_NUMBER: builtins.int - FLOAT_VALUE_FIELD_NUMBER: builtins.int - DOUBLE_VALUE_FIELD_NUMBER: builtins.int - INT_VALUE_FIELD_NUMBER: builtins.int - STRING_VALUE_FIELD_NUMBER: builtins.int - SCOMPLEX_VALUE_FIELD_NUMBER: builtins.int - INT64_VALUE_FIELD_NUMBER: builtins.int - BOOL_VALUE_FIELD_NUMBER: builtins.int - DCOMPLEX_VALUE_FIELD_NUMBER: builtins.int - UINT32_VALUE_FIELD_NUMBER: builtins.int - UINT64_VALUE_FIELD_NUMBER: builtins.int - dtype: bentoml.grpc.v1.types_pb2.DataType.ValueType - """dtype represents the data type of given contents. This would help us - to use the correct serialization protocol on the server side if needed. - """ - - contents: builtins.bytes - """Only one of the representations below is set, one of "contents" and - the "xxx_value" attributes. We are not using oneof because as oneofs cannot - contain repeated fields it would require another extra set of messages. - - Serialized raw contents. This representation - can be used for all tensor types. The purpose of this representation is to - reduce serialization overhead during RPC call by avoiding serialization of - many repeated small items. - """ - - @property - def half_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: - """Type specific representations that make it easy to create tensor protos in - all languages. Only the representation corresponding to "dtype" can - be set. The values hold the flattened representation of the inputs in - row major order. - - DT_HALF, DT_BFLOAT16. Note that since protobuf has no int16 type, we'll - have some pointless zero padding for each value here. 
- """ - pass - @property - def float_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: - """DT_FLOAT.""" - pass - @property - def double_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: - """DT_DOUBLE.""" - pass - @property - def int_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: - """DT_INT32, DT_INT16, DT_UINT16, DT_INT8, DT_UINT8.""" - pass - @property - def string_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bytes]: - """DT_STRING""" - pass - @property - def scomplex_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: - """DT_COMPLEX64. scomplex_value(2*i) and scomplex_value(2*i+1) are real - and imaginary parts of i-th single precision complex. - """ - pass - @property - def int64_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: - """DT_INT64""" - pass - @property - def bool_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.bool]: - """DT_BOOL""" - pass - @property - def dcomplex_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.float]: - """DT_COMPLEX128. dcomplex_value(2*i) and dcomplex_value(2*i+1) are real - and imaginary parts of i-th double precision complex. 
- """ - pass - @property - def uint32_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: - """DT_UINT32""" - pass - @property - def uint64_value(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: - """DT_UINT64""" - pass - def __init__(self, - *, - dtype: bentoml.grpc.v1.types_pb2.DataType.ValueType = ..., - contents: builtins.bytes = ..., - half_value: typing.Optional[typing.Iterable[builtins.int]] = ..., - float_value: typing.Optional[typing.Iterable[builtins.float]] = ..., - double_value: typing.Optional[typing.Iterable[builtins.float]] = ..., - int_value: typing.Optional[typing.Iterable[builtins.int]] = ..., - string_value: typing.Optional[typing.Iterable[builtins.bytes]] = ..., - scomplex_value: typing.Optional[typing.Iterable[builtins.float]] = ..., - int64_value: typing.Optional[typing.Iterable[builtins.int]] = ..., - bool_value: typing.Optional[typing.Iterable[builtins.bool]] = ..., - dcomplex_value: typing.Optional[typing.Iterable[builtins.float]] = ..., - uint32_value: typing.Optional[typing.Iterable[builtins.int]] = ..., - uint64_value: typing.Optional[typing.Iterable[builtins.int]] = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["bool_value",b"bool_value","contents",b"contents","dcomplex_value",b"dcomplex_value","double_value",b"double_value","dtype",b"dtype","float_value",b"float_value","half_value",b"half_value","int64_value",b"int64_value","int_value",b"int_value","scomplex_value",b"scomplex_value","string_value",b"string_value","uint32_value",b"uint32_value","uint64_value",b"uint64_value"]) -> None: ... -global___Value = Value - -class StructuredValue(google.protobuf.message.Message): - """StructuredValue represents a dynamic typed value representing various - data structures that are inspired by Python's data structures. - Partial verbatim sources of tensorflow/core/protobuf/struct.proto. 
- - [16, 100] will be reserved for future use. Users are free to use from [3, 15] - """ - DESCRIPTOR: google.protobuf.descriptor.Descriptor - NONE_VALUE_FIELD_NUMBER: builtins.int - ANY_VALUE_FIELD_NUMBER: builtins.int - LIST_VALUE_FIELD_NUMBER: builtins.int - TUPLE_VALUE_FIELD_NUMBER: builtins.int - DICT_VALUE_FIELD_NUMBER: builtins.int - NAMEDTUPLE_VALUE_FIELD_NUMBER: builtins.int - DATAFRAME_COLUMNS_FIELD_NUMBER: builtins.int - DATAFRAME_INDICES_FIELD_NUMBER: builtins.int - @property - def none_value(self) -> global___NoneValue: - """Represents a None value.""" - pass - @property - def any_value(self) -> global___Value: - """Represents any value that can be represented as a payload contents.""" - pass - @property - def list_value(self) -> global___ListValue: - """Represents a list of `Value`.""" - pass - @property - def tuple_value(self) -> global___TupleValue: - """Represents a tuple of `Value`.""" - pass - @property - def dict_value(self) -> global___DictValue: - """Represents a dict `Value`.""" - pass - @property - def namedtuple_value(self) -> global___NamedTupleValue: - """Represents Python's namedtuple.""" - pass - @property - def dataframe_columns(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: - """dataframes_columns and dataframe_indices are used - in conjunction with contents to represent a dataframe. - """ - pass - @property - def dataframe_indices(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: ... 
- def __init__(self, - *, - none_value: typing.Optional[global___NoneValue] = ..., - any_value: typing.Optional[global___Value] = ..., - list_value: typing.Optional[global___ListValue] = ..., - tuple_value: typing.Optional[global___TupleValue] = ..., - dict_value: typing.Optional[global___DictValue] = ..., - namedtuple_value: typing.Optional[global___NamedTupleValue] = ..., - dataframe_columns: typing.Optional[typing.Iterable[typing.Text]] = ..., - dataframe_indices: typing.Optional[typing.Iterable[typing.Text]] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["any_value",b"any_value","dict_value",b"dict_value","kind",b"kind","list_value",b"list_value","namedtuple_value",b"namedtuple_value","none_value",b"none_value","tuple_value",b"tuple_value"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["any_value",b"any_value","dataframe_columns",b"dataframe_columns","dataframe_indices",b"dataframe_indices","dict_value",b"dict_value","kind",b"kind","list_value",b"list_value","namedtuple_value",b"namedtuple_value","none_value",b"none_value","tuple_value",b"tuple_value"]) -> None: ... - def WhichOneof(self, oneof_group: typing_extensions.Literal["kind",b"kind"]) -> typing.Optional[typing_extensions.Literal["none_value","any_value","list_value","tuple_value","dict_value","namedtuple_value"]]: ... -global___StructuredValue = StructuredValue - -class NoneValue(google.protobuf.message.Message): - """represents None type.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - def __init__(self, - ) -> None: ... -global___NoneValue = NoneValue - -class ListValue(google.protobuf.message.Message): - """Represents a Python list.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - VALUES_FIELD_NUMBER: builtins.int - @property - def values(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___StructuredValue]: ... 
- def __init__(self, - *, - values: typing.Optional[typing.Iterable[global___StructuredValue]] = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["values",b"values"]) -> None: ... -global___ListValue = ListValue - -class TupleValue(google.protobuf.message.Message): - """Represents a Python tuple.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - VALUES_FIELD_NUMBER: builtins.int - @property - def values(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___StructuredValue]: ... - def __init__(self, - *, - values: typing.Optional[typing.Iterable[global___StructuredValue]] = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["values",b"values"]) -> None: ... -global___TupleValue = TupleValue - -class DictValue(google.protobuf.message.Message): - """Represents a Python dict with key and value. - This key is analogous with Value.string_value - """ - DESCRIPTOR: google.protobuf.descriptor.Descriptor - class FieldsEntry(google.protobuf.message.Message): - DESCRIPTOR: google.protobuf.descriptor.Descriptor - KEY_FIELD_NUMBER: builtins.int - VALUE_FIELD_NUMBER: builtins.int - key: typing.Text - @property - def value(self) -> global___StructuredValue: ... - def __init__(self, - *, - key: typing.Text = ..., - value: typing.Optional[global___StructuredValue] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["value",b"value"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["key",b"key","value",b"value"]) -> None: ... - - FIELDS_FIELD_NUMBER: builtins.int - @property - def fields(self) -> google.protobuf.internal.containers.MessageMap[typing.Text, global___StructuredValue]: ... - def __init__(self, - *, - fields: typing.Optional[typing.Mapping[typing.Text, global___StructuredValue]] = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["fields",b"fields"]) -> None: ... 
-global___DictValue = DictValue - -class PairValue(google.protobuf.message.Message): - """Represents a (key, value) pair.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - KEY_FIELD_NUMBER: builtins.int - VALUE_FIELD_NUMBER: builtins.int - key: typing.Text - @property - def value(self) -> global___StructuredValue: ... - def __init__(self, - *, - key: typing.Text = ..., - value: typing.Optional[global___StructuredValue] = ..., - ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["value",b"value"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["key",b"key","value",b"value"]) -> None: ... -global___PairValue = PairValue - -class NamedTupleValue(google.protobuf.message.Message): - """Represents a namedtuple.""" - DESCRIPTOR: google.protobuf.descriptor.Descriptor - TYPE_NAME_FIELD_NUMBER: builtins.int - VALUES_FIELD_NUMBER: builtins.int - type_name: typing.Text - @property - def values(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___PairValue]: ... - def __init__(self, - *, - type_name: typing.Text = ..., - values: typing.Optional[typing.Iterable[global___PairValue]] = ..., - ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["type_name",b"type_name","values",b"values"]) -> None: ... -global___NamedTupleValue = NamedTupleValue diff --git a/bentoml/grpc/v1/types.proto b/bentoml/grpc/v1/types.proto deleted file mode 100644 index d88ae4b50b0..00000000000 --- a/bentoml/grpc/v1/types.proto +++ /dev/null @@ -1,61 +0,0 @@ -syntax = "proto3"; - -package bentoml.grpc.v1; - -// cc_enable_arenas pre-allocate memory for given message to improve speed. 
(C++ only) -option cc_enable_arenas = true; -option cc_generic_services = true; -option go_package = "github.com/bentoml/grpc/v1"; -option java_multiple_files = true; -option java_outer_classname = "TypesProto"; -option java_package = "com.bentoml.grpc.v1"; -option objc_class_prefix = "TYP"; - -// partial verbatim from tensorflow/core/framework/types.proto -enum DataType { - // Not a legal value for DataType. Used to indicate a DataType field - // has not been set. - DT_UNSPECIFIED = 0; - - // Data types that all computation devices are expected to be - // capable to support. - DT_FLOAT = 1; - DT_DOUBLE = 2; - DT_INT32 = 3; - DT_UINT8 = 4; - DT_INT16 = 5; - DT_INT8 = 6; - DT_STRING = 7; - - // Single-precision complex - DT_COMPLEX64 = 8; - DT_INT64 = 9; - DT_BOOL = 10; - - // Quantized int8 - DT_QINT8 = 11; - - // Quantized uint8 - DT_QUINT8 = 12; - - // Quantized int32 - DT_QINT32 = 13; - // Float32 truncated to 16 bits. Only for cast ops. - DT_BFLOAT16 = 14; - - // Quantized int16 - DT_QINT16 = 15; - // Quantized uint16 - DT_QUINT16 = 16; - - DT_UINT16 = 17; - - // Double-precision complex - DT_COMPLEX128 = 18; - - // Represents half data type (32 -> 16 bits). - DT_HALF = 19; - - DT_UINT32 = 20; - DT_UINT64 = 21; -} diff --git a/bentoml/grpc/v1/types_pb2.pyi b/bentoml/grpc/v1/types_pb2.pyi deleted file mode 100644 index c553881ae36..00000000000 --- a/bentoml/grpc/v1/types_pb2.pyi +++ /dev/null @@ -1,119 +0,0 @@ -""" -@generated by mypy-protobuf. Do not edit manually! 
-isort:skip_file -""" -import builtins -import google.protobuf.descriptor -import google.protobuf.internal.enum_type_wrapper -import typing -import typing_extensions - -DESCRIPTOR: google.protobuf.descriptor.FileDescriptor - -class _DataType: - ValueType = typing.NewType('ValueType', builtins.int) - V: typing_extensions.TypeAlias = ValueType -class _DataTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_DataType.ValueType], builtins.type): - DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor - DT_UNSPECIFIED: _DataType.ValueType # 0 - """Not a legal value for DataType. Used to indicate a DataType field - has not been set. - """ - - DT_FLOAT: _DataType.ValueType # 1 - """Data types that all computation devices are expected to be - capable to support. - """ - - DT_DOUBLE: _DataType.ValueType # 2 - DT_INT32: _DataType.ValueType # 3 - DT_UINT8: _DataType.ValueType # 4 - DT_INT16: _DataType.ValueType # 5 - DT_INT8: _DataType.ValueType # 6 - DT_STRING: _DataType.ValueType # 7 - DT_COMPLEX64: _DataType.ValueType # 8 - """Single-precision complex""" - - DT_INT64: _DataType.ValueType # 9 - DT_BOOL: _DataType.ValueType # 10 - DT_QINT8: _DataType.ValueType # 11 - """Quantized int8""" - - DT_QUINT8: _DataType.ValueType # 12 - """Quantized uint8""" - - DT_QINT32: _DataType.ValueType # 13 - """Quantized int32""" - - DT_BFLOAT16: _DataType.ValueType # 14 - """Float32 truncated to 16 bits. 
Only for cast ops.""" - - DT_QINT16: _DataType.ValueType # 15 - """Quantized int16""" - - DT_QUINT16: _DataType.ValueType # 16 - """Quantized uint16""" - - DT_UINT16: _DataType.ValueType # 17 - DT_COMPLEX128: _DataType.ValueType # 18 - """Double-precision complex""" - - DT_HALF: _DataType.ValueType # 19 - """Represents half data type (32 -> 16 bits).""" - - DT_UINT32: _DataType.ValueType # 20 - DT_UINT64: _DataType.ValueType # 21 -class DataType(_DataType, metaclass=_DataTypeEnumTypeWrapper): - """partial verbatim from tensorflow/core/framework/types.proto""" - pass - -DT_UNSPECIFIED: DataType.ValueType # 0 -"""Not a legal value for DataType. Used to indicate a DataType field -has not been set. -""" - -DT_FLOAT: DataType.ValueType # 1 -"""Data types that all computation devices are expected to be -capable to support. -""" - -DT_DOUBLE: DataType.ValueType # 2 -DT_INT32: DataType.ValueType # 3 -DT_UINT8: DataType.ValueType # 4 -DT_INT16: DataType.ValueType # 5 -DT_INT8: DataType.ValueType # 6 -DT_STRING: DataType.ValueType # 7 -DT_COMPLEX64: DataType.ValueType # 8 -"""Single-precision complex""" - -DT_INT64: DataType.ValueType # 9 -DT_BOOL: DataType.ValueType # 10 -DT_QINT8: DataType.ValueType # 11 -"""Quantized int8""" - -DT_QUINT8: DataType.ValueType # 12 -"""Quantized uint8""" - -DT_QINT32: DataType.ValueType # 13 -"""Quantized int32""" - -DT_BFLOAT16: DataType.ValueType # 14 -"""Float32 truncated to 16 bits. 
Only for cast ops.""" - -DT_QINT16: DataType.ValueType # 15 -"""Quantized int16""" - -DT_QUINT16: DataType.ValueType # 16 -"""Quantized uint16""" - -DT_UINT16: DataType.ValueType # 17 -DT_COMPLEX128: DataType.ValueType # 18 -"""Double-precision complex""" - -DT_HALF: DataType.ValueType # 19 -"""Represents half data type (32 -> 16 bits).""" - -DT_UINT32: DataType.ValueType # 20 -DT_UINT64: DataType.ValueType # 21 -global___DataType = DataType - diff --git a/bentoml/testing/server.py b/bentoml/testing/server.py index 92ead466355..09a1a704221 100644 --- a/bentoml/testing/server.py +++ b/bentoml/testing/server.py @@ -331,7 +331,7 @@ def run_bento_server_distributed( cmd = [ sys.executable, "-m", - "bentoml._internal.server.cli.http.api_server", + "bentoml._internal.server.cli.rest_api_server", str(bento_tag), "--bind", bind, diff --git a/pyproject.toml b/pyproject.toml index 0946d0417a8..4db2410e9fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,7 @@ requires = [ # sync with setup.cfg until we discard non-pep-517/518 "grpcio-tools==1.47.0", + "googleapis-common-protos", "mypy-protobuf==3.2.0", "protobuf==3.19.4", "setuptools>=59.0", diff --git a/setup.cfg b/setup.cfg index c240d561fd1..62f706390ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -73,6 +73,7 @@ python_requires = >=3.7 include_package_data = True setup_requires = grpcio-tools==1.47.0 + googleapis-common-protos mypy-protobuf==3.2.0 # generating stubs for type checking protobuf==3.19.4 setuptools>=59.0 @@ -99,12 +100,14 @@ grpc = grpcio grpcio-health-checking grpcio-reflection + grpcio-status tracing = opentelemetry-exporter-jaeger opentelemetry-exporter-zipkin [options.package_data] * = *.pyi +bentoml/grpc/* = *.proto bentoml = py.typed [global] diff --git a/setup.py b/setup.py index ce6f6604420..48b824a67a7 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,9 @@ import os import sys +import typing as t import subprocess +import importlib.util from pathlib import Path from setuptools import setup @@ 
-12,13 +14,28 @@ _VERSION_MAP = { "v1": { ("service.proto", "service_test.proto"): {"grpc_out": True}, - ("types.proto", "struct.proto"): {}, + ("struct.proto",): {}, }, } +def source_locations(pkg: str) -> str: + spec = importlib.util.find_spec(pkg) + if not spec: + raise RuntimeError( + f"{pkg} not found. Make sure to add to both pyproject.toml and setup.cfg." + ) + (location,) = spec.submodule_search_locations # type: ignore (unfinished type) + return t.cast(str, location) + + def get_args(parent_path: str, *paths: str, grpc_out: bool = False) -> list[str]: - args = ["-I.", "--python_out=.", "--mypy_out=."] + args = [ + "-I.", + f"-I{os.path.dirname(source_locations('google'))}", # include common googleapis stubs. + "--python_out=.", + "--mypy_out=.", + ] if grpc_out: args.extend(["--grpc_python_out=.", "--mypy_grpc_out=."]) args.extend([os.path.join(parent_path, path) for path in paths])