diff --git a/src/bentoml/_internal/client/__init__.py b/src/bentoml/_internal/client/__init__.py
index 81d3a9b850b..4786dfb792f 100644
--- a/src/bentoml/_internal/client/__init__.py
+++ b/src/bentoml/_internal/client/__init__.py
@@ -30,7 +30,7 @@ def __init__(self, svc: Service, server_url: str):
         self._svc = svc
         self.server_url = server_url
 
-        if svc is not None and len(svc.apis) == 0:
+        if len(svc.apis) == 0:
             raise BentoMLException("No APIs were found when constructing client.")
 
         self.endpoints = []
diff --git a/src/bentoml/exceptions.py b/src/bentoml/exceptions.py
index c06c251d489..283e7da3fa6 100644
--- a/src/bentoml/exceptions.py
+++ b/src/bentoml/exceptions.py
@@ -119,3 +119,15 @@ class ImportServiceError(BentoMLException):
     """Raised when BentoML failed to import the user's service file."""
 
     pass
+
+
+class UnservableException(StateException):
+    """Raised when a servable cannot be served, e.g. a Service defined in an interactive session or notebook."""
+
+    pass
+
+
+class ServerStateException(StateException):
+    """Raised when a server API that requires a running BentoML server is called while the server is not running."""
+
+    pass
diff --git a/src/bentoml/server.py b/src/bentoml/server.py
index c05d9f6d7e9..b2ecf302346 100644
--- a/src/bentoml/server.py
+++ b/src/bentoml/server.py
@@ -10,24 +10,26 @@
 from abc import ABC
 from abc import abstractmethod
 from typing import TYPE_CHECKING
+from warnings import warn
 
 from simple_di import Provide
 from simple_di import inject
 
 from ._internal.bento import Bento
+from ._internal.client import Client
+from ._internal.client import GrpcClient
+from ._internal.client import HTTPClient
 from ._internal.configuration.containers import BentoMLContainer
 from ._internal.service import Service
 from ._internal.tag import Tag
 from ._internal.utils.analytics.usage_stats import BENTOML_SERVE_FROM_SERVER_API
-from .exceptions import BentoMLException
+from .exceptions import InvalidArgument
+from .exceptions import ServerStateException
+from .exceptions import UnservableException
 
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from ._internal.client import Client
-    from ._internal.client import GrpcClient
-    from ._internal.client import HTTPClient
-
     _FILE: t.TypeAlias = None | int | t.IO[t.Any]
 
 
@@ -37,7 +39,10 @@
 __all__ = ["Server", "GrpcServer", "HTTPServer"]
 
 
-class Server(ABC):
+ClientType = t.TypeVar("ClientType", bound=Client)
+
+
+class Server(ABC, t.Generic[ClientType]):
     servable: str | Bento | Tag | Service
     host: str
     port: int
@@ -63,15 +68,16 @@ def __init__(
         timeout: float = 10,
     ):
         if bento is not None:
-            if not servable:
-                logger.warning(
-                    "'bento' is deprecated, either remove it as a kwargs or pass '%s' as the first positional argument",
-                    bento,
+            if servable is None:  # type: ignore  # dealing with backwards compatibility, where a user has set bento argument manually.
+                warn(
+                    f"serving using the 'bento' argument is deprecated, either remove it as a kwarg or pass '{bento}' as the first positional argument",
+                    DeprecationWarning,
+                    stacklevel=2,
                 )
                 servable = bento
             else:
-                raise BentoMLException(
-                    "Cannot use both 'bento' and 'servable' as kwargs as 'bento' is deprecated."
+                raise InvalidArgument(
+                    "Cannot use both 'bento' and 'servable' arguments; as 'bento' is deprecated, set 'servable' instead."
                 )
 
         self.servable = servable
@@ -84,7 +90,7 @@ def __init__(
             bento_str = str(servable)
         elif isinstance(servable, Service):
             if not servable.is_service_importable():
-                raise BentoMLException(
+                raise UnservableException(
                     "Cannot use 'bentoml.Service' as a server if it is defined in interactive session or Jupyter Notebooks."
                 )
             bento_str, working_dir = servable.get_service_import_origin()
@@ -130,7 +136,7 @@ def start(
         stdout: _FILE = None,
         stderr: _FILE = None,
         text: bool | None = None,
-    ):
+    ) -> t.ContextManager[ClientType]:
         """Start the server programmatically.
 
         To get the client, use the context manager.
@@ -158,9 +164,9 @@ def __init__(__inner_self):
                 logger.debug(f"Starting server with arguments: {self.args}")
                 default_io_descriptor = None if blocking else subprocess.PIPE
                 if text is None:
-                    logger.warning(
-                        "Setting text to True will be the default behaviour for bentoml 2.x. Please set it explicitly to avoid breaking changes.\n"
-                        + '    Example: "server.start(text=False, ...)"',
+                    warn(
+                        "Setting text to True will be the default behavior for bentoml 2.x. Set it explicitly to avoid breaking changes.\n"
+                        + "For Example: 'server.start(text=False, ...)'"
                     )
                 self.process = subprocess.Popen(
                     self.args,
@@ -178,36 +184,34 @@ def __init__(__inner_self):
                     except KeyboardInterrupt:
                         pass
 
-            def __enter__(__inner_self):
+            def __enter__(__inner_self) -> ClientType:
                 return self.get_client()
 
             def __exit__(
                 __inner_self,
-                exc_type: type[BaseException] | None,
-                exc_value: BaseException | None,
-                traceback: TracebackType | None,
+                _exc_type: type[BaseException] | None,
+                _exc_value: BaseException | None,
+                _traceback: TracebackType | None,
             ):
                 self.stop()
 
         return _Manager()
 
-    def get_client(self) -> Client | None:
+    def get_client(self) -> ClientType:
         if self.process is None:
             # NOTE: if the process is None, we reset this envvar
             del os.environ[BENTOML_SERVE_FROM_SERVER_API]
-            logger.warning(
+            raise ServerStateException(
                 "Attempted to get a client for a BentoML server that was not running! Try running 'bentoml.*Server.start()' first."
             )
-            return
         assert self.process is not None
         out_code = self.process.poll()
         if out_code == 0:
             # NOTE: if the process is None, we reset this envvar
             del os.environ[BENTOML_SERVE_FROM_SERVER_API]
-            logger.warning(
+            raise ServerStateException(
                 "Attempted to get a client from a BentoML server that has already exited! You can run '.start()' again to restart it."
             )
-            return
         elif out_code is not None:
             # NOTE: if the process is None, we reset this envvar
             del os.environ[BENTOML_SERVE_FROM_SERVER_API]
@@ -215,20 +219,19 @@ def get_client(self) -> Client | None:
             if self.process.stdout is not None and not self.process.stdout.closed:
                 s = self.process.stdout.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4
+                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
                 )
             if self.process.stderr is not None and not self.process.stderr.closed:
                 logs += "\nServer Error:\n"
                 s = self.process.stderr.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4
+                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
                 )
-            logger.warning(logs)
-            return
+            raise ServerStateException(logs)
         return self._get_client()
 
     @abstractmethod
-    def _get_client(self) -> Client | None:
+    def _get_client(self) -> ClientType:
         pass
 
     def stop(self) -> None:
@@ -244,20 +247,22 @@ def stop(self) -> None:
             logger.warning(
                 "Attempted to stop a BentoML server that has already exited!"
             )
+            return
         elif out_code is not None:
             logs = "Attempted to stop a BentoML server that has already exited with an error!\nServer Output:\n"
             if self.process.stdout is not None and not self.process.stdout.closed:
                 s = self.process.stdout.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4
+                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
                 )
             if self.process.stderr is not None and not self.process.stderr.closed:
                 logs += "\nServer Error:\n"
                 s = self.process.stderr.read()
                 logs += textwrap.indent(
-                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4
+                    s.decode("utf-8") if isinstance(s, bytes) else s, " " * 4  # type: ignore  # may be string
                 )
             logger.warning(logs)
+            return
 
         if sys.platform == "win32":
             os.kill(self.process.pid, signal.CTRL_C_EVENT)
@@ -273,8 +278,10 @@ def stop(self) -> None:
             self.process.wait()
 
     def __enter__(self):
-        logger.warning(
-            "Using bentoml.Server as a context manager is deprecated, use bentoml.Server.start instead."
+        warn(
+            "Using bentoml.Server as a context manager is deprecated, use bentoml.Server.start instead.",
+            DeprecationWarning,
+            stacklevel=2,
         )
 
         return self
@@ -291,7 +298,7 @@ def __exit__(
             logger.error(f"Error stopping server: {e}", exc_info=e)
 
 
-class HTTPServer(Server):
+class HTTPServer(Server[HTTPClient]):
     _client: HTTPClient | None = None
 
     @inject
@@ -347,13 +354,18 @@ def __init__(
 
         self.args.extend(construct_ssl_args(**ssl_args))
 
+    def get_client(self) -> HTTPClient:
+        return super().get_client()
+
     def client(self) -> HTTPClient | None:
-        logger.warning(
-            "'Server.client()' is deprecated, use 'Server.get_client()' instead."
+        warn(
+            "'Server.client()' is deprecated, use 'Server.get_client()' instead.",
+            DeprecationWarning,
+            stacklevel=2,
         )
-        return t.cast("HTTPClient | None", self.get_client())
+        return self.get_client()  # go through get_client() so the process-state checks still run
 
-    def _get_client(self) -> HTTPClient | None:
+    def _get_client(self) -> HTTPClient:
         if self._client is None:
             from .client import HTTPClient
 
@@ -364,7 +376,7 @@ def _get_client(self) -> HTTPClient | None:
         return self._client
 
 
-class GrpcServer(Server):
+class GrpcServer(Server[GrpcClient]):
     _client: GrpcClient | None = None
 
     @inject
@@ -423,7 +435,7 @@ def __init__(
         if grpc_protocol_version is not None:
             self.args.extend(["--protocol-version", str(grpc_protocol_version)])
 
-    def _get_client(self) -> GrpcClient | None:
+    def _get_client(self) -> GrpcClient:
         if self._client is None:
             from .client import GrpcClient
 
diff --git a/tests/e2e/bento_server_http/tests/test_serve.py b/tests/e2e/bento_server_http/tests/test_serve.py
index cbcd6a4dc9a..eb9a2378b72 100644
--- a/tests/e2e/bento_server_http/tests/test_serve.py
+++ b/tests/e2e/bento_server_http/tests/test_serve.py
@@ -11,7 +11,8 @@
 from bentoml.testing.utils import async_request
 
 
-def test_http_server(bentoml_home: str):
+@pytest.mark.usefixtures("bentoml_home")
+def test_http_server():
     server = bentoml.HTTPServer("service.py:svc", port=12345)
 
     server.start()
@@ -21,54 +22,64 @@ def test_http_server(bentoml_home: str):
 
     assert resp.status == 200
 
-    res = client.echo_json_sync({"test": "json"})
+    res = client.call("echo_json", {"test": "json"})
 
     assert res == {"test": "json"}
 
     server.stop()
 
+    assert server.process is not None  # process should not be removed
+
     timeout = 10
     start_time = time.time()
     while time.time() - start_time < timeout:
         retcode = server.process.poll()
         if retcode is not None and retcode <= 0:
             break
+
+    retcode = server.process.poll()
+    assert retcode is not None
+
     if sys.platform == "win32":
         # on Windows, because of the way that terminate is run, it seems the exit code is set.
-        assert isinstance(server.process.poll(), int)
+        pass
     else:
-        # on POSIX negative return codes mean the process was terminated; since we will be terminating
+        # negative return codes mean the process was terminated; since we will be terminating
         # the process, it should be negative.
-        # on all other platforms, this should be 0.
-        assert server.process.poll() <= 0
+        assert retcode <= 0
 
 
-def test_http_server_ctx(bentoml_home: str):
+@pytest.mark.usefixtures("bentoml_home")
+def test_http_server_ctx():
     server = bentoml.HTTPServer("service.py:svc", port=12346)
 
     with server.start() as client:
         resp = client.health()
-
         assert resp.status == 200
 
-        res = client.echo_json_sync({"more_test": "and more json"})
+        res = client.call("echo_json", {"more_test": "and more json"})
 
         assert res == {"more_test": "and more json"}
 
+    assert server.process is not None  # process should not be removed
+
     timeout = 10
     start_time = time.time()
     while time.time() - start_time < timeout:
         retcode = server.process.poll()
         if retcode is not None and retcode <= 0:
             break
+
+    retcode = server.process.poll()
+    assert retcode is not None
+
     if sys.platform == "win32":
         # on Windows, because of the way that terminate is run, it seems the exit code is set.
-        assert isinstance(server.process.poll(), int)
+        pass
     else:
-        # on POSIX negative return codes mean the process was terminated; since we will be terminating
+        # negative return codes mean the process was terminated; since we will be terminating
         # the process, it should be negative.
-        # on all other platforms, this should be 0.
-        assert server.process.poll() <= 0
+        assert retcode <= 0
 
 
 def test_serve_from_svc():
@@ -81,23 +92,29 @@ def test_serve_from_svc():
     assert resp.status == 200
     server.stop()
 
-    timeout = 60
+    assert server.process is not None  # process should not be removed
+
+    timeout = 10
     start_time = time.time()
     while time.time() - start_time < timeout:
         retcode = server.process.poll()
         if retcode is not None and retcode <= 0:
             break
+
+    retcode = server.process.poll()
+    assert retcode is not None
+
     if sys.platform == "win32":
         # on Windows, because of the way that terminate is run, it seems the exit code is set.
-        assert isinstance(server.process.poll(), int)
+        pass
     else:
-        # on POSIX negative return codes mean the process was terminated; since we will be terminating
+        # negative return codes mean the process was terminated; since we will be terminating
         # the process, it should be negative.
-        # on all other platforms, this should be 0.
-        assert server.process.poll() <= 0
+        assert retcode <= 0
 
 
-def test_serve_with_timeout(bentoml_home: str):
+@pytest.mark.usefixtures("bentoml_home")
+def test_serve_with_timeout():
     server = bentoml.HTTPServer("service.py:svc", port=12349)
     config_file = os.path.abspath("configs/timeout.yml")
     env = os.environ.copy()
@@ -108,23 +125,26 @@ def test_serve_with_timeout(bentoml_home: str):
             BentoMLException,
             match="504: b'Not able to process the request in 1 seconds'",
         ):
-            client.echo_delay({})
+            client.call("echo_delay", {})
 
 
 @pytest.mark.asyncio
-async def test_serve_with_api_max_concurrency(bentoml_home: str):
+@pytest.mark.usefixtures("bentoml_home")
+async def test_serve_with_api_max_concurrency():
     server = bentoml.HTTPServer("service.py:svc", port=12350, api_workers=1)
     config_file = os.path.abspath("configs/max_concurrency.yml")
     env = os.environ.copy()
     env.update(BENTOML_CONFIG=config_file)
 
-    with server.start(env=env) as client:
+    with server.start(stdin=None, stdout=None, env=env) as client:
         tasks = [
-            asyncio.create_task(client.async_echo_delay({"delay": 0.5})),
-            asyncio.create_task(client.async_echo_delay({"delay": 0.5})),
+            asyncio.create_task(client.async_call("echo_delay", {"delay": 0.5})),
+            asyncio.create_task(client.async_call("echo_delay", {"delay": 0.5})),
         ]
         await asyncio.sleep(0.1)
-        tasks.append(asyncio.create_task(client.async_echo_delay({"delay": 0.5})))
+        tasks.append(
+            asyncio.create_task(client.async_call("echo_delay", {"delay": 0.5}))
+        )
         results = await asyncio.gather(*tasks, return_exceptions=True)
 
     for i in range(2):
@@ -138,7 +158,8 @@ async def test_serve_with_api_max_concurrency(bentoml_home: str):
     reason="Windows runner doesn't have enough cores to run this test",
 )
 @pytest.mark.asyncio
-async def test_serve_with_lifecycle_hooks(bentoml_home: str, tmp_path: Path):
+@pytest.mark.usefixtures("bentoml_home")
+async def test_serve_with_lifecycle_hooks(tmp_path: Path):
     server = bentoml.HTTPServer("service.py:svc", port=12351, api_workers=4)
     env = os.environ.copy()
     env["BENTOML_TEST_DATA"] = str(tmp_path)