diff --git a/bentoml/_internal/configuration/containers.py b/bentoml/_internal/configuration/containers.py index ad8a29b37a0..5feadf29e4d 100644 --- a/bentoml/_internal/configuration/containers.py +++ b/bentoml/_internal/configuration/containers.py @@ -4,7 +4,7 @@ import uuid import typing as t import logging -import multiprocessing +import math from copy import deepcopy from typing import TYPE_CHECKING from dataclasses import dataclass @@ -449,7 +449,7 @@ def access_control_options( return filtered_kwargs api_server_workers = providers.Factory[int]( - lambda workers: workers or (multiprocessing.cpu_count() // 2) + 1, + lambda workers: workers or math.ceil(system_resources()["cpu"]), api_server_config.workers, ) diff --git a/bentoml/_internal/configuration/default_configuration.yaml b/bentoml/_internal/configuration/default_configuration.yaml index 3c66fecfff9..4d23fecf2b4 100644 --- a/bentoml/_internal/configuration/default_configuration.yaml +++ b/bentoml/_internal/configuration/default_configuration.yaml @@ -1,5 +1,5 @@ api_server: - workers: 1 + workers: null # When this is set to null, the number of available CPU cores is used. 
timeout: 60 backlog: 2048 metrics: diff --git a/bentoml/serve.py b/bentoml/serve.py index ad6b620bc72..7105ad1ae31 100644 --- a/bentoml/serve.py +++ b/bentoml/serve.py @@ -289,7 +289,7 @@ def serve_http_production( port: int = Provide[BentoMLContainer.http.port], host: str = Provide[BentoMLContainer.http.host], backlog: int = Provide[BentoMLContainer.api_server_config.backlog], - api_workers: int | None = None, + api_workers: int = Provide[BentoMLContainer.api_server_workers], ssl_certfile: str | None = Provide[BentoMLContainer.api_server_config.ssl.certfile], ssl_keyfile: str | None = Provide[BentoMLContainer.api_server_config.ssl.keyfile], ssl_keyfile_password: str @@ -442,7 +442,7 @@ def serve_http_production( ), ], working_dir=working_dir, - numprocesses=api_workers or math.ceil(CpuResource.from_system()), + numprocesses=api_workers, ) ) @@ -650,7 +650,7 @@ def serve_grpc_production( port: int = Provide[BentoMLContainer.grpc.port], host: str = Provide[BentoMLContainer.grpc.host], backlog: int = Provide[BentoMLContainer.api_server_config.backlog], - api_workers: int | None = None, + api_workers: int = Provide[BentoMLContainer.api_server_workers], reflection: bool = Provide[BentoMLContainer.grpc.reflection.enabled], max_concurrent_streams: int | None = Provide[BentoMLContainer.grpc.max_concurrent_streams], @@ -808,7 +808,7 @@ def serve_grpc_production( args=args, use_sockets=False, working_dir=working_dir, - numprocesses=api_workers or math.ceil(CpuResource.from_system()), + numprocesses=api_workers, ) ) diff --git a/bentoml_cli/serve.py b/bentoml_cli/serve.py index cde029fdbce..067b1571d39 100644 --- a/bentoml_cli/serve.py +++ b/bentoml_cli/serve.py @@ -45,7 +45,7 @@ def add_serve_command(cli: click.Group) -> None: @click.option( "--api-workers", type=click.INT, - default=None, + default=BentoMLContainer.api_server_workers.get(), help="Specify the number of API server workers to start. 
Default to number of available CPU cores in production mode", envvar="BENTOML_API_WORKERS", show_default=True, @@ -249,7 +249,7 @@ def serve( # type: ignore (unused warning) @click.option( "--api-workers", type=click.INT, - default=None, + default=BentoMLContainer.api_server_workers.get(), help="Specify the number of API server workers to start. Default to number of available CPU cores in production mode", envvar="BENTOML_API_WORKERS", show_default=True, diff --git a/bentoml_cli/start.py b/bentoml_cli/start.py index 6fd61bdd2fb..025541e045f 100644 --- a/bentoml_cli/start.py +++ b/bentoml_cli/start.py @@ -63,7 +63,7 @@ def add_start_command(cli: click.Group) -> None: @click.option( "--api-workers", type=click.INT, - default=None, + default=BentoMLContainer.api_server_workers.get(), help="Specify the number of API server workers to start. Default to number of available CPU cores in production mode", envvar="BENTOML_API_WORKERS", ) @@ -295,7 +295,7 @@ def start_runner_server( # type: ignore (unused warning) @click.option( "--api-workers", type=click.INT, - default=None, + default=BentoMLContainer.api_server_workers.get(), help="Specify the number of API server workers to start. Default to number of available CPU cores in production mode", envvar="BENTOML_API_WORKERS", )