From e0cb13eb06f53ba66c13dcb3f147b67fcef9d99e Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 23 Nov 2022 04:18:56 +0900
Subject: [PATCH 001/110] Exclude __pycache__ in setuptools

---
 .actions/setup_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py
index 0f05c270c62f6..73459aa45fd01 100644
--- a/.actions/setup_tools.py
+++ b/.actions/setup_tools.py
@@ -190,7 +190,7 @@ def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requireme
         load_requirements(d, file_name="base.txt", unfreeze=not freeze_requirements)
         for d in glob.glob(os.path.join(req_dir, "*"))
         # skip empty folder as git artefacts, and resolving Will's special issue
-        if os.path.isdir(d) and len(glob.glob(os.path.join(d, "*"))) > 0
+        if os.path.isdir(d) and len(glob.glob(os.path.join(d, "*"))) > 0 and "__pycache__" not in d
     ]
     if not requires:
         return None

From f162e96d3e6d0e53cf0398df0218312d39770e81 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Tue, 22 Nov 2022 21:18:30 +0900
Subject: [PATCH 002/110] Add load balancer example

---
 examples/app_server_with_load_balancer/app.py | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 examples/app_server_with_load_balancer/app.py

diff --git a/examples/app_server_with_load_balancer/app.py b/examples/app_server_with_load_balancer/app.py
new file mode 100644
index 0000000000000..a5e24eac335fb
--- /dev/null
+++ b/examples/app_server_with_load_balancer/app.py
@@ -0,0 +1,59 @@
+# !pip install torchvision pydantic
+import base64
+import io
+
+import torch
+import torchvision
+from PIL import Image
+from pydantic import BaseModel
+
+import lightning as L
+from lightning.app.components.serve import Image as InputImage
+from lightning.app.components.serve import PythonServer
+
+
+class PyTorchServer(PythonServer):
+    def __init__(self):
+        super().__init__(
+            input_type=InputImage,
+            output_type=OutputData,
+            cloud_compute=L.CloudCompute("gpu"),
+        )
+
+    def setup(self):
+        self._model = torchvision.models.resnet18(pretrained=True)
+        self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self._model.to(self._device)
+
+    def predict(self, request):
+        image = base64.b64decode(request.image.encode("utf-8"))
+        image = Image.open(io.BytesIO(image))
+        transforms = torchvision.transforms.Compose(
+            [
+                torchvision.transforms.Resize(224),
+                torchvision.transforms.ToTensor(),
+                torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+            ]
+        )
+        image = transforms(image)
+        image = image.to(self._device)
+        prediction = self._model(image.unsqueeze(0))
+        return {"prediction": prediction.argmax().item()}
+
+
+class OutputData(BaseModel):
+    prediction: int
+
+
+app = L.LightningApp(PyTorchServer())
+
+
+# TODO: name confusion LoadBalancer vs. AutoScaler
+# from lightning.app.components import LoadBalancer
+# component = LoadBalancer(
+#     PyTorchServer,
+#     num_replicas=4,
+#     balance_function="predict",
+#     auto_scale=False,
+# )
+# app = L.LightningApp(component)
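+#
+# A rough sketch of how a client could call the server above once the app is
+# running; the URL, port, and the /predict route used here are assumptions:
+#
+# import base64, requests
+# with open("image.jpg", "rb") as f:
+#     body = {"image": base64.b64encode(f.read()).decode("utf-8")}
+# print(requests.post("http://127.0.0.1:7777/predict", json=body).json())
+# # for example: {"prediction": 207}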
From 072543927eff844ee4a33e618d44418f4a2ac321 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 23 Nov 2022 21:20:48 +0900
Subject: [PATCH 003/110] wip

---
 src/lightning_app/components/__init__.py      |   2 +
 src/lightning_app/components/load_balancer.py | 489 ++++++++++++++++++
 2 files changed, 491 insertions(+)
 create mode 100644 src/lightning_app/components/load_balancer.py

diff --git a/src/lightning_app/components/__init__.py b/src/lightning_app/components/__init__.py
index ee52fb55670f2..e72cd93981a5a 100644
--- a/src/lightning_app/components/__init__.py
+++ b/src/lightning_app/components/__init__.py
@@ -1,5 +1,6 @@
 from lightning_app.components.database.client import DatabaseClient
 from lightning_app.components.database.server import Database
+from lightning_app.components.load_balancer import LoadBalancer
 from lightning_app.components.multi_node import (
     LightningTrainerMultiNode,
     LiteMultiNode,
@@ -32,4 +33,5 @@
     "PyTorchLightningScriptRunner",
     "PyTorchSpawnMultiNode",
     "LightningTrainerMultiNode",
+    "LoadBalancer",
 ]

diff --git a/src/lightning_app/components/load_balancer.py b/src/lightning_app/components/load_balancer.py
new file mode 100644
index 0000000000000..d44474a9923d4
--- /dev/null
+++ b/src/lightning_app/components/load_balancer.py
@@ -0,0 +1,489 @@
+import asyncio
+import logging
+import os
+import subprocess
+import time
+import uuid
+from itertools import cycle
+from typing import Any, Dict, List, Optional, Tuple
+
+import aiohttp
+import aiohttp.client_exceptions
+import requests
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import RedirectResponse
+from pydantic import BaseModel
+
+from lightning_app.core.flow import LightningFlow
+from lightning_app.core.work import LightningWork
+from lightning_app.utilities.packaging.build_config import BuildConfig
+from lightning_app.utilities.packaging.cloud_compute import CloudCompute
+
+MIN_REPLICA = int(os.environ.get("MUSE_MIN_WORKERS", 1))
+DEVICE_TYPE = os.environ.get("MUSE_GPU_TYPE", "gpu")
+KEEP_ALIVE_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60))
+INFERENCE_REQUEST_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60))
+OPEN_PROMPTS = None
+
+
+def raise_granular_exception(exception: Exception):
+    """handle the exceptions coming from hitting the model servers."""
+    if not isinstance(exception, Exception):
+        return
+
+    if isinstance(exception, HTTPException):
+        raise exception
+
+    if isinstance(exception, aiohttp.client_exceptions.ServerDisconnectedError):
+        raise HTTPException(500, "Worker Server Disconnected")
+
+    if isinstance(exception, aiohttp.client_exceptions.ClientError):
+        logging.exception(exception)
+        raise HTTPException(500, "Worker Server error")
+
+    if isinstance(exception, asyncio.TimeoutError):
+        raise TimeoutException()
+
+    if isinstance(exception, Exception):
+        if exception.args[0] == "Server disconnected":
+            raise HTTPException(500, "Worker Server disconnected")
+
+    logging.exception(exception)
+    raise HTTPException(500, exception.args[0])
+
+
+class TimeoutException(HTTPException):
+    def __init__(self, status_code=408, detail="Request timed out.", *args, **kwargs):
+        super().__init__(status_code=status_code, detail=detail, *args, **kwargs)
+
+
+class LimitBacklogException(HTTPException):
+    def __init__(self, status_code=408, detail="Model Server has too much backlog.", *args, **kwargs):
+        super().__init__(status_code=status_code, detail=detail, *args, **kwargs)
+
+
+class SysInfo(BaseModel):
+    num_workers: int
+    servers: List[str]
+    num_requests: int
+    process_time: int
+    global_request_count: int
+
+
+class BatchRequestModel(BaseModel):
+    inputs: List[Any]
+
+
+class BatchResponse(BaseModel):
+    outputs: List[Any]
+
+
+class PrintOnce:
+    printed = False
+
+    def __call__(self, value):
+        if self.printed:
+            return
+        else:
+            print(value)
+            self.printed = True
+
+
+print_once = PrintOnce()
+
+
+def create_fastapi(title: str) -> FastAPI:
+    app = FastAPI(title=title)
+
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    app.global_request_count = 0
+    app.num_current_requests = 0
+    app.last_process_time = 0
+
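+    # Note: the middleware below counts in-flight requests for /api/predict only,
+    # and the /num-requests endpoint further down exposes that counter, which is
+    # the signal the autoscaling flow polls when deciding whether to scale.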
+    @app.middleware("http")
+    async def current_request_counter(request: Request, call_next):
+        if not request.scope["path"] == "/api/predict":
+            return await call_next(request)
+        app.global_request_count += 1
+        app.num_current_requests += 1
+        start_time = time.time()
+        response = await call_next(request)
+        process_time = time.time() - start_time
+        app.last_process_time = process_time
+        app.num_current_requests -= 1
+        return response
+
+    @app.get("/", include_in_schema=False)
+    async def docs():
+        return RedirectResponse("/docs")
+
+    @app.get("/num-requests")
+    async def num_requests() -> int:
+        return app.num_current_requests
+
+    return
+
+
+# FIXME: for debugging
+class Locust(LightningWork):
+    def __init__(self, locustfile: str, num_users: int = 10, port: int = 8089):
+        super().__init__(port=port, parallel=True, cloud_build_config=BuildConfig(requirements=["locust"]))
+        self.locustfile = locustfile
+        self.num_users = num_users
+        self.html_file = "locust_report.html"
+
+    def run(self, host: str):
+        cmd = " ".join(
+            [
+                "locust",
+                "--master-host",
+                str(self.host),
+                "--master-port",
+                str(self.port),
+                "--host",
+                str(host),
+                "-u",
+                str(self.num_users),
+                "-f",
+                str(self.locustfile),
+                "--html",
+                str(self.html_file),
+            ]
+        )
+        subprocess.Popen(cmd, shell=True).wait()
+
+
+class _LoadBalancer(LightningWork):
+    r"""The LoadBalancer is a LightningWork component that collects the requests and sends them to the prediction API
+    asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests.
+
+    The LoadBalancer exposes system endpoints with basic HTTP authentication. To activate the authentication, provide
+    a system password via an environment variable:
+    `lightning run app lb_flow.py --env MUSE_SYSTEM_PASSWORD=PASSWORD`.
+    Once enabled, requests to the private endpoints must include the username and password in the request header.
+
+    Args:
+        max_batch_size: Number of requests processed at once.
+        batch_timeout_secs: Number of seconds to wait before sending the requests to process.
+        \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
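+
+    Example (an illustrative sketch; ``MyRequest`` and ``MyResponse`` stand in
+    for your own pydantic models):
+
+        lb = _LoadBalancer(
+            input_schema=MyRequest,
+            output_schema=MyResponse,
+            worker_url="api/predict",
+            max_batch_size=8,
+            batch_timeout_secs=10,
+        )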
+ """ + + def __init__(self, input_schema, output_schema, worker_url: str, max_batch_size=8, batch_timeout_secs=10, **kwargs): + super().__init__(cloud_compute=CloudCompute("default"), **kwargs) + self._input_schema = input_schema + self._output_schema = output_schema + self._server_ready = False + self.servers = [] + self.max_batch_size = max_batch_size + self.batch_timeout_secs = batch_timeout_secs + self._ITER = None + self._batch = [] + self._responses = {} # {request_id: response} + self._last_batch_sent = 0 + self.worker_url = worker_url + + async def send_batch(self, batch: List[Tuple[str, BatchRequestModel]]): + # unit method + server = next(self._ITER) + request_data: List[_LoadBalancer._input_schema] = [b[1] for b in batch] + batch_request_data = BatchRequestModel(inputs=request_data) + + try: + async with aiohttp.ClientSession() as session: + headers = { + "accept": "application/json", + "Content-Type": "application/json", + } + async with session.post( + f"{server}/{self.worker_url}", + json=batch_request_data.dict(), + timeout=INFERENCE_REQUEST_TIMEOUT, + headers=headers, + ) as response: + if response.status == 408: + raise TimeoutException() + response.raise_for_status() + response = await response.json() + outputs = response["outputs"] + assert len(batch) == len(outputs), f"result has {len(outputs)} items but batch is {len(batch)}" + result = {request[0]: r for request, r in zip(batch, outputs)} + self._responses.update(result) + except Exception as e: + result = {request[0]: e for request in batch} + self._responses.update(result) + + async def consumer(self): + while True: + await asyncio.sleep(0.05) + + has_sent = False + + batch = self._batch[: self.max_batch_size] + while batch and ( + (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.batch_timeout_secs) + ): + has_sent = True + + asyncio.create_task(self.send_batch(batch)) + + self._batch = self._batch[self.max_batch_size :] + batch = self._batch[: self.max_batch_size] + + if has_sent: + self._last_batch_sent = time.time() + + async def process_request(self, data: BaseModel): + if not self.servers: + raise HTTPException(500, "None of the workers are healthy!") + + request_id = uuid.uuid4().hex + request: Tuple = (request_id, data) + self._batch.append(request) + + while True: + await asyncio.sleep(0.05) + + if request_id in self._responses: + result = self._responses[request_id] + del self._responses[request_id] + raise_granular_exception(result) + return result + + def run(self): + if self._server_ready: + return + + INPUT_SCHEMA = self._input_schema + OUTPUT_SCHEMA = self._output_schema + + print(self.servers) + + self._ITER = cycle(self.servers) + self._last_batch_sent = time.time() + + app = create_fastapi("Load Balancer") + app.global_request_count = 0 + app.num_current_requests = 0 + app.last_process_time = 0 + app.SEND_TASK = None + + @app.on_event("startup") + async def startup_event(): + app.SEND_TASK = asyncio.create_task(self.consumer()) + self._server_ready = True + + @app.on_event("shutdown") + def shutdown_event(): + app.SEND_TASK.cancel() + self._server_ready = False + + @app.get("/system/info", response_model=SysInfo) + async def sys_info(): + return SysInfo( + num_workers=len(self.servers), + servers=self.servers, + num_requests=app.num_current_requests, + process_time=app.last_process_time, + global_request_count=app.global_request_count, + ) + + @app.put("/system/update-servers") + async def update_servers(servers: List[str]): + self.servers = servers + self._ITER 
= cycle(self.servers) + + @app.post("/api/predict", response_model=OUTPUT_SCHEMA) + async def balance_api(inputs: INPUT_SCHEMA): + return await self.process_request(inputs) + + uvicorn.run( + app, host=self.host, port=self.port, loop="uvloop", timeout_keep_alive=KEEP_ALIVE_TIMEOUT, access_log=False + ) + + def update_servers(self, server_works: List[LightningWork]): + old_servers = set(self.servers) + server_urls: List[str] = [server.url for server in server_works if server.url] + new_servers = set(server_urls) + if new_servers == old_servers: + logging.debug("no new server added") + return + if new_servers - old_servers: + print("servers added:", new_servers - old_servers) + + deleted_servers = old_servers - new_servers + if deleted_servers: + print("deleted servers:", deleted_servers) + + headers = { + "accept": "application/json", + "username": "lightning", + } + response = requests.put(f"{self.url}/system/update-servers", json=server_urls, headers=headers, timeout=10) + response.raise_for_status() + + +# TODO: accept schema as argument +class LoadBalancer(LightningFlow): + """The MuseFlow is a LightningFlow component that handles all the servers and uses load balancer to spawn up + and shutdown based on current requests in the queue. + + Args: + min_replica: Number of works to start when app initializes. + max_replica: Max numbers of works to spawn to handle the incoming requests. + autoscale_interval: Number of seconds to wait before checking whether to upscale or downscale the works. + max_batch_size: Number of requests to process at once. + batch_timeout_secs: Number of seconds to wait before sending the requests to process. + device_type: GPU type to use for the works. + downscale_threshold: Lower limit to determine when to stop works. + upscale_threshold: Upper limit to determine when to spawn up a new work. + worker_url: Default=api/predict. 
Provide the REST API path + input_schema: + output_schema + """ + + def __init__( + self, + work_cls: type, + min_replica: int = MIN_REPLICA, + max_replica: int = 4, + autoscale_interval: int = 1 * 10, + max_batch_size: int = 8, + batch_timeout_secs: float = 2, + device_type: str = DEVICE_TYPE, + downscale_threshold: Optional[int] = None, + upscale_threshold: Optional[int] = None, + worker_url: str = None, + input_schema: Any = Dict, + output_schema: Any = Dict, + ): + super().__init__() + self._worker_count = 0 + self._work_registry = {} + + self._input_schema = input_schema + self._output_schema = output_schema + self._initial_num_workers = min_replica + self.autoscale_interval = autoscale_interval + self.max_workers = max_replica + self.downscale_threshold = downscale_threshold or min_replica + self.upscale_threshold = upscale_threshold or min_replica * max_batch_size + self.fake_trigger = 0 + self.gpu_type = device_type + self._last_autoscale = time.time() + + worker_url = worker_url or "api/predict" + self.load_balancer = _LoadBalancer( + input_schema=self._input_schema, + output_schema=self._output_schema, + worker_url=worker_url, + max_batch_size=max_batch_size, + batch_timeout_secs=batch_timeout_secs, + cache_calls=True, + parallel=True, + ) + for i in range(min_replica): + work = self.create_worker() + self.add_work(work) + + self.load_test = None + if os.environ.get("LOAD_TEST", False): + self.load_test = Locust("scripts/locustfile.py") + + print( + f"LB initialized with min replica={min_replica}, " + f"max_replica={max_replica}, " + f"batch timeout={batch_timeout_secs}, " + f"batch size={max_batch_size}" + ) + + @property + def workers(self) -> List[LightningWork]: + works = [] + for i in range(self._worker_count): + work = self.get_work(i) + works.append(work) + return works + + def create_worker(self, *args, **kwargs) -> LightningWork: + """implement.""" + + def add_work(self, work) -> str: + work_attribute = uuid.uuid4().hex + work_attribute = f"worker_{self._worker_count}_{str(work_attribute)}" + setattr(self, work_attribute, work) + self._work_registry[self._worker_count] = work_attribute + self._worker_count += 1 + return work_attribute + + def remove_work(self, index: int) -> str: + work_attribute = self._work_registry[index] + del self._work_registry[index] + work = getattr(self, work_attribute) + work.stop() + self._worker_count -= 1 + return work_attribute + + def get_work(self, index: int): + work_attribute = self._work_registry[index] + work = getattr(self, work_attribute) + return work + + def run(self): + if not self.load_balancer.is_running: + self.load_balancer.run() + + for worker in self.workers: + worker.run() + + if self.load_balancer.url: + print_once(f"load balancer = {self.load_balancer.url}") + self.fake_trigger += 1 + self.autoscale() + + if self.load_test: + self.load_test.run(self.load_balancer.url) + + def autoscale(self): + """Upscale and down scale model inference works based on the number of requests.""" + if time.time() - self._last_autoscale < self.autoscale_interval: + return + + self.load_balancer.update_servers(self.workers) + + num_requests = int(requests.get(f"{self.load_balancer.url}/num-requests").json()) + num_workers = len(self.workers) + + # upscale + if num_requests > self.upscale_threshold and num_workers < self.max_workers: + idx = self._worker_count + print(f"Upscale to {self._worker_count + 1}") + work = self.create_worker() + new_work_id = self.add_work(work) + print("new work id:", new_work_id) + + # downscale + elif num_requests 
< self.downscale_threshold and num_workers > self._initial_num_workers: + idx = self._worker_count - 1 + print(f"Downscale to {idx}") + print("prev num servers:", len(self.workers)) + removed_id = self.remove_work(idx) + print("removed:", removed_id) + print("new num servers:", len(self.workers)) + + self.load_balancer.update_servers(self.workers) + self._last_autoscale = time.time() + + def configure_layout(self): + tabs = [{"name": "Swagger", "content": self.load_balancer.url}] + if self.load_test: + tabs.append({"name": "Load test", "content": self.load_test.url}) + return tabs From 9cc237f3198346716012347c43d8dc7c22dabc84 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 21:23:16 +0900 Subject: [PATCH 004/110] Update example --- examples/app_server_with_load_balancer/app.py | 82 ++++++++++++------- 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/examples/app_server_with_load_balancer/app.py b/examples/app_server_with_load_balancer/app.py index a5e24eac335fb..0f2f9cb79df15 100644 --- a/examples/app_server_with_load_balancer/app.py +++ b/examples/app_server_with_load_balancer/app.py @@ -1,33 +1,44 @@ -# !pip install torchvision pydantic import base64 import io +import logging +from typing import Any, List import torch import torchvision -from PIL import Image +from PIL import Image as PILImage from pydantic import BaseModel import lightning as L -from lightning.app.components.serve import Image as InputImage +from lightning.app.components import AutoScaler from lightning.app.components.serve import PythonServer +from lightning.app.utilities.network import find_free_network_port +logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=logging.INFO, +) + + +class RequestModel(BaseModel): + image: str + + +class BatchRequestModel(BaseModel): + inputs: List[RequestModel] -class PyTorchServer(PythonServer): - def __init__(self): - super().__init__( - input_type=InputImage, - output_type=OutputData, - cloud_compute=L.CloudCompute("gpu"), - ) +class BatchResponse(BaseModel): + outputs: List[Any] + + +class PyTorchServer(PythonServer): def setup(self): self._model = torchvision.models.resnet18(pretrained=True) self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self._model.to(self._device) - def predict(self, request): - image = base64.b64decode(request.image.encode("utf-8")) - image = Image.open(io.BytesIO(image)) + def predict(self, requests: BatchRequestModel): transforms = torchvision.transforms.Compose( [ torchvision.transforms.Resize(224), @@ -35,25 +46,34 @@ def predict(self, request): torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ] ) - image = transforms(image) - image = image.to(self._device) - prediction = self._model(image.unsqueeze(0)) - return {"prediction": prediction.argmax().item()} - + images = [] + for request in requests.inputs: + image = base64.b64decode(request.image.encode("utf-8")) + image = PILImage.open(io.BytesIO(image)) + image = transforms(image).unsqueeze(0) + images.append(image) + images = torch.cat(images) + images = images.to(self._device) + predictions = self._model(images) + results = predictions.argmax(1).cpu().numpy().tolist() + return BatchResponse(outputs=[{"prediction": e} for e in results]) -class OutputData(BaseModel): - prediction: int - -app = L.LightningApp(PyTorchServer()) +class RootFlow(AutoScaler): + def create_worker(self, *args, **kwargs) -> L.LightningWork: + 
return PyTorchServer( + port=find_free_network_port(), + input_type=BatchRequestModel, + output_type=BatchResponse, + cloud_compute=L.CloudCompute("gpu"), + ) -# TODO: name confusion LoadBalancer vs. AutoScaler -# from lightning.app.components import LoadBalancer -# component = LoadBalancer( -# PyTorchServer, -# num_replicas=4, -# balance_function="predict", -# auto_scale=False, -# ) -# app = L.LightningApp(component) +app = L.LightningApp( + RootFlow( + input_schema=RequestModel, + output_schema=Any, + batch_timeout_secs=0.1, + worker_url="predict", + ) +) From 9594371a506a7cceab8c1e5510678e0a4628a522 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 21:39:47 +0900 Subject: [PATCH 005/110] rename --- .../app.py | 0 src/lightning_app/components/__init__.py | 4 ++-- .../components/{load_balancer.py => auto_scaler.py} | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) rename examples/{app_server_with_load_balancer => app_server_with_auto_scaler}/app.py (100%) rename src/lightning_app/components/{load_balancer.py => auto_scaler.py} (98%) diff --git a/examples/app_server_with_load_balancer/app.py b/examples/app_server_with_auto_scaler/app.py similarity index 100% rename from examples/app_server_with_load_balancer/app.py rename to examples/app_server_with_auto_scaler/app.py diff --git a/src/lightning_app/components/__init__.py b/src/lightning_app/components/__init__.py index e72cd93981a5a..ca47c36071dae 100644 --- a/src/lightning_app/components/__init__.py +++ b/src/lightning_app/components/__init__.py @@ -1,6 +1,6 @@ +from lightning_app.components.auto_scaler import AutoScaler from lightning_app.components.database.client import DatabaseClient from lightning_app.components.database.server import Database -from lightning_app.components.load_balancer import LoadBalancer from lightning_app.components.multi_node import ( LightningTrainerMultiNode, LiteMultiNode, @@ -16,6 +16,7 @@ from lightning_app.components.training import LightningTrainerScript, PyTorchLightningScriptRunner __all__ = [ + "AutoScaler", "DatabaseClient", "Database", "PopenPythonScript", @@ -33,5 +34,4 @@ "PyTorchLightningScriptRunner", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode", - "LoadBalancer", ] diff --git a/src/lightning_app/components/load_balancer.py b/src/lightning_app/components/auto_scaler.py similarity index 98% rename from src/lightning_app/components/load_balancer.py rename to src/lightning_app/components/auto_scaler.py index d44474a9923d4..4c067c6b58b48 100644 --- a/src/lightning_app/components/load_balancer.py +++ b/src/lightning_app/components/auto_scaler.py @@ -130,7 +130,7 @@ async def docs(): async def num_requests() -> int: return app.num_current_requests - return + return app # FIXME: for debugging @@ -331,10 +331,9 @@ def update_servers(self, server_works: List[LightningWork]): response.raise_for_status() -# TODO: accept schema as argument -class LoadBalancer(LightningFlow): - """The MuseFlow is a LightningFlow component that handles all the servers and uses load balancer to spawn up - and shutdown based on current requests in the queue. +class AutoScaler(LightningFlow): + """A LightningFlow component that handles all the servers and uses load balancer to spawn up and shutdown based + on current requests in the queue. Args: min_replica: Number of works to start when app initializes. 
@@ -352,7 +351,6 @@ class LoadBalancer(LightningFlow):
     def __init__(
         self,
-        work_cls: type,
         min_replica: int = MIN_REPLICA,
         max_replica: int = 4,
         autoscale_interval: int = 1 * 10,

From fc64cebcc30c1919d85ff4581177cf24e11d5714 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Thu, 24 Nov 2022 21:56:45 +0900
Subject: [PATCH 006/110] remove prints

---
 examples/app_server_with_auto_scaler/app.py |  7 ----
 src/lightning_app/components/auto_scaler.py | 39 ++++++++-------------
 2 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py
index 0f2f9cb79df15..66be47e9351d0 100644
--- a/examples/app_server_with_auto_scaler/app.py
+++ b/examples/app_server_with_auto_scaler/app.py
@@ -1,6 +1,5 @@
 import base64
 import io
-import logging
 from typing import Any, List
 
 import torch
@@ -13,12 +12,6 @@
 from lightning.app.components.serve import PythonServer
 from lightning.app.utilities.network import find_free_network_port
 
-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-
 
 class RequestModel(BaseModel):
     image: str

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 4c067c6b58b48..10240b5ecad42 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -18,6 +18,7 @@
 from lightning_app.core.flow import LightningFlow
 from lightning_app.core.work import LightningWork
+from lightning_app.utilities.app_helpers import Logger
 from lightning_app.utilities.packaging.build_config import BuildConfig
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
 
@@ -28,6 +29,9 @@
 OPEN_PROMPTS = None
 
+logger = Logger(__name__)
+
+
 def raise_granular_exception(exception: Exception):
     """handle the exceptions coming from hitting the model servers."""
     if not isinstance(exception, Exception):
         return
@@ -80,20 +84,6 @@ class BatchResponse(BaseModel):
     outputs: List[Any]
 
 
-class PrintOnce:
-    printed = False
-
-    def __call__(self, value):
-        if self.printed:
-            return
-        else:
-            print(value)
-            self.printed = True
-
-
-print_once = PrintOnce()
-
-
 def create_fastapi(title: str) -> FastAPI:
     app = FastAPI(title=title)
 
@@ -265,7 +255,7 @@ def run(self):
         INPUT_SCHEMA = self._input_schema
         OUTPUT_SCHEMA = self._output_schema
 
-        print(self.servers)
+        logger.info(f"servers: {self.servers}")
 
         self._ITER = cycle(self.servers)
         self._last_batch_sent = time.time()
@@ -317,11 +307,11 @@ def update_servers(self, server_works: List[LightningWork]):
             logging.debug("no new server added")
             return
         if new_servers - old_servers:
-            print("servers added:", new_servers - old_servers)
+            logger.info(f"servers added: {new_servers - old_servers}")
 
         deleted_servers = old_servers - new_servers
         if deleted_servers:
-            print("deleted servers:", deleted_servers)
+            logger.info(f"servers deleted: {deleted_servers}")
 
         headers = {
             "accept": "application/json",
             "username": "lightning",
@@ -386,7 +376,7 @@
         if os.environ.get("LOAD_TEST", False):
             self.load_test = Locust("scripts/locustfile.py")
 
-        print(
+        logger.info(
             f"LB initialized with min replica={min_replica}, "
             f"max_replica={max_replica}, "
             f"batch timeout={batch_timeout_secs}, "
             f"batch size={max_batch_size}"
         )
@@ -433,7 +423,6 @@ def run(self):
             worker.run()
 
         if self.load_balancer.url:
-            print_once(f"load balancer = {self.load_balancer.url}")
             self.fake_trigger += 1
             self.autoscale()
 
@@ -443,19 +432,19 @@ def autoscale(self):
         """Upscale and down scale model inference works based on the number of requests."""
         if time.time() - self._last_autoscale < self.autoscale_interval:
             return
 
         self.load_balancer.update_servers(self.workers)
 
         num_requests = int(requests.get(f"{self.load_balancer.url}/num-requests").json())
         num_workers = len(self.workers)
 
         # upscale
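+        # Illustrative note, assuming the defaults above: upscale_threshold is
+        # min_replica * max_batch_size, so a new work is spawned only once the
+        # in-flight requests exceed what all replicas can absorb in one batch.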
         if num_requests > self.upscale_threshold and num_workers < self.max_workers:
             idx = self._worker_count
-            print(f"Upscale to {self._worker_count + 1}")
+            logger.info(f"Upscale to {self._worker_count + 1}")
             work = self.create_worker()
             new_work_id = self.add_work(work)
-            print("new work id:", new_work_id)
+            logger.info(f"new work id: {new_work_id}")
 
         # downscale
         elif num_requests < self.downscale_threshold and num_workers > self._initial_num_workers:
             idx = self._worker_count - 1
-            print(f"Downscale to {idx}")
-            print("prev num servers:", len(self.workers))
+            logger.info(f"Downscale to {idx}")
+            logger.info(f"prev num servers: {len(self.workers)}")
             removed_id = self.remove_work(idx)
-            print("removed:", removed_id)
-            print("new num servers:", len(self.workers))
+            logger.info(f"removed: {removed_id}")
+            logger.info(f"new num servers: {len(self.workers)}")
 
         self.load_balancer.update_servers(self.workers)
         self._last_autoscale = time.time()

From c4b5ac069ce89742f167c85f998d7c327532d467 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Thu, 24 Nov 2022 22:01:44 +0900
Subject: [PATCH 007/110] _LoadBalancer -> LoadBalancer

---
 src/lightning_app/components/auto_scaler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 10240b5ecad42..d70cd90713d84 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -152,7 +152,7 @@ def run(self, host: str):
         subprocess.Popen(cmd, shell=True).wait()
 
 
-class _LoadBalancer(LightningWork):
+class LoadBalancer(LightningWork):
     r"""The LoadBalancer is a LightningWork component that collects the requests and sends them to the prediction API
     asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests.
@@ -184,7 +184,7 @@ def __init__(self, input_schema, output_schema, worker_url: str, max_batch_size= async def send_batch(self, batch: List[Tuple[str, BatchRequestModel]]): # unit method server = next(self._ITER) - request_data: List[_LoadBalancer._input_schema] = [b[1] for b in batch] + request_data: List[LoadBalancer._input_schema] = [b[1] for b in batch] batch_request_data = BatchRequestModel(inputs=request_data) try: @@ -369,7 +369,7 @@ def __init__( self._last_autoscale = time.time() worker_url = worker_url or "api/predict" - self.load_balancer = _LoadBalancer( + self.load_balancer = LoadBalancer( input_schema=self._input_schema, output_schema=self._output_schema, worker_url=worker_url, From 4c61501b58ea2c5500c693b63a38ef682c45b040 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 22:29:31 +0900 Subject: [PATCH 008/110] AutoScaler(work) --- examples/app_server_with_auto_scaler/app.py | 21 ++++++++++----------- src/lightning_app/components/auto_scaler.py | 5 ++++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 66be47e9351d0..be04d009d0660 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -26,6 +26,14 @@ class BatchResponse(BaseModel): class PyTorchServer(PythonServer): + def __init__(self, *args, **kwargs): + super().__init__( + port=find_free_network_port(), + input_type=BatchRequestModel, + output_type=BatchResponse, + cloud_compute=L.CloudCompute("gpu"), + ) + def setup(self): self._model = torchvision.models.resnet18(pretrained=True) self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -52,18 +60,9 @@ def predict(self, requests: BatchRequestModel): return BatchResponse(outputs=[{"prediction": e} for e in results]) -class RootFlow(AutoScaler): - def create_worker(self, *args, **kwargs) -> L.LightningWork: - return PyTorchServer( - port=find_free_network_port(), - input_type=BatchRequestModel, - output_type=BatchResponse, - cloud_compute=L.CloudCompute("gpu"), - ) - - app = L.LightningApp( - RootFlow( + AutoScaler( + PyTorchServer, input_schema=RequestModel, output_schema=Any, batch_timeout_secs=0.1, diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index d70cd90713d84..f32d5ef6a7cbc 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -341,6 +341,7 @@ class AutoScaler(LightningFlow): def __init__( self, + work_cls: type, min_replica: int = MIN_REPLICA, max_replica: int = 4, autoscale_interval: int = 1 * 10, @@ -357,6 +358,7 @@ def __init__( self._worker_count = 0 self._work_registry = {} + self._work_cls = work_cls self._input_schema = input_schema self._output_schema = output_schema self._initial_num_workers = min_replica @@ -378,7 +380,7 @@ def __init__( cache_calls=True, parallel=True, ) - for i in range(min_replica): + for _ in range(min_replica): work = self.create_worker() self.add_work(work) @@ -403,6 +405,7 @@ def workers(self) -> List[LightningWork]: def create_worker(self, *args, **kwargs) -> LightningWork: """implement.""" + return self._work_cls() def add_work(self, work) -> str: work_attribute = uuid.uuid4().hex From dc72f1a2b6f081cb76e0de1db89cd6316ef5b1d5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 22:33:52 +0900 Subject: [PATCH 009/110] change var name --- src/lightning_app/components/auto_scaler.py | 65 
+++++++++++---------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index f32d5ef6a7cbc..9031ccef9b60f 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -85,9 +85,9 @@ class BatchResponse(BaseModel): def create_fastapi(title: str) -> FastAPI: - app = FastAPI(title=title) + fastapi_app = FastAPI(title=title) - app.add_middleware( + fastapi_app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, @@ -95,32 +95,32 @@ def create_fastapi(title: str) -> FastAPI: allow_headers=["*"], ) - app.global_request_count = 0 - app.num_current_requests = 0 - app.last_process_time = 0 + fastapi_app.global_request_count = 0 + fastapi_app.num_current_requests = 0 + fastapi_app.last_process_time = 0 - @app.middleware("http") + @fastapi_app.middleware("http") async def current_request_counter(request: Request, call_next): if not request.scope["path"] == "/api/predict": return await call_next(request) - app.global_request_count += 1 - app.num_current_requests += 1 + fastapi_app.global_request_count += 1 + fastapi_app.num_current_requests += 1 start_time = time.time() response = await call_next(request) process_time = time.time() - start_time - app.last_process_time = process_time - app.num_current_requests -= 1 + fastapi_app.last_process_time = process_time + fastapi_app.num_current_requests -= 1 return response - @app.get("/", include_in_schema=False) + @fastapi_app.get("/", include_in_schema=False) async def docs(): return RedirectResponse("/docs") - @app.get("/num-requests") + @fastapi_app.get("/num-requests") async def num_requests() -> int: - return app.num_current_requests + return fastapi_app.num_current_requests - return app + return fastapi_app # FIXME: for debugging @@ -260,43 +260,48 @@ def run(self): self._ITER = cycle(self.servers) self._last_batch_sent = time.time() - app = create_fastapi("Load Balancer") - app.global_request_count = 0 - app.num_current_requests = 0 - app.last_process_time = 0 - app.SEND_TASK = None + fastapi_app = create_fastapi("Load Balancer") + fastapi_app.global_request_count = 0 + fastapi_app.num_current_requests = 0 + fastapi_app.last_process_time = 0 + fastapi_app.SEND_TASK = None - @app.on_event("startup") + @fastapi_app.on_event("startup") async def startup_event(): - app.SEND_TASK = asyncio.create_task(self.consumer()) + fastapi_app.SEND_TASK = asyncio.create_task(self.consumer()) self._server_ready = True - @app.on_event("shutdown") + @fastapi_app.on_event("shutdown") def shutdown_event(): - app.SEND_TASK.cancel() + fastapi_app.SEND_TASK.cancel() self._server_ready = False - @app.get("/system/info", response_model=SysInfo) + @fastapi_app.get("/system/info", response_model=SysInfo) async def sys_info(): return SysInfo( num_workers=len(self.servers), servers=self.servers, - num_requests=app.num_current_requests, - process_time=app.last_process_time, - global_request_count=app.global_request_count, + num_requests=fastapi_app.num_current_requests, + process_time=fastapi_app.last_process_time, + global_request_count=fastapi_app.global_request_count, ) - @app.put("/system/update-servers") + @fastapi_app.put("/system/update-servers") async def update_servers(servers: List[str]): self.servers = servers self._ITER = cycle(self.servers) - @app.post("/api/predict", response_model=OUTPUT_SCHEMA) + @fastapi_app.post("/api/predict", response_model=OUTPUT_SCHEMA) async def balance_api(inputs: 
INPUT_SCHEMA):
             return await self.process_request(inputs)
 
         uvicorn.run(
-            app, host=self.host, port=self.port, loop="uvloop", timeout_keep_alive=KEEP_ALIVE_TIMEOUT, access_log=False
+            fastapi_app,
+            host=self.host,
+            port=self.port,
+            loop="uvloop",
+            timeout_keep_alive=KEEP_ALIVE_TIMEOUT,
+            access_log=False,
         )
 
     def update_servers(self, server_works: List[LightningWork]):

From 57943ac4f48f2c78e73159d480396fae8460f45b Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Thu, 24 Nov 2022 22:35:56 +0900
Subject: [PATCH 010/110] remove locust

---
 src/lightning_app/components/auto_scaler.py | 36 ---------------------
 1 file changed, 36 deletions(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 9031ccef9b60f..0d598e357372e 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -1,7 +1,6 @@
 import asyncio
 import logging
 import os
-import subprocess
 import time
 import uuid
 from itertools import cycle
@@ -19,7 +18,6 @@
 from lightning_app.core.flow import LightningFlow
 from lightning_app.core.work import LightningWork
 from lightning_app.utilities.app_helpers import Logger
-from lightning_app.utilities.packaging.build_config import BuildConfig
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
 
@@ -123,35 +120,6 @@ async def num_requests() -> int:
         return fastapi_app.num_current_requests
 
 
-# FIXME: for debugging
-class Locust(LightningWork):
-    def __init__(self, locustfile: str, num_users: int = 10, port: int = 8089):
-        super().__init__(port=port, parallel=True, cloud_build_config=BuildConfig(requirements=["locust"]))
-        self.locustfile = locustfile
-        self.num_users = num_users
-        self.html_file = "locust_report.html"
-
-    def run(self, host: str):
-        cmd = " ".join(
-            [
-                "locust",
-                "--master-host",
-                str(self.host),
-                "--master-port",
-                str(self.port),
-                "--host",
-                str(host),
-                "-u",
-                str(self.num_users),
-                "-f",
-                str(self.locustfile),
-                "--html",
-                str(self.html_file),
-            ]
-        )
-        subprocess.Popen(cmd, shell=True).wait()
-
-
 class LoadBalancer(LightningWork):
     r"""The LoadBalancer is a LightningWork component that collects the requests and sends them to the prediction API
     asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests.
@@ -389,10 +357,6 @@ def __init__( work = self.create_worker() self.add_work(work) - self.load_test = None - if os.environ.get("LOAD_TEST", False): - self.load_test = Locust("scripts/locustfile.py") - logger.info( f"LB initialized with min replica={min_replica}, " f"max_replica={max_replica}, " From b6a9918f0db003a9ea1a81e897223850914c6861 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 22:50:54 +0900 Subject: [PATCH 011/110] Update docs --- docs/source-app/api_references.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 30e0ade3a25ad..317c53c076a91 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -32,11 +32,19 @@ ___________________ :nosignatures: :template: classtemplate_no_index.rst + ~database.client.DatabaseClient + ~database.server.Database ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript ~training.LightningTrainerScript ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI + ~serve.python_server.PythonServer + ~serve.stream_lit.ServeSteamLit + ~multi_node.base.MultiNode + ~multi_node.lite.LiteMultiNode + ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode + ~multi_node.trainer.LightningTrainerMultiNode ---- From 16f73339e5a8633bd5e38aa84d60ed37dd322eb0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 22:53:44 +0900 Subject: [PATCH 012/110] include autoscaler in api ref --- docs/source-app/api_references.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 317c53c076a91..e7ffa249e1a7e 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -45,6 +45,7 @@ ___________________ ~multi_node.lite.LiteMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode ~multi_node.trainer.LightningTrainerMultiNode + ~auto_scaler.AutoScaler ---- From cd9929c748b100466a64dedc56fed22940324578 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 23:07:52 +0900 Subject: [PATCH 013/110] docs typo --- docs/source-app/api_references.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index e7ffa249e1a7e..2d7f7d98612ce 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -40,7 +40,7 @@ ___________________ ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI ~serve.python_server.PythonServer - ~serve.stream_lit.ServeSteamLit + ~serve.streamlit.ServeSteamLit ~multi_node.base.MultiNode ~multi_node.lite.LiteMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode From f33874ed41142dee0a04f0ef0b0eb75cdbeb9b2a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 23:07:52 +0900 Subject: [PATCH 014/110] docs typo --- docs/source-app/api_references.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 317c53c076a91..dce66c7bd9507 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -40,7 +40,7 @@ ___________________ ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI ~serve.python_server.PythonServer - ~serve.stream_lit.ServeSteamLit + ~serve.streamlit.ServeSteamLit ~multi_node.base.MultiNode ~multi_node.lite.LiteMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode From 12d12b466e5d6682fbd4ed0b34a08c0200e3e806 Mon Sep 17 
00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 23:12:42 +0900 Subject: [PATCH 015/110] docs typo --- docs/source-app/api_references.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 2d7f7d98612ce..03396f11ba374 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -40,7 +40,7 @@ ___________________ ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI ~serve.python_server.PythonServer - ~serve.streamlit.ServeSteamLit + ~serve.streamlit.ServeStreamlit ~multi_node.base.MultiNode ~multi_node.lite.LiteMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode From 656b0b6fd4191f4dcb0b4ed48352618332db5d1d Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 24 Nov 2022 23:12:42 +0900 Subject: [PATCH 016/110] docs typo --- docs/source-app/api_references.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index dce66c7bd9507..8808d58e71dcc 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -40,7 +40,7 @@ ___________________ ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI ~serve.python_server.PythonServer - ~serve.streamlit.ServeSteamLit + ~serve.streamlit.ServeStreamlit ~multi_node.base.MultiNode ~multi_node.lite.LiteMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode From a5859f76be49fd0905367f81ffb6f7a64f7cc160 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 00:02:03 +0900 Subject: [PATCH 017/110] remove unused loadtest --- src/lightning_app/components/auto_scaler.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 0d598e357372e..2528093dd7ff7 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -408,9 +408,6 @@ def run(self): self.fake_trigger += 1 self.autoscale() - if self.load_test: - self.load_test.run(self.load_balancer.url) - def autoscale(self): """Upscale and down scale model inference works based on the number of requests.""" if time.time() - self._last_autoscale < self.autoscale_interval: @@ -443,6 +440,4 @@ def autoscale(self): def configure_layout(self): tabs = [{"name": "Swagger", "content": self.load_balancer.url}] - if self.load_test: - tabs.append({"name": "Load test", "content": self.load_test.url}) return tabs From 1bdf1bc0a3bbf85cb8ca9a52e1b4f57d6b7a6654 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 00:03:29 +0900 Subject: [PATCH 018/110] remove unused device_type --- src/lightning_app/components/auto_scaler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 2528093dd7ff7..fa21cd559a68e 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -21,7 +21,6 @@ from lightning_app.utilities.packaging.cloud_compute import CloudCompute MIN_REPLICA = int(os.environ.get("MUSE_MIN_WORKERS", 1)) -DEVICE_TYPE = os.environ.get("MUSE_GPU_TYPE", "gpu") KEEP_ALIVE_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60)) INFERENCE_REQUEST_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60)) OPEN_PROMPTS = None @@ -320,7 +319,6 @@ def __init__( autoscale_interval: int = 1 * 10, max_batch_size: int = 8, batch_timeout_secs: float = 2, - device_type: 
str = DEVICE_TYPE, downscale_threshold: Optional[int] = None, upscale_threshold: Optional[int] = None, worker_url: str = None, @@ -340,7 +338,6 @@ def __init__( self.downscale_threshold = downscale_threshold or min_replica self.upscale_threshold = upscale_threshold or min_replica * max_batch_size self.fake_trigger = 0 - self.gpu_type = device_type self._last_autoscale = time.time() worker_url = worker_url or "api/predict" From 1cb366f1bfe370662525a4adaca869088dde8ef2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 00:27:28 +0900 Subject: [PATCH 019/110] clean up --- src/lightning_app/components/auto_scaler.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index fa21cd559a68e..235b3e9b07e53 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -59,12 +59,7 @@ def __init__(self, status_code=408, detail="Request timed out.", *args, **kwargs super().__init__(status_code=status_code, detail=detail, *args, **kwargs) -class LimitBacklogException(HTTPException): - def __init__(self, status_code=408, detail="Model Server has too much backlog.", *args, **kwargs): - super().__init__(status_code=status_code, detail=detail, *args, **kwargs) - - -class SysInfo(BaseModel): +class _SysInfo(BaseModel): num_workers: int servers: List[str] num_requests: int @@ -243,9 +238,9 @@ def shutdown_event(): fastapi_app.SEND_TASK.cancel() self._server_ready = False - @fastapi_app.get("/system/info", response_model=SysInfo) + @fastapi_app.get("/system/info", response_model=_SysInfo) async def sys_info(): - return SysInfo( + return _SysInfo( num_workers=len(self.servers), servers=self.servers, num_requests=fastapi_app.num_current_requests, From fb4d2e56e2bb9a075890a7aef84aa1b57754a2ec Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 00:28:53 +0900 Subject: [PATCH 020/110] clean up --- src/lightning_app/components/auto_scaler.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 235b3e9b07e53..85bc1289d2cc9 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -28,7 +28,7 @@ logger = Logger(__name__) -def raise_granular_exception(exception: Exception): +def _raise_granular_exception(exception: Exception): """handle the exceptions coming from hitting the model servers.""" if not isinstance(exception, Exception): return @@ -67,14 +67,10 @@ class _SysInfo(BaseModel): global_request_count: int -class BatchRequestModel(BaseModel): +class _BatchRequestModel(BaseModel): inputs: List[Any] -class BatchResponse(BaseModel): - outputs: List[Any] - - def create_fastapi(title: str) -> FastAPI: fastapi_app = FastAPI(title=title) @@ -143,11 +139,11 @@ def __init__(self, input_schema, output_schema, worker_url: str, max_batch_size= self._last_batch_sent = 0 self.worker_url = worker_url - async def send_batch(self, batch: List[Tuple[str, BatchRequestModel]]): + async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): # unit method server = next(self._ITER) request_data: List[LoadBalancer._input_schema] = [b[1] for b in batch] - batch_request_data = BatchRequestModel(inputs=request_data) + batch_request_data = _BatchRequestModel(inputs=request_data) try: async with aiohttp.ClientSession() as session: @@ -207,7 +203,7 @@ async def 
process_request(self, data: BaseModel):
             if request_id in self._responses:
                 result = self._responses[request_id]
                 del self._responses[request_id]
-                raise_granular_exception(result)
+                _raise_granular_exception(result)
                 return result

From c0ba3515a83d829f3bf5b7474b424a333ef711b9 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Fri, 25 Nov 2022 00:29:42 +0900
Subject: [PATCH 021/110] clean up

---
 src/lightning_app/components/auto_scaler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 85bc1289d2cc9..bb203aca92263 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -71,7 +71,7 @@ class _BatchRequestModel(BaseModel):
     inputs: List[Any]
 
 
-def create_fastapi(title: str) -> FastAPI:
+def _create_fastapi(title: str) -> FastAPI:
     fastapi_app = FastAPI(title=title)
 
     fastapi_app.add_middleware(
@@ -218,7 +218,7 @@ def run(self):
         self._ITER = cycle(self.servers)
         self._last_batch_sent = time.time()
 
-        fastapi_app = create_fastapi("Load Balancer")
+        fastapi_app = _create_fastapi("Load Balancer")
         fastapi_app.global_request_count = 0
         fastapi_app.num_current_requests = 0
         fastapi_app.last_process_time = 0

From 666918b3cee7f9e578dcd2adc9ae33c9ff900345 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Fri, 25 Nov 2022 00:37:56 +0900
Subject: [PATCH 022/110] Add docstring

---
 src/lightning_app/components/auto_scaler.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index bb203aca92263..97f8f00c24431 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -263,6 +263,10 @@ async def balance_api(inputs: INPUT_SCHEMA):
         )
 
     def update_servers(self, server_works: List[LightningWork]):
+        """Updates the works that the load balancer distributes requests to.
+
+        AutoScaler uses this method to increase/decrease the number of works.
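+
+        For example, the autoscaling flow calls
+        ``self.load_balancer.update_servers(self.workers)`` right after it adds
+        or removes a work.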
+ """ old_servers = set(self.servers) server_urls: List[str] = [server.url for server in server_works if server.url] new_servers = set(server_urls) From 6f0f43f7e28531f80c53c56db8110092b35825d7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 00:50:36 +0900 Subject: [PATCH 023/110] type --- src/lightning_app/components/auto_scaler.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 97f8f00c24431..315ec5ed0eb84 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -28,7 +28,7 @@ logger = Logger(__name__) -def _raise_granular_exception(exception: Exception): +def _raise_granular_exception(exception: Exception) -> None: """Handle an exception from hitting the model servers.""" if not isinstance(exception, Exception): return @@ -55,7 +55,7 @@ def _raise_granular_exception(exception: Exception): class TimeoutException(HTTPException): - def __init__(self, status_code=408, detail="Request timed out.", *args, **kwargs): + def __init__(self, status_code: int = 408, detail: str = "Request timed out.", *args: Any, **kwargs: Any) -> None: super().__init__(status_code=status_code, detail=detail, *args, **kwargs) @@ -125,7 +125,9 @@ class LoadBalancer(LightningWork): \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc. """ - def __init__(self, input_schema, output_schema, worker_url: str, max_batch_size=8, batch_timeout_secs=10, **kwargs): + def __init__( + self, input_schema, output_schema, worker_url: str, max_batch_size=8, batch_timeout_secs=10, **kwargs + ) -> None: super().__init__(cloud_compute=CloudCompute("default"), **kwargs) self._input_schema = input_schema self._output_schema = output_schema @@ -319,7 +321,7 @@ def __init__( worker_url: str = None, input_schema: Any = Dict, output_schema: Any = Dict, - ): + ) -> None: super().__init__() self._worker_count = 0 self._work_registry = {} @@ -384,7 +386,7 @@ def remove_work(self, index: int) -> str: self._worker_count -= 1 return work_attribute - def get_work(self, index: int): + def get_work(self, index: int) -> LightningWork: work_attribute = self._work_registry[index] work = getattr(self, work_attribute) return work From 9cda54473d97047b2f3ccf8641e90675c2be9c21 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 19:39:16 +0900 Subject: [PATCH 024/110] env vars to args --- src/lightning_app/components/auto_scaler.py | 24 ++++++++++++--------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 315ec5ed0eb84..e29f505817960 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -1,6 +1,5 @@ import asyncio import logging -import os import time import uuid from itertools import cycle @@ -20,11 +19,6 @@ from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.packaging.cloud_compute import CloudCompute -MIN_REPLICA = int(os.environ.get("MUSE_MIN_WORKERS", 1)) -KEEP_ALIVE_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60)) -INFERENCE_REQUEST_TIMEOUT = float(os.environ.get("KEEP_ALIVE_TIMEOUT", 60)) -OPEN_PROMPTS = None - logger = Logger(__name__) @@ -126,12 +120,22 @@ class LoadBalancer(LightningWork): """ def __init__( - self, input_schema, output_schema, worker_url: str, max_batch_size=8, 
batch_timeout_secs=10, **kwargs + self, + input_schema, + output_schema, + worker_url: str, + max_batch_size=8, + batch_timeout_secs=10, + timeout_keep_alive=60, + timeout_inference_request=60, + **kwargs, ) -> None: super().__init__(cloud_compute=CloudCompute("default"), **kwargs) self._input_schema = input_schema self._output_schema = output_schema self._server_ready = False + self._timeout_keep_alive = timeout_keep_alive + self._timeout_inference_request = timeout_inference_request self.servers = [] self.max_batch_size = max_batch_size self.batch_timeout_secs = batch_timeout_secs @@ -156,7 +160,7 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): async with session.post( f"{server}/{self.worker_url}", json=batch_request_data.dict(), - timeout=INFERENCE_REQUEST_TIMEOUT, + timeout=self._timeout_inference_request, headers=headers, ) as response: if response.status == 408: @@ -260,7 +264,7 @@ async def balance_api(inputs: INPUT_SCHEMA): host=self.host, port=self.port, loop="uvloop", - timeout_keep_alive=KEEP_ALIVE_TIMEOUT, + timeout_keep_alive=self._timeout_keep_alive, access_log=False, ) @@ -311,7 +315,7 @@ class AutoScaler(LightningFlow): def __init__( self, work_cls: type, - min_replica: int = MIN_REPLICA, + min_replica: int = 1, max_replica: int = 4, autoscale_interval: int = 1 * 10, max_batch_size: int = 8, From 609eb1093856e672becccd1991465e21ec02a717 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 21:13:05 +0900 Subject: [PATCH 025/110] expose an API for users to override to customise autoscaling logic --- src/lightning_app/components/auto_scaler.py | 80 +++++++++++++-------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index e29f505817960..2da58e4100885 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -299,12 +299,11 @@ class AutoScaler(LightningFlow): on current requests in the queue. Args: - min_replica: Number of works to start when app initializes. - max_replica: Max numbers of works to spawn to handle the incoming requests. + min_replicas: Number of works to start when app initializes. + max_replicas: Max numbers of works to spawn to handle the incoming requests. autoscale_interval: Number of seconds to wait before checking whether to upscale or downscale the works. max_batch_size: Number of requests to process at once. batch_timeout_secs: Number of seconds to wait before sending the requests to process. - device_type: GPU type to use for the works. downscale_threshold: Lower limit to determine when to stop works. upscale_threshold: Upper limit to determine when to spawn up a new work. worker_url: Default=api/predict. 
Provide the REST API path @@ -315,8 +314,8 @@ class AutoScaler(LightningFlow): def __init__( self, work_cls: type, - min_replica: int = 1, - max_replica: int = 4, + min_replicas: int = 1, + max_replicas: int = 4, autoscale_interval: int = 1 * 10, max_batch_size: int = 8, batch_timeout_secs: float = 2, @@ -327,17 +326,17 @@ def __init__( output_schema: Any = Dict, ) -> None: super().__init__() - self._worker_count = 0 + self._num_replicas = 0 self._work_registry = {} self._work_cls = work_cls self._input_schema = input_schema self._output_schema = output_schema - self._initial_num_workers = min_replica self.autoscale_interval = autoscale_interval - self.max_workers = max_replica - self.downscale_threshold = downscale_threshold or min_replica - self.upscale_threshold = upscale_threshold or min_replica * max_batch_size + self.max_replicas = max_replicas + self.min_replicas = min_replicas + self.downscale_threshold = downscale_threshold or min_replicas + self.upscale_threshold = upscale_threshold or min_replicas * max_batch_size self.fake_trigger = 0 self._last_autoscale = time.time() @@ -351,21 +350,21 @@ def __init__( cache_calls=True, parallel=True, ) - for _ in range(min_replica): + for _ in range(min_replicas): work = self.create_worker() self.add_work(work) logger.info( - f"LB initialized with min replica={min_replica}, " - f"max_replica={max_replica}, " + f"Initialized AutoScaler(replicas={min_replicas}, " + f"max_replicas={max_replicas}, " f"batch timeout={batch_timeout_secs}, " - f"batch size={max_batch_size}" + f"batch size={max_batch_size})" ) @property def workers(self) -> List[LightningWork]: works = [] - for i in range(self._worker_count): + for i in range(self._num_replicas): work = self.get_work(i) works.append(work) return works @@ -376,10 +375,10 @@ def create_worker(self, *args, **kwargs) -> LightningWork: def add_work(self, work) -> str: work_attribute = uuid.uuid4().hex - work_attribute = f"worker_{self._worker_count}_{str(work_attribute)}" + work_attribute = f"worker_{self._num_replicas}_{str(work_attribute)}" setattr(self, work_attribute, work) - self._work_registry[self._worker_count] = work_attribute - self._worker_count += 1 + self._work_registry[self._num_replicas] = work_attribute + self._num_replicas += 1 return work_attribute def remove_work(self, index: int) -> str: @@ -387,7 +386,7 @@ def remove_work(self, index: int) -> str: del self._work_registry[index] work = getattr(self, work_attribute) work.stop() - self._worker_count -= 1 + self._num_replicas -= 1 return work_attribute def get_work(self, index: int) -> LightningWork: @@ -406,32 +405,51 @@ def run(self): self.fake_trigger += 1 self.autoscale() + def scale(self, replicas: int, metrics) -> int: + """The default replication logic that users can override.""" + # FIXME: Don't hard code number + # if metrics["num_requests"] > 20: + # return replicas + 1 + + # if metrics["num_requests"] < 10: + # return replicas - 1 + + return replicas + 1 # FIXME + def autoscale(self): """Upscale and down scale model inference works based on the number of requests.""" if time.time() - self._last_autoscale < self.autoscale_interval: return + # ??? for what? self.load_balancer.update_servers(self.workers) + # ??? what's this? 
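# A hedged sketch, not taken from the patch, of how a user might override the
# `scale` hook introduced above. The flow clamps the returned value to the
# [min_replicas, max_replicas] range, so the override only expresses a tendency;
# the "num_requests" key matches the metrics dict built just below, while the
# thresholds here are illustrative assumptions.
from lightning_app.components import AutoScaler

class QueueAwareAutoScaler(AutoScaler):
    def scale(self, replicas: int, metrics) -> int:
        if metrics["num_requests"] > 8 * replicas:  # long queue: ask for one more work
            return replicas + 1
        if metrics["num_requests"] < replicas:  # near-empty queue: release one work
            return replicas - 1
        return replicas  # otherwise hold steady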
num_requests = int(requests.get(f"{self.load_balancer.url}/num-requests").json()) - num_workers = len(self.workers) + metrics = { + "num_requests": num_requests, + } + num_target_workers = max( + self.min_replicas, + min(self.max_replicas, self.scale(self._num_replicas, metrics)), + ) + + logger.info(f"Scaling from {self._num_replicas} to {num_target_workers}") # upscale - if num_requests > self.upscale_threshold and num_workers < self.max_workers: - idx = self._worker_count - logger.info(f"Upscale to {self._worker_count + 1}") + num_workers_to_add = num_target_workers - self._num_replicas + for _ in range(num_workers_to_add): + logger.info(f"Upscaling from {self._num_replicas} to {self._num_replicas + 1}") work = self.create_worker() new_work_id = self.add_work(work) - logger.info("new work id:", new_work_id) + logger.info(f"Work created: '{new_work_id}'") # downscale - elif num_requests < self.downscale_threshold and num_workers > self._initial_num_workers: - idx = self._worker_count - 1 - logger.info(f"Downscale to {idx}") - logger.info("prev num servers:", len(self.workers)) - removed_id = self.remove_work(idx) - logger.info("removed:", removed_id) - logger.info("new num servers:", len(self.workers)) + num_workers_to_remove = self._num_replicas - num_target_workers + for _ in range(num_workers_to_remove): + logger.info(f"Downscaling from {self._num_replicas} to {self._num_replicas - 1}") + removed_work_id = self.remove_work(self._num_replicas - 1) + logger.info(f"Work removed: '{removed_work_id}'") self.load_balancer.update_servers(self.workers) self._last_autoscale = time.time() From 4e779a19f70032b47ea33a1b2e69361f3def2837 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 21:13:21 +0900 Subject: [PATCH 026/110] update example --- examples/app_server_with_auto_scaler/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index be04d009d0660..9829da4af524e 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -63,9 +63,10 @@ def predict(self, requests: BatchRequestModel): app = L.LightningApp( AutoScaler( PyTorchServer, + max_replicas=3, + worker_url="predict", input_schema=RequestModel, output_schema=Any, batch_timeout_secs=0.1, - worker_url="predict", ) ) From 92737e42ef566a6118c30eaf20266e8d3a4a8ee8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 21:32:44 +0900 Subject: [PATCH 027/110] comment --- src/lightning_app/components/auto_scaler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 2da58e4100885..3464a0aa8b8d6 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -146,8 +146,7 @@ def __init__( self.worker_url = worker_url async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): - # unit method - server = next(self._ITER) + server = next(self._ITER) # round-robin request_data: List[LoadBalancer._input_schema] = [b[1] for b in batch] batch_request_data = _BatchRequestModel(inputs=request_data) @@ -284,7 +283,7 @@ def update_servers(self, server_works: List[LightningWork]): deleted_servers = old_servers - new_servers if deleted_servers: - logger.info(f"servers added: {deleted_servers}") + logger.info(f"servers deleted: {deleted_servers}") headers = { "accept": "application/json", From 
04c3a7249b3b0c35fee772600e3ce3c189980ea3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 22:21:10 +0900 Subject: [PATCH 028/110] update var name --- src/lightning_app/components/auto_scaler.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 3464a0aa8b8d6..67ff8eed75e4b 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -325,7 +325,7 @@ def __init__( output_schema: Any = Dict, ) -> None: super().__init__() - self._num_replicas = 0 + self.num_replicas = 0 self._work_registry = {} self._work_cls = work_cls @@ -363,7 +363,7 @@ def __init__( @property def workers(self) -> List[LightningWork]: works = [] - for i in range(self._num_replicas): + for i in range(self.num_replicas): work = self.get_work(i) works.append(work) return works @@ -374,10 +374,10 @@ def create_worker(self, *args, **kwargs) -> LightningWork: def add_work(self, work) -> str: work_attribute = uuid.uuid4().hex - work_attribute = f"worker_{self._num_replicas}_{str(work_attribute)}" + work_attribute = f"worker_{self.num_replicas}_{str(work_attribute)}" setattr(self, work_attribute, work) - self._work_registry[self._num_replicas] = work_attribute - self._num_replicas += 1 + self._work_registry[self.num_replicas] = work_attribute + self.num_replicas += 1 return work_attribute def remove_work(self, index: int) -> str: @@ -385,7 +385,7 @@ def remove_work(self, index: int) -> str: del self._work_registry[index] work = getattr(self, work_attribute) work.stop() - self._num_replicas -= 1 + self.num_replicas -= 1 return work_attribute def get_work(self, index: int) -> LightningWork: @@ -430,24 +430,24 @@ def autoscale(self): } num_target_workers = max( self.min_replicas, - min(self.max_replicas, self.scale(self._num_replicas, metrics)), + min(self.max_replicas, self.scale(self.num_replicas, metrics)), ) - logger.info(f"Scaling from {self._num_replicas} to {num_target_workers}") + logger.info(f"Scaling from {self.num_replicas} to {num_target_workers}") # upscale - num_workers_to_add = num_target_workers - self._num_replicas + num_workers_to_add = num_target_workers - self.num_replicas for _ in range(num_workers_to_add): - logger.info(f"Upscaling from {self._num_replicas} to {self._num_replicas + 1}") + logger.info(f"Upscaling from {self.num_replicas} to {self.num_replicas + 1}") work = self.create_worker() new_work_id = self.add_work(work) logger.info(f"Work created: '{new_work_id}'") # downscale - num_workers_to_remove = self._num_replicas - num_target_workers + num_workers_to_remove = self.num_replicas - num_target_workers for _ in range(num_workers_to_remove): - logger.info(f"Downscaling from {self._num_replicas} to {self._num_replicas - 1}") - removed_work_id = self.remove_work(self._num_replicas - 1) + logger.info(f"Downscaling from {self.num_replicas} to {self.num_replicas - 1}") + removed_work_id = self.remove_work(self.num_replicas - 1) logger.info(f"Work removed: '{removed_work_id}'") self.load_balancer.update_servers(self.workers) From 52d240e70fed1f60d92efed7477adcc32ff2b081 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 22:48:25 +0900 Subject: [PATCH 029/110] fix scale mechanism and clean up --- src/lightning_app/components/auto_scaler.py | 51 +++++++++++---------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py
b/src/lightning_app/components/auto_scaler.py index 67ff8eed75e4b..cdb45e22abeac 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -114,8 +114,13 @@ class LoadBalancer(LightningWork): After enabling you will require to send username and password from the request header for the private endpoints. Args: - max_batch_size: Number of requests processed at once. - batch_timeout_secs: Number of seconds to wait before sending the requests to process. + input_schema: Input schema. + output_schema: Output schema. + worker_url: The REST API path. + max_batch_size: The number of requests processed at once. + timeout_batch: The number of seconds to wait before sending the requests to process. + timeout_keep_alive: Close Keep-Alive connections if no new data is received within this timeout. + timeout_inference_request: The number of seconds to wait for inference. \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc. """ @@ -124,11 +129,11 @@ def __init__( input_schema, output_schema, worker_url: str, - max_batch_size=8, - batch_timeout_secs=10, - timeout_keep_alive=60, - timeout_inference_request=60, - **kwargs, + max_batch_size: int = 8, + timeout_batch: int = 10, + timeout_keep_alive: int = 60, + timeout_inference_request: int = 60, + **kwargs: Any, ) -> None: super().__init__(cloud_compute=CloudCompute("default"), **kwargs) self._input_schema = input_schema @@ -138,7 +143,7 @@ def __init__( self._timeout_inference_request = timeout_inference_request self.servers = [] self.max_batch_size = max_batch_size - self.batch_timeout_secs = batch_timeout_secs + self.timeout_batch = timeout_batch self._ITER = None self._batch = [] self._responses = {} # {request_id: response} @@ -182,7 +187,7 @@ async def consumer(self): batch = self._batch[: self.max_batch_size] while batch and ( - (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.batch_timeout_secs) + (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batch) ): has_sent = True @@ -302,7 +307,7 @@ class AutoScaler(LightningFlow): max_replicas: Max numbers of works to spawn to handle the incoming requests. autoscale_interval: Number of seconds to wait before checking whether to upscale or downscale the works. max_batch_size: Number of requests to process at once. - batch_timeout_secs: Number of seconds to wait before sending the requests to process. + timeout_batch: Number of seconds to wait before sending the requests to process. downscale_threshold: Lower limit to determine when to stop works. upscale_threshold: Upper limit to determine when to spawn up a new work. worker_url: Default=api/predict. 
Provide the REST API path @@ -317,7 +322,7 @@ def __init__( max_replicas: int = 4, autoscale_interval: int = 1 * 10, max_batch_size: int = 8, - batch_timeout_secs: float = 2, + timeout_batch: float = 2, downscale_threshold: Optional[int] = None, upscale_threshold: Optional[int] = None, worker_url: str = None, @@ -345,7 +350,7 @@ def __init__( output_schema=self._output_schema, worker_url=worker_url, max_batch_size=max_batch_size, - batch_timeout_secs=batch_timeout_secs, + timeout_batch=timeout_batch, cache_calls=True, parallel=True, ) @@ -356,7 +361,7 @@ def __init__( logger.info( f"Initialized AutoScaler(replicas={min_replicas}, " f"max_replicas={max_replicas}, " - f"batch timeout={batch_timeout_secs}, " + f"batch timeout={timeout_batch}, " f"batch size={max_batch_size})" ) @@ -406,35 +411,33 @@ def run(self): def scale(self, replicas: int, metrics) -> int: """The default replication logic that users can override.""" - # FIXME: Don't hard code number - # if metrics["num_requests"] > 20: - # return replicas + 1 - # if metrics["num_requests"] < 10: - # return replicas - 1 + # upscale + if metrics["pending_requests"] > self.upscale_threshold: + return replicas + 1 - return replicas + 1 # FIXME + # downscale + elif metrics["pending_requests"] < self.downscale_threshold: + return replicas - 1 + + return replicas def autoscale(self): """Upscale and down scale model inference works based on the number of requests.""" if time.time() - self._last_autoscale < self.autoscale_interval: return - # ??? for what? self.load_balancer.update_servers(self.workers) - # ??? what's this? num_requests = int(requests.get(f"{self.load_balancer.url}/num-requests").json()) metrics = { - "num_requests": num_requests, + "pending_requests": num_requests, } num_target_workers = max( self.min_replicas, min(self.max_replicas, self.scale(self.num_replicas, metrics)), ) - logger.info(f"Scaling from {self.num_replicas} to {num_target_workers}") # upscale num_workers_to_add = num_target_workers - self.num_replicas for _ in range(num_workers_to_add): From 6d18f24466b39cf7f98bc4cdd0e91db7b99adfc3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 22:49:40 +0900 Subject: [PATCH 030/110] Update example --- examples/app_server_with_auto_scaler/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 9829da4af524e..32c7d5acc5e19 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -63,10 +63,10 @@ def predict(self, requests: BatchRequestModel): app = L.LightningApp( AutoScaler( PyTorchServer, - max_replicas=3, + max_replicas=4, worker_url="predict", input_schema=RequestModel, output_schema=Any, - batch_timeout_secs=0.1, + timeout_batch=0.1, ) ) From 5c5197eef676c70949716d04e948403b9a7539fd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 23:02:48 +0900 Subject: [PATCH 031/110] ignore mypy --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6587db0a2c80e..f61f252a08130 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,7 @@ module = [ "lightning_app.components.serve.types.type", "lightning_app.components.serve.python_server", "lightning_app.components.training", + "lightning_app.components.auto_scaler", "lightning_app.core.api", "lightning_app.core.app", "lightning_app.core.flow", From 98d56ad6b5099010fa364080a982847412eb8397 Mon Sep 17 00:00:00 2001 From:
Akihiro Nitta Date: Fri, 25 Nov 2022 23:04:28 +0900 Subject: [PATCH 032/110] Add test file --- .../tests_app/components/test_auto_scaler.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/tests_app/components/test_auto_scaler.py diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py new file mode 100644 index 0000000000000..7e8403af40d91 --- /dev/null +++ b/tests/tests_app/components/test_auto_scaler.py @@ -0,0 +1,19 @@ +from lightning_app.components import AutoScaler + + +class SimpleAutoScaler(AutoScaler): + def scale(self, replicas: int, metrics) -> int: + return replicas + 1 + + +def target_fn(): + auto_scaler = SimpleAutoScaler() + auto_scaler.run() + + +def test_num_replicas_not_above_max_replicas(): + """Test self.num_replicas doesn't exceed max_replicas.""" + + +def test_num_replicas_not_below_min_replicas(): + """Test num_replicas doesn't go below min_replicas.""" From b34076e7a14fbccb351a1f56aa4827d012887810 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 25 Nov 2022 23:22:33 +0900 Subject: [PATCH 033/110] . --- src/lightning_app/components/database/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning_app/components/database/server.py b/src/lightning_app/components/database/server.py index 01bd8f3b12033..6d187e4cda133 100644 --- a/src/lightning_app/components/database/server.py +++ b/src/lightning_app/components/database/server.py @@ -19,6 +19,8 @@ if _is_sqlmodel_available(): from sqlmodel import SQLModel +else: + SQLModel = object # Required to avoid Uvicorn Server overriding Lightning App signal handlers. From 26ca77d8124708ee3feef8a5494fcb80d25ddb8e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 19:22:56 +0900 Subject: [PATCH 034/110] update impl and update tests --- src/lightning_app/components/auto_scaler.py | 42 ++++++++--- .../tests_app/components/test_auto_scaler.py | 74 +++++++++++++++++-- 2 files changed, 99 insertions(+), 17 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index cdb45e22abeac..69d7ed8b5901f 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -12,6 +12,7 @@ from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse +from lightning_utilities.core.overrides import is_overridden from pydantic import BaseModel from lightning_app.core.flow import LightningFlow @@ -312,12 +313,25 @@ class AutoScaler(LightningFlow): upscale_threshold: Upper limit to determine when to spawn up a new work. worker_url: Default=api/predict. Provide the REST API path input_schema: - output_schema + output_schema: + + .. doctest:: + + >>> from lightning_app.components import AutoScaler + >>> from lightning_app import LightningApp + >>> app = LightningApp( + ... AutoScaler( + ... MyPythonServer, # noqa: F821 + ... min_replicas=1, + ... max_replicas=4, + ... autoscale_interval=10, + ... ) + ... 
) """ def __init__( self, - work_cls: type, + work_cls: Optional[type] = None, min_replicas: int = 1, max_replicas: int = 4, autoscale_interval: int = 1 * 10, @@ -333,15 +347,18 @@ def __init__( self.num_replicas = 0 self._work_registry = {} + assert work_cls is not None or is_overridden("create_worker", self, AutoScaler) self._work_cls = work_cls self._input_schema = input_schema self._output_schema = output_schema self.autoscale_interval = autoscale_interval + + if max_replicas < min_replicas: + raise ValueError("max_replicas must be less than or equal to min_replicas.") self.max_replicas = max_replicas self.min_replicas = min_replicas self.downscale_threshold = downscale_threshold or min_replicas self.upscale_threshold = upscale_threshold or min_replicas * max_batch_size - self.fake_trigger = 0 self._last_autoscale = time.time() worker_url = worker_url or "api/predict" @@ -359,10 +376,11 @@ def __init__( self.add_work(work) logger.info( - f"Initialized AutoScaler(replicas={min_replicas}, " + f"Initialized AutoScaler(" + f"min_replicas={min_replicas}, " f"max_replicas={max_replicas}, " - f"batch timeout={timeout_batch}, " - f"batch size={max_batch_size})" + f"timeout_batch={timeout_batch}, " + f"max_batch_size={max_batch_size})" ) @property @@ -374,7 +392,7 @@ def workers(self) -> List[LightningWork]: return works def create_worker(self, *args, **kwargs) -> LightningWork: - """implement.""" + """Override this hook to customise the work creation process.""" return self._work_cls() def add_work(self, work) -> str: @@ -406,7 +424,6 @@ def run(self): worker.run() if self.load_balancer.url: - self.fake_trigger += 1 self.autoscale() def scale(self, replicas: int, metrics) -> int: @@ -422,6 +439,10 @@ def scale(self, replicas: int, metrics) -> int: return replicas + @property + def num_requests(self): + return int(requests.get(f"{self.load_balancer.url}/num-requests").json()) + def autoscale(self): """Upscale and down scale model inference works based on the number of requests.""" if time.time() - self._last_autoscale < self.autoscale_interval: @@ -429,10 +450,11 @@ def autoscale(self): self.load_balancer.update_servers(self.workers) - num_requests = int(requests.get(f"{self.load_balancer.url}/num-requests").json()) metrics = { - "pending_requests": num_requests, + "pending_requests": self.num_requests, } + + # ensure min_replicas <= num_replicas <= max_replicas num_target_workers = max( self.min_replicas, min(self.max_replicas, self.scale(self.num_replicas, metrics)), diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py index 7e8403af40d91..d5ba06eb2ea97 100644 --- a/tests/tests_app/components/test_auto_scaler.py +++ b/tests/tests_app/components/test_auto_scaler.py @@ -1,19 +1,79 @@ +import time +from unittest.mock import patch + +from lightning_app import LightningWork from lightning_app.components import AutoScaler -class SimpleAutoScaler(AutoScaler): +class EmptyWork(LightningWork): + def run(self): + pass + + +class AutoScaler1(AutoScaler): def scale(self, replicas: int, metrics) -> int: + # only upscale return replicas + 1 + @property + def num_requests(self): + """To avoid sending a request to '/num-requests'.""" + return 0 # whatever number + + +class AutoScaler2(AutoScaler): + def scale(self, replicas: int, metrics) -> int: + # only downscale + return replicas - 1 + + @property + def num_requests(self): + """To avoid sending a request to '/num-requests'.""" + return 0 # whatever number + + +def test_num_replicas_after_init(): + 
"""Test the number of works is the same as min_replicas after initialization.""" + min_replicas = 2 + auto_scaler = AutoScaler(EmptyWork, min_replicas=min_replicas) + assert auto_scaler.num_replicas == min_replicas + + +@patch("uvicorn.run") +@patch("lightning_app.components.auto_scaler.LoadBalancer.url") +def test_num_replicas_not_above_max_replicas(url, uvicorn_run): + """Test self.num_replicas doesn't exceed max_replicas.""" + min_replicas = 1 + max_replicas = 6 + auto_scaler = AutoScaler1( + EmptyWork, + min_replicas=min_replicas, + max_replicas=max_replicas, + autoscale_interval=0.001, + ) + + for _ in range(max_replicas + 1): + time.sleep(0.002) + auto_scaler.run() -def target_fn(): - auto_scaler = SimpleAutoScaler() - auto_scaler.run() + assert auto_scaler.num_replicas == max_replicas -def test_num_replicas_not_above_max_replicas(): +@patch("uvicorn.run") +@patch("lightning_app.components.auto_scaler.LoadBalancer.url") +def test_num_replicas_not_belo_min_replicas(url, uvicorn_run): """Test self.num_replicas doesn't exceed max_replicas.""" + min_replicas = 1 + max_replicas = 6 + auto_scaler = AutoScaler2( + EmptyWork, + min_replicas=min_replicas, + max_replicas=max_replicas, + autoscale_interval=0.001, + ) + for _ in range(3): + time.sleep(0.002) + auto_scaler.run() -def test_num_replicas_not_below_min_replicas(): - """Test num_replicas doesn't go below min_replicas.""" + assert auto_scaler.num_replicas == min_replicas From c2302546d741f424c247ed75580324eac66accef Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 19:31:58 +0900 Subject: [PATCH 035/110] Update changlog --- src/lightning_app/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 54b74d00a0404..476f9b51a0af9 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -15,6 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a CloudMultiProcessBackend which enables running a child App from within the Flow in the cloud ([#15800](https://github.com/Lightning-AI/lightning/pull/15800)) +- Added `AutoScaler` component ([#15769](https://github.com/Lightning-AI/lightning/pull/15769)) + ### Changed From 80e6b7dc3a374578f99b9fa1769a3544164980e5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 19:53:08 +0900 Subject: [PATCH 036/110] . 
--- src/lightning_app/components/auto_scaler.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 69d7ed8b5901f..6869226b634f0 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -39,7 +39,7 @@ def _raise_granular_exception(exception: Exception) -> None: raise HTTPException(500, "Worker Server error") if isinstance(exception, asyncio.TimeoutError): - raise TimeoutException() + raise HTTPException(408, "Request timed out") if isinstance(exception, Exception): if exception.args[0] == "Server disconnected": @@ -49,11 +49,6 @@ def _raise_granular_exception(exception: Exception) -> None: raise HTTPException(500, exception.args[0]) -class TimeoutException(HTTPException): - def __init__(self, status_code: int = 408, detail: str = "Request timed out.", *args: Any, **kwargs: Any) -> None: - super().__init__(status_code=status_code, detail=detail, *args, **kwargs) - - class _SysInfo(BaseModel): num_workers: int servers: List[str] @@ -105,15 +100,10 @@ async def num_requests() -> int: return fastapi_app -class LoadBalancer(LightningWork): +class _LoadBalancer(LightningWork): r"""The LoadBalancer is a LightningWork component that collects the requests and sends it to the prediction API asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests. - The LoadBalancer exposes system endpoints with a basic HTTP authentication, in order to activate the authentication - you need to provide a system password from environment variable - `lightning run app lb_flow.py --env MUSE_SYSTEM_PASSWORD=PASSWORD`. - After enabling you will require to send username and password from the request header for the private endpoints. - Args: input_schema: Input schema. output_schema: Output schema.
@@ -153,7 +143,7 @@ def __init__( async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): server = next(self._ITER) # round-robin - request_data: List[LoadBalancer._input_schema] = [b[1] for b in batch] + request_data: List[_LoadBalancer._input_schema] = [b[1] for b in batch] batch_request_data = _BatchRequestModel(inputs=request_data) try: @@ -169,7 +159,7 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): headers=headers, ) as response: if response.status == 408: - raise TimeoutException() + raise HTTPException(408, "Request timed out") response.raise_for_status() response = await response.json() outputs = response["outputs"] @@ -362,7 +352,7 @@ def __init__( self._last_autoscale = time.time() worker_url = worker_url or "api/predict" - self.load_balancer = LoadBalancer( + self.load_balancer = _LoadBalancer( input_schema=self._input_schema, output_schema=self._output_schema, worker_url=worker_url, From 2aeec1ca189a98f72f8a6b4ccb9de6750da39b8a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 19:55:29 +0900 Subject: [PATCH 037/110] revert docs --- docs/source-app/api_references.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 03396f11ba374..808d2cb0a365a 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -32,19 +32,11 @@ ___________________ :nosignatures: :template: classtemplate_no_index.rst - ~database.client.DatabaseClient - ~database.server.Database ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript ~training.LightningTrainerScript ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI - ~serve.python_server.PythonServer - ~serve.streamlit.ServeStreamlit - ~multi_node.base.MultiNode - ~multi_node.lite.LiteMultiNode - ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode - ~multi_node.trainer.LightningTrainerMultiNode ~auto_scaler.AutoScaler ---- From 8091ca9799cf19fc9aea65ababdd7984a3f43985 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 20:42:26 +0900 Subject: [PATCH 038/110] update test --- .../tests_app/components/test_auto_scaler.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py index d5ba06eb2ea97..d0ce1cf3208c0 100644 --- a/tests/tests_app/components/test_auto_scaler.py +++ b/tests/tests_app/components/test_auto_scaler.py @@ -15,22 +15,12 @@ def scale(self, replicas: int, metrics) -> int: # only upscale return replicas + 1 - @property - def num_requests(self): - """To avoid sending a request to '/num-requests'.""" - return 0 # whatever number - class AutoScaler2(AutoScaler): def scale(self, replicas: int, metrics) -> int: # only downscale return replicas - 1 - @property - def num_requests(self): - """To avoid sending a request to '/num-requests'.""" - return 0 # whatever number - def test_num_replicas_after_init(): """Test the number of works is the same as min_replicas after initialization.""" @@ -40,14 +30,14 @@ def test_num_replicas_after_init(): @patch("uvicorn.run") -@patch("lightning_app.components.auto_scaler.LoadBalancer.url") -def test_num_replicas_not_above_max_replicas(url, uvicorn_run): +@patch("lightning_app.components.auto_scaler._LoadBalancer.url") +@patch("lightning_app.components.auto_scaler.AutoScaler.num_requests") +def test_num_replicas_not_above_max_replicas(*_): """Test self.num_replicas doesn't exceed 
max_replicas.""" - min_replicas = 1 max_replicas = 6 auto_scaler = AutoScaler1( EmptyWork, - min_replicas=min_replicas, + min_replicas=1, max_replicas=max_replicas, autoscale_interval=0.001, ) @@ -60,15 +50,15 @@ def test_num_replicas_not_above_max_replicas(url, uvicorn_run): @patch("uvicorn.run") -@patch("lightning_app.components.auto_scaler.LoadBalancer.url") -def test_num_replicas_not_belo_min_replicas(url, uvicorn_run): +@patch("lightning_app.components.auto_scaler._LoadBalancer.url") +@patch("lightning_app.components.auto_scaler.AutoScaler.num_requests") +def test_num_replicas_not_belo_min_replicas(*_): """Test self.num_replicas doesn't exceed max_replicas.""" min_replicas = 1 - max_replicas = 6 auto_scaler = AutoScaler2( EmptyWork, min_replicas=min_replicas, - max_replicas=max_replicas, + max_replicas=4, autoscale_interval=0.001, ) From a2bfaede446b917319a5ee1afb2ef8702baf6a6e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 22:33:39 +0900 Subject: [PATCH 039/110] update state to keep calling 'flow.run()' Co-authored-by: Aniket Maurya --- examples/app_server_with_auto_scaler/app.py | 4 +++- src/lightning_app/components/auto_scaler.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 32c7d5acc5e19..d6a8f8e921ed6 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -63,10 +63,12 @@ def predict(self, requests: BatchRequestModel): app = L.LightningApp( AutoScaler( PyTorchServer, + min_replicas=2, max_replicas=4, worker_url="predict", input_schema=RequestModel, output_schema=Any, - timeout_batch=0.1, + timeout_batch=1, + autoscale_interval=10, ) ) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 6869226b634f0..2412d547f8acb 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -414,6 +414,7 @@ def run(self): worker.run() if self.load_balancer.url: + self.fake_trigger += 1 # change state to keep calling `run`. 
self.autoscale() From eb784feabe8ccc4f97573b5e82028e09a5371e08 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 23:20:57 +0900 Subject: [PATCH 040/110] Add aiohttp to base requirements --- requirements/app/base.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 83244527e285b..b5ca06aafb7f0 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -13,3 +13,4 @@ inquirer>=2.10.0 psutil<5.9.4 click<=8.1.3 lightning_api_access>=0.0.1 +aiohttp<=3.8.3 From 92089596797bac20fc2b8cd6876cf8536ff60e40 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 28 Nov 2022 23:27:06 +0900 Subject: [PATCH 041/110] Update docs Co-authored-by: Luca Antiga --- src/lightning_app/components/auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 2412d547f8acb..83ef94c9eb29b 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -101,7 +101,7 @@ async def num_requests() -> int: class _LoadBalancer(LightningWork): - r"""The LoadBalancer is a LightningWork component that collects the requests and sends it to the prediction API + r"""The LoadBalancer is a LightningWork component that collects the requests and sends them to the prediction API asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests. Args: @@ -109,7 +109,7 @@ class _LoadBalancer(LightningWork): output_schema: Output schema. worker_url: The REST API path. max_batch_size: The number of requests processed at once. - timeout_batch: The number of seconds to wait before sending the requests to process. + timeout_batch: The number of seconds to wait before sending the requests to process in order to allow for requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached. timeout_keep_alive: Close Keep-Alive connections if no new data is received within this timeout. timeout_inference_request: The number of seconds to wait for inference. \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
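The batching contract documented in the patch above reduces to a small dispatch rule: a pending batch is flushed either once it reaches `max_batch_size` or once it has waited `timeout_batch` seconds. A standalone sketch of that rule, with names and defaults assumed for illustration rather than taken from the implementation:

import time

def should_dispatch(batch: list, last_sent: float, max_batch_size: int = 8, timeout_batch: float = 2.0) -> bool:
    """Return True when a pending batch should be flushed to a server."""
    if not batch:
        return False
    if len(batch) >= max_batch_size:  # a full batch goes out immediately
        return True
    # a partial batch goes out once the wait window has elapsed
    return (time.time() - last_sent) > timeout_batch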
From 15dc3ae54238e073d40cb9c7be8410fcd534028f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 00:03:41 +0900 Subject: [PATCH 042/110] Use deserializer utility --- examples/app_server_with_auto_scaler/app.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index d6a8f8e921ed6..caf54326065cd 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -1,15 +1,13 @@ -import base64 -import io from typing import Any, List import torch import torchvision -from PIL import Image as PILImage from pydantic import BaseModel import lightning as L from lightning.app.components import AutoScaler from lightning.app.components.serve import PythonServer +from lightning.app.components.serve.types.image import Image from lightning.app.utilities.network import find_free_network_port @@ -49,8 +47,7 @@ def predict(self, requests: BatchRequestModel): ) images = [] for request in requests.inputs: - image = base64.b64decode(request.image.encode("utf-8")) - image = PILImage.open(io.BytesIO(image)) + image = Image.deserialize(request.image) image = transforms(image).unsqueeze(0) images.append(image) images = torch.cat(images) From 7ffd45ad1446fd8a7c3cfeba179bf4a40d39e469 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 00:05:01 +0900 Subject: [PATCH 043/110] fake trigger --- src/lightning_app/components/auto_scaler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 83ef94c9eb29b..bed1d909175bc 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -109,7 +109,8 @@ class _LoadBalancer(LightningWork): output_schema: Output schema. worker_url: The REST API path. max_batch_size: The number of requests processed at once. - timeout_batch: The number of seconds to wait before sending the requests to process in order to allow for requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached. + timeout_batch: The number of seconds to wait before sending the requests to process in order to allow for + requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached. timeout_keep_alive: Close Keep-Alive connections if no new data is received within this timeout. timeout_inference_request: The number of seconds to wait for inference. \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc. 
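# For context on what the `Image.deserialize` call in the example app above
# consumes, a hedged client-side sketch: the request body carries a
# base64-encoded image under the "image" key of `RequestModel`. The host, port
# and file name are placeholders, and the route assumes the example's
# `worker_url="predict"` setting.
import base64
import requests

with open("image.jpg", "rb") as f:  # placeholder input file
    payload = {"image": base64.b64encode(f.read()).decode("utf-8")}

response = requests.post("http://127.0.0.1:7501/predict", json=payload, timeout=60)
print(response.json())  # the example server replies with a {"prediction": <class index>} payload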
@@ -350,6 +351,7 @@ def __init__( self.downscale_threshold = downscale_threshold or min_replicas self.upscale_threshold = upscale_threshold or min_replicas * max_batch_size self._last_autoscale = time.time() + self.fake_trigger = 0 worker_url = worker_url or "api/predict" self.load_balancer = _LoadBalancer( From 10627e98ceebf1bac3f5fd9444aef28a7a907e64 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 07:59:45 +0900 Subject: [PATCH 044/110] wip: protect /system/* with basic auth --- src/lightning_app/components/auto_scaler.py | 70 +++++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index bed1d909175bc..291478e018b6e 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -1,7 +1,10 @@ import asyncio import logging +import os +import secrets import time import uuid +from base64 import b64encode from itertools import cycle from typing import Any, Dict, List, Optional, Tuple @@ -9,11 +12,13 @@ import aiohttp.client_exceptions import requests import uvicorn -from fastapi import FastAPI, HTTPException, Request +from fastapi import Depends, FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse +from fastapi.security import HTTPBasic, HTTPBasicCredentials from lightning_utilities.core.overrides import is_overridden from pydantic import BaseModel +from starlette.status import HTTP_401_UNAUTHORIZED from lightning_app.core.flow import LightningFlow from lightning_app.core.work import LightningWork @@ -23,6 +28,9 @@ logger = Logger(__name__) +BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") + + def _raise_granular_exception(exception: Exception) -> None: """Handle an exception from hitting the model servers.""" if not isinstance(exception, Exception): @@ -32,21 +40,21 @@ def _raise_granular_exception(exception: Exception) -> None: raise exception if isinstance(exception, aiohttp.client_exceptions.ServerDisconnectedError): - raise HTTPException(500, "Worker Server Disconnected") + raise HTTPException(500, "Worker Server Disconnected") from exception if isinstance(exception, aiohttp.client_exceptions.ClientError): logging.exception(exception) - raise HTTPException(500, "Worker Server error") + raise HTTPException(500, "Worker Server error") from exception if isinstance(exception, asyncio.TimeoutError): - raise HTTPException(408, "Request timed out") + raise HTTPException(408, "Request timed out") from exception if isinstance(exception, Exception): if exception.args[0] == "Server disconnected": - raise HTTPException(500, "Worker Server disconnected") + raise HTTPException(500, "Worker Server disconnected") from exception logging.exception(exception) - raise HTTPException(500, exception.args[0]) + raise HTTPException(500, exception.args[0]) from exception class _SysInfo(BaseModel): @@ -104,6 +112,13 @@ class _LoadBalancer(LightningWork): r"""The LoadBalancer is a LightningWork component that collects the requests and sends them to the prediction API asynchronously using RoundRobin scheduling. It also performs auto batching of the incoming requests.
+ The LoadBalancer exposes system endpoints with a basic HTTP authentication, in order to activate the authentication + you need to provide a system password from environment variable:: + + lightning run app app.py --env BASIC_AUTH_PASSWORD=PASSWORD + + After enabling it, you will need to send the username and password in the request header for the private endpoints. + Args: input_schema: Input schema. output_schema: Output schema. @@ -221,6 +236,7 @@ def run(self): self._last_batch_sent = time.time() fastapi_app = _create_fastapi("Load Balancer") + security = HTTPBasic() fastapi_app.global_request_count = 0 fastapi_app.num_current_requests = 0 fastapi_app.last_process_time = 0 @@ -236,8 +252,21 @@ def shutdown_event(): fastapi_app.SEND_TASK.cancel() self._server_ready = False + def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): + if len(BASIC_AUTH_PASSWORD) == 0: + logging.warning("You have not set password for private endpoints!") + current_password_bytes = credentials.password.encode("utf8") + is_correct_password = secrets.compare_digest(current_password_bytes, BASIC_AUTH_PASSWORD.encode("utf8")) + if not is_correct_password: + raise HTTPException( + status_code=401, + detail="Incorrect password", + headers={"WWW-Authenticate": "Basic"}, + ) + return True + @fastapi_app.get("/system/info", response_model=_SysInfo) - async def sys_info(): + async def sys_info(authenticated: bool = Depends(authenticate_private_endpoint)): return _SysInfo( num_workers=len(self.servers), servers=self.servers, @@ -247,7 +276,7 @@ async def sys_info(): ) @fastapi_app.put("/system/update-servers") - async def update_servers(servers: List[str]): + async def update_servers(servers: List[str], authenticated: bool = Depends(authenticate_private_endpoint)): self.servers = servers self._ITER = cycle(self.servers) @@ -270,11 +299,13 @@ def update_servers(self, server_works: List[LightningWork]): AutoScaler uses this method to increase/decrease the number of works.
""" old_servers = set(self.servers) - server_urls: List[str] = [server.url for server in server_works if server.url] - new_servers = set(server_urls) + self.servers: List[str] = [server.url for server in server_works if server.url] + new_servers = set(self.servers) + if new_servers == old_servers: logging.debug("no new server added") return + if new_servers - old_servers: logger.info(f"servers added: {new_servers - old_servers}") @@ -282,11 +313,26 @@ def update_servers(self, server_works: List[LightningWork]): if deleted_servers: logger.info(f"servers deleted: {deleted_servers}") + self.send_request_to_update_servers(self.servers) + + def send_request_to_update_servers(self, servers: List[str]): + AUTHORIZATION_TYPE = "Basic" + USERNAME = "lightning" + try: + param = f"{USERNAME}:{BASIC_AUTH_PASSWORD}".encode() + data = b64encode(param).decode("utf-8") + except (ValueError, UnicodeDecodeError) as e: + raise HTTPException( + status_code=HTTP_401_UNAUTHORIZED, + detail="Invalid authentication credentials", + headers={"WWW-Authenticate": "Basic"}, + ) from e headers = { "accept": "application/json", - "username": "lightning", + "username": USERNAME, + "Authorization": AUTHORIZATION_TYPE + " " + data, } - response = requests.put(f"{self.url}/system/update-servers", json=server_urls, headers=headers, timeout=10) + response = requests.put(f"{self.url}/system/update-servers", json=servers, headers=headers, timeout=10) response.raise_for_status() From f79d16b73cce532bb6e85a25ca43466780b915bd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 16:20:05 +0900 Subject: [PATCH 045/110] read password at runtime --- src/lightning_app/components/auto_scaler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 291478e018b6e..385db5d4e5c97 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -28,9 +28,6 @@ logger = Logger(__name__) -BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") - - def _raise_granular_exception(exception: Exception) -> None: """Handle an exception from hitting the model servers.""" if not isinstance(exception, Exception): @@ -253,6 +250,7 @@ def shutdown_event(): self._server_ready = False def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): + BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") if len(BASIC_AUTH_PASSWORD) == 0: logging.warning("You have not set password for private endpoints!") current_password_bytes = credentials.password.encode("utf8") @@ -318,6 +316,8 @@ def update_servers(self, server_works: List[LightningWork]): def send_request_to_update_servers(self, servers: List[str]): AUTHORIZATION_TYPE = "Basic" USERNAME = "lightning" + BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") + try: param = f"{USERNAME}:{BASIC_AUTH_PASSWORD}".encode() data = b64encode(param).decode("utf-8") From ae7f300ee0711b59b440648940068ed608483c0f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 16:36:32 +0900 Subject: [PATCH 046/110] Change env var name --- src/lightning_app/components/auto_scaler.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 385db5d4e5c97..e47b31a700637 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ 
-112,7 +112,7 @@ class _LoadBalancer(LightningWork): The LoadBalancer exposes system endpoints with a basic HTTP authentication, in order to activate the authentication you need to provide a system password from environment variable:: - lightning run app app.py --env BASIC_AUTH_PASSWORD=PASSWORD + lightning run app app.py --env AUTO_SCALER_AUTH_PASSWORD=PASSWORD After enabling it, you will need to send the username and password in the request header for the private endpoints. @@ -250,11 +250,13 @@ def shutdown_event(): self._server_ready = False def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): - BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") - if len(BASIC_AUTH_PASSWORD) == 0: + AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") + if len(AUTO_SCALER_AUTH_PASSWORD) == 0: logging.warning("You have not set password for private endpoints!") current_password_bytes = credentials.password.encode("utf8") - is_correct_password = secrets.compare_digest(current_password_bytes, BASIC_AUTH_PASSWORD.encode("utf8")) + is_correct_password = secrets.compare_digest( + current_password_bytes, AUTO_SCALER_AUTH_PASSWORD.encode("utf8") + ) if not is_correct_password: raise HTTPException( status_code=401, @@ -316,10 +318,10 @@ def update_servers(self, server_works: List[LightningWork]): def send_request_to_update_servers(self, servers: List[str]): AUTHORIZATION_TYPE = "Basic" USERNAME = "lightning" - BASIC_AUTH_PASSWORD = os.environ.get("BASIC_AUTH_PASSWORD", "") + AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") try: - param = f"{USERNAME}:{BASIC_AUTH_PASSWORD}".encode() + param = f"{USERNAME}:{AUTO_SCALER_AUTH_PASSWORD}".encode() data = b64encode(param).decode("utf-8") except (ValueError, UnicodeDecodeError) as e: raise HTTPException( From 8ea25d1875f6753652aebf1674be1c3e1b4cd6e0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 16:45:47 +0900 Subject: [PATCH 047/110] import torch as optional --- src/lightning_app/components/serve/python_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index 9ce1b23701059..a594967ddb61c 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional -import torch import uvicorn from fastapi import FastAPI from pydantic import BaseModel @@ -202,6 +201,8 @@ def _attach_predict_fn(self, fastapi_app: FastAPI) -> None: input_type: type = self.configure_input_type() output_type: type = self.configure_output_type() + import torch + def predict_fn(request: input_type): # type: ignore with torch.inference_mode(): return self.predict(request) From 9e66136ef1c611177416d48041f0f90a0afa4586 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 19:02:23 +0900 Subject: [PATCH 048/110] Don't overcreate works --- src/lightning_app/components/auto_scaler.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index e47b31a700637..4559d9966883a 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -471,28 +471,33 @@ def scale(self, replicas: int, metrics) -> int: """The default replication logic that users can
override.""" # upscale - if metrics["pending_requests"] > self.upscale_threshold: + if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: return replicas + 1 # downscale - elif metrics["pending_requests"] < self.downscale_threshold: + if metrics["pending_requests"] < self.downscale_threshold: return replicas - 1 return replicas @property - def num_requests(self): + def num_pending_requests(self) -> int: return int(requests.get(f"{self.load_balancer.url}/num-requests").json()) - def autoscale(self): - """Upscale and down scale model inference works based on the number of requests.""" + @property + def num_pending_works(self) -> int: + return sum(1 for work in self.workers if work.url) + + def autoscale(self) -> None: + """Upscale and down scale model inference works.""" if time.time() - self._last_autoscale < self.autoscale_interval: return self.load_balancer.update_servers(self.workers) metrics = { - "pending_requests": self.num_requests, + "pending_requests": self.num_pending_requests, + "pending_works": self.num_pending_works, } # ensure min_replicas <= num_replicas <= max_replicas From 94600d7bfbf6f6676b108ff92df5af8328cc7a64 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 21:08:18 +0900 Subject: [PATCH 049/110] simplify imports --- examples/app_server_with_auto_scaler/app.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index caf54326065cd..02d00e6c02474 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -5,10 +5,6 @@ from pydantic import BaseModel import lightning as L -from lightning.app.components import AutoScaler -from lightning.app.components.serve import PythonServer -from lightning.app.components.serve.types.image import Image -from lightning.app.utilities.network import find_free_network_port class RequestModel(BaseModel): @@ -23,10 +19,10 @@ class BatchResponse(BaseModel): outputs: List[Any] -class PyTorchServer(PythonServer): +class PyTorchServer(L.app.components.PythonServer): def __init__(self, *args, **kwargs): super().__init__( - port=find_free_network_port(), + port=L.app.utilities.network.find_free_network_port(), input_type=BatchRequestModel, output_type=BatchResponse, cloud_compute=L.CloudCompute("gpu"), @@ -47,7 +43,7 @@ def predict(self, requests: BatchRequestModel): ) images = [] for request in requests.inputs: - image = Image.deserialize(request.image) + image = L.app.components.Image.deserialize(request.image) image = transforms(image).unsqueeze(0) images.append(image) images = torch.cat(images) @@ -58,7 +54,7 @@ def predict(self, requests: BatchRequestModel): app = L.LightningApp( - AutoScaler( + L.app.components.AutoScaler( PyTorchServer, min_replicas=2, max_replicas=4, From 15dca215c205627511165590870110d891917255 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 21:17:26 +0900 Subject: [PATCH 050/110] Update example --- examples/app_server_with_auto_scaler/app.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 02d00e6c02474..abe715d5f9c67 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -1,3 +1,4 @@ +import warnings # FIXME: remove before merge from typing import Any, List import torch @@ -6,6 +7,9 @@ import lightning 
as L +warnings.filterwarnings("ignore", "Arguments other than a weight enum", UserWarning) # FIXME: remove before merge +warnings.filterwarnings("ignore", "The parameter 'pretrained' is deprecated", UserWarning) # FIXME: remove before merge + class RequestModel(BaseModel): image: str @@ -53,8 +57,23 @@ def predict(self, requests: BatchRequestModel): return BatchResponse(outputs=[{"prediction": e} for e in results]) +class MyAutoScaler(L.app.components.AutoScaler): + def scale(self, replicas, metrics): + """The default replication logic that users can override.""" + + # upscale + if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: + return replicas + 1 + + # downscale + if metrics["pending_requests"] < self.downscale_threshold: + return replicas - 1 + + return replicas + + app = L.LightningApp( - L.app.components.AutoScaler( + MyAutoScaler( PyTorchServer, min_replicas=2, max_replicas=4, From 8d6562884e18dc940aefff27506adc5a2ddabd6a Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 29 Nov 2022 14:01:25 +0100 Subject: [PATCH 051/110] aiohttp --- requirements/app/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 96586ad9fd8a9..ac5197b9a6e3d 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -13,4 +13,4 @@ inquirer>=2.10.0 psutil<5.9.4 click<=8.1.3 lightning_api_access>=0.0.3 -aiohttp<=3.8.3 +aiohttp>=3.8.0, <=3.8.3 From fe5c0f483bcdb21bcd3e0a3b9513a3801fb1ce2e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 22:10:31 +0900 Subject: [PATCH 052/110] Add work_args work_kwargs --- src/lightning_app/components/auto_scaler.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 4559d9966883a..f616fceaa7ae5 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -6,7 +6,7 @@ import uuid from base64 import b64encode from itertools import cycle -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Type import aiohttp import aiohttp.client_exceptions @@ -16,7 +16,6 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import RedirectResponse from fastapi.security import HTTPBasic, HTTPBasicCredentials -from lightning_utilities.core.overrides import is_overridden from pydantic import BaseModel from starlette.status import HTTP_401_UNAUTHORIZED @@ -370,7 +369,7 @@ class AutoScaler(LightningFlow): def __init__( self, - work_cls: Optional[type] = None, + work_cls: Type[LightningWork], min_replicas: int = 1, max_replicas: int = 4, autoscale_interval: int = 1 * 10, @@ -381,13 +380,17 @@ def __init__( worker_url: str = None, input_schema: Any = Dict, output_schema: Any = Dict, + *work_args: Any, + **work_kwargs: Any, ) -> None: super().__init__() self.num_replicas = 0 self._work_registry = {} - assert work_cls is not None or is_overridden("create_worker", self, AutoScaler) self._work_cls = work_cls + self._work_args = work_args + self._work_kwargs = work_kwargs + self._input_schema = input_schema self._output_schema = output_schema self.autoscale_interval = autoscale_interval @@ -431,9 +434,9 @@ def workers(self) -> List[LightningWork]: works.append(work) return works - def create_worker(self, *args, **kwargs) -> LightningWork: - """Override this hook to 
customise the work creation process.""" - return self._work_cls() + def create_worker(self) -> LightningWork: + """Replicates a LightningWork instance with args and kwargs provided via ``__init__``.""" + return self._work_cls(*self._work_args, **self._work_kwargs) def add_work(self, work) -> str: work_attribute = uuid.uuid4().hex From 77faca5c3048496cb5da6a2ef7bc8c4bf42c208f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 22:48:26 +0900 Subject: [PATCH 053/110] More docs --- examples/app_server_with_auto_scaler/app.py | 6 +- src/lightning_app/components/auto_scaler.py | 83 ++++++++++++++------- 2 files changed, 61 insertions(+), 28 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index abe715d5f9c67..40674b1764545 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -47,7 +47,7 @@ def predict(self, requests: BatchRequestModel): ) images = [] for request in requests.inputs: - image = L.app.components.Image.deserialize(request.image) + image = L.app.components.serve.types.image.Image.deserialize(request.image) image = transforms(image).unsqueeze(0) images.append(image) images = torch.cat(images) @@ -77,10 +77,10 @@ def scale(self, replicas, metrics): PyTorchServer, min_replicas=2, max_replicas=4, + autoscale_interval=10, worker_url="predict", input_schema=RequestModel, output_schema=Any, - timeout_batch=1, - autoscale_interval=10, + timeout_batching=1, ) ) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index f616fceaa7ae5..9bf7765e156a6 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -120,7 +120,7 @@ class _LoadBalancer(LightningWork): output_schema: Output schema. worker_url: The REST API path. max_batch_size: The number of requests processed at once. - timeout_batch: The number of seconds to wait before sending the requests to process in order to allow for + timeout_batching: The number of seconds to wait before sending the requests to process in order to allow for requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached. timeout_keep_alive: Close Keep-Alive connections if no new data is received within this timeout. timeout_inference_request: The number of seconds to wait for inference. @@ -133,7 +133,7 @@ def __init__( output_schema, worker_url: str, max_batch_size: int = 8, - timeout_batch: int = 10, + timeout_batching: int = 10, timeout_keep_alive: int = 60, timeout_inference_request: int = 60, **kwargs: Any, @@ -146,7 +146,7 @@ def __init__( self._timeout_inference_request = timeout_inference_request self.servers = [] self.max_batch_size = max_batch_size - self.timeout_batch = timeout_batch + self.timeout_batching = timeout_batching self._ITER = None self._batch = [] self._responses = {} # {request_id: response} @@ -190,7 +190,7 @@ async def consumer(self): batch = self._batch[: self.max_batch_size] while batch and ( - (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batch) + (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batching) ): has_sent = True @@ -342,29 +342,53 @@ class AutoScaler(LightningFlow): on current requests in the queue. Args: - min_replicas: Number of works to start when app initializes. - max_replicas: Max numbers of works to spawn to handle the incoming requests. 
- autoscale_interval: Number of seconds to wait before checking whether to upscale or downscale the works. - max_batch_size: Number of requests to process at once. - timeout_batch: Number of seconds to wait before sending the requests to process. + min_replicas: The number of works to start when app initializes. + max_replicas: The max number of works to spawn to handle the incoming requests. + autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. downscale_threshold: Lower limit to determine when to stop works. upscale_threshold: Upper limit to determine when to spawn up a new work. worker_url: Default=api/predict. Provide the REST API path + max_batch_size: (auto-batching) The number of requests to process at once. + timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. input_schema: output_schema: - .. doctest:: - - >>> from lightning_app.components import AutoScaler - >>> from lightning_app import LightningApp - >>> app = LightningApp( - ... AutoScaler( - ... MyPythonServer, # noqa: F821 - ... min_replicas=1, - ... max_replicas=4, - ... autoscale_interval=10, - ... ) - ... ) + Example: + import lightning as L + + # Example 1: Auto-scaling serving out-of-the-box + app = L.LightningApp( + L.app.components.AutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + ) + ) + + # Example 2: Customizing the scaling logic + class MyAutoScaler(L.app.components.AutoScaler): + def scale(self, replicas: int, metrics: dict) -> int: + # upscale + if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: + return replicas + 1 + + # downscale + if metrics["pending_requests"] < self.downscale_threshold: + return replicas - 1 + + return replicas + + app = L.LightningApp( + MyAutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + max_batch_size=8, # for auto batching + timeout_batching + ) + ) """ def __init__( @@ -374,7 +398,7 @@ def __init__( max_replicas: int = 4, autoscale_interval: int = 1 * 10, max_batch_size: int = 8, - timeout_batch: float = 2, + timeout_batching: float = 2, downscale_threshold: Optional[int] = None, upscale_threshold: Optional[int] = None, worker_url: str = None, @@ -410,7 +434,7 @@ def __init__( output_schema=self._output_schema, worker_url=worker_url, max_batch_size=max_batch_size, - timeout_batch=timeout_batch, + timeout_batching=timeout_batching, cache_calls=True, parallel=True, ) @@ -422,7 +446,7 @@ def __init__( f"Initialized AutoScaler(" f"min_replicas={min_replicas}, " f"max_replicas={max_replicas}, " - f"timeout_batch={timeout_batch}, " + f"timeout_batching={timeout_batching}, " f"max_batch_size={max_batch_size})" ) @@ -439,6 +463,11 @@ def create_worker(self) -> LightningWork: return self._work_cls(*self._work_args, **self._work_kwargs) def add_work(self, work) -> str: + """Adds a new LightningWork instance. + + Returns: + The name of the new work attribute. 
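+
+        Example (a minimal sketch; ``MyWork`` stands in for any
+        ``LightningWork`` subclass and is not part of this API)::
+
+            scaler = AutoScaler(MyWork, min_replicas=1)
+            attr_name = scaler.add_work(MyWork())  # e.g. "worker_1_<uuid hex>"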
+ """ work_attribute = uuid.uuid4().hex work_attribute = f"worker_{self.num_replicas}_{str(work_attribute)}" setattr(self, work_attribute, work) @@ -447,6 +476,7 @@ def add_work(self, work) -> str: return work_attribute def remove_work(self, index: int) -> str: + """Removes the ``index``th LightningWork instance.""" work_attribute = self._work_registry[index] del self._work_registry[index] work = getattr(self, work_attribute) @@ -455,6 +485,7 @@ def remove_work(self, index: int) -> str: return work_attribute def get_work(self, index: int) -> LightningWork: + """Returns the ``index``th LightningWork instance.""" work_attribute = self._work_registry[index] work = getattr(self, work_attribute) return work @@ -467,7 +498,7 @@ def run(self): worker.run() if self.load_balancer.url: - self.fake_trigger += 1 # change state to keep calling `run`. + self.fake_trigger += 1 # Note: change state to keep calling `run`. self.autoscale() def scale(self, replicas: int, metrics) -> int: @@ -485,10 +516,12 @@ def scale(self, replicas: int, metrics) -> int: @property def num_pending_requests(self) -> int: + """Fetches the number of pending requests via load balancer.""" return int(requests.get(f"{self.load_balancer.url}/num-requests").json()) @property def num_pending_works(self) -> int: + """The number of unready works.""" return sum(1 for work in self.workers if work.url) def autoscale(self) -> None: From 39de0babb27cd14ea74a9fc946c535e6ef185f52 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 22:50:42 +0900 Subject: [PATCH 054/110] remove FIXME --- examples/app_server_with_auto_scaler/app.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 40674b1764545..37e166844d226 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -1,4 +1,3 @@ -import warnings # FIXME: remove before merge from typing import Any, List import torch @@ -7,9 +6,6 @@ import lightning as L -warnings.filterwarnings("ignore", "Arguments other than a weight enum", UserWarning) # FIXME: remove before merge -warnings.filterwarnings("ignore", "The parameter 'pretrained' is deprecated", UserWarning) # FIXME: remove before merge - class RequestModel(BaseModel): image: str From a49766d3338c79d8d6ef9db052d8ddae7f625415 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:00:54 +0900 Subject: [PATCH 055/110] Apply Jirka's suggestions Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- examples/app_server_with_auto_scaler/app.py | 2 +- src/lightning_app/components/auto_scaler.py | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 37e166844d226..9c92f5e4cc689 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -50,7 +50,7 @@ def predict(self, requests: BatchRequestModel): images = images.to(self._device) predictions = self._model(images) results = predictions.argmax(1).cpu().numpy().tolist() - return BatchResponse(outputs=[{"prediction": e} for e in results]) + return BatchResponse(outputs=[{"prediction": pred} for pred in results]) class MyAutoScaler(L.app.components.AutoScaler): diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 9bf7765e156a6..3c5ae8b26c0dd 100644 --- 
a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -57,7 +57,7 @@ class _SysInfo(BaseModel):
     num_workers: int
     servers: List[str]
     num_requests: int
-    process_time: int
+    processing_time: int
     global_request_count: int
 
 
@@ -178,8 +178,8 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]):
                     assert len(batch) == len(outputs), f"result has {len(outputs)} items but batch is {len(batch)}"
                     result = {request[0]: r for request, r in zip(batch, outputs)}
                     self._responses.update(result)
-        except Exception as e:
-            result = {request[0]: e for request in batch}
+        except Exception as ex:
+            result = {request[0]: ex for request in batch}
             self._responses.update(result)
 
     async def consumer(self):
@@ -420,7 +420,7 @@ def __init__(
         self.autoscale_interval = autoscale_interval
 
         if max_replicas < min_replicas:
-            raise ValueError("max_replicas must be less than or equal to min_replicas.")
+            raise ValueError(f"`max_replicas={max_replicas}` must be greater than or equal to `min_replicas={min_replicas}`.")
         self.max_replicas = max_replicas
         self.min_replicas = min_replicas
         self.downscale_threshold = downscale_threshold or min_replicas
@@ -452,11 +452,7 @@ def __init__(
 
     @property
     def workers(self) -> List[LightningWork]:
-        works = []
-        for i in range(self.num_replicas):
-            work = self.get_work(i)
-            works.append(work)
-        return works
+        return [self.get_work(i) for i in range(self.num_replicas)]
 
     def create_worker(self) -> LightningWork:
         """Replicates a LightningWork instance with args and kwargs provided via ``__init__``."""

From 6861444a43114ba04dd7477a3c157144d490987b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 29 Nov 2022 14:04:22 +0000
Subject: [PATCH 056/110] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lightning_app/components/auto_scaler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 3c5ae8b26c0dd..cadbf0532b51b 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/auto_scaler.py
@@ -420,7 +420,9 @@ def __init__(
         self.autoscale_interval = autoscale_interval
 
         if max_replicas < min_replicas:
-            raise ValueError(f"`max_replicas={max_replicas}` must be greater than or equal to `min_replicas={min_replicas}`.")
+            raise ValueError(
+                f"`max_replicas={max_replicas}` must be greater than or equal to `min_replicas={min_replicas}`."
+ ) self.max_replicas = max_replicas self.min_replicas = min_replicas self.downscale_threshold = downscale_threshold or min_replicas From 2b95111fff69c6143c14521b2aa9daa085bd490f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:04:28 +0900 Subject: [PATCH 057/110] clean example device --- examples/app_server_with_auto_scaler/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 9c92f5e4cc689..7bd71061eac94 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -29,9 +29,8 @@ def __init__(self, *args, **kwargs): ) def setup(self): - self._model = torchvision.models.resnet18(pretrained=True) self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._model.to(self._device) + self._model = torchvision.models.resnet18(pretrained=True).to(self._device) def predict(self, requests: BatchRequestModel): transforms = torchvision.transforms.Compose( From 5af2569ce6fdd0fec97a9e3875009e9a746b06f9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:19:19 +0900 Subject: [PATCH 058/110] add comment on init threshold value --- examples/app_server_with_auto_scaler/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 7bd71061eac94..a3e2c9182ecfc 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -56,7 +56,7 @@ class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas, metrics): """The default replication logic that users can override.""" - # upscale + # upscale (default: upscale_threshold = min_replicas * max_batch_size) if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: return replicas + 1 From 69ec4c3266f9864c2082532aa1e794c4d7fca48c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:19:39 +0900 Subject: [PATCH 059/110] bad merge --- src/lightning_app/components/serve/python_server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index ae59695971372..7f7a8eeea98f4 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -210,8 +210,6 @@ def _attach_predict_fn(self, fastapi_app: FastAPI) -> None: input_type: type = self.configure_input_type() output_type: type = self.configure_output_type() - import torch - def predict_fn(request: input_type): # type: ignore with inference_mode(): return self.predict(request) From 61433b3449c152bd849255cd9cf51bd0d64f1109 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:38:49 +0900 Subject: [PATCH 060/110] nit: logging format --- src/lightning_app/components/auto_scaler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index cadbf0532b51b..77c2cbbbeec7e 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -446,10 +446,10 @@ def __init__( logger.info( f"Initialized AutoScaler(" - f"min_replicas={min_replicas}, " - f"max_replicas={max_replicas}, " - f"timeout_batching={timeout_batching}, " - f"max_batch_size={max_batch_size})" + 
f"min_replicas={min_replicas}," + f" max_replicas={max_replicas}," + f" timeout_batching={timeout_batching}," + f" max_batch_size={max_batch_size})" ) @property From f9debb66d06c5e30aecbe0a0433b9db031903a73 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:46:18 +0900 Subject: [PATCH 061/110] {in,out}put_schema -> {in,out}put_type --- src/lightning_app/components/auto_scaler.py | 39 ++++++++++----------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 77c2cbbbeec7e..caa0f9bee5fe2 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -116,8 +116,8 @@ class _LoadBalancer(LightningWork): After enabling you will require to send username and password from the request header for the private endpoints. Args: - input_schema: Input schema. - output_schema: Output schema. + input_type: Input type. + output_type: Output type. worker_url: The REST API path. max_batch_size: The number of requests processed at once. timeout_batching: The number of seconds to wait before sending the requests to process in order to allow for @@ -129,8 +129,8 @@ class _LoadBalancer(LightningWork): def __init__( self, - input_schema, - output_schema, + input_type: type, + output_type: type, worker_url: str, max_batch_size: int = 8, timeout_batching: int = 10, @@ -139,8 +139,8 @@ def __init__( **kwargs: Any, ) -> None: super().__init__(cloud_compute=CloudCompute("default"), **kwargs) - self._input_schema = input_schema - self._output_schema = output_schema + self._input_type = input_type + self._output_type = output_type self._server_ready = False self._timeout_keep_alive = timeout_keep_alive self._timeout_inference_request = timeout_inference_request @@ -155,7 +155,7 @@ def __init__( async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): server = next(self._ITER) # round-robin - request_data: List[_LoadBalancer._input_schema] = [b[1] for b in batch] + request_data: List[_LoadBalancer._input_type] = [b[1] for b in batch] batch_request_data = _BatchRequestModel(inputs=request_data) try: @@ -223,9 +223,6 @@ def run(self): if self._server_ready: return - INPUT_SCHEMA = self._input_schema - OUTPUT_SCHEMA = self._output_schema - logger.info(f"servers: {self.servers}") self._ITER = cycle(self.servers) @@ -279,8 +276,8 @@ async def update_servers(servers: List[str], authenticated: bool = Depends(authe self.servers = servers self._ITER = cycle(self.servers) - @fastapi_app.post("/api/predict", response_model=OUTPUT_SCHEMA) - async def balance_api(inputs: INPUT_SCHEMA): + @fastapi_app.post("/api/predict", response_model=self._output_type) + async def balance_api(inputs: self._input_type): return await self.process_request(inputs) uvicorn.run( @@ -350,13 +347,13 @@ class AutoScaler(LightningFlow): worker_url: Default=api/predict. Provide the REST API path max_batch_size: (auto-batching) The number of requests to process at once. timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. - input_schema: - output_schema: + input_type: Input type. + output_type: Output type. 
Example: import lightning as L - # Example 1: Auto-scaling serving out-of-the-box + # Example 1: Auto-scaling serve component out-of-the-box app = L.LightningApp( L.app.components.AutoScaler( MyPythonServer, @@ -402,8 +399,8 @@ def __init__( downscale_threshold: Optional[int] = None, upscale_threshold: Optional[int] = None, worker_url: str = None, - input_schema: Any = Dict, - output_schema: Any = Dict, + input_type: type = Dict, + output_type: type = Dict, *work_args: Any, **work_kwargs: Any, ) -> None: @@ -415,8 +412,8 @@ def __init__( self._work_args = work_args self._work_kwargs = work_kwargs - self._input_schema = input_schema - self._output_schema = output_schema + self._input_type = input_type + self._output_type = output_type self.autoscale_interval = autoscale_interval if max_replicas < min_replicas: @@ -432,8 +429,8 @@ def __init__( worker_url = worker_url or "api/predict" self.load_balancer = _LoadBalancer( - input_schema=self._input_schema, - output_schema=self._output_schema, + input_type=self._input_type, + output_type=self._output_type, worker_url=worker_url, max_batch_size=max_batch_size, timeout_batching=timeout_batching, From e534e09fff284350c0b47c7a24d0e6129cc88e4f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:48:34 +0900 Subject: [PATCH 062/110] lowercase --- src/lightning_app/components/auto_scaler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index caa0f9bee5fe2..738e077b4fa02 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -147,14 +147,14 @@ def __init__( self.servers = [] self.max_batch_size = max_batch_size self.timeout_batching = timeout_batching - self._ITER = None + self._iter = None self._batch = [] self._responses = {} # {request_id: response} self._last_batch_sent = 0 self.worker_url = worker_url async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): - server = next(self._ITER) # round-robin + server = next(self._iter) # round-robin request_data: List[_LoadBalancer._input_type] = [b[1] for b in batch] batch_request_data = _BatchRequestModel(inputs=request_data) @@ -225,7 +225,7 @@ def run(self): logger.info(f"servers: {self.servers}") - self._ITER = cycle(self.servers) + self._iter = cycle(self.servers) self._last_batch_sent = time.time() fastapi_app = _create_fastapi("Load Balancer") @@ -274,7 +274,7 @@ async def sys_info(authenticated: bool = Depends(authenticate_private_endpoint)) @fastapi_app.put("/system/update-servers") async def update_servers(servers: List[str], authenticated: bool = Depends(authenticate_private_endpoint)): self.servers = servers - self._ITER = cycle(self.servers) + self._iter = cycle(self.servers) @fastapi_app.post("/api/predict", response_model=self._output_type) async def balance_api(inputs: self._input_type): From 87e2882ba4210828242f4005eec20da99ac27019 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:51:59 +0900 Subject: [PATCH 063/110] docs on seconds --- src/lightning_app/components/auto_scaler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 738e077b4fa02..795d4fd2072a9 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -122,7 +122,7 @@ class _LoadBalancer(LightningWork): max_batch_size: The number of 
requests processed at once. timeout_batching: The number of seconds to wait before sending the requests to process in order to allow for requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached. - timeout_keep_alive: Close Keep-Alive connections if no new data is received within this timeout. + timeout_keep_alive: The number of seconds until it closes Keep-Alive connections if no new data is received. timeout_inference_request: The number of seconds to wait for inference. \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc. """ @@ -133,6 +133,7 @@ def __init__( output_type: type, worker_url: str, max_batch_size: int = 8, + # all timeout args are in seconds timeout_batching: int = 10, timeout_keep_alive: int = 60, timeout_inference_request: int = 60, @@ -383,7 +384,7 @@ def scale(self, replicas: int, metrics: dict) -> int: max_replicas=8, autoscale_interval=10, max_batch_size=8, # for auto batching - timeout_batching + timeout_batching=2, # for auto batching ) ) """ From 7f2731d678170586357429f024af32dc2233ae65 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Nov 2022 23:53:13 +0900 Subject: [PATCH 064/110] process_time -> processing_time --- src/lightning_app/components/auto_scaler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 795d4fd2072a9..a3aadb3fe26d2 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -78,7 +78,7 @@ def _create_fastapi(title: str) -> FastAPI: fastapi_app.global_request_count = 0 fastapi_app.num_current_requests = 0 - fastapi_app.last_process_time = 0 + fastapi_app.last_processing_time = 0 @fastapi_app.middleware("http") async def current_request_counter(request: Request, call_next): @@ -88,8 +88,8 @@ async def current_request_counter(request: Request, call_next): fastapi_app.num_current_requests += 1 start_time = time.time() response = await call_next(request) - process_time = time.time() - start_time - fastapi_app.last_process_time = process_time + processing_time = time.time() - start_time + fastapi_app.last_processing_time = processing_time fastapi_app.num_current_requests -= 1 return response @@ -233,7 +233,7 @@ def run(self): security = HTTPBasic() fastapi_app.global_request_count = 0 fastapi_app.num_current_requests = 0 - fastapi_app.last_process_time = 0 + fastapi_app.last_processing_time = 0 fastapi_app.SEND_TASK = None @fastapi_app.on_event("startup") @@ -268,7 +268,7 @@ async def sys_info(authenticated: bool = Depends(authenticate_private_endpoint)) num_workers=len(self.servers), servers=self.servers, num_requests=fastapi_app.num_current_requests, - process_time=fastapi_app.last_process_time, + processing_time=fastapi_app.last_processing_time, global_request_count=fastapi_app.global_request_count, ) From a933247bc27880e3cfd5099075baf28b2ad68fa9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 30 Nov 2022 00:13:20 +0900 Subject: [PATCH 065/110] Dont modify work state from flow --- src/lightning_app/components/auto_scaler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index a3aadb3fe26d2..499e5480aac9e 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -296,8 +296,8 @@ def update_servers(self, 
server_works: List[LightningWork]): AutoScaler uses this method to increase/decrease the number of works. """ old_servers = set(self.servers) - self.servers: List[str] = [server.url for server in server_works if server.url] - new_servers = set(self.servers) + server_urls: List[str] = [server.url for server in server_works if server.url] + new_servers = set(server_urls) if new_servers == old_servers: logging.debug("no new server added") @@ -310,7 +310,7 @@ def update_servers(self, server_works: List[LightningWork]): if deleted_servers: logger.info(f"servers deleted: {deleted_servers}") - self.send_request_to_update_servers(self.servers) + self.send_request_to_update_servers(server_urls) def send_request_to_update_servers(self, servers: List[str]): AUTHORIZATION_TYPE = "Basic" From 5d7d1c34c4248feb24950a957b8a919bf2c5cde8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 30 Nov 2022 00:14:59 +0900 Subject: [PATCH 066/110] Update tests --- tests/tests_app/components/test_auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py index d0ce1cf3208c0..17183ea5ccef8 100644 --- a/tests/tests_app/components/test_auto_scaler.py +++ b/tests/tests_app/components/test_auto_scaler.py @@ -31,7 +31,7 @@ def test_num_replicas_after_init(): @patch("uvicorn.run") @patch("lightning_app.components.auto_scaler._LoadBalancer.url") -@patch("lightning_app.components.auto_scaler.AutoScaler.num_requests") +@patch("lightning_app.components.auto_scaler.AutoScaler.num_pending_requests") def test_num_replicas_not_above_max_replicas(*_): """Test self.num_replicas doesn't exceed max_replicas.""" max_replicas = 6 @@ -51,7 +51,7 @@ def test_num_replicas_not_above_max_replicas(*_): @patch("uvicorn.run") @patch("lightning_app.components.auto_scaler._LoadBalancer.url") -@patch("lightning_app.components.auto_scaler.AutoScaler.num_requests") +@patch("lightning_app.components.auto_scaler.AutoScaler.num_pending_requests") def test_num_replicas_not_belo_min_replicas(*_): """Test self.num_replicas doesn't exceed max_replicas.""" min_replicas = 1 From 73cf3899c25c2aa2171e2707f53c722dc42878c3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 30 Nov 2022 00:36:15 +0900 Subject: [PATCH 067/110] worker_url -> endpoint --- examples/app_server_with_auto_scaler/app.py | 2 +- src/lightning_app/components/auto_scaler.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index a3e2c9182ecfc..331bdf38b108b 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -73,7 +73,7 @@ def scale(self, replicas, metrics): min_replicas=2, max_replicas=4, autoscale_interval=10, - worker_url="predict", + endpoint="predict", input_schema=RequestModel, output_schema=Any, timeout_batching=1, diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 499e5480aac9e..035a31580a799 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -118,7 +118,7 @@ class _LoadBalancer(LightningWork): Args: input_type: Input type. output_type: Output type. - worker_url: The REST API path. + endpoint: The REST API path. max_batch_size: The number of requests processed at once. 
timeout_batching: The number of seconds to wait before sending the requests to process in order to allow for
             requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached.
@@ -131,7 +131,7 @@ def __init__(
         self,
         input_type: type,
         output_type: type,
-        worker_url: str,
+        endpoint: str,
         max_batch_size: int = 8,
         # all timeout args are in seconds
         timeout_batching: int = 10,
@@ -152,7 +152,7 @@ def __init__(
         self._batch = []
         self._responses = {}  # {request_id: response}
         self._last_batch_sent = 0
-        self.worker_url = worker_url
+        self.endpoint = endpoint
 
     async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]):
         server = next(self._iter)  # round-robin
@@ -166,7 +166,7 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]):
             "Content-Type": "application/json",
         }
         async with session.post(
-            f"{server}/{self.worker_url}",
+            f"{server}/{self.endpoint}",
             json=batch_request_data.dict(),
             timeout=self._timeout_inference_request,
             headers=headers,
@@ -345,7 +345,7 @@ class AutoScaler(LightningFlow):
         autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works.
         downscale_threshold: Lower limit to determine when to stop works.
         upscale_threshold: Upper limit to determine when to spawn up a new work.
-        worker_url: Default=api/predict. Provide the REST API path
+        endpoint: Default=api/predict. Provide the REST API path.
         max_batch_size: (auto-batching) The number of requests to process at once.
         timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process.
         input_type: Input type.
@@ -399,7 +399,7 @@ def __init__(
         timeout_batching: float = 2,
         downscale_threshold: Optional[int] = None,
         upscale_threshold: Optional[int] = None,
-        worker_url: str = None,
+        endpoint: str = None,
         input_type: type = Dict,
         output_type: type = Dict,
         *work_args: Any,
@@ -428,11 +428,11 @@ def __init__(
         self._last_autoscale = time.time()
         self.fake_trigger = 0
 
-        worker_url = worker_url or "api/predict"
+        endpoint = endpoint or "api/predict"
         self.load_balancer = _LoadBalancer(
             input_type=self._input_type,
             output_type=self._output_type,
-            worker_url=worker_url,
+            endpoint=endpoint,
             max_batch_size=max_batch_size,
             timeout_batching=timeout_batching,
             cache_calls=True,

From 8840b8528517ebd162b66623ea00363eb3290375 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 30 Nov 2022 04:11:20 +0900
Subject: [PATCH 068/110] Fix example

---
 examples/app_server_with_auto_scaler/app.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py
index 331bdf38b108b..488cf135ee507 100644
--- a/examples/app_server_with_auto_scaler/app.py
+++ b/examples/app_server_with_auto_scaler/app.py
@@ -74,8 +74,8 @@ def scale(self, replicas, metrics):
         max_replicas=4,
         autoscale_interval=10,
         endpoint="predict",
-        input_schema=RequestModel,
-        output_schema=Any,
+        input_type=RequestModel,
+        output_type=Any,
         timeout_batching=1,
     )
 )

From b7301e6cfeaa40352299326398ac93f9f4c15dc9 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 30 Nov 2022 20:46:31 +0900
Subject: [PATCH 069/110] Fix default scale logic

---
 src/lightning_app/components/auto_scaler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py
index 035a31580a799..91462a6a93947 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ 
b/src/lightning_app/components/auto_scaler.py @@ -368,7 +368,7 @@ class AutoScaler(LightningFlow): class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas: int, metrics: dict) -> int: # upscale - if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: + if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): return replicas + 1 # downscale @@ -501,7 +501,7 @@ def scale(self, replicas: int, metrics) -> int: """The default replication logic that users can override.""" # upscale - if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: + if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): return replicas + 1 # downscale From c8d0c8633b962386089592e13791a2c051379033 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 30 Nov 2022 20:46:49 +0900 Subject: [PATCH 070/110] Fix default scale logic --- examples/app_server_with_auto_scaler/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 488cf135ee507..449bb3c948642 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -57,7 +57,7 @@ def scale(self, replicas, metrics): """The default replication logic that users can override.""" # upscale (default: upscale_threshold = min_replicas * max_batch_size) - if metrics["pending_requests"] > self.upscale_threshold * metrics["pending_works"]: + if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): return replicas + 1 # downscale From ad5b8e5e5a147c87e3d0754c7ed717e6e7ed9076 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 30 Nov 2022 23:48:47 +0900 Subject: [PATCH 071/110] Fix num_pending_works --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 91462a6a93947..712a468bc2e4d 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -518,7 +518,7 @@ def num_pending_requests(self) -> int: @property def num_pending_works(self) -> int: """The number of unready works.""" - return sum(1 for work in self.workers if work.url) + return sum(1 for work in self.workers if not work.url) def autoscale(self) -> None: """Upscale and down scale model inference works.""" From 9a43d316953368d9d0a855e21c82c956e8a8848c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 20:43:57 +0900 Subject: [PATCH 072/110] Update num_pending_works --- src/lightning_app/components/auto_scaler.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 712a468bc2e4d..ca04ed04ac20b 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -249,7 +249,7 @@ def shutdown_event(): def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") if len(AUTO_SCALER_AUTH_PASSWORD) == 0: - logging.warning("You have not set password for private endpoints!") + logger.warn("You have not set password for private endpoints!") current_password_bytes = credentials.password.encode("utf8") is_correct_password = 
secrets.compare_digest( current_password_bytes, AUTO_SCALER_AUTH_PASSWORD.encode("utf8") @@ -300,7 +300,7 @@ def update_servers(self, server_works: List[LightningWork]): new_servers = set(server_urls) if new_servers == old_servers: - logging.debug("no new server added") + logger.debug("no new server added") return if new_servers - old_servers: @@ -442,14 +442,6 @@ def __init__( work = self.create_worker() self.add_work(work) - logger.info( - f"Initialized AutoScaler(" - f"min_replicas={min_replicas}," - f" max_replicas={max_replicas}," - f" timeout_batching={timeout_batching}," - f" max_batch_size={max_batch_size})" - ) - @property def workers(self) -> List[LightningWork]: return [self.get_work(i) for i in range(self.num_replicas)] @@ -517,8 +509,8 @@ def num_pending_requests(self) -> int: @property def num_pending_works(self) -> int: - """The number of unready works.""" - return sum(1 for work in self.workers if not work.url) + """The number of pending works.""" + return sum(work.is_pending for work in self.workers) def autoscale(self) -> None: """Upscale and down scale model inference works.""" From ce4d25780cacbfc0c3c70e0fee56c7ca8c24dfd3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 20:57:20 +0900 Subject: [PATCH 073/110] Fix bug creating too many works --- src/lightning_app/components/auto_scaler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index ca04ed04ac20b..62d206079da6c 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -489,11 +489,14 @@ def run(self): self.fake_trigger += 1 # Note: change state to keep calling `run`. self.autoscale() - def scale(self, replicas: int, metrics) -> int: + def scale(self, replicas: int, metrics: dict) -> int: """The default replication logic that users can override.""" + max_requests_per_work = self.upscale_threshold + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] + ) - # upscale - if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): + if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 # downscale From 9bebb8654a7e240207b72f368d02d21f2e2b6eab Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 21:21:35 +0900 Subject: [PATCH 074/110] Remove up/downscale_threshold args --- src/lightning_app/components/auto_scaler.py | 35 ++++++++++----------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 62d206079da6c..18f79297bad53 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -6,7 +6,7 @@ import uuid from base64 import b64encode from itertools import cycle -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Tuple, Type import aiohttp import aiohttp.client_exceptions @@ -343,8 +343,6 @@ class AutoScaler(LightningFlow): min_replicas: The number of works to start when app initializes. max_replicas: The max number of works to spawn to handle the incoming requests. autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. - downscale_threshold: Lower limit to determine when to stop works. 
- upscale_threshold: Upper limit to determine when to spawn up a new work. endpoint: Default=api/predict. Provide the REST API path max_batch_size: (auto-batching) The number of requests to process at once. timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. @@ -367,14 +365,6 @@ class AutoScaler(LightningFlow): # Example 2: Customizing the scaling logic class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas: int, metrics: dict) -> int: - # upscale - if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): - return replicas + 1 - - # downscale - if metrics["pending_requests"] < self.downscale_threshold: - return replicas - 1 - return replicas app = L.LightningApp( @@ -397,8 +387,6 @@ def __init__( autoscale_interval: int = 1 * 10, max_batch_size: int = 8, timeout_batching: float = 2, - downscale_threshold: Optional[int] = None, - upscale_threshold: Optional[int] = None, endpoint: str = None, input_type: type = Dict, output_type: type = Dict, @@ -416,6 +404,7 @@ def __init__( self._input_type = input_type self._output_type = output_type self.autoscale_interval = autoscale_interval + self.max_batch_size = max_batch_size if max_replicas < min_replicas: raise ValueError( @@ -423,8 +412,6 @@ def __init__( ) self.max_replicas = max_replicas self.min_replicas = min_replicas - self.downscale_threshold = downscale_threshold or min_replicas - self.upscale_threshold = upscale_threshold or min_replicas * max_batch_size self._last_autoscale = time.time() self.fake_trigger = 0 @@ -490,17 +477,29 @@ def run(self): self.autoscale() def scale(self, replicas: int, metrics: dict) -> int: - """The default replication logic that users can override.""" - max_requests_per_work = self.upscale_threshold + """The default scaling logic that users can override. + + Args: + replicas: The number of running works. + metrics: ``metrics['pending_requests']`` is the total number of requests that are currently pending. + ``metrics['pending_works']`` is the number of pending works. + + Returns: + The target number of running works. The value will be adjusted after this method runs + so that it satisfies ``min_replicas<=replicas<=max_replicas``. + """ + max_requests_per_work = self.max_batch_size + min_requests_per_work = 1 pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( replicas + metrics["pending_works"] ) + # upscale if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 # downscale - if metrics["pending_requests"] < self.downscale_threshold: + if pending_requests_per_running_or_pending_work < min_requests_per_work: return replicas - 1 return replicas From 8b1415489d761bf2242082d059bb39c4b5cd3ed8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 21:22:40 +0900 Subject: [PATCH 075/110] Update example --- examples/app_server_with_auto_scaler/app.py | 23 +++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 449bb3c948642..6d56ae1a397c1 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -54,14 +54,29 @@ def predict(self, requests: BatchRequestModel): class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas, metrics): - """The default replication logic that users can override.""" + """The default scaling logic that users can override. 
+ + Args: + replicas: The number of running works. + metrics: ``metrics['pending_requests']`` is the total number of requests that are currently pending. + ``metrics['pending_works']`` is the number of pending works. + + Returns: + The target number of running works. The value will be adjusted after this method runs + so that it satisfies ``min_replicas<=replicas<=max_replicas``. + """ + max_requests_per_work = self.max_batch_size + min_requests_per_work = 1 + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] + ) - # upscale (default: upscale_threshold = min_replicas * max_batch_size) - if metrics["pending_requests"] > self.upscale_threshold * (metrics["pending_works"] + 1): + # upscale + if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 # downscale - if metrics["pending_requests"] < self.downscale_threshold: + if pending_requests_per_running_or_pending_work < min_requests_per_work: return replicas - 1 return replicas From 611077edb66aa8266fafa963e7700af864c9d407 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 21:25:20 +0900 Subject: [PATCH 076/110] Add typing --- examples/app_server_with_auto_scaler/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 6d56ae1a397c1..ec7b0c88b6e91 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -53,7 +53,7 @@ def predict(self, requests: BatchRequestModel): class MyAutoScaler(L.app.components.AutoScaler): - def scale(self, replicas, metrics): + def scale(self, replicas: int, metrics: dict) -> int: """The default scaling logic that users can override. 
Args: From 4e93898e6896d83e916cf2591b3346393749be93 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Dec 2022 21:29:12 +0900 Subject: [PATCH 077/110] Fix example in docstring --- src/lightning_app/components/auto_scaler.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 18f79297bad53..4cc6cfed3dbfe 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -365,6 +365,20 @@ class AutoScaler(LightningFlow): # Example 2: Customizing the scaling logic class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas: int, metrics: dict) -> int: + max_requests_per_work = self.max_batch_size + min_requests_per_work = 1 + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] + ) + + # upscale + if pending_requests_per_running_or_pending_work >= max_requests_per_work: + return replicas + 1 + + # downscale + if pending_requests_per_running_or_pending_work < min_requests_per_work: + return replicas - 1 + return replicas app = L.LightningApp( From 1e42f55aa2229a6729ca07b980537d193ee2f2b4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 00:06:57 +0900 Subject: [PATCH 078/110] Fix default scale logic --- examples/app_server_with_auto_scaler/app.py | 22 ++++++--------------- src/lightning_app/components/auto_scaler.py | 10 +++++----- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index ec7b0c88b6e91..264ec9fa4e3e9 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -54,29 +54,19 @@ def predict(self, requests: BatchRequestModel): class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas: int, metrics: dict) -> int: - """The default scaling logic that users can override. - - Args: - replicas: The number of running works. - metrics: ``metrics['pending_requests']`` is the total number of requests that are currently pending. - ``metrics['pending_works']`` is the number of pending works. - - Returns: - The target number of running works. The value will be adjusted after this method runs - so that it satisfies ``min_replicas<=replicas<=max_replicas``. - """ + """The default scaling logic that users can override.""" + # scale out if the number of pending requests exceeds max batch size. 
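+        # Worked example with illustrative numbers (not from a real run):
+        # with max_batch_size=8, replicas=2 and pending_works=1, a backlog of
+        # pending_requests=30 gives 30 / (2 + 1) = 10 >= 8, so a replica is added.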
max_requests_per_work = self.max_batch_size - min_requests_per_work = 1 pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( replicas + metrics["pending_works"] ) - - # upscale if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 - # downscale - if pending_requests_per_running_or_pending_work < min_requests_per_work: + # scale in if the number of pending requests is below 25% of max_requests_per_work + min_requests_per_work = max_requests_per_work * 0.25 + pending_requests_per_running_work = metrics["pending_requests"] / replicas + if pending_requests_per_running_work < min_requests_per_work: return replicas - 1 return replicas diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 4cc6cfed3dbfe..18c2b4a4d34c0 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -502,18 +502,18 @@ def scale(self, replicas: int, metrics: dict) -> int: The target number of running works. The value will be adjusted after this method runs so that it satisfies ``min_replicas<=replicas<=max_replicas``. """ + # scale out if the number of pending requests exceeds max batch size. max_requests_per_work = self.max_batch_size - min_requests_per_work = 1 pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( replicas + metrics["pending_works"] ) - - # upscale if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 - # downscale - if pending_requests_per_running_or_pending_work < min_requests_per_work: + # scale in if the number of pending requests is below 25% of max_requests_per_work + min_requests_per_work = max_requests_per_work * 0.25 + pending_requests_per_running_work = metrics["pending_requests"] / replicas + if pending_requests_per_running_work < min_requests_per_work: return replicas - 1 return replicas From 0b6153a1cd235a96eb51979950739067f717e9f6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 00:49:41 +0900 Subject: [PATCH 079/110] Update src/lightning_app/components/auto_scaler.py Co-authored-by: Noha Alon --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 18c2b4a4d34c0..f929c2329a6fa 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -249,7 +249,7 @@ def shutdown_event(): def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") if len(AUTO_SCALER_AUTH_PASSWORD) == 0: - logger.warn("You have not set password for private endpoints!") + logger.warn("You have not set a password for private endpoints! 
To set a password add --env AUTO_SCALER_AUTH_PASSWORD= to your lightning run command.") current_password_bytes = credentials.password.encode("utf8") is_correct_password = secrets.compare_digest( current_password_bytes, AUTO_SCALER_AUTH_PASSWORD.encode("utf8") From 6d677c1f551b7d4f38fbf065d9ddb22c45da27f1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Dec 2022 15:51:03 +0000 Subject: [PATCH 080/110] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/components/auto_scaler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index f929c2329a6fa..7f60e8b8243c5 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -249,7 +249,9 @@ def shutdown_event(): def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") if len(AUTO_SCALER_AUTH_PASSWORD) == 0: - logger.warn("You have not set a password for private endpoints! To set a password add --env AUTO_SCALER_AUTH_PASSWORD= to your lightning run command.") + logger.warn( + "You have not set a password for private endpoints! To set a password add --env AUTO_SCALER_AUTH_PASSWORD= to your lightning run command." + ) current_password_bytes = credentials.password.encode("utf8") is_correct_password = secrets.compare_digest( current_password_bytes, AUTO_SCALER_AUTH_PASSWORD.encode("utf8") From 67bbe49df68762df0769dd18aa2ae4c5f4092f89 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 02:34:45 +0900 Subject: [PATCH 081/110] rename method --- src/lightning_app/components/auto_scaler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 7f60e8b8243c5..d77449d847ca6 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -250,7 +250,8 @@ def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(se AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") if len(AUTO_SCALER_AUTH_PASSWORD) == 0: logger.warn( - "You have not set a password for private endpoints! To set a password add --env AUTO_SCALER_AUTH_PASSWORD= to your lightning run command." + "You have not set a password for private endpoints! To set a password, add " + "`--env AUTO_SCALER_AUTH_PASSWORD=` to your lightning run command." 
) current_password_bytes = credentials.password.encode("utf8") is_correct_password = secrets.compare_digest( @@ -442,14 +443,14 @@ def __init__( parallel=True, ) for _ in range(min_replicas): - work = self.create_worker() + work = self.create_work() self.add_work(work) @property def workers(self) -> List[LightningWork]: return [self.get_work(i) for i in range(self.num_replicas)] - def create_worker(self) -> LightningWork: + def create_work(self) -> LightningWork: """Replicates a LightningWork instance with args and kwargs provided via ``__init__``.""" return self._work_cls(*self._work_args, **self._work_kwargs) @@ -552,7 +553,7 @@ def autoscale(self) -> None: num_workers_to_add = num_target_workers - self.num_replicas for _ in range(num_workers_to_add): logger.info(f"Upscaling from {self.num_replicas} to {self.num_replicas + 1}") - work = self.create_worker() + work = self.create_work() new_work_id = self.add_work(work) logger.info(f"Work created: '{new_work_id}'") From e52470205698f330ea47a4d035849768e0a396d7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 02:39:50 +0900 Subject: [PATCH 082/110] rename locvar --- src/lightning_app/components/auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index d77449d847ca6..480bdce700dae 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -486,8 +486,8 @@ def run(self): if not self.load_balancer.is_running: self.load_balancer.run() - for worker in self.workers: - worker.run() + for work in self.workers: + work.run() if self.load_balancer.url: self.fake_trigger += 1 # Note: change state to keep calling `run`. From 2f24422bac4e432f90a96eafd56f7cc62bedecef Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 02:42:12 +0900 Subject: [PATCH 083/110] Add todo --- src/lightning_app/components/auto_scaler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 480bdce700dae..0cee64b8818ea 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -452,7 +452,8 @@ def workers(self) -> List[LightningWork]: def create_work(self) -> LightningWork: """Replicates a LightningWork instance with args and kwargs provided via ``__init__``.""" - return self._work_cls(*self._work_args, **self._work_kwargs) + # TODO: Remove `start_with_flow=False` for faster initialization on the cloud + return self._work_cls(*self._work_args, **self._work_kwargs, start_with_flow=False) def add_work(self, work) -> str: """Adds a new LightningWork instance. From a20797db22800bba72855f1f4956415efb1f6802 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 02:58:19 +0900 Subject: [PATCH 084/110] docs ci --- src/lightning_app/components/auto_scaler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 0cee64b8818ea..942479cc2352f 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -352,7 +352,8 @@ class AutoScaler(LightningFlow): input_type: Input type. output_type: Output type. - Example: + .. 
testcode:: + import lightning as L # Example 1: Auto-scaling serve component out-of-the-box @@ -384,6 +385,7 @@ def scale(self, replicas: int, metrics: dict) -> int: return replicas + app = L.LightningApp( MyAutoScaler( MyPythonServer, From a8a8aaad761a81430c2c2254a7918a34fd194a73 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 06:51:54 +0900 Subject: [PATCH 085/110] docs ci --- src/lightning_app/components/auto_scaler.py | 76 ++++++++++----------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 942479cc2352f..7dbc39fddced1 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -352,50 +352,50 @@ class AutoScaler(LightningFlow): input_type: Input type. output_type: Output type. - .. testcode:: + .. testcode:: - import lightning as L + import lightning as L - # Example 1: Auto-scaling serve component out-of-the-box - app = L.LightningApp( - L.app.components.AutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, - ) + # Example 1: Auto-scaling serve component out-of-the-box + app = L.LightningApp( + L.app.components.AutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, ) + ) - # Example 2: Customizing the scaling logic - class MyAutoScaler(L.app.components.AutoScaler): - def scale(self, replicas: int, metrics: dict) -> int: - max_requests_per_work = self.max_batch_size - min_requests_per_work = 1 - pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( - replicas + metrics["pending_works"] - ) - - # upscale - if pending_requests_per_running_or_pending_work >= max_requests_per_work: - return replicas + 1 - - # downscale - if pending_requests_per_running_or_pending_work < min_requests_per_work: - return replicas - 1 - - return replicas - - - app = L.LightningApp( - MyAutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, - max_batch_size=8, # for auto batching - timeout_batching=2, # for auto batching + # Example 2: Customizing the scaling logic + class MyAutoScaler(L.app.components.AutoScaler): + def scale(self, replicas: int, metrics: dict) -> int: + max_requests_per_work = self.max_batch_size + min_requests_per_work = 1 + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] ) + + # upscale + if pending_requests_per_running_or_pending_work >= max_requests_per_work: + return replicas + 1 + + # downscale + if pending_requests_per_running_or_pending_work < min_requests_per_work: + return replicas - 1 + + return replicas + + + app = L.LightningApp( + MyAutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + max_batch_size=8, # for auto batching + timeout_batching=2, # for auto batching ) + ) """ def __init__( From 09dfda551e514886cfa78ff527daee25f90909ee Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 07:01:33 +0900 Subject: [PATCH 086/110] asdfafsdasdf pls docs --- src/lightning_app/components/auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 7dbc39fddced1..50e8037869217 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -471,7 +471,7 @@ def add_work(self, work) -> str: return 
work_attribute def remove_work(self, index: int) -> str: - """Removes the ``index``th LightningWork instance.""" + """Removes the ``index`` th LightningWork instance.""" work_attribute = self._work_registry[index] del self._work_registry[index] work = getattr(self, work_attribute) @@ -480,7 +480,7 @@ def remove_work(self, index: int) -> str: return work_attribute def get_work(self, index: int) -> LightningWork: - """Returns the ``index``th LightningWork instance.""" + """Returns the ``index`` th LightningWork instance.""" work_attribute = self._work_registry[index] work = getattr(self, work_attribute) return work From 11842b0dbfaf1c38b9099400fb7dff6046c93db6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 2 Dec 2022 23:49:39 +0900 Subject: [PATCH 087/110] Apply suggestions from code review Co-authored-by: Ethan Harris --- src/lightning_app/components/auto_scaler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 50e8037869217..b9ff23f33fa04 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -339,8 +339,9 @@ def send_request_to_update_servers(self, servers: List[str]): class AutoScaler(LightningFlow): - """A LightningFlow component that handles all the servers and uses load balancer to spawn up and shutdown based - on current requests in the queue. + """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server + in response to changes in the number of incoming requests. Incoming requests will be batched and + balanced across the replicas. Args: min_replicas: The number of works to start when app initializes. @@ -480,7 +481,7 @@ def remove_work(self, index: int) -> str: return work_attribute def get_work(self, index: int) -> LightningWork: - """Returns the ``index`` th LightningWork instance.""" + """Returns the ``LightningWork`` instance with the given index.""" work_attribute = self._work_registry[index] work = getattr(self, work_attribute) return work @@ -535,7 +536,7 @@ def num_pending_works(self) -> int: return sum(work.is_pending for work in self.workers) def autoscale(self) -> None: - """Upscale and down scale model inference works.""" + """Adjust the number of works based on the target number returned by ``self.scale``.""" if time.time() - self._last_autoscale < self.autoscale_interval: return From 29059a0830d08ccc5095c275201bd0330399bd69 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 2 Dec 2022 14:51:03 +0000 Subject: [PATCH 088/110] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/components/auto_scaler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index b9ff23f33fa04..868a885263f17 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -339,9 +339,9 @@ def send_request_to_update_servers(self, servers: List[str]): class AutoScaler(LightningFlow): - """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server - in response to changes in the number of incoming requests. Incoming requests will be batched and - balanced across the replicas. 
+ """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in + response to changes in the number of incoming requests. Incoming requests will be batched and balanced across + the replicas. Args: min_replicas: The number of works to start when app initializes. From 428550651f1c113f9d01ec9af11183fe1204bedb Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 00:26:45 +0900 Subject: [PATCH 089/110] . --- src/lightning_app/components/auto_scaler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 868a885263f17..dae46b9b19edd 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -509,18 +509,18 @@ def scale(self, replicas: int, metrics: dict) -> int: The target number of running works. The value will be adjusted after this method runs so that it satisfies ``min_replicas<=replicas<=max_replicas``. """ - # scale out if the number of pending requests exceeds max batch size. - max_requests_per_work = self.max_batch_size pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( replicas + metrics["pending_works"] ) + + # scale out if the number of pending requests exceeds max batch size. + max_requests_per_work = self.max_batch_size if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 # scale in if the number of pending requests is below 25% of max_requests_per_work min_requests_per_work = max_requests_per_work * 0.25 - pending_requests_per_running_work = metrics["pending_requests"] / replicas - if pending_requests_per_running_work < min_requests_per_work: + if pending_requests_per_running_or_pending_work < min_requests_per_work: return replicas - 1 return replicas From 72a6f1399efaa73ca064255cbdb144ffae406c1d Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 00:28:35 +0900 Subject: [PATCH 090/110] doc --- src/lightning_app/components/auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index dae46b9b19edd..1b8f41a4e4379 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -370,17 +370,17 @@ class AutoScaler(LightningFlow): # Example 2: Customizing the scaling logic class MyAutoScaler(L.app.components.AutoScaler): def scale(self, replicas: int, metrics: dict) -> int: - max_requests_per_work = self.max_batch_size - min_requests_per_work = 1 pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( replicas + metrics["pending_works"] ) # upscale + max_requests_per_work = self.max_batch_size if pending_requests_per_running_or_pending_work >= max_requests_per_work: return replicas + 1 # downscale + min_requests_per_work = max_requests_per_work * 0.25 if pending_requests_per_running_or_pending_work < min_requests_per_work: return replicas - 1 From 56ea78b45f3a2d5e28b622cfc2240d95a906ac6d Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 01:19:25 +0900 Subject: [PATCH 091/110] Update src/lightning_app/components/auto_scaler.py Co-authored-by: Noha Alon --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 
1b8f41a4e4379..907e1c58ee21b 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -342,7 +342,7 @@ class AutoScaler(LightningFlow): """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in response to changes in the number of incoming requests. Incoming requests will be batched and balanced across the replicas. - +Note that the ``Autoscaler`` experience on the cloud is in beta. Args: min_replicas: The number of works to start when app initializes. max_replicas: The max number of works to spawn to handle the incoming requests. From 24983a0a5ab915fad3456690499a8ea4157e58f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Dec 2022 16:20:41 +0000 Subject: [PATCH 092/110] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/components/auto_scaler.py | 106 ++++++++++---------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 907e1c58ee21b..7d78b8191e320 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -340,63 +340,63 @@ def send_request_to_update_servers(self, servers: List[str]): class AutoScaler(LightningFlow): """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in - response to changes in the number of incoming requests. Incoming requests will be batched and balanced across - the replicas. -Note that the ``Autoscaler`` experience on the cloud is in beta. - Args: - min_replicas: The number of works to start when app initializes. - max_replicas: The max number of works to spawn to handle the incoming requests. - autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. - endpoint: Default=api/predict. Provide the REST API path - max_batch_size: (auto-batching) The number of requests to process at once. - timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. - input_type: Input type. - output_type: Output type. - - .. testcode:: - - import lightning as L - - # Example 1: Auto-scaling serve component out-of-the-box - app = L.LightningApp( - L.app.components.AutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, + response to changes in the number of incoming requests. Incoming requests will be batched and balanced across + the replicas. + Note that the ``Autoscaler`` experience on the cloud is in beta. + Args: + min_replicas: The number of works to start when app initializes. + max_replicas: The max number of works to spawn to handle the incoming requests. + autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. + endpoint: Default=api/predict. Provide the REST API path + max_batch_size: (auto-batching) The number of requests to process at once. + timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. + input_type: Input type. + output_type: Output type. + + .. 
testcode:: + + import lightning as L + + # Example 1: Auto-scaling serve component out-of-the-box + app = L.LightningApp( + L.app.components.AutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + ) ) - ) - # Example 2: Customizing the scaling logic - class MyAutoScaler(L.app.components.AutoScaler): - def scale(self, replicas: int, metrics: dict) -> int: - pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( - replicas + metrics["pending_works"] + # Example 2: Customizing the scaling logic + class MyAutoScaler(L.app.components.AutoScaler): + def scale(self, replicas: int, metrics: dict) -> int: + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] + ) + + # upscale + max_requests_per_work = self.max_batch_size + if pending_requests_per_running_or_pending_work >= max_requests_per_work: + return replicas + 1 + + # downscale + min_requests_per_work = max_requests_per_work * 0.25 + if pending_requests_per_running_or_pending_work < min_requests_per_work: + return replicas - 1 + + return replicas + + + app = L.LightningApp( + MyAutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + max_batch_size=8, # for auto batching + timeout_batching=2, # for auto batching ) - - # upscale - max_requests_per_work = self.max_batch_size - if pending_requests_per_running_or_pending_work >= max_requests_per_work: - return replicas + 1 - - # downscale - min_requests_per_work = max_requests_per_work * 0.25 - if pending_requests_per_running_or_pending_work < min_requests_per_work: - return replicas - 1 - - return replicas - - - app = L.LightningApp( - MyAutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, - max_batch_size=8, # for auto batching - timeout_batching=2, # for auto batching ) - ) """ def __init__( From 27431f4b87b7de18114e854048ff8b41eaead380 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 02:16:41 +0900 Subject: [PATCH 093/110] Revert "[pre-commit.ci] auto fixes from pre-commit.com hooks" This reverts commit 24983a0a5ab915fad3456690499a8ea4157e58f0. --- src/lightning_app/components/auto_scaler.py | 106 ++++++++++---------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 7d78b8191e320..907e1c58ee21b 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -340,63 +340,63 @@ def send_request_to_update_servers(self, servers: List[str]): class AutoScaler(LightningFlow): """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in - response to changes in the number of incoming requests. Incoming requests will be batched and balanced across - the replicas. - Note that the ``Autoscaler`` experience on the cloud is in beta. - Args: - min_replicas: The number of works to start when app initializes. - max_replicas: The max number of works to spawn to handle the incoming requests. - autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. - endpoint: Default=api/predict. Provide the REST API path - max_batch_size: (auto-batching) The number of requests to process at once. - timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. - input_type: Input type. - output_type: Output type. - - .. 
testcode:: - - import lightning as L - - # Example 1: Auto-scaling serve component out-of-the-box - app = L.LightningApp( - L.app.components.AutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, - ) + response to changes in the number of incoming requests. Incoming requests will be batched and balanced across + the replicas. +Note that the ``Autoscaler`` experience on the cloud is in beta. + Args: + min_replicas: The number of works to start when app initializes. + max_replicas: The max number of works to spawn to handle the incoming requests. + autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. + endpoint: Default=api/predict. Provide the REST API path + max_batch_size: (auto-batching) The number of requests to process at once. + timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. + input_type: Input type. + output_type: Output type. + + .. testcode:: + + import lightning as L + + # Example 1: Auto-scaling serve component out-of-the-box + app = L.LightningApp( + L.app.components.AutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, ) + ) - # Example 2: Customizing the scaling logic - class MyAutoScaler(L.app.components.AutoScaler): - def scale(self, replicas: int, metrics: dict) -> int: - pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( - replicas + metrics["pending_works"] - ) - - # upscale - max_requests_per_work = self.max_batch_size - if pending_requests_per_running_or_pending_work >= max_requests_per_work: - return replicas + 1 - - # downscale - min_requests_per_work = max_requests_per_work * 0.25 - if pending_requests_per_running_or_pending_work < min_requests_per_work: - return replicas - 1 - - return replicas - - - app = L.LightningApp( - MyAutoScaler( - MyPythonServer, - min_replicas=1, - max_replicas=8, - autoscale_interval=10, - max_batch_size=8, # for auto batching - timeout_batching=2, # for auto batching + # Example 2: Customizing the scaling logic + class MyAutoScaler(L.app.components.AutoScaler): + def scale(self, replicas: int, metrics: dict) -> int: + pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( + replicas + metrics["pending_works"] ) + + # upscale + max_requests_per_work = self.max_batch_size + if pending_requests_per_running_or_pending_work >= max_requests_per_work: + return replicas + 1 + + # downscale + min_requests_per_work = max_requests_per_work * 0.25 + if pending_requests_per_running_or_pending_work < min_requests_per_work: + return replicas - 1 + + return replicas + + + app = L.LightningApp( + MyAutoScaler( + MyPythonServer, + min_replicas=1, + max_replicas=8, + autoscale_interval=10, + max_batch_size=8, # for auto batching + timeout_batching=2, # for auto batching ) + ) """ def __init__( From b7dd2c185e3c1f2a55552baf8020fd852be949af Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 02:16:46 +0900 Subject: [PATCH 094/110] Revert "Update src/lightning_app/components/auto_scaler.py" This reverts commit 56ea78b45f3a2d5e28b622cfc2240d95a906ac6d. 
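For reference, the scaling policy that patches 089 and 090 converge on (and that the two reverts above leave untouched) reads as follows when written out as a standalone subclass. This is a sketch, not a verbatim copy of the file: the class name QueueDepthScaler is hypothetical, and it assumes the ``metrics`` dict always carries the ``pending_requests`` and ``pending_works`` keys used in the hunks above.

    from lightning_app.components import AutoScaler


    class QueueDepthScaler(AutoScaler):
        """Spells out the same policy as the default ``scale``."""

        def scale(self, replicas: int, metrics: dict) -> int:
            # Average queued requests over replicas that are running or still
            # booting, so a work that is already starting counts as capacity.
            pending_per_work = metrics["pending_requests"] / (replicas + metrics["pending_works"])

            # Scale out once a full batch is already waiting per replica.
            if pending_per_work >= self.max_batch_size:
                return replicas + 1

            # Scale in once the queue drops below 25% of a batch per replica.
            if pending_per_work < 0.25 * self.max_batch_size:
                return replicas - 1

            return replicas

Whatever ``scale`` returns is then clamped to ``min_replicas<=replicas<=max_replicas`` before works are added or removed, as the ``scale`` docstring above promises.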
--- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 907e1c58ee21b..1b8f41a4e4379 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -342,7 +342,7 @@ class AutoScaler(LightningFlow): """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in response to changes in the number of incoming requests. Incoming requests will be batched and balanced across the replicas. -Note that the ``Autoscaler`` experience on the cloud is in beta. + Args: min_replicas: The number of works to start when app initializes. max_replicas: The max number of works to spawn to handle the incoming requests. From a6344468b1952b09052bffd8017f89704eca81d6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:09:21 +0900 Subject: [PATCH 095/110] Remove redefinition --- src/lightning_app/components/auto_scaler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 1b8f41a4e4379..dbf4e4c70ceac 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -231,9 +231,6 @@ def run(self): fastapi_app = _create_fastapi("Load Balancer") security = HTTPBasic() - fastapi_app.global_request_count = 0 - fastapi_app.num_current_requests = 0 - fastapi_app.last_processing_time = 0 fastapi_app.SEND_TASK = None @fastapi_app.on_event("startup") From 64a1960f9e48f4a92fcac985e22d0e1d547cdce4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:17:48 +0900 Subject: [PATCH 096/110] Remove load balancer run blocker --- src/lightning_app/components/auto_scaler.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index dbf4e4c70ceac..2ace05e1f9596 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -142,7 +142,6 @@ def __init__( super().__init__(cloud_compute=CloudCompute("default"), **kwargs) self._input_type = input_type self._output_type = output_type - self._server_ready = False self._timeout_keep_alive = timeout_keep_alive self._timeout_inference_request = timeout_inference_request self.servers = [] @@ -221,8 +220,6 @@ async def process_request(self, data: BaseModel): return result def run(self): - if self._server_ready: - return logger.info(f"servers: {self.servers}") @@ -236,12 +233,10 @@ def run(self): @fastapi_app.on_event("startup") async def startup_event(): fastapi_app.SEND_TASK = asyncio.create_task(self.consumer()) - self._server_ready = True @fastapi_app.on_event("shutdown") def shutdown_event(): fastapi_app.SEND_TASK.cancel() - self._server_ready = False def authenticate_private_endpoint(credentials: HTTPBasicCredentials = Depends(security)): AUTO_SCALER_AUTH_PASSWORD = os.environ.get("AUTO_SCALER_AUTH_PASSWORD", "") From fba7a3cbe38436a556d79d4d2757f08d7e25b5c8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:20:20 +0900 Subject: [PATCH 097/110] raise RuntimeError --- src/lightning_app/components/auto_scaler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 2ace05e1f9596..a29ba72266910 100644 --- 
a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -175,7 +175,8 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): response.raise_for_status() response = await response.json() outputs = response["outputs"] - assert len(batch) == len(outputs), f"result has {len(outputs)} items but batch is {len(batch)}" + if len(batch) == len(outputs): + raise RuntimeError(f"result has {len(outputs)} items but batch is {len(batch)}") result = {request[0]: r for request, r in zip(batch, outputs)} self._responses.update(result) except Exception as ex: From 4ccc38cbb239ac081d69719058c4aafa99c1b675 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:23:09 +0900 Subject: [PATCH 098/110] remove has_sent --- src/lightning_app/components/auto_scaler.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index a29ba72266910..712eb5dbcd47a 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -187,20 +187,14 @@ async def consumer(self): while True: await asyncio.sleep(0.05) - has_sent = False - batch = self._batch[: self.max_batch_size] while batch and ( (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batching) ): - has_sent = True - asyncio.create_task(self.send_batch(batch)) self._batch = self._batch[self.max_batch_size :] batch = self._batch[: self.max_batch_size] - - if has_sent: self._last_batch_sent = time.time() async def process_request(self, data: BaseModel): From aa1785c2843d786cc7843a7799db4c5adc08786c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:24:11 +0900 Subject: [PATCH 099/110] lower the default timeout_batching from 10 to 1 --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 712eb5dbcd47a..ae1e60251d399 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -134,7 +134,7 @@ def __init__( endpoint: str, max_batch_size: int = 8, # all timeout args are in seconds - timeout_batching: int = 10, + timeout_batching: int = 1, timeout_keep_alive: int = 60, timeout_inference_request: int = 60, **kwargs: Any, From 7c097166e30824af7962a99aeea38515872a56e1 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:25:30 +0900 Subject: [PATCH 100/110] remove debug --- src/lightning_app/components/auto_scaler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index ae1e60251d399..7d6a39fba58d7 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -290,7 +290,6 @@ def update_servers(self, server_works: List[LightningWork]): new_servers = set(server_urls) if new_servers == old_servers: - logger.debug("no new server added") return if new_servers - old_servers: From ff2009a7b270eeb3dd0944d22c9ab564e413861a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:28:25 +0900 Subject: [PATCH 101/110] update the default timeout_batching --- src/lightning_app/components/auto_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py 
index 7d6a39fba58d7..778deaadb0418 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -380,7 +380,7 @@ def scale(self, replicas: int, metrics: dict) -> int: max_replicas=8, autoscale_interval=10, max_batch_size=8, # for auto batching - timeout_batching=2, # for auto batching + timeout_batching=1, # for auto batching ) ) """ @@ -392,7 +392,7 @@ def __init__( max_replicas: int = 4, autoscale_interval: int = 1 * 10, max_batch_size: int = 8, - timeout_batching: float = 2, + timeout_batching: float = 1, endpoint: str = None, input_type: type = Dict, output_type: type = Dict, From 839734dce3ec5a81b215a0e3ec0b1a0c5bf870dd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:33:15 +0900 Subject: [PATCH 102/110] . --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 778deaadb0418..840232f829ecd 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -390,7 +390,7 @@ def __init__( work_cls: Type[LightningWork], min_replicas: int = 1, max_replicas: int = 4, - autoscale_interval: int = 1 * 10, + autoscale_interval: int = 10, max_batch_size: int = 8, timeout_batching: float = 1, endpoint: str = None, From 6a553b954ab29209cc5646117d1b572115246045 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:46:57 +0900 Subject: [PATCH 103/110] tighten condition --- src/lightning_app/components/auto_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 840232f829ecd..adf7912cdc9aa 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -189,7 +189,7 @@ async def consumer(self): batch = self._batch[: self.max_batch_size] while batch and ( - (len(batch) >= self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batching) + (len(batch) == self.max_batch_size) or ((time.time() - self._last_batch_sent) > self.timeout_batching) ): asyncio.create_task(self.send_batch(batch)) From 506e1921897da7cf943641363dc76458eee345b7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 22:48:49 +0900 Subject: [PATCH 104/110] fix endpoint --- src/lightning_app/components/auto_scaler.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index adf7912cdc9aa..cefea15a3fa23 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -80,19 +80,6 @@ def _create_fastapi(title: str) -> FastAPI: fastapi_app.num_current_requests = 0 fastapi_app.last_processing_time = 0 - @fastapi_app.middleware("http") - async def current_request_counter(request: Request, call_next): - if not request.scope["path"] == "/api/predict": - return await call_next(request) - fastapi_app.global_request_count += 1 - fastapi_app.num_current_requests += 1 - start_time = time.time() - response = await call_next(request) - processing_time = time.time() - start_time - fastapi_app.last_processing_time = processing_time - fastapi_app.num_current_requests -= 1 - return response - @fastapi_app.get("/", include_in_schema=False) async def docs(): return RedirectResponse("/docs") @@ -225,6 +212,19 @@ 
def run(self): security = HTTPBasic() fastapi_app.SEND_TASK = None + @fastapi_app.middleware("http") + async def current_request_counter(request: Request, call_next): + if not request.scope["path"] == self.endpoint: + return await call_next(request) + fastapi_app.global_request_count += 1 + fastapi_app.num_current_requests += 1 + start_time = time.time() + response = await call_next(request) + processing_time = time.time() - start_time + fastapi_app.last_processing_time = processing_time + fastapi_app.num_current_requests -= 1 + return response + @fastapi_app.on_event("startup") async def startup_event(): fastapi_app.SEND_TASK = asyncio.create_task(self.consumer()) @@ -267,7 +267,7 @@ async def update_servers(servers: List[str], authenticated: bool = Depends(authe self.servers = servers self._iter = cycle(self.servers) - @fastapi_app.post("/api/predict", response_model=self._output_type) + @fastapi_app.post(self.endpoint, response_model=self._output_type) async def balance_api(inputs: self._input_type): return await self.process_request(inputs) @@ -333,7 +333,7 @@ class AutoScaler(LightningFlow): min_replicas: The number of works to start when app initializes. max_replicas: The max number of works to spawn to handle the incoming requests. autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works. - endpoint: Default=api/predict. Provide the REST API path + endpoint: Provide the REST API path. max_batch_size: (auto-batching) The number of requests to process at once. timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process. input_type: Input type. @@ -393,7 +393,7 @@ def __init__( autoscale_interval: int = 10, max_batch_size: int = 8, timeout_batching: float = 1, - endpoint: str = None, + endpoint: str = "api/predict", input_type: type = Dict, output_type: type = Dict, *work_args: Any, @@ -421,7 +421,6 @@ def __init__( self._last_autoscale = time.time() self.fake_trigger = 0 - endpoint = endpoint or "api/predict" self.load_balancer = _LoadBalancer( input_type=self._input_type, output_type=self._output_type, From 205c0af32592aa774163cecf378b2ba2bb27b3bd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 23:00:08 +0900 Subject: [PATCH 105/110] typo in runtimeerror cond --- src/lightning_app/components/auto_scaler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index cefea15a3fa23..625f69094c92a 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -138,6 +138,10 @@ def __init__( self._batch = [] self._responses = {} # {request_id: response} self._last_batch_sent = 0 + + if not endpoint.startswith("/"): + endpoint = "/" + endpoint + self.endpoint = endpoint async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): @@ -152,7 +156,7 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): "Content-Type": "application/json", } async with session.post( - f"{server}/{self.endpoint}", + f"{server}{self.endpoint}", json=batch_request_data.dict(), timeout=self._timeout_inference_request, headers=headers, @@ -162,7 +166,7 @@ async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]): response.raise_for_status() response = await response.json() outputs = response["outputs"] - if len(batch) == len(outputs): + if len(batch) != len(outputs): raise RuntimeError(f"result has 
{len(outputs)} items but batch is {len(batch)}") result = {request[0]: r for request, r in zip(batch, outputs)} self._responses.update(result) From 6d76b0d54cfc042a922875cb617973048c3645fc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 6 Dec 2022 23:47:43 +0900 Subject: [PATCH 106/110] async lock update severs --- src/lightning_app/components/auto_scaler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index 625f69094c92a..fe5bdc67de6e5 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -25,6 +25,7 @@ from lightning_app.utilities.packaging.cloud_compute import CloudCompute logger = Logger(__name__) +lock = asyncio.Lock() def _raise_granular_exception(exception: Exception) -> None: @@ -268,7 +269,9 @@ async def sys_info(authenticated: bool = Depends(authenticate_private_endpoint)) @fastapi_app.put("/system/update-servers") async def update_servers(servers: List[str], authenticated: bool = Depends(authenticate_private_endpoint)): - self.servers = servers + async with lock: + self.servers = servers + self._iter = cycle(self.servers) @fastapi_app.post(self.endpoint, response_model=self._output_type) From 2233098024d6f83da46b55497fd3b93923cdbd5e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Dec 2022 00:31:02 +0900 Subject: [PATCH 107/110] add a test --- .../tests_app/components/test_auto_scaler.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py index 17183ea5ccef8..436c3517d01ca 100644 --- a/tests/tests_app/components/test_auto_scaler.py +++ b/tests/tests_app/components/test_auto_scaler.py @@ -1,6 +1,8 @@ import time from unittest.mock import patch +import pytest + from lightning_app import LightningWork from lightning_app.components import AutoScaler @@ -67,3 +69,24 @@ def test_num_replicas_not_belo_min_replicas(*_): auto_scaler.run() assert auto_scaler.num_replicas == min_replicas + + +@pytest.mark.parametrize( + "replicas, metrics, expected_replicas", + [ + pytest.param(1, {"pending_requests": 1, "pending_works": 0}, 2, id="increase if no pending work"), + pytest.param(1, {"pending_requests": 1, "pending_works": 1}, 1, id="dont increase if pending works"), + pytest.param(8, {"pending_requests": 1, "pending_works": 0}, 7, id="reduce if requests < 25% capacity"), + pytest.param(8, {"pending_requests": 2, "pending_works": 0}, 8, id="dont reduce if requests >= 25% capacity"), + ], +) +def test_scale(replicas, metrics, expected_replicas): + """Test `scale()`, the default scaling strategy.""" + auto_scaler = AutoScaler( + EmptyWork, + min_replicas=1, + max_replicas=8, + max_batch_size=1, + ) + + assert auto_scaler.scale(replicas, metrics) == expected_replicas From 4526496ce0ad2aa6975b4a76470541eeb7ac62b9 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Dec 2022 00:37:27 +0900 Subject: [PATCH 108/110] {in,out}put_type typing --- src/lightning_app/components/auto_scaler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/auto_scaler.py index fe5bdc67de6e5..ad9d69690b23d 100644 --- a/src/lightning_app/components/auto_scaler.py +++ b/src/lightning_app/components/auto_scaler.py @@ -117,8 +117,8 @@ class _LoadBalancer(LightningWork): def __init__( self, - input_type: type, - output_type: 
type, + input_type: BaseModel, + output_type: BaseModel, endpoint: str, max_batch_size: int = 8, # all timeout args are in seconds @@ -401,8 +401,8 @@ def __init__( max_batch_size: int = 8, timeout_batching: float = 1, endpoint: str = "api/predict", - input_type: type = Dict, - output_type: type = Dict, + input_type: BaseModel = Dict, + output_type: BaseModel = Dict, *work_args: Any, **work_kwargs: Any, ) -> None: From 468b6262d85018ef88d50e7c51faea235543e512 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Dec 2022 20:11:36 +0900 Subject: [PATCH 109/110] Update examples/app_server_with_auto_scaler/app.py Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- examples/app_server_with_auto_scaler/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py index 264ec9fa4e3e9..b713bd6d1dcfc 100644 --- a/examples/app_server_with_auto_scaler/app.py +++ b/examples/app_server_with_auto_scaler/app.py @@ -8,7 +8,7 @@ class RequestModel(BaseModel): - image: str + image: str # bytecode class BatchRequestModel(BaseModel): From 5b2b69f3f011ca12b2b191a884a19c5ebf04d508 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 7 Dec 2022 20:12:17 +0900 Subject: [PATCH 110/110] Update .actions/setup_tools.py --- .actions/setup_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 47eaddac3a832..d467c0f3ba037 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -191,7 +191,7 @@ def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requireme load_requirements(d, file_name="base.txt", unfreeze=not freeze_requirements) for d in glob.glob(os.path.join(req_dir, "*")) # skip empty folder as git artefacts, and resolving Will's special issue - if os.path.isdir(d) and len(glob.glob(os.path.join(d, "*"))) > 0 and "__pycache__" not in d + if os.path.isdir(d) and len(glob.glob(os.path.join(d, "*"))) > 0 ] if not requires: return None
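Read together, patches 098 (drop the ``has_sent`` flag), 099 and 101 (default ``timeout_batching`` of 1 second) and 103 (flush on ``==`` rather than ``>=``) leave the load balancer's consumer loop in the shape below. This is the resulting state reassembled from the hunks above for readability, not a new implementation; ``self._batch`` holds ``(request_id, request)`` tuples, and ``send_batch`` is the coroutine touched in patches 097 and 105.

    async def consumer(self):
        while True:
            await asyncio.sleep(0.05)

            batch = self._batch[: self.max_batch_size]
            while batch and (
                # flush a completely full batch immediately ...
                (len(batch) == self.max_batch_size)
                # ... or a partial one once it has waited longer than timeout_batching
                or ((time.time() - self._last_batch_sent) > self.timeout_batching)
            ):
                asyncio.create_task(self.send_batch(batch))
                self._batch = self._batch[self.max_batch_size :]
                batch = self._batch[: self.max_batch_size]
                self._last_batch_sent = time.time()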
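After patch 104 the balanced route is whatever ``endpoint`` the component was configured with (``/api/predict`` by default) instead of a hard-coded path. A client-side sketch against the example app follows; the URL is hypothetical, since ``lightning run app`` prints the real one, and the payload shape matches the ``RequestModel`` from patch 109.

    import base64

    import requests

    # Hypothetical local URL; substitute the one printed by `lightning run app`.
    url = "http://127.0.0.1:7501/api/predict"

    with open("image.jpg", "rb") as f:
        payload = {"image": base64.b64encode(f.read()).decode("utf-8")}

    # The balancer queues this request, batches it with up to max_batch_size - 1
    # others (or waits timeout_batching seconds), forwards the batch to one
    # replica, and fans the per-request outputs back out.
    response = requests.post(url, json=payload, timeout=60)
    print(response.json())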
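The private ``/system/update-servers`` route, protected by the basic-auth check from patches 079 and 080 and serialized by the ``asyncio.Lock`` from patch 106, can be exercised the same way. Only the password is compared against ``AUTO_SCALER_AUTH_PASSWORD``; the username is ignored. All concrete values below are hypothetical.

    import requests

    # The password must match the AUTO_SCALER_AUTH_PASSWORD environment
    # variable the balancer was started with (see patches 079-080).
    resp = requests.put(
        "http://127.0.0.1:7501/system/update-servers",
        json=["http://10.0.0.5:8000"],
        auth=("any-username", "my-secret"),
        timeout=10,
    )
    resp.raise_for_status()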
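The parametrized test added in patch 107 doubles as a template for unit-testing custom policies. A sketch for the hypothetical ``QueueDepthScaler`` shown earlier, reusing the ``EmptyWork`` helper that the test module already uses; with ``max_batch_size=8``, one replica with eight queued requests sits exactly at a full batch per work (scale out), while four replicas sharing one request sit at 0.25 pending per work, below the 2.0 scale-in threshold.

    import pytest

    @pytest.mark.parametrize(
        "replicas, metrics, expected",
        [
            pytest.param(1, {"pending_requests": 8, "pending_works": 0}, 2, id="full batch queued -> scale out"),
            pytest.param(4, {"pending_requests": 1, "pending_works": 0}, 3, id="queue below 25% -> scale in"),
        ],
    )
    def test_queue_depth_scaler(replicas, metrics, expected):
        scaler = QueueDepthScaler(EmptyWork, min_replicas=1, max_replicas=8, max_batch_size=8)
        assert scaler.scale(replicas, metrics) == expected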