Lightning-AI · tchaton · Dec 19, 2022 · Dec 19, 2022 · Dec 19, 2022 · Dec 19, 2022
@@ -60,6 +60,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed a bug where `AutoScaler` would fail with min_replica=0 ([#16092](https://github.com/Lightning-AI/lightning/pull/16092)
 
 
+- Fixed a non-thread safe deepcopy in the scheduler ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
+
+- Fixed Http Queue sleeping for 1 sec by default if no delta were found ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
+
+
 ## [1.8.4] - 2022-12-08
 
 ### Added

@@ -24,12 +24,12 @@
 from lightning_app.api.http_methods import _HttpMethod
 from lightning_app.api.request_types import _DeltaRequest
 from lightning_app.core.constants import (
-    CLOUD_QUEUE_TYPE,
     ENABLE_PULLING_STATE_ENDPOINT,
     ENABLE_PUSHING_STATE_ENDPOINT,
     ENABLE_STATE_WEBSOCKET,
     ENABLE_UPLOAD_ENDPOINT,
     FRONTEND_DIR,
+    get_cloud_queue_type,
 )
 from lightning_app.core.queues import QueuingSystem
 from lightning_app.storage import Drive
@@ -350,7 +350,7 @@ async def healthz(response: Response):
     """Health check endpoint used in the cloud FastAPI servers to check the status periodically."""
     # check the queue status only if running in cloud
     if is_running_in_cloud():
-        queue_obj = QueuingSystem(CLOUD_QUEUE_TYPE).get_queue(queue_name="healthz")
+        queue_obj = QueuingSystem(get_cloud_queue_type()).get_queue(queue_name="healthz")
         # this is only being implemented on Redis Queue. For HTTP Queue, it doesn't make sense to have every single
         # app checking the status of the Queue server
         if not queue_obj.is_running:

@@ -356,6 +356,8 @@ def _collect_deltas_from_ui_and_work_queues(self) -> List[Union[Delta, _APIReque
                         deltas.append(delta)
                 else:
                     api_or_command_request_deltas.append(delta)
+            else:
+                break
 
         if api_or_command_request_deltas:
             _process_requests(self, api_or_command_request_deltas)

@@ -1,5 +1,6 @@
 import os
 from pathlib import Path
+from typing import Optional
 
 import lightning_cloud.env
 
@@ -13,7 +14,7 @@ def get_lightning_cloud_url() -> str:
 
 SUPPORTED_PRIMITIVE_TYPES = (type(None), str, int, float, bool)
 STATE_UPDATE_TIMEOUT = 0.001
-STATE_ACCUMULATE_WAIT = 0.05
+STATE_ACCUMULATE_WAIT = 0.15
 # Duration in seconds of a moving average of a full flow execution
 # beyond which an exception is raised.
 FLOW_DURATION_THRESHOLD = 1.0
@@ -25,7 +26,6 @@ def get_lightning_cloud_url() -> str:
 APP_SERVER_PORT = _find_lit_app_port(7501)
 APP_STATE_MAX_SIZE_BYTES = 1024 * 1024  # 1 MB
 
-CLOUD_QUEUE_TYPE = os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)
 WARNING_QUEUE_SIZE = 1000
 # different flag because queue debug can be very noisy, and almost always not useful unless debugging the queue itself.
 QUEUE_DEBUG_ENABLED = bool(int(os.getenv("LIGHTNING_QUEUE_DEBUG_ENABLED", "0")))
@@ -77,5 +77,9 @@ def enable_multiple_works_in_default_container() -> bool:
     return bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0")))
 
 
+def get_cloud_queue_type() -> Optional[str]:
+    return os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)
+
+
 # Number of seconds to wait between filesystem checks when waiting for files in remote storage
 REMOTE_STORAGE_WAIT = 0.5
@@ -364,12 +364,11 @@ def get(self, timeout: int = None) -> Any:
 
         # timeout is some value - loop until the timeout is reached
         start_time = time.time()
-        timeout += 0.1  # add 0.1 seconds as a safe margin
         while (time.time() - start_time) < timeout:
             try:
                 return self._get()
             except queue.Empty:
-                time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
+                pass
 
     def _get(self):
         resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "pop"})

@@ -50,7 +50,6 @@
 from lightning_app import LightningWork
 from lightning_app.core.app import LightningApp
 from lightning_app.core.constants import (
-    CLOUD_QUEUE_TYPE,
     CLOUD_UPLOAD_WARNING,
     DEFAULT_NUMBER_OF_EXPOSED_PORTS,
     DISABLE_DEPENDENCY_CACHE,
@@ -60,6 +59,7 @@
     ENABLE_MULTIPLE_WORKS_IN_NON_DEFAULT_CONTAINER,
     ENABLE_PULLING_STATE_ENDPOINT,
     ENABLE_PUSHING_STATE_ENDPOINT,
+    get_cloud_queue_type,
     get_lightning_cloud_url,
 )
 from lightning_app.runners.backends.cloud import CloudBackend
@@ -418,9 +418,11 @@ def dispatch(
                     initial_port += 1
 
             queue_server_type = V1QueueServerType.UNSPECIFIED
-            if CLOUD_QUEUE_TYPE == "http":
+            # Note: Enable app to select their own queue type.
+            queue_type = get_cloud_queue_type()
+            if queue_type == "http":
                 queue_server_type = V1QueueServerType.HTTP
-            elif CLOUD_QUEUE_TYPE == "redis":
+            elif queue_type == "redis":
                 queue_server_type = V1QueueServerType.REDIS
 
             release_body = Body8(
@@ -496,7 +498,8 @@ def dispatch(
         if lightning_app_instance.status.phase == V1LightningappInstanceState.FAILED:
             raise RuntimeError("Failed to create the application. Cannot upload the source code.")
 
-        if open_ui:
+        # TODO: Remove testing dependency, but this would open a tab for each test...
+        if open_ui and "PYTEST_CURRENT_TEST" not in os.environ:
             click.launch(self._get_app_url(lightning_app_instance, not has_sufficient_credits))
 
         if cleanup_handle:

@@ -22,12 +22,11 @@ def update_index_file(ui_root: str, info: Optional[AppInfo] = None, root_path: s
     entry_file = Path(ui_root) / "index.html"
     original_file = Path(ui_root) / "index.original.html"
 
-    if root_path:
-        if not original_file.exists():
-            shutil.copyfile(entry_file, original_file)  # keep backup
-        else:
-            # revert index.html in case it was modified after creating original.html
-            shutil.copyfile(original_file, entry_file)
+    if not original_file.exists():
+        shutil.copyfile(entry_file, original_file)  # keep backup
+    else:
+        # revert index.html in case it was modified after creating original.html
+        shutil.copyfile(original_file, entry_file)
 
     if info:
         with original_file.open() as f:

@@ -71,7 +71,7 @@ class CloudCompute:
     name: str = "default"
     disk_size: int = 0
     idle_timeout: Optional[int] = None
-    shm_size: Optional[int] = 0
+    shm_size: Optional[int] = None
     mounts: Optional[Union[Mount, List[Mount]]] = None
     _internal_id: Optional[str] = None
 
@@ -80,6 +80,12 @@ def __post_init__(self) -> None:
 
         self.name = self.name.lower()
 
+        if self.shm_size is None:
+            if "gpu" in self.name:
+                self.shm_size = 1024
+            else:
+                self.shm_size = 0
+
         # All `default` CloudCompute are identified in the same way.
         if self._internal_id is None:
             self._internal_id = self._generate_id()

@@ -1,10 +1,9 @@
 import threading
-from copy import deepcopy
 from datetime import datetime
 from typing import Optional
 
 from croniter import croniter
-from deepdiff import DeepDiff, Delta
+from deepdiff import Delta
 
 from lightning_app.utilities.proxies import ComponentDelta
 
@@ -34,11 +33,15 @@ def run_once(self):
             next_event = croniter(metadata["cron_pattern"], start_time).get_next(datetime)
             # When the event is reached, send a delta to activate scheduling.
             if current_date > next_event:
-                flow = self._app.get_component_by_name(metadata["name"])
-                previous_state = deepcopy(flow.state)
-                flow._enable_schedule(call_hash)
                 component_delta = ComponentDelta(
-                    id=flow.name, delta=Delta(DeepDiff(previous_state, flow.state, verbose_level=2))
+                    id=metadata["name"],
+                    delta=Delta(
+                        {
+                            "values_changed": {
+                                f"root['calls']['scheduling']['{call_hash}']['running']": {"new_value": True}
+                            }
+                        }
+                    ),
                 )
                 self._app.delta_queue.put(component_delta)
                 metadata["start_time"] = next_event.isoformat()

@@ -384,7 +384,7 @@ async def test_health_endpoint_success():
 @pytest.mark.anyio
 async def test_health_endpoint_failure(monkeypatch):
     monkeypatch.setenv("LIGHTNING_APP_STATE_URL", "http://someurl")  # adding this to make is_running_in_cloud pass
-    monkeypatch.setattr(api, "CLOUD_QUEUE_TYPE", "redis")
+    monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "redis")
     async with AsyncClient(app=fastapi_service, base_url="http://test") as client:
         # will respond 503 if redis is not running
         response = await client.get("/healthz")

@@ -2,7 +2,7 @@
 import os
 import pickle
 from re import escape
-from time import sleep
+from time import sleep, time
 from unittest import mock
 
 import pytest
@@ -482,6 +482,21 @@ def make_delta(i):
         assert generated > expect
 
 
+def test_lightning_app_aggregation_empty():
+    """Verify the while loop exits before `state_accumulate_wait` is reached if no deltas are found."""
+
+    class SlowQueue(MultiProcessQueue):
+        def get(self, timeout):
+            out = super().get(timeout)
+            return out
+
+    app = LightningApp(EmptyFlow())
+    app.delta_queue = SlowQueue("api_delta_queue", 0)
+    t0 = time()
+    assert app._collect_deltas_from_ui_and_work_queues() == []
+    assert (time() - t0) < app.state_accumulate_wait
+
+
 class SimpleFlow2(LightningFlow):
     def __init__(self):
         super().__init__()

@@ -655,7 +655,7 @@ def run(self):
 def test_scheduling_api():
 
     app = LightningApp(FlowSchedule())
-    MultiProcessRuntime(app, start_server=True).dispatch()
+    MultiProcessRuntime(app, start_server=False).dispatch()
 
 
 def test_lightning_flow():

diff --git a/tests/tests_app/runners/backends/test_mp_process.py b/tests/tests_app/runners/backends/test_mp_process.py
@@ -5,8 +5,9 @@
 from lightning_app.runners.backends import MultiProcessingBackend
 
 
+@mock.patch("lightning_app.core.app.AppStatus")
 @mock.patch("lightning_app.runners.backends.mp_process.multiprocessing")
-def test_backend_create_work_with_set_start_method(multiprocessing_mock):
+def test_backend_create_work_with_set_start_method(multiprocessing_mock, *_):
     backend = MultiProcessingBackend(entrypoint_file="fake.py")
     work = Mock(spec=LightningWork)
     work._start_method = "test_start_method"

@@ -675,7 +675,7 @@ def test_call_with_queue_server_type_specified(self, lightningapps, monkeypatch,
         )
 
         # calling with env variable set to http
-        monkeypatch.setattr(cloud, "CLOUD_QUEUE_TYPE", "http")
+        monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http")
         cloud_runtime.backend.client.reset_mock()
         cloud_runtime.dispatch()
         body = IdGetBody(

@@ -14,6 +14,12 @@ def test_cloud_compute_shared_memory():
     cloud_compute = CloudCompute("gpu", shm_size=1100)
     assert cloud_compute.shm_size == 1100
 
+    cloud_compute = CloudCompute("gpu")
+    assert cloud_compute.shm_size == 1024
+
+    cloud_compute = CloudCompute("cpu")
+    assert cloud_compute.shm_size == 0
+
 
 def test_cloud_compute_with_mounts():
     mount_1 = Mount(source="s3://foo/", mount_path="/foo")

@@ -85,7 +85,7 @@ def test_extract_metadata_from_component():
                 "name": "gpu",
                 "disk_size": 0,
                 "idle_timeout": None,
-                "shm_size": 0,
+                "shm_size": 1024,
                 "mounts": None,
                 "_internal_id": ANY,
             },