
data_loader #3239

Merged
merged 10 commits on Oct 26, 2021
17 changes: 10 additions & 7 deletions horovod/data/data_loader_base.py
@@ -55,7 +55,7 @@ class AsyncDataLoaderMixin(object):
      class PytorchAsyncDataLoader(AsyncDataLoaderMixin, PytorchDataLoader):
      """

-     def __init__(self, async_loader_queue_size=64, *args, **kwargs):
+     def __init__(self, async_loader_queue_size=1, *args, **kwargs):
          """
          initialize the async data loader. Need to add this in the __init__() of the implementation
          """
@@ -92,12 +92,12 @@ def _async_worker(self):
User need to implement self._iterate() to read the data.
"""
      try:
-         # Only need to iterate once because data loader will be re-created in each epoch.
-         for batch in self._iterate():
-             if self.finished_event.is_set():
-                 break
-             self.queue.put(batch)
-         self.queue.put(None)
+         while not self.finished_event.is_set():
+             for batch in self._iterate():
+                 if self.finished_event.is_set():
+                     break
+                 self.queue.put(batch)
+             self.queue.put(None)
      except Exception as ex:
          self.queue.put(ex)
          self.queue.put(None)
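The worker above is the producer half of a producer/consumer pair: it pushes batches onto self.queue, puts None as an end-of-epoch sentinel after each pass of the while loop, and forwards exceptions through the same queue. A minimal sketch of the consuming side such a worker would pair with; the method name _consume_from_queue is assumed for illustration and is not part of this diff:

    def _consume_from_queue(self):
        # Drain one epoch's worth of batches produced by _async_worker().
        while True:
            batch = self.queue.get()
            if isinstance(batch, Exception):
                # The worker forwards exceptions through the queue; re-raise here.
                raise batch
            if batch is None:
                # None is the end-of-epoch sentinel the worker puts after each pass.
                break
            yield batch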
@@ -125,3 +125,6 @@ def __iter__(self):
          else:
              for batch in self._iterate():
                  yield self._process_batch(batch)
+
+     def __del__(self):
+         self.close_async_loader()
Collaborator

I don't think there is any place where `del` is actually called on the dataloader. Also, garbage collection in Python is delayed, so it is preferable to call close_async_loader() explicitly rather than relying on garbage collection.
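A minimal sketch of the kind of explicit cleanup this comment is suggesting instead of relying on __del__; the construction and training step below are hypothetical, only close_async_loader() comes from this PR:

    # Hypothetical usage; only close_async_loader() is from the PR.
    loader = PytorchInfiniteAsyncDataLoader(**kwargs)
    try:
        for batch in loader:
            train_step(batch)   # placeholder for the real training step
    finally:
        # Explicit, deterministic cleanup instead of waiting for __del__/GC.
        loader.close_async_loader()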

Collaborator Author

Where else do you think we could call close_async_loader(), if not in __del__? Is there a close/teardown hook in the lightning module?

Collaborator

Collaborator Author

Got it, removed it.
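For reference, one natural home for such an explicit close is the data module's teardown hook. A minimal sketch assuming the pytorch_lightning LightningDataModule.teardown(stage) hook and the train_dl/val_dl attributes visible in the diff below; this is illustrative only, not necessarily what the PR ended up doing:

    import pytorch_lightning as pl

    class DataModule(pl.LightningDataModule):
        # train_dataloader()/val_dataloader() create self.train_dl / self.val_dl ...

        def teardown(self, stage=None):
            # Close the async loaders explicitly when training/validation ends,
            # rather than relying on __del__ being triggered by garbage collection.
            for dl in (getattr(self, 'train_dl', None), getattr(self, 'val_dl', None)):
                if dl is not None and hasattr(dl, 'close_async_loader'):
                    dl.close_async_loader()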

6 changes: 6 additions & 0 deletions horovod/spark/lightning/datamodule.py
@@ -94,6 +94,9 @@ def train_dataloader(self):
          else:
              dataloader_class = PytorchInfiniteAsyncDataLoader
              kwargs['shuffling_queue_capacity'] = self.shuffle_size
+             # To avoid holding too much data in memory, cap the queue size (in batches) so the
+             # buffer holds at most ~10000 rows (batch_size * queue_size) and no more than 1/4
+             # of the steps per epoch.
+             kwargs['async_loader_queue_size'] = max(1, min(10000 // kwargs['batch_size'], kwargs['limit_step_per_epoch'] // 4))
Collaborator

In which case do we see the OOM issue?

Collaborator Author

I suspect it is related to the hanging issue we saw when running deepETA on all of the data.


self.train_dl = dataloader_class(**kwargs)
return self.train_dl
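To make the sizing formula concrete, a quick worked example with made-up values (batch_size=256 and limit_step_per_epoch=1000 are purely illustrative):

    batch_size = 256
    limit_step_per_epoch = 1000
    queue_size = max(1, min(10000 // batch_size, limit_step_per_epoch // 4))
    # 10000 // 256 = 39 and 1000 // 4 = 250, so queue_size = 39:
    # at most 39 batches (39 * 256 = 9984 rows) are buffered in memory at a time.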
@@ -115,6 +118,9 @@ def val_dataloader(self):
          else:
              dataloader_class = PytorchInfiniteAsyncDataLoader
              kwargs['shuffling_queue_capacity'] = 0
+             # To avoid holding too much data in memory, cap the queue size (in batches) so the
+             # buffer holds at most ~10000 rows (batch_size * queue_size) and no more than 1/4
+             # of the steps per epoch.
+             kwargs['async_loader_queue_size'] = max(1, min(10000 // kwargs['batch_size'], kwargs['limit_step_per_epoch'] // 4))

self.val_dl = dataloader_class(**kwargs)
return self.val_dl