Merge pull request #306 from N720720/improvement-of-memory-usage

Improvement of memory usage
N720720 · Jan 21, 2022 · cc4b4c9 · cc4b4c9
2 parents 1932edd + 996388f
commit cc4b4c9
Show file tree

Hide file tree

Showing 9 changed files with 89 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -615,6 +615,9 @@ healthchecksdb
 MigrationBackup/
 
 # End of https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode
-lindemann/main.py
-lindemann_per_atom.txt
+
+.vscode/launch.json
+lindemann_per_frame.txt
 lindemann_per_frame.pdf
+lindemann_per_atom.txt
+
diff --git a/lindemann/index/mem_use.py b/lindemann/index/mem_use.py
@@ -3,6 +3,14 @@
 
 
 def in_gb(frames: npt.NDArray[np.float64]) -> str:
+    """Shows the size of the array in memory in GB.
+
+    Args:
+        frames (npt.NDArray[np.float64]): numpy array of shape(frames,atoms)
+
+    Returns:
+        str: Size of array in GB.
+    """
     natoms = len(frames[0])
     nframes = len(frames)
-    return f"This will use {np.round((np.zeros((nframes, natoms, natoms)).nbytes/1024**3),4)} GB"  # type: ignore[no-untyped-call]
+    return f"This will use {np.round((np.zeros((natoms, natoms)).nbytes/1024**3),4)} GB"  # type: ignore[no-untyped-call]
diff --git a/lindemann/index/per_atoms.py b/lindemann/index/per_atoms.py
@@ -3,32 +3,31 @@
 import numba as nb
 import numpy as np
 import numpy.typing as npt
+from numba import float32
 
 
 @nb.njit(fastmath=True, error_model="numpy")  # type: ignore # , cache=True) #(parallel=True)
-def lindemann_per_frames(frames: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
+def calculate(frames: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
 
+    """Calculate the contribution of each atom to the lindemann index over the frames
+    
+    Args: 
+        frames: numpy array of shape (frames, atoms, coordinates); each frame is a 2D coordinate array
+    Returns:
+        npt.NDArray[np.float32]: Returns 2D array with the progression of the Lindemann index per atom and frame, of shape (frames, atoms)
     """
-    Calculate the lindemann index for each atom AND FRAME
 
-    Return a ndarray of shape (len_frames, natoms, natoms)
-
-    Warning this can produce extremly large ndarrays in memory 
-    depending on the size of the cluster and the ammount of frames.
-    """
+    first = True
     # natoms = natoms
     dt = frames.dtype
     natoms = len(frames[0])
     nframes = len(frames)
     len_frames = len(frames)
     array_mean = np.zeros((natoms, natoms), dtype=dt)
     array_var = np.zeros((natoms, natoms), dtype=dt)
-    # array_distance = np.zeros((natoms, natoms))
     iframe = dt.type(1)
-    lindex_array = np.zeros((len_frames, natoms, natoms), dtype=dt)
+    lindex_array = np.zeros((len_frames, natoms), dtype=dt)
     for q, coords in enumerate(frames):
-        # print("processing frame {}/{}".format(iframe, nframes))
-        # print(q)
         n, p = coords.shape
         array_distance = np.zeros((n, n), dtype=dt)
         for i in range(n):
@@ -52,7 +51,7 @@ def lindemann_per_frames(frames: npt.NDArray[np.float64]) -> npt.NDArray[np.floa
                 array_mean[i, j] = mean + delta / iframe
                 # update variance
                 array_var[i, j] = var + delta * (xn - array_mean[i, j])
-        iframe += 1
+        iframe += 1  # type: ignore[assignment]
         if iframe > nframes + 1:
             break
 
@@ -61,15 +60,14 @@ def lindemann_per_frames(frames: npt.NDArray[np.float64]) -> npt.NDArray[np.floa
                 array_mean[j, i] = array_mean[i, j]
                 array_var[j, i] = array_var[i, j]
 
-        lindemann_indices = np.divide(np.sqrt(np.divide(array_var, nframes)), array_mean)
-        # lindemann_indices = np.nanmean(np.sqrt(array_var/nframes)/array_mean, axis=1)
+        if first:
+            lindemann_indices = np.zeros((natoms), dtype=dt)
+            first = False
+        else:
+            np.fill_diagonal(array_mean, 1)
+            lindemann_indices = np.zeros((natoms), dtype=dt)
+            lindemann_indices = np.divide(np.sqrt(np.divide(array_var, nframes)), array_mean)
+            lindemann_indices = np.asarray([np.mean(lin[lin != 0]) for lin in lindemann_indices])
+
         lindex_array[q] = lindemann_indices
     return lindex_array
-
-
-def calculate(indices: npt.NDArray[np.float64]) -> List[npt.NDArray[np.float64]]:
-    """
-    Small helper function, since numba has not implemented the np.nanmean with axis parameter 
-    I cant implemnet this in the jit function for now.
-    """
-    return [np.nanmean(i, axis=1) for i in lindemann_per_frames(indices)]  # type: ignore[no-untyped-call]
diff --git a/lindemann/index/per_frames.py b/lindemann/index/per_frames.py
@@ -6,28 +6,26 @@
 
 
 @nb.njit(fastmath=True, error_model="numpy")  # type: ignore # , cache=True) #(parallel=True)
-def lindemann_per_frames_for_each_atom(frames: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
+def calculate(frames: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
 
+    """Calculate the progression of the Lindemann index over the frames.
+    
+    Args: 
+        frames: numpy array of shape (frames, atoms, coordinates); each frame is a 2D coordinate array
+    Returns:
+        npt.NDArray[np.float32]: Returns 1D array with the progression of the Lindemann index per frame, of shape (frames,)
     """
-    Calculate the lindemann index for each atom AND FRAME
 
-    Return a ndarray of shape (len_frames, natoms, natoms)
-
-    Warning this can produce extremly large ndarrays in memory 
-    depending on the size of the cluster and the ammount of frames.
-    """
-    # natoms = natoms
+    first = True
     dt = frames.dtype
     natoms = len(frames[0])
     nframes = len(frames)
     len_frames = len(frames)
     array_mean = np.zeros((natoms, natoms), dtype=dt)
     array_var = np.zeros((natoms, natoms), dtype=dt)
-    # array_distance = np.zeros((natoms, natoms))
     iframe = dt.type(1)
-    lindex_array = np.zeros((len_frames, natoms, natoms), dtype=dt)
+    lindex_array = np.zeros((len_frames), dtype=dt)
     for q, coords in enumerate(frames):
-        # print(q)
         n, p = coords.shape
         array_distance = np.zeros((n, n), dtype=dt)
         for i in range(n):
@@ -51,7 +49,7 @@ def lindemann_per_frames_for_each_atom(frames: npt.NDArray[np.float64]) -> npt.N
                 array_mean[i, j] = mean + delta / iframe
                 # update variance
                 array_var[i, j] = var + delta * (xn - array_mean[i, j])
-        iframe += 1
+        iframe += 1  # type: ignore[assignment]
         if iframe > nframes + 1:
             break
 
@@ -60,15 +58,16 @@ def lindemann_per_frames_for_each_atom(frames: npt.NDArray[np.float64]) -> npt.N
                 array_mean[j, i] = array_mean[i, j]
                 array_var[j, i] = array_var[i, j]
 
-        lindemann_indices = np.divide(np.sqrt(np.divide(array_var, nframes)), array_mean)
-        # lindemann_indices = np.nanmean(np.sqrt(array_var/nframes)/array_mean, axis=1)
+        if first:
+            lindemann_indices = 0
+            first = False
+        else:
+            np.fill_diagonal(array_mean, 1)
+            lindemann_indices = np.zeros((natoms), dtype=dt)  # type: ignore[assignment]
+            lindemann_indices = np.divide(np.sqrt(np.divide(array_var, nframes)), array_mean)  # type: ignore[assignment]
+            lindemann_indices = np.mean(
+                np.asarray([np.mean(lin[lin != 0]) for lin in lindemann_indices])  # type: ignore[attr-defined]
+            )
+
         lindex_array[q] = lindemann_indices
     return lindex_array
-
-
-def calculate(indices: npt.NDArray[np.float64]) -> List[npt.NDArray[np.float64]]:
-    """
-    Small helper function, since numba has not implemented the np.nanmean with axis parameter 
-    I cant implemnet this in the jit function for now.
-    """
-    return [np.mean(np.nanmean(i, axis=1)) for i in lindemann_per_frames_for_each_atom(indices)]  # type: ignore[no-untyped-call]
diff --git a/lindemann/index/per_trj.py b/lindemann/index/per_trj.py
@@ -9,15 +9,20 @@
 
 
 @nb.njit(fastmath=True, error_model="numpy")  # type: ignore
-def lindemann_per_atom(frames: npt.NDArray[np.float64]) -> Any:
+def lindemann_per_atom(frames: npt.NDArray[np.float32]) -> Any:
+
+    """Calculate the Lindemann index
+    Args:
+        frames: numpy array of shape(frames,atoms)
+    Returns:
+        Array of Lindemann indices (not a scalar): calculate() reduces it with nanmean over axis=1 and then mean
+    """
 
-    """Calculates the lindemann index for """
     dt = frames.dtype
     natoms = len(frames[0])
     nframes = len(frames)
     array_mean = np.zeros((natoms, natoms), dtype=dt)
     array_var = np.zeros((natoms, natoms), dtype=dt)
-    # array_distance = np.zeros((natoms, natoms),dtype=dt)
     iframe = dt.type(1)
     for coords in frames:
 
@@ -43,7 +48,7 @@ def lindemann_per_atom(frames: npt.NDArray[np.float64]) -> Any:
                 delta = xn - mean
                 array_mean[i, j] = mean + delta / iframe
                 array_var[i, j] = var + delta * (xn - array_mean[i, j])
-        iframe += 1.0
+        iframe += 1.0  # type: ignore[assignment]
         if iframe > nframes:
             break
 
@@ -57,9 +62,5 @@ def lindemann_per_atom(frames: npt.NDArray[np.float64]) -> Any:
 
 
 def calculate(frames: npt.NDArray[np.float64]) -> float:
-    """
-    Small helper function, since numba has not implemented the np.nanmean with axis parameter 
-    I cant implemnet this in the jit function for now.
-    """
 
     return np.mean(bn.nanmean(lindemann_per_atom(frames), axis=1))  # type: ignore[no-any-return, no-untyped-call]
diff --git a/lindemann/main.py b/lindemann/main.py
@@ -80,9 +80,6 @@ def main(
     as the progression of the Lindemann index per frame or per atom and frame of temperature ramps
     for phase transition analysis.
     """
-    # frames = read.frames(trjfile)
-    # frames = lindemann.trajectory.read.frames(trjfile)
-    start = time.time()
 
     n_cores = cpu_count()
     len_trjfiles = len(trjfile)
@@ -156,9 +153,9 @@ def main(
         raise typer.Exit()
 
     elif timeit and single_process:
-
+        # We use float32 here since float64 is not needed for my purposes and it enables us to use numba fastmath. Change to np.float64 if you need more precision.
         start = time.time()
-        linde_for_time = per_trj.calculate(tjr_frames)
+        linde_for_time = per_trj.calculate(tjr_frames.astype(np.float32))
         time_diff = time.time() - start
 
         console.print(

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "lindemann"
-version = "0.4.1"
+version = "0.5.0"
 description = "lindemann is a python package to calculate the Lindemann index  of a lammps trajectory as well as the progression of the Lindemann index per frame of temperature ramps  for phase transition analysis."
 readme = "README.md"
 authors = [
@@ -43,6 +43,7 @@ numba = ">=0.52"
 numpy = ">=1.18, <1.22"
 pathlib = "^1.0.1"
 icc-rt = "^2020.0.133"
+Bottleneck = "^1.3.2"
 psutil = "^5.9.0"
 
 [tool.poetry.dev-dependencies]

diff --git a/tests/test_example/test_cli.py b/tests/test_example/test_cli.py
@@ -45,7 +45,7 @@ def test_t_flag():
 
 def test_m_flag():
     flag = "-m"
-    res_str = "memory use: This will use 0.7864 GB"
+    res_str = "memory use: This will use 0.0016 GB"
     trajectory = ["tests/test_example/459_02.lammpstrj"]
     single_process_and_multiprocess(trajectory, flag, res_str)