Fixed Loss Mismatch Issues in BanditPAM #260

Merged: 30 commits, Jul 2, 2023
Changes from 25 commits
Commits (30)
ca1a711
Fixing linking for M1 Macs local build by ensuring all libraries are …
Apr 18, 2023
c79e0ab
Undoing some library_dirs just to test GHA
Apr 18, 2023
362e712
Attempting to remove --no-use-pep517
Apr 18, 2023
6f068fc
Trying to install setuptools and wheel
Apr 18, 2023
c04a2ce
Attempting GHA again
Apr 18, 2023
5732040
Set up scaling experiments
Apr 27, 2023
c757f7d
Removed unnecessary files in experiments folder
Apr 27, 2023
b27f5c0
Merge branch 'main' of github.com:motiwari/BanditPAM into experiments
Apr 27, 2023
405cb34
Sharing the ScRNA experiment results
May 6, 2023
dcda713
Added CIFAR
May 16, 2023
cdc4fd7
Backup for debugging the loss and sample complexity
Jun 13, 2023
6b7f504
Loss & Sample Complexity Issue Fixed
Jun 13, 2023
bf6e56f
Fixed the repro_script.sh bug
Jun 13, 2023
f753ecf
Updated plot_graph.py to skip the error bar for LOSS graphs
Jun 13, 2023
6cdb157
Running the exp with 10 seeds
Jun 17, 2023
a125160
Running the exp with 10 seeds
Jun 17, 2023
378d3fa
Updated the repro script to build the package from the source
Jun 17, 2023
0c563b3
cleaned up the code & ran 10 exps for mnist
Jun 22, 2023
0a4c7f9
Formatted the code with black
Jun 26, 2023
f883d24
Pulled the main and resolved the conflicts
Jun 26, 2023
ecaf132
Reset the run_scaling_experiment_with_n to its original settings
Jun 26, 2023
fe5d9fb
Reformatted the code
Jun 27, 2023
00547a8
Removed experiments folder
Jun 27, 2023
92c719b
Removed experiments folders
Jun 28, 2023
5965d7d
Reverting the dataset path in tests to pass the GHA tests
Jun 28, 2023
f84a977
Updating the codes according to Mo's feedback
Jul 1, 2023
110260c
Updated Black to make arguments one per line
Jul 1, 2023
9e738cd
Reverted the accidental GHA yml style changes
Jul 1, 2023
7930a7a
Added type hints in comparison_utils
Jul 1, 2023
c245147
Merge branch 'v4.0.3_prerelease' into fixed_loss
Adarsh321123 Jul 2, 2023
2 changes: 1 addition & 1 deletion .github/workflows/build_linux_wheels.yml
@@ -75,4 +75,4 @@ jobs:
password: ${{ secrets.PYPI_APIKEY }}
# For TestPyPI. TODO: RBF
# password: ${{ secrets.TEST_PYPI_APIKEY }}
# repository_url: https://test.pypi.org/legacy/
# repository_url: https://test.pypi.org/legacy/
12 changes: 6 additions & 6 deletions .github/workflows/run_linux_tests.yml
@@ -1,12 +1,12 @@
name: Linux - build package and run tests
on: [push, pull_request]
on: [ push, pull_request ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
steps:
- uses: actions/checkout@v2

@@ -21,7 +21,7 @@ jobs:
sudo apt install -y build-essential checkinstall libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev
sudo apt install -y clang-format cppcheck

- name: Install Python dependencies
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest
@@ -54,7 +54,7 @@ jobs:
python -m pip install --no-use-pep517 --no-build-isolation -vvv -e .
env:
# The default compiler on the Github Ubuntu runners is gcc
# Would need to make a respective include change for clang
# Would need to make a respective include change for clang
CPLUS_INCLUDE_PATH: /usr/local/include/carma

- name: Downloading data files for tests
@@ -70,7 +70,7 @@ jobs:
tar -xzvf data/MNIST_70k.tar.gz -C data

- name: Run smaller suite of test cases
run : |
run: |
pytest tests/test_smaller.py

# - name: Run larger suite of test cases
@@ -86,7 +86,7 @@ jobs:
run: git clone -b ${GITHUB_REF#refs/heads/} https://github.com/motiwari/BanditPAM

- name: Verify that the C++ executable compiles and runs
run : |
run: |
cd BanditPAM
mkdir build
cd build
4 changes: 2 additions & 2 deletions .gitignore
@@ -10,11 +10,11 @@ BanditPAM.egg-info/
__pycache__
banditpam.cpython*
build/
build/
data/MNIST_*.csv
data/MNIST_CSV.csv
data/data.csv
data/scrna*
data/*.py
data/cifar*
docs/html
docs/latex
python_generate_plots/__pycache__
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -4,16 +4,17 @@ project(BanditPAM VERSION 1.0 LANGUAGES CXX)

# TODO(@motiwari): Should RELEASE? be in caps
set(CMAKE_BUILD_TYPE Release)
# set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_STANDARD 17)

set(CMAKE_CXX_FLAGS_RELEASE "-O3")

# Note: ThreadSanitizer and AddressSanitizer can give spurious errors around the #pragma omp critical.
# They don't really matter because the threads always resolve with the same end state
# ThreadSanitizer also introduces a very large overhead, AddressSanitizer is more reasonable
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g -fno-omit-frame-pointer -fsanitize=address")
# set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g -fno-omit-frame-pointer -fsanitize=address")

# NOTE: Need to explicitly pass -O0 to enable printing of armadillo matrices
#set(CMAKE_CXX_FLAGS_DEBUG "-O0 -Wall -Wextra -g -fno-omit-frame-pointer")
# set(CMAKE_CXX_FLAGS_DEBUG "-O0 -Wall -Wextra -g -fno-omit-frame-pointer")

add_subdirectory(src)
14 changes: 14 additions & 0 deletions data/mnist_change_size.py
@@ -0,0 +1,14 @@
import pandas as pd

# Load the CSV file into a Pandas dataframe
df = pd.read_csv("scrna_reformat.csv", header=None)

print(df.describe())

# Get the first 10,000 rows
# df_10k = df.head(100)

# print(df_10k.describe())

# Save the first 10,000 rows to a new CSV file
# df_10k.to_csv("mnist_10k.csv", index=False)
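
As committed, data/mnist_change_size.py reads scrna_reformat.csv despite its name, and the subsetting logic is commented out. For reference, a minimal sketch of the apparent intent (resizing an MNIST CSV), with hypothetical input and output paths:

import pandas as pd

# Hypothetical paths; adjust to the dataset actually being resized.
df = pd.read_csv("data/MNIST_70k.csv", header=None)
print(df.describe())

# Keep the first 10,000 rows and save them to a new CSV file.
df_10k = df.head(10_000)
df_10k.to_csv("data/MNIST_10k.csv", header=False, index=False)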
7 changes: 7 additions & 0 deletions data/preprocess_cifar.py
@@ -0,0 +1,7 @@
import pandas as pd

path = "data/cifar10.csv"
csv = pd.read_csv(path)
first_column_label = csv.columns[0]
csv = csv.drop(first_column_label, axis=1)
csv.to_csv(path, header=False, index=False)
10 changes: 10 additions & 0 deletions repro_script.py
@@ -0,0 +1,10 @@
from experiments.run_scaling_experiment import (
run_scaling_experiment_with_n,
run_scaling_experiment_with_k,
)

# Scaling with n
run_scaling_experiment_with_n()

# Scaling with k
run_scaling_experiment_with_k()
32 changes: 32 additions & 0 deletions repro_script.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# 1. Install BanditPAM from the source
#pip install -r requirements.txt
pip uninstall -y banditpam
pip install -e .

# 2. Install datasets if necessary
# MNIST
if [ -f "data/MNIST_70k.csv" ]; then
echo "MNIST found"
else
echo "Installing MNIST..."
wget -P data https://motiwari.com/banditpam_data/MNIST_70k.tar.gz
tar -xzvf data/MNIST_70k.tar.gz -C data
rm data/MNIST_70k.tar.gz
fi

# CIFAR-10
if [ -f "data/cifar10.csv" ]; then
echo "CIFAR-10 found"
else
echo "Installing CIFAR-10..."
wget -P data https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar -xzvf data/cifar-10-python.tar.gz -C data
rm data/cifar-10-python.tar.gz
# Preprocess the dataset
python data/preprocess_cifar.py
fi

# 3. Run the experiments
python experiments/run_scaling_experiment.py
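
Note that cifar-10-python.tar.gz extracts to pickled batches under cifar-10-batches-py/, not to data/cifar10.csv, so a conversion step is implied between the download and preprocess_cifar.py. A minimal sketch, assuming the standard batch format and the CSV layout preprocess_cifar.py expects (a header row plus a leading label column that it drops):

import pickle

import numpy as np
import pandas as pd

# Load the five pickled training batches (assumed extracted under data/).
batches, labels = [], []
for i in range(1, 6):
    with open(f"data/cifar-10-batches-py/data_batch_{i}", "rb") as f:
        batch = pickle.load(f, encoding="bytes")
    batches.append(batch[b"data"])  # (10000, 3072) uint8 pixel rows
    labels.extend(batch[b"labels"])

# The first column holds the label; preprocess_cifar.py drops it and the header.
df = pd.DataFrame(np.vstack(batches))
df.insert(0, "label", labels)
df.to_csv("data/cifar10.csv", index=False)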
59 changes: 30 additions & 29 deletions scripts/cache_measurements.py
@@ -8,9 +8,7 @@


def time_measured_fit(
kmed: KMedoids,
X: np.array,
loss: str = "L2",
kmed: KMedoids, X: np.array, loss: str = "L2",
):
start = time.time()
kmed.fit(X, loss)
@@ -19,10 +19,7 @@ def time_measured_fit(


def get_cache_statistics(
kmed: KMedoids,
X: np.array,
loss: str = "L2",
cache_width: int = 1000,
kmed: KMedoids, X: np.array, loss: str = "L2", cache_width: int = 1000,
):
kmed.cache_width = cache_width
time = time_measured_fit(kmed, X, loss)
@@ -55,22 +50,30 @@ def test_cache_stats():
hits_1000,
misses_1000,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=1000)
time_750, width_750, writes_750, hits_750, misses_750 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=750
)
time_500, width_500, writes_500, hits_500, misses_500 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=500
)
time_250, width_250, writes_250, hits_250, misses_250 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=250
)
time_0, width_0, writes_0, hits_0, misses_0 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=0
)
(
time_750,
width_750,
writes_750,
hits_750,
misses_750,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=750)
(
time_500,
width_500,
writes_500,
hits_500,
misses_500,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=500)
(
time_250,
width_250,
writes_250,
hits_250,
misses_250,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=250)
time_0, width_0, writes_0, hits_0, misses_0 = get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=0
)

assert (
hits_1000 > hits_750 > hits_500 > hits_250 > hits_0
@@ -92,8 +95,9 @@ def test_cache_stats():
assert width_250 == 250, "Cache width should be 250 when set to 250"
assert width_500 == 500, "Cache width should be 500 when set to 500"
assert width_750 == 750, "Cache width should be 750 when set to 750"
assert width_1000 == 1000, "Cache width should be 1000 when set to " \
"1000"
assert width_1000 == 1000, (
"Cache width should be 1000 when set to " "1000"
)

def test_parallelization():
X = np.loadtxt(os.path.join("data", "MNIST_10k.csv"))
@@ -176,10 +180,7 @@ def test_old_bpam_vs_new_bpam():
bpam_orig_hits,
bpam_orig_misses,
) = get_cache_statistics(
kmed=kmed_orig,
X=X,
loss="L2",
cache_width=1000,
kmed=kmed_orig, X=X, loss="L2", cache_width=1000,
)

print(bpam_time, bpam_orig_time)
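
For reference, a minimal sketch of how get_cache_statistics might be driven directly, assuming banditpam is installed, the script is importable from the repository root, and data/MNIST_10k.csv has been downloaded:

import os

import numpy as np
from banditpam import KMedoids

from scripts.cache_measurements import get_cache_statistics

# Assumed local setup: run from the repo root with data/MNIST_10k.csv present.
X = np.loadtxt(os.path.join("data", "MNIST_10k.csv"))
kmed = KMedoids(n_medoids=5, algorithm="BanditPAM")

runtime, width, writes, hits, misses = get_cache_statistics(
    kmed=kmed, X=X, loss="L2", cache_width=1000
)
print(f"Fit took {runtime:.2f}s: {hits:,} cache hits, {misses:,} misses")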
41 changes: 0 additions & 41 deletions scripts/compare_banditpam_versions.py

This file was deleted.

60 changes: 54 additions & 6 deletions scripts/comparison_utils.py
@@ -1,4 +1,11 @@
import os
import pandas as pd


def print_results(kmed, runtime):
complexity_with_caching = (
kmed.getDistanceComputations(True) - kmed.cache_hits
)
print("-----Results-----")
print("Algorithm:", kmed.algorithm)
print("Final Medoids:", kmed.medoids)
Expand All @@ -7,17 +14,58 @@ def print_results(kmed, runtime):
print("Build complexity:", f"{kmed.build_distance_computations:,}")
print("Swap complexity:", f"{kmed.swap_distance_computations:,}")
print("Number of Swaps", kmed.steps)
print(
"Average Swap Sample Complexity:",
f"{kmed.swap_distance_computations / kmed.steps:,}",
)
print("Cache Writes: {:,}".format(kmed.cache_writes))
print("Cache Hits: {:,}".format(kmed.cache_hits))
print("Cache Misses: {:,}".format(kmed.cache_misses))
print(
"Total complexity (without misc):",
f"{kmed.getDistanceComputations(False):,}"
f"{kmed.getDistanceComputations(False):,}",
)
print(
"Total complexity (with misc):",
f"{kmed.getDistanceComputations(True):,}",
)
print("Runtime per swap:", runtime / kmed.steps)
print(
"Total complexity (with caching):", f"{complexity_with_caching:,}",
)
# print("Runtime per swap:", runtime / kmed.steps)
print("Total runtime:", runtime)


def store_results(kmed, runtime, log_dir, log_name, num_data, num_medoids):
# Create a dictionary with the printed values
log_dict = {
"num_data": num_data,
"num_medoids": num_medoids,
"loss": kmed.average_loss,
"misc_complexity": kmed.misc_distance_computations,
"build_complexity": kmed.build_distance_computations,
"swap_complexity": kmed.swap_distance_computations,
"number_of_swaps": kmed.steps,
"cache_writes": kmed.cache_writes,
"cache_hits": kmed.cache_hits,
"cache_misses": kmed.cache_misses,
"average_swap_sample_complexity": kmed.swap_distance_computations
/ kmed.steps,
"total_complexity_without_misc": kmed.getDistanceComputations(False),
"total_complexity_with_misc": kmed.getDistanceComputations(True),
"total_complexity_with_caching": kmed.getDistanceComputations(True)
- kmed.cache_hits,
"runtime_per_swap": runtime / kmed.steps,
"total_runtime": runtime,
}
log_pd_row = pd.DataFrame([log_dict])

os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, f"{log_name}.csv")
if os.path.exists(log_path):
log_df = pd.read_csv(log_path)
else:
log_df = pd.DataFrame(columns=log_dict.keys())

# Append the dictionary to the dataframe
log_df = pd.concat([log_df, log_pd_row], ignore_index=True)

# Save the updated dataframe back to the CSV file
log_df.to_csv(log_path, index=False)
print("Saved log to ", log_path)
7 changes: 2 additions & 5 deletions scripts/comparison_with_fasterpam.py
@@ -69,7 +69,7 @@ def run_old_bandit(data, seed):
n_medoids=5,
parallelize=True,
algorithm="BanditPAM_orig",
dist_mat=diss
dist_mat=diss,
)
print(km.algorithm)
km.seed = seed
@@ -87,10 +87,7 @@ def run_old_bandit(data, seed):

if __name__ == "__main__":
X, _ = fetch_openml(
"mnist_784",
version=1,
return_X_y=True,
as_frame=False,
"mnist_784", version=1, return_X_y=True, as_frame=False,
)
X = X[:20000] # at 20k, colab will timeout for BanditPAM
print(X.shape, type(X))