Fixed Loss Mismatch Issues in BanditPAM #260

Merged: 30 commits, Jul 2, 2023
Changes from 25 commits
Commits (30)
ca1a711
Fixing linking for M1 Macs local build by ensuring all libraries are …
Apr 18, 2023
c79e0ab
Undoing some library_dirs just to test GHA
Apr 18, 2023
362e712
Attempting to remove --no-use-pep517
Apr 18, 2023
6f068fc
Trying to install setuptools and wheel
Apr 18, 2023
c04a2ce
Attempting GHA again
Apr 18, 2023
5732040
Set up scaling experiments
Apr 27, 2023
c757f7d
Removed unnecessary files in experiments folder
Apr 27, 2023
b27f5c0
Merge branch 'main' of github.com:motiwari/BanditPAM into experiments
Apr 27, 2023
405cb34
Sharing the ScRNA experiment results
May 6, 2023
dcda713
Added CIFAR
May 16, 2023
cdc4fd7
Backup for debugging the loss and sample complexity
Jun 13, 2023
6b7f504
Loss & Sample Complexity Issue Fixed
Jun 13, 2023
bf6e56f
Fixed the repro_script.sh bug
Jun 13, 2023
f753ecf
Updated plot_graph.py to skip the error bar for LOSS graphs
Jun 13, 2023
6cdb157
Running the exp with 10 seeds
Jun 17, 2023
a125160
Running the exp with 10 seeds
Jun 17, 2023
378d3fa
Updated the repro script to build the package from the source
Jun 17, 2023
0c563b3
cleaned up the code & ran 10 exps for mnist
Jun 22, 2023
0a4c7f9
Formatted the code with black
Jun 26, 2023
f883d24
Pulled the main and resolved the conflicts
Jun 26, 2023
ecaf132
Reset the run_scaling_experiment_with_n to its original settings
Jun 26, 2023
fe5d9fb
Reformatted the code
Jun 27, 2023
00547a8
Removed experiments folder
Jun 27, 2023
92c719b
Removed experiments folders
Jun 28, 2023
5965d7d
Reverting the dataset path in tests to pass the GHA tests
Jun 28, 2023
f84a977
Updating the codes according to Mo's feedback
Jul 1, 2023
110260c
Updated Black to make arguments one per line
Jul 1, 2023
9e738cd
Reverted the accidental GHA yml style changes
Jul 1, 2023
7930a7a
Added type hints in comparison_utils
Jul 1, 2023
c245147
Merge branch 'v4.0.3_prerelease' into fixed_loss
Adarsh321123 Jul 2, 2023
2 changes: 1 addition & 1 deletion .github/workflows/build_linux_wheels.yml
@@ -75,4 +75,4 @@ jobs:
password: ${{ secrets.PYPI_APIKEY }}
# For TestPyPI. TODO: RBF
# password: ${{ secrets.TEST_PYPI_APIKEY }}
# repository_url: https://test.pypi.org/legacy/
# repository_url: https://test.pypi.org/legacy/
12 changes: 6 additions & 6 deletions .github/workflows/run_linux_tests.yml
@@ -1,12 +1,12 @@
name: Linux - build package and run tests
on: [push, pull_request]
on: [ push, pull_request ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
steps:
- uses: actions/checkout@v2

@@ -21,7 +21,7 @@ jobs:
sudo apt install -y build-essential checkinstall libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev
sudo apt install -y clang-format cppcheck

- name: Install Python dependencies
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest
@@ -54,7 +54,7 @@ jobs:
python -m pip install --no-use-pep517 --no-build-isolation -vvv -e .
env:
# The default compiler on the Github Ubuntu runners is gcc
# Would need to make a respective include change for clang
# Would need to make a respective include change for clang
CPLUS_INCLUDE_PATH: /usr/local/include/carma

- name: Downloading data files for tests
@@ -70,7 +70,7 @@ jobs:
tar -xzvf data/MNIST_70k.tar.gz -C data

- name: Run smaller suite of test cases
run : |
run: |
pytest tests/test_smaller.py

# - name: Run larger suite of test cases
@@ -86,7 +86,7 @@ jobs:
run: git clone -b ${GITHUB_REF#refs/heads/} https://github.com/motiwari/BanditPAM

- name: Verify that the C++ executable compiles and runs
run : |
run: |
cd BanditPAM
mkdir build
cd build
4 changes: 2 additions & 2 deletions .gitignore
@@ -10,11 +10,11 @@ BanditPAM.egg-info/
__pycache__
banditpam.cpython*
build/
build/
data/MNIST_*.csv
data/MNIST_CSV.csv
data/data.csv
data/scrna*
data/*.py
data/cifar*
docs/html
docs/latex
python_generate_plots/__pycache__
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -4,16 +4,17 @@ project(BanditPAM VERSION 1.0 LANGUAGES CXX)

# TODO(@motiwari): Should RELEASE? be in caps
set(CMAKE_BUILD_TYPE Release)
# set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_STANDARD 17)

set(CMAKE_CXX_FLAGS_RELEASE "-O3")

# Note: ThreadSanitizer and AddressSanitizer can give spurious errors around the #pragma omp critical.
# They don't really matter because the threads always resolve with the same end state
# ThreadSanitizer also introduces a very large overhead, AddressSanitizer is more reasonable
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g -fno-omit-frame-pointer -fsanitize=address")
# set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g -fno-omit-frame-pointer -fsanitize=address")

# NOTE: Need to explicitly pass -O0 to enable printing of armadillo matrices
#set(CMAKE_CXX_FLAGS_DEBUG "-O0 -Wall -Wextra -g -fno-omit-frame-pointer")
# set(CMAKE_CXX_FLAGS_DEBUG "-O0 -Wall -Wextra -g -fno-omit-frame-pointer")

add_subdirectory(src)
14 changes: 14 additions & 0 deletions data/mnist_change_size.py
@@ -0,0 +1,14 @@
import pandas as pd

# Load the CSV file into a Pandas dataframe
df = pd.read_csv("scrna_reformat.csv", header=None)

print(df.describe())

# Get the first 10,000 rows
# df_10k = df.head(100)

# print(df_10k.describe())

# Save the first 10,000 rows to a new CSV file
# df_10k.to_csv("mnist_10k.csv", index=False)
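
As committed, data/mnist_change_size.py reads scrna_reformat.csv despite its name, and the subsetting logic is commented out. For reference, a minimal sketch of the apparent intent (resizing an MNIST CSV), with hypothetical input and output paths:

import pandas as pd

# Hypothetical paths; adjust to the dataset actually being resized.
df = pd.read_csv("data/MNIST_70k.csv", header=None)
print(df.describe())

# Keep the first 10,000 rows and save them to a new CSV file.
df_10k = df.head(10_000)
df_10k.to_csv("data/MNIST_10k.csv", header=False, index=False)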
7 changes: 7 additions & 0 deletions data/preprocess_cifar.py
@@ -0,0 +1,7 @@
import pandas as pd

path = "data/cifar10.csv"
csv = pd.read_csv(path)
first_column_label = csv.columns[0]
csv = csv.drop(first_column_label, axis=1)
csv.to_csv(path, header=False, index=False)
10 changes: 10 additions & 0 deletions repro_script.py
@@ -0,0 +1,10 @@
from experiments.run_scaling_experiment import (
run_scaling_experiment_with_n,
run_scaling_experiment_with_k,
)

# Scaling with n
run_scaling_experiment_with_n()

# Scaling with k
run_scaling_experiment_with_k()
32 changes: 32 additions & 0 deletions repro_script.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# 1. Install BanditPAM from the source
#pip install -r requirements.txt
pip uninstall -y banditpam
pip install -e .

# 2. Install datasets if necessary
# MNIST
if [ -f "data/MNIST_70k.csv" ]; then
echo "MNIST found"
else
echo "Installing MNIST..."
wget -P data https://motiwari.com/banditpam_data/MNIST_70k.tar.gz
tar -xzvf data/MNIST_70k.tar.gz -C data
rm data/MNIST_70k.tar.gz
fi

# CIFAR-10
if [ -f "data/cifar10.csv" ]; then
echo "CIFAR-10 found"
else
echo "Installing CIFAR-10..."
wget -P data https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar -xzvf data/cifar-10-python.tar.gz -C data
rm data/cifar-10-python.tar.gz
# Preprocess the dataset
python data/preprocess_cifar.py
fi

# 3. Run the experiments
python experiments/run_scaling_experiment.py
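
Note that cifar-10-python.tar.gz extracts to pickled batches under cifar-10-batches-py/, not to data/cifar10.csv, so a conversion step is implied between the download and preprocess_cifar.py. A minimal sketch, assuming the standard batch format and the CSV layout preprocess_cifar.py expects (a header row plus a leading label column that it drops):

import pickle

import numpy as np
import pandas as pd

# Load the five pickled training batches (assumed extracted under data/).
batches, labels = [], []
for i in range(1, 6):
    with open(f"data/cifar-10-batches-py/data_batch_{i}", "rb") as f:
        batch = pickle.load(f, encoding="bytes")
    batches.append(batch[b"data"])  # (10000, 3072) uint8 pixel rows
    labels.extend(batch[b"labels"])

# The first column holds the label; preprocess_cifar.py drops it and the header.
df = pd.DataFrame(np.vstack(batches))
df.insert(0, "label", labels)
df.to_csv("data/cifar10.csv", index=False)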
59 changes: 30 additions & 29 deletions scripts/cache_measurements.py
@@ -8,9 +8,7 @@


def time_measured_fit(
kmed: KMedoids,
X: np.array,
loss: str = "L2",
kmed: KMedoids, X: np.array, loss: str = "L2",
):
start = time.time()
kmed.fit(X, loss)
@@ -19,10 +19,7 @@ def time_measured_fit(


def get_cache_statistics(
kmed: KMedoids,
X: np.array,
loss: str = "L2",
cache_width: int = 1000,
kmed: KMedoids, X: np.array, loss: str = "L2", cache_width: int = 1000,
):
kmed.cache_width = cache_width
time = time_measured_fit(kmed, X, loss)
@@ -55,22 +50,30 @@ def test_cache_stats():
hits_1000,
misses_1000,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=1000)
time_750, width_750, writes_750, hits_750, misses_750 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=750
)
time_500, width_500, writes_500, hits_500, misses_500 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=500
)
time_250, width_250, writes_250, hits_250, misses_250 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=250
)
time_0, width_0, writes_0, hits_0, misses_0 = \
get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=0
)
(
time_750,
width_750,
writes_750,
hits_750,
misses_750,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=750)
(
time_500,
width_500,
writes_500,
hits_500,
misses_500,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=500)
(
time_250,
width_250,
writes_250,
hits_250,
misses_250,
) = get_cache_statistics(kmed=kmed, X=X, loss="L2", cache_width=250)
time_0, width_0, writes_0, hits_0, misses_0 = get_cache_statistics(
kmed=kmed, X=X, loss="L2", cache_width=0
)

assert (
hits_1000 > hits_750 > hits_500 > hits_250 > hits_0
@@ -92,8 +95,9 @@ def test_cache_stats():
assert width_250 == 250, "Cache width should be 250 when set to 250"
assert width_500 == 500, "Cache width should be 500 when set to 500"
assert width_750 == 750, "Cache width should be 750 when set to 750"
assert width_1000 == 1000, "Cache width should be 1000 when set to " \
"1000"
assert width_1000 == 1000, (
"Cache width should be 1000 when set to " "1000"
)

def test_parallelization():
X = np.loadtxt(os.path.join("data", "MNIST_10k.csv"))
@@ -176,10 +180,7 @@ def test_old_bpam_vs_new_bpam():
bpam_orig_hits,
bpam_orig_misses,
) = get_cache_statistics(
kmed=kmed_orig,
X=X,
loss="L2",
cache_width=1000,
kmed=kmed_orig, X=X, loss="L2", cache_width=1000,
)

print(bpam_time, bpam_orig_time)
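
For reference, a minimal sketch of how get_cache_statistics might be driven directly, assuming banditpam is installed, the script is importable from the repository root, and data/MNIST_10k.csv has been downloaded:

import os

import numpy as np
from banditpam import KMedoids

from scripts.cache_measurements import get_cache_statistics

# Assumed local setup: run from the repo root with data/MNIST_10k.csv present.
X = np.loadtxt(os.path.join("data", "MNIST_10k.csv"))
kmed = KMedoids(n_medoids=5, algorithm="BanditPAM")

runtime, width, writes, hits, misses = get_cache_statistics(
    kmed=kmed, X=X, loss="L2", cache_width=1000
)
print(f"Fit took {runtime:.2f}s: {hits:,} cache hits, {misses:,} misses")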
41 changes: 0 additions & 41 deletions scripts/compare_banditpam_versions.py

This file was deleted.

60 changes: 54 additions & 6 deletions scripts/comparison_utils.py
@@ -1,4 +1,11 @@
import os
import pandas as pd


def print_results(kmed, runtime):
complexity_with_caching = (
kmed.getDistanceComputations(True) - kmed.cache_hits
)
print("-----Results-----")
print("Algorithm:", kmed.algorithm)
print("Final Medoids:", kmed.medoids)
Expand All @@ -7,17 +14,58 @@ def print_results(kmed, runtime):
print("Build complexity:", f"{kmed.build_distance_computations:,}")
print("Swap complexity:", f"{kmed.swap_distance_computations:,}")
print("Number of Swaps", kmed.steps)
print(
"Average Swap Sample Complexity:",
f"{kmed.swap_distance_computations / kmed.steps:,}",
)
print("Cache Writes: {:,}".format(kmed.cache_writes))
print("Cache Hits: {:,}".format(kmed.cache_hits))
print("Cache Misses: {:,}".format(kmed.cache_misses))
print(
"Total complexity (without misc):",
f"{kmed.getDistanceComputations(False):,}"
f"{kmed.getDistanceComputations(False):,}",
)
print(
"Total complexity (with misc):",
f"{kmed.getDistanceComputations(True):,}",
)
print("Runtime per swap:", runtime / kmed.steps)
print(
"Total complexity (with caching):", f"{complexity_with_caching:,}",
)
# print("Runtime per swap:", runtime / kmed.steps)
print("Total runtime:", runtime)


def store_results(kmed, runtime, log_dir, log_name, num_data, num_medoids):
# Create a dictionary with the printed values
log_dict = {
"num_data": num_data,
"num_medoids": num_medoids,
"loss": kmed.average_loss,
"misc_complexity": kmed.misc_distance_computations,
"build_complexity": kmed.build_distance_computations,
"swap_complexity": kmed.swap_distance_computations,
"number_of_swaps": kmed.steps,
"cache_writes": kmed.cache_writes,
"cache_hits": kmed.cache_hits,
"cache_misses": kmed.cache_misses,
"average_swap_sample_complexity": kmed.swap_distance_computations
/ kmed.steps,
"total_complexity_without_misc": kmed.getDistanceComputations(False),
"total_complexity_with_misc": kmed.getDistanceComputations(True),
"total_complexity_with_caching": kmed.getDistanceComputations(True)
- kmed.cache_hits,
"runtime_per_swap": runtime / kmed.steps,
"total_runtime": runtime,
}
log_pd_row = pd.DataFrame([log_dict])

os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, f"{log_name}.csv")
if os.path.exists(log_path):
log_df = pd.read_csv(log_path)
else:
log_df = pd.DataFrame(columns=log_dict.keys())

# Append the dictionary to the dataframe
log_df = pd.concat([log_df, log_pd_row], ignore_index=True)

# Save the updated dataframe back to the CSV file
log_df.to_csv(log_path, index=False)
print("Saved log to ", log_path)
7 changes: 2 additions & 5 deletions scripts/comparison_with_fasterpam.py
@@ -69,7 +69,7 @@ def run_old_bandit(data, seed):
n_medoids=5,
parallelize=True,
algorithm="BanditPAM_orig",
dist_mat=diss
dist_mat=diss,
)
print(km.algorithm)
km.seed = seed
@@ -87,10 +87,7 @@ def run_old_bandit(data, seed):

if __name__ == "__main__":
X, _ = fetch_openml(
"mnist_784",
version=1,
return_X_y=True,
as_frame=False,
"mnist_784", version=1, return_X_y=True, as_frame=False,
)
X = X[:20000] # at 20k, colab will timeout for BanditPAM
print(X.shape, type(X))