Feature/SK-831 | Self-supervised Learning Example (#603)

scaleoutsystems · May 13, 2024 · c6bc269 · c6bc269
1 parent 0281ad7
commit c6bc269
Show file tree

Hide file tree

Showing 12 changed files with 815 additions and 0 deletions.
diff --git a/examples/FedSimSiam/.dockerignore b/examples/FedSimSiam/.dockerignore
@@ -0,0 +1,4 @@
+data
+seed.npz
+*.tgz
+*.tar.gz
diff --git a/examples/FedSimSiam/.gitignore b/examples/FedSimSiam/.gitignore
@@ -0,0 +1,6 @@
+data
+*.npz
+*.tgz
+*.tar.gz
+.fedsimsiam
+client.yaml
diff --git a/examples/FedSimSiam/README.rst b/examples/FedSimSiam/README.rst
@@ -0,0 +1,125 @@
+FEDn Project: FedSimSiam on CIFAR-10
+------------------------------------
+
+This is an example FEDn Project that runs the federated self-supervised learning algorithm FedSimSiam on 
+the CIFAR-10 dataset. This is a standard example often used for benchmarking. To be able to run this example, you 
+need to have GPU access. 
+
+   **Note: We recommend all new users to start by following the Quickstart Tutorial: https://fedn.readthedocs.io/en/stable/quickstart.html** 
+
+Prerequisites
+-------------
+
+-  `Python 3.8, 3.9, 3.10 or 3.11 <https://www.python.org/downloads>`__
+-  `A FEDn Studio account <https://fedn.scaleoutsystems.com/signup>`__   
+-  Change the dependencies in the 'client/python_env.yaml' file to match your cuda version.
+
+Creating the compute package and seed model
+-------------------------------------------
+
+Install fedn: 
+
+.. code-block::
+
+   pip install fedn
+
+Clone this repository, then navigate into this directory:
+
+.. code-block::
+
+   git clone https://github.com/scaleoutsystems/fedn.git
+   cd fedn/examples/FedSimSiam
+
+Create the compute package:
+
+.. code-block::
+
+   fedn package create --path client
+
+This should create a file 'package.tgz' in the project folder.
+
+Next, generate a seed model (the first model in a global model trail):
+
+.. code-block::
+
+   fedn run build --path client
+
+This will create a seed model called 'seed.npz' in the root of the project. This step will take a few minutes, depending on hardware and internet connection (builds a virtualenv).  
+
+Using FEDn Studio
+-----------------
+
+Follow the instructions to register for FEDn Studio and start a project (https://fedn.readthedocs.io/en/stable/studio.html).
+
+In your Studio project:
+
+- Go to the 'Sessions' menu, click on 'New session', and upload the compute package (package.tgz) and seed model (seed.npz).
+- In the 'Clients' menu, click on 'Connect client' and download the client configuration file (client.yaml)
+- Save the client configuration file to the FedSimSiam example directory (fedn/examples/FedSimSiam)
+
+To connect a client, run the following command in your terminal:
+
+.. code-block::
+
+   fedn client start -in client.yaml --secure=True --force-ssl
+
+
+Running the example
+-------------------
+
+After everything is set up, go to 'Sessions' and click on 'New Session'. Click on 'Start run' and the example will execute. You can follow the training progress in the 'Events' and 'Models' views.
+The monitoring is done using a kNN classifier that is fitted on the feature embeddings of the training images that are obtained by
+FedSimSiam's encoder, and evaluated on the feature embeddings of the test images. This process is repeated after each training round.
+
+This is a common method to track FedSimSiam's training progress, as FedSimSiam aims to minimize the distance between the embeddings of similar images.
+A high accuracy implies that the feature embeddings for images within the same class are indeed close to each other in the
+embedding space, i.e., FedSimSiam learned useful feature embeddings.
+
+
+Running FEDn in local development mode:
+---------------------------------------
+
+Follow the steps above to install FEDn, generate 'package.tgz' and 'seed.npz'.
+
+Start a pseudo-distributed FEDn network using docker-compose:
+
+.. code-block::
+
+   docker compose \
+    -f ../../docker-compose.yaml \
+    -f docker-compose.override.yaml \
+    up
+
+This starts up local services for MongoDB, Minio, the API Server, one Combiner and two clients. 
+You can verify the deployment using these urls: 
+
+- API Server: http://localhost:8092/get_controller_status
+- Minio: http://localhost:9000
+- Mongo Express: http://localhost:8081
+
+Upload the package and seed model to FEDn controller using the APIClient:
+
+.. code-block::
+
+   from fedn import APIClient
+   client = APIClient(host="localhost", port=8092)
+   client.set_active_package("package.tgz", helper="numpyhelper")
+   client.set_active_model("seed.npz")
+
+
+You can now start a training session with 100 rounds using the API client:
+
+.. code-block::
+
+   client.start_session(rounds=100)
+
+Clean up 
+--------
+
+You can clean up by running
+
+.. code-block::
+
+   docker compose \
+   -f ../../docker-compose.yaml \
+   -f docker-compose.override.yaml \
+   down -v
diff --git a/examples/FedSimSiam/client/data.py b/examples/FedSimSiam/client/data.py
@@ -0,0 +1,150 @@
+import os
+from math import floor
+
+import numpy as np
+import torch
+import torchvision
+from torchvision import transforms
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+abs_path = os.path.abspath(dir_path)
+
+
+def get_data(out_dir="data"):
+    # Make dir if necessary
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    # Only download if not already downloaded
+    if not os.path.exists(f"{out_dir}/train"):
+        torchvision.datasets.CIFAR10(
+            root=f"{out_dir}/train", train=True, download=True)
+
+    if not os.path.exists(f"{out_dir}/test"):
+        torchvision.datasets.CIFAR10(
+            root=f"{out_dir}/test", train=False, download=True)
+
+
+def load_data(data_path, is_train=True):
+    """ Load data from disk.
+
+    :param data_path: Path to data file.
+    :type data_path: str
+    :param is_train: Whether to load training or test data.
+    :type is_train: bool
+    :return: Tuple of data and labels.
+    :rtype: tuple
+    """
+    if data_path is None:
+        data_path = os.environ.get(
+            "FEDN_DATA_PATH", abs_path+"/data/clients/1/cifar10.pt")
+
+    data = torch.load(data_path)
+
+    if is_train:
+        X = data["x_train"]
+        y = data["y_train"]
+    else:
+        X = data["x_test"]
+        y = data["y_test"]
+
+    return X, y
+
+
+def create_knn_monitoring_dataset(out_dir="data"):
+    """ Creates dataset that is used to monitor the training progress via knn accuracies """
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    n_splits = int(os.environ.get("FEDN_NUM_DATA_SPLITS", 2))
+
+    # Make dir
+    if not os.path.exists(f"{out_dir}/clients"):
+        os.mkdir(f"{out_dir}/clients")
+
+    normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
+                                     std=[0.247, 0.243, 0.261])
+
+    memoryset = torchvision.datasets.CIFAR10(root="./data", train=True,
+                                             download=True, transform=transforms.Compose([transforms.ToTensor(), normalize]))
+    testset = torchvision.datasets.CIFAR10(root="./data", train=False,
+                                           download=True, transform=transforms.Compose([transforms.ToTensor(), normalize]))
+
+    # save monitoring datasets to all clients
+    for i in range(n_splits):
+        subdir = f"{out_dir}/clients/{str(i+1)}"
+        if not os.path.exists(subdir):
+            os.mkdir(subdir)
+        torch.save(memoryset, f"{subdir}/knn_memoryset.pt")
+        torch.save(testset, f"{subdir}/knn_testset.pt")
+
+
+def load_knn_monitoring_dataset(data_path, batch_size=16):
+    """ Loads the KNN monitoring dataset."""
+    if data_path is None:
+        data_path = os.environ.get(
+            "FEDN_DATA_PATH", abs_path+"/data/clients/1/cifar10.pt")
+
+    data_directory = os.path.dirname(data_path)
+    memory_path = os.path.join(data_directory, "knn_memoryset.pt")
+    testset_path = os.path.join(data_directory, "knn_testset.pt")
+
+    memoryset = torch.load(memory_path)
+    testset = torch.load(testset_path)
+
+    memoryset_loader = torch.utils.data.DataLoader(
+        memoryset, batch_size=batch_size, shuffle=False)
+    testset_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
+                                                 shuffle=False)
+    return memoryset_loader, testset_loader
+
+
+def splitset(dataset, parts):
+    n = dataset.shape[0]
+    local_n = floor(n/parts)
+    result = []
+    for i in range(parts):
+        result.append(dataset[i*local_n: (i+1)*local_n])
+    return result
+
+
+def split(out_dir="data"):
+
+    n_splits = int(os.environ.get("FEDN_NUM_DATA_SPLITS", 2))
+
+    # Make dir
+    if not os.path.exists(f"{out_dir}/clients"):
+        os.mkdir(f"{out_dir}/clients")
+
+    train_data = torchvision.datasets.CIFAR10(
+        root=f"{out_dir}/train", train=True)
+    test_data = torchvision.datasets.CIFAR10(
+        root=f"{out_dir}/test", train=False)
+
+    data = {
+        "x_train": splitset(train_data.data, n_splits),
+        "y_train": splitset(np.array(train_data.targets), n_splits),
+        "x_test": splitset(test_data.data, n_splits),
+        "y_test": splitset(np.array(test_data.targets), n_splits),
+    }
+
+    # Make splits
+    for i in range(n_splits):
+        subdir = f"{out_dir}/clients/{str(i+1)}"
+        if not os.path.exists(subdir):
+            os.mkdir(subdir)
+        torch.save({
+            "x_train": data["x_train"][i],
+            "y_train": data["y_train"][i],
+            "x_test": data["x_test"][i],
+            "y_test": data["y_test"][i],
+        },
+            f"{subdir}/cifar10.pt")
+
+
+if __name__ == "__main__":
+    # Prepare data if not already done
+    if not os.path.exists(abs_path+"/data/clients/1"):
+        get_data()
+        split()
+        create_knn_monitoring_dataset()
diff --git a/examples/FedSimSiam/client/fedn.yaml b/examples/FedSimSiam/client/fedn.yaml
@@ -0,0 +1,10 @@
+# Compute package definition: the Python environment file plus the command
+# that is run for each entry point.
+python_env: python_env.yaml
+entry_points:
+  # Generates the seed model (invoked with `fedn run build --path client`).
+  build:
+    command: python model.py
+  # Runs data.py, which downloads and partitions the data if needed.
+  # NOTE(review): presumably executed once when a client starts; confirm against the FEDn docs.
+  startup:
+    command: python data.py
+  train:
+    command: python train.py
+  validate:
+    command: python validate.py