wandb · raubitsj · Aug 22, 2022 · Aug 20, 2022 · Aug 20, 2022 · Aug 20, 2022
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -806,8 +806,7 @@ jobs:
                   done
                 no_output_timeout: "5m"
             - run:
-                # todo: make getting the exit_code more robust: add check for "commands succeeded"
-                name: "Wait for tests to finish, get pod logs, and parse exit code"
+                name: "Wait for tests to finish"
                 command: |
                   sleep 30
 
@@ -828,19 +827,60 @@ jobs:
                     echo
                     sleep << parameters.sleep_time >>
                   done
-
+                no_output_timeout: "20m"
+            - run:
+                name: "Grab results"
+                command: |
+                  # Start a helper pod that mounts the results volume so the test output can be copied out
+                  echo "Launch attach pod to get results..."
+                  kubectl create -f << parameters.path >>/attach.yaml
+                  while true; do
+                    kubectl get pods
+                    pod_state=$(kubectl get pods | grep << parameters.pod_name >>-attach-pod)
+                    if [ "$(echo "$pod_state" | grep -c 'Running')" -eq "1" ]; then
+                      echo "Attach Pod for << parameters.image_name >> is running"
+                      break
+                    elif [ "$(echo "$pod_state" | grep -c 'Completed')" -eq "1" ]; then
+                      echo "Attach Pod for << parameters.image_name >> has completed"
+                      break
+                    elif [ "$(echo "$pod_state" | grep -c 'Error')" -eq "1" ] || [ "$(echo "$pod_state" | grep -c 'CrashLoopBackOff')" -eq "1" ]; then
+                      echo "Attach Pod for << parameters.image_name >> is in an error state"
+                      kubectl logs << parameters.pod_name >>-attach-pod
+                      echo "About to exit"
+                      exit 1
+                    fi
+                    echo "Sleeping for << parameters.sleep_time >> seconds"
+                    echo
+                    sleep << parameters.sleep_time >>
+                  done
+                  echo "Copy results from pod..."
+                  mkdir -p results/<< parameters.pod_name >>
+                  kubectl cp << parameters.pod_name >>-attach-pod:/wandb-store/test-results results/<< parameters.pod_name >>
+                  echo "Delete attach pod..."
+                  kubectl delete pod << parameters.pod_name >>-attach-pod
+                no_output_timeout: "10m"
+            - run:
+                # todo: make getting the exit_code more robust: add check for "commands succeeded"
+                name: "Get pod logs and parse exit code"
+                command: |
                   kubectl logs << parameters.pod_name >>
                   logs=`kubectl logs << parameters.pod_name >>`
                   exit_code=`echo $(echo $logs | grep -c "commands failed")`
                   echo "Pod for << parameters.image_name >> exited with code ${exit_code}"
                   exit ${exit_code}
                 no_output_timeout: "20m"
+            - store_test_results:
+                path: results/<< parameters.pod_name >>/test-results
+            - store_artifacts:
+                path: results/<< parameters.pod_name >>/test-results
+                destination: test-results
             - run:
                 name: "Delete the pod"
                 when: always
                 command: |
                   kubectl get pods
-                  kubectl delete -f << parameters.path >>/<< parameters.pod_config_name >>
+                  kubectl delete -f << parameters.path >>/<< parameters.pod_config_name >> || echo "Problem deleting pod"
+                  kubectl delete -f << parameters.path >>/attach.yaml --ignore-not-found || echo "Problem deleting attach pod"
             # conditionally post a notification to slack if the job failed/succeeded
             - when:
                 condition: << parameters.notify_on_failure >>

diff --git a/tests/standalone_tests/shards/gke_cpu/Dockerfile b/tests/standalone_tests/shards/gke_cpu/Dockerfile
@@ -46,4 +46,4 @@ RUN PATH=/home/sdk/.local/bin:$PATH
 WORKDIR /wandb/wandb
 ENV DATE=$UTC_DATE
 #CMD ["tail", "-f", "/dev/null"]
-CMD ["python", "-m", "tox", "-v", "-e", "standalone-cpu-py38"]
+CMD ["python", "-m", "tox", "-v", "-e", "standalone-cpu-py38,pod-store"]
diff --git a/tests/standalone_tests/shards/gke_cpu/attach.yaml b/tests/standalone_tests/shards/gke_cpu/attach.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "cpu-pod-attach-pod"
+spec:
+  restartPolicy: Never
+  containers:
+    - name: "cpu-pod-attach-container"
+      image: "alpine:latest"
+      command: ["/bin/sh"]
+      args: ["-c", "sleep 600"]
+      volumeMounts:
+        - name: cpu-pod-results-volumeclaim-name
+          mountPath: "/wandb-store"
+  volumes:
+    - name: cpu-pod-results-volumeclaim-name
+      persistentVolumeClaim:
+        claimName: cpu-pod-results-volumeclaim
diff --git a/tests/standalone_tests/shards/gke_cpu/pod.yaml b/tests/standalone_tests/shards/gke_cpu/pod.yaml
@@ -1,5 +1,33 @@
+---
+kind: PersistentVolume
 apiVersion: v1
+metadata:
+  name: cpu-pod-results-volume
+spec:
+  storageClassName: cpu-pod-pv
+  capacity:
+    storage: 1Gi
+  accessModes:
+    - ReadWriteOnce
+  hostPath:
+    path: "/tmp"
+
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: cpu-pod-results-volumeclaim
+spec:
+  storageClassName: cpu-pod-pv
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 100Mi
+
+---
 kind: Pod
+apiVersion: v1
 metadata:
   name: cpu-pod
 spec:
@@ -10,9 +38,15 @@ spec:
       env:
         - name: WANDB_API_KEY
           value: WANDB_API_KEY_PLACEHOLDER
-#      command: ["tail", "-f", "/dev/null"]
       resources:
         limits:
           cpu: 3.999
         requests:
           cpu: 3.5
+      volumeMounts:
+        - mountPath: "/wandb-store"
+          name: cpu-pod-results-volumeclaim-name
+  volumes:
+    - name: cpu-pod-results-volumeclaim-name
+      persistentVolumeClaim:
+        claimName: cpu-pod-results-volumeclaim
diff --git a/tests/standalone_tests/shards/gke_gpu/Dockerfile b/tests/standalone_tests/shards/gke_gpu/Dockerfile
@@ -55,4 +55,4 @@ WORKDIR /wandb/wandb
 RUN sed -i -e 's/whl\/cpu/whl\/cu113/g' tox.ini
 ENV DATE=$UTC_DATE
 #CMD ["tail", "-f", "/dev/null"]
-CMD ["python", "-m", "tox", "-v", "-e", "standalone-gpu-py38"]
+CMD ["python", "-m", "tox", "-v", "-e", "standalone-gpu-py38,pod-store"]
diff --git a/tests/standalone_tests/shards/gke_gpu/attach.yaml b/tests/standalone_tests/shards/gke_gpu/attach.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "gpu-pod-attach-pod"
+spec:
+  restartPolicy: Never
+  containers:
+    - name: "gpu-pod-attach-container"
+      image: "alpine:latest"
+      command: ["/bin/sh"]
+      args: ["-c", "sleep 600"]
+      volumeMounts:
+        - name: gpu-pod-results-volumeclaim-name
+          mountPath: "/wandb-store"
+  volumes:
+    - name: gpu-pod-results-volumeclaim-name
+      persistentVolumeClaim:
+        claimName: gpu-pod-results-volumeclaim
diff --git a/tests/standalone_tests/shards/gke_gpu/pod.yaml b/tests/standalone_tests/shards/gke_gpu/pod.yaml
@@ -1,5 +1,33 @@
+---
+kind: PersistentVolume
 apiVersion: v1
+metadata:
+  name: gpu-pod-results-volume
+spec:
+  storageClassName: gpu-pod-pv
+  capacity:
+    storage: 1Gi
+  accessModes:
+    - ReadWriteOnce
+  hostPath:
+    path: "/tmp"
+
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: gpu-pod-results-volumeclaim
+spec:
+  storageClassName: gpu-pod-pv
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 100Mi
+
+---
 kind: Pod
+apiVersion: v1
 metadata:
   name: gpu-pod
 spec:
@@ -10,13 +38,16 @@ spec:
       env:
         - name: WANDB_API_KEY
           value: WANDB_API_KEY_PLACEHOLDER
-  #    command: ["tail", "-f", "/dev/null"]
-      # do not restart the container if it exits
       resources:
         limits:
           cpu: 3.999
           nvidia.com/gpu: 2
         requests:
           cpu: 3.5
-  #    ports:
-  #    - containerPort: 80
+      volumeMounts:
+        - mountPath: "/wandb-store"
+          name: gpu-pod-results-volumeclaim-name
+  volumes:
+    - name: gpu-pod-results-volumeclaim-name
+      persistentVolumeClaim:
+        claimName: gpu-pod-results-volumeclaim
diff --git a/tox.ini b/tox.ini
@@ -447,6 +447,14 @@ commands =
     func-s_noml-py{36,37,38,39,310}: yea {env:CI_PYTEST_SPLIT_ARGS:} --strict --shard noml run {posargs:--all}
     func-s_kfp-py{37}: yea {env:CI_PYTEST_SPLIT_ARGS:} --strict -p wandb:mockserver-bind=0.0.0.0 -p wandb:mockserver-host=__auto__ --shard kfp run {posargs:--all}
 
+[testenv:pod-store]
+whitelist_externals =
+    mkdir
+    cp
+commands =
+    mkdir -p /wandb-store/test-results
+    cp -rp test-results /wandb-store/test-results
+
 [testenv:func-cover]
 skip_install = true
 deps =