Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Push test-results to circle for nightly tests #4153

Merged
merged 24 commits into from Aug 22, 2022
48 changes: 44 additions & 4 deletions .circleci/config.yml
Expand Up @@ -806,8 +806,7 @@ jobs:
done
no_output_timeout: "5m"
- run:
# todo: make getting the exit_code more robust: add check for "commands succeeded"
name: "Wait for tests to finish, get pod logs, and parse exit code"
name: "Wait for tests to finish"
command: |
sleep 30

Expand All @@ -828,19 +827,60 @@ jobs:
echo
sleep << parameters.sleep_time >>
done

no_output_timeout: "20m"
- run:
    # Launches the helper "attach" pod, waits until it reports a usable state,
    # copies /wandb-store/test-results out of it into
    # results/<pod_name>, and finally deletes the helper pod.
    name: "Grab results"
    command: |
      # grab result data
      echo "Launch attach pod to get results..."
      kubectl create -f << parameters.path >>/attach.yaml
      attach_pod="<< parameters.pod_name >>-attach-pod"
      while true; do
        kubectl get pods
        # `|| true`: a grep miss (pod not listed yet) must not abort the whole
        # step when the shell runs with -e / pipefail; just poll again.
        pod_state=$(kubectl get pods | grep "${attach_pod}" || true)
        # Every state check looks only at the attach pod's own status line, so
        # an unrelated pod in CrashLoopBackOff cannot fail this step.
        case "$pod_state" in
          *Running*)
            echo "Attach Pod for << parameters.image_name >> is running"
            break
            ;;
          *Completed*)
            # NOTE(review): kubectl cp runs tar inside the container, so copying
            # from a Completed pod is expected to fail - confirm whether this
            # branch should be treated as an error instead.
            echo "Attach Pod for << parameters.image_name >> has completed"
            break
            ;;
          *Error*|*CrashLoopBackOff*)
            echo "Attach Pod for << parameters.image_name >> is in an error state"
            kubectl logs "${attach_pod}"
            echo "About to exit"
            exit 1
            ;;
        esac
        echo "Sleeping for << parameters.sleep_time >> seconds"
        echo
        sleep << parameters.sleep_time >>
      done
      echo "Copy results from pod..."
      mkdir -p results/<< parameters.pod_name >>
      kubectl cp "${attach_pod}":/wandb-store/test-results results/<< parameters.pod_name >>
      echo "Delete attach pod..."
      kubectl delete pod "${attach_pod}"
    no_output_timeout: "10m"
- run:
# Prints the test pod's logs to the job output, then derives this step's exit
# status from them: the logs are captured a second time, echoed unquoted (so
# the shell's word splitting puts everything on a single line) and searched
# with `grep -c "commands failed"`, which therefore yields 0 or 1. The step
# exits with that number, failing the job when the phrase is present.
# NOTE(review): "commands failed" is presumably tox's failure summary - confirm.
# NOTE(review): the outer `echo $(...)` looks deliberate: it keeps the
# assignment's status at zero when grep finds nothing, which matters if the
# step's shell runs with -e - confirm before simplifying.
# todo: make getting the exit_code more robust: add check for "commands succeeded"
name: "Get pod logs and parse exit code"
command: |
kubectl logs << parameters.pod_name >>
logs=`kubectl logs << parameters.pod_name >>`
exit_code=`echo $(echo $logs | grep -c "commands failed")`
echo "Pod for << parameters.image_name >> exited with code ${exit_code}"
exit ${exit_code}
no_output_timeout: "20m"
# Publish the results that the "Grab results" step copied into
# results/<pod_name>: once as test results and once as build artifacts
# under the "test-results" destination.
# NOTE(review): the extra test-results path segment presumably comes from the
# pod-store tox env copying test-results into an already existing
# /wandb-store/test-results directory - confirm the layout.
- store_test_results:
path: results/<< parameters.pod_name >>/test-results
- store_artifacts:
path: results/<< parameters.pod_name >>/test-results
destination: test-results
- run:
    # Best-effort clean-up that runs whether or not earlier steps failed.
    # Each delete is guarded with `|| echo` so a missing resource (for example
    # an attach pod that "Grab results" already removed) never fails the job.
    name: "Delete the pod"
    when: always
    command: |
      kubectl get pods
      # The attach manifest is created from attach.yaml, so delete that same file.
      # It goes first because the attach pod mounts the results claim.
      # NOTE(review): assumes the pod config below also declares that
      # volume/claim (as the gke_cpu/gke_gpu pod.yaml files do) - confirm.
      kubectl delete -f << parameters.path >>/attach.yaml || echo "Problem deleting attach pod"
      kubectl delete -f << parameters.path >>/<< parameters.pod_config_name >> || echo "Problem deleting pod"
# conditionally post a notification to slack if the job failed/succeeded
- when:
condition: << parameters.notify_on_failure >>
Expand Down
2 changes: 1 addition & 1 deletion tests/standalone_tests/shards/gke_cpu/Dockerfile
Expand Up @@ -46,4 +46,4 @@ RUN PATH=/home/sdk/.local/bin:$PATH
WORKDIR /wandb/wandb
ENV DATE=$UTC_DATE
#CMD ["tail", "-f", "/dev/null"]
CMD ["python", "-m", "tox", "-v", "-e", "standalone-cpu-py38"]
CMD ["python", "-m", "tox", "-v", "-e", "standalone-cpu-py38,pod-store"]
18 changes: 18 additions & 0 deletions tests/standalone_tests/shards/gke_cpu/attach.yaml
@@ -0,0 +1,18 @@
# Helper pod for the CPU shard: mounts the cpu-pod results claim at
# /wandb-store and sleeps for 600 seconds, giving CI a container to copy the
# stored test results from.
apiVersion: v1
kind: Pod
metadata:
  name: 'cpu-pod-attach-pod'
spec:
  # The container is not restarted once the sleep ends.
  restartPolicy: Never
  volumes:
    - name: cpu-pod-results-volumeclaim-name
      persistentVolumeClaim:
        claimName: cpu-pod-results-volumeclaim
  containers:
    - name: 'cpu-pod-attach-container'
      image: 'alpine:latest'
      command:
        - '/bin/sh'
      args:
        - '-c'
        - 'sleep 600'
      volumeMounts:
        - name: cpu-pod-results-volumeclaim-name
          mountPath: '/wandb-store'
36 changes: 35 additions & 1 deletion tests/standalone_tests/shards/gke_cpu/pod.yaml
@@ -1,5 +1,33 @@
---
# Volume that backs the CPU shard's results store: 1Gi, single-writer,
# served from the node's /tmp directory.
# NOTE(review): hostPath storage is local to one node - confirm the test pod
# and the attach pod are guaranteed to land on the same node.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: cpu-pod-results-volume
spec:
  storageClassName: cpu-pod-pv
  accessModes: [ReadWriteOnce]
  capacity:
    storage: 1Gi
  hostPath:
    path: '/tmp'

---
# Claim used by the CPU test pod and its attach pod to share the results
# store; requests 100Mi from the cpu-pod-pv storage class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: cpu-pod-results-volumeclaim
spec:
  storageClassName: cpu-pod-pv
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Mi

---
kind: Pod
apiVersion: v1
metadata:
name: cpu-pod
spec:
Expand All @@ -10,9 +38,15 @@ spec:
env:
- name: WANDB_API_KEY
value: WANDB_API_KEY_PLACEHOLDER
# command: ["tail", "-f", "/dev/null"]
resources:
limits:
cpu: 3.999
requests:
cpu: 3.5
volumeMounts:
- mountPath: "/wandb-store"
name: cpu-pod-results-volumeclaim-name
volumes:
- name: cpu-pod-results-volumeclaim-name
persistentVolumeClaim:
claimName: cpu-pod-results-volumeclaim
2 changes: 1 addition & 1 deletion tests/standalone_tests/shards/gke_gpu/Dockerfile
Expand Up @@ -55,4 +55,4 @@ WORKDIR /wandb/wandb
RUN sed -i -e 's/whl\/cpu/whl\/cu113/g' tox.ini
ENV DATE=$UTC_DATE
#CMD ["tail", "-f", "/dev/null"]
CMD ["python", "-m", "tox", "-v", "-e", "standalone-gpu-py38"]
CMD ["python", "-m", "tox", "-v", "-e", "standalone-gpu-py38,pod-store"]
18 changes: 18 additions & 0 deletions tests/standalone_tests/shards/gke_gpu/attach.yaml
@@ -0,0 +1,18 @@
# Helper pod for the GPU shard: mounts the gpu-pod results claim at
# /wandb-store and sleeps for 600 seconds, giving CI a container to copy the
# stored test results from.
apiVersion: v1
kind: Pod
metadata:
  name: 'gpu-pod-attach-pod'
spec:
  # The container is not restarted once the sleep ends.
  restartPolicy: Never
  volumes:
    - name: gpu-pod-results-volumeclaim-name
      persistentVolumeClaim:
        claimName: gpu-pod-results-volumeclaim
  containers:
    - name: 'gpu-pod-attach-container'
      image: 'alpine:latest'
      command:
        - '/bin/sh'
      args:
        - '-c'
        - 'sleep 600'
      volumeMounts:
        - name: gpu-pod-results-volumeclaim-name
          mountPath: '/wandb-store'
39 changes: 35 additions & 4 deletions tests/standalone_tests/shards/gke_gpu/pod.yaml
@@ -1,5 +1,33 @@
---
# Volume that backs the GPU shard's results store: 1Gi, single-writer,
# served from the node's /tmp directory.
# NOTE(review): hostPath storage is local to one node - confirm the test pod
# and the attach pod are guaranteed to land on the same node.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: gpu-pod-results-volume
spec:
  storageClassName: gpu-pod-pv
  accessModes: [ReadWriteOnce]
  capacity:
    storage: 1Gi
  hostPath:
    path: '/tmp'

---
# Claim used by the GPU test pod and its attach pod to share the results
# store; requests 100Mi from the gpu-pod-pv storage class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gpu-pod-results-volumeclaim
spec:
  storageClassName: gpu-pod-pv
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Mi

---
kind: Pod
apiVersion: v1
metadata:
name: gpu-pod
spec:
Expand All @@ -10,13 +38,16 @@ spec:
env:
- name: WANDB_API_KEY
value: WANDB_API_KEY_PLACEHOLDER
# command: ["tail", "-f", "/dev/null"]
# do not restart the container if it exits
resources:
limits:
cpu: 3.999
nvidia.com/gpu: 2
requests:
cpu: 3.5
# ports:
# - containerPort: 80
volumeMounts:
- mountPath: "/wandb-store"
name: gpu-pod-results-volumeclaim-name
volumes:
- name: gpu-pod-results-volumeclaim-name
persistentVolumeClaim:
claimName: gpu-pod-results-volumeclaim
8 changes: 8 additions & 0 deletions tox.ini
Expand Up @@ -447,6 +447,14 @@ commands =
func-s_noml-py{36,37,38,39,310}: yea {env:CI_PYTEST_SPLIT_ARGS:} --strict --shard noml run {posargs:--all}
func-s_kfp-py{37}: yea {env:CI_PYTEST_SPLIT_ARGS:} --strict -p wandb:mockserver-bind=0.0.0.0 -p wandb:mockserver-host=__auto__ --shard kfp run {posargs:--all}

[testenv:pod-store]
# Copies the local test-results directory into /wandb-store, the path where
# the gke_cpu/gke_gpu pod manifests mount the shared results claim.
# mkdir and cp are external (non-virtualenv) commands, so they are listed
# explicitly as permitted externals.
# Because the destination directory already exists when cp runs, the copy ends
# up at /wandb-store/test-results/test-results.
# NOTE(review): newer tox releases spell this option allowlist_externals -
# confirm against the tox version pinned for these images.
whitelist_externals =
mkdir
cp
commands =
mkdir -p /wandb-store/test-results
cp -rp test-results /wandb-store/test-results

[testenv:func-cover]
skip_install = true
deps =
Expand Down