Separate arm64 and amd64 docker builds (#125617)

Fixes #125094

Please note: the Docker CUDA 12.4 failure is an existing issue, related to the docker image not being available on gitlab:
```
docker.io/nvidia/cuda:12.4.0-cudnn8-devel-ubuntu22.04: docker.io/nvidia/cuda:12.4.0-cudnn8-devel-ubuntu22.04: not found
```
https://github.com/pytorch/pytorch/actions/runs/8974959068/job/24648540236?pr=125617

Here is the reference issue: https://gitlab.com/nvidia/container-images/cuda/-/issues/225
Tracked on our side: pytorch/builder#1811

Pull Request resolved: #125617
Approved by: https://github.com/huydhn, https://github.com/malfet
pytorch · May 7, 2024 · b29d77b · b29d77b
1 parent 5dee462
commit b29d77b
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 6 deletions.
diff --git a/.github/scripts/generate_docker_release_matrix.py b/.github/scripts/generate_docker_release_matrix.py
@@ -21,6 +21,8 @@
 
 def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
     ret: List[Dict[str, str]] = []
+    # CUDA amd64 Docker images are available as both runtime and devel, while
+    # the CPU arm64 image is only available as runtime.
     for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items():
         for image in DOCKER_IMAGE_TYPES:
             ret.append(
@@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
                         cuda
                     ],
                     "image_type": image,
-                    "platform": "linux/arm64,linux/amd64",
+                    "platform": "linux/amd64",
                 }
             )
+    ret.append(
+        {
+            "cuda": "cpu",
+            "cuda_full_version": "",
+            "cudnn_version": "",
+            "image_type": "runtime",
+            "platform": "linux/arm64",
+        }
+    )
+
     return {"include": ret}
 
 

diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
@@ -7,6 +7,7 @@ on:
       - Dockerfile
       - docker.Makefile
       - .github/workflows/docker-release.yml
+      - .github/scripts/generate_docker_release_matrix.py
   push:
     branches:
       - nightly
@@ -129,17 +130,27 @@ jobs:
         if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
         run: |
           PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
+          CUDA_SUFFIX="-cu${CUDA_VERSION}"
+          if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then
+            PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
+            CUDA_SUFFIX=""
+          fi
 
           PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
                                           python -c 'import torch; print(torch.version.git_version[:7],end="")')
 
           docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
-                 ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
-          docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
+                 ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
+
+          docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
+
+          # Please note: here we need to pin a specific version of CUDA to be used with the latest label
+          if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then
+            docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \
+                    ghcr.io/pytorch/pytorch-nightly:latest
+            docker push ghcr.io/pytorch/pytorch-nightly:latest
+          fi
 
-          docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \
-                 ghcr.io/pytorch/pytorch-nightly:latest
-          docker push ghcr.io/pytorch/pytorch-nightly:latest
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always()
diff --git a/docker.Makefile b/docker.Makefile
@@ -83,6 +83,22 @@ devel-push: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CU
 devel-push:
 	$(DOCKER_PUSH)
 
+ifeq ("$(CUDA_VERSION_SHORT)","cpu")
+
+.PHONY: runtime-image
+runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
+runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-runtime
+runtime-image:
+	$(DOCKER_BUILD)
+
+.PHONY: runtime-push
+runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
+runtime-push: DOCKER_TAG := $(PYTORCH_VERSION)-runtime
+runtime-push:
+	$(DOCKER_PUSH)
+
+else
+
 .PHONY: runtime-image
 runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
 runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-runtime
@@ -95,6 +111,8 @@ runtime-push: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(
 runtime-push:
 	$(DOCKER_PUSH)
 
+endif
+
 .PHONY: clean
 clean:
 	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))