Update

[ghstack-poisoned]
pytorch · May 12, 2024 · eb21d0b · eb21d0b
2 parents 1a297b5 + fa30627
commit eb21d0b
Show file tree

Hide file tree

Showing 1,791 changed files with 19,867 additions and 260,802 deletions.
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -366,7 +366,7 @@ if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
 fi
 
 # Build image
-DOCKER_BUILDKIT=1 docker build \
+docker build \
        --no-cache \
        --progress=plain \
        --build-arg "BUILD_ENVIRONMENT=${image}" \

diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile
@@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}
 
-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi

diff --git a/.ci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh
@@ -5,8 +5,7 @@ set -ex
 install_ubuntu() {
   apt-get update
   apt-get install -y --no-install-recommends \
-          libopencv-dev \
-          libavcodec-dev
+          libopencv-dev
 
   # Cleanup
   apt-get autoclean && apt-get clean
@@ -19,8 +18,7 @@ install_centos() {
   yum --enablerepo=extras install -y epel-release
 
   yum install -y \
-      opencv-devel \
-      ffmpeg-devel
+      opencv-devel
 
   # Cleanup
   yum clean all

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
@@ -279,9 +279,9 @@ ghstack==0.8.0
 #Pinned versions: 0.8.0
 #test that import:
 
-jinja2==3.1.3
+jinja2==3.1.4
 #Description: jinja2 template engine
-#Pinned versions: 3.1.3
+#Pinned versions: 3.1.4
 #test that import:
 
 pytest-cpp==2.3.0
@@ -310,3 +310,5 @@ lxml==5.0.0.
 #Description: This is a requirement of unittest-xml-reporting
 
 # Python-3.9 binaries
+
+PyGithub==2.3.0
diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
@@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}
 
-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi

diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}
 
-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi

diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile
@@ -83,7 +83,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}
 
-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi

diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}
 
-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi

diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh
@@ -45,7 +45,10 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
-time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state.py
+time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
+
+# FSDP2 tests
+time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
 
 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx

diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
@@ -310,23 +310,23 @@ test_dynamo_shard() {
 test_inductor_distributed() {
   # Smuggle a few multi-gpu tests here so that we don't have to request another large node
   echo "Testing multi_gpu tests in test_torchinductor"
-  pytest test/inductor/test_torchinductor.py -k test_multi_gpu
-  pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
-  pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
-  pytest test/distributed/test_c10d_functional_native.py
-  pytest test/distributed/_tensor/test_dtensor_compile.py
-  pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
-  pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
-  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
-  pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
-  pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
-  pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
-  pytest test/distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration
+  python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
+  python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
+  python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
+  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
+  python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
+  python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
+  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
 
   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
   # with if required # gpus aren't available
@@ -522,6 +522,11 @@ test_single_dynamo_benchmark() {
   fi
 }
 
+test_inductor_micro_benchmark() {
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
+}
+
 test_dynamo_benchmark() {
   # Usage: test_dynamo_benchmark huggingface 0
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
@@ -1209,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
   test_torch_deploy
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
   test_inductor_distributed
+elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
+  test_inductor_micro_benchmark
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
   install_torchvision
   id=$((SHARD_NUMBER-1))

diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh
@@ -96,8 +96,13 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
     conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
   )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-  retry pip install -q numpy protobuf typing-extensions
+  if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
+    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+    retry pip install -q numpy protobuf typing-extensions
+  else
+    pip install "\$pkg"
+    retry pip install -q numpy protobuf typing-extensions
+  fi
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   pkg="\$(ls /final_pkgs/*-latest.zip)"

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
@@ -13,6 +13,7 @@ self-hosted-runner:
     - linux.8xlarge.nvidia.gpu
     - linux.16xlarge.nvidia.gpu
     - linux.g5.4xlarge.nvidia.gpu
+    - linux.s390x
     - windows.4xlarge.nonephemeral
     - windows.8xlarge.nvidia.gpu
     - windows.8xlarge.nvidia.gpu.nonephemeral
@@ -21,6 +22,7 @@ self-hosted-runner:
     - linux.rocm.gpu
     - macos-m1-stable
     - macos-m1-13
+    - macos-m1-14
     - macos-12-xl
     - macos-12
     - macos12.3-m1

diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml
@@ -35,7 +35,7 @@ runs:
           "${DOCKER_IMAGE}"
         )
 
-        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
+        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" ]]; then
           # Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
           grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
         fi
@@ -44,3 +44,12 @@ runs:
         # Generate test script
         docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
         docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
+
+    - name: Cleanup docker
+      if: always() && env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel'
+      shell: bash
+      run: |
+        # On s390x, stop the running containers so that the worker can shut down cleanly.
+        # Leave $(docker ps -q) unquoted on purpose: its output may be empty.
+        # shellcheck disable=SC2046
+        docker stop $(docker ps -q) || true
diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-06ad737628abc3a1e617571dc03cbdd5b36ea96a
+d23a6e1664d20707c11781299611436e1f0c104f
diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
@@ -29,10 +29,12 @@
   approved_by:
   - BowenBao
   - justinchuby
+  - liqunfu
   - shubhambhokare1
   - thiagocrepaldi
   - titaiwangms
   - wschin
+  - xadupre
   mandatory_checks_name:
   - EasyCLA
   - Lint

diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
@@ -8,6 +8,7 @@ ciflow_push_tags:
 - ciflow/binaries_wheel
 - ciflow/inductor
 - ciflow/inductor-perf-compare
+- ciflow/inductor-micro-benchmark
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly

diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt
@@ -5,7 +5,7 @@
 #   functorch/docs/requirements.txt
 #   .ci/docker/requirements-ci.txt
 boto3==1.19.12
-jinja2==3.1.3
+jinja2==3.1.4
 lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84

diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
 set -x
 
-WHEELHOUSE_DIR=/artifacts
+if [ -z "$1" ]; then
+    echo "Need wheel location argument" && exit 1
+fi
+
+WHEELHOUSE_DIR=$1
 PATCHELF_BIN=patchelf
 ROCM_LIB=backends/amd/lib
 ROCM_LD=backends/amd/llvm/bin

diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
@@ -157,10 +157,10 @@ def build_triton(
 
         if build_rocm:
             check_call(
-                [f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh"],
+                [f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
                 cwd=triton_basedir,
-                shell=True,
             )
+
         return Path.cwd() / whl_path.name
 
 

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
@@ -31,6 +31,9 @@
 CPU_AARCH64_ARCH = ["cpu-aarch64"]
 
 
+CPU_S390X_ARCH = ["cpu-s390x"]
+
+
 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "11.8": (
         "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
@@ -130,6 +133,8 @@ def arch_type(arch_version: str) -> str:
         return "cpu-cxx11-abi"
     elif arch_version in CPU_AARCH64_ARCH:
         return "cpu-aarch64"
+    elif arch_version in CPU_S390X_ARCH:
+        return "cpu-s390x"
     else:  # arch_version should always be "cpu" in this case
         return "cpu"
 
@@ -149,6 +154,7 @@ def arch_type(arch_version: str) -> str:
     "cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
     "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
     "cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
+    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
 }
 
 CONDA_CONTAINER_IMAGES = {
@@ -205,6 +211,7 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
         "cpu": "cpu",
         "cpu-aarch64": "cpu",
         "cpu-cxx11-abi": "cpu-cxx11-abi",
+        "cpu-s390x": "cpu",
         "cuda": f"cu{gpu_arch_version.replace('.', '')}",
         "rocm": f"rocm{gpu_arch_version}",
     }.get(gpu_arch_type, gpu_arch_version)
@@ -306,8 +313,8 @@ def generate_wheels_matrix(
     python_versions: Optional[List[str]] = None,
 ) -> List[Dict[str, str]]:
     package_type = "wheel"
-    if os == "linux" or os == "linux-aarch64":
-        # NOTE: We only build manywheel packages for x86_64 and aarch64 linux
+    if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
+        # NOTE: We only build manywheel packages for x86_64, aarch64, and s390x Linux
         package_type = "manywheel"
 
     if python_versions is None:
@@ -324,6 +331,10 @@ def generate_wheels_matrix(
             # Only want the one arch as the CPU type is different and
             # uses different build/test scripts
             arches = ["cpu-aarch64"]
+        elif os == "linux-s390x":
+            # Only want the one arch as the CPU type is different and
+            # uses different build/test scripts
+            arches = ["cpu-s390x"]
 
     ret: List[Dict[str, str]] = []
     for python_version in python_versions:
@@ -334,6 +345,7 @@ def generate_wheels_matrix(
                 if arch_version == "cpu"
                 or arch_version == "cpu-cxx11-abi"
                 or arch_version == "cpu-aarch64"
+                or arch_version == "cpu-s390x"
                 else arch_version
             )
 

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
@@ -95,6 +95,7 @@ class OperatingSystem:
     MACOS = "macos"
     MACOS_ARM64 = "macos-arm64"
     LINUX_AARCH64 = "linux-aarch64"
+    LINUX_S390X = "linux-s390x"
 
 
 LINUX_BINARY_BUILD_WORFKLOWS = [
@@ -332,6 +333,20 @@ class OperatingSystem:
     ),
 ]
 
+S390X_BINARY_BUILD_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX_S390X,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX_S390X
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+            isolated_workflow=True,
+        ),
+    ),
+]
+
 
 def main() -> None:
     jinja_env = jinja2.Environment(
@@ -350,6 +365,10 @@ def main() -> None:
             jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
             AARCH64_BINARY_BUILD_WORKFLOWS,
         ),
+        (
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            S390X_BINARY_BUILD_WORKFLOWS,
+        ),
         (
             jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
             LINUX_BINARY_SMOKE_WORKFLOWS,