Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
[ghstack-poisoned]
  • Loading branch information
jansel committed May 12, 2024
2 parents 1a297b5 + fa30627 commit eb21d0b
Show file tree
Hide file tree
Showing 1,791 changed files with 19,867 additions and 260,802 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
Expand Up @@ -366,7 +366,7 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi

# Build image
DOCKER_BUILDKIT=1 docker build \
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/centos-rocm/Dockerfile
Expand Up @@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
6 changes: 2 additions & 4 deletions .ci/docker/common/install_vision.sh
Expand Up @@ -5,8 +5,7 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libopencv-dev \
libavcodec-dev
libopencv-dev

# Cleanup
apt-get autoclean && apt-get clean
Expand All @@ -19,8 +18,7 @@ install_centos() {
yum --enablerepo=extras install -y epel-release

yum install -y \
opencv-devel \
ffmpeg-devel
opencv-devel

# Cleanup
yum clean all
Expand Down
6 changes: 4 additions & 2 deletions .ci/docker/requirements-ci.txt
Expand Up @@ -279,9 +279,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:

jinja2==3.1.3
jinja2==3.1.4
#Description: jinja2 template engine
#Pinned versions: 3.1.3
#Pinned versions: 3.1.4
#test that import:

pytest-cpp==2.3.0
Expand Down Expand Up @@ -310,3 +310,5 @@ lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries

PyGithub==2.3.0
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-cuda/Dockerfile
Expand Up @@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-rocm/Dockerfile
Expand Up @@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-xpu/Dockerfile
Expand Up @@ -83,7 +83,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu/Dockerfile
Expand Up @@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
5 changes: 4 additions & 1 deletion .ci/pytorch/multigpu-test.sh
Expand Up @@ -45,7 +45,10 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state.py
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state

# FSDP2 tests
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
Expand Down
41 changes: 24 additions & 17 deletions .ci/pytorch/test.sh
Expand Up @@ -310,23 +310,23 @@ test_dynamo_shard() {
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
pytest test/distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration
python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
# with if required # gpus aren't available
Expand Down Expand Up @@ -522,6 +522,11 @@ test_single_dynamo_benchmark() {
fi
}

test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}

test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
Expand Down Expand Up @@ -1209,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
Expand Down
9 changes: 7 additions & 2 deletions .circleci/scripts/binary_linux_test.sh
Expand Up @@ -96,8 +96,13 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions
fi
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
pkg="\$(ls /final_pkgs/*-latest.zip)"
Expand Down
2 changes: 2 additions & 0 deletions .github/actionlint.yaml
Expand Up @@ -13,6 +13,7 @@ self-hosted-runner:
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
- linux.g5.4xlarge.nvidia.gpu
- linux.s390x
- windows.4xlarge.nonephemeral
- windows.8xlarge.nvidia.gpu
- windows.8xlarge.nvidia.gpu.nonephemeral
Expand All @@ -21,6 +22,7 @@ self-hosted-runner:
- linux.rocm.gpu
- macos-m1-stable
- macos-m1-13
- macos-m1-14
- macos-12-xl
- macos-12
- macos12.3-m1
Expand Down
11 changes: 10 additions & 1 deletion .github/actions/test-pytorch-binary/action.yml
Expand Up @@ -35,7 +35,7 @@ runs:
"${DOCKER_IMAGE}"
)
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
Expand All @@ -44,3 +44,12 @@ runs:
# Generate test script
docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
- name: Cleanup docker
if: always() && env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x stop the container for clean worker stop
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
06ad737628abc3a1e617571dc03cbdd5b36ea96a
d23a6e1664d20707c11781299611436e1f0c104f
2 changes: 2 additions & 0 deletions .github/merge_rules.yaml
Expand Up @@ -29,10 +29,12 @@
approved_by:
- BowenBao
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
mandatory_checks_name:
- EasyCLA
- Lint
Expand Down
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Expand Up @@ -8,6 +8,7 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
Expand Down
2 changes: 1 addition & 1 deletion .github/requirements-gha-cache.txt
Expand Up @@ -5,7 +5,7 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.19.12
jinja2==3.1.3
jinja2==3.1.4
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
Expand Down
6 changes: 5 additions & 1 deletion .github/scripts/amd/patch_triton_wheel.sh
@@ -1,7 +1,11 @@
#!/bin/bash
set -x

WHEELHOUSE_DIR=/artifacts
if [ -z "$1" ]; then
echo "Need wheel location argument" && exit 1
fi

WHEELHOUSE_DIR=$1
PATCHELF_BIN=patchelf
ROCM_LIB=backends/amd/lib
ROCM_LD=backends/amd/llvm/bin
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/build_triton_wheel.py
Expand Up @@ -157,10 +157,10 @@ def build_triton(

if build_rocm:
check_call(
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh"],
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
cwd=triton_basedir,
shell=True,
)

return Path.cwd() / whl_path.name


Expand Down
16 changes: 14 additions & 2 deletions .github/scripts/generate_binary_build_matrix.py
Expand Up @@ -31,6 +31,9 @@
CPU_AARCH64_ARCH = ["cpu-aarch64"]


CPU_S390X_ARCH = ["cpu-s390x"]


PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
Expand Down Expand Up @@ -130,6 +133,8 @@ def arch_type(arch_version: str) -> str:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
return "cpu-aarch64"
elif arch_version in CPU_S390X_ARCH:
return "cpu-s390x"
else: # arch_version should always be "cpu" in this case
return "cpu"

Expand All @@ -149,6 +154,7 @@ def arch_type(arch_version: str) -> str:
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
}

CONDA_CONTAINER_IMAGES = {
Expand Down Expand Up @@ -205,6 +211,7 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cpu": "cpu",
"cpu-aarch64": "cpu",
"cpu-cxx11-abi": "cpu-cxx11-abi",
"cpu-s390x": "cpu",
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"rocm": f"rocm{gpu_arch_version}",
}.get(gpu_arch_type, gpu_arch_version)
Expand Down Expand Up @@ -306,8 +313,8 @@ def generate_wheels_matrix(
python_versions: Optional[List[str]] = None,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64":
# NOTE: We only build manywheel packages for x86_64 and aarch64 linux
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
# NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
package_type = "manywheel"

if python_versions is None:
Expand All @@ -324,6 +331,10 @@ def generate_wheels_matrix(
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-aarch64"]
elif os == "linux-s390x":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-s390x"]

ret: List[Dict[str, str]] = []
for python_version in python_versions:
Expand All @@ -334,6 +345,7 @@ def generate_wheels_matrix(
if arch_version == "cpu"
or arch_version == "cpu-cxx11-abi"
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
else arch_version
)

Expand Down
19 changes: 19 additions & 0 deletions .github/scripts/generate_ci_workflows.py
Expand Up @@ -95,6 +95,7 @@ class OperatingSystem:
MACOS = "macos"
MACOS_ARM64 = "macos-arm64"
LINUX_AARCH64 = "linux-aarch64"
LINUX_S390X = "linux-s390x"


LINUX_BINARY_BUILD_WORFKLOWS = [
Expand Down Expand Up @@ -332,6 +333,20 @@ class OperatingSystem:
),
]

S390X_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.LINUX_S390X,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX_S390X
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
),
),
]


def main() -> None:
jinja_env = jinja2.Environment(
Expand All @@ -350,6 +365,10 @@ def main() -> None:
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
AARCH64_BINARY_BUILD_WORKFLOWS,
),
(
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
S390X_BINARY_BUILD_WORKFLOWS,
),
(
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
LINUX_BINARY_SMOKE_WORKFLOWS,
Expand Down

0 comments on commit eb21d0b

Please sign in to comment.