Skip to content

Commit

Permalink
[pytorch] [trcomp] Updated binaries with CVE fix for PT.12 SM Trainin…
Browse files Browse the repository at this point in the history
…g Compiler (#2478)

* Add Dockerfile trcomp

* Update SMDDP binary

* Add native dockerfile

* Change NCCL version

* Update NCCL version

* Add NCCL Version

* Add Dockerfile trcomp

* Formatting Changes

* Add SM Training Toolkit and PT Toolkit

* set NCCL to 2.12.12 and update pre-built binaries

* Trigger Build

* Updated PT binary with race condition fix

* updated wheels after random seed bugfix

* updated docker import to reflect build target is sagemaker

* set sagemaker_remote_tests flag to standard

* Update Huggingface to include fixes for CV distributed training.

* removing unused packages

* skip trcomp for smmodelparallel pytorch tests

* Trigger Build

* remove HF transformers/datasets and other packages not in base PT 1.12 dlc

* skip smmodelparallel tests

* CVE fix: upgrade protobuf

* enable regular pytorch test

* fix framework version

* add back fixtures

* fix telemetry test

* skip trcomp incompatible tests for non huggingface scenario

Signed-off-by: Harish Tummalacherla <hartum@amazon.com>

* skip torch data tests for trcomp as we use huggingface datasets

* disable horovod test for trcomp

* fix issue with horovod test

* consume the latest pt 1.12.x as base dlc

* skip unsupported tests for trcomp

* reinstall Horovod for trcomp

* fix imports

* fix debugger tests

* fix imports

* fix imports

* nit change

* fix pip check issues

* fix uninstall

* fix framework name for trcomp images

* fix typo

* fix version check

* upgrade openssl

* fix pip check

* fix regex

* skip pip check for trcomp

* another attempt at regex

* another attempt at regex

* revert dlc developer config

* comments, updated skipped tests logging and reverted dev config

* updated wheels with CVE fix

* dev config flags to run tests

* fix ecr scan

* revert temp changes

Signed-off-by: Harish Tummalacherla <hartum@amazon.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-39-98.us-west-2.compute.internal>
Co-authored-by: dasritwi <dasritwi@amazon.com>
Co-authored-by: Ritwik Das <ritwikdas54@gmail.com>
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
Co-authored-by: tejaschumbalkar <tejaschumbalkar@gmail.com>
Co-authored-by: Harish Tummalacherla <hartum@amazon.com>
  • Loading branch information
7 people committed Dec 3, 2022
1 parent 5258437 commit cab3451
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 2 deletions.
@@ -0,0 +1,58 @@
{
"torch": [
{
"description": "[torch](https://pypi.org/project/torch) is a Tensors and Dynamic neural networks in Python with strong GPU acceleration\n\nAffected versions of this package are vulnerable to Command Injection in `torch.jit.annotations.parse_type_line` which can cause arbitrary code execution because `eval` is used unsafely.",
"vulnerability_id": "SNYK-PYTHON-TORCH-3149871",
"name": "SNYK-PYTHON-TORCH-3149871",
"package_name": "torch",
"package_details": {
"file_path": "opt/conda/lib/python3.8/site-packages/torch-1.12.0+cu113.dist-info/METADATA",
"name": "torch",
"package_manager": "PYTHONPKG",
"version": "1.12.0+cu113",
"release": null
},
"remediation": {
"recommendation": {
"text": "None Provided"
}
},
"cvss_v3_score": 9.8,
"cvss_v30_score": 0.0,
"cvss_v31_score": 9.8,
"cvss_v2_score": 0.0,
"cvss_v3_severity": "CRITICAL",
"source_url": "https://snyk.io/vuln/SNYK-PYTHON-TORCH-3149871",
"source": "SNYK",
"severity": "CRITICAL",
"status": "ACTIVE",
"title": "IN1-PYTHON-TORCH-3149871 - torch"
}, {
"description": " In PyTorch before trunk/89695, torch.jit.annotations.parse_type_line can cause arbitrary code execution because eval is used unsafely.",
"vulnerability_id": "CVE-2022-45907",
"name": "CVE-2022-45907",
"package_name": "torch",
"package_details": {
"file_path": "opt/conda/lib/python3.8/site-packages/torch-1.12.0+cu113.dist-info/METADATA",
"name": "torch",
"package_manager": "PYTHONPKG",
"version": "1.12.0+cu113",
"release": null
},
"remediation": {
"recommendation": {
"text": "None Provided"
}
},
"cvss_v3_score": 9.8,
"cvss_v30_score": 0.0,
"cvss_v31_score": 9.8,
"cvss_v2_score": 0.0,
"cvss_v3_severity": "CRITICAL",
"source_url": "https://people.canonical.com/~ubuntu-security/cve/2022/CVE-2022-45907.html",
"source": "UBUNTU_CVE",
"severity": "MEDIUM",
"status": "ACTIVE",
"title": "CVE-2022-45907 - torch"
}]
}
3 changes: 2 additions & 1 deletion pytorch/training/docker/1.12/py3/cu113/Dockerfile.trcomp.gpu
Expand Up @@ -35,7 +35,8 @@ LABEL dlc_major_version="1"
# Version args - overwritten by args specified in buildspec
ARG PYTHON=python3

ARG PT_BUCKET=https://aws-pytorch-unified-cicd-binaries.s3.us-west-2.amazonaws.com/trcomp/r1.12.0_sm/20221110-064048/f77b79fa779a512c319d20e42f59bbd3559bd16a

ARG PT_BUCKET=https://aws-pytorch-unified-cicd-binaries.s3.us-west-2.amazonaws.com/trcomp/r1.12.0_sm/20221202-082049/72513603d1a1d74129e980c4935de5bcd0959691
ARG PT_URL=${PT_BUCKET}/torch-1.12.0%2Bcu113-cp38-cp38-linux_x86_64.whl
ARG PT_XLA_URL=${PT_BUCKET}/torch_xla-1.12-cp38-cp38-linux_x86_64.whl
ARG TORCHVISION_URL=${PT_BUCKET}/torchvision-0.13.0a0%2Bda3794e-cp38-cp38-linux_x86_64.whl
Expand Down
3 changes: 3 additions & 0 deletions test/dlc_tests/sanity/test_pre_release.py
Expand Up @@ -441,6 +441,9 @@ def _run_dependency_check_test(image, ec2_connection):
"1.12": ["cpu", "gpu", "hpu"],
"1.13": ["cpu", "gpu", "hpu"],
},
"pytorch_trcomp": {
"1.12": ["gpu"],
},
"huggingface_pytorch": {"1.8": ["cpu", "gpu"], "1.9": ["cpu", "gpu"]},
"huggingface_tensorflow": {"2.4": ["cpu", "gpu"], "2.5": ["cpu", "gpu"], "2.6": ["cpu", "gpu"]},
"huggingface_tensorflow_trcomp": {"2.6": ["gpu"]},
Expand Down
7 changes: 6 additions & 1 deletion test/test_utils/__init__.py
Expand Up @@ -287,7 +287,10 @@ def get_expected_dockerfile_filename(device_type, image_uri):
if "graviton" in image_uri:
return f"Dockerfile.graviton.{device_type}"
elif is_ec2_sm_in_same_dockerfile(image_uri):
return f"Dockerfile.{device_type}"
if "pytorch-trcomp-training" in image_uri:
return f"Dockerfile.trcomp.{device_type}"
else:
return f"Dockerfile.{device_type}"
elif is_ec2_image(image_uri):
return f"Dockerfile.ec2.{device_type}"
else:
Expand Down Expand Up @@ -464,6 +467,7 @@ def is_covered_by_ec2_sm_split(image_uri):
ec2_sm_split_images = {
"pytorch": SpecifierSet(">=1.10.0"),
"tensorflow": SpecifierSet(">=2.7.0"),
"pytorch_trcomp": SpecifierSet(">=1.12.0"),
}
framework, version = get_framework_and_version_from_tag(image_uri)
return framework in ec2_sm_split_images and Version(version) in ec2_sm_split_images[framework]
Expand All @@ -473,6 +477,7 @@ def is_ec2_sm_in_same_dockerfile(image_uri):
same_sm_ec2_dockerfile_record = {
"pytorch": SpecifierSet(">=1.11.0"),
"tensorflow": SpecifierSet(">=2.8.0"),
"pytorch_trcomp": SpecifierSet(">=1.12.0"),
}
framework, version = get_framework_and_version_from_tag(image_uri)
return framework in same_sm_ec2_dockerfile_record and Version(version) in same_sm_ec2_dockerfile_record[framework]
Expand Down

0 comments on commit cab3451

Please sign in to comment.