-
Notifications
You must be signed in to change notification settings - Fork 3.3k
/
gpu-tests-fabric.yml
160 lines (146 loc) · 5.65 KB
/
gpu-tests-fabric.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
# CI trigger: run on every pushed tag and on pushes to the listed branch patterns.
trigger:
  tags:
    include: ["*"]
  branches:
    include: ["master", "release/*", "refs/tags/*"]
# Pull-request trigger: limited to PRs against the branch patterns below and
# filtered by the changed paths (`include` list minus the `exclude` list).
pr:
  branches:
    include: ["master", "release/*"]
  paths:
    include:
      # CI tooling and this pipeline definition itself
      - ".actions/**"
      - ".azure/gpu-tests-fabric.yml"
      # examples and the scripts that drive them / the standalone tests
      - "examples/fabric/**"
      - "examples/run_fabric_examples.sh"
      - "tests/tests_fabric/run_standalone_*.sh"
      - "tests/tests_pytorch/run_standalone_tests.sh"  # used by fabric through a symlink
      # dependencies and package sources
      - "requirements/fabric/**"
      - "src/lightning/__about__.py"
      - "src/lightning/__init__.py"
      - "src/lightning/__main__.py"
      - "src/lightning/__setup__.py"
      - "src/lightning/__version__.py"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      # tests and their configuration
      - "tests/tests_fabric/**"
      - "pyproject.toml"  # includes pytest config
    exclude:
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"
jobs:
  # Single GPU job, expanded by `strategy.matrix` into one run per PACKAGE_NAME
  # ("fabric" and "lightning"). Each run installs the package inside the CUDA
  # container, runs the Fabric tests with coverage, uploads the coverage report
  # and finally runs the Fabric examples.
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "20"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: lit-rtx-3090
    variables:
      # Not a literal value: this is a shell command substitution that bash evaluates
      # wherever the DEVICES macro is expanded into a script. It prints the text after
      # the last "_" of the agent name.
      # NOTE(review): presumably the GPU ids assigned to this agent - confirm.
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
    container:
      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.7.1"
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
      # /var/tmp is mounted from the host so that PIP_CACHE_DIR above lives on the host.
      options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
    strategy:
      matrix:
        'pkg: Fabric':
          PACKAGE_NAME: "fabric"
        'pkg: Lightning':
          PACKAGE_NAME: "lightning"
    workspace:
      clean: all
    steps:
      # Publish pipeline variables for the following steps through
      # `##vso[task.setvariable]` logging commands:
      #   CUDA_VISIBLE_DEVICES - output of the DEVICES command substitution
      #   CUDA_VERSION_MM      - major+minor CUDA version of the installed torch, digits only
      #   TORCH_URL            - PyTorch wheel index page for that CUDA version
      #   COVERAGE_SOURCE      - "lightning_fabric" when PACKAGE_NAME is "fabric",
      #                          otherwise PACKAGE_NAME unchanged
      - bash: |
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
          cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
          scope=$( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
          echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
        displayName: 'set env. vars'

      # Diagnostics only: print the variables set above plus GPU / Python / pip details.
      # `grep -E` replaces the deprecated `egrep` alias.
      - bash: |
          echo $(DEVICES)
          echo $CUDA_VISIBLE_DEVICES
          echo $CUDA_VERSION_MM
          echo $TORCH_URL
          echo $COVERAGE_SOURCE
          lspci | grep -E 'VGA|3D'
          whereis nvidia
          nvidia-smi
          which python && which pip
          python --version
          pip --version
          pip list
        displayName: 'Image info & NVIDIA'

      # Run adjust-versions.py on every matched requirements file, passing the torch
      # version already installed in the image (local "+..." suffix removed).
      # The glob is iterated directly instead of parsing `ls` output, and the
      # arguments are quoted so paths are passed through unmodified.
      - bash: |
          PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          for fpath in requirements/**/*.txt; do
            python ./requirements/pytorch/adjust-versions.py "$fpath" "${PYTORCH_VERSION}"
          done
        displayName: 'Adjust dependencies'

      # The extras spec is quoted so bash cannot treat `[...]` as a glob bracket expression.
      - bash: pip install -e ".[dev,strategies,examples]" -U --find-links "${TORCH_URL}"
        displayName: 'Install package & dependencies'

      # Fails the job early (set -e) unless exactly 2 GPUs are visible to torch.
      - bash: |
          set -e
          pip list
          python requirements/collect_env_details.py
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
        displayName: 'Env details'

      # Only for the "fabric" matrix entry: run pytest on the lightning_fabric sources.
      - bash: python -m pytest lightning_fabric
        workingDirectory: src
        condition: eq(variables['PACKAGE_NAME'], 'fabric')
        displayName: 'Testing: Fabric doctests'

      # Only for the "fabric" matrix entry: have the assistant script replace
      # `lightning.fabric` imports with `lightning_fabric` in the tests and examples.
      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
        condition: eq(variables['PACKAGE_NAME'], 'fabric')
        displayName: 'Adjust tests & examples'

      # Main test run under coverage, limited to 10 minutes.
      # NOTE(review): PL_RUN_CUDA_TESTS presumably enables the CUDA-only tests - confirm.
      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
        workingDirectory: tests/tests_fabric
        env:
          PL_RUN_CUDA_TESTS: "1"
        displayName: 'Testing: fabric standard'
        timeoutInMinutes: "10"

      # Standalone tests through the shared runner script, limited to 10 minutes.
      - bash: bash run_standalone_tests.sh
        workingDirectory: tests/tests_fabric
        env:
          PL_RUN_CUDA_TESTS: "1"
          PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
        displayName: 'Testing: fabric standalone tests'
        timeoutInMinutes: "10"

      # Produce text / XML / HTML coverage reports, then upload with the Codecov uploader.
      - bash: |
          python -m coverage report
          python -m coverage xml
          python -m coverage html
          # https://docs.codecov.com/docs/codecov-uploader
          curl -Os https://uploader.codecov.io/latest/linux/codecov
          chmod +x codecov
          ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
            --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
          ls -l
        workingDirectory: tests/tests_fabric
        displayName: 'Statistics'

      # Run the examples on one GPU, then on two GPUs with DDP; stop at the first failure.
      # Uses `bash:` like every other step in this job.
      - bash: |
          set -e
          bash run_fabric_examples.sh --accelerator=cuda --devices=1
          bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
        workingDirectory: examples
        displayName: 'Testing: fabric examples'