Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vendor nvidia-ml-py-11.515.48 #4109

Merged
merged 27 commits into from
Aug 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
875e66d
vendor nvidia-ml-py-11.515.48
dmitryduev Aug 16, 2022
5b9ffd7
vendor nvidia-ml-py-11.515.48
dmitryduev Aug 16, 2022
fa7443f
Merge branch 'master' of https://github.com/wandb/wandb into bsod
dmitryduev Aug 16, 2022
d3c2ebd
add win/gpu testing
dmitryduev Aug 16, 2022
459af75
add win/gpu testing
dmitryduev Aug 16, 2022
1ae0635
add win/gpu testing
dmitryduev Aug 16, 2022
cd0e436
add win/gpu testing
dmitryduev Aug 16, 2022
388b1e0
add win/gpu testing
dmitryduev Aug 16, 2022
66475b5
add win/gpu testing
dmitryduev Aug 16, 2022
9e0fe73
add win/gpu testing
dmitryduev Aug 16, 2022
96e61ae
unskip relevant tests on win; add note to pynvml
dmitryduev Aug 16, 2022
b53f980
move tests to nightly
dmitryduev Aug 16, 2022
9d677cf
more fixes to the vendored pynvml
dmitryduev Aug 17, 2022
f262453
fix wincovercircle tox testenv
dmitryduev Aug 17, 2022
66ff482
fix win job
dmitryduev Aug 17, 2022
80c29d7
fix win job
dmitryduev Aug 17, 2022
ee11cbb
stop wasting time on a useless test case
dmitryduev Aug 17, 2022
7426112
Merge branch 'master' of https://github.com/wandb/wandb into bsod
dmitryduev Aug 17, 2022
909470d
Merge branch 'master' into bsod
dmitryduev Aug 17, 2022
5831864
Merge branch 'bsod' of https://github.com/wandb/wandb into bsod
dmitryduev Aug 17, 2022
82ed159
fix wincovercircle
dmitryduev Aug 17, 2022
d7af173
fix wincovercircle
dmitryduev Aug 17, 2022
4f2893a
crank up timeout for win tests
dmitryduev Aug 17, 2022
0ebac7f
crank up timeout for win tests
dmitryduev Aug 17, 2022
c5e2d1b
update config.yml
dmitryduev Aug 17, 2022
6fdd136
Merge branch 'master' of https://github.com/wandb/wandb into bsod
dmitryduev Aug 17, 2022
b1cd149
Merge branch 'master' into bsod
dmitryduev Aug 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
111 changes: 84 additions & 27 deletions .circleci/config.yml
@@ -1,9 +1,9 @@
version: 2.1

orbs:
win: circleci/windows@2.4.0
slack: circleci/slack@4.9.3
go: circleci/go@1.3.0
win: circleci/windows@5.0.0
dmitryduev marked this conversation as resolved.
Show resolved Hide resolved
slack: circleci/slack@4.10.1
go: circleci/go@1.7.1
gke: circleci/gcp-gke@1.4.0

parameters:
Expand Down Expand Up @@ -79,6 +79,9 @@ parameters:
manual_nightly_execute_shard_kfp:
type: boolean
default: false
manual_nightly_execute_shard_standalone_gpu_win:
type: boolean
default: false
wandb_server_tag:
type: string
default: "master"
Expand Down Expand Up @@ -323,7 +326,7 @@ jobs:
default: 3
python_version_minor:
type: integer
default: 7
default: 9
toxenv:
type: string
parallelism:
Expand All @@ -332,31 +335,56 @@ jobs:
xdist:
type: integer
default: 3
machine_executor:
type: string
default: "default" # "default" or "server-2019-cuda"
executor_size:
type: string
default: "large" # could only be "medium" for "server-2019-cuda"
execute:
type: boolean
default: true
executor:
name: win/default
size: "large"
name: win/<< parameters.machine_executor >>
size: << parameters.executor_size >>
parallelism: << parameters.parallelism >>
steps:
- checkout
- run:
name: Install python dependencies
command: |
pip install tox==<< pipeline.parameters.tox_version >>
- run:
name: Temporary conda hack
shell: bash.exe
command: |
cp /c/tools/miniconda3/python* /c/tools/miniconda3/lib/venv/scripts/nt/
- run:
name: Run tests
shell: bash.exe
command: |
echo $GCLOUD_SERVICE_KEY > key.json
gcloud auth activate-service-account --key-file=key.json
yes | gcloud auth configure-docker
CI_PYTEST_PARALLEL=<< parameters.xdist >> CI_PYTEST_SPLIT_ARGS="--splits $CIRCLE_NODE_TOTAL --group $(( $CIRCLE_NODE_INDEX + 1 ))" tox -v -e << parameters.toxenv >>
no_output_timeout: 10m
- save-test-results
- when:
condition: << parameters.execute >>
steps:
- run:
name: Install python dependencies
shell: bash.exe
command: |
pip install tox==<< pipeline.parameters.tox_version >>
- run:
name: Temporary conda hack
shell: bash.exe
command: |
cp /c/tools/miniconda3/python* /c/tools/miniconda3/lib/venv/scripts/nt/
- when:
condition:
equal: [ "server-2019-cuda", << parameters.machine_executor >> ]
steps:
- run:
name: Update tox.ini on a GPU machine to install the proper pytorch version
shell: bash.exe
command: |
CUDA_VERSION=`ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA"`
CUDA_VERSION_NO_DOT=`echo "${CUDA_VERSION//./}"`
v=${CUDA_VERSION_NO_DOT:1}
sed -i -e "s/whl\/cpu/whl\/cu$v/g" tox.ini
- run:
name: Run tests
shell: bash.exe
command: |
echo $GCLOUD_SERVICE_KEY > key.json
gcloud auth activate-service-account --key-file=key.json
yes | gcloud auth configure-docker
DATE=$(date -u +%Y%m%d) CI_PYTEST_PARALLEL=<< parameters.xdist >> CI_PYTEST_SPLIT_ARGS="--splits $CIRCLE_NODE_TOTAL --group $(( $CIRCLE_NODE_INDEX + 1 ))" tox -v -e << parameters.toxenv >>
no_output_timeout: 10m
- save-test-results

mac:
parameters:
Expand Down Expand Up @@ -918,6 +946,20 @@ workflows:
context: slack-secrets
notify_on_failure: true
#
# standalone GPU tests on Windows
#
- win:
machine_executor: "server-2019-cuda"
executor_size: "medium"
matrix:
parameters:
python_version_major: [ 3 ]
python_version_minor: [ 9 ]
name: "func-s_standalone_gpu-win-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
toxenv: "standalone-gpu-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
parallelism: 2
xdist: 1
#
# standalone tests on gke
#
- slack_notify:
Expand Down Expand Up @@ -1107,7 +1149,7 @@ workflows:
matrix:
parameters:
python_version_major: [3]
python_version_minor: [7]
python_version_minor: [9]
name: "unit-s_base-win-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
toxenv: "py<<matrix.python_version_major>><<matrix.python_version_minor>>,wincovercircle -- --timeout 300 tests/unit_tests"
- mac:
Expand Down Expand Up @@ -1138,7 +1180,7 @@ workflows:
matrix:
parameters:
python_version_major: [3]
python_version_minor: [7]
python_version_minor: [9]
name: "unit-s_nb-win-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
toxenv: "unit-s_nb-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
parallelism: 1
Expand Down Expand Up @@ -1195,6 +1237,21 @@ workflows:
context: slack-secrets
notify_on_failure: << pipeline.parameters.manual_nightly_slack_notify >>
#
# standalone GPU tests on Windows
#
- win:
machine_executor: "server-2019-cuda"
executor_size: "medium"
execute: << pipeline.parameters.manual_nightly_execute_shard_standalone_gpu_win >>
matrix:
parameters:
python_version_major: [ 3 ]
python_version_minor: [ 9 ]
name: "func-s_standalone_gpu-win-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
toxenv: "standalone-gpu-py<<matrix.python_version_major>><<matrix.python_version_minor>>"
parallelism: 2
xdist: 1
#
# standalone tests on gke
#
- slack_notify:
Expand Down
1 change: 1 addition & 0 deletions tests/standalone_tests/artifact_tests.yea
Expand Up @@ -5,6 +5,7 @@ tag:
platforms:
- linux
- mac
- win
plugin:
- wandb
assert:
Expand Down
1 change: 1 addition & 0 deletions tests/standalone_tests/basic.yea
Expand Up @@ -6,6 +6,7 @@ tag:
platforms:
- linux
- mac
- win
plugin:
- wandb
assert:
Expand Down
1 change: 1 addition & 0 deletions tests/standalone_tests/basic_plots.yea
Expand Up @@ -4,6 +4,7 @@ tag:
platforms:
- linux
- mac
- win
plugin:
- wandb
assert:
Expand Down
1 change: 1 addition & 0 deletions tests/standalone_tests/mixed_keras.yea
Expand Up @@ -4,6 +4,7 @@ tag:
platforms:
- linux
- mac
- win
plugin:
- wandb
assert:
Expand Down
1 change: 1 addition & 0 deletions tests/standalone_tests/tweets.yea
Expand Up @@ -5,6 +5,7 @@ tag:
platforms:
- linux
- mac
- win
plugin:
- wandb
assert:
Expand Down
27 changes: 12 additions & 15 deletions tools/circleci-tool.py
Expand Up @@ -44,6 +44,15 @@

CIRCLECI_API_TOKEN = "CIRCLECI_TOKEN"

NIGHTLY_SHARDS = (
"standalone-cpu",
"standalone-gpu",
"standalone-tpu",
"standalone-local",
"kfp",
"standalone-gpu-win",
)

platforms_dict = dict(linux="test", lin="test", mac="mac", win="win")
platforms_short_dict = dict(linux="lin", lin="lin", mac="mac", win="win")
py_name_dict = dict(
Expand Down Expand Up @@ -154,13 +163,7 @@ def trigger(args):
def trigger_nightly(args):
url = "https://circleci.com/api/v2/project/gh/wandb/wandb/pipeline"

default_shards = {
"standalone-cpu",
"standalone-gpu",
"standalone-tpu",
"standalone-local",
"kfp",
}
default_shards = set(NIGHTLY_SHARDS)
shards = {
f"manual_nightly_execute_shard_{shard.replace('-', '_')}": False
for shard in default_shards
Expand Down Expand Up @@ -320,14 +323,8 @@ def process_args():
)
parse_trigger_nightly.add_argument(
"--shards",
default=(
"standalone-cpu,"
"standalone-gpu,"
"standalone-tpu,"
"standalone-local,"
"kfp"
),
help="comma-separated shards (standalone-{cpu,gpu,tpu,local},kfp)",
default=",".join(NIGHTLY_SHARDS),
help="comma-separated shards (standalone-{cpu,gpu,tpu,local,gpu-win},kfp)",
)
parse_trigger_nightly.add_argument(
"--wait", action="store_true", help="Wait for finish or error"
Expand Down
10 changes: 5 additions & 5 deletions tox.ini
Expand Up @@ -84,7 +84,7 @@ passenv =
commands =
s_nb: ipython kernel install --user --name=wandb_python
mkdir -p test-results
python -m pytest {env:CI_PYTEST_SPLIT_ARGS:} -n={env:CI_PYTEST_PARALLEL:{env:WB_UNIT_PARALLEL:4}} --durations=20 --junitxml=test-results/junit.xml --cov-config=.coveragerc --cov --cov-report= --no-cov-on-fail {posargs:tests/unit_tests_old/tests_{env:WB_UNIT_SHARD}/}
python -m pytest {env:CI_PYTEST_SPLIT_ARGS:} -n={env:CI_PYTEST_PARALLEL:{env:WB_UNIT_PARALLEL:4}} --durations=20 --junitxml=test-results/junit.xml --cov-config=.coveragerc --cov --cov-report= --no-cov-on-fail --timeout 300 {posargs:tests/unit_tests_old/tests_{env:WB_UNIT_SHARD}/}

[testenv:dev]
usedevelop = true
Expand Down Expand Up @@ -384,11 +384,11 @@ whitelist_externals =
bash.exe
commands =
bash.exe -c 'mkdir -p cover-results'
bash.exe -c 'C:/Users/circleci/project/.tox/wincovercircle/Scripts/python -m coverage combine C:/Users/circleci/project/.tox/py37/.coverage*'
bash.exe -c 'C:/Users/circleci/project/.tox/wincovercircle/Scripts/python -m coverage xml'
bash.exe -c '~/project/.tox/wincovercircle/Scripts/python.exe -m coverage combine ~/project/.tox/py39/.coverage*'
dmitryduev marked this conversation as resolved.
Show resolved Hide resolved
bash.exe -c '~/project/.tox/wincovercircle/Scripts/python.exe -m coverage xml'
bash.exe -c 'cp .coverage coverage.xml cover-results/'
bash.exe -c 'C:/Users/circleci/project/.tox/wincovercircle/Scripts/python -m coverage report --ignore-errors --skip-covered --omit "wandb/vendor/*"'
bash.exe -c 'C:/Users/circleci/project/.tox/wincovercircle/Scripts/python -m codecov -e TOXENV -F unittest'
bash.exe -c '~/project/.tox/wincovercircle/Scripts/python.exe -m coverage report --ignore-errors --skip-covered --omit "wandb/vendor/*"'
bash.exe -c '~/project/.tox/wincovercircle/Scripts/python.exe -m codecov -e TOXENV -F unittest'

[testenv:cover]
skip_install = true
Expand Down
2 changes: 1 addition & 1 deletion wandb/sdk/internal/meta.py
Expand Up @@ -180,7 +180,7 @@ def _setup_sys(self):
pynvml.nvmlInit()
self.data["gpu"] = pynvml.nvmlDeviceGetName(
pynvml.nvmlDeviceGetHandleByIndex(0)
).decode("utf8")
)
self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
except pynvml.NVMLError:
pass
Expand Down