[GitHub Runner] Fix flax runner #13357

Merged · 3 commits · Aug 31, 2021
110 changes: 54 additions & 56 deletions .github/workflows/self-push.yml
@@ -106,9 +106,9 @@ jobs:
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

# - name: Fetch the tests to run
# run: |
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt

- name: Report fetched tests
uses: actions/upload-artifact@v2
@@ -118,10 +118,9 @@ jobs:

- name: Run all non-slow tests on GPU
run: |
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu
# if [ -f test_list.txt ]; then
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
# fi
if [ -f test_list.txt ]; then
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
fi
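
Taken together, the re-enabled steps restore test-fetcher gating for the single-GPU Flax job: fetch the tests touched by the last commit, then run pytest only when a list was produced. A minimal shell sketch of the combined flow, assuming (as the step above implies) that tests_fetcher.py also leaves the filtered list in test_list.txt alongside the test_preparation.txt log:

# Assumption: tests_fetcher.py writes test_list.txt as a side effect of the fetch step.
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
if [ -f test_list.txt ]; then
    # Run only the affected tests; the step is a no-op when nothing matched.
    python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
fi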

- name: Failure short reports
if: ${{ failure() }}
@@ -251,61 +250,60 @@ jobs:
name: run_all_tests_torch_multi_gpu_test_reports
path: reports

run_tests_flax_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Install dependencies
run: |
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
pip install --upgrade pip
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]

- name: Launcher docker
uses: actions/checkout@v2
with:
fetch-depth: 2

- name: NVIDIA-SMI
continue-on-error: true
run: |
nvidia-smi

- name: Are GPUs recognized by our DL frameworks
run: |
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

# run_tests_flax_multi_gpu:
# runs-on: [self-hosted, docker-gpu, multi-gpu]
# container:
# image: tensorflow/tensorflow:2.4.1-gpu
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
# steps:
# - name: Install dependencies
# run: |
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
# pip install --upgrade pip
# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
#
# - name: Launcher docker
# uses: actions/checkout@v2
# with:
# fetch-depth: 2
#
# - name: NVIDIA-SMI
# continue-on-error: true
# run: |
# nvidia-smi
#
# - name: Are GPUs recognized by our DL frameworks
# run: |
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
# - name: Fetch the tests to run
# run: |
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt

- name: Report fetched tests
uses: actions/upload-artifact@v2
with:
name: test_fetched
path: test_preparation.txt

- name: Run all non-slow tests on GPU
run: |
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu
#
# - name: Report fetched tests
# uses: actions/upload-artifact@v2
# with:
# name: test_fetched
# path: test_preparation.txt
#
# - name: Run all non-slow tests on GPU
# run: |
# if [ -f test_list.txt ]; then
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
# fi

- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_flax_multi_gpu_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_flax_multi_gpu_test_reports
path: reports
#
# - name: Failure short reports
# if: ${{ failure() }}
# run: cat reports/tests_flax_multi_gpu_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v2
# with:
# name: run_all_tests_flax_multi_gpu_test_reports
# path: reports

# run_tests_tf_multi_gpu:
# runs-on: [self-hosted, docker-gpu, multi-gpu]
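The "Are GPUs recognized by our DL frameworks" step that recurs in these jobs reduces to two JAX one-liners. A sketch of what they verify follows; the outputs shown are what JAX of this vintage typically prints on a working single-GPU CUDA host, not captured from this run:

# Backend platform: 'gpu' when the CUDA build is active, 'cpu' if JAX fell back.
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
# GPU available: gpu
# Local device count: expected to be 1 on the single-gpu runners, more on multi-gpu.
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
# Number of GPUs available: 1
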
79 changes: 39 additions & 40 deletions .github/workflows/self-scheduled.yml
@@ -86,7 +86,7 @@ jobs:
path: reports

run_all_tests_flax_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
runs-on: [self-hosted, docker-gpu-test, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -291,45 +291,44 @@ jobs:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports

run_all_tests_flax_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2

- name: NVIDIA-SMI
continue-on-error: true
run: |
nvidia-smi

- name: Install dependencies
run: |
pip install --upgrade pip
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]

- name: Are GPUs recognized by our DL frameworks
run: |
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"

- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests

- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_flax_gpu_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_flax_gpu_test_reports
path: reports
# run_all_tests_flax_multi_gpu:
# runs-on: [self-hosted, docker-gpu, multi-gpu]
# container:
# image: tensorflow/tensorflow:2.4.1-gpu
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
# steps:
# - name: Launcher docker
# uses: actions/checkout@v2
#
# - name: NVIDIA-SMI
# run: |
# nvidia-smi
#
# - name: Install dependencies
# run: |
# pip install --upgrade pip
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
# pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#
# - name: Are GPUs recognized by our DL frameworks
# run: |
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
# - name: Run all tests on GPU
# run: |
# python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
# - name: Failure short reports
# if: ${{ always() }}
# run: cat reports/tests_flax_gpu_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v2
# with:
# name: run_all_tests_flax_gpu_test_reports
# path: reports
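
One note on the report plumbing these jobs rely on: --make-reports is not a stock pytest option; it is registered by transformers' own conftest.py and writes per-suite text reports under reports/, which the follow-up steps read and upload. A sketch of the chain, assuming that conftest behavior:

# --make-reports=<id> (a transformers conftest flag, not built into pytest)
# writes reports/<id>_*.txt, including a short failure summary.
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
# The "Failure short reports" step then just prints it:
cat reports/tests_flax_gpu_failures_short.txt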

run_all_tests_torch_cuda_extensions_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]