diff --git a/.drone.yml b/.drone.yml index c87130844c040..91ccba28a1175 100644 --- a/.drone.yml +++ b/.drone.yml @@ -30,15 +30,23 @@ steps: MKL_THREADING_LAYER: GNU commands: + - set -e - python --version - pip --version - nvidia-smi - - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 -v --no-cache-dir + - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed --no-cache-dir + - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" - # todo: temprarl fix till https://github.com/PyTorchLightning/pytorch-lightning/pull/4922 is resolved - - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda100<0.27" --upgrade-strategy only-if-needed + - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list + # todo: remove unzip install after new nightly docker is created + - apt-get update -qq + - apt-get install -y --no-install-recommends unzip + # get legacy checkpoints + - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ + - unzip -o legacy/checkpoints.zip -d legacy/ + - ls -l legacy/checkpoints/ + # testing... - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8 # Running special tests - sh tests/special_tests.sh diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index 3fa357ef062ca..828f45aedbecc 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -1,14 +1,14 @@ # How to become a core contributor -Thanks for your interest in joining the Lightning team! We’re a rapidly growing project which is poised to become the go-to framework for DL researchers! -We're currently recruiting for a team of 5 core maintainers. +Thanks for your interest in joining the Lightning team! We’re a rapidly growing project which is poised to become the go-to framework for DL researchers! +We're currently recruiting for a team of 5 core maintainers. As a core maintainer you will have a strong say in the direction of the project. Big changes will require a majority of maintainers to agree. -### Code of conduct +### Code of conduct First and foremost, you'll be evaluated against [these core values](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md). Any code we commit or feature we add needs to align with those core values. -### The bar for joining the team +### The bar for joining the team Lightning is being used to solve really hard problems at the top AI labs in the world. As such, the bar for adding team members is extremely high. Candidates must have solid engineering skills, have a good eye for user experience, and must be a power user of Lightning and PyTorch. With that said, the Lightning team will be diverse and a reflection of an inclusive AI community. You don't have to be an engineer to contribute! Scientists with great usability intuition and PyTorch ninja skills are welcomed! @@ -36,10 +36,10 @@ Pleasant/helpful tone. - Code is NOT overly engineered or hard to read - Ask yourself, could a non-engineer understand what’s happening here?
- Make sure new tests are written -- Is this NECESSARY for Lightning? There are some PRs which are just purely about adding engineering complexity which have no place in Lightning. +- Is this NECESSARY for Lightning? There are some PRs which are just purely about adding engineering complexity which have no place in Lightning. Guidance - Some other PRs are for people who are wanting to get involved and add something unnecessary. We do want their help though! So don’t approve the PR, but direct them to a Github issue that they might be interested in helping with instead! -- To be considered for core contributor, please review 10 PRs and help the authors land it on master. Once you've finished the review, ping me +- To be considered for core contributor, please review 10 PRs and help the authors land it on master. Once you've finished the review, ping me for a sanity check. At the end of 10 PRs if your PR reviews are inline with expectations described above, then you can merge PRs on your own going forward, otherwise we'll do a few more until we're both comfortable :) @@ -47,15 +47,15 @@ otherwise we'll do a few more until we're both comfortable :) There are some big decisions which the project must make. For these I expect core contributors to have something meaningful to add if it’s their area of expertise. #### Diversity -Lightning should reflect the broader community it serves. As such we should have scientists/researchers from -different fields contributing! +Lightning should reflect the broader community it serves. As such we should have scientists/researchers from +different fields contributing! The first 5 core contributors will fit this profile. Thus if you overlap strongly with experiences and expertise as someone else on the team, you might have to wait until the next set of contributors are added. #### Summary: Requirements to apply The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -- Solve 10+ Github issues. +- Solve 10+ Github issues. - Create 5+ meaningful PRs which solves some reported issue - bug, - Perform 10+ PR reviews from other contributors. diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ddae1ea8a951c..cef062516b0eb 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -10,11 +10,15 @@ assignees: '' -## Please reproduce using [the BoringModel and post here](https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing) +## Please reproduce using the BoringModel + ### To Reproduce + +Use following [**BoringModel**](https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing) and post here + ### Expected behavior diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 2b249089657c8..e78df92a18bab 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -12,7 +12,7 @@ assignees: '' For typos and doc fixes, please go ahead and: 1. Create an issue. -2. Fix the typo. +2. Fix the typo. 3. Submit a PR. Thanks! 
diff --git a/.github/ISSUE_TEMPLATE/how-to-question.md b/.github/ISSUE_TEMPLATE/how-to-question.md index 2a307e18de5c7..786244d2f5e74 100644 --- a/.github/ISSUE_TEMPLATE/how-to-question.md +++ b/.github/ISSUE_TEMPLATE/how-to-question.md @@ -9,10 +9,10 @@ assignees: '' ## ❓ Questions and Help -### Before asking: +### Before asking: 1. Try to find answers to your questions in [the Lightning Forum!](https://forums.pytorchlightning.ai/) -2. Search for similar [issues](https://github.com/PyTorchLightning/pytorch-lightning/issues). -3. Search the [docs](https://pytorch-lightning.readthedocs.io/en/latest/). +2. Search for similar [issues](https://github.com/PyTorchLightning/pytorch-lightning/issues). +3. Search the [docs](https://pytorch-lightning.readthedocs.io/en/latest/). @@ -20,7 +20,7 @@ assignees: '' #### Code - + #### What have you tried? diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4263a76fb16ae..ada6c6b8c62bd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,11 @@ ## What does this PR do? -Fixes # (issue) +Fixes # (issue) <- this [links related issue to this PR](https://docs.github.com/en/free-pro-team@latest/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) ## Before submitting -- [ ] Was this discussed/approved via a Github issue? (no need for typos and docs improvements) -- [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), Pull Request section? -- [ ] Did you make sure your PR does only one thing, instead of bundling different changes together? Otherwise, we ask you to create a separate PR for every change. -- [ ] Did you make sure to update the documentation with your changes? -- [ ] Did you write any new necessary tests? +- [ ] Was this discussed/approved via a GitHub issue? (not for typos and docs) +- [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), **Pull Request** section? +- [ ] Did you make sure your PR does only one thing, instead of bundling different changes together? +- [ ] Did you make sure to update the documentation with your changes? (if necessary) +- [ ] Did you write any new necessary tests? (not for typos and docs) - [ ] Did you verify new and existing tests pass locally with your changes? -- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? +- [ ] Did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? (not for typos, docs, test updates, or internal minor changes/refactorings) ## PR review Anyone in the community is free to review the PR once the tests have passed. -Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet-list: +Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet-list: - [ ] Is this pull request ready for review? 
(if not, please submit in draft mode) - [ ] Check that all items from **Before submitting** are resolved - [ ] Make sure the title is self-explanatory and the description concisely explains the PR - - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified; _Bugfixes should be including in bug-fix release milestones (m.f.X) and features should be included in (m.X.b) releases._ - + - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified + - [ ] **Check that target branch and milestone match!** + ## Did you have fun? Make sure you had fun coding 🙃 diff --git a/.github/prepare-nightly_pkg-name.py b/.github/prepare-nightly_pkg-name.py deleted file mode 100644 index b85f6049ac140..0000000000000 --- a/.github/prepare-nightly_pkg-name.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import re - -PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) - -PATH_SETUP = os.path.join(PATH_ROOT, 'setup.py') -print(f"rename package '{PATH_SETUP}'") -with open(PATH_SETUP, 'r') as fp: - setup = fp.read() -setup = re.sub(r'name=[\'"]pytorch-lightning[\'"]', 'name="pytorch-lightning-nightly"', setup) -with open(PATH_SETUP, 'w') as fp: - fp.write(setup) diff --git a/.github/prepare-nightly_version.py b/.github/prepare-nightly_version.py index 22b72c8d6803c..f830cc469905c 100644 --- a/.github/prepare-nightly_version.py +++ b/.github/prepare-nightly_version.py @@ -2,15 +2,15 @@ import os import re -PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +_PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +_PATH_INIT = os.path.join(_PATH_ROOT, 'pytorch_lightning', '__init__.py') # get today date now = datetime.datetime.now() now_date = now.strftime("%Y%m%d") -PATH_INIT = os.path.join(PATH_ROOT, 'pytorch_lightning', '__init__.py') -print(f"prepare init '{PATH_INIT}' - replace version by {now_date}") -with open(PATH_INIT, 'r') as fp: +print(f"prepare init '{_PATH_INIT}' - replace version by {now_date}") +with open(_PATH_INIT, 'r') as fp: init = fp.read() -init = re.sub(r'__version__ = [\d\.rc\'"]+', f'__version__ = "{now_date}"', init) -with open(PATH_INIT, 'w') as fp: +init = re.sub(r'__version__ = [\d\.\w\'"]+', f'__version__ = "{now_date}"', init) +with open(_PATH_INIT, 'w') as fp: fp.write(init) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 16f86e0759fce..1f5e5c2315bb8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -2,11 +2,21 @@ name: CI build Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: branches: [master, "release/*"] + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements/*.txt" + - "environment.yml" + - "requirements.txt" + - ".github/workflows/ci_dockers.yml" + - ".github/workflows/events-nightly.yml" + - ".github/workflows/release-docker.yml" + - "setup.py" jobs: build-PL: @@ -55,7 +65,6 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - 
cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} file: dockers/base-xla/Dockerfile push: false timeout-minutes: 50 @@ -96,7 +105,6 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 @@ -139,7 +147,6 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-conda/Dockerfile push: false timeout-minutes: 50 diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 4d70beddf3f1b..54c9f5c007c82 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -3,7 +3,7 @@ name: Install pkg # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] @@ -27,13 +27,13 @@ jobs: - name: Prepare env run: | - pip install check-manifest "twine==1.13.0" + pip install check-manifest "twine==3.2" setuptools wheel - name: Create package run: | check-manifest # python setup.py check --metadata --strict - python setup.py sdist + python setup.py sdist bdist_wheel - name: Check package run: | @@ -46,12 +46,18 @@ jobs: # this is just a hotfix because of Win cannot install it directly pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - - name: Install package + - name: Install | Uninstall package - archive + run: | + # install as archive + pip install dist/*.tar.gz + cd .. + python -c "import pytorch_lightning as pl ; print(pl.__version__)" + pip uninstall -y pytorch-lightning + + - name: Install | Uninstall package - wheel run: | - # pip install virtualenv - # virtualenv vEnv --system-site-packages - # source vEnv/bin/activate - pip install dist/* - cd .. & python -c "import pytorch_lightning as pl ; print(pl.__version__)" - # deactivate - # rm -rf vEnv + # install as wheel + pip install dist/*.whl + cd .. 
+ python -c "import pytorch_lightning as pl ; print(pl.__version__)" + pip uninstall -y pytorch-lightning \ No newline at end of file diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index c0b97439737ff..ed8a2e30949b7 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -1,9 +1,9 @@ -name: CI base testing +name: CI basic testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index d64fedbfbe590..3faceb296eb1d 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -3,7 +3,7 @@ name: PyTorch & Conda # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] @@ -34,10 +34,21 @@ jobs: # todo this probably does not work with docker images, rather cache dockers uses: actions/cache@v2 with: - path: Datasets # This path is specific to Ubuntu - # Look to see if there is a cache hit for the corresponding requirements file + path: Datasets key: pl-dataset + - name: Pull checkpoints from S3 + # todo: consider adding coma caching, but ATM all models have less then 100KB + run: | + # todo: remove unzip install after new nigtly docker is created + apt-get update -qq + apt-get install -y --no-install-recommends unzip + # enter legacy and update checkpoints from S3 + cd legacy + curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip + unzip -o checkpoints.zip + ls -l checkpoints/ + - name: Tests run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index b87a1d8557843..bf7c9aba8f3c2 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -3,7 +3,7 @@ name: CI complete testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] @@ -47,26 +47,47 @@ jobs: if: runner.os == 'windows' run: | # remove Horovod from requirements - python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" + fname = 'requirements/extra.txt' + lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] + open(fname, 'w').writelines(lines) + shell: python # versions <= 1.3 may have issues on mac with some BLAS ops due to missing mkl (https://github.com/pytorch/pytorch/issues/18996) - name: Adjust minimal for Python 3.8 and MacOS if: matrix.requires == 'minimal' && (runner.os == 'macOS' || matrix.python-version == 3.8) run : | - python -c "fname = 
'requirements.txt' ; req = open(fname).read().replace('torch>=1.3', 'torch>=1.4') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/examples.txt' ; req = open(fname).read().replace('torchvision>=0.4.1', 'torchvision>=0.5.0') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/extra.txt' ; req = open(fname).read().replace('torchtext>=0.3.1', 'torchtext>=0.5.0') ; open(fname, 'w').write(req)" + fname = 'requirements.txt' + req = open(fname).read().replace('torch>=1.3', 'torch>=1.4') + open(fname, 'w').write(req) + + fname = 'requirements/examples.txt' + req = open(fname).read().replace('torchvision>=0.4.1', 'torchvision>=0.5.0') + open(fname, 'w').write(req) + + fname = 'requirements/extra.txt' + req = open(fname).read().replace('torchtext>=0.3.1', 'torchtext>=0.5.0') + open(fname, 'w').write(req) + shell: python - name: Set min. dependencies if: matrix.requires == 'minimal' run: | - python -c "fname = 'requirements.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/extra.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/loggers.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/test.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/examples.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" + files = ( + 'requirements.txt', + 'requirements/extra.txt', + 'requirements/loggers.txt', + 'requirements/test.txt', + 'requirements/examples.txt', + ) + for fname in files: + req = open(fname).read().replace('>=', '==') + open(fname, 'w').write(req) + # remove Fairscale from requirements - python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" + fname = 'requirements/extra.txt' + lines = [line for line in open(fname).readlines() if 'fairscale' not in line] + open(fname, 'w').writelines(lines) + shell: python # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow @@ -83,6 +104,16 @@ jobs: restore-keys: | ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}- + - name: Pull checkpoints from S3 + # todo: consider adding some caching, but ATM all models have less then 100KB + run: | + cd legacy + # wget is simpler but does not work on Windows + python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" + ls -l . 
+ unzip -o checkpoints.zip + ls -l checkpoints/ + - name: Install dependencies env: # MAKEFLAGS: "-j2" @@ -115,8 +146,7 @@ jobs: - name: Cache datasets uses: actions/cache@v2 with: - path: Datasets # This path is specific to Ubuntu - # Look to see if there is a cache hit for the corresponding requirements file + path: Datasets key: pl-dataset - name: Tests diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml index ec2a976ea98e5..b1abcfe123201 100644 --- a/.github/workflows/ci_test-tpu.yml +++ b/.github/workflows/ci_test-tpu.yml @@ -2,7 +2,7 @@ name: TPU tests on: push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] # TODO: temporal disable TPU testing until we find way how to pass credentials to forked PRs # pull_request: # branches: diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 3e2f296226a48..0210e3ceb603a 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -2,7 +2,7 @@ name: "Check Code Format" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 2f91a4f5d43c8..1857ebc8dabea 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -3,7 +3,7 @@ name: "Docs check" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] @@ -105,8 +105,7 @@ jobs: - name: Upload built docs uses: actions/upload-artifact@v2 with: - name: docs-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} + name: docs-results-${{ github.sha }} path: docs/build/html/ # Use always() to always run this step to publish test results when there are test failures if: success() - diff --git a/.github/workflows/nightly.yml b/.github/workflows/events-nightly.yml similarity index 72% rename from .github/workflows/nightly.yml rename to .github/workflows/events-nightly.yml index 92fd99c40279b..df8c5e5411369 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -1,47 +1,48 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ +# https://github.community/t/distinct-job-for-each-schedule/17811/2 on: schedule: - # At the end of every day - - cron: "0 0 * * *" + - cron: "0 0 * * *" # At the end of every day # based on https://github.com/pypa/gh-action-pypi-publish jobs: - pypi-release: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - - name: Install dependencies - run: >- - python -m pip install --user --upgrade setuptools wheel - - - name: Build packages - run: | - python .github/prepare-nightly_version.py - python setup.py sdist bdist_wheel - ls -lh dist/ - - - name: Delay releasing - if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: juliangruber/sleep-action@v1 - with: - time: 5m + # does nightly releases from feature branch + - uses: actions/checkout@v2 + with: + ref: release/1.2-dev + - uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - 
name: Install dependencies + run: >- + python -m pip install --user --upgrade setuptools wheel + + - name: Build packages + run: | + python .github/prepare-nightly_version.py + python setup.py sdist bdist_wheel + ls -lh dist/ + + - name: Delay releasing + uses: juliangruber/sleep-action@v1 + with: + time: 5m # We do this, since failures on test.pypi aren't that bad - - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.4.1 - with: - user: __token__ - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ - verbose: true + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@v1.4.1 + with: + user: __token__ + password: ${{ secrets.test_pypi_password }} + repository_url: https://test.pypi.org/legacy/ + verbose: true docker-XLA: runs-on: ubuntu-20.04 @@ -49,7 +50,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7] - xla_version: [1.6, 1.7] # todo: , "nightly" + xla_version: [1.6, 1.7] # todo: , "nightly" steps: - name: Checkout uses: actions/checkout@v2 @@ -70,8 +71,6 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} - cache-to: type=inline file: dockers/base-xla/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} @@ -120,8 +119,6 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: type=inline file: dockers/base-cuda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -136,8 +133,6 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: type=inline file: dockers/base-conda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml deleted file mode 100644 index bdcabdcf69cbf..0000000000000 --- a/.github/workflows/greetings.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: Greetings -# https://github.com/marketplace/actions/first-interaction - -on: [issues] # pull_request - -jobs: - greeting: - runs-on: ubuntu-20.04 - steps: - - uses: actions/first-interaction@v1 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - issue-message: 'Hi! thanks for your contribution!, great first issue!' - pr-message: 'Hey thanks for the input! Please give us a bit of time to review it!' 
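For context on the nightly release job above: `events-nightly.yml` runs `.github/prepare-nightly_version.py` (updated earlier in this diff) before `setup.py sdist bdist_wheel`, so the published nightly carries a date-based version string. A minimal standalone sketch of that substitution is shown below; only the regex and the `%Y%m%d` date format come from the diff, while the sample `__init__.py` content and the example date are illustrative assumptions.

```python
import datetime
import re

# stand-in for the contents of pytorch_lightning/__init__.py (illustrative only)
init = '__version__ = "1.2.0rc0"\n'

# same date format as in .github/prepare-nightly_version.py
now_date = datetime.datetime.now().strftime("%Y%m%d")  # e.g. "20210120"

# the broadened pattern [\d\.\w\'"]+ also matches pre-release tags such as rc0 or dev0,
# which the previous pattern missed
init = re.sub(r'__version__ = [\d\.\w\'"]+', f'__version__ = "{now_date}"', init)
print(init)  # __version__ = "20210120"
```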
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index b8ca5d8723b39..fee3bbebbee84 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -3,7 +3,7 @@ name: Publish Docker Releases # https://github.com/docker/build-push-action on: push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] release: types: [created] @@ -26,7 +26,7 @@ jobs: - name: Get release version if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' id: get_version - run: echo ::set-env name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/}) + run: echo "::set-output name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/})" - name: Publish Releases to Docker # only on releases @@ -37,6 +37,6 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} - tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 3cc3157ffbf89..80594180abd09 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -3,16 +3,15 @@ name: PyPI Release # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] release: - types: [created, "release/*"] + types: [created] jobs: # based on https://github.com/pypa/gh-action-pypi-publish - build-publish: + build-package: runs-on: ubuntu-20.04 - steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 @@ -28,6 +27,16 @@ jobs: python setup.py sdist bdist_wheel ls -lh dist/ + - uses: actions/upload-artifact@v2 + with: + name: pypi-packages + path: dist + + publish-package: + runs-on: ubuntu-20.04 + needs: build-package + steps: + - uses: actions/checkout@v2 - name: Upload to release if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' uses: svenstaro/upload-release-action@v2 @@ -61,3 +70,59 @@ jobs: with: user: __token__ password: ${{ secrets.pypi_password }} + + create-legacy-ckpt: + runs-on: ubuntu-20.04 + needs: [build-package, publish-package] + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.7 + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Cache pip + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + restore-keys: ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + pip install -r requirements.txt --find-links 
https://download.pytorch.org/whl/cpu/torch_stable.html --quiet + pip install awscli + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }} + aws-region: us-east-1 + + - uses: actions/download-artifact@v2 + with: + name: pypi-packages + path: dist + + - name: Pull files from S3 + run: | + aws s3 cp --recursive s3://pl-public-data/legacy/checkpoints/ legacy/checkpoints/ # --acl public-read + ls -l legacy/checkpoints/ + + - name: Generate checkpoint + # if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + run: | + ls -lh dist/ + pip install dist/*.whl + + pl_ver=$(python -c "import pytorch_lightning as pl ; print(pl.__version__)" 2>&1) + # generate checkpoint to this version + bash legacy/generate_checkpoints.sh $pl_ver + + - name: Push files to S3 + run: | + aws s3 sync legacy/checkpoints/ s3://pl-public-data/legacy/checkpoints/ + cd legacy + zip -r checkpoints.zip checkpoints + aws s3 cp checkpoints.zip s3://pl-public-data/legacy/ --acl public-read diff --git a/.gitignore b/.gitignore index 743fdaaf33dc2..d6ae2ef48ed01 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ timit_data/ # C extensions *.so +# PyCharm .idea/ # Distribution / packaging @@ -126,11 +127,14 @@ ENV/ # mypy .mypy_cache/ +# pytest +.pytest_cache/ # data .data/ Datasets/ mnist/ +legacy/checkpoints/ # pl tests ml-runs/ diff --git a/.mergify.yml b/.mergify.yml index 44c48f2ddced5..4ca323347104e 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -14,56 +14,42 @@ pull_request_rules: - - name: Automatic merge on approval + - name: warn on conflicts + conditions: + - conflict + - -draft # filter-out GH draft PRs + - -label="has conflicts" + actions: + # comment: + # message: This pull request is now in conflict... :( + label: + add: [ "has conflicts" ] + + - name: resolved conflicts conditions: - - base=master - # number of review approvals - - "#approved-reviews-by>=3" - # no waiting or assigned review - - "#review-requested=0" - # no requested chnages from any reviewer - - "#changes-requested-reviews-by=0" - # this serves as ALL check has to pass as we have actually around 40 tests in total - - "#status-success>=54" - # this is just in case since we rely on GPU tests (note: redundand to the above) - - status-success=continuous-integration/drone/pr - - "status-success=ci/circleci: TPU-tests" - # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) - #- "status-success~=^ci/circleci:" - # no conflict with master branch - -conflict - # was not closed yet + - label="has conflicts" + - -draft # filter-out GH draft PRs + - -merged # not merged yet - -closed - # filter-out GH draft PRs - - -draft actions: - delete_head_branch: {} - merge: - # https://doc.mergify.io/merge-action.html#strict-merge - # (on head branch) $ git merge --no-ff base - # (on head branch) # Wait for CI to go green - # (on head branch) # Squash all commits - # (on base branch) $ git merge --ff head - strict: true - method: squash - comment: - message: Great job! =) + label: + remove: [ "has conflicts" ] - - name: warn on conflicts + - name: update PR conditions: - conflict - # filter-out GH draft PRs - - -draft + - -draft # filter-out GH draft PRs + - label="0:] Ready-To-Go" actions: - comment: - message: This pull request is now in conflict... 
:( + update: {} - name: add core reviewer conditions: - # filter-out GH draft PRs - - -draft - # number of review approvals - - "#approved-reviews-by<3" + - -conflict # skip if conflict + - -draft # filter-out GH draft PRs + - label="0:] Ready-To-Go" + - "#approved-reviews-by<3" # number of review approvals actions: request_reviews: teams: diff --git a/.update.sh b/.update.sh deleted file mode 100644 index 40fcc22d6b79b..0000000000000 --- a/.update.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -version=$1 - -git commit -am "release v$version" -git tag $version -m "test_tube v$version" -git push --tags origin master - -# push to pypi -rm -rf ./dist/* -python3 setup.py sdist -twine upload dist/* - -# to update docs -# cd to root dir -# mkdocs gh-deploy - diff --git a/CHANGELOG.md b/CHANGELOG.md index 87d29ff6df643..5fd70e3583c01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.Features] - YYYY-MM-DD +## [unreleased.Bugfixes] - YYYY-MM-DD ### Added @@ -23,29 +23,116 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.BugFix] - YYYY-MM-DD + +## [1.1.5] - 2021-01-19 + +### Fixed + +- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) +- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) +- Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) +- Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540)) + + +## [1.1.4] - 2021-01-12 ### Added +- Add automatic optimization property setter to lightning module ([#5169](https://github.com/PyTorchLightning/pytorch-lightning/pull/5169)) ### Changed +- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244)) -### Deprecated +### Fixed + +- Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/PyTorchLightning/pytorch-lightning/pull/5195)) +- Logging only on `not should_accumulate()` during training ([#5417](https://github.com/PyTorchLightning/pytorch-lightning/pull/5417)) +- Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) +- Check environ before selecting a seed to prevent warning message ([#4743](https://github.com/PyTorchLightning/pytorch-lightning/pull/4743)) +- Fixed signature mismatch in `model_to_device` of `DDPCPUHPCAccelerator` ([#5505](https://github.com/PyTorchLightning/pytorch-lightning/pull/5505)) + + +## [1.1.3] - 2021-01-05 + +### Added + +- Added a check for optimizer attached to `lr_scheduler` ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) +- Added support for passing non-existing filepaths to `resume_from_checkpoint` ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) + +### Changed + +- Skip restore from `resume_from_checkpoint` while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) +- Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333)) +- 
Disabled checkpointing, earlystopping and logging with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) +- Distributed group defaults to `WORLD` if `None` ([#5125](https://github.com/PyTorchLightning/pytorch-lightning/pull/5125)) + +### Fixed +- Fixed `trainer.test` returning non-test metrics ([#5214](https://github.com/PyTorchLightning/pytorch-lightning/pull/5214)) +- Fixed metric state reset ([#5273](https://github.com/PyTorchLightning/pytorch-lightning/pull/5273)) +- Fixed `--num-nodes` on `DDPSequentialPlugin` ([#5327](https://github.com/PyTorchLightning/pytorch-lightning/pull/5327)) +- Fixed invalid value for `weights_summary` ([#5296](https://github.com/PyTorchLightning/pytorch-lightning/pull/5296)) +- Fixed `Trainer.test` not using the latest `best_model_path` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) +- Fixed existence check for hparams not using underlying filesystem ([#5250](https://github.com/PyTorchLightning/pytorch-lightning/pull/5250)) +- Fixed `LightningOptimizer` AMP bug ([#5191](https://github.com/PyTorchLightning/pytorch-lightning/pull/5191)) +- Fixed casted key to string in `_flatten_dict` ([#5354](https://github.com/PyTorchLightning/pytorch-lightning/pull/5354)) + + +## [1.1.2] - 2020-12-23 + +### Added + +- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080)) +- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050)) ### Removed +- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163)) ### Fixed -- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) +- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150)) +- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121)) +- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119)) +- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) +- Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) +- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106)) +- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925)) +- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124)) + + +## [1.1.1] - 2020-12-15 + +### Added + +- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818)) +### Changed -- Fixed `LightningOptimizer` exposes optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015)) +- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593)) +- Fixed the saved filename in `ModelCheckpoint` when it already exists 
([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) + +### Removed +- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014)) +- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076)) +### Fixed + +- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) +- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) - Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) +- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981)) +- Extend LightningOptimizer to expose underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +- Add deprecated metric utility functions back to functional ( + [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), + [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) +- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378)) +- Do not warn when the name key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) + +- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) ## [1.1.0] - 2020-12-09 @@ -62,8 +149,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added logging using `self.log` in train and evaluation for epoch end hooks ( [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), - [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) - [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684)) + [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439), + [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684), [#4913](https://github.com/PyTorchLightning/pytorch-lightning/pull/4913)) - Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675)) - Added casting to python types for numpy scalars when logging hparams ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647)) diff --git a/LICENSE b/LICENSE index b9181e1a6e5d8..2e66bec2e791c 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2018-2020 William Falcon + Copyright 2018-2021 William Falcon Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
diff --git a/MANIFEST.in b/MANIFEST.in index 8db3912027d6d..95672548f724c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -69,4 +69,4 @@ prune temp* prune test* prune benchmark* prune dockers - +prune legacy diff --git a/Makefile b/Makefile index 76e8bac4e3748..55a95f0b14af2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: test +.PHONY: test clean test: # install APEX, see https://github.com/NVIDIA/apex#linux @@ -13,3 +13,7 @@ test: # specific file # python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k + +clean: + # clean all temp runs + rm -rf $(shell find . -name "mlruns" ) diff --git a/README.md b/README.md index a5c6bbb244730..73286edc2c53b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ -**The lightweight PyTorch wrapper for high-performance AI research. +**The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate.** --- @@ -42,6 +42,11 @@ Scale your models, not the boilerplate.** --- +## NEWS +[Dec 2020 - Read about how Facebook uses Lightning to standardize deep learning across research and production teams](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability) + +--- + ## PyTorch Lightning is just organized PyTorch Lightning disentangles PyTorch code to decouple the science from the engineering. ![PT to PL](docs/source/_images/general/pl_quick_start_full_compressed.gif) @@ -51,10 +56,10 @@ Lightning disentangles PyTorch code to decouple the science from the engineering ## Lightning Philosophy Lightning is designed with these principles in mind: -Principle 1: Enable maximal flexibility. -Principle 2: Abstract away unecessary boilerplate, but make it accessible when needed. -Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). -Principle 4: Deep learning code should be organized into 4 distinct categories. +Principle 1: Enable maximal flexibility. +Principle 2: Abstract away unnecessary boilerplate, but make it accessible when needed. +Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). +Principle 4: Deep learning code should be organized into 4 distinct categories. - Research code (the LightningModule). - Engineering code (you delete, and is handled by the Trainer). @@ -73,19 +78,6 @@ Lightning can automatically export to ONNX or TorchScript for those cases. 
--- -## Trending contributors - -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/0)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/0) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/1)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/1) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/2)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/2) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/3)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/3) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/4)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/4) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/5)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/5) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/6)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/6) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/7)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/7) - ---- - ## Continuous Integration
@@ -109,7 +101,7 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## How To Use -#### Step 0: Install +### Step 0: Install Simple installation from PyPI ```bash @@ -122,12 +114,30 @@ From Conda conda install pytorch-lightning -c conda-forge ``` -Install bleeding-edge (no guarantees) + + +#### Install bleeding-edge - future 1.2 + +the actual status of 1.2 [nightly] is following: + +![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push) +![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push) +![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push) +![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) +![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) + +Install future release from the source (no guarantees) +```bash +pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2-dev --upgrade +``` +or nightly from testing PyPI ```bash -pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade +pip install -iU https://test.pypi.org/simple/ pytorch-lightning ``` -#### Step 0: Add these imports + + +### Step 1: Add these imports ```python import os @@ -140,7 +150,7 @@ from torchvision import transforms import pytorch_lightning as pl ``` -#### Step 1: Define a LightningModule (nn.Module subclass) +### Step 2: Define a LightningModule (nn.Module subclass) A LightningModule defines a full *system* (ie: a GAN, autoencoder, BERT or a simple Image Classifier). ```python @@ -150,7 +160,7 @@ class LitAutoEncoder(pl.LightningModule): super().__init__() self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3)) self.decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28)) - + def forward(self, x): # in lightning, forward defines the prediction/inference actions embedding = self.encoder(x) @@ -171,9 +181,9 @@ class LitAutoEncoder(pl.LightningModule): return optimizer ``` -###### Note: Training_step defines the training loop. Forward defines how the LightningModule behaves during inference/prediction. +**Note: Training_step defines the training loop. Forward defines how the LightningModule behaves during inference/prediction.** -#### Step 2: Train! +### Step 3: Train! ```python dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) @@ -200,7 +210,7 @@ trainer = Trainer(tpu_cores=8) ```python # torchscript autoencoder = LitAutoEncoder() -torch.jit.save(autoencoder.to_torchscript(), "model.pt") +torch.jit.save(autoencoder.to_torchscript(), "model.pt") # onnx with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: @@ -215,13 +225,14 @@ with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: ```python class LitAutoEncoder(pl.LightningModule): def training_step(self, batch, batch_idx, opt_idx): - (opt_a, opt_b) = self.optimizers() - + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) + loss_a = ... self.manual_backward(loss_a, opt_a) opt_a.step() opt_a.zero_grad() - + loss_b = ... 
self.manual_backward(loss_b, opt_b, retain_graph=True) self.manual_backward(loss_b, opt_b) @@ -256,31 +267,31 @@ class LitAutoEncoder(pl.LightningModule): ## Examples ###### Hello world -[MNIST hello world](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) -[MNIST on TPUs](https://colab.research.google.com/drive/1-_LKx4HwAxl5M6xPJmqAAu444LTDQoa3) +- [MNIST hello world](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) +- [MNIST on TPUs](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) ###### Contrastive Learning -[BYOL](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#byol) -[CPC v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#cpc-v2) -[Moco v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#moco-v2) -[SIMCLR](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#simclr) +- [BYOL](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#byol) +- [CPC v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#cpc-v2) +- [Moco v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#moco-v2) +- [SIMCLR](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#simclr) ###### NLP -[BERT](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) -[GPT-2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) +- [BERT](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) +- [GPT-2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) ###### Reinforcement Learning -[DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html?highlight=dqn#dqn-models) -[Dueling-DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dueling-dqn) -[Reinforce](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#reinforce) +- [DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dqn-models) +- [Dueling-DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dueling-dqn) +- [Reinforce](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#reinforce) ###### Vision -[GAN](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) +- [GAN](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) ###### Classic ML -[Logistic Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression) -[Linear Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#linear-regression) +- [Logistic Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression) +- [Linear Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#linear-regression) --- @@ -301,12 +312,12 @@ If you have any questions please: 4. 
[Ask on stackoverflow](https://stackoverflow.com/questions/ask?guided=false) with the tag pytorch-lightning. ### Funding -Building open-source software with only a few part-time people is hard! +Building open-source software with only a few part-time people is hard! [We're venture funded](https://techcrunch.com/2020/10/08/grid-ai-raises-18-6m-series-a-to-help-ai-researchers-and-engineers-bring-their-models-to-production/) and backed by some of the top VC funds in the world, [Index Ventures](https://www.indexventures.com/companies/), [Bain Capital Ventures](https://www.baincapitalventures.com/portfolio/), [First Minute Capital](https://firstminute.capital/companies). -Their funding ensures we can continue to build awesome tooling like Grid, give you around the clock support, +Their funding ensures we can continue to build awesome tooling like Grid, give you around the clock support, hire a full-time staff, attend conferences, and move faster through implementing features you request. To supercharge your research and production work, visit our [Grid.ai platform](https://www.grid.ai/) @@ -314,7 +325,7 @@ To supercharge your research and production work, visit our [Grid.ai platform](h --- ## Grid AI -Grid AI is our native platform for training models at scale on the cloud! +Grid AI is our native platform for training models at scale on the cloud! **Sign up for [early access here](https://www.grid.ai/)** diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index e69de29bb2d1d..734288b07235d 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +BENCHMARK_ROOT = os.path.dirname(__file__) +PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT) diff --git a/benchmarks/generate_comparison.py b/benchmarks/generate_comparison.py new file mode 100644 index 0000000000000..6b5a0680a6b36 --- /dev/null +++ b/benchmarks/generate_comparison.py @@ -0,0 +1,61 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import matplotlib.pylab as plt +import pandas as pd + +from benchmarks.test_basic_parity import measure_loops +from tests.base.models import ParityModuleMNIST, ParityModuleRNN + +NUM_EPOCHS = 20 +NUM_RUNS = 50 +MODEL_CLASSES = (ParityModuleRNN, ParityModuleMNIST) +PATH_HERE = os.path.dirname(__file__) +FIGURE_EXTENSION = '.png' + + +def _main(): + fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES)) + + for i, cls_model in enumerate(MODEL_CLASSES): + path_csv = os.path.join(PATH_HERE, f'dump-times_{cls_model.__name__}.csv') + if os.path.isfile(path_csv): + df_time = pd.read_csv(path_csv, index_col=0) + else: + # todo: kind="Vanilla PT" -> use_lightning=False + vanilla = measure_loops(cls_model, kind="Vanilla PT", num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + lightning = measure_loops(cls_model, kind="PT Lightning", num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + + df_time = pd.DataFrame({'vanilla PT': vanilla['durations'][1:], 'PT Lightning': lightning['durations'][1:]}) + df_time /= NUM_RUNS + df_time.to_csv(os.path.join(PATH_HERE, f'dump-times_{cls_model.__name__}.csv')) + # todo: add also relative X-axis ticks to see both: relative and absolute time differences + df_time.plot.hist( + ax=axarr[i], + bins=20, + alpha=0.5, + title=cls_model.__name__, + legend=True, + grid=True, + ) + axarr[i].set(xlabel='time [seconds]') + + path_fig = os.path.join(PATH_HERE, f'figure-parity-times{FIGURE_EXTENSION}') + fig.tight_layout() + fig.savefig(path_fig) + + +if __name__ == '__main__': + _main() diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py new file mode 100644 index 0000000000000..ce3d831f099f5 --- /dev/null +++ b/benchmarks/test_basic_parity.py @@ -0,0 +1,174 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import gc +import time + +import numpy as np +import pytest +import torch +from tqdm import tqdm + +from pytorch_lightning import LightningModule, seed_everything, Trainer +from tests.base.models import ParityModuleMNIST, ParityModuleRNN + + +def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1): + # assert speeds + diffs = np.asarray(pl_values) - np.mean(pt_values) + # norm by vanilla time + diffs = diffs / norm_by + # relative to mean reference value + diffs = diffs / np.mean(pt_values) + assert np.mean(diffs) < max_diff, f"Lightning diff {diffs} was worse than vanilla PT (threshold {max_diff})" + + +def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.55): + # assert speeds + diffs = np.asarray(pl_values) - np.mean(pt_values) + # norm by event count + diffs = diffs / norm_by + assert np.mean(diffs) < max_diff, f"Lightning {diffs} was worse than vanilla PT (threshold {max_diff})" + + +# ParityModuleMNIST runs with num_workers=1 +@pytest.mark.parametrize('cls_model,max_diff_speed,max_diff_memory', [ + (ParityModuleRNN, 0.05, 0.0), + (ParityModuleMNIST, 0.25, 0.0), # todo: lower this thr +]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_pytorch_parity( + tmpdir, + cls_model: LightningModule, + max_diff_speed: float, + max_diff_memory: float, + num_epochs: int = 4, + num_runs: int = 3, +): + """ + Verify that the same pytorch and lightning models achieve the same results + """ + lightning = measure_loops(cls_model, kind="PT Lightning", num_epochs=num_epochs, num_runs=num_runs) + vanilla = measure_loops(cls_model, kind="Vanilla PT", num_epochs=num_epochs, num_runs=num_runs) + + # make sure the losses match exactly to 5 decimal places + print(f"Losses are for... 
\n vanilla: {vanilla['losses']} \n lightning: {lightning['losses']}") + for pl_out, pt_out in zip(lightning['losses'], vanilla['losses']): + np.testing.assert_almost_equal(pl_out, pt_out, 5) + + # drop the first run for initialize dataset (download & filter) + assert_parity_absolute( + lightning['durations'][1:], vanilla['durations'][1:], norm_by=num_epochs, max_diff=max_diff_speed + ) + + assert_parity_relative(lightning['memory'], vanilla['memory'], max_diff=max_diff_memory) + + +def _hook_memory(): + if torch.cuda.is_available(): + torch.cuda.synchronize() + used_memory = torch.cuda.max_memory_allocated() + else: + used_memory = np.nan + return used_memory + + +def measure_loops(cls_model, kind, num_runs=10, num_epochs=10): + """ + Returns an array with the last loss from each epoch for each run + """ + hist_losses = [] + hist_durations = [] + hist_memory = [] + + device_type = "cuda" if torch.cuda.is_available() else "cpu" + torch.backends.cudnn.deterministic = True + for i in tqdm(range(num_runs), desc=f'{kind} with {cls_model.__name__}'): + gc.collect() + if device_type == 'cuda': + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_cached() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_accumulated_memory_stats() + torch.cuda.reset_peak_memory_stats() + time.sleep(1) + + time_start = time.perf_counter() + + _loop = lightning_loop if kind == "PT Lightning" else vanilla_loop + final_loss, used_memory = _loop(cls_model, idx=i, device_type=device_type, num_epochs=num_epochs) + + time_end = time.perf_counter() + + hist_losses.append(final_loss) + hist_durations.append(time_end - time_start) + hist_memory.append(used_memory) + + return { + 'losses': hist_losses, + 'durations': hist_durations, + 'memory': hist_memory, + } + + +def vanilla_loop(cls_model, idx, device_type: str = 'cuda', num_epochs=10): + device = torch.device(device_type) + # set seed + seed_everything(idx) + + # init model parts + model = cls_model() + dl = model.train_dataloader() + optimizer = model.configure_optimizers() + + # model to GPU + model = model.to(device) + + epoch_losses = [] + # as the first run is skipped, no need to run it long + for epoch in range(num_epochs if idx > 0 else 1): + + # run through full training set + for j, batch in enumerate(dl): + batch = [x.to(device) for x in batch] + loss_dict = model.training_step(batch, j) + loss = loss_dict['loss'] + loss.backward() + optimizer.step() + optimizer.zero_grad() + + # track last epoch loss + epoch_losses.append(loss.item()) + + return epoch_losses[-1], _hook_memory() + + +def lightning_loop(cls_model, idx, device_type: str = 'cuda', num_epochs=10): + seed_everything(idx) + + model = cls_model() + # init model parts + trainer = Trainer( + # as the first run is skipped, no need to run it long + max_epochs=num_epochs if idx > 0 else 1, + progress_bar_refresh_rate=0, + weights_summary=None, + gpus=1 if device_type == 'cuda' else 0, + checkpoint_callback=False, + deterministic=True, + logger=False, + replace_sampler_ddp=False, + ) + trainer.fit(model) + + return trainer.train_loop.running_loss.last().item(), _hook_memory() diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py deleted file mode 100644 index 41bba9533e10d..0000000000000 --- a/benchmarks/test_parity.py +++ /dev/null @@ -1,116 +0,0 @@ -import time - -import numpy as np -import pytest -import torch - -import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, seed_everything -from tests.base.models import ParityModuleMNIST, ParityModuleRNN - 
- -# ParityModuleMNIST runs with num_workers=1 -@pytest.mark.parametrize('cls_model,max_diff', [ - (ParityModuleRNN, 0.05), - (ParityModuleMNIST, 0.25), # todo: lower this thr -]) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_pytorch_parity(tmpdir, cls_model, max_diff): - """ - Verify that the same pytorch and lightning models achieve the same results - """ - num_epochs = 4 - num_rums = 3 - lightning_outs, pl_times = lightning_loop(cls_model, num_rums, num_epochs) - manual_outs, pt_times = vanilla_loop(cls_model, num_rums, num_epochs) - - # make sure the losses match exactly to 5 decimal places - for pl_out, pt_out in zip(lightning_outs, manual_outs): - np.testing.assert_almost_equal(pl_out, pt_out, 5) - - # the fist run initialize dataset (download & filter) - tutils.assert_speed_parity_absolute(pl_times[1:], pt_times[1:], - nb_epochs=num_epochs, max_diff=max_diff) - - -def vanilla_loop(cls_model, num_runs=10, num_epochs=10): - """ - Returns an array with the last loss from each epoch for each run - """ - device = torch.device('cuda' if torch.cuda.is_available() else "cpu") - errors = [] - times = [] - - torch.backends.cudnn.deterministic = True - for i in range(num_runs): - time_start = time.perf_counter() - - # set seed - seed = i - seed_everything(seed) - - # init model parts - model = cls_model() - dl = model.train_dataloader() - optimizer = model.configure_optimizers() - - # model to GPU - model = model.to(device) - - epoch_losses = [] - # as the first run is skipped, no need to run it long - for epoch in range(num_epochs if i > 0 else 1): - - # run through full training set - for j, batch in enumerate(dl): - batch = [x.to(device) for x in batch] - loss_dict = model.training_step(batch, j) - loss = loss_dict['loss'] - loss.backward() - optimizer.step() - optimizer.zero_grad() - - # track last epoch loss - epoch_losses.append(loss.item()) - - time_end = time.perf_counter() - times.append(time_end - time_start) - - errors.append(epoch_losses[-1]) - - return errors, times - - -def lightning_loop(cls_model, num_runs=10, num_epochs=10): - errors = [] - times = [] - - for i in range(num_runs): - time_start = time.perf_counter() - - # set seed - seed = i - seed_everything(seed) - - model = cls_model() - # init model parts - trainer = Trainer( - # as the first run is skipped, no need to run it long - max_epochs=num_epochs if i > 0 else 1, - progress_bar_refresh_rate=0, - weights_summary=None, - gpus=1, - checkpoint_callback=False, - deterministic=True, - logger=False, - replace_sampler_ddp=False, - ) - trainer.fit(model) - - final_loss = trainer.train_loop.running_loss.last().item() - errors.append(final_loss) - - time_end = time.perf_counter() - times.append(time_end - time_start) - - return errors, times diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 9fe4976442178..5d688a8b374ff 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import os import platform import time @@ -6,7 +20,7 @@ import pytest import torch -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, NATIVE_AMP_AVAILABLE @@ -14,35 +28,32 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( accelerator='ddp_cpu', - max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, + max_percent_speed_diff=0.15, # todo: slower speed due to one CPU doing additional sequential memory saving calls ) @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): plugin_parity_test( gpus=1, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Requires native AMP") @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): plugin_parity_test( @@ -50,14 +61,13 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): precision=16, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): plugin_parity_test( @@ -65,13 +75,12 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skipif(not 
NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @@ -81,13 +90,12 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): @@ -97,7 +105,7 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -133,8 +141,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ @@ -145,14 +152,13 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ @@ -163,7 +169,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs 
sharding 2 optimizers ) @@ -180,7 +186,8 @@ def train_dataloader(self): class SeedTrainLoaderManualModel(SeedTrainLoaderModel): def training_step(self, batch, batch_idx, optimizer_idx): # manual - (opt_a, opt_b) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) loss_1 = self.step(batch) self.manual_backward(loss_1, opt_a) diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 3c58dfcde7dea..83c8fe9e7a59b 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -40,7 +40,9 @@ RUN apt-get update -qq && \ build-essential \ cmake \ git \ + wget \ curl \ + unzip \ ca-certificates \ && \ @@ -75,16 +77,16 @@ ENV CONDA_ENV=lightning COPY environment.yml environment.yml # conda init -RUN conda create -y --name $CONDA_ENV cudatoolkit=${CUDA_VERSION} && \ +RUN conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} cudatoolkit=${CUDA_VERSION} -c ${PYTORCH_CHANNEL} && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages - # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later + # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installed later python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'- python[>=]+[\d\.]+', '# - python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'- pytorch[>=]+[\d\.]+', '# - pytorch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ cat environment.yml && \ - conda env update --file environment.yml && \ + conda env update --name $CONDA_ENV --file environment.yml && \ conda clean -ya && \ rm environment.yml diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index bdaf271f2b854..e6b0c5061c02c 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -45,6 +45,8 @@ RUN apt-get update -qq && \ cmake \ git \ wget \ + curl \ + unzip \ ca-certificates \ software-properties-common \ && \ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 8eb093295c37b..5dfeac8c9e86e 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -97,6 +97,8 @@ RUN \ python -c "fname = 'requirements.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torch')] ; open(fname, 'w').writelines(lines)" && \ # drop Horovod as it is not needed python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'requirements/extra.txt' ; lines = [line for line in 
open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ # drop TorchVision as it was installed with XLA python -c "fname = 'requirements/examples.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torchvision')] ; open(fname, 'w').writelines(lines)" && \ pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index a514b1c3d35fe..9ba8f98d440a2 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -23,12 +23,20 @@ MAINTAINER PyTorchLightning COPY ./ ./pytorch-lightning/ +# Pull the legacy checkpoints +RUN cd pytorch-lightning && \ + wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \ + unzip -o legacy/checkpoints.zip -d legacy/ && \ + ls -l legacy/checkpoints/ + # If using this image for tests, intall more dependencies and don"t delete the source code where the tests live. RUN \ # Install pytorch-lightning at the current PR, plus dependencies. #pip install -r pytorch-lightning/requirements.txt --no-cache-dir && \ - # drop Horovod + # drop Horovod as it is not needed python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed #RUN python -c "import pytorch_lightning as pl; print(pl.__version__)" diff --git a/docs/.build_docs.sh b/docs/.build_docs.sh index 2b57c47953675..6cf6eab2fd398 100644 --- a/docs/.build_docs.sh +++ b/docs/.build_docs.sh @@ -1,3 +1,3 @@ rm -rf source/generated make clean -make html --debug --jobs 2 SPHINXOPTS="-W" \ No newline at end of file +make html --debug --jobs 2 SPHINXOPTS="-W" diff --git a/docs/Makefile b/docs/Makefile index 69fe55ecfa9aa..ba501f6f5b1bf 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/_images/benchmarks/figure-parity-times.png b/docs/source/_images/benchmarks/figure-parity-times.png new file mode 100644 index 0000000000000..2e8c5899020d9 Binary files /dev/null and b/docs/source/_images/benchmarks/figure-parity-times.png differ diff --git a/docs/source/_static/main.css b/docs/source/_static/main.css index 7441b775a4be5..82aa8b338ad39 100644 --- a/docs/source/_static/main.css +++ b/docs/source/_static/main.css @@ -1,3 +1,3 @@ col { width: 50% !important; -} \ No newline at end of file +} diff --git a/docs/source/asr_nlp_tts.rst b/docs/source/asr_nlp_tts.rst index a5f1ac59bf696..49bed0a981a6e 100644 --- a/docs/source/asr_nlp_tts.rst +++ b/docs/source/asr_nlp_tts.rst @@ -10,16 +10,16 @@ These are amazing ecosystems to help with Automatic Speech Recognition (ASR), Na NeMo **** -`NVIDIA NeMo `_ is a toolkit for building new State-of-the-Art -Conversational AI models. 
NeMo has separate collections for Automatic Speech Recognition (ASR), -Natural Language Processing (NLP), and Text-to-Speech (TTS) models. Each collection consists of -prebuilt modules that include everything needed to train on your data. -Every module can easily be customized, extended, and composed to create new Conversational AI +`NVIDIA NeMo `_ is a toolkit for building new State-of-the-Art +Conversational AI models. NeMo has separate collections for Automatic Speech Recognition (ASR), +Natural Language Processing (NLP), and Text-to-Speech (TTS) models. Each collection consists of +prebuilt modules that include everything needed to train on your data. +Every module can easily be customized, extended, and composed to create new Conversational AI model architectures. -Conversational AI architectures are typically very large and require a lot of data and compute -for training. NeMo uses PyTorch Lightning for easy and performant multi-GPU/multi-node -mixed-precision training. +Conversational AI architectures are typically very large and require a lot of data and compute +for training. NeMo uses PyTorch Lightning for easy and performant multi-GPU/multi-node +mixed-precision training. .. note:: Every NeMo model is a LightningModule that comes equipped with all supporting infrastructure for training and reproducibility. @@ -31,7 +31,7 @@ NeMo Models NeMo Models contain everything needed to train and reproduce state of the art Conversational AI research and applications, including: -- neural network architectures +- neural network architectures - datasets/data loaders - data preprocessing/postprocessing - data augmentors @@ -83,7 +83,7 @@ To install from a local clone of NeMo: ./reinstall.sh # from cloned NeMo's git root -For Docker users, the NeMo container is available on +For Docker users, the NeMo container is available on `NGC `_. .. code-block:: bash @@ -97,7 +97,7 @@ For Docker users, the NeMo container is available on Experiment Manager ------------------ -NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, +NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, and Weights and Biases logging. The Experiment Manager is included by default in all NeMo example scripts. @@ -126,11 +126,11 @@ Optionally launch Tensorboard to view training results in ./nemo_experiments (by Automatic Speech Recognition (ASR) ================================== -Everything needed to train Convolutional ASR models is included with NeMo. -NeMo supports multiple Speech Recognition architectures, including Jasper and QuartzNet. -`NeMo Speech Models `_ -can be trained from scratch on custom datasets or -fine-tuned using pre-trained checkpoints trained on thousands of hours of audio +Everything needed to train Convolutional ASR models is included with NeMo. +NeMo supports multiple Speech Recognition architectures, including Jasper and QuartzNet. +`NeMo Speech Models `_ +can be trained from scratch on custom datasets or +fine-tuned using pre-trained checkpoints trained on thousands of hours of audio that can be restored for immediate use. Some typical ASR tasks are included with NeMo: @@ -141,7 +141,7 @@ Some typical ASR tasks are included with NeMo: - `Voice Activity Detection `_ - `Speaker Recognition `_ -See this `asr notebook `_ +See this `asr notebook `_ for a full tutorial on doing ASR with NeMo, PyTorch Lightning, and Hydra. 
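For instance, restoring one of the pretrained ASR checkpoints and transcribing a few audio files takes only a couple of lines. The snippet below is a minimal sketch; the model name ``QuartzNet15x5Base-En`` and the ``transcribe`` arguments are illustrative and may differ between NeMo versions.

.. code-block:: python

    import nemo.collections.asr as nemo_asr

    # download (or load from the local cache) a pretrained ASR checkpoint from NGC
    asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # transcribe a list of 16kHz mono .wav files
    transcriptions = asr_model.transcribe(paths2audio_files=["sample_1.wav", "sample_2.wav"])
    print(transcriptions)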
Specify ASR Model Configurations with YAML File @@ -149,7 +149,7 @@ Specify ASR Model Configurations with YAML File NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. -See this `asr config `_ +See this `asr config `_ for the entire speech to text .yaml file. .. code-block:: yaml @@ -198,7 +198,7 @@ Developing ASR Model From Scratch trainer.fit(asr_model) -Hydra makes every aspect of the NeMo model, +Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trainer, customizable from the command line. .. code-block:: bash @@ -259,7 +259,7 @@ with PyTorch Lightning since every NeMo model is a Lightning Module. log_probs = self.decoder(encoder_output=encoded) greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) return log_probs, encoded_len, greedy_predictions - + # PTL-specific methods def training_step(self, batch, batch_nb): audio_signal, audio_signal_len, transcript, transcript_len = batch @@ -281,7 +281,7 @@ Neural Types in NeMo ASR ------------------------ NeMo Models and Neural Modules come with Neural Type checking. -Neural type checking is extremely useful when combining many different neural +Neural type checking is extremely useful when combining many different neural network architectures for a production-grade application. .. code-block:: python @@ -311,12 +311,12 @@ Natural Language Processing (NLP) ================================= Everything needed to finetune BERT-like language models for NLP tasks is included with NeMo. -`NeMo NLP Models `_ -include `HuggingFace Transformers `_ -and `NVIDIA Megatron-LM `_ BERT and Bio-Megatron models. +`NeMo NLP Models `_ +include `HuggingFace Transformers `_ +and `NVIDIA Megatron-LM `_ BERT and Bio-Megatron models. NeMo can also be used for pretraining BERT-based language models from HuggingFace. -Any of the HuggingFace encoders or Megatron-LM encoders can easily be used for the NLP tasks +Any of the HuggingFace encoders or Megatron-LM encoders can easily be used for the NLP tasks that are included with NeMo: - `Glue Benchmark (All tasks) `_ @@ -339,7 +339,7 @@ for a full tutorial on doing NER with NeMo, PyTorch Lightning, and Hydra. Specify NER Model Configurations with YAML File ----------------------------------------------- -.. note:: NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. +.. note:: NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. See this `token classification config `_ for the entire NER (token classification) .yaml file. @@ -368,7 +368,7 @@ for the entire NER (token classification) .yaml file. pretrained_model_name: bert-base-uncased lm_checkpoint: null ... - # the classifier for the downstream task + # the classifier for the downstream task head: num_fc_layers: 2 fc_dropout: 0.5 @@ -435,12 +435,12 @@ Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trai Tokenizers ---------- -Tokenization is the process of converting natural language text into integer arrays +Tokenization is the process of converting natural language text into integer arrays which can be used for machine learning. -For NLP tasks, tokenization is an essential part of data preprocessing. -NeMo supports all BERT-like model tokenizers from +For NLP tasks, tokenization is an essential part of data preprocessing. 
+NeMo supports all BERT-like model tokenizers from `HuggingFace's AutoTokenizer `_ -and also supports `Google's SentencePieceTokenizer `_ +and also supports `Google's SentencePieceTokenizer `_ which can be trained on custom data. To see the list of supported tokenizers: @@ -451,18 +451,18 @@ To see the list of supported tokenizers: nemo_nlp.modules.get_tokenizer_list() -See this `tokenizer notebook `_ +See this `tokenizer notebook `_ for a full tutorial on using tokenizers in NeMo. Language Models --------------- -Language models are used to extract information from (tokenized) text. +Language models are used to extract information from (tokenized) text. Much of the state-of-the-art in natural language processing is achieved -by fine-tuning pretrained language models on the downstream task. +by fine-tuning pretrained language models on the downstream task. -With NeMo, you can either `pretrain `_ -a BERT model on your data or use a pretrained language model from `HuggingFace Transformers `_ +With NeMo, you can either `pretrain `_ +a BERT model on your data or use a pretrained language model from `HuggingFace Transformers `_ or `NVIDIA Megatron-LM `_. To see the list of language models available in NeMo: @@ -483,11 +483,11 @@ for a full tutorial on using pretrained language models in NeMo. Using a Pre-trained NER Model ----------------------------- -NeMo has pre-trained NER models that can be used +NeMo has pre-trained NER models that can be used to get started with Token Classification right away. -Models are automatically downloaded from NGC, +Models are automatically downloaded from NGC, cached locally to disk, -and loaded into GPU memory using the `.from_pretrained` method. +and loaded into GPU memory using the `.from_pretrained` method. .. code-block:: python @@ -511,7 +511,7 @@ and loaded into GPU memory using the `.from_pretrained` method. NeMo NER Model Under the Hood ----------------------------- -Any aspect of NLP training or model architecture design can easily be customized with PyTorch Lightning +Any aspect of NLP training or model architecture design can easily be customized with PyTorch Lightning since every NeMo model is a Lightning Module. .. code-block:: python @@ -546,8 +546,8 @@ since every NeMo model is a Lightning Module. Neural Types in NeMo NLP ------------------------ -NeMo Models and Neural Modules come with Neural Type checking. -Neural type checking is extremely useful when combining many different neural network architectures +NeMo Models and Neural Modules come with Neural Type checking. +Neural type checking is extremely useful when combining many different neural network architectures for a production-grade application. .. code-block:: python @@ -565,11 +565,11 @@ for a production-grade application. Text-To-Speech (TTS) ==================== -Everything needed to train TTS models and generate audio is included with NeMo. -`NeMo TTS Models `_ +Everything needed to train TTS models and generate audio is included with NeMo. +`NeMo TTS Models `_ can be trained from scratch on your own data or pretrained models can be downloaded -automatically. NeMo currently supports a two step inference procedure. -First, a model is used to generate a mel spectrogram from text. +automatically. NeMo currently supports a two step inference procedure. +First, a model is used to generate a mel spectrogram from text. Second, a model is used to generate audio from a mel spectrogram. 
Mel Spectrogram Generators:
@@ -647,10 +647,10 @@ Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trai
 Using State-Of-The-Art Pre-trained TTS Model
 --------------------------------------------
-Generate speech using models trained on `LJSpeech `,
+Generate speech using models trained on `LJSpeech `,
 around 24 hours of single speaker data.
-See this `TTS notebook `_
+See this `TTS notebook `_
 for a full tutorial on generating speech with NeMo, PyTorch Lightning, and Hydra.
 .. code-block:: python
@@ -673,7 +673,7 @@ for a full tutorial on generating speech with NeMo, PyTorch Lightning, and Hydra
         if isinstance(audio, torch.Tensor):
             audio = audio.to('cpu').numpy()
         return spectrogram, audio
-
+
     text_to_generate = input("Input what you want the model to say: ")
     spec, audio = infer(spec_gen, vocoder, text_to_generate)
@@ -763,8 +763,8 @@ be customized with PyTorch Lightning since every NeMo model is a LightningModule
 Neural Types in NeMo TTS
 ------------------------
-NeMo Models and Neural Modules come with Neural Type checking.
-Neural type checking is extremely useful when combining many different neural network architectures
+NeMo Models and Neural Modules come with Neural Type checking.
+Neural type checking is extremely useful when combining many different neural network architectures
 for a production-grade application.
 .. code-block:: python
@@ -793,7 +793,7 @@ Learn More
 - Visit the `NVIDIA NeMo Developer Website `_
 - Read the `NVIDIA NeMo PyTorch Blog `_
 - Download pre-trained `ASR `_, `NLP `_, and `TTS `_ models on `NVIDIA NGC `_ to quickly get started with NeMo.
-- Become an expert on Building Conversational AI applications with our `tutorials `_, and `example scripts `_,
+- Become an expert on Building Conversational AI applications with our `tutorials `_, and `example scripts `_,
 - See our `developer guide `_ for more information on core NeMo concepts, ASR/NLP/TTS collections, and the NeMo API.
 .. note:: NeMo tutorial notebooks can be run on `Google Colab `_.
diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst
new file mode 100644
index 0000000000000..8dec7ca8c123c
--- /dev/null
+++ b/docs/source/benchmarking.rst
@@ -0,0 +1,14 @@
+Benchmark with vanilla PyTorch
+==============================
+
+In this section we set the grounds for comparison between vanilla PyTorch and PT Lightning for the most common scenarios.
+
+Time comparison
+---------------
+
+We have set up regular benchmarking against the vanilla PyTorch training loop with an RNN and a simple MNIST classifier as part of our CI.
+On average, for the simple MNIST CNN classifier we are only about 0.06s slower per epoch; see the detailed chart below.
+
+.. figure:: _images/benchmarks/figure-parity-times.png
+   :alt: Speed parity to vanilla PT, created on 2020-12-16
+   :width: 500
diff --git a/docs/source/cloud_training.rst b/docs/source/cloud_training.rst
index 9fef417da7442..127bee6478dfd 100644
--- a/docs/source/cloud_training.rst
+++ b/docs/source/cloud_training.rst
@@ -26,4 +26,4 @@ using over 20+ distributions, lists, etc. Of course, you can also configure all
 can be dynamically assembled at runtime.
-.. hint:: Grid supports the search strategy of your choice! (and much more than just sweeps)
\ No newline at end of file
+.. hint:: Grid supports the search strategy of your choice! 
(and much more than just sweeps) diff --git a/docs/source/conf.py b/docs/source/conf.py index 655e8dba30a36..2b861623599a6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -294,10 +294,14 @@ def setup(app): # Ignoring Third-party packages # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule def package_list_from_file(file): + """List up package name (not containing version and extras) from a package list file + """ mocked_packages = [] with open(file, 'r') as fp: for ln in fp.readlines(): - found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] + # Example: `tqdm>=4.41.0` => `tqdm` + # `[` is for package with extras + found = [ln.index(ch) for ch in list(',=<>#[') if ch in ln] pkg = ln[:min(found)] if found else ln if pkg.rstrip(): mocked_packages.append(pkg.rstrip()) diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst index b7dd9ec92e71d..bc79d7dc3d6ea 100644 --- a/docs/source/datamodules.rst +++ b/docs/source/datamodules.rst @@ -129,7 +129,7 @@ Here's a more realistic, complex DataModule that shows how much more reusable th # self.dims is returned when you call dm.size() # Setting default dims here because we know them. - # Could optionally be assigned dynamically in dm.setup() + # Could optionally be assigned dynamically in dm.setup() self.dims = (1, 28, 28) def prepare_data(self): @@ -268,6 +268,7 @@ Use this method to generate the val dataloader. Usually you just wrap the datas def val_dataloader(self): return DataLoader(self.mnist_val, batch_size=64) +.. _datamodule-test-dataloader-label: test_dataloader ^^^^^^^^^^^^^^^ diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 5eaf4303d3e4c..f3faa72f1e95e 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -28,13 +28,18 @@ The point is to detect any bugs in the training/validation loop without having t argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) .. testcode:: - + # runs 1 train, val, test batch and program ends trainer = Trainer(fast_dev_run=True) # runs 7 train, val, test batches and program ends trainer = Trainer(fast_dev_run=7) +.. note:: + + This argument will disable tuner, checkpoint callbacks, early stopping callbacks, + loggers and logger callbacks like ``LearningRateLogger`` and runs for only 1 epoch. + ---------------- Inspect gradient norms diff --git a/docs/source/governance.rst b/docs/source/governance.rst index 74d24e306d3f9..22fba33771c0a 100644 --- a/docs/source/governance.rst +++ b/docs/source/governance.rst @@ -25,3 +25,4 @@ Core Maintainers - Jeff Yang (`ydcjeff `_) - Roger Shieh (`s-rog `_) - Carlos Mocholí (`carmocca `_) +- Ananth Subramaniam (`ananthsub `_) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1049a6d16a75d..2b7d9c3b58e26 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -24,6 +24,7 @@ PyTorch Lightning Documentation style_guide performance Lightning project template + benchmarking .. toctree:: diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index d4cf578e10bda..52f1182d1508f 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -1051,7 +1051,7 @@ would be the particular system and how it's trained (ie: A GAN or VAE or GPT). out = decoder(features, x) loss = perceptual_loss(x1, x2, x) + CE(out, x) - + In Lightning, this code is organized into a :ref:`lightning_module`. Engineering code @@ -1071,7 +1071,7 @@ over GPUs, 16-bit precision, etc. 
This is normally code that is THE SAME across download_data() dist.barrier() - + In Lightning, this code is abstracted out by the :ref:`trainer`. Non-essential code @@ -1090,7 +1090,7 @@ This is code that helps the research but isn't relevant to the research code. So z = Q.rsample() generated = decoder(z) self.experiment.log('images', generated) - + In Lightning this code is organized into :ref:`callbacks`. Data code diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index b74fe292b251b..08b3b1e997555 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -9,7 +9,7 @@ Loggers ******* -Lightning supports the most popular logging frameworks (TensorBoard, Comet, etc...). TensorBoard is used by default, +Lightning supports the most popular logging frameworks (TensorBoard, Comet, etc...). TensorBoard is used by default, but you can pass to the :class:`~pytorch_lightning.trainer.trainer.Trainer` any combination of the following loggers. .. note:: @@ -247,7 +247,7 @@ Lightning supports the use of multiple loggers, just pass a list to the logger1 = TensorBoardLogger('tb_logs', name='my_model') logger2 = TestTubeLogger('tb_logs', name='my_model') trainer = Trainer(logger=[logger1, logger2]) - + The loggers are available as a list anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. diff --git a/docs/source/lr_finder.rst b/docs/source/lr_finder.rst index fbeb1f5fd959d..a5c3b312f30fc 100755 --- a/docs/source/lr_finder.rst +++ b/docs/source/lr_finder.rst @@ -2,7 +2,7 @@ from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.core.lightning import LightningModule - + .. _lr_finder: Learning Rate Finder @@ -22,14 +22,14 @@ for both better performance and faster convergence. Even optimizers such as choices. To reduce the amount of guesswork concerning choosing a good initial learning -rate, a `learning rate finder` can be used. As described in this `paper `_ -a learning rate finder does a small run where the learning rate is increased -after each processed batch and the corresponding loss is logged. The result of +rate, a `learning rate finder` can be used. As described in this `paper `_ +a learning rate finder does a small run where the learning rate is increased +after each processed batch and the corresponding loss is logged. The result of this is a `lr` vs. `loss` plot that can be used as guidance for choosing a optimal -initial lr. +initial lr. -.. warning:: - For the moment, this feature only works with models having a single optimizer. +.. warning:: + For the moment, this feature only works with models having a single optimizer. LR Finder support for DDP is not implemented yet, it is coming soon. ---------- @@ -52,7 +52,7 @@ which can be accessed via ``self.learning_rate`` or ``self.lr``. def configure_optimizers(self): return Adam(self.parameters(), lr=(self.lr or self.learning_rate)) - + model = LitModel() # finds learning rate automatically @@ -81,26 +81,26 @@ method of the trainer. 
A typical example of this would look like model = MyModelClass(hparams) trainer = Trainer() - + # Run learning rate finder lr_finder = trainer.tuner.lr_find(model) - + # Results can be found in lr_finder.results - + # Plot with fig = lr_finder.plot(suggest=True) fig.show() - + # Pick point based on plot, or get suggestion new_lr = lr_finder.suggestion() - + # update hparams of the model model.hparams.lr = new_lr # Fit model trainer.fit(model) - + The figure produced by ``lr_finder.plot()`` should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 387cbc3bd7482..3c853f45a70d6 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -33,10 +33,11 @@ The example below shows how to use a metric in your ``LightningModule``: self.accuracy = pl.metrics.Accuracy() def training_step(self, batch, batch_idx): - logits = self(x) + x, y = batch + preds = self(x) ... # log step metric - self.log('train_acc_step', self.accuracy(logits, y)) + self.log('train_acc_step', self.accuracy(preds, y)) ... def training_epoch_end(self, outs): @@ -67,9 +68,10 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v self.valid_acc = pl.metrics.Accuracy() def training_step(self, batch, batch_idx): - logits = self(x) + x, y = batch + preds = self(x) ... - self.train_acc(logits, y) + self.train_acc(preds, y) self.log('train_acc', self.train_acc, on_step=True, on_epoch=False) def validation_step(self, batch, batch_idx): @@ -88,7 +90,7 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v def training_step(self, batch, batch_idx): data, target = batch - pred = self(data) + preds = self(data) ... return {'loss' : loss, 'preds' : preds, 'target' : target} @@ -137,6 +139,56 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us To change this, after initializing the metric, the method ``.persistent(mode)`` can be used to enable (``mode=True``) or disable (``mode=False``) this behaviour. +******************* +Metrics and devices +******************* + +Metrics are simple subclasses of :class:`~torch.nn.Module` and their metric states behave +similar to buffers and parameters of modules. This means that metrics states should +be moved to the same device as the input of the metric: + +.. code-block:: python + + import torch + from pytorch_lightning.metrics import Accuracy + + target = torch.tensor([1, 1, 0, 0], device=torch.device("cuda", 0)) + preds = torch.tensor([0, 1, 0, 0], device=torch.device("cuda", 0)) + + # Metric states are always initialized on cpu, and needs to be moved to + # the correct device + confmat = Accuracy(num_classes=2).to(torch.device("cuda", 0)) + out = confmat(preds, target) + print(out.device) # cuda:0 + +However, when **properly defined** inside a :class:`~pytorch_lightning.core.lightning.LightningModule` +, Lightning will automatically move the metrics to the same device as the data. Being +**properly defined** means that the metric is correctly identified as a child module of the +model (check ``.children()`` attribute of the model). Therefore, metrics cannot be placed +in native python ``list`` and ``dict``, as they will not be correctly identified +as child modules. Instead of ``list`` use :class:`~torch.nn.ModuleList` and instead of +``dict`` use :class:`~torch.nn.ModuleDict`. + +.. 
testcode:: + + class MyModule(LightningModule): + def __init__(self): + ... + # valid ways metrics will be identified as child modules + self.metric1 = pl.metrics.Accuracy() + self.metric2 = torch.nn.ModuleList(pl.metrics.Accuracy()) + self.metric3 = torch.nn.ModuleDict({'accuracy': Accuracy()}) + + def training_step(self, batch, batch_idx): + # all metrics will be on the same device as the input batch + data, target = batch + preds = self(data) + ... + val1 = self.metric1(preds, target) + val2 = self.metric2[0](preds, target) + val3 = self.metric3['accuracy'](preds, target) + + ********************* Implementing a Metric ********************* @@ -212,7 +264,7 @@ Classification Metrics Input types ----------- -For the purposes of classification metrics, inputs (predictions and targets) are split +For the purposes of classification metrics, inputs (predictions and targets) are split into these categories (``N`` stands for the batch size and ``C`` for number of classes): .. csv-table:: \*dtype ``binary`` means integers that are either 0 or 1 @@ -227,10 +279,10 @@ into these categories (``N`` stands for the batch size and ``C`` for number of c "Multi-dimensional multi-class with probabilities", "(N, C, ...)", "``float``", "(N, ...)", "``int``" .. note:: - All dimensions of size 1 (except ``N``) are "squeezed out" at the beginning, so + All dimensions of size 1 (except ``N``) are "squeezed out" at the beginning, so that, for example, a tensor of shape ``(N, 1)`` is treated as ``(N, )``. -When predictions or targets are integers, it is assumed that class labels start at 0, i.e. +When predictions or targets are integers, it is assumed that class labels start at 0, i.e. the possible class labels are 0, 1, 2, 3, etc. Below are some examples of different input types .. testcode:: diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b3e0b905f27f4..fff32850b9466 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -58,10 +58,10 @@ This will make your code scale to any arbitrary number of GPUs or TPUs with Ligh z = torch.Tensor(2, 3) z = z.type_as(x) -The :class:`~pytorch_lightning.core.lightning.LightningModule` knows what device it is on. You can access the reference via `self.device`. +The :class:`~pytorch_lightning.core.lightning.LightningModule` knows what device it is on. You can access the reference via ``self.device``. Sometimes it is necessary to store tensors as module attributes. However, if they are not parameters they will remain on the CPU even if the module gets moved to a new device. To prevent that and remain device agnostic, -register the tensor as a buffer in your modules's `__init__` method with :meth:`~torch.nn.Module.register_buffer`. +register the tensor as a buffer in your modules's ``__init__`` method with :meth:`~torch.nn.Module.register_buffer`. .. testcode:: @@ -75,8 +75,8 @@ register the tensor as a buffer in your modules's `__init__` method with :meth:` Remove samplers ^^^^^^^^^^^^^^^ -In PyTorch, you must use `torch.nn.DistributedSampler` for multi-node or TPU training. The -sampler makes sure each GPU sees the appropriate part of your data. +In PyTorch, you must use :class:`~torch.utils.data.distributed.DistributedSampler` +for multi-node or TPU training. The sampler makes sure each GPU sees the appropriate part of your data. .. testcode:: @@ -99,7 +99,11 @@ Lightning adds the correct samplers when needed, so no need to explicitly add sa dataset = MNIST(...) return DataLoader(dataset) -.. 
note:: You can disable this behavior with `Trainer(replace_sampler_ddp=False)` +.. note:: + By default it will add ``shuffle=True`` for train sampler and ``shuffle=False`` for val/test sampler. + ``drop_last`` in :class:`~torch.utils.data.distributed.DistributedSampler` will be set to its default value in PyTorch. + +.. note:: You can disable this behavior with ``Trainer(replace_sampler_ddp=False)`` .. note:: For iterable datasets, we don't do this automatically. @@ -108,7 +112,7 @@ Synchronize validation and test logging ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When running in distributed mode, we have to ensure that the validation and test step logging calls are synchronized across processes. -This is done by adding `sync_dist=True` to all `self.log` calls in the validation and test step. +This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the validation and test step. This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers. Note if you use any built in metrics or custom metrics that use the :ref:`Metrics API `, these do not need to be updated and are automatically handled for you. @@ -229,41 +233,24 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`. .. note:: - When specifying number of gpus as an integer `gpus=k`, setting the trainer flag - `auto_select_gpus=True` will automatically help you find `k` gpus that are not + When specifying number of gpus as an integer ``gpus=k``, setting the trainer flag + ``auto_select_gpus=True`` will automatically help you find ``k`` gpus that are not occupied by other processes. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. For more details see the :ref:`Trainer guide `. - -Remove CUDA flags -^^^^^^^^^^^^^^^^^ - -CUDA flags make certain GPUs visible to your script. -Lightning sets these for you automatically, there's NO NEED to do this yourself. - -.. testcode:: - - # lightning will set according to what you give the trainer - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - -However, when using a cluster, Lightning will NOT set these flags (and you should not either). -SLURM will set these for you. -For more details see the :ref:`SLURM cluster guide `. - ---------- Distributed modes ----------------- Lightning allows multiple ways of training -- Data Parallel (`accelerator='dp'`) (multiple-gpus, 1 machine) -- DistributedDataParallel (`accelerator='ddp'`) (multiple-gpus across many machines (python script based)). -- DistributedDataParallel (`accelerator='ddp_spawn'`) (multiple-gpus across many machines (spawn based)). -- DistributedDataParallel 2 (`accelerator='ddp2'`) (DP in a machine, DDP across machines). -- Horovod (`accelerator='horovod'`) (multi-machine, multi-gpu, configured at runtime) -- TPUs (`tpu_cores=8|x`) (tpu or TPU pod) +- Data Parallel (``accelerator='dp'``) (multiple-gpus, 1 machine) +- DistributedDataParallel (``accelerator='ddp'``) (multiple-gpus across many machines (python script based)). +- DistributedDataParallel (``accelerator='ddp_spawn'``) (multiple-gpus across many machines (spawn based)). +- DistributedDataParallel 2 (``accelerator='ddp2'``) (DP in a machine, DDP across machines). +- Horovod (``accelerator='horovod'``) (multi-machine, multi-gpu, configured at runtime) +- TPUs (``tpu_cores=8|x``) (tpu or TPU pod) .. 
note:: If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used. @@ -275,7 +262,7 @@ For a deeper understanding of what Lightning is doing, feel free to read this Data Parallel ^^^^^^^^^^^^^ -`DataParallel `_ (DP) splits a batch across k GPUs. +:class:`~torch.nn.DataParallel` (DP) splits a batch across k GPUs. That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples, after which the root node will aggregate the results. @@ -289,7 +276,7 @@ after which the root node will aggregate the results. Distributed Data Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^ -`DistributedDataParallel `_ (DDP) works as follows: +:class:`~torch.nn.parallel.DistributedDataParallel` (DDP) works as follows: 1. Each GPU across each node gets its own process. @@ -375,7 +362,7 @@ project module) you can use the following method: .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = Trainer(gpus=8, accelerator='ddp') + trainer = Trainer(gpus=8, accelerator='ddp_spawn') We STRONGLY discourage this use because it has limitations (due to Python and PyTorch): @@ -576,26 +563,26 @@ not allow 16-bit and DP training. We tried to get this to work, but it's an issu Below are the possible configurations we support. -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command | -+=======+=========+====+=====+=========+============================================================+ -| Y | | | | | `Trainer(gpus=1)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| Y | | | | Y | `Trainer(gpus=1, precision=16)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command | ++=======+=========+====+=====+========+============================================================+ +| Y | | | | | `Trainer(gpus=1)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| Y | | | | Y | `Trainer(gpus=1, precision=16)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ Implement Your Own Distributed (DDP) training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you need your own way to init PyTorch DDP you can override 
:meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.init_ddp_connection`. -If you also need to use your own DDP implementation, override: :meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.configure_ddp`. +If you also need to use your own DDP implementation, override :meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.configure_ddp`. ---------- @@ -667,7 +654,7 @@ To use Sharded Training, you need to first install FairScale using the command b .. code-block:: bash - pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip + pip install fairscale .. code-block:: python @@ -694,9 +681,7 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To get started, install FairScale through extras using with ``pip install pytorch-lightning["extra"]`` - -or directly using +To get started, install FairScale using the command below. We install a specific branch which contains PyTorch related fixes for Sequential Parallelism. .. code-block:: bash diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 30e06f76ae5bd..def273f7a8257 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -132,7 +132,7 @@ Examples of systems are: - `DQN `_ - `GAN `_ - `Image classifier `_ -- Seq2seq +- Seq2seq - `SimCLR `_ - `VAE `_ @@ -195,7 +195,7 @@ First, define the data however you want. Lightning just needs a :class:`~torch.u dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) train_loader = DataLoader(dataset) - + Next, init the :ref:`lightning_module` and the PyTorch Lightning :class:`~pytorch_lightning.trainer.Trainer`, then call fit with both the data and model. @@ -268,7 +268,8 @@ Now you own the train loop! .. code-block:: python def training_step(self, batch, batch_idx, opt_idx): - (opt_a, opt_b, opt_c) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True) loss_a = self.generator(batch[0]) @@ -392,7 +393,7 @@ It's trivial to use CPUs, GPUs or TPUs in Lightning. There's **NO NEED** to chan # train on 1 GPU trainer = pl.Trainer(gpus=1) - + .. code-block:: python # train on multiple GPUs across nodes (32 gpus here) @@ -400,7 +401,7 @@ It's trivial to use CPUs, GPUs or TPUs in Lightning. There's **NO NEED** to chan gpus=4, num_nodes=8 ) - + .. code-block:: python # train on gpu 1, 3, 5 (3 gpus total) @@ -428,7 +429,7 @@ Without changing a SINGLE line of your code, you can now do the following with t limit_train_batches=0.5, val_check_interval=0.25 ) - + ----------- Checkpoints @@ -709,7 +710,7 @@ Lightning has many tools for debugging. Here is an example of just a few of them .. code-block:: python - # Automatically overfit the sane batch of your model for a sanity test + # Automatically overfit the sane batch of your model for a sanity test trainer = pl.Trainer(overfit_batches=1) .. code-block:: python @@ -719,7 +720,7 @@ Lightning has many tools for debugging. Here is an example of just a few of them trainer = pl.Trainer(fast_dev_run=True) .. code-block:: python - + # train only 20% of an epoch trainer = pl.Trainer(limit_train_batches=0.2) @@ -729,10 +730,10 @@ Lightning has many tools for debugging. Here is an example of just a few of them trainer = pl.Trainer(val_check_interval=0.25) .. 
code-block:: python - + # Profile your code to find speed/memory bottlenecks Trainer(profiler=True) - + --------------- ******************** diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 06e6e9679d29f..588bdefb367e3 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -28,8 +28,15 @@ to manually manage the optimization process. To do so, do the following: .. code-block:: python def training_step(self, batch, batch_idx, optimizer_idx): - # ignore optimizer_idx - (opt_g, opt_d) = self.optimizers() + + # 1. ignore optimizer_idx + # 2. `use_pl_optimizer=True` means `opt_g` and `opt_d` will be of type `LightningOptimizer` + # `LightningOptimizer` simply wraps your optimizer and behaves the same way! + # When calling `optimizer.step`, `LightningOptimizer` will handle TPU, AMP, accumulate_grad_batches, etc. for you. + + # access your optimizers with `use_pl_optimizer=False` or `optimizer.optimizer` when using `use_pl_optimizer=True` + # `use_pl_optimizer=True` is the default + (opt_g, opt_d) = self.optimizers(use_pl_optimizer=True) # do anything you want loss_a = ... @@ -67,13 +74,13 @@ Under the hood Lightning does the following: .. code-block:: python for epoch in epochs: - for batch id data: + for batch in data: loss = model.training_step(batch, batch_idx, ...) loss.backward() optimizer.step() optimizer.zero_grad() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() In the case of multiple optimizers, Lightning does the following: @@ -87,7 +94,7 @@ In the case of multiple optimizers, Lightning does the following: train_step(opt) opt.step() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() @@ -179,7 +186,7 @@ Lightning will call each optimizer sequentially: train_step(opt) opt.step() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() ---------- @@ -201,12 +208,12 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch # Alternating schedule for optimizer steps (ie: GANs) def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps - if optimizer_i == 0: + if optimizer_idx == 0: if batch_nb % 2 == 0 : optimizer.step(closure=closure) # update discriminator opt every 4 steps - if optimizer_i == 1: + if optimizer_idx == 1: if batch_nb % 4 == 0 : optimizer.step(closure=closure) @@ -220,11 +227,11 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch # Alternating schedule for optimizer steps (ie: GANs) def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps - if optimizer_i == 0: + if optimizer_idx == 0: optimizer.step(closure=closure, make_optimizer_step=(batch_nb % 2) == 0) # update discriminator opt every 4 steps - if optimizer_i == 1: + if optimizer_idx == 1: optimizer.step(closure=closure, make_optimizer_step=(batch_nb % 4) == 0) Here we add a learning-rate warm up @@ -242,19 +249,29 @@ Here we add a learning-rate warm up # update params optimizer.step(closure=closure) -The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. +.. note:: The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step.
It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more ... + +.. testcode:: + + # function hook in LightningModule + def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=closure) + +.. note:: To access your wrapped optimizer from ``LightningOptimizer``, do as follows. .. testcode:: - from pytorch_lightning.core.optimizer import LightningOptimizer - # function hook in LightningModule def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - if not isinstance(optimizer, LightningOptimizer): - # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) + + # `optimizer` is a ``LightningOptimizer`` wrapping the optimizer. + # To access it, do as follows: + optimizer = optimizer.optimizer + + # run step. However, it won't work on TPU, AMP, etc... optimizer.step(closure=closure) + ---------- Using the closure functions for optimization diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 93fefad0d0e35..759a671cc42ef 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -2,7 +2,7 @@ from torch.utils.data import IterableDataset from pytorch_lightning.trainer.trainer import Trainer - + .. _sequences: Sequential Data diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index be40810c3f944..da6de596db5a2 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -1,7 +1,7 @@ .. testsetup:: * from pytorch_lightning.trainer.trainer import Trainer - + .. _slurm: Computing cluster (SLURM) diff --git a/docs/source/test_set.rst b/docs/source/test_set.rst index 8d8edce672e11..d9e989a4182f3 100644 --- a/docs/source/test_set.rst +++ b/docs/source/test_set.rst @@ -3,6 +3,10 @@ Test set ======== Lightning forces the user to run the test set separately to make sure it isn't evaluated by mistake. +Testing is performed using the ``trainer`` object's ``.test()`` method. + +.. automethod:: pytorch_lightning.trainer.Trainer.test + :noindex: ---------- @@ -37,7 +41,7 @@ You can run the test set on multiple models using the same trainer instance. model1 = LitModel() model2 = GANModel() - + trainer = Trainer() trainer.test(model1) trainer.test(model2) @@ -82,4 +86,19 @@ is not available at the time your model was declared. trainer.test(test_dataloaders=test) You can either pass in a single dataloader or a list of them. This optional named -parameter can be used in conjunction with any of the above use cases. +parameter can be used in conjunction with any of the above use cases. Additionally, +you can also pass in a :ref:`datamodules` that has overridden the +:ref:`datamodule-test-dataloader-label` method. + +.. code-block:: python + + class MyDataModule(pl.LightningDataModule): + ... + def test_dataloader(self): + return DataLoader(...) + + # setup your datamodule + dm = MyDataModule(...) + + # test (pass in datamodule) + trainer.test(datamodule=dm) diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 0748302f30613..d461c30a20a6f 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -141,9 +141,9 @@ So you can run it like so: .. note:: If you want to stop a training run early, you can press "Ctrl + C" on your keyboard.
- The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including - running callbacks such as `on_train_end`. The trainer object will also set an attribute - `interrupted` to `True` in such cases. If you have a callback which shuts down compute + The trainer will catch the ``KeyboardInterrupt`` and attempt a graceful shutdown, including + running callbacks such as ``on_train_end``. The trainer object will also set an attribute + ``interrupted`` to ``True`` in such cases. If you have a callback which shuts down compute resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. ------------ @@ -155,7 +155,7 @@ Once you're done training, feel free to run the test set! .. code-block:: python - trainer.test(test_dataloader=test_dataloader) + trainer.test(test_dataloaders=test_dataloader) ------------ @@ -220,13 +220,13 @@ accelerator The accelerator backend to use (previously known as distributed_backend). -- (```dp```) is DataParallel (split batch among GPUs of same machine) -- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) -- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. +- (``'dp'``) is DataParallel (split batch among GPUs of same machine) +- (``'ddp'``) is DistributedDataParallel (each gpu on each node trains, and syncs grads) +- (``'ddp_cpu'``) is DistributedDataParallel on CPU (same as ``'ddp'``, but does not use GPUs. Useful for multi-node CPU training or single-node debugging. Note that this will **not** give a speedup on a single node, since Torch already makes efficient use of multiple CPUs on a single machine.) -- (```ddp2```) dp on node, ddp across nodes. Useful for things like increasing +- (``'ddp2'``) dp on node, ddp across nodes. Useful for things like increasing the number of negative samples .. testcode:: @@ -245,7 +245,7 @@ Example:: # ddp2 = DistributedDataParallel + dp trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') -.. note:: This option does not apply to TPU. TPUs use ```ddp``` by default (over each core) +.. note:: This option does not apply to TPU. TPUs use ``'ddp'`` by default (over each core) You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. @@ -335,7 +335,8 @@ optimizer behavior Example:: def training_step(self, batch, batch_idx): - opt = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + opt = self.optimizers(use_pl_optimizer=True) loss = ... self.manual_backward(loss, opt) @@ -350,7 +351,8 @@ In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizer Example:: def training_step(self, batch, batch_idx, optimizer_idx): - (opt_a, opt_b) = self.optimizers() + # access your optimizers with use_pl_optimizer=False. Default is True + (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) gen_loss = ... self.manual_backward(gen_loss, opt_a) @@ -619,7 +621,7 @@ will need to be set up to use remote filepaths. distributed_backend ^^^^^^^^^^^^^^^^^^^ -This has been renamed "accelerator". +Deprecated: This has been renamed ``accelerator``. fast_dev_run ^^^^^^^^^^^^ @@ -666,9 +668,30 @@ Under the hood the pseudocode looks like this when running *fast_dev_run* with a .. note:: This argument is a bit different from ``limit_train/val/test_batches``. Setting this argument will - disable tuner, logger callbacks like ``LearningRateLogger`` and runs for only 1 epoch. This must be - used only for debugging purposes. 
``limit_train/val/test_batches`` only limits the number of batches and won't - disable anything. + disable tuner, checkpoint callbacks, early stopping callbacks, loggers and logger callbacks like + ``LearningRateLogger`` and runs for only 1 epoch. This must be used only for debugging purposes. + ``limit_train/val/test_batches`` only limits the number of batches and won't disable anything. + +flush_logs_every_n_steps +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Writes logs to disk this often. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(flush_logs_every_n_steps=100) + +See Also: + - :ref:`logging` gpus ^^^^ @@ -736,6 +759,35 @@ Gradient clipping value # default used by the Trainer trainer = Trainer(gradient_clip_val=0.0) +limit_train_batches +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of training dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + +Example:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + + # run through only 25% of the training set each epoch + trainer = Trainer(limit_train_batches=0.25) + + # run through only 10 batches of the training set each epoch + trainer = Trainer(limit_train_batches=10) limit_test_batches ^^^^^^^^^^^^^^^^^^ @@ -790,6 +842,28 @@ Useful when debugging or testing something that happens at the end of an epoch. In the case of multiple validation dataloaders, the limit applies to each dataloader individually. +log_every_n_steps +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + + +How often to add logging rows (does not write to disk) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(log_every_n_steps=50) + +See Also: + - :ref:`logging` + log_gpu_memory ^^^^^^^^^^^^^^ @@ -818,28 +892,7 @@ Options: # log only the min and max memory on the master node trainer = Trainer(log_gpu_memory='min_max') -.. note:: Might slow performance because it uses the output of nvidia-smi. - -flush_logs_every_n_steps -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Writes logs to disk this often. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(flush_logs_every_n_steps=100) - -See Also: - - :ref:`logging` +.. note:: Might slow performance because it uses the output of ``nvidia-smi``. logger ^^^^^^ @@ -1019,6 +1072,32 @@ The Trainer uses 2 steps by default. Turn it off or modify it here. This option will reset the validation dataloader unless ``num_sanity_val_steps=0``. +overfit_batches +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. +If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. + +Useful for quickly debugging or trying to overfit on purpose. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(overfit_batches=0.0) + + # use only 1% of the train set (and use the train set for val and test) + trainer = Trainer(overfit_batches=0.01) + + # overfit on 10 of the same batches + trainer = Trainer(overfit_batches=10) plugins ^^^^^^^ @@ -1079,89 +1158,6 @@ If False will only call from NODE_RANK=0, LOCAL_RANK=0 # use only NODE_RANK=0, LOCAL_RANK=0 Trainer(prepare_data_per_node=False) -tpu_cores -^^^^^^^^^ - -.. raw:: html - - - -| - -- How many TPU cores to train on (1 or 8). -- Which TPU core to train on [1-8] - -A single TPU v2 or v3 has 8 cores. 
A TPU pod has -up to 2048 cores. A slice of a POD means you get as many cores -as you request. - -Your effective batch size is batch_size * total tpu cores. - -.. note:: No need to add a DistributedDataSampler, Lightning automatically does it for you. - -This parameter can be either 1 or 8. - -Example:: - - # your_trainer_file.py - - # default used by the Trainer (ie: train on CPU) - trainer = Trainer(tpu_cores=None) - - # int: train on a single core - trainer = Trainer(tpu_cores=1) - - # list: train on a single selected core - trainer = Trainer(tpu_cores=[2]) - - # int: train on all cores few cores - trainer = Trainer(tpu_cores=8) - - # for 8+ cores must submit via xla script with - # a max of 8 cores specified. The XLA script - # will duplicate script onto each TPU in the POD - trainer = Trainer(tpu_cores=8) - -To train on more than 8 cores (ie: a POD), -submit this script using the xla_dist script. - -Example:: - - python -m torch_xla.distributed.xla_dist - --tpu=$TPU_POD_NAME - --conda-env=torch-xla-nightly - --env=XLA_USE_BF16=1 - -- python your_trainer_file.py - -overfit_batches -^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. -If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. - -Useful for quickly debugging or trying to overfit on purpose. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(overfit_batches=0.0) - - # use only 1% of the train set (and use the train set for val and test) - trainer = Trainer(overfit_batches=0.01) - - # overfit on 10 of the same batches - trainer = Trainer(overfit_batches=10) - precision ^^^^^^^^^ @@ -1334,7 +1330,8 @@ resume_from_checkpoint | -To resume training from a specific checkpoint pass in the path here. +To resume training from a specific checkpoint pass in the path here. If resuming from a mid-epoch +checkpoint, training will start from the beginning of the next epoch. .. testcode:: @@ -1344,29 +1341,6 @@ To resume training from a specific checkpoint pass in the path here. # resume from a specific checkpoint trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt') -log_every_n_steps -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - - -How often to add logging rows (does not write to disk) - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(log_every_n_steps=50) - -See Also: - - :ref:`logging` - - sync_batchnorm ^^^^^^^^^^^^^^ @@ -1406,35 +1380,63 @@ track_grad_norm # track the 2-norm trainer = Trainer(track_grad_norm=2) -limit_train_batches -^^^^^^^^^^^^^^^^^^^ +tpu_cores +^^^^^^^^^ .. raw:: html + poster="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/thumb/tpu_cores.jpg" + src="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/tpu_cores.mp4"> | -How much of training dataset to check. -Useful when debugging or testing something that happens at the end of an epoch. +- How many TPU cores to train on (1 or 8). +- Which TPU core to train on [1-8] -.. testcode:: +A single TPU v2 or v3 has 8 cores. A TPU pod has +up to 2048 cores. A slice of a POD means you get as many cores +as you request. - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) +Your effective batch size is batch_size * total tpu cores. + +.. note:: + No need to add a :class:`~torch.utils.data.distributed.DistributedSampler`, + Lightning automatically does it for you. 
+ +This parameter can be either 1 or 8. Example:: - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) + # your_trainer_file.py - # run through only 25% of the training set each epoch - trainer = Trainer(limit_train_batches=0.25) + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(tpu_cores=None) - # run through only 10 batches of the training set each epoch - trainer = Trainer(limit_train_batches=10) + # int: train on a single core + trainer = Trainer(tpu_cores=1) + + # list: train on a single selected core + trainer = Trainer(tpu_cores=[2]) + + # int: train on all cores few cores + trainer = Trainer(tpu_cores=8) + + # for 8+ cores must submit via xla script with + # a max of 8 cores specified. The XLA script + # will duplicate script onto each TPU in the POD + trainer = Trainer(tpu_cores=8) + +To train on more than 8 cores (ie: a POD), +submit this script using the xla_dist script. + +Example:: + + python -m torch_xla.distributed.xla_dist + --tpu=$TPU_POD_NAME + --conda-env=torch-xla-nightly + --env=XLA_USE_BF16=1 + -- python your_trainer_file.py truncated_bptt_steps ^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 10ee668a97fa8..d7230a1fd687a 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -130,4 +130,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file +For more information, refer to :ref:`sequential-parallelism`. diff --git a/docs/source/transfer_learning.rst b/docs/source/transfer_learning.rst index ba44203721b98..bf5d4fc5d6e05 100644 --- a/docs/source/transfer_learning.rst +++ b/docs/source/transfer_learning.rst @@ -1,7 +1,7 @@ .. testsetup:: * from pytorch_lightning.core.lightning import LightningModule - + Transfer Learning ----------------- @@ -52,16 +52,22 @@ Example: Imagenet (computer Vision) class ImagenetTransferLearning(LightningModule): def __init__(self): + super().__init__() + # init a pretrained resnet - num_target_classes = 10 - self.feature_extractor = models.resnet50(pretrained=True) - self.feature_extractor.eval() + backbone = models.resnet50(pretrained=True) + num_filters = backbone.fc.in_features + layers = list(backbone.children())[:-1] + self.feature_extractor = torch.nn.Sequential(*layers) # use the pretrained model to classify cifar-10 (10 image classes) - self.classifier = nn.Linear(2048, num_target_classes) + num_target_classes = 10 + self.classifier = nn.Linear(num_filters, num_target_classes) def forward(self, x): - representations = self.feature_extractor(x) + self.feature_extractor.eval() + with torch.no_grad(): + representations = self.feature_extractor(x).flatten(1) x = self.classifier(representations) ... diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index f22e355a09d17..1c8babd72ed18 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -92,7 +92,7 @@ You can also control more advanced options, like `save_top_k`, to save the best ) trainer = Trainer(callbacks=[checkpoint_callback]) - + You can retrieve the checkpoint after training by calling .. 
code-block:: python diff --git a/environment.yml b/environment.yml index 3d59c1eeed0dd..c019580c0b4d0 100644 --- a/environment.yml +++ b/environment.yml @@ -26,11 +26,11 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.16.4 - - pytorch>=1.3,<1.8 + - pytorch>=1.3 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 - - fsspec>=0.8.0 + - fsspec[http]>=0.8.1 #- tensorboard>=2.2.0 # not needed, already included in pytorch # Optional @@ -41,7 +41,7 @@ dependencies: - torchtext>=0.3.1 # Examples - - torchvision>=0.4.1,<0.9.0 + - torchvision>=0.4.1 - pip: - test-tube>=0.7.5 diff --git a/legacy/README.md b/legacy/README.md new file mode 100644 index 0000000000000..3ce6d15f65568 --- /dev/null +++ b/legacy/README.md @@ -0,0 +1,17 @@ +# Maintaining backward compatibility with some legacy versions + +The aim of this section is to set some baselines and workflows/guidelines for maintaining backward compatibility with some legacy versions of PL. + +At the moment we focus on the ability to run old checkpoints, so the flow here is to create a checkpoint with every release, store it in our public AWS storage, and have each CI run pull this archive and test loading and resuming training with this model. + +If you want to pull all saved version-checkpoints for local testing/development, call +```bash +wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip +unzip -o checkpoints.zip +``` + +To back-populate the collection with past versions you can use the following bash: +```bash +bash generate_checkpoints.sh 1.0.2 1.0.3 1.0.4 +zip -r checkpoints.zip checkpoints/ +``` diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py b/legacy/checkpoints/.gitkeep similarity index 100% rename from pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py rename to legacy/checkpoints/.gitkeep diff --git a/legacy/generate_checkpoints.sh b/legacy/generate_checkpoints.sh new file mode 100644 index 0000000000000..7726c5b097c5c --- /dev/null +++ b/legacy/generate_checkpoints.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Sample call: +# bash generate_checkpoints.sh 1.0.2 1.0.3 1.0.4 + +LEGACY_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +echo $LEGACY_PATH +# install some PT version here so it does not need to be reinstalled for each env +pip install virtualenv "torch==1.5" --quiet --no-cache-dir + +ENV_PATH="$LEGACY_PATH/vEnv" + +# iterate over all arguments assuming that each argument is a version +for ver in "$@" +do + echo "processing version: $ver" + # mkdir "$LEGACY_PATH/$ver" + + # create local env + echo $ENV_PATH + virtualenv $ENV_PATH --system-site-packages + # activate and install PL version + source "$ENV_PATH/bin/activate" + # there are problems loading ckpts in older versions if they were saved by newer versions + pip install "pytorch_lightning==$ver" "torch==1.3" --quiet --no-cache-dir + + python --version + pip --version + pip list | grep torch + + python "$LEGACY_PATH/zero_training.py" + cp "$LEGACY_PATH/zero_training.py" ${LEGACY_PATH}/checkpoints/${ver} + + mv ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs/version_0/checkpoints/*.ckpt ${LEGACY_PATH}/checkpoints/${ver}/ + rm -rf ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs + + deactivate + # clear env + rm -rf $ENV_PATH + +done diff --git a/legacy/zero_training.py b/legacy/zero_training.py new file mode 100644 index 0000000000000..0115df4143460 --- /dev/null +++ b/legacy/zero_training.py @@ -0,0 +1,93 @@ +# Copyright The PyTorch Lightning team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +from torch.utils.data import Dataset + +import pytorch_lightning as pl + +PATH_LEGACY = os.path.dirname(__file__) + + +class RandomDataset(Dataset): + def __init__(self, size, length: int = 100): + self.len = length + self.data = torch.randn(length, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +class DummyModel(pl.LightningModule): + + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + def _loss(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + + def _step(self, batch, batch_idx): + output = self.layer(batch) + loss = self._loss(batch, output) + # return {'loss': loss} # used for PL<1.0 + return loss # used for PL >= 1.0 + + def training_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + self._step(batch, batch_idx) + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def train_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def val_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + +def main_train(dir_path, max_epochs: int = 5): + + trainer = pl.Trainer( + default_root_dir=dir_path, + checkpoint_callback=True, + max_epochs=max_epochs, + ) + + model = DummyModel() + trainer.fit(model) + + +if __name__ == '__main__': + path_dir = os.path.join(PATH_LEGACY, 'checkpoints', str(pl.__version__)) + main_train(path_dir) diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb index 037b24e4ddd9d..d52af84a76d97 100644 --- a/notebooks/04-transformers-text-classification.ipynb +++ b/notebooks/04-transformers-text-classification.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index 6413e8239bb2e..da044a9c9b5c6 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/06-mnist-tpu-training.ipynb b/notebooks/06-mnist-tpu-training.ipynb new file mode 100644 index 0000000000000..9628c8e31879b --- /dev/null 
+++ b/notebooks/06-mnist-tpu-training.ipynb @@ -0,0 +1,368 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "06-mnist-tpu-training.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "WsWdLFMVKqbi" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qXO1QLkbRXl0" + }, + "source": [ + "# TPU training with PyTorch Lightning ⚡\n", + "\n", + "In this notebook, we'll train a model on TPUs. Changing one line of code is all you need to that.\n", + "\n", + "The most up to documentation related to TPU training can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/tpu.html).\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", + " - Ask a question on our [official forum](https://forums.pytorchlightning.ai/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UmKX0Qa1RaLL" + }, + "source": [ + "### Setup\n", + "\n", + "Lightning is easy to install. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vAWOr0FZRaIj" + }, + "source": [ + "! pip install pytorch-lightning -qU" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zepCr1upT4Z3" + }, + "source": [ + "### Install Colab TPU compatible PyTorch/TPU wheels and dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AYGWh10lRaF1" + }, + "source": [ + "! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SNHa7DpmRZ-C" + }, + "source": [ + "import torch\n", + "from torch import nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import random_split, DataLoader\n", + "\n", + "# Note - you must have torchvision installed for this example\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rjo1dqzGUxt6" + }, + "source": [ + "### Defining The `MNISTDataModule`\n", + "\n", + "Below we define `MNISTDataModule`. You can learn more about datamodules in [docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html) and [datamodule notebook](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/notebooks/02-datamodules.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pkbrm3YgUxlE" + }, + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nr9AqDWxUxdK" + }, + "source": [ + "### Defining the `LitModel`\n", + "\n", + "Below, we define the model `LitMNIST`." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YKt0KZkOUxVY" + }, + "source": [ + "class LitModel(pl.LightningModule):\n", + " \n", + " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " self.save_hyperparameters()\n", + "\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " self.log('train_loss', loss, prog_bar=False)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)\n", + " return optimizer" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uxl88z06cHyV" + }, + "source": [ + "### TPU Training\n", + "\n", + "Lightning supports training on a single TPU core or 8 TPU cores.\n", + "\n", + "The Trainer parameters `tpu_cores` defines how many TPU cores to train on (1 or 8) / Single TPU core to train on [1].\n", + "\n", + "For Single TPU training, Just pass the TPU core ID [1-8] in a list. Setting `tpu_cores=[5]` will train on TPU core ID 5." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZ647Xg2gYng" + }, + "source": [ + "Train on TPU core ID 5 with `tpu_cores=[5]`." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bzhJ8g_vUxN2" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=[5])\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "slMq_0XBglzC" + }, + "source": [ + "Train on single TPU core with `tpu_cores=1`." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "31N5Scf2RZ61" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=1)\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_v8xcU5Sf_Cv" + }, + "source": [ + "Train on 8 TPU cores with `tpu_cores=8`. You might have to restart the notebook to run it on 8 TPU cores after training on single TPU core." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EFEw7YpLf-gE" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=8)\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m2mhgEgpRZ1g" + }, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ] +} diff --git a/notebooks/06-cifar10-baseline.ipynb b/notebooks/07-cifar10-baseline.ipynb similarity index 99% rename from notebooks/06-cifar10-baseline.ipynb rename to notebooks/07-cifar10-baseline.ipynb index d4b2209cc91b6..7adabf382163e 100644 --- a/notebooks/06-cifar10-baseline.ipynb +++ b/notebooks/07-cifar10-baseline.ipynb @@ -4,7 +4,7 @@ "metadata": { "accelerator": "GPU", "colab": { - "name": "06_cifar10_baseline.ipynb", + "name": "07-cifar10-baseline.ipynb", "provenance": [], "collapsed_sections": [] }, diff --git a/notebooks/README.md b/notebooks/README.md index 5d0f3564e9387..a72e154c36410 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -11,4 +11,5 @@ You can easily run any of the official notebooks by clicking the 'Open in Colab' | **GAN** | Train a GAN on the MNIST Dataset. Learn how to use multiple optimizers in Lightning. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) | | **BERT** | Fine-tune HuggingFace Transformers models on the GLUE Benchmark | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) | | **Trainer Flags** | Overview of the available Lightning `Trainer` flags | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/05-trainer-flags-overview.ipynb) | -| **94% Baseline CIFAR10** | Establish a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-cifar10-baseline.ipynb) | +| **TPU Training** | Train a model on MNIST using TPUs with Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) | +| **94% Baseline CIFAR10** | Establish a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/07-cifar10-baseline.ipynb) | diff --git a/pl_examples/README.md b/pl_examples/README.md index 936f1cc3df0cf..a1cb856eb1e33 100644 --- a/pl_examples/README.md +++ b/pl_examples/README.md @@ -1,4 +1,4 @@ -# Examples +# Examples Our most robust examples showing all sorts of implementations can be found in our sister library [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2). @@ -14,6 +14,6 @@ In this folder we add 3 simple examples: --- ## Domain examples -This folder contains older examples. You should instead use the examples -in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) +This folder contains older examples. You should instead use the examples +in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) for advanced use cases. 
diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py index d7cec9fc1bc3a..147fc330ecd59 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -8,3 +8,40 @@ TORCHVISION_AVAILABLE = _module_available("torchvision") DALI_AVAILABLE = _module_available("nvidia.dali") + + +LIGHTNING_LOGO = """ + #### + ########### + #################### + ############################ + ##################################### +############################################## +######################### ################### +####################### ################### +#################### #################### +################## ##################### +################ ###################### +##################### ################# +###################### ################### +##################### ##################### +#################### ####################### +################### ######################### +############################################## + ##################################### + ############################ + #################### + ########## + #### +""" + + +def nice_print(msg, last=False): + print() + print("\033[0;35m" + msg + "\033[0m") + if last: + print() + + +def cli_lightning_logo(): + nice_print(LIGHTNING_LOGO) diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md index 18ae204396290..199c453566c6f 100644 --- a/pl_examples/basic_examples/README.md +++ b/pl_examples/basic_examples/README.md @@ -1,5 +1,5 @@ -## Basic Examples -Use these examples to test how lightning works. +## Basic Examples +Use these examples to test how lightning works. #### MNIST Trains MNIST where the model is defined inside the LightningModule. @@ -36,7 +36,7 @@ python image_classifier.py --gpus 2 python image_classifier.py --gpus 2 --distributed_backend 'dp' ``` ---- +--- #### Autoencoder Showing the power of a system... arbitrarily complex training loops ```bash @@ -49,23 +49,23 @@ python autoencoder.py --gpus 2 # dataparallel python autoencoder.py --gpus 2 --distributed_backend 'dp' ``` ---- -# Multi-node example +--- +# Multi-node example This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total). To run this demo do the following: -1. Log into the jumphost node of your SLURM-managed cluster. -2. Create a conda environment with Lightning and a GPU PyTorch version. -3. Choose a script to submit +1. Log into the jumphost node of your SLURM-managed cluster. +2. Create a conda environment with Lightning and a GPU PyTorch version. +3. Choose a script to submit -#### DDP +#### DDP Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each) ```bash sbatch submit_ddp_job.sh YourEnv ``` -#### DDP2 +#### DDP2 Submit this job to run with a different implementation of DistributedDataParallel. In this version, each node acts like DataParallel but syncs across nodes like DDP. 
```bash diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 58a117a648458..e1b284856c3bc 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -17,20 +17,26 @@ import torch import torch.nn.functional as F from torch import nn -from torch.utils.data import DataLoader -from torch.utils.data import random_split +from torch.utils.data import DataLoader, random_split import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE +from pl_examples import cli_lightning_logo, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST class LitAutoEncoder(pl.LightningModule): + """ + >>> LitAutoEncoder() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitAutoEncoder( + (encoder): ... + (decoder): ... + ) + """ def __init__(self): super().__init__() @@ -105,4 +111,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index 91a8481de7fd9..c4dd4fdc3a478 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -19,16 +19,23 @@ from torch.utils.data import DataLoader, random_split import pytorch_lightning as pl -from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE +from pl_examples import cli_lightning_logo, DATASETS_PATH, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST class Backbone(torch.nn.Module): + """ + >>> Backbone() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Backbone( + (l1): Linear(...) + (l2): Linear(...) + ) + """ def __init__(self, hidden_dim=128): super().__init__() self.l1 = torch.nn.Linear(28 * 28, hidden_dim) @@ -42,6 +49,12 @@ def forward(self, x): class LitClassifier(pl.LightningModule): + """ + >>> LitClassifier(Backbone()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitClassifier( + (backbone): ... + ) + """ def __init__(self, backbone, learning_rate=1e-3): super().__init__() self.save_hyperparameters() @@ -125,4 +138,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 36c8c2c1f69b3..39634084860c2 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -29,6 +29,7 @@ import torchvision import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin @@ -54,6 +55,12 @@ def forward(self, x): class LitResnet(pl.LightningModule): + """ + >>> LitResnet() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitResnet( + (sequential_module): Sequential(...) 
+ ) + """ def __init__(self, lr=0.05, batch_size=32, manual_optimization=False): super().__init__() @@ -190,6 +197,7 @@ def instantiate_datamodule(args): if __name__ == "__main__": + cli_lightning_logo() parser = ArgumentParser(description="Pipe Example") parser.add_argument("--use_ddp_sequential", action="store_true") parser = Trainer.add_argparse_args(parser) diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index 0a39f1cb9a9ae..cfa146911dd1b 100644 --- a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import ABC from argparse import ArgumentParser +from distutils.version import LooseVersion from random import shuffle from warnings import warn @@ -22,21 +23,26 @@ from torch.utils.data import random_split import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE, DALI_AVAILABLE +from pl_examples import cli_lightning_logo, DALI_AVAILABLE, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST if DALI_AVAILABLE: - import nvidia.dali.ops as ops + from nvidia.dali import __version__ as dali_version + from nvidia.dali import ops from nvidia.dali.pipeline import Pipeline from nvidia.dali.plugin.pytorch import DALIClassificationIterator + + NEW_DALI_API = LooseVersion(dali_version) >= LooseVersion('0.28.0') + if NEW_DALI_API: + from nvidia.dali.plugin.base_iterator import LastBatchPolicy else: warn('NVIDIA DALI is not available') - ops, Pipeline, DALIClassificationIterator = ..., ABC, ABC + ops, Pipeline, DALIClassificationIterator, LastBatchPolicy = ..., ABC, ABC, ABC class ExternalMNISTInputIterator(object): @@ -97,11 +103,18 @@ def __init__( dynamic_shape=False, last_batch_padded=False, ): - super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, dynamic_shape, last_batch_padded) + if NEW_DALI_API: + last_batch_policy = LastBatchPolicy.FILL if fill_last_batch else LastBatchPolicy.DROP + super().__init__(pipelines, size, reader_name, auto_reset, dynamic_shape, + last_batch_policy=last_batch_policy, last_batch_padded=last_batch_padded) + else: + super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, + dynamic_shape, last_batch_padded) + self._fill_last_batch = fill_last_batch def __len__(self): batch_count = self._size // (self._num_gpus * self.batch_size) - last_batch = 1 if self._fill_last_batch else 0 + last_batch = 1 if self._fill_last_batch else 1 return batch_count + last_batch @@ -178,7 +191,7 @@ def cli_main(): eii_test = ExternalMNISTInputIterator(mnist_test, args.batch_size) pipe_train = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_train, num_threads=2, device_id=0) - train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=False) + train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=True) pipe_val = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_val, num_threads=2, device_id=0) val_loader = DALIClassificationLoader(pipe_val, size=len(mnist_val), auto_reset=True, fill_last_batch=False) @@ -204,4 +217,5 @@ def cli_main(): if __name__ == "__main__": + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/mnist_datamodule.py 
b/pl_examples/basic_examples/mnist_datamodule.py index eb1415cf8b981..27a7590b64ee9 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import platform from typing import Optional from torch.utils.data import DataLoader, random_split @@ -29,6 +29,9 @@ class MNISTDataModule(LightningDataModule): """ Standard MNIST, train, val, test splits and transforms + + >>> MNISTDataModule() # doctest: +ELLIPSIS + <...mnist_datamodule.MNISTDataModule object at ...> """ name = "mnist" @@ -52,6 +55,9 @@ def __init__( normalize: If true applies image normalize """ super().__init__(*args, **kwargs) + if platform.system() == "Windows": + # see: https://stackoverflow.com/a/59680818/4521646 + num_workers = 0 self.dims = (1, 28, 28) self.data_dir = data_dir diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index a341728554d31..894eeea619ba9 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -19,10 +19,18 @@ from torch.nn import functional as F import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule class LitClassifier(pl.LightningModule): + """ + >>> LitClassifier() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitClassifier( + (l1): Linear(...) + (l2): Linear(...) + ) + """ def __init__(self, hidden_dim=128, learning_rate=1e-3): super().__init__() self.save_hyperparameters() @@ -103,4 +111,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index dbea2013d1110..1351048711df4 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -20,12 +20,19 @@ # -------------------------------------------- # -------------------------------------------- import os + import torch from torch.utils.data import Dataset -from pytorch_lightning import Trainer, LightningModule + +from pl_examples import cli_lightning_logo +from pytorch_lightning import LightningModule, Trainer class RandomDataset(Dataset): + """ + >>> RandomDataset(size=10, length=20) # doctest: +ELLIPSIS + <...bug_report_model.RandomDataset object at ...> + """ def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) @@ -38,6 +45,12 @@ def __len__(self): class BoringModel(LightningModule): + """ + >>> BoringModel() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + BoringModel( + (layer): Linear(...) 
+ ) + """ def __init__(self): """ @@ -111,10 +124,9 @@ def configure_optimizers(self): # parser = ArgumentParser() # args = parser.parse_args(opt) -def run_test(): +def test_run(): class TestModel(BoringModel): - def on_train_epoch_start(self) -> None: print('override any method to prove your bug') @@ -137,4 +149,5 @@ def on_train_epoch_start(self) -> None: if __name__ == '__main__': - run_test() + cli_lightning_logo() + test_run() diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index 21f6644b09a5b..733fd8646142e 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -1,21 +1,39 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Computer vision example on Transfer Learning. This computer vision example illustrates how one could fine-tune a pre-trained network (by default, a ResNet50 is used) using pytorch-lightning. For the sake of this example, the 'cats and dogs dataset' (~60MB, see `DATA_URL` below) and the proposed network (denoted by `TransferLearningModel`, see below) is -trained for 15 epochs. The training consists in three stages. From epoch 0 to -4, the feature extractor (the pre-trained network) is frozen except maybe for -the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm -layers (if `train_bn = True`) and the parameters of the classifier are trained -as a single parameters group with lr = 1e-2. From epoch 5 to 9, the last two -layer groups of the pre-trained network are unfrozen and added to the -optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 for the -first parameter group in the optimizer). Eventually, from epoch 10, all the -remaining layer groups of the pre-trained network are unfrozen and added to -the optimizer as a third parameter group. From epoch 10, the parameters of the -pre-trained network are trained with lr = 1e-5 while those of the classifier -are trained with lr = 1e-4. +trained for 15 epochs. + +The training consists of three stages. + +From epoch 0 to 4, the feature extractor (the pre-trained network) is frozen except +maybe for the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm +layers (if `train_bn = True`) and the parameters of the classifier are trained as a +single parameters group with lr = 1e-2. + +From epoch 5 to 9, the last two layer groups of the pre-trained network are unfrozen +and added to the optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 +for the first parameter group in the optimizer). + +Eventually, from epoch 10, all the remaining layer groups of the pre-trained network +are unfrozen and added to the optimizer as a third parameter group. From epoch 10, +the parameters of the pre-trained network are trained with lr = 1e-5 while those of +the classifier is trained with lr = 1e-4. 
Note: See: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html @@ -25,7 +43,7 @@ from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional, Generator, Union +from typing import Generator, Optional, Union import torch import torch.nn.functional as F @@ -34,16 +52,16 @@ from torch.optim.lr_scheduler import MultiStepLR from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader -from torchvision import models -from torchvision import transforms +from torchvision import models, transforms from torchvision.datasets import ImageFolder from torchvision.datasets.utils import download_and_extract_archive import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import _logger as log BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) -DATA_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip' +DATA_URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip" # --- Utility functions --- @@ -60,8 +78,7 @@ def _make_trainable(module: Module) -> None: module.train() -def _recursive_freeze(module: Module, - train_bn: bool = True) -> None: +def _recursive_freeze(module: Module, train_bn: bool = True) -> None: """Freezes the layers of a given module. Args: @@ -82,9 +99,7 @@ def _recursive_freeze(module: Module, _recursive_freeze(module=child, train_bn=train_bn) -def freeze(module: Module, - n: Optional[int] = None, - train_bn: bool = True) -> None: +def freeze(module: Module, n: Optional[int] = None, train_bn: bool = True) -> None: """Freezes the layers up to index n (if n is not None). Args: @@ -103,8 +118,7 @@ def freeze(module: Module, _make_trainable(module=child) -def filter_params(module: Module, - train_bn: bool = True) -> Generator: +def filter_params(module: Module, train_bn: bool = True) -> Generator: """Yields the trainable parameters of a given module. Args: @@ -126,17 +140,18 @@ def filter_params(module: Module, yield param -def _unfreeze_and_add_param_group(module: Module, - optimizer: Optimizer, - lr: Optional[float] = None, - train_bn: bool = True): +def _unfreeze_and_add_param_group( + module: Module, optimizer: Optimizer, lr: Optional[float] = None, train_bn: bool = True +): """Unfreezes a module and adds its parameters to an optimizer.""" _make_trainable(module) - params_lr = optimizer.param_groups[0]['lr'] if lr is None else float(lr) + params_lr = optimizer.param_groups[0]["lr"] if lr is None else float(lr) optimizer.add_param_group( - {'params': filter_params(module=module, train_bn=train_bn), - 'lr': params_lr / 10., - }) + { + "params": filter_params(module=module, train_bn=train_bn), + "lr": params_lr / 10.0, + } + ) # --- Pytorch-lightning module --- @@ -145,19 +160,30 @@ def _unfreeze_and_add_param_group(module: Module, class TransferLearningModel(pl.LightningModule): """Transfer Learning with pre-trained ResNet50. - Args: - hparams: Model hyperparameters - dl_path: Path where the data will be downloaded + >>> with TemporaryDirectory(dir='.') as tmp_dir: + ... TransferLearningModel(tmp_dir) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + TransferLearningModel( + (feature_extractor): Sequential(...) + (fc): Sequential(...) 
+ ) """ - def __init__(self, - dl_path: Union[str, Path], - backbone: str = 'resnet50', - train_bn: bool = True, - milestones: tuple = (5, 10), - batch_size: int = 8, - lr: float = 1e-2, - lr_scheduler_gamma: float = 1e-1, - num_workers: int = 6, **kwargs) -> None: + + def __init__( + self, + dl_path: Union[str, Path], + backbone: str = "resnet50", + train_bn: bool = True, + milestones: tuple = (5, 10), + batch_size: int = 8, + lr: float = 1e-2, + lr_scheduler_gamma: float = 1e-1, + num_workers: int = 6, + **kwargs, + ) -> None: + """ + Args: + dl_path: Path where the data will be downloaded + """ super().__init__() self.dl_path = dl_path self.backbone = backbone @@ -171,6 +197,10 @@ def __init__(self, self.dl_path = dl_path self.__build_model() + self.train_acc = pl.metrics.Accuracy() + self.valid_acc = pl.metrics.Accuracy() + self.save_hyperparameters() + def __build_model(self): """Define model layers & loss.""" @@ -183,9 +213,7 @@ def __build_model(self): freeze(module=self.feature_extractor, train_bn=self.train_bn) # 2. Classifier: - _fc_layers = [torch.nn.Linear(2048, 256), - torch.nn.Linear(256, 32), - torch.nn.Linear(32, 1)] + _fc_layers = [torch.nn.Linear(2048, 256), torch.nn.Linear(256, 32), torch.nn.Linear(32, 1)] self.fc = torch.nn.Sequential(*_fc_layers) # 3. Loss: @@ -212,27 +240,24 @@ def train(self, mode=True): epoch = self.current_epoch if epoch < self.milestones[0] and mode: # feature extractor is frozen (except for BatchNorm layers) - freeze(module=self.feature_extractor, - train_bn=self.train_bn) + freeze(module=self.feature_extractor, train_bn=self.train_bn) elif self.milestones[0] <= epoch < self.milestones[1] and mode: # Unfreeze last two layers of the feature extractor - freeze(module=self.feature_extractor, - n=-2, - train_bn=self.train_bn) + freeze(module=self.feature_extractor, n=-2, train_bn=self.train_bn) def on_epoch_start(self): """Use `on_epoch_start` to unfreeze layers progressively.""" optimizer = self.trainer.optimizers[0] if self.current_epoch == self.milestones[0]: - _unfreeze_and_add_param_group(module=self.feature_extractor[-2:], - optimizer=optimizer, - train_bn=self.train_bn) + _unfreeze_and_add_param_group( + module=self.feature_extractor[-2:], optimizer=optimizer, train_bn=self.train_bn + ) elif self.current_epoch == self.milestones[1]: - _unfreeze_and_add_param_group(module=self.feature_extractor[:-2], - optimizer=optimizer, - train_bn=self.train_bn) + _unfreeze_and_add_param_group( + module=self.feature_extractor[:-2], optimizer=optimizer, train_bn=self.train_bn + ) def training_step(self, batch, batch_idx): @@ -240,32 +265,22 @@ def training_step(self, batch, batch_idx): x, y = batch y_logits = self.forward(x) y_true = y.view((-1, 1)).type_as(x) - y_bin = torch.ge(y_logits, 0) # 2. Compute loss & accuracy: train_loss = self.loss(y_logits, y_true) - num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + accuracy = self.train_acc(y_logits, y_true) # 3. 
Outputs: - tqdm_dict = {'train_loss': train_loss} - output = OrderedDict({'loss': train_loss, - 'num_correct': num_correct, - 'log': tqdm_dict, - 'progress_bar': tqdm_dict}) - - return output + tqdm_dict = {"train_loss": train_loss} + self.log_dict(tqdm_dict, prog_bar=True) + return {"loss": train_loss} def training_epoch_end(self, outputs): """Compute and log training loss and accuracy at the epoch level.""" - train_loss_mean = torch.stack([output['loss'] - for output in outputs]).mean() - train_acc_mean = torch.stack([output['num_correct'] - for output in outputs]).sum().float() - train_acc_mean /= (len(outputs) * self.batch_size) - return {'log': {'train_loss': train_loss_mean, - 'train_acc': train_acc_mean, - 'step': self.current_epoch}} + train_loss_mean = torch.stack([output["loss"] for output in outputs]).mean() + train_acc_mean = self.train_acc.compute() + self.log_dict({"train_loss": train_loss_mean, "train_acc": train_acc_mean, "step": self.current_epoch}) def validation_step(self, batch, batch_idx): @@ -273,142 +288,121 @@ def validation_step(self, batch, batch_idx): x, y = batch y_logits = self.forward(x) y_true = y.view((-1, 1)).type_as(x) - y_bin = torch.ge(y_logits, 0) # 2. Compute loss & accuracy: val_loss = self.loss(y_logits, y_true) - num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + accuracy = self.valid_acc(y_logits, y_true) - return {'val_loss': val_loss, - 'num_correct': num_correct} + return {"val_loss": val_loss} def validation_epoch_end(self, outputs): """Compute and log validation loss and accuracy at the epoch level.""" - val_loss_mean = torch.stack([output['val_loss'] - for output in outputs]).mean() - val_acc_mean = torch.stack([output['num_correct'] - for output in outputs]).sum().float() - val_acc_mean /= (len(outputs) * self.batch_size) - return {'log': {'val_loss': val_loss_mean, - 'val_acc': val_acc_mean, - 'step': self.current_epoch}} + val_loss_mean = torch.stack([output["val_loss"] for output in outputs]).mean() + train_acc_mean = self.valid_acc.compute() + log_dict = {"val_loss": val_loss_mean, "val_acc": train_acc_mean} + self.log_dict(log_dict, prog_bar=True) + self.log_dict({"step": self.current_epoch}) def configure_optimizers(self): - optimizer = optim.Adam(filter(lambda p: p.requires_grad, - self.parameters()), - lr=self.lr) + optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr) - scheduler = MultiStepLR(optimizer, - milestones=self.milestones, - gamma=self.lr_scheduler_gamma) + scheduler = MultiStepLR(optimizer, milestones=self.milestones, gamma=self.lr_scheduler_gamma) return [optimizer], [scheduler] def prepare_data(self): """Download images and prepare images datasets.""" - download_and_extract_archive(url=DATA_URL, - download_root=self.dl_path, - remove_finished=True) + download_and_extract_archive(url=DATA_URL, download_root=self.dl_path, remove_finished=True) def setup(self, stage: str): - data_path = Path(self.dl_path).joinpath('cats_and_dogs_filtered') + data_path = Path(self.dl_path).joinpath("cats_and_dogs_filtered") # 2. 
Load the data + preprocessing & data augmentation - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = ImageFolder(root=data_path.joinpath('train'), - transform=transforms.Compose([ - transforms.Resize((224, 224)), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - valid_dataset = ImageFolder(root=data_path.joinpath('validation'), - transform=transforms.Compose([ - transforms.Resize((224, 224)), - transforms.ToTensor(), - normalize, - ])) + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + train_dataset = ImageFolder( + root=data_path.joinpath("train"), + transform=transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + + valid_dataset = ImageFolder( + root=data_path.joinpath("validation"), + transform=transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.ToTensor(), + normalize, + ] + ), + ) self.train_dataset = train_dataset self.valid_dataset = valid_dataset - def __dataloader(self, train): + def __dataloader(self, train: bool): """Train/validation loaders.""" _dataset = self.train_dataset if train else self.valid_dataset - loader = DataLoader(dataset=_dataset, - batch_size=self.batch_size, - num_workers=self.num_workers, - shuffle=True if train else False) + loader = DataLoader(dataset=_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=train) return loader def train_dataloader(self): - log.info('Training data loaded.') + log.info("Training data loaded.") return self.__dataloader(train=True) def val_dataloader(self): - log.info('Validation data loaded.') + log.info("Validation data loaded.") return self.__dataloader(train=False) @staticmethod def add_model_specific_args(parent_parser): parser = argparse.ArgumentParser(parents=[parent_parser]) - parser.add_argument('--backbone', - default='resnet50', - type=str, - metavar='BK', - help='Name (as in ``torchvision.models``) of the feature extractor') - parser.add_argument('--epochs', - default=15, - type=int, - metavar='N', - help='total number of epochs', - dest='nb_epochs') - parser.add_argument('--batch-size', - default=8, - type=int, - metavar='B', - help='batch size', - dest='batch_size') - parser.add_argument('--gpus', - type=int, - default=1, - help='number of gpus to use') - parser.add_argument('--lr', - '--learning-rate', - default=1e-2, - type=float, - metavar='LR', - help='initial learning rate', - dest='lr') - parser.add_argument('--lr-scheduler-gamma', - default=1e-1, - type=float, - metavar='LRG', - help='Factor by which the learning rate is reduced at each milestone', - dest='lr_scheduler_gamma') - parser.add_argument('--num-workers', - default=6, - type=int, - metavar='W', - help='number of CPU workers', - dest='num_workers') - parser.add_argument('--train-bn', - default=True, - type=bool, - metavar='TB', - help='Whether the BatchNorm layers should be trainable', - dest='train_bn') - parser.add_argument('--milestones', - default=[5, 10], - type=list, - metavar='M', - help='List of two epochs milestones') + parser.add_argument( + "--backbone", + default="resnet50", + type=str, + metavar="BK", + help="Name (as in ``torchvision.models``) of the feature extractor", + ) + parser.add_argument( + "--epochs", default=15, type=int, metavar="N", help="total number of epochs", dest="nb_epochs" + ) + parser.add_argument("--batch-size", default=8, type=int, metavar="B", 
help="batch size", dest="batch_size") + parser.add_argument("--gpus", type=int, default=1, help="number of gpus to use") + parser.add_argument( + "--lr", "--learning-rate", default=1e-2, type=float, metavar="LR", help="initial learning rate", dest="lr" + ) + parser.add_argument( + "--lr-scheduler-gamma", + default=1e-1, + type=float, + metavar="LRG", + help="Factor by which the learning rate is reduced at each milestone", + dest="lr_scheduler_gamma", + ) + parser.add_argument( + "--num-workers", default=6, type=int, metavar="W", help="number of CPU workers", dest="num_workers" + ) + parser.add_argument( + "--train-bn", + default=True, + type=bool, + metavar="TB", + help="Whether the BatchNorm layers should be trainable", + dest="train_bn", + ) + parser.add_argument( + "--milestones", default=[5, 10], type=list, metavar="M", help="List of two epochs milestones" + ) return parser @@ -433,22 +427,26 @@ def main(args: argparse.Namespace) -> None: num_sanity_val_steps=0, gpus=args.gpus, min_epochs=args.nb_epochs, - max_epochs=args.nb_epochs) + max_epochs=args.nb_epochs, + ) trainer.fit(model) def get_args() -> argparse.Namespace: parent_parser = argparse.ArgumentParser(add_help=False) - parent_parser.add_argument('--root-data-path', - metavar='DIR', - type=str, - default=Path.cwd().as_posix(), - help='Root directory where to download the data', - dest='root_data_path') + parent_parser.add_argument( + "--root-data-path", + metavar="DIR", + type=str, + default=Path.cwd().as_posix(), + help="Root directory where to download the data", + dest="root_data_path", + ) parser = TransferLearningModel.add_model_specific_args(parent_parser) return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": + cli_lightning_logo() main(get_args()) diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py index 088b625e31d01..35b5563d2c1cc 100644 --- a/pl_examples/domain_templates/generative_adversarial_net.py +++ b/pl_examples/domain_templates/generative_adversarial_net.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ To run this template just do: python generative_adversarial_net.py @@ -18,12 +31,19 @@ from torch.utils.data import DataLoader from torchvision.datasets import MNIST -from pytorch_lightning.core import LightningModule, LightningDataModule +from pl_examples import cli_lightning_logo +from pytorch_lightning.core import LightningDataModule, LightningModule from pytorch_lightning.trainer import Trainer class Generator(nn.Module): - def __init__(self, latent_dim, img_shape): + """ + >>> Generator(img_shape=(1, 8, 8)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Generator( + (model): Sequential(...) 
+ ) + """ + def __init__(self, latent_dim: int = 100, img_shape: tuple = (1, 28, 28)): super().__init__() self.img_shape = img_shape @@ -50,6 +70,12 @@ def forward(self, z): class Discriminator(nn.Module): + """ + >>> Discriminator(img_shape=(1, 28, 28)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Discriminator( + (model): Sequential(...) + ) + """ def __init__(self, img_shape): super().__init__() @@ -69,6 +95,37 @@ def forward(self, img): class GAN(LightningModule): + """ + >>> GAN(img_shape=(1, 8, 8)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + GAN( + (generator): Generator( + (model): Sequential(...) + ) + (discriminator): Discriminator( + (model): Sequential(...) + ) + ) + """ + def __init__( + self, + img_shape: tuple = (1, 28, 28), + lr: float = 0.0002, + b1: float = 0.5, + b2: float = 0.999, + latent_dim: int = 100, + ): + super().__init__() + + self.save_hyperparameters() + + # networks + self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=img_shape) + self.discriminator = Discriminator(img_shape=img_shape) + + self.validation_z = torch.randn(8, self.hparams.latent_dim) + + self.example_input_array = torch.zeros(2, self.hparams.latent_dim) + @staticmethod def add_argparse_args(parent_parser: ArgumentParser): parser = ArgumentParser(parents=[parent_parser], add_help=False) @@ -82,20 +139,6 @@ def add_argparse_args(parent_parser: ArgumentParser): return parser - def __init__(self, hparams: Namespace): - super().__init__() - - self.hparams = hparams - - # networks - mnist_shape = (1, 28, 28) - self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=mnist_shape) - self.discriminator = Discriminator(img_shape=mnist_shape) - - self.validation_z = torch.randn(8, self.hparams.latent_dim) - - self.example_input_array = torch.zeros(2, self.hparams.latent_dim) - def forward(self, z): return self.generator(z) @@ -166,6 +209,10 @@ def on_epoch_end(self): class MNISTDataModule(LightningDataModule): + """ + >>> MNISTDataModule() # doctest: +ELLIPSIS + <...generative_adversarial_net.MNISTDataModule object at ...> + """ def __init__(self, batch_size: int = 64, data_path: str = os.getcwd(), num_workers: int = 4): super().__init__() self.batch_size = batch_size @@ -211,6 +258,7 @@ def main(args: Namespace) -> None: if __name__ == '__main__': + cli_lightning_logo() parser = ArgumentParser() # Add program level args, if any. diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py index b7116547d389b..cc36f3542a1c8 100644 --- a/pl_examples/domain_templates/imagenet.py +++ b/pl_examples/domain_templates/imagenet.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
""" This example is largely adapted from https://github.com/pytorch/examples/blob/master/imagenet/main.py @@ -32,10 +45,17 @@ import torchvision.transforms as transforms import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning.core import LightningModule class ImageNetLightningModel(LightningModule): + """ + >>> ImageNetLightningModel(data_path='missing') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + ImageNetLightningModel( + (model): ResNet(...) + ) + """ # pull out resnet names from torchvision models MODEL_NAMES = sorted( name for name in models.__dict__ @@ -44,14 +64,14 @@ class ImageNetLightningModel(LightningModule): def __init__( self, - arch: str, - pretrained: bool, - lr: float, - momentum: float, - weight_decay: int, data_path: str, - batch_size: int, - workers: int, + arch: str = 'resnet18', + pretrained: bool = False, + lr: float = 0.1, + momentum: float = 0.9, + weight_decay: float = 1e-4, + batch_size: int = 4, + workers: int = 2, **kwargs, ): super().__init__() @@ -246,4 +266,5 @@ def run_cli(): if __name__ == '__main__': + cli_lightning_logo() run_cli() diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 4b01f83e36639..c817f69ee205d 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Deep Reinforcement Learning: Deep Q-network (DQN) @@ -20,8 +33,8 @@ """ import argparse -from collections import OrderedDict, deque, namedtuple -from typing import Tuple, List +from collections import deque, namedtuple, OrderedDict +from typing import List, Tuple import gym import numpy as np @@ -33,19 +46,26 @@ from torch.utils.data.dataset import IterableDataset import pytorch_lightning as pl +from pl_examples import cli_lightning_logo class DQN(nn.Module): """ Simple MLP network - Args: - obs_size: observation/state size of the environment - n_actions: number of discrete actions available in the environment - hidden_size: size of hidden layers + >>> DQN(10, 5) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DQN( + (net): Sequential(...) 
+ ) """ def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128): + """ + Args: + obs_size: observation/state size of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ super(DQN, self).__init__() self.net = nn.Sequential( nn.Linear(obs_size, hidden_size), @@ -67,11 +87,15 @@ class ReplayBuffer: """ Replay Buffer for storing past experiences allowing the agent to learn from them - Args: - capacity: size of the buffer + >>> ReplayBuffer(5) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.ReplayBuffer object at ...> """ def __init__(self, capacity: int) -> None: + """ + Args: + capacity: size of the buffer + """ self.buffer = deque(maxlen=capacity) def __len__(self) -> int: @@ -99,12 +123,16 @@ class RLDataset(IterableDataset): Iterable Dataset containing the ExperienceBuffer which will be updated with new experiences during training - Args: - buffer: replay buffer - sample_size: number of experiences to sample at a time + >>> RLDataset(ReplayBuffer(5)) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.RLDataset object at ...> """ def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None: + """ + Args: + buffer: replay buffer + sample_size: number of experiences to sample at a time + """ self.buffer = buffer self.sample_size = sample_size @@ -118,12 +146,18 @@ class Agent: """ Base Agent class handling the interaction with the environment - Args: - env: training environment - replay_buffer: replay buffer storing experiences + >>> env = gym.make("CartPole-v0") + >>> buffer = ReplayBuffer(10) + >>> Agent(env, buffer) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.Agent object at ...> """ def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None: + """ + Args: + env: training environment + replay_buffer: replay buffer storing experiences + """ self.env = env self.replay_buffer = replay_buffer self.reset() @@ -190,20 +224,34 @@ def play_step(self, net: nn.Module, epsilon: float = 0.0, device: str = 'cpu') - class DQNLightning(pl.LightningModule): - """ Basic DQN Model """ - - def __init__(self, - replay_size, - warm_start_steps: int, - gamma: float, - eps_start: int, - eps_end: int, - eps_last_frame: int, - sync_rate, - lr: float, - episode_length, - batch_size, **kwargs) -> None: - super().__init__() + """ Basic DQN Model + + >>> DQNLightning(env="CartPole-v0") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DQNLightning( + (net): DQN( + (net): Sequential(...) + ) + (target_net): DQN( + (net): Sequential(...) 
+ ) + ) + """ + def __init__( + self, + env: str, + replay_size: int = 200, + warm_start_steps: int = 200, + gamma: float = 0.99, + eps_start: float = 1.0, + eps_end: float = 0.01, + eps_last_frame: int = 200, + sync_rate: int = 10, + lr: float = 1e-2, + episode_length: int = 50, + batch_size: int = 4, + **kwargs, + ) -> None: + super().__init__(**kwargs) self.replay_size = replay_size self.warm_start_steps = warm_start_steps self.gamma = gamma @@ -215,7 +263,7 @@ def __init__(self, self.episode_length = episode_length self.batch_size = batch_size - self.env = gym.make(self.env) + self.env = gym.make(env) obs_size = self.env.observation_space.shape[0] n_actions = self.env.action_space.n @@ -288,8 +336,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], nb_batch) -> O Training loss and log metrics """ device = self.get_device(batch) - epsilon = max(self.eps_end, self.eps_start - - self.global_step + 1 / self.eps_last_frame) + epsilon = max(self.eps_end, self.eps_start - self.global_step + 1 / self.eps_last_frame) # step through environment with agent reward, done = self.agent.play_step(self.net, epsilon, device) @@ -335,6 +382,26 @@ def get_device(self, batch) -> str: """Retrieve device currently being used by minibatch""" return batch[0].device.index if self.on_gpu else 'cpu' + @staticmethod + def add_model_specific_args(parent_parser): # pragma: no-cover + parser = argparse.ArgumentParser(parents=[parent_parser]) + parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") + parser.add_argument("--lr", type=float, default=1e-2, help="learning rate") + parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag") + parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + parser.add_argument("--sync_rate", type=int, default=10, + help="how many frames do we update the target network") + parser.add_argument("--replay_size", type=int, default=1000, + help="capacity of the replay buffer") + parser.add_argument("--warm_start_steps", type=int, default=1000, + help="how many samples do we use to fill our buffer at the start of training") + parser.add_argument("--eps_last_frame", type=int, default=1000, + help="what frame should epsilon stop decaying") + parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") + parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon") + parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode") + return parser + def main(args) -> None: model = DQNLightning(**vars(args)) @@ -349,30 +416,12 @@ def main(args) -> None: if __name__ == '__main__': + cli_lightning_logo() torch.manual_seed(0) np.random.seed(0) - parser = argparse.ArgumentParser() - parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") - parser.add_argument("--lr", type=float, default=1e-2, help="learning rate") - parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag") - parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") - parser.add_argument("--sync_rate", type=int, default=10, - help="how many frames do we update the target network") - parser.add_argument("--replay_size", type=int, default=1000, - help="capacity of the replay buffer") - parser.add_argument("--warm_start_size", type=int, default=1000, - help="how many samples do we use to fill our buffer at the start of training") - 
parser.add_argument("--eps_last_frame", type=int, default=1000, - help="what frame should epsilon stop decaying") - parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") - parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon") - parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode") - parser.add_argument("--max_episode_reward", type=int, default=200, - help="max episode reward in the environment") - parser.add_argument("--warm_start_steps", type=int, default=1000, - help="max episode reward in the environment") - + parser = argparse.ArgumentParser(add_help=False) + parser = DQNLightning.add_model_specific_args(parser) args = parser.parse_args() main(args) diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 4ca1ebc2aec76..8ffd539b80aaf 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from argparse import ArgumentParser, Namespace @@ -10,6 +24,7 @@ from torch.utils.data import DataLoader, Dataset import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pl_examples.domain_templates.unet import UNet from pytorch_lightning.loggers import WandbLogger @@ -17,6 +32,19 @@ DEFAULT_VALID_LABELS = (7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33) +def _create_synth_kitti_dataset(path_dir: str, image_dims: tuple = (1024, 512)): + """Create synthetic dataset with random images, just to simulate that the dataset have been already downloaded.""" + path_dir_images = os.path.join(path_dir, KITTI.IMAGE_PATH) + path_dir_masks = os.path.join(path_dir, KITTI.MASK_PATH) + for p_dir in (path_dir_images, path_dir_masks): + os.makedirs(p_dir, exist_ok=True) + for i in range(3): + path_img = os.path.join(path_dir_images, f'dummy_kitti_{i}.png') + Image.new('RGB', image_dims).save(path_img) + path_mask = os.path.join(path_dir_masks, f'dummy_kitti_{i}.png') + Image.new('L', image_dims).save(path_mask) + + class KITTI(Dataset): """ Class for KITTI Semantic Segmentation Benchmark dataset @@ -38,6 +66,12 @@ class KITTI(Dataset): In the `get_item` function, images and masks are resized to the given `img_size`, masks are encoded using `encode_segmap`, and given `transform` (if any) are applied to the image only (mask does not usually require transforms, but they can be implemented in a similar way). 
+ + >>> from pl_examples import DATASETS_PATH + >>> dataset_path = os.path.join(DATASETS_PATH, "Kitti") + >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512)) + >>> KITTI(dataset_path, 'train') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + <...semantic_segmentation.KITTI object at ...> """ IMAGE_PATH = os.path.join('training', 'image_2') MASK_PATH = os.path.join('training', 'semantic') @@ -126,16 +160,35 @@ class SegModel(pl.LightningModule): It uses the FCN ResNet50 model as an example. Adam optimizer is used along with Cosine Annealing learning rate scheduler. - """ - def __init__(self, - data_path: str, - batch_size: int, - lr: float, - num_layers: int, - features_start: int, - bilinear: bool, **kwargs): - super().__init__() + >>> from pl_examples import DATASETS_PATH + >>> dataset_path = os.path.join(DATASETS_PATH, "Kitti") + >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512)) + >>> SegModel(dataset_path) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + SegModel( + (net): UNet( + (layers): ModuleList( + (0): DoubleConv(...) + (1): Down(...) + (2): Down(...) + (3): Up(...) + (4): Up(...) + (5): Conv2d(64, 19, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + """ + def __init__( + self, + data_path: str, + batch_size: int = 4, + lr: float = 1e-3, + num_layers: int = 3, + features_start: int = 64, + bilinear: bool = False, + **kwargs, + ): + super().__init__(**kwargs) self.data_path = data_path self.batch_size = batch_size self.lr = lr @@ -189,6 +242,18 @@ def train_dataloader(self): def val_dataloader(self): return DataLoader(self.validset, batch_size=self.batch_size, shuffle=False) + @staticmethod + def add_model_specific_args(parent_parser): # pragma: no-cover + parser = ArgumentParser(parents=[parent_parser]) + parser.add_argument("--data_path", type=str, help="path where dataset is stored") + parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") + parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate") + parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net") + parser.add_argument("--features_start", type=float, default=64, help="number of features in first layer") + parser.add_argument("--bilinear", action='store_true', default=False, + help="whether to use bilinear interpolation or transposed") + return parser + def main(hparams: Namespace): # ------------------------ @@ -209,14 +274,7 @@ def main(hparams: Namespace): # ------------------------ # 3 INIT TRAINER # ------------------------ - trainer = pl.Trainer( - gpus=hparams.gpus, - logger=logger, - max_epochs=hparams.epochs, - accumulate_grad_batches=hparams.grad_batches, - accelerator=hparams.accelerator, - precision=16 if hparams.use_amp else 32, - ) + trainer = pl.Trainer.from_argparse_args(hparams) # ------------------------ # 5 START TRAINING @@ -225,22 +283,9 @@ def main(hparams: Namespace): if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--data_path", type=str, help="path where dataset is stored") - parser.add_argument("--gpus", type=int, default=-1, help="number of available GPUs") - parser.add_argument('--distributed-backend', type=str, default='dp', choices=('dp', 'ddp', 'ddp2'), - help='supports three options dp, ddp, ddp2') - parser.add_argument('--use_amp', action='store_true', help='if true uses 16 bit precision') - parser.add_argument("--batch_size", type=int, default=4, help="size of the batches") - parser.add_argument("--lr", type=float, default=0.001, 
help="adam: learning rate") - parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net") - parser.add_argument("--features_start", type=float, default=64, help="number of features in first layer") - parser.add_argument("--bilinear", action='store_true', default=False, - help="whether to use bilinear interpolation or transposed") - parser.add_argument("--grad_batches", type=int, default=1, help="number of batches to accumulate") - parser.add_argument("--epochs", type=int, default=20, help="number of epochs to train") - parser.add_argument("--log_wandb", action='store_true', help="log training on Weights & Biases") - + cli_lightning_logo() + parser = ArgumentParser(add_help=False) + parser = SegModel.add_model_specific_args(parser) hparams = parser.parse_args() main(hparams) diff --git a/pl_examples/domain_templates/unet.py b/pl_examples/domain_templates/unet.py index 6117447e5ed33..2314e19ddbfc9 100644 --- a/pl_examples/domain_templates/unet.py +++ b/pl_examples/domain_templates/unet.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.nn as nn import torch.nn.functional as F @@ -8,20 +22,33 @@ class UNet(nn.Module): Architecture based on U-Net: Convolutional Networks for Biomedical Image Segmentation Link - https://arxiv.org/abs/1505.04597 - Parameters: - num_classes: Number of output classes required (default 19 for KITTI dataset) - num_layers: Number of layers in each side of U-net - features_start: Number of features in first layer - bilinear: Whether to use bilinear interpolation or transposed - convolutions for upsampling. + >>> UNet(num_classes=2, num_layers=3) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + UNet( + (layers): ModuleList( + (0): DoubleConv(...) + (1): Down(...) + (2): Down(...) + (3): Up(...) + (4): Up(...) + (5): Conv2d(64, 2, kernel_size=(1, 1), stride=(1, 1)) + ) + ) """ def __init__( - self, num_classes: int = 19, + self, + num_classes: int = 19, num_layers: int = 5, features_start: int = 64, - bilinear: bool = False + bilinear: bool = False, ): + """ + Args: + num_classes: Number of output classes required (default 19 for KITTI dataset) + num_layers: Number of layers in each side of U-net + features_start: Number of features in first layer + bilinear: Whether to use bilinear interpolation or transposed convolutions for upsampling. + """ super().__init__() self.num_layers = num_layers @@ -55,6 +82,11 @@ class DoubleConv(nn.Module): """ Double Convolution and BN and ReLU (3x3 conv -> BN -> ReLU) ** 2 + + >>> DoubleConv(4, 4) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DoubleConv( + (net): Sequential(...) 
+ ) """ def __init__(self, in_ch: int, out_ch: int): @@ -75,6 +107,16 @@ def forward(self, x): class Down(nn.Module): """ Combination of MaxPool2d and DoubleConv in series + + >>> Down(4, 8) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Down( + (net): Sequential( + (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) + (1): DoubleConv( + (net): Sequential(...) + ) + ) + ) """ def __init__(self, in_ch: int, out_ch: int): @@ -93,6 +135,14 @@ class Up(nn.Module): Upsampling (by either bilinear interpolation or transpose convolutions) followed by concatenation of feature map from contracting path, followed by double 3x3 convolution. + + >>> Up(8, 4) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Up( + (upsample): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2)) + (conv): DoubleConv( + (net): Sequential(...) + ) + ) """ def __init__(self, in_ch: int, out_ch: int, bilinear: bool = False): diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md b/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md deleted file mode 100644 index 5c9a42d5a8942..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# [Pytorch Geometric](https://github.com/rusty1s/pytorch_geometric) examples with Lighting - -### Introduction - -PyTorch Geometric (PyG) is a geometric deep learning extension library for PyTorch. It relies on lower level libraries such as - -* PyTorch Cluster: A package consists of a small extension library of highly optimized graph cluster algorithms in Pytorch -* PyTorch Sparse: A package consists of a small extension library of optimized sparse matrix operations with autograd support in Pytorch -* PyTorch Scatter: A package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for the use in PyTorch - -## Setup - -``` -pyenv install 3.7.8 -pyenv local 3.7.8 -python -m venv -source .venv/bin/activate -poetry install -``` - -Run example - -``` -python cora_dna.py -``` - -## Current example lists - -| `DATASET` | `MODEL` | `TASK` | DATASET DESCRIPTION | MODEL DESCRIPTION | | -| :---: | :---: | :---: | :---: | :---: | :---: | -| Cora | DNA | Node Classification | The citation network datasets "Cora", "CiteSeer" and "PubMed" from the "Revisiting Semi-Supervised Learning with Graph Embeddings" | The dynamic neighborhood aggregation operator from the "Just Jump: Towards Dynamic Neighborhood Aggregation in Graph Neural Networks" - - -## DATASET SIZES - -``` - 16M ./cora -``` diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py deleted file mode 100644 index e4e040ff7072e..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py +++ /dev/null @@ -1,375 +0,0 @@ -"""Graph Convolution Example using Pytorch Geometric - -This example illustrates how one could train a graph convolution model with DNA Conv -on Cora Dataset using pytorch-lightning. This example will also demonstrate how this -model can be easily torch-scripted, thanks to Pytorch Geometric. 
-""" -# python imports -import os -import os.path as osp -import sys -from functools import partial -from collections import namedtuple -from argparse import ArgumentParser -from typing import List, Optional, NamedTuple - -# thrid parties libraries -import numpy as np -from torch import nn -import torch -from torch import Tensor -from torch.optim import Adam -import torch.nn.functional as F - -# Lightning imports -from pytorch_lightning import ( - Trainer, - LightningDataModule, - LightningModule -) -from pytorch_lightning.metrics import Accuracy - -try: - # Pytorch Geometric imports - from torch_geometric.nn import DNAConv, MessagePassing - from torch_geometric.data import DataLoader - from torch_geometric.datasets import Planetoid - import torch_geometric.transforms as T - from torch_geometric.data import NeighborSampler - from lightning import lightning_logo, nice_print -except Exception: - HAS_PYTORCH_GEOMETRIC = False -else: - HAS_PYTORCH_GEOMETRIC = True - - -# use to make model jittable -OptTensor = Optional[Tensor] -ListTensor = List[Tensor] - - -class TensorBatch(NamedTuple): - x: Tensor - edge_index: ListTensor - edge_attr: OptTensor - batch: OptTensor - -################################### -# LightningDataModule # -################################### - - -class CoraDataset(LightningDataModule): - - r"""The citation network datasets "Cora", "CiteSeer" and "PubMed" from the - `"Revisiting Semi-Supervised Learning with Graph Embeddings" - `_ paper. - Nodes represent documents and edges represent citation links. - Training, validation and test splits are given by binary masks. - c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/datasets/planetoid.py - """ - - NAME = "cora" - - def __init__(self, - num_workers: int = 1, - batch_size: int = 8, - drop_last: bool = True, - pin_memory: bool = True, - num_layers: int = None): - super().__init__() - - assert num_layers is not None - - self._num_workers = num_workers - self._batch_size = batch_size - self._drop_last = drop_last - self._pin_memory = pin_memory - self._num_layers = num_layers - - self._transform = T.NormalizeFeatures() - - @property - def num_features(self): - return 1433 - - @property - def num_classes(self): - return 7 - - @property - def hyper_parameters(self): - # used to inform the model the dataset specifications - return {"num_features": self.num_features, "num_classes": self.num_classes} - - def prepare_data(self): - path = osp.join( - osp.dirname(osp.realpath(__file__)), "..", "..", "data", self.NAME - ) - self.dataset = Planetoid(path, self.NAME, transform=self._transform) - self.data = self.dataset[0] - - def create_neighbor_sampler(self, batch_size=2, stage=None): - # https://github.com/rusty1s/pytorch_geometric/tree/master/torch_geometric/data/sampler.py#L18 - return NeighborSampler( - self.data.edge_index, - # the nodes that should be considered for sampling. 
- node_idx=getattr(self.data, f"{stage}_mask"), - # -1 indicates all neighbors will be selected - sizes=[self._num_layers, -1], - num_workers=self._num_workers, - drop_last=self._drop_last, - pin_memory=self._pin_memory, - ) - - def train_dataloader(self): - return self.create_neighbor_sampler(stage="train") - - def validation_dataloader(self): - return self.create_neighbor_sampler(stage="val") - - def test_dataloader(self): - return self.create_neighbor_sampler(stage="test") - - def gather_data_and_convert_to_namedtuple(self, batch, batch_nb): - """ - This function will select features using node_idx - and create a NamedTuple Object. - """ - - usual_keys = ["x", "edge_index", "edge_attr", "batch"] - Batch: TensorBatch = namedtuple("Batch", usual_keys) - return ( - Batch( - self.data.x[batch[1]], - [e.edge_index for e in batch[2]], - None, - None, - ), - self.data.y[batch[1]], - ) - - @staticmethod - def add_argparse_args(parser): - parser.add_argument("--num_workers", type=int, default=1) - parser.add_argument("--batch_size", type=int, default=2) - parser.add_argument("--drop_last", default=True) - parser.add_argument("--pin_memory", default=True) - return parser - - -############################### -# LightningModule # -############################### - - -class DNAConvNet(LightningModule): - - r"""The dynamic neighborhood aggregation operator from the `"Just Jump: - Towards Dynamic Neighborhood Aggregation in Graph Neural Networks" - `_ paper - c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172 - """ - - def __init__(self, - num_layers: int = 2, - hidden_channels: int = 128, - heads: int = 8, - groups: int = 16, - dropout: float = 0.8, - cached: bool = False, - num_features: int = None, - num_classes: int = None, - ): - super().__init__() - - assert num_features is not None - assert num_classes is not None - - # utils from Lightning to save __init__ arguments - self.save_hyperparameters() - hparams = self.hparams - - # Instantiate metrics - self.val_acc = Accuracy(hparams["num_classes"]) - self.test_acc = Accuracy(hparams["num_classes"]) - - # Define DNA graph convolution model - self.hidden_channels = hparams["hidden_channels"] - self.lin1 = nn.Linear(hparams["num_features"], hparams["hidden_channels"]) - - # Create ModuleList to hold all convolutions - self.convs = nn.ModuleList() - - # Iterate through the number of layers - for _ in range(hparams["num_layers"]): - - # Create a DNA Convolution - This graph convolution relies on MultiHead Attention mechanism - # to route information similar to Transformers. - # https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172 - self.convs.append( - DNAConv( - hparams["hidden_channels"], - hparams["heads"], - hparams["groups"], - dropout=hparams["dropout"], - cached=False, - ) - ) - # classification MLP - self.lin2 = nn.Linear(hparams["hidden_channels"], hparams["num_classes"], bias=False) - - def forward(self, batch: TensorBatch): - # batch needs to be typed for making this model jittable. 
- x = batch.x - x = F.relu(self.lin1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x_all = x.view(-1, 1, self.hidden_channels) - - # iterate over all convolutions - for idx, conv in enumerate(self.convs): - # perform convolution using previously concatenated embedding - # through edge_index - x = F.relu(conv(x_all, batch.edge_index[idx])) - x = x.view(-1, 1, self.hidden_channels) - - # concatenate with previously computed embedding - x_all = torch.cat([x_all, x], dim=1) - - # extra latest layer embedding - x = x_all[:, -1] - - x = F.dropout(x, p=0.5, training=self.training) - - # return logits per nodes - return F.log_softmax(self.lin2(x), -1) - - def step(self, batch, batch_nb): - typed_batch, targets = self.gather_data_and_convert_to_namedtuple(batch, batch_nb) - logits = self(typed_batch) - return logits, targets - - def training_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - train_loss = F.nll_loss(logits, targets) - self.log("train_loss", train_loss, on_step=True, on_epoch=True, prog_bar=True) - return train_loss - - def validation_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - val_loss = F.nll_loss(logits, targets) - self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True) - self.log("val_acc", self.val_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True) - - def test_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - test_loss = F.nll_loss(logits, targets) - self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True) - self.log("test_acc", self.test_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True) - - # Use for jittable demonstration. - - def _convert_to_jittable(self, module): - for key, m in module._modules.items(): - if isinstance(m, MessagePassing) and m.jittable is not None: - # Pytorch Geometric MessagePassing implements a `.jittable` function - # which converts the current module into its jittable version. 
- module._modules[key] = m.jittable() - else: - self._convert_to_jittable(m) - return module - - def jittable(self): - for key, m in self._modules.items(): - self._modules[key] = self._convert_to_jittable(m) - - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) - - @staticmethod - def add_argparse_args(parser): - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--hidden_channels", type=int, default=128) - parser.add_argument("--heads", type=int, default=8) - parser.add_argument("--groups", type=int, default=16) - parser.add_argument("--dropout", type=float, default=0.8) - parser.add_argument("--cached", type=int, default=0) - parser.add_argument("--jit", default=True) - return parser - -################################# -# Instantiate Functions # -################################# - - -def instantiate_datamodule(args): - datamodule = CoraDataset( - num_workers=args.num_workers, - batch_size=args.batch_size, - drop_last=args.drop_last, - pin_memory=args.pin_memory, - num_layers=args.num_layers, - ) - return datamodule - - -def instantiate_model(args, datamodule): - model = DNAConvNet( - num_layers=args.num_layers, - hidden_channels=args.hidden_channels, - heads=args.heads, - groups=args.groups, - dropout=args.dropout, - # provide dataset specific arguments - **datamodule.hyper_parameters, - ) - if args.jit: - model.jittable() - - # Attached datamodule function to model - model.gather_data_and_convert_to_namedtuple = datamodule.gather_data_and_convert_to_namedtuple - return model - - -def get_single_batch(datamodule): - for batch in datamodule.test_dataloader(): - return datamodule.gather_data_and_convert_to_namedtuple(batch, 0) - -####################### -# Trainer Run # -####################### - - -def run(args): - - nice_print("You are about to train a TorchScripted Pytorch Geometric Lightning model !") - nice_print(lightning_logo) - - datamodule: LightningDataModule = instantiate_datamodule(args) - model: LightningModule = instantiate_model(args, datamodule) - trainer = Trainer.from_argparse_args(args) - trainer.fit(model, datamodule) - trainer.test() - - batch = get_single_batch(datamodule) - model.to_torchscript(file_path="model_trace.pt", - method='script', - example_inputs=batch) - - nice_print("Congratulations !") - nice_print("You trained your first TorchScripted Pytorch Geometric Lightning model !", last=True) - - -if __name__ == "__main__": - if not HAS_PYTORCH_GEOMETRIC: - print("Skip training. Pytorch Geometric isn't installed. 
Please, check README.md !") - - else: - parser = ArgumentParser(description="Pytorch Geometric Example") - parser = Trainer.add_argparse_args(parser) - parser = CoraDataset.add_argparse_args(parser) - parser = DNAConvNet.add_argparse_args(parser) - - cmd_line = '--max_epochs 1'.split(' ') - - run(parser.parse_args(cmd_line)) diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py deleted file mode 100644 index 2c765d1449c57..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py +++ /dev/null @@ -1,31 +0,0 @@ -def nice_print(msg, last=False): - print() - print("\033[0;35m" + msg + "\033[0m") - if last: - print() - - -lightning_logo = """ - #### - ########### - #################### - ############################ - ##################################### -############################################## -######################### ################### -####################### ################### -#################### #################### -################## ##################### -################ ###################### -##################### ################# -###################### ################### -##################### ##################### -#################### ####################### -################### ######################### -############################################## - ##################################### - ############################ - #################### - ########## - #### -""" diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml b/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml deleted file mode 100644 index 99f516323e976..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml +++ /dev/null @@ -1,25 +0,0 @@ -[tool.poetry] -name = "lightning-geometric" -version = "0.1.0" -description = "TorchScripted Pytorch Geometric Examples with Pytorch Lightning" -authors = ["Thomas Chaton "] - -[tool.poetry.dependencies] -python = "3.7.8" -torch = "^1.6.0" -torch-cluster = "^1.5.7" -torch-sparse = "^0.6.7" -torch-scatter = "^2.0.5" -torch-geometric = "^1.6.1" -pytorch-lightning = "^ 1.0.5" -openmesh = "^1.1.4" -torch-spline-conv = "^1.2.0" -tqdm = "^4.50.0" -pytest = "^6.1.0" - -[tool.poetry.dev-dependencies] -black = {version = "^20.8b1", allow-prereleases = true} - -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index da21384190163..91145c5bd0d0b 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import importlib import platform from unittest import mock diff --git a/pyproject.toml b/pyproject.toml index 760421a56ece8..d7d07b1526390 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,14 +16,13 @@ exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|buil [tool.isort] known_first_party = [ - "bencharmks", + "benchmarks", "docs", "pl_examples", "pytorch_lightning", "tests", ] skip_glob = [ - "pl_examples/*", "pytorch_lightning/accelerators/*", "pytorch_lightning/callbacks/*", "pytorch_lightning/cluster_environments/*", @@ -52,3 +51,5 @@ skip_glob = [ ] profile = "black" line_length = 120 +force_sort_within_sections = "False" +order_by_type = "False" diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 408d95a72dc47..8b8e3328375bb 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,10 +1,15 @@ """Root package info.""" -__version__ = '1.1.1rc0' +import logging as python_logging +import os +import time + +_this_year = time.strftime("%Y") +__version__ = '1.1.5' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' -__copyright__ = 'Copyright (c) 2018-2020, %s.' % __author__ +__copyright__ = f'Copyright (c) 2018-{_this_year}, {__author__}.' __homepage__ = 'https://github.com/PyTorchLightning/pytorch-lightning' # this has to be simple string, see: https://github.com/pypa/twine/issues/522 __docs__ = ( @@ -33,9 +38,6 @@ - https://pytorch-lightning.readthedocs.io/en/stable """ -import logging as python_logging -import os - _logger = python_logging.getLogger("lightning") _logger.addHandler(python_logging.StreamHandler()) _logger.setLevel(python_logging.INFO) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 77f30219ba8c0..8bb335f2e7847 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -52,6 +52,10 @@ def __init__(self, def setup(self, model): pass + def train(self): + self.trainer.setup_trainer(self.trainer.model) + return self.train_or_test() + def teardown(self): # Ensure if necessary all processes are finished self.barrier() @@ -66,6 +70,7 @@ def train_or_test(self): if self.trainer.testing: results = self.trainer.run_test() else: + self.trainer.train_loop.setup_training() results = self.trainer.train() return results diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4d899da2b0ec2..c911225d0b29f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -342,8 +342,8 @@ def set_distributed_mode(self): # throw error to force user ddp or ddp2 choice if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2 or self.trainer.use_ddp): raise MisconfigurationException( - 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' - 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' + 'DataParallel does not support num_nodes > 1. 
' + 'To avoid this exception, set `accelerator="ddp"` or `accelerator="ddp2"`' ) rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}') diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index 25302cabbc70f..997a3568daf2d 100644 --- a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch @@ -48,20 +48,8 @@ def setup(self, model): # allow for lr schedulers as well self.setup_optimizers(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - def _step(self, model_step: Callable, args): if self.trainer.amp_backend == AMPType.NATIVE: with torch.cuda.amp.autocast(): diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 2e864029f8767..373406589d855 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available if HYDRA_AVAILABLE: @@ -186,22 +186,16 @@ def ddp_train(self, process_idx, mp_queue, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -213,6 +207,7 @@ def ddp_train(self, process_idx, mp_queue, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index da9eb2d3ea937..0fde9da158c94 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License import os +from os.path import abspath import subprocess import sys -from os.path import abspath from time import sleep from typing import Any, List, Optional, Union @@ -30,7 +30,7 @@ from 
pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, find_free_network_port, @@ -285,23 +285,17 @@ def ddp_train(self, process_idx, model): # allow for lr schedulers as well self.setup_optimizers(model) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine self.barrier('ddp_setup') - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -314,6 +308,7 @@ def ddp_train(self, process_idx, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index a0545a4604aec..4694a31438ca6 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -42,9 +42,12 @@ def __init__(self, super().__init__(trainer, cluster_environment, ddp_plugin) self.nickname = 'ddp_cpu' - def model_to_device(self, model, process_idx): + def model_to_device(self, model): model.cpu() def get_device_ids(self): device_ids = None return device_ids + + def init_device(self, process_idx): + pass diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index 91a6dee484f30..f9ccaa200bbf4 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, find_free_network_port, @@ -146,22 +146,16 @@ def ddp_train(self, process_idx, mp_queue, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # DDP spawn already spawned off each process... 
no need to do anything device_ids = self.get_device_ids() # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -241,6 +235,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index ec4c087998614..bdc4631b5d017 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -15,8 +15,8 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.distributed as dist +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available if HYDRA_AVAILABLE: @@ -126,6 +126,7 @@ def ddp_train(self, process_idx, model): """ # determine which process we are and world size self.set_world_ranks(process_idx) + self.init_device(process_idx) # toggle prog bar if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: @@ -176,22 +177,16 @@ def ddp_train(self, process_idx, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -204,6 +199,7 @@ def ddp_train(self, process_idx, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index a49e17fc0b31d..eb4ff24e39dd4 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -27,7 +27,7 @@ from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from 
pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.distributed import ( @@ -161,22 +161,16 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -273,6 +267,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 834a920b505d9..03c9ebb442fb2 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -65,8 +65,6 @@ def setup(self, model): if self.trainer.amp_backend: model = self.__init_half_precision(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def __init_torch_data_parallel(self, model): @@ -103,16 +101,6 @@ def __init_nvidia_apex(self, model): return model - def train(self): - model = self.trainer.model - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - return results - def teardown(self): # replace the original fwd function self.trainer.model.forward = self.model_autocast_original_forward @@ -156,30 +144,6 @@ def test_step_end(self, output): output = output.mean() return output - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - """ - Reinitialize optimizer.step properties added by schedulers - """ - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - is_regular_scheduler = optim.lr_scheduler._LRScheduler - is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau - if is_regular_scheduler or is_lr_reduce_on_plateau: - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - def get_reference_model(self, model) -> LightningModule: if isinstance(model, LightningDataParallel): return model.module diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index 1310777e0d890..d65b19bbd9bb1 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -54,20 +54,8 @@ def setup(self, model): # 16-bit model = self.trainer.precision_connector.connect(model) - 
self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - def _step(self, model_step: Callable, args): args[0] = self.to_device(args[0]) diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 5895025673b9a..6e11a13064513 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import ExitStack -from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch from torch.optim.lr_scheduler import _LRScheduler @@ -20,7 +20,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.utilities import HOROVOD_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_only if HOROVOD_AVAILABLE: @@ -91,8 +91,6 @@ def _filter_named_parameters(model, optimizer): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.trainer.global_rank = hvd.rank() @@ -106,8 +104,7 @@ def train(self): # Synchronization will be performed explicitly following backward() stack.enter_context(optimizer.skip_synchronize()) - # set up training routine - self.trainer.train_loop.setup_training(self.trainer.model) + self.trainer.setup_trainer(self.trainer.model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 9d1eec5594d82..286004bc0976e 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -26,11 +26,11 @@ from pytorch_lightning.core import LightningModule from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import ( - TPU_AVAILABLE, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, + TPU_AVAILABLE, ) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -134,8 +134,7 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # setup TPU training self.__setup_tpu_training(model, trainer) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() @@ -230,8 +229,6 @@ def __setup_tpu_training(self, model: LightningModule, trainer): f' global rank: {trainer.tpu_global_core_rank}' f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}') - self.trainer.convert_to_lightning_optimizers() - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # do backward pass if self.trainer.train_loop.automatic_optimization: diff --git a/pytorch_lightning/callbacks/early_stopping.py 
b/pytorch_lightning/callbacks/early_stopping.py index 88f1881643c9a..3e15d8462350c 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -19,6 +19,7 @@ Monitor a metric and stop training when it stops improving. """ +import numbers import os import numpy as np @@ -26,6 +27,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, TPU_AVAILABLE @@ -164,10 +166,10 @@ def on_validation_end(self, trainer, pl_module): self._run_early_stopping_check(trainer, pl_module) def on_validation_epoch_end(self, trainer, pl_module): - if trainer.running_sanity_check: + if trainer.fast_dev_run or trainer.running_sanity_check: return - if self._validate_condition_metric(trainer.logger_connector.callback_metrics): + if self._validate_condition_metric(trainer.callback_metrics): # turn off early stopping in on_train_epoch_end self.based_on_eval_results = True @@ -176,24 +178,19 @@ def on_train_epoch_end(self, trainer, pl_module, outputs): if self.based_on_eval_results: return - # early stopping can also work in the train loop when there is no val loop - should_check_early_stop = False - - # fallback to monitor key in result dict - if trainer.logger_connector.callback_metrics.get(self.monitor, None) is not None: - should_check_early_stop = True - - if should_check_early_stop: - self._run_early_stopping_check(trainer, pl_module) + self._run_early_stopping_check(trainer, pl_module) def _run_early_stopping_check(self, trainer, pl_module): """ Checks whether the early stopping condition is met and if so tells the trainer to stop the training. """ - logs = trainer.logger_connector.callback_metrics + logs = trainer.callback_metrics - if not self._validate_condition_metric(logs): + if ( + trainer.fast_dev_run # disable early_stopping with fast_dev_run + or not self._validate_condition_metric(logs) # short circuit if metric not present + ): return # short circuit if metric not present current = logs.get(self.monitor) @@ -201,8 +198,11 @@ def _run_early_stopping_check(self, trainer, pl_module): # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) - if not isinstance(current, torch.Tensor): - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if trainer.use_tpu and TPU_AVAILABLE: current = current.cpu() diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index b083511392bb3..1403d0bdf2e31 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -24,7 +24,7 @@ import shutil import subprocess import time -from typing import List, Tuple, Dict +from typing import Dict, List, Tuple from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_only @@ -213,5 +213,4 @@ def _should_log(trainer) -> bool: or trainer.should_stop ) - should_log = should_log and not trainer.fast_dev_run return should_log diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 9799e0d3298d3..b3c3f36577a67 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ 
b/pytorch_lightning/callbacks/lr_monitor.py @@ -33,11 +33,11 @@ class LearningRateMonitor(Callback): Automatically monitor and logs learning rate for learning rate schedulers during training. Args: - logging_interval: set to `epoch` or `step` to log `lr` of all optimizers - at the same interval, set to `None` to log at individual interval - according to the `interval` key of each scheduler. Defaults to ``None``. + logging_interval: set to ``'epoch'`` or ``'step'`` to log ``lr`` of all optimizers + at the same interval, set to ``None`` to log at individual interval + according to the ``interval`` key of each scheduler. Defaults to ``None``. log_momentum: option to also log the momentum values of the optimizer, if the optimizer - has the `momentum` attribute. Defaults to ``False``. + has the ``momentum`` or ``betas`` attribute. Defaults to ``False``. Example:: @@ -47,17 +47,19 @@ class LearningRateMonitor(Callback): >>> trainer = Trainer(callbacks=[lr_monitor]) Logging names are automatically determined based on optimizer class name. - In case of multiple optimizers of same type, they will be named `Adam`, - `Adam-1` etc. If a optimizer has multiple parameter groups they will - be named `Adam/pg1`, `Adam/pg2` etc. To control naming, pass in a - `name` keyword in the construction of the learning rate schdulers + In case of multiple optimizers of same type, they will be named ``Adam``, + ``Adam-1`` etc. If a optimizer has multiple parameter groups they will + be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a + ``name`` keyword in the construction of the learning rate schdulers Example:: def configure_optimizer(self): optimizer = torch.optim.Adam(...) - lr_scheduler = {'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...) - 'name': 'my_logging_name'} + lr_scheduler = { + 'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...) + 'name': 'my_logging_name' + } return [optimizer], [lr_scheduler] """ @@ -80,16 +82,28 @@ def on_train_start(self, trainer, *args, **kwargs): """ if not trainer.logger: raise MisconfigurationException( - 'Cannot use LearningRateMonitor callback with Trainer that has no logger.' + 'Cannot use `LearningRateMonitor` callback with `Trainer` that has no logger.' ) if not trainer.lr_schedulers: rank_zero_warn( - 'You are using LearningRateMonitor callback with models that' + 'You are using `LearningRateMonitor` callback with models that' ' have no learning rate schedulers. Please see documentation' ' for `configure_optimizers` method.', RuntimeWarning ) + if self.log_momentum: + def _check_no_key(key): + return any( + key not in sch['scheduler'].optimizer.defaults for sch in trainer.lr_schedulers + ) + + if _check_no_key('momentum') and _check_no_key('betas'): + rank_zero_warn( + "You have set log_momentum=True, but some optimizers do not" + " have momentum. 
This will log a value 0 for the momentum.", RuntimeWarning + ) + # Find names for schedulers names = self._find_names(trainer.lr_schedulers) @@ -105,7 +119,7 @@ def on_train_batch_start(self, trainer, *args, **kwargs): interval = 'step' if self.logging_interval is None else 'any' latest_stat = self._extract_stats(trainer, interval) - if trainer.logger is not None and latest_stat: + if latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) def on_train_epoch_start(self, trainer, *args, **kwargs): @@ -113,7 +127,7 @@ def on_train_epoch_start(self, trainer, *args, **kwargs): interval = 'epoch' if self.logging_interval is None else 'any' latest_stat = self._extract_stats(trainer, interval) - if trainer.logger is not None and latest_stat: + if latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: @@ -121,19 +135,17 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): if scheduler['interval'] == interval or interval == 'any': - param_groups = scheduler['scheduler'].optimizer.param_groups - if len(param_groups) != 1: - for i, pg in enumerate(param_groups): - lr = self._extract_lr(param_group=pg, name=f'{name}/pg{i + 1}') - latest_stat.update(lr) - momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum/pg{i + 1}') - latest_stat.update(momentum) - - else: - pg = param_groups[0] - lr = self._extract_lr(param_group=pg, name=name) + opt = scheduler['scheduler'].optimizer + param_groups = opt.param_groups + use_betas = 'betas' in opt.defaults + + for i, pg in enumerate(param_groups): + suffix = f'/pg{i + 1}' if len(param_groups) > 1 else '' + lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}') latest_stat.update(lr) - momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum') + momentum = self._extract_momentum( + param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas + ) latest_stat.update(momentum) return latest_stat @@ -143,11 +155,11 @@ def _extract_lr(self, param_group, name: str) -> Dict[str, float]: self.lrs[name].append(lr) return {name: lr} - def _extract_momentum(self, param_group, name: str) -> Dict[str, float]: + def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]: if not self.log_momentum: return {} - momentum = param_group.get('momentum') + momentum = param_group.get('betas')[0] if use_betas else param_group.get('momentum', 0) self.last_momentum_values[name] = momentum return {name: momentum} @@ -190,5 +202,4 @@ def _should_log(trainer) -> bool: or trainer.should_stop ) - should_log = should_log and not trainer.fast_dev_run return should_log diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1354f7f5056b3..e5c960b3c002b 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -20,6 +20,7 @@ """ +import numbers import os import re from copy import deepcopy @@ -32,8 +33,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities.cloud_io import get_filesystem from 
pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -207,6 +208,7 @@ def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]: "best_model_score": self.best_model_score, "best_model_path": self.best_model_path, "current_score": self.current_score, + "dirpath": self.dirpath } def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]): @@ -223,7 +225,8 @@ def save_checkpoint(self, trainer, pl_module): global_step = trainer.global_step if ( - self.save_top_k == 0 # no models are saved + trainer.fast_dev_run # disable checkpointing with fast_dev_run + or self.save_top_k == 0 # no models are saved or self.period < 1 # no models are saved or (epoch + 1) % self.period # skip epoch or trainer.running_sanity_check # don't save anything during sanity check @@ -240,17 +243,14 @@ def save_checkpoint(self, trainer, pl_module): # what can be monitored monitor_candidates = self._monitor_candidates(trainer) - # ie: path/val_loss=0.5.ckpt - filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step) - # callback supports multiple simultaneous modes # here we call each mode sequentially # Mode 1: save all checkpoints OR only the top k if self.save_top_k: - self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath) + self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates) # Mode 2: save the last checkpoint - self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath) + self._save_last_checkpoint(trainer, pl_module, monitor_candidates) def __validate_init_configuration(self): if self.save_top_k is not None and self.save_top_k < -1: @@ -303,8 +303,7 @@ def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): and len(self._fs.ls(dirpath)) > 0 ): rank_zero_warn( - f"Checkpoint directory {dirpath} exists and is not empty. With save_top_k={save_top_k}," - " all files in this directory will be deleted when a checkpoint is saved!" + f"Checkpoint directory {dirpath} exists and is not empty." 
) if dirpath and self._fs.protocol == 'file': @@ -444,6 +443,7 @@ def format_checkpoint_name( ) if ver is not None: filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}")) + ckpt_name = f"{filename}{self.FILE_EXTENSION}" return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name @@ -480,14 +480,14 @@ def __resolve_ckpt_dir(self, trainer, pl_module): version, name = trainer.accelerator_backend.broadcast((version, trainer.logger.name)) ckpt_path = os.path.join( - save_dir, name, version, "checkpoints" + save_dir, str(name), version, "checkpoints" ) else: ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints") self.dirpath = ckpt_path - if trainer.is_global_zero: + if not trainer.fast_dev_run and trainer.is_global_zero: self._fs.makedirs(self.dirpath, exist_ok=True) def _add_backward_monitor_support(self, trainer): @@ -515,13 +515,20 @@ def _validate_monitor_key(self, trainer): ) raise MisconfigurationException(m) - def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int): + def _get_metric_interpolated_filepath_name( + self, + ckpt_name_metrics: Dict[str, Any], + epoch: int, + step: int, + del_filepath: Optional[str] = None + ) -> str: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics) + version_cnt = 0 - while self._fs.exists(filepath): + while self._fs.exists(filepath) and filepath != del_filepath: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt) - # this epoch called before version_cnt += 1 + return filepath def _monitor_candidates(self, trainer): @@ -531,13 +538,11 @@ def _monitor_candidates(self, trainer): ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch}) return ckpt_name_metrics - def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath): + def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics): should_save_last = self.monitor is None or self.save_last if not should_save_last: return - last_filepath = filepath - # when user ALSO asked for the 'last.ckpt' change the name if self.save_last: last_filepath = self._format_checkpoint_name( @@ -548,6 +553,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) prefix=self.prefix ) last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}") + else: + last_filepath = self._get_metric_interpolated_filepath_name( + ckpt_name_metrics, trainer.current_epoch, trainer.global_step + ) accelerator_backend = trainer.accelerator_backend @@ -568,16 +577,19 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) if self.monitor is None: self.best_model_path = self.last_model_path - def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): + def _save_top_k_checkpoints(self, trainer, pl_module, metrics): current = metrics.get(self.monitor) epoch = metrics.get("epoch") step = metrics.get("step") - if not isinstance(current, torch.Tensor) and current is not None: - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if self.check_monitor_top_k(current): - self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module) + self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) elif self.verbose: 
rank_zero_info( f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}" @@ -588,25 +600,26 @@ def _is_valid_monitor_key(self, metrics): def _update_best_and_save( self, - filepath: str, current: torch.Tensor, epoch: int, step: int, trainer, pl_module, + ckpt_name_metrics ): k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k - del_list = [] + del_filepath = None if len(self.best_k_models) == k and k > 0: - delpath = self.kth_best_model_path - self.best_k_models.pop(self.kth_best_model_path) - del_list.append(delpath) + del_filepath = self.kth_best_model_path + self.best_k_models.pop(del_filepath) # do not save nan, replace with +/- inf if torch.isnan(current): current = torch.tensor(float('inf' if self.mode == "min" else '-inf')) + filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, del_filepath) + # save the current score self.current_score = current self.best_k_models[filepath] = current @@ -630,9 +643,8 @@ def _update_best_and_save( ) self._save_model(filepath, trainer, pl_module) - for cur_path in del_list: - if cur_path != filepath: - self._del_model(cur_path) + if del_filepath is not None and filepath != del_filepath: + self._del_model(del_filepath) def to_yaml(self, filepath: Optional[Union[str, Path]] = None): """ diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 6582f16fd27be..639a988bf3856 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -22,9 +22,10 @@ import importlib import sys - # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed +from typing import Optional, Union + if importlib.util.find_spec('ipywidgets') is not None: from tqdm.auto import tqdm else: @@ -307,7 +308,7 @@ def init_test_tqdm(self) -> tqdm: def on_sanity_check_start(self, trainer, pl_module): super().on_sanity_check_start(trainer, pl_module) self.val_progress_bar = self.init_sanity_tqdm() - self.val_progress_bar.total = convert_inf(sum(trainer.num_sanity_val_batches)) + reset(self.val_progress_bar, sum(trainer.num_sanity_val_batches)) self.main_progress_bar = tqdm(disable=True) # dummy progress bar def on_sanity_check_end(self, trainer, pl_module): @@ -323,13 +324,12 @@ def on_epoch_start(self, trainer, pl_module): super().on_epoch_start(trainer, pl_module) total_train_batches = self.total_train_batches total_val_batches = self.total_val_batches - if total_train_batches != float('inf') and not trainer.fast_dev_run: + if total_train_batches != float('inf'): # val can be checked multiple times per epoch val_checks_per_epoch = total_train_batches // trainer.val_check_batch total_val_batches = total_val_batches * val_checks_per_epoch total_batches = total_train_batches + total_val_batches - if not self.main_progress_bar.disable: - self.main_progress_bar.reset(convert_inf(total_batches)) + reset(self.main_progress_bar, total_batches) self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch}') def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): @@ -343,7 +343,7 @@ def on_validation_start(self, trainer, pl_module): if not trainer.running_sanity_check: self._update_bar(self.main_progress_bar) # fill up remaining self.val_progress_bar = self.init_validation_tqdm() - self.val_progress_bar.total = convert_inf(self.total_val_batches) + reset(self.val_progress_bar, self.total_val_batches) def 
on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -363,7 +363,7 @@ def on_train_end(self, trainer, pl_module): def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) self.test_progress_bar = self.init_test_tqdm() - self.test_progress_bar.total = convert_inf(self.total_test_batches) + reset(self.test_progress_bar, self.total_test_batches) def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -388,8 +388,14 @@ def _update_bar(self, bar): bar.update(delta) -def convert_inf(x): +def convert_inf(x: Optional[Union[int, float]]) -> Optional[Union[int, float]]: """ The tqdm doesn't support inf values. We have to convert it to None. """ if x == float('inf'): return None return x + + +def reset(bar: tqdm, total: Optional[int] = None) -> None: + """ Resets the tqdm bar to 0 progress with a new total, unless it is disabled. """ + if not bar.disable: + bar.reset(total=convert_inf(total)) diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 6df1cf680c57f..cb8db4d440178 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -26,7 +26,7 @@ def __init__(self): def master_address(self): # figure out the root node addr try: - root_node = os.environ["SLURM_NODELIST"].split(" ")[0] + root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0] except Exception: root_node = "127.0.0.1" diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index f24a4ce8beb8a..a87ebbeb47199 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -17,10 +17,11 @@ from typing import Any, Dict, List, Optional, Union import torch -from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader +from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn + class ModelHooks: """Hooks to be used in LightningModule.""" @@ -284,10 +285,9 @@ def on_after_backward(self): if self.trainer.global_step % 25 == 0: # don't make the tf file huge params = self.state_dict() for k, v in params.items(): - grads = v - name = k - self.logger.experiment.add_histogram(tag=name, values=grads, - global_step=self.trainer.global_step) + self.logger.experiment.add_histogram( + tag=k, values=v.grad, global_step=self.trainer.global_step + ) """ @@ -539,9 +539,9 @@ def transfer_batch_to_device(self, batch, device) any other device than the one passed in as argument (unless you know what you are doing). Note: - This hook only runs on single GPU training (no data-parallel). If you need multi-GPU support - for your custom batch objects, you need to define your custom - :class:`~torch.nn.parallel.DistributedDataParallel` or + This hook only runs on single GPU training and DDP. + If you need multi-GPU support for your custom batch objects in ``dp`` or ``ddp2``, + you need to define your custom :class:`~torch.nn.parallel.DistributedDataParallel` or :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel` and override :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_ddp`. 
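The updated note in ``pytorch_lightning/core/hooks.py`` above states that ``transfer_batch_to_device`` now covers single-GPU and DDP training, while ``dp``/``ddp2`` still require a custom parallel wrapper via ``configure_ddp``. A minimal sketch of the kind of override that note refers to; the ``CustomBatch`` container and its fields are illustrative assumptions, not part of this patch:

.. code-block:: python

    import torch
    import pytorch_lightning as pl


    class CustomBatch:
        """Illustrative container holding a batch of inputs and targets."""

        def __init__(self, inputs: torch.Tensor, targets: torch.Tensor):
            self.inputs = inputs
            self.targets = targets


    class LitModel(pl.LightningModule):
        def transfer_batch_to_device(self, batch, device):
            # move the custom container field by field; anything Lightning
            # already understands falls through to the default handling
            if isinstance(batch, CustomBatch):
                batch.inputs = batch.inputs.to(device)
                batch.targets = batch.targets.to(device)
                return batch
            return super().transfer_batch_to_device(batch, device)

For ``dp``/``ddp2``, per the updated note, a custom batch object would instead need its own ``DistributedDataParallel``/``LightningDistributedDataParallel`` wrapper supplied through ``configure_ddp``.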
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 8019d865c0ca0..f750c8aff7caf 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -35,9 +35,9 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO +from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES from pytorch_lightning.core.step_result import Result -from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn, TPU_AVAILABLE from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args @@ -111,9 +111,13 @@ def __init__(self, *args, **kwargs): self._running_manual_backward = False self._current_hook_fx_name = None self._current_dataloader_idx = None + self._automatic_optimization: bool = True - def optimizers(self): - opts = self.trainer.optimizers + def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: + if use_pl_optimizer: + opts = list(self.trainer.lightning_optimizers.values()) + else: + opts = self.trainer.optimizers # single optimizer if isinstance(opts, list) and len(opts) == 1 and isinstance(opts[0], Optimizer): @@ -160,7 +164,11 @@ def automatic_optimization(self) -> bool: """ If False you are responsible for calling .backward, .step, zero_grad. """ - return True + return self._automatic_optimization + + @automatic_optimization.setter + def automatic_optimization(self, automatic_optimization: bool) -> None: + self._automatic_optimization = automatic_optimization def print(self, *args, **kwargs) -> None: r""" @@ -279,6 +287,7 @@ def log( sync_dist_group, accelerator.sync_tensor, self._current_dataloader_idx, + self.device, ) def log_dict( @@ -617,14 +626,14 @@ def validation_step(self, *args, **kwargs): for val_batch in val_data: out = validation_step(val_batch) val_outs.append(out) - validation_epoch_end(val_outs) + validation_epoch_end(val_outs) Args: batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]): The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list. batch_idx (int): The index of this batch dataloader_idx (int): The index of the dataloader that produced this batch - (only if multiple val datasets used) + (only if multiple val dataloaders used) Return: Any of. @@ -673,11 +682,11 @@ def validation_step(self, batch, batch_idx): # log the outputs! self.log_dict({'val_loss': loss, 'val_acc': val_acc}) - If you pass in multiple val datasets, validation_step will have an additional argument. + If you pass in multiple val dataloaders, :meth:`validation_step` will have an additional argument. .. code-block:: python - # CASE 2: multiple validation datasets + # CASE 2: multiple validation dataloaders def validation_step(self, batch, batch_idx, dataloader_idx): # dataloader_idx tells you which dataset this is. 
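The ``core/lightning.py`` hunks above add a ``use_pl_optimizer`` argument to ``LightningModule.optimizers()`` and turn ``automatic_optimization`` into a settable property. A minimal manual-optimization sketch under those changes; the model, batch shape, and learning rate are made up for illustration, and it assumes the train loop reads the instance-level ``automatic_optimization`` flag as the new setter implies:

.. code-block:: python

    import torch
    import pytorch_lightning as pl


    class ManualOptModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)
            # assignable thanks to the new automatic_optimization setter
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):
            # use_pl_optimizer=False returns the raw torch.optim optimizer;
            # the default (True) returns the LightningOptimizer wrapper instead
            opt = self.optimizers(use_pl_optimizer=False)
            loss = self.layer(batch).sum()
            self.manual_backward(loss, opt)
            opt.step()
            opt.zero_grad()

        def configure_optimizers(self):
            return torch.optim.SGD(self.layer.parameters(), lr=0.1)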
@@ -736,7 +745,7 @@ def validation_step(self, batch, batch_idx): out = self(x) return out - def validation_epoch_end(self, val_step_outputs): + def validation_step_end(self, val_step_outputs): for out in val_step_outputs: # do something with these @@ -744,9 +753,7 @@ def validation_epoch_end(self, val_step_outputs): See the :ref:`multi_gpu` guide for more details. """ - def validation_epoch_end( - self, outputs: List[Any] - ) -> None: + def validation_epoch_end(self, outputs: List[Any]) -> None: """ Called at the end of the validation epoch with the outputs of all validation steps. @@ -811,7 +818,7 @@ def test_step(self, *args, **kwargs): The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list. batch_idx (int): The index of this batch. dataloader_idx (int): The index of the dataloader that produced this batch - (only if multiple test datasets used). + (only if multiple test dataloaders used). Return: Any of. @@ -851,17 +858,17 @@ def test_step(self, batch, batch_idx): # log the outputs! self.log_dict({'test_loss': loss, 'test_acc': test_acc}) - If you pass in multiple validation datasets, :meth:`test_step` will have an additional + If you pass in multiple test dataloaders, :meth:`test_step` will have an additional argument. .. code-block:: python - # CASE 2: multiple test datasets + # CASE 2: multiple test dataloaders def test_step(self, batch, batch_idx, dataloader_idx): # dataloader_idx tells you which dataset this is. Note: - If you don't need to validate you don't need to implement this method. + If you don't need to test you don't need to implement this method. Note: When the :meth:`test_step` is called, the model has been put in eval mode and @@ -913,7 +920,7 @@ def test_step(self, batch, batch_idx): out = self.encoder(x) return out - def test_epoch_end(self, output_results): + def test_step_end(self, output_results): # this out is now the full size of the batch all_test_step_outs = output_results.out loss = nce_loss(all_test_step_outs) @@ -1251,9 +1258,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer.zero_grad() """ - if not isinstance(optimizer, LightningOptimizer): - # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) optimizer.step(closure=optimizer_closure) def optimizer_zero_grad( @@ -1331,9 +1335,17 @@ def tbptt_split_batch(self, batch, split_size): return splits - def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary: - model_summary = ModelSummary(self, mode=mode) - log.info("\n" + str(model_summary)) + def summarize(self, mode: Optional[str] = ModelSummary.MODE_DEFAULT) -> Optional[ModelSummary]: + model_summary = None + + if mode in ModelSummary.MODES: + model_summary = ModelSummary(self, mode=mode) + log.info("\n" + str(model_summary)) + elif mode is not None: + raise MisconfigurationException( + f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}" + ) + return model_summary def freeze(self) -> None: @@ -1392,12 +1404,15 @@ def get_progress_bar_dict(self): """ # call .item() only once but store elements without graphs running_train_loss = self.trainer.train_loop.running_loss.mean() - avg_training_loss = ( - running_train_loss.cpu().item() - if running_train_loss is not None - else float("NaN") - ) - tqdm_dict = {"loss": "{:.3g}".format(avg_training_loss)} + avg_training_loss = None + if running_train_loss is not None: + avg_training_loss = running_train_loss.cpu().item() + elif 
self.trainer.train_loop.automatic_optimization: + avg_training_loss = float('NaN') + + tqdm_dict = {} + if avg_training_loss is not None: + tqdm_dict["loss"] = f"{avg_training_loss:.3g}" if self.trainer.truncated_bptt_steps is not None: tqdm_dict["split_idx"] = self.trainer.split_idx diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index faafc0a0f0584..de551cabd30df 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -33,17 +33,13 @@ class LayerSummary(object): """ Summary class for a single layer in a :class:`~pytorch_lightning.core.lightning.LightningModule`. It collects the following information: - - Type of the layer (e.g. Linear, BatchNorm1d, ...) - Input shape - Output shape - Number of parameters - The input and output shapes are only known after the example input array was passed through the model. - Example:: - >>> model = torch.nn.Conv2d(3, 8, 3) >>> summary = LayerSummary(model) >>> summary.num_parameters @@ -55,10 +51,8 @@ class LayerSummary(object): [1, 3, 5, 5] >>> summary.out_size [1, 8, 3, 3] - Args: module: A module to summarize - """ def __init__(self, module: nn.Module): @@ -76,7 +70,6 @@ def _register_hook(self) -> RemovableHandle: Registers a hook on the module that computes the input- and output size(s) on the first forward pass. If the hook is called, it will remove itself from the from the module, meaning that recursive models will only record their input- and output shapes once. - Return: A handle for the installed hook. """ @@ -120,25 +113,19 @@ def num_parameters(self) -> int: class ModelSummary(object): """ Generates a summary of all layers in a :class:`~pytorch_lightning.core.lightning.LightningModule`. - Args: model: The model to summarize (also referred to as the root module) mode: Can be one of - - `top` (default): only the top-level modules will be recorded (the children of the root module) - `full`: summarizes all layers and their submodules in the root module - The string representation of this summary prints a table with columns containing - the name, type and number of parameters for each layer. - + the name type and number of parameters for each layer. The root module may also have an attribute ``example_input_array`` as shown in the example below. If present, the root module will be called with it as input to determine the intermediate input- and output shapes of all layers. Supported are tensors and nested lists and tuples of tensors. All other types of inputs will be skipped and show as `?` in the summary table. The summary will also display `?` for layers not used in the forward pass. - Example:: - >>> import pytorch_lightning as pl >>> class LitModel(pl.LightningModule): ... 
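The ``ModelSummary`` docstring above documents the ``top``/``full`` modes and the optional ``example_input_array`` used to record per-layer input and output shapes. A small usage sketch along those lines; the layer sizes and the input shape are made up for illustration:

.. code-block:: python

    import torch
    import pytorch_lightning as pl
    from pytorch_lightning.core.memory import ModelSummary


    class LitModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            # lets the summary record input/output sizes for every layer
            self.example_input_array = torch.zeros(1, 28 * 28)
            self.net = torch.nn.Sequential(
                torch.nn.Linear(28 * 28, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 10),
            )

        def forward(self, x):
            return self.net(x)


    # 'full' lists every submodule, 'top' (the default) only the direct children
    print(ModelSummary(LitModel(), mode="full"))

With the additions in the following hunks, the printed table also reports the total and trainable parameter counts plus an estimated model size in MB.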
@@ -169,6 +156,7 @@ class ModelSummary(object): 132 K Trainable params 0 Non-trainable params 132 K Total params + 0.506 Total estimated model params size (MB) """ MODE_TOP = "top" @@ -180,6 +168,7 @@ def __init__(self, model, mode: str = MODE_DEFAULT): self._model = model self._mode = mode self._layer_summary = self.summarize() + self._precision_megabytes = (self._model.precision / 8.0) / (1024 ** 2.0) # 1 byte -> 8 bits) @property def named_modules(self) -> List[Tuple[str, nn.Module]]: @@ -213,6 +202,31 @@ def out_sizes(self) -> List: def param_nums(self) -> List[int]: return [layer.num_parameters for layer in self._layer_summary.values()] + @property + def total_parameters(self) -> int: + return sum(p.numel() for p in self._model.parameters()) + + @property + def trainable_parameters(self) -> int: + return sum(p.numel() for p in self._model.parameters() if p.requires_grad) + + def model_size(self) -> float: + """ + Estimates total model size i.e total params size in MBs + total params size gives model size in accounting total model params. + + NOTE: Currently only Supported total params size. + + Example:: + >> model = LitModel() + >> summary = ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE + >> summary.model_size() + + Returns: + Total estimated model size(MB). + """ + return self.total_parameters * self._precision_megabytes + def summarize(self) -> Dict[str, LayerSummary]: summary = OrderedDict((name, LayerSummary(module)) for name, module in self.named_modules) if self._model.example_input_array is not None: @@ -247,7 +261,6 @@ def _forward_example_input(self) -> None: def __str__(self): """ Makes a summary listing with: - Layer Name, Layer Type, Number of Parameters, Input Sizes, Output Sizes """ arrays = [ @@ -259,11 +272,11 @@ def __str__(self): if self._model.example_input_array is not None: arrays.append(["In sizes", self.in_sizes]) arrays.append(["Out sizes", self.out_sizes]) + total_parameters = self.total_parameters + trainable_parameters = self.trainable_parameters + model_size = self.model_size() - trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) - total_parameters = sum(p.numel() for p in self._model.parameters()) - - return _format_summary_table(total_parameters, trainable_parameters, *arrays) + return _format_summary_table(total_parameters, trainable_parameters, model_size, *arrays) def __repr__(self): return str(self) @@ -280,7 +293,7 @@ def parse_batch_shape(batch: Any) -> Union[str, List]: return UNKNOWN_SIZE -def _format_summary_table(total_parameters: int, trainable_parameters: int, *cols) -> str: +def _format_summary_table(total_parameters: int, trainable_parameters: int, model_size: float, *cols) -> str: """ Takes in a number of arrays, each specifying a column in the summary table, and combines them all into one big @@ -316,24 +329,22 @@ def _format_summary_table(total_parameters: int, trainable_parameters: int, *col summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" + summary += "\n" + s.format(get_formatted_model_size(model_size), 10) + summary += "Total Estimated Params Size (MB)" return summary def get_memory_profile(mode: str) -> Union[Dict[str, int], Dict[int, int]]: """ Get a profile of the current memory usage. 
- Args: mode: There are two modes: - - 'all' means return memory for all gpus - 'min_max' means return memory for max and min - Return: A dictionary in which the keys are device ids as integers and values are memory usage as integers in MB. If mode is 'min_max', the dictionary will also contain two additional keys: - - 'min_gpu_mem': the minimum memory usage in MB - 'max_gpu_mem': the maximum memory usage in MB """ @@ -351,7 +362,6 @@ def get_memory_profile(mode: str) -> Union[Dict[str, int], Dict[int, int]]: def get_gpu_memory_map() -> Dict[str, int]: """ Get the current gpu usage. - Return: A dictionary in which the keys are device ids as integers and values are memory usage as integers in MB. @@ -372,12 +382,13 @@ def get_gpu_memory_map() -> Dict[str, int]: } return gpu_memory_map +def get_formatted_model_size(total_model_size: float) -> float: + return f"{total_model_size:.3f}" def get_human_readable_count(number: int) -> str: """ Abbreviates an integer number with K, M, B, T for thousands, millions, billions and trillions, respectively. - Examples: >>> get_human_readable_count(123) '123 ' @@ -391,13 +402,10 @@ def get_human_readable_count(number: int) -> str: '400 T' >>> get_human_readable_count(5e15) # (more than trillion) '5,000 T' - Args: number: a positive integer number - Return: A string formatted according to the pattern described above. - """ assert number >= 0 labels = PARAMETER_NUM_UNITS diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index c8e9ff8b80a2f..ed5e9490983b0 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer -from pytorch_lightning.utilities import TPU_AVAILABLE +from pytorch_lightning.utilities import AMPType, TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if TPU_AVAILABLE: @@ -62,6 +62,10 @@ def __init__(self, self._accumulate_grad_batches = accumulate_grad_batches self._optimizer_idx = None + @property + def optimizer(self): + return self._optimizer + @property def defaults(self): return self._optimizer.defaults @@ -102,9 +106,13 @@ def _on_trainer_init(self, trainer): break @classmethod - def to_lightning_optimizer(cls, optimizer, trainer): - optimizer = cls(optimizer) - optimizer._on_trainer_init(trainer) + def _to_lightning_optimizer(cls, optimizer, trainer, opt_idx): + # apex overrides .step function and need to be wrapped on each step + if trainer.amp_backend == AMPType.APEX: + optimizer = cls(optimizer) + optimizer._on_trainer_init(trainer) + else: + optimizer = trainer.lightning_optimizers[opt_idx] return optimizer def _accumulated_batches_reached(self): @@ -146,7 +154,7 @@ def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_n **kwargs ) - trainer.train_loop.on_before_zero_grad(self) + trainer.train_loop.on_before_zero_grad(optimizer) model.optimizer_zero_grad( trainer.current_epoch, diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 6741236a7e5f5..12a29246888f7 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -17,16 +17,19 @@ import inspect import os from argparse import Namespace -from typing import Union, Dict, Any, Optional, Callable, MutableMapping, IO +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Dict, IO, MutableMapping, Optional, Union from warnings import warn import torch import yaml from pytorch_lightning import 
_logger as log -from pytorch_lightning.utilities import rank_zero_warn, AttributeDict, OMEGACONF_AVAILABLE -from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities import AttributeDict, OMEGACONF_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.parsing import parse_class_init_keys PRIMITIVE_TYPES = (bool, int, float, str) @@ -34,6 +37,9 @@ if OMEGACONF_AVAILABLE: from omegaconf import OmegaConf + from omegaconf.dictconfig import DictConfig + from omegaconf.errors import UnsupportedValueType, ValidationError + # the older shall be on the top CHECKPOINT_PAST_HPARAMS_KEYS = ( @@ -321,9 +327,14 @@ def save_hparams_to_tags_csv(tags_csv: str, hparams: Union[dict, Namespace]) -> writer.writerow({"key": k, "value": v}) -def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: +def load_hparams_from_yaml(config_yaml: str, use_omegaconf: bool = True) -> Dict[str, Any]: """Load hparams from a file. + Args: + config_yaml: Path to config yaml file + use_omegaconf: If both `OMEGACONF_AVAILABLE` and `use_omegaconf` are True, + the hparams will be converted to `DictConfig` if possible + >>> hparams = Namespace(batch_size=32, learning_rate=0.001, data_root='./any/path/here') >>> path_yaml = './testing-hparams.yaml' >>> save_hparams_to_yaml(path_yaml, hparams) @@ -338,9 +349,15 @@ def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: return {} with fs.open(config_yaml, "r") as fp: - tags = yaml.full_load(fp) + hparams = yaml.full_load(fp) - return tags + if OMEGACONF_AVAILABLE: + if use_omegaconf: + try: + return OmegaConf.create(hparams) + except (UnsupportedValueType, ValidationError): + pass + return hparams def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None: @@ -361,15 +378,16 @@ def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None: # saving with OmegaConf objects if OMEGACONF_AVAILABLE: - if OmegaConf.is_config(hparams): - with fs.open(config_yaml, "w", encoding="utf-8") as fp: - OmegaConf.save(hparams, fp, resolve=True) - return - for v in hparams.values(): - if OmegaConf.is_config(v): - with fs.open(config_yaml, "w", encoding="utf-8") as fp: - OmegaConf.save(OmegaConf.create(hparams), fp, resolve=True) + # deepcopy: hparams from user shouldn't be resolved + hparams = deepcopy(hparams) + to_container = partial(OmegaConf.to_container, resolve=True) + hparams = apply_to_collection(hparams, DictConfig, to_container) + with fs.open(config_yaml, "w", encoding="utf-8") as fp: + try: + OmegaConf.save(hparams, fp) return + except (UnsupportedValueType, ValidationError): + pass assert isinstance(hparams, dict) hparams_allowed = {} diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 142fe9048cb0e..b6112a68b4e9b 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -15,15 +15,15 @@ """[Train, Eval]Result for easier logging, checkpointing, early stopping, epoch-wise reduction.""" import numbers +import os from copy import copy -from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any, List, Tuple, Iterable +from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union import torch from torch import Tensor -import os -from 
pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.metrics import Metric +from pytorch_lightning.utilities.distributed import sync_ddp_if_available class Result(Dict): @@ -128,6 +128,7 @@ def log( sync_dist_group: Optional[Any] = None, sync_fn: Callable = None, dataloader_idx: Optional[int] = None, + device: torch.device = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): @@ -138,7 +139,10 @@ def log( if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized() # TODO: Find a way to make the reduction only once, so we don't need to clone. - value = value.clone() if is_dist_initialized else value + if is_dist_initialized and isinstance(value, torch.Tensor): + value = value.clone() + else: + value = torch.tensor(value, device=device, dtype=torch.float) value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: @@ -367,7 +371,10 @@ def get_forked_metrics(self, add_dataloader_idx=False): dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) if options['forked']: - result[dl_key] = self[k] + if isinstance(self[k], Metric): + result[dl_key] = self[k].compute().detach() + else: + result[dl_key] = self[k] return result diff --git a/pytorch_lightning/loggers/__init__.py b/pytorch_lightning/loggers/__init__.py index 562e027ea36b6..41f42be3f02f3 100644 --- a/pytorch_lightning/loggers/__init__.py +++ b/pytorch_lightning/loggers/__init__.py @@ -24,40 +24,25 @@ 'CSVLogger', ] -try: - # needed to prevent ImportError and duplicated logs. - environ["COMET_DISABLE_AUTO_LOGGING"] = "1" +from pytorch_lightning.loggers.comet import _COMET_AVAILABLE, CometLogger +from pytorch_lightning.loggers.mlflow import _MLFLOW_AVAILABLE, MLFlowLogger +from pytorch_lightning.loggers.neptune import _NEPTUNE_AVAILABLE, NeptuneLogger +from pytorch_lightning.loggers.test_tube import _TESTTUBE_AVAILABLE, TestTubeLogger +from pytorch_lightning.loggers.wandb import _WANDB_AVAILABLE, WandbLogger - from pytorch_lightning.loggers.comet import CometLogger -except ImportError: # pragma: no-cover - del environ["COMET_DISABLE_AUTO_LOGGING"] # pragma: no-cover -else: +if _COMET_AVAILABLE: __all__.append('CometLogger') + # needed to prevent ImportError and duplicated logs. 
+ environ["COMET_DISABLE_AUTO_LOGGING"] = "1" -try: - from pytorch_lightning.loggers.mlflow import MLFlowLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _MLFLOW_AVAILABLE: __all__.append('MLFlowLogger') -try: - from pytorch_lightning.loggers.neptune import NeptuneLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _NEPTUNE_AVAILABLE: __all__.append('NeptuneLogger') -try: - from pytorch_lightning.loggers.test_tube import TestTubeLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _TESTTUBE_AVAILABLE: __all__.append('TestTubeLogger') -try: - from pytorch_lightning.loggers.wandb import WandbLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _WANDB_AVAILABLE: __all__.append('WandbLogger') diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index a27998366b671..ac7ab3e023bdb 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -207,7 +207,7 @@ def _sanitize_callable(val): return {key: _sanitize_callable(val) for key, val in params.items()} @staticmethod - def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any]: + def _flatten_dict(params: Dict[Any, Any], delimiter: str = '/') -> Dict[str, Any]: """ Flatten hierarchical dict, e.g. ``{'a': {'b': 'c'}} -> {'a/b': 'c'}``. @@ -223,12 +223,15 @@ def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any {'a/b': 'c'} >>> LightningLoggerBase._flatten_dict({'a': {'b': 123}}) {'a/b': 123} + >>> LightningLoggerBase._flatten_dict({5: {'a': 123}}) + {'5/a': 123} """ def _dict_generator(input_dict, prefixes=None): prefixes = prefixes[:] if prefixes else [] if isinstance(input_dict, MutableMapping): for key, value in input_dict.items(): + key = str(key) if isinstance(value, (MutableMapping, Namespace)): value = vars(value) if isinstance(value, Namespace) else value for d in _dict_generator(value, prefixes + [key]): diff --git a/pytorch_lightning/loggers/comet.py b/pytorch_lightning/loggers/comet.py index 64c87888da9d2..869bce831f0c2 100644 --- a/pytorch_lightning/loggers/comet.py +++ b/pytorch_lightning/loggers/comet.py @@ -21,17 +21,18 @@ from argparse import Namespace from typing import Any, Dict, Optional, Union -try: - import comet_ml +import torch +from torch import is_tensor -except ModuleNotFoundError: # pragma: no-cover - comet_ml = None - CometExperiment = None - CometExistingExperiment = None - CometOfflineExperiment = None - API = None - generate_guid = None -else: +from pytorch_lightning import _logger as log +from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import rank_zero_only, _module_available +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +_COMET_AVAILABLE = _module_available("comet_ml") + +if _COMET_AVAILABLE: + import comet_ml from comet_ml import ExistingExperiment as CometExistingExperiment from comet_ml import Experiment as CometExperiment from comet_ml import OfflineExperiment as CometOfflineExperiment @@ -41,14 +42,11 @@ except ImportError: # pragma: no-cover # For more information, see: https://www.comet.ml/docs/python-sdk/releases/#release-300 from comet_ml.papi import API # pragma: no-cover - -import torch -from torch import is_tensor - -from pytorch_lightning import _logger as log -from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from 
pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.exceptions import MisconfigurationException +else: + # needed for test mocks, these tests shall be updated + comet_ml = None + CometExperiment, CometExistingExperiment, CometOfflineExperiment = None, None, None + API = None class CometLogger(LightningLoggerBase): diff --git a/pytorch_lightning/loggers/mlflow.py b/pytorch_lightning/loggers/mlflow.py index 92f1c15d589d4..4987d050c925d 100644 --- a/pytorch_lightning/loggers/mlflow.py +++ b/pytorch_lightning/loggers/mlflow.py @@ -21,21 +21,25 @@ from time import time from typing import Any, Dict, Optional, Union -try: - import mlflow - from mlflow.tracking import MlflowClient -except ModuleNotFoundError: # pragma: no-cover - mlflow = None - MlflowClient = None - from pytorch_lightning import _logger as log from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn, _module_available + LOCAL_FILE_URI_PREFIX = "file:" +_MLFLOW_AVAILABLE = _module_available("mlflow") +try: + import mlflow + from mlflow.tracking import MlflowClient +# todo: there seems to be still some remaining import error with Conda env +except ImportError: + _MLFLOW_AVAILABLE = False + mlflow, MlflowClient = None, None + + class MLFlowLogger(LightningLoggerBase): """ Log using `MLflow `_. diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 410473f28614a..9f3c3787a417e 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -17,21 +17,23 @@ -------------- """ from argparse import Namespace -from typing import Any, Dict, Iterable, List, Optional, Union - -try: - import neptune - from neptune.experiments import Experiment -except ImportError: # pragma: no-cover - neptune = None - Experiment = None +from typing import Any, Dict, Iterable, Optional, Union import torch from torch import is_tensor from pytorch_lightning import _logger as log from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities import rank_zero_only, _module_available + +_NEPTUNE_AVAILABLE = _module_available("neptune") + +if _NEPTUNE_AVAILABLE: + import neptune + from neptune.experiments import Experiment +else: + # needed for test mocks, these tests shall be updated + neptune, Experiment = None, None class NeptuneLogger(LightningLoggerBase): diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index afdb98cb978de..f8e984c6ff5bc 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -144,8 +144,21 @@ def experiment(self) -> SummaryWriter: return self._experiment @rank_zero_only - def log_hyperparams(self, params: Union[Dict[str, Any], Namespace], - metrics: Optional[Dict[str, Any]] = None) -> None: + def log_hyperparams( + self, + params: Union[Dict[str, Any], Namespace], + metrics: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Record hyperparameters. TensorBoard logs with and without saved hyperparameters + are incompatible, the hyperparameters are then not displayed in the TensorBoard. + Please delete or move the previously saved logs to display the new ones with hyperparameters. 
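The loggers now gate their optional imports on flags such as ``_MLFLOW_AVAILABLE = _module_available("mlflow")`` instead of bare ``try/except ImportError`` blocks. A minimal sketch of such a check (a hypothetical ``module_available`` helper built on ``importlib``, not necessarily the exact ``pytorch_lightning.utilities`` implementation)::

    import importlib.util

    def module_available(module_path: str) -> bool:
        """Return True if the module can be located without actually importing it."""
        try:
            return importlib.util.find_spec(module_path) is not None
        except ModuleNotFoundError:
            # a missing parent package makes find_spec raise instead of returning None
            return False

    _MLFLOW_AVAILABLE = module_available("mlflow")

    if _MLFLOW_AVAILABLE:
        import mlflow
    else:
        mlflow = None  # keeps the attribute patchable in tests, mirroring the loggers above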
+ + Args: + params: a dictionary-like container with the hyperparameters + metrics: Dictionary with metric names as keys and measured quantities as values + """ + params = self._convert_params(params) # store params to output @@ -217,7 +230,7 @@ def save(self) -> None: hparams_file = os.path.join(dir_path, self.NAME_HPARAMS_FILE) # save the metatags file if it doesn't exist - if not os.path.isfile(hparams_file): + if not self._fs.isfile(hparams_file): save_hparams_to_yaml(hparams_file, self.hparams) @rank_zero_only diff --git a/pytorch_lightning/loggers/test_tube.py b/pytorch_lightning/loggers/test_tube.py index 3750a32eab103..65d7deb90f43c 100644 --- a/pytorch_lightning/loggers/test_tube.py +++ b/pytorch_lightning/loggers/test_tube.py @@ -19,15 +19,18 @@ from argparse import Namespace from typing import Any, Dict, Optional, Union -try: - from test_tube import Experiment -except ImportError: # pragma: no-cover - Experiment = None - from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import _module_available from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn +_TESTTUBE_AVAILABLE = _module_available("test_tube") + +if _TESTTUBE_AVAILABLE: + from test_tube import Experiment +else: + Experiment = None + class TestTubeLogger(LightningLoggerBase): r""" diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 24007c3a04307..f92c44ab27b7f 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -22,15 +22,18 @@ import torch.nn as nn +from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import rank_zero_only, _module_available +from pytorch_lightning.utilities.warning_utils import WarningCache + +_WANDB_AVAILABLE = _module_available("wandb") + try: import wandb from wandb.wandb_run import Run -except ImportError: # pragma: no-cover - wandb = None - Run = None - -from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only +except ImportError: + # needed for test mocks, these tests shall be updated + wandb, Run = None, None class WandbLogger(LightningLoggerBase): @@ -59,13 +62,16 @@ class WandbLogger(LightningLoggerBase): Example:: - .. code:: + .. code-block:: python from pytorch_lightning.loggers import WandbLogger from pytorch_lightning import Trainer wandb_logger = WandbLogger() trainer = Trainer(logger=wandb_logger) + Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`, + make sure to use `commit=False` so the logging step does not increase. 
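The note added to ``WandbLogger`` recommends ``commit=False`` for manual ``wandb.log`` calls so the W&B step counter does not advance ahead of the trainer. A standalone sketch of that behaviour (offline mode, outside of Lightning, so it runs without an account; the step comments reflect my reading of the W&B semantics)::

    import wandb

    run = wandb.init(project="demo", mode="offline")

    # each default (committing) call writes a row and advances the step
    wandb.log({"train_loss": 0.42})
    # commit=False merges extra data into the current row without advancing the step
    wandb.log({"examples_seen": 128}, commit=False)
    # the next committing call writes everything accumulated so far
    wandb.log({"train_loss": 0.40})

    run.finish()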
+ See Also: - `Tutorial `__ @@ -103,8 +109,9 @@ def __init__( self._log_model = log_model self._prefix = prefix self._kwargs = kwargs - # logging multiple Trainer on a single W&B run (k-fold, etc) + # logging multiple Trainer on a single W&B run (k-fold, resuming, etc) self._step_offset = 0 + self.warning_cache = WarningCache() def __getstate__(self): state = self.__dict__.copy() @@ -134,6 +141,8 @@ def experiment(self) -> Run: self._experiment = wandb.init( name=self._name, dir=self._save_dir, project=self._project, anonymous=self._anonymous, id=self._id, resume='allow', **self._kwargs) if wandb.run is None else wandb.run + # offset logging step when resuming a run + self._step_offset = self._experiment.step # save checkpoints in wandb dir to upload on W&B servers if self._log_model: self._save_dir = self._experiment.dir @@ -154,6 +163,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0' metrics = self._add_prefix(metrics) + if step is not None and step + self._step_offset < self.experiment.step: + self.warning_cache.warn('Trying to log at a previous step. Use `commit=False` when logging metrics manually.') self.experiment.log(metrics, step=(step + self._step_offset) if step is not None else None) @property diff --git a/pytorch_lightning/metrics/classification/precision_recall.py b/pytorch_lightning/metrics/classification/precision_recall.py index 7e1f843b9c331..c5a577a5d45e3 100644 --- a/pytorch_lightning/metrics/classification/precision_recall.py +++ b/pytorch_lightning/metrics/classification/precision_recall.py @@ -42,7 +42,6 @@ class Precision(Metric): Args: num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. threshold: Threshold value for binary or multi-label logits. default: 0.5 @@ -135,7 +134,6 @@ class Recall(Metric): Args: num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. threshold: Threshold value for binary or multi-label logits. default: 0.5 @@ -207,7 +205,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor): def compute(self): """ - Computes accuracy over state. + Computes recall over state. """ if self.average == 'micro': return self.true_positives.sum().float() / (self.actual_positives.sum() + METRIC_EPS) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 0f61b94c55139..a21242c3bdc7e 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -94,7 +94,8 @@ def add_state( reset to this value when ``self.reset()`` is called. dist_reduce_fx (Optional): Function to reduce state accross mutliple processes in distributed mode. If value is ``"sum"``, ``"mean"``, or ``"cat"``, we will use ``torch.sum``, ``torch.mean``, - and ``torch.cat`` respectively, each with argument ``dim=0``. The user can also pass a custom + and ``torch.cat`` respectively, each with argument ``dim=0``. Note that the ``"cat"`` reduction + only makes sense if the state is a list, and not a tensor. The user can also pass a custom function in this parameter. persistent (Optional): whether the state will be saved as part of the modules ``state_dict``. Default is ``False``. 
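The expanded ``add_state`` documentation above spells out how tensor states are reduced across processes. A minimal custom metric sketch using that API (a toy recall-style metric of my own, not one of the built-ins)::

    import torch
    from pytorch_lightning.metrics import Metric

    class TruePositiveRate(Metric):
        """Toy binary recall: true positives / actual positives."""

        def __init__(self):
            super().__init__()
            # tensor states with dist_reduce_fx="sum" are summed across processes in DDP
            self.add_state("true_positives", default=torch.tensor(0.0), dist_reduce_fx="sum")
            self.add_state("actual_positives", default=torch.tensor(0.0), dist_reduce_fx="sum")

        def update(self, preds: torch.Tensor, target: torch.Tensor):
            preds = (preds > 0.5).long()
            self.true_positives += ((preds == 1) & (target == 1)).sum().float()
            self.actual_positives += (target == 1).sum().float()

        def compute(self):
            return self.true_positives / self.actual_positives

    metric = TruePositiveRate()
    metric(torch.tensor([0.9, 0.2, 0.8]), torch.tensor([1, 1, 0]))
    print(metric.compute())  # tensor(0.5000)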
@@ -244,7 +245,7 @@ def reset(self): """ for attr, default in self._defaults.items(): current_val = getattr(self, attr) - if isinstance(current_val, torch.Tensor): + if isinstance(default, torch.Tensor): setattr(self, attr, deepcopy(default).to(current_val.device)) else: setattr(self, attr, deepcopy(default)) diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 281074cb37813..6d5ad1e9e2119 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -1,7 +1,8 @@ import os from contextlib import contextmanager -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Union +import torch import torch.distributed as torch_distrib from torch.optim import Optimizer @@ -47,7 +48,7 @@ def configure_ddp( def configure_ddp(self, model, device_ids): model = LightningDistributedDataParallel( - model, device_ids=device_ids, find_unused_parameters=True + model, device_ids=device_ids, find_unused_parameters=False ) return model @@ -59,9 +60,9 @@ def configure_ddp(self, model, device_ids): the model wrapped in LightningDistributedDataParallel """ - # if unset, default `find_unused_parameters` `True` + # if unset, default `find_unused_parameters` `False` self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( - "find_unused_parameters", True + "find_unused_parameters", False ) model = LightningDistributedDataParallel( model, @@ -91,22 +92,23 @@ def init_ddp_connection( torch_backend, rank=global_rank, world_size=world_size ) + @property + def is_running_single_process_per_device(self) -> bool: + # objects do not need to be scattered in single process per device, move objects upfront to device + # This property is used in ``self.on_before_forward`` function. + return self.device_ids is not None and len(self.device_ids) == 1 + def on_before_forward(self, model: LightningModule, *args): """ - Override to handle custom input to device logic. For DDP, no logic is required as this is handled internally - within the DDP wrapper. - - Example:: - - def on_before_forward(self, model, *args): - batch, batch_idx = args - return batch.to(model.device) + Override to handle custom edge case. Args: args: Inputs to the model. model: Model to train. Returns: args moved to correct device if needed. 
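``DDPPlugin`` now defaults ``find_unused_parameters`` to ``False``, matching the ``torch.nn.parallel.DistributedDataParallel`` default and skipping the extra graph traversal. Models that really do have parameters without gradients can opt back in through the plugin kwargs; a sketch of what that could look like with this API (the exact Trainer flags are illustrative)::

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins.ddp_plugin import DDPPlugin

    # opt back into the previous behaviour only if some parameters never receive gradients
    trainer = Trainer(
        gpus=2,
        accelerator="ddp",
        plugins=[DDPPlugin(find_unused_parameters=True)],
    )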
""" + if self.is_running_single_process_per_device: + args = model.transfer_batch_to_device(args, model.device) return args def optimizer_state(self, optimizer: Optimizer) -> dict: diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/ddp_sequential_plugin.py index 010f0ea1648a8..069b1754fbce0 100644 --- a/pytorch_lightning/plugins/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/ddp_sequential_plugin.py @@ -15,20 +15,20 @@ from typing import Any, List, Optional import torch -import torch.distributed as torch_distrib from torch import nn +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel -from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log +from pytorch_lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import FAIRSCALE_PIPE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException if FAIRSCALE_PIPE_AVAILABLE: - import fairscale.nn.model_parallel as mpu from fairscale.nn import PipeRPCWrapper + import fairscale.nn.model_parallel as mpu from fairscale.nn.pipe import balance as pipe_balance from fairscale.nn.pipe import rpc as rpc_pipe from fairscale.nn.pipe.pipeline import PipelineStyle @@ -228,7 +228,7 @@ def _infer_check_num_gpus(self, trainer): Returns: The appropriate balance for the model """ if isinstance(self.balance, list): - if len(self.balance) != trainer.world_size: + if len(self.balance) != (trainer.world_size / trainer.num_nodes): raise MisconfigurationException( "Pipe currently only supports splitting the module onto all available GPUs" ) @@ -380,7 +380,6 @@ def register_optimizers(ctx, model): model.trainer.optimizers = optimizers model.trainer.lr_schedulers = lr_schedulers model.trainer.optimizer_frequencies = optimizer_frequencies - model.trainer.convert_to_lightning_optimizers() def run_optimizer(ctx, model): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4df5d128476a4..9df1ba3262afa 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -16,6 +16,7 @@ import torch from torch.optim import Optimizer +from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin @@ -52,7 +53,10 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # unscale gradient to allow analyze within `on_after_backward` if not self.trainer.train_loop.should_accumulate() and automatic_optimization: - self.trainer.scaler.unscale_(optimizer) + if isinstance(optimizer, LightningOptimizer): + self.trainer.scaler.unscale_(optimizer.optimizer) + else: + self.trainer.scaler.unscale_(optimizer) return closure_loss diff --git a/pytorch_lightning/plugins/rpc_plugin.py b/pytorch_lightning/plugins/rpc_plugin.py index 492bddaff0c77..223a1f0a13110 100644 --- a/pytorch_lightning/plugins/rpc_plugin.py +++ b/pytorch_lightning/plugins/rpc_plugin.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Any, Optional +from contextlib import suppress +from typing import Optional import torch @@ -20,8 +21,11 @@ from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import RPC_AVAILABLE +DEFAULT_RPC_TIMEOUT_SEC = 60. if RPC_AVAILABLE: from torch.distributed import rpc + with suppress(ModuleNotFoundError, ImportError): + from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC class RPCPlugin(DDPPlugin): @@ -33,7 +37,8 @@ class RPCPlugin(DDPPlugin): that need to be addressed when using RPC communication when building custom RPC Plugins. """ - def __init__(self, **kwargs): + def __init__(self, rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, **kwargs): + self.rpc_timeout_sec = rpc_timeout_sec self.rpc_initialized = False super().__init__(**kwargs) @@ -42,6 +47,7 @@ def init_rpc_connection(self, world_size: int) -> None: os.environ['MASTER_PORT'] = os.getenv('RPC_MASTER_PORT', '15000') rpc.init_rpc(f"worker{global_rank}", rank=global_rank, world_size=world_size) + rpc._set_rpc_timeout(self.rpc_timeout_sec) self.rpc_initialized = True def rpc_save_model(self, diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/sharded_plugin.py index 937538561ccdd..d989b6237ad72 100644 --- a/pytorch_lightning/plugins/sharded_plugin.py +++ b/pytorch_lightning/plugins/sharded_plugin.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Union, Any +from typing import Any, List, Optional, Union from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin -from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, AMPType, rank_zero_only +from pytorch_lightning.utilities import AMPType, FAIRSCALE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException if FAIRSCALE_AVAILABLE: @@ -42,9 +42,6 @@ def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) - def on_before_forward(self, model: LightningModule, *args): - return model.transfer_batch_to_device(args, model.trainer.root_gpu) - def _check_fairscale(self): if not FAIRSCALE_AVAILABLE: raise MisconfigurationException( @@ -66,7 +63,7 @@ def _reinit_with_fairscale_oss(self, trainer): optimizers = trainer.optimizers for x, optimizer in enumerate(optimizers): if is_lightning_optimizer(optimizer): - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS( @@ -76,7 +73,6 @@ def _reinit_with_fairscale_oss(self, trainer): ) optimizers[x] = zero_optimizer del optimizer - trainer.convert_to_lightning_optimizers() def get_model_from_plugin( self, diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 3842bbe50cfc5..b49c90bd0b28c 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -19,7 +19,7 @@ from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen -from pytorch_lightning import PROJECT_ROOT, __homepage__, __version__ +from pytorch_lightning import 
__homepage__, __version__, PROJECT_ROOT _PATH_BADGES = os.path.join('.', 'docs', 'source', '_images', 'badges') # badge to download @@ -178,6 +178,11 @@ def _load_long_description(path_dir: str) -> str: # replace github badges for release ones text = text.replace('badge.svg?branch=master&event=push', f'badge.svg?tag={__version__}') + skip_begin = r'' + skip_end = r'' + # todo: wrap content as commented description + text = re.sub(rf"{skip_begin}.+?{skip_end}", '', text, flags=re.IGNORECASE + re.DOTALL) + # # https://github.com/Borda/pytorch-lightning/releases/download/1.1.0a6/codecov_badge.png # github_release_url = os.path.join(__homepage__, "releases", "download", __version__) # # download badge and replace url with local file diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index 21d6af043df02..20992255ba29e 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -73,17 +73,7 @@ def __verify_train_loop_configuration(self, model): trainer.overriden_optimizer_step = is_overridden('optimizer_step', model) trainer.overriden_optimizer_zero_grad = is_overridden('optimizer_zero_grad', model) - - enable_pl_optimizer = trainer._enable_pl_optimizer automatic_optimization = trainer.train_loop.automatic_optimization - if trainer.overriden_optimizer_step and not enable_pl_optimizer and automatic_optimization: - rank_zero_warn( - "When overriding `LightningModule` optimizer_step with" - " `Trainer(..., enable_pl_optimizer=False, ...)`," - " we won't be calling `.zero_grad` we can't assume when you call your `optimizer.step()`." - " For Lightning to take care of it, please use `Trainer(enable_pl_optimizer=True)`." - ) - going_to_accumulate_grad_batches = trainer.accumulation_scheduler.going_to_accumulate_grad_batches() has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad @@ -94,13 +84,6 @@ def __verify_train_loop_configuration(self, model): ' It ensures optimizer_step or optimizer_zero_grad are called on every batch.' ) - if (enable_pl_optimizer) and trainer.overriden_optimizer_zero_grad and not automatic_optimization: - raise MisconfigurationException( - 'When overriding `LightningModule` optimizer_zero_grad' - ' and preserving model property `automatic_optimization` as True with' - ' `Trainer(enable_pl_optimizer=True, ...) is not supported' - ) - def __verify_eval_loop_configuration(self, model, eval_loop_name): step_name = f'{eval_loop_name}_step' diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 429bddd88b77e..03d46132fb177 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -13,20 +13,21 @@ # limitations under the License. 
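The ``setup_tools`` change strips everything between a pair of begin/end markers from the long description with a non-greedy, case-insensitive, dot-matches-newline ``re.sub``. (The actual marker strings are HTML comments that did not survive this diff view; the ones below are placeholders.) A small sketch of the same pattern::

    import re

    skip_begin, skip_end = r"<!-- skip start -->", r"<!-- skip end -->"
    text = "keep this <!-- skip start -->drop this<!-- skip end --> and keep this too"

    # non-greedy match across newlines, case-insensitive, mirroring the call above
    cleaned = re.sub(rf"{skip_begin}.+?{skip_end}", "", text, flags=re.IGNORECASE | re.DOTALL)
    print(cleaned)  # "keep this  and keep this too"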
import os -from pathlib import Path import re -from typing import Union, Optional +from pathlib import Path +from typing import Optional, Union import torch import pytorch_lightning from pytorch_lightning import _logger as log +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS if APEX_AVAILABLE: from apex import amp @@ -43,7 +44,7 @@ def __init__(self, trainer): # used to validate checkpointing logic self.has_trained = False - def restore_weights(self, model: LightningModule): + def restore_weights(self) -> None: """ Attempt to restore a checkpoint (e.g. weights) in this priority: 1. from HPC weights @@ -73,11 +74,16 @@ def restore_weights(self, model: LightningModule): if self.trainer.on_gpu: torch.cuda.empty_cache() - def restore(self, checkpoint_path: str, on_gpu: bool): + def restore(self, checkpoint_path: str, on_gpu: bool) -> bool: """ Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore. All restored states are listed in return value description of `dump_checkpoint`. """ + # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint. + fs = get_filesystem(checkpoint_path) + if not fs.exists(checkpoint_path): + rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch") + return False # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path` checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) @@ -94,6 +100,9 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # restore training state self.restore_training_state(checkpoint) + rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}") + return True + def restore_model_state(self, model: LightningModule, checkpoint) -> None: """ Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object @@ -157,9 +166,10 @@ def restore_training_state(self, checkpoint): expected_steps = self.trainer.num_training_batches / n_accum if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1: rank_zero_warn( - "You're resuming from a checkpoint that ended mid-epoch. " - "This can cause unreliable results if further training is done, " - "consider using an end of epoch checkpoint. " + "You're resuming from a checkpoint that ended mid-epoch." + " Training will start from the beginning of the next epoch." + " This can cause unreliable results if further training is done," + " consider using an end of epoch checkpoint." 
) # restore the optimizers diff --git a/pytorch_lightning/trainer/connectors/debugging_connector.py b/pytorch_lightning/trainer/connectors/debugging_connector.py index 61d7cbd189fde..1fa1f4f319289 100644 --- a/pytorch_lightning/trainer/connectors/debugging_connector.py +++ b/pytorch_lightning/trainer/connectors/debugging_connector.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.exceptions import MisconfigurationException from typing import Union -from pytorch_lightning.utilities import rank_zero_warn, rank_zero_info + +from pytorch_lightning.loggers.base import DummyLogger +from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException class DebuggingConnector: @@ -54,11 +56,16 @@ def on_init_start( limit_train_batches = fast_dev_run limit_val_batches = fast_dev_run limit_test_batches = fast_dev_run + self.trainer.max_steps = fast_dev_run self.trainer.num_sanity_val_steps = 0 self.trainer.max_epochs = 1 + val_check_interval = 1.0 + self.trainer.check_val_every_n_epoch = 1 + self.trainer.logger = DummyLogger() + rank_zero_info( 'Running in fast_dev_run mode: will run a full train,' - f' val and test loop using {fast_dev_run} batch(es)' + f' val and test loop using {fast_dev_run} batch(es).' ) self.trainer.limit_train_batches = _determine_batch_limits(limit_train_batches, 'limit_train_batches') diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 28025859814cc..2e27f8cf61ab3 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -91,11 +91,13 @@ def check_dataloader_idx(self, result: Result) -> bool: random_key = list(result.keys())[-1] return result["meta"][random_key]["dataloader_idx"] is not None - def get_latest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict: + def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict: results = {} - add_dataloader_idx = self.check_dataloader_idx(latest_result) - func = getattr(latest_result, func_name) - results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + for opt_idx in latest_result_opt: + latest_result = latest_result_opt[opt_idx] + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) return results def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -156,6 +158,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio assert isinstance(result, Result) if dataloader_idx is None: dataloader_idx = 0 + if extra_info is None: extra_info = {} @@ -166,6 +169,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio if dataloader_idx not in self._internals: self._internals[dataloader_idx] = {} self._internals_reduced[dataloader_idx] = defaultdict(dict) + self._latest_ref[dataloader_idx] = {} # extract infos opt_idx = extra_info["opt_idx"] @@ -173,7 +177,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._append_to_structure(self._internals[dataloader_idx], opt_idx, 
batch_idx, result) - self._latest_ref[dataloader_idx] = result + self._latest_ref[dataloader_idx][opt_idx] = result # [dataloader_idx] is a list else: @@ -181,7 +185,11 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._internals.setdefault(dataloader_idx, []) self._internals[dataloader_idx].append(result) - self._latest_ref[dataloader_idx] = result + if dataloader_idx not in self._latest_ref: + self._latest_ref[dataloader_idx] = {} + self._latest_ref[dataloader_idx][0] = {} + + self._latest_ref[dataloader_idx][0] = result def auto_reduce_results_on_epoch_end(self) -> None: """ @@ -195,24 +203,14 @@ def auto_reduce_results_on_epoch_end(self) -> None: epoch_metrics = self._internals[dl_idx] if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - - num_opt_idx = len(self._internals[dl_idx]) - 1 - - # Make sure we didn't create key - assert num_opt_idx >= 0 - - for opt_idx in range(num_opt_idx + 1): + for opt_idx in list(epoch_metrics): # TODO: Figure out to reduce memory # TODO: How to start training in middle of epoch opt_outputs = epoch_metrics[opt_idx] - num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1 - assert num_batch_idx >= 0 - batch_indexes = self._internals[dl_idx][num_opt_idx].keys() - # reduce across time first time_reduced_outputs = [] - for batch_idx in batch_indexes: + for batch_idx in opt_outputs.keys(): tbptt_outs = opt_outputs[batch_idx] tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) if len(tbptt_outs) > 1: @@ -395,7 +393,7 @@ def update_logger_connector(self) -> None: callback_metrics.update(epoch_log_metrics) callback_metrics.update(forked_metrics) - if not is_train: + if not is_train and self.trainer.testing: logger_connector.evaluation_callback_metrics.update(callback_metrics) # update callback_metrics diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 6fdd2f0d57b63..84e8a1bc68f05 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -158,7 +158,7 @@ def cache_training_step_metrics(self, opt_closure_result): self.logged_metrics.update(logged_metrics_tmp) self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp) - def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False): + def log_metrics(self, metrics, grad_norm_dic, step=None): """Logs the metric dict passed in. 
If `step` parameter is None and `step` key is presented is metrics, uses metrics["step"] as a step @@ -186,11 +186,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics= elif step is None: # added metrics by Lightning for convenience - if log_train_step_metrics: - step = self.trainer.total_batch_idx - else: - scalar_metrics['epoch'] = self.trainer.current_epoch - step = self.trainer.global_step + scalar_metrics['epoch'] = self.trainer.current_epoch + step = self.trainer.global_step # log actual metrics if self.trainer.logger is not None: @@ -211,9 +208,9 @@ def add_progress_bar_metrics(self, metrics): self.trainer.dev_debugger.track_pbar_metrics_history(metrics) - def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result, test_mode): + def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result): self._track_callback_metrics(deprecated_eval_results, using_eval_result) - self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode) + self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results) def evaluation_epoch_end(self, testing): # reset dataloader idx @@ -242,7 +239,7 @@ def prepare_eval_loop_results(self): for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders): self.add_to_eval_loop_results(dl_idx, has_been_initialized) - def get_evaluate_epoch_results(self, test_mode): + def get_evaluate_epoch_results(self): if not self.trainer.running_sanity_check: # log all the metrics as a single dict metrics_to_log = self.cached_results.get_epoch_log_metrics() @@ -252,7 +249,7 @@ def get_evaluate_epoch_results(self, test_mode): self.prepare_eval_loop_results() # log results of test - if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test: + if self.trainer.testing and self.trainer.is_global_zero and self.trainer.verbose_test: print('-' * 80) for result_idx, results in enumerate(self.eval_loop_results): print(f'DATALOADER:{result_idx} TEST RESULTS') @@ -273,10 +270,13 @@ def _track_callback_metrics(self, eval_results, using_eval_result): if isinstance(eval_results, list): for eval_result in eval_results: self.trainer.logger_connector.callback_metrics.update(eval_result.callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(eval_result.callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update( + eval_result.callback_metrics) else: self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics) else: flat = {} if isinstance(eval_results, list): @@ -292,7 +292,8 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) - self.trainer.logger_connector.evaluation_callback_metrics.update(flat) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) else: # with a scalar return, auto set it to "val_loss" for callbacks if isinstance(eval_results, torch.Tensor): @@ -305,7 +306,8 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] 
self.trainer.logger_connector.callback_metrics.update(flat) - self.trainer.logger_connector.evaluation_callback_metrics.update(flat) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): # eval loop returns all metrics @@ -322,12 +324,13 @@ def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric callback_metrics.update(log_metrics) callback_metrics.update(prog_bar_metrics) self.trainer.logger_connector.callback_metrics.update(callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) if len(dataloader_result_metrics) > 0: self.eval_loop_results.append(dataloader_result_metrics) - def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mode): + def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if self.trainer.running_sanity_check: return @@ -347,7 +350,7 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod callback_metrics = result.callback_metrics # in testing we don't need the callback metrics - if test_mode: + if self.trainer.testing: callback_metrics = {} else: _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) @@ -587,6 +590,8 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): return gathered_epoch_outputs def log_train_step_metrics(self, batch_output): + if self.trainer.train_loop.should_accumulate() and self.trainer.train_loop.automatic_optimization: + return _, batch_log_metrics = self.cached_results.update_logger_connector() # when metrics should be logged if self.should_update_logs or self.trainer.fast_dev_run is True: @@ -595,5 +600,5 @@ def log_train_step_metrics(self, batch_output): if grad_norm_dic is None: grad_norm_dic = {} if len(batch_log_metrics) > 0 or len(grad_norm_dic) > 0: - self.log_metrics(batch_log_metrics, grad_norm_dic, log_train_step_metrics=True) + self.log_metrics(batch_log_metrics, grad_norm_dic) self.callback_metrics.update(batch_log_metrics) diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 8c352c8e5ffeb..8b23203e42bc3 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -20,7 +20,11 @@ def __init__(self, trainer): self.trainer = trainer def on_trainer_init(self, enable_pl_optimizer): - self.trainer._enable_pl_optimizer = enable_pl_optimizer + if enable_pl_optimizer is not None: + rank_zero_warn( + "Trainer argument `enable_pl_optimizer` is deprecated in v1.1.3. 
It will be removed in v1.3.0", + DeprecationWarning + ) self.trainer.lr_schedulers = [] self.trainer.optimizers = [] self.trainer.optimizer_frequencies = [] diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index 37d5315e5d11b..822c3ef634fdc 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -15,7 +15,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.plugins.apex import ApexPlugin from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, rank_zero_warn class PrecisionConnector: @@ -67,7 +67,6 @@ def _setup_amp_backend(self, amp_type: str): self.trainer.amp_backend = AMPType.APEX self.backend = ApexPlugin(self.trainer) log.warn("LightningOptimizer doesn't support Apex") - self.trainer._enable_pl_optimizer = False if not self.trainer.amp_backend: raise ModuleNotFoundError( diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index 4cb954a8e92fc..9c04c10559b6d 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -137,7 +137,7 @@ def connect_ddp(self, global_rank: int, world_size: int) -> None: # figure out the root node addr try: - root_node = os.environ["SLURM_NODELIST"].split(" ")[0] + root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0] except Exception: root_node = "127.0.0.1" diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 4b70917c8c43d..63f65bead2579 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -24,7 +24,6 @@ class EvaluationLoop(object): def __init__(self, trainer): self.trainer = trainer - self.testing = False self.outputs = [] self.step_metrics = [] self.predictions = None @@ -52,7 +51,7 @@ def get_evaluation_dataloaders(self, max_batches): model = self.trainer.get_model() # select dataloaders - if self.testing: + if self.trainer.testing: self.trainer.reset_test_dataloader(model) dataloaders = self.trainer.test_dataloaders @@ -85,34 +84,34 @@ def should_skip_evaluation(self, dataloaders, max_batches): return False def on_evaluation_start(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_start', *args, **kwargs) else: self.trainer.call_hook('on_validation_start', *args, **kwargs) def on_evaluation_model_eval(self, *args, **kwargs): model_ref = self.trainer.get_model() - if self.testing: + if self.trainer.testing: model_ref.on_test_model_eval() else: model_ref.on_validation_model_eval() def on_evaluation_model_train(self, *args, **kwargs): model_ref = self.trainer.get_model() - if self.testing: + if self.trainer.testing: model_ref.on_test_model_train() else: model_ref.on_validation_model_train() def on_evaluation_end(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_end', *args, **kwargs) else: self.trainer.call_hook('on_validation_end', *args, **kwargs) def reload_evaluation_dataloaders(self): model = self.trainer.get_model() - if self.testing: + if self.trainer.testing: 
self.trainer.reset_test_dataloader(model) else: self.trainer.reset_val_dataloader(model) @@ -123,9 +122,6 @@ def is_using_eval_results(self): return using_eval_result def setup(self, model, max_batches, dataloaders): - # copy properties for forward overrides - self.trainer.model_connector.copy_trainer_model_properties(model) - # bookkeeping self.outputs = [] self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) @@ -138,17 +134,23 @@ def setup(self, model, max_batches, dataloaders): self.num_dataloaders = self._get_num_dataloaders(dataloaders) def on_evaluation_epoch_start(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_start', *args, **kwargs) else: self.trainer.call_hook('on_validation_epoch_start', *args, **kwargs) - def build_args(self, test_mode, batch, batch_idx, dataloader_idx): + def _build_args(self, batch, batch_idx, dataloader_idx): # make dataloader_idx arg in validation_step optional args = [batch, batch_idx] - multiple_val_loaders = (not test_mode and self._get_num_dataloaders(self.trainer.val_dataloaders) > 1) - multiple_test_loaders = (test_mode and self._get_num_dataloaders(self.trainer.test_dataloaders) > 1) + multiple_val_loaders = ( + not self.trainer.testing + and self._get_num_dataloaders(self.trainer.val_dataloaders) > 1 + ) + multiple_test_loaders = ( + self.trainer.testing + and self._get_num_dataloaders(self.trainer.test_dataloaders) > 1 + ) if multiple_test_loaders or multiple_val_loaders: args.append(dataloader_idx) @@ -163,14 +165,14 @@ def _get_num_dataloaders(self, dataloaders): length = len(dataloaders[0]) return length - def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx): + def evaluation_step(self, batch, batch_idx, dataloader_idx): # configure args - args = self.build_args(test_mode, batch, batch_idx, dataloader_idx) + args = self._build_args(batch, batch_idx, dataloader_idx) model_ref = self.trainer.get_model() model_ref._results = Result() # run actual test step - if self.testing: + if self.trainer.testing: model_ref._current_fx_name = "test_step" output = self.trainer.accelerator_backend.test_step(args) else: @@ -192,7 +194,7 @@ def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx): return output def evaluation_step_end(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: output = self.trainer.call_hook('test_step_end', *args, **kwargs) else: output = self.trainer.call_hook('validation_step_end', *args, **kwargs) @@ -200,7 +202,7 @@ def evaluation_step_end(self, *args, **kwargs): def evaluation_epoch_end(self): # unset dataloder_idx in model - self.trainer.logger_connector.evaluation_epoch_end(self.testing) + self.trainer.logger_connector.evaluation_epoch_end(self.trainer.testing) using_eval_result = self.is_using_eval_results() @@ -216,7 +218,7 @@ def evaluation_epoch_end(self): def log_epoch_metrics_on_evaluation_end(self): # get the final loop results - eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results(self.testing) + eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results() return eval_loop_results def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): @@ -230,7 +232,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): user_reduced = False - if self.testing: + if self.trainer.testing: if is_overridden('test_epoch_end', model=model): if using_eval_result: eval_results = self.__gather_epoch_end_eval_results(outputs) @@ 
-250,7 +252,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): self.trainer.logger_connector.cache_logged_metrics() # depre warning if eval_results is not None and user_reduced: - step = 'testing_epoch_end' if self.testing else 'validation_epoch_end' + step = 'testing_epoch_end' if self.trainer.testing else 'validation_epoch_end' self.warning_cache.warn( f'The {step} should not return anything as of 9.1.' ' To log, use self.log(...) or self.write(...) directly in the LightningModule' @@ -263,7 +265,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): eval_results = [eval_results] # track depreceated metrics - self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result, self.testing) + self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result) return eval_results @@ -300,15 +302,15 @@ def __auto_reduce_result_objs(self, outputs): def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): # set dataloader_idx to model and track batch_size self.trainer.logger_connector.on_evaluation_batch_start( - self.testing, batch, dataloader_idx, self.num_dataloaders) + self.trainer.testing, batch, dataloader_idx, self.num_dataloaders) - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx) else: self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_batch_end', output, batch, batch_idx, dataloader_idx) else: self.trainer.call_hook('on_validation_batch_end', output, batch, batch_idx, dataloader_idx) @@ -319,16 +321,16 @@ def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx): def store_predictions(self, output, batch_idx, dataloader_idx): # Add step predictions to prediction collection to write later if output is not None: - do_write_predictions = isinstance(output, Result) and self.testing + do_write_predictions = isinstance(output, Result) and self.trainer.testing if do_write_predictions: self.predictions.add(output.pop('predictions', None)) # track debug metrics - self.trainer.dev_debugger.track_eval_loss_history(self.testing, batch_idx, dataloader_idx, output) + self.trainer.dev_debugger.track_eval_loss_history(batch_idx, dataloader_idx, output) def on_evaluation_epoch_end(self, *args, **kwargs): # call the callback hook - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_end', *args, **kwargs) else: self.trainer.call_hook('on_validation_epoch_end', *args, **kwargs) diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 479d401720261..81f6eb64a4ab0 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -13,6 +13,7 @@ # limitations under the License. 
from abc import ABC +from collections import OrderedDict from typing import List, Optional, Tuple import torch @@ -75,7 +76,9 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: ' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n' ' * A list of the previously described dict format, with an optional "frequency" key (int)' ) + lr_schedulers = self.configure_schedulers(lr_schedulers, monitor=monitor) + _validate_scheduler_optimizer(optimizers, lr_schedulers) return optimizers, lr_schedulers, optimizer_frequencies @@ -86,8 +89,10 @@ def _convert_to_lightning_optimizer(trainer, optimizer): optimizer._on_trainer_init(trainer) return optimizer - if self._enable_pl_optimizer: - self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers] + self._lightning_optimizers = { + opt_idx: _convert_to_lightning_optimizer(self, opt) + for opt_idx, opt in enumerate(self.optimizers) + } def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): # Convert each scheduler into dict structure with relevant information @@ -140,6 +145,7 @@ def reinit_scheduler_properties(self, optimizers: list, schedulers: list): # Reinitialize optimizer.step properties added by schedulers for scheduler in schedulers: scheduler = scheduler['scheduler'] + state = None for optimizer in optimizers: # check that we dont mix users optimizers and schedulers @@ -147,14 +153,13 @@ def reinit_scheduler_properties(self, optimizers: list, schedulers: list): # Find the mro belonging to the base lr scheduler class for i, mro in enumerate(scheduler.__class__.__mro__): if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i state = scheduler.state_dict() - else: - state = None + scheduler.__class__.__mro__[i].__init__(scheduler, optimizer) + scheduler.load_state_dict(state) + break - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) if state is not None: - scheduler.load_state_dict(state) + break class _MockOptimizer(Optimizer): @@ -183,3 +188,10 @@ def zero_grad(self): def __repr__(self): return 'No Optimizer' + + +def _validate_scheduler_optimizer(optimizers, lr_schedulers): + if any(sch['scheduler'].optimizer not in optimizers for sch in lr_schedulers): + raise MisconfigurationException( + "Some schedulers are attached to an optimizer that wasn't returned from `configure_optimizers`." + ) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 355bbad3a037e..3fa2af79e5530 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
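The new `_validate_scheduler_optimizer` helper above rejects LR schedulers that are bound to an optimizer which was never returned from `configure_optimizers`. A minimal sketch of the kind of configuration it is meant to catch (the module below is hypothetical and trimmed to the relevant hook; `training_step`, dataloaders, etc. are omitted):

```python
# Hypothetical sketch (not part of this diff): a configure_optimizers() whose
# scheduler is attached to an optimizer that is never returned. With the new
# _validate_scheduler_optimizer check, init_optimizers() should raise a
# MisconfigurationException instead of silently stepping a dangling optimizer.
from torch import nn, optim

from pytorch_lightning import LightningModule


class MisconfiguredModule(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(2, 2)

    def configure_optimizers(self):
        returned_opt = optim.SGD(self.layer.parameters(), lr=0.1)
        hidden_opt = optim.Adam(self.layer.parameters(), lr=0.01)  # never returned
        scheduler = optim.lr_scheduler.StepLR(hidden_opt, step_size=1)
        # scheduler.optimizer is hidden_opt, which is missing from the returned
        # optimizer list, so the validation above should reject this setup
        return [returned_opt], [scheduler]
```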
-import inspect -import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import List, Optional, Type, TypeVar, Union, cast +import inspect +import os +from typing import cast, List, Optional, Type, TypeVar, Union from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.loggers.base import LightningLoggerBase @@ -27,7 +27,7 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.states import TrainerState -from pytorch_lightning.utilities import HOROVOD_AVAILABLE, TPU_AVAILABLE, argparse_utils, rank_zero_warn +from pytorch_lightning.utilities import argparse_utils, HOROVOD_AVAILABLE, rank_zero_warn, TPU_AVAILABLE from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.model_utils import is_overridden @@ -59,6 +59,7 @@ class TrainerProperties(ABC): model_connector: ModelConnector checkpoint_connector: CheckpointConnector callbacks: List[Callback] + _lightning_optimizers = None @property def log_dir(self): @@ -196,7 +197,7 @@ def enable_validation(self) -> bool: """ Check if we should run validation during training. """ model_ref = self.model_connector.get_model() val_loop_enabled = is_overridden('validation_step', model_ref) and self.limit_val_batches > 0 - return val_loop_enabled or self.fast_dev_run + return val_loop_enabled @property def default_root_dir(self) -> str: @@ -218,18 +219,38 @@ def weights_save_path(self) -> str: return os.path.normpath(self._weights_save_path) return self._weights_save_path + @property + def early_stopping_callback(self) -> Optional[EarlyStopping]: + """ + The first :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` + callback in the Trainer.callbacks list, or ``None`` if it doesn't exist. + """ + callbacks = self.early_stopping_callbacks + return callbacks[0] if len(callbacks) > 0 else None + + @property + def early_stopping_callbacks(self) -> List[EarlyStopping]: + """ + A list of all instances of :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` + found in the Trainer.callbacks list. + """ + return [c for c in self.callbacks if isinstance(c, EarlyStopping)] + @property def checkpoint_callback(self) -> Optional[ModelCheckpoint]: """ - The first checkpoint callback in the Trainer.callbacks list, or ``None`` if - no checkpoint callbacks exist. + The first :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` + callback in the Trainer.callbacks list, or ``None`` if it doesn't exist. """ callbacks = self.checkpoint_callbacks return callbacks[0] if len(callbacks) > 0 else None @property def checkpoint_callbacks(self) -> List[ModelCheckpoint]: - """ A list of all instances of ModelCheckpoint found in the Trainer.callbacks list. """ + """ + A list of all instances of :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` + found in the Trainer.callbacks list. 
+ """ return [c for c in self.callbacks if isinstance(c, ModelCheckpoint)] def save_checkpoint(self, filepath, weights_only: bool = False): @@ -238,16 +259,17 @@ def save_checkpoint(self, filepath, weights_only: bool = False): def get_model(self): return self.model_connector.get_model() + @property + def lightning_optimizers(self): + if self._lightning_optimizers is None: + self.convert_to_lightning_optimizers() + return self._lightning_optimizers + def __getstate__(self): - # unwrap optimizer - self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + # remove lightning_optimizers + self._lightning_optimizers = None return self.__dict__ - def __setstate__(self, d): - self.__dict__ = d - # wrap optimizers in enable_pl_optimzer is True - self.convert_to_lightning_optimizers() - @property def require_distributed_sampler(self): if self.accelerator_backend is not None: diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 57747be0d51fb..b2ba92846b241 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -50,7 +50,7 @@ def __init__(self, window_length: int): def reset(self) -> None: """Empty the accumulator.""" - self = TensorRunningAccum(self.window_length) + self.__init__(self.window_length) def last(self): """Get the last added element.""" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 35da90625adef..c3ef0e507789e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -16,6 +16,7 @@ import os import warnings +from pathlib import Path from typing import Dict, Iterable, List, Optional, Union import torch @@ -24,7 +25,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector -from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule @@ -47,6 +47,7 @@ from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin +from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin @@ -56,7 +57,7 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import rank_zero_warn, DeviceType +from pytorch_lightning.utilities import AMPType, DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -117,7 +118,7 @@ def __init__( weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, - resume_from_checkpoint: Optional[str] = 
None, + resume_from_checkpoint: Optional[Union[Path, str]] = None, profiler: Optional[Union[BaseProfiler, bool, str]] = None, benchmark: bool = False, deterministic: bool = False, @@ -133,7 +134,7 @@ def __init__( distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, - enable_pl_optimizer: bool = True, + enable_pl_optimizer: bool = None, # todo: remove in v1.3 ): r""" Customize every aspect of training via flags @@ -250,8 +251,9 @@ def __init__( train sampler and ``shuffle=False`` for val/test sampler. If you want to customize it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. - resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here. - This can be a URL. + resume_from_checkpoint: Path/URL of the checkpoint from which training is resumed. If there is + no checkpoint file at the path, start from scratch. If resuming from mid-epoch checkpoint, + training will start from the beginning of the next epoch. sync_batchnorm: Synchronize batch norm layers between process groups/whole world. @@ -281,7 +283,8 @@ def __init__( enable_pl_optimizer: If True, each optimizer will be wrapped by `pytorch_lightning.core.optimizer.LightningOptimizer`. It allows Lightning to - handle AMP, TPU, accumulated_gradients, etc.. + handle AMP, TPU, accumulated_gradients, etc. + .. warning:: Currently deprecated and it will be removed in v1.3 """ super().__init__() self._device_type = DeviceType.CPU @@ -309,7 +312,6 @@ def __init__( self.plugin_connector = PluginConnector(self) # training state - self.weights_summary = weights_summary self.model = None self.shown_warnings = set() @@ -372,7 +374,8 @@ def __init__( max_steps, min_steps, num_sanity_val_steps, - automatic_optimization + automatic_optimization, + weights_summary, ) self.evaluation_loop.on_trainer_init() @@ -409,6 +412,46 @@ def __init__( # Callback system self.on_init_end() + def setup_trainer(self, model: LightningModule): + """ + Sanity check a few things before starting actual training or testing. + + Args: + model: The model to run sanity test on. + """ + # -------------------------- + # Setup?? + # -------------------------- + ref_model = self.get_model() + + # set the ranks and devices + self.accelerator_backend.dist.rank = self.global_rank + self.accelerator_backend.dist.device = ref_model.device + + # set local properties on the model + self.model_connector.copy_trainer_model_properties(model) + + # init amp. Must be done here instead of __init__ to allow ddp to work + if self.amp_backend == AMPType.NATIVE and self.precision == 16 and not self.use_tpu: + self.scaler = self.precision_connector.backend.scaler + + # log hyper-parameters + if self.logger is not None: + # save exp to get started (this is where the first experiment logs are written) + self.logger.log_hyperparams(ref_model.hparams_initial) + self.logger.log_graph(ref_model) + self.logger.save() + + # wait for all to join if on distributed + self.accelerator_backend.barrier("setup_trainer") + + # register auto-resubmit when on SLURM + self.slurm_connector.register_slurm_signal_handlers() + + # track model now. 
+ # if cluster resets state, the model will update with the saved weights + self.model = model + def fit( self, model: LightningModule, @@ -443,10 +486,6 @@ def fit( # hook self.data_connector.prepare_data(model) - # bookkeeping - # we reuse fit in .test() but change its behavior using this flag - self.testing = os.environ.get('PL_TESTING_MODE', self.testing) - # ---------------------------- # SET UP TRAINING # ---------------------------- @@ -551,13 +590,13 @@ def train(self): # hook self.train_loop.on_train_end() - def run_evaluation(self, test_mode: bool = False, max_batches=None): + def run_evaluation(self, max_batches=None): # used to know if we are logging for val, test + reset cached results - self.logger_connector.set_stage(test_mode, reset=True) + self.logger_connector.set_stage(self.testing, reset=True) # bookkeeping - self.evaluation_loop.testing = test_mode + self.evaluation_loop.testing = self.testing # prepare dataloaders dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders(max_batches) @@ -603,7 +642,7 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # lightning module methods with self.profiler.profile("evaluation_step_and_end"): - output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx) + output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx) output = self.evaluation_loop.evaluation_step_end(output) # hook + store predictions @@ -656,7 +695,7 @@ def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) with self.profiler.profile("run_test_evaluation"): - eval_loop_results, _ = self.run_evaluation(test_mode=True) + eval_loop_results, _ = self.run_evaluation() if len(eval_loop_results) == 0: return 1 @@ -687,7 +726,7 @@ def run_sanity_check(self, ref_model): self.on_sanity_check_start() # run eval step - _, eval_results = self.run_evaluation(test_mode=False, max_batches=self.num_sanity_val_batches) + _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches) # allow no returns from eval if eval_results is not None and len(eval_results) > 0: @@ -791,11 +830,9 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path self.testing = True - os.environ['PL_TESTING_MODE'] = '1' self.model = model results = self.fit(model) self.testing = False - del os.environ['PL_TESTING_MODE'] # teardown if self.is_function_implemented('teardown'): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 68a0f4781c9a9..47e254606af93 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
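Two of the `Trainer`-facing changes above are easy to miss in the diff: `resume_from_checkpoint` is now typed as `Optional[Union[Path, str]]`, and `TrainerProperties` gains `early_stopping_callback`/`early_stopping_callbacks` mirroring the existing checkpoint properties. A rough usage sketch (the checkpoint path and the monitored metric name are placeholders):

```python
# Rough usage sketch (not part of this diff); the checkpoint path and the
# monitored metric name are placeholders.
from pathlib import Path

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss")
trainer = Trainer(
    callbacks=[early_stop],
    # a pathlib.Path is now accepted as well as a str / URL
    resume_from_checkpoint=Path("lightning_logs/version_0/checkpoints/last.ckpt"),
)

# the new properties resolve callbacks from trainer.callbacks
assert trainer.early_stopping_callback is early_stop
assert trainer.early_stopping_callbacks == [early_stop]
```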
- from contextlib import contextmanager from copy import copy, deepcopy @@ -26,7 +25,7 @@ from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.trainer.supporters import Accumulator, TensorRunningAccum -from pytorch_lightning.utilities import TPU_AVAILABLE, AMPType, parsing +from pytorch_lightning.utilities import AMPType, parsing, TPU_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach @@ -49,7 +48,14 @@ def __init__(self, trainer): self._cur_grad_norm_dict = None def on_trainer_init( - self, max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps, automatic_optimization + self, + max_epochs, + min_epochs, + max_steps, + min_steps, + num_sanity_val_steps, + automatic_optimization, + weights_summary, ): self.trainer.global_step = 0 self.trainer.current_epoch = 0 @@ -73,6 +79,12 @@ def on_trainer_init( else: self.trainer.num_sanity_val_steps = num_sanity_val_steps + self.trainer.weights_summary = weights_summary + if weights_summary is not None and weights_summary not in ModelSummary.MODES: + raise MisconfigurationException( + f"`weights_summary` can be None, {', '.join(ModelSummary.MODES)}, got {weights_summary}" + ) + @property def num_optimizers(self): num_optimizers = len(self.get_optimizers_iterable()) @@ -112,67 +124,26 @@ def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): # check that model is configured correctly self.trainer.config_validator.verify_loop_configurations(model) - def setup_training(self, model: LightningModule): - """Sanity check a few things before starting actual training. - - Args: - model: The model to run sanity test on. + def setup_training(self): + """ + Sanity check a few things before starting actual training. """ - # -------------------------- - # Setup?? - # -------------------------- - ref_model = model - if self.trainer.data_parallel: - ref_model = model.module - - # set the ranks and devices - self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - self.trainer.accelerator_backend.dist.device = ref_model.device - - # give model convenience properties - ref_model.trainer = self.trainer - - # set local properties on the model - self.trainer.model_connector.copy_trainer_model_properties(ref_model) - - # init amp. 
Must be done here instead of __init__ to allow ddp to work - if self.trainer.amp_backend == AMPType.NATIVE and self.trainer.precision == 16 and not self.trainer.use_tpu: - self.trainer.scaler = self.trainer.precision_connector.backend.scaler - - # log hyper-parameters - if self.trainer.logger is not None: - # save exp to get started (this is where the first experiment logs are written) - self.trainer.logger.log_hyperparams(ref_model.hparams_initial) - self.trainer.logger.log_graph(ref_model) - self.trainer.logger.save() - - # wait for all to join if on distributed - self.trainer.accelerator_backend.barrier("setup_training") - - # register auto-resubmit when on SLURM - self.trainer.slurm_connector.register_slurm_signal_handlers() - # -------------------------- # Pre-train # -------------------------- + ref_model = self.trainer.get_model() + # on pretrain routine start self.trainer.on_pretrain_routine_start(ref_model) if self.trainer.is_function_implemented("on_pretrain_routine_start"): ref_model.on_pretrain_routine_start() # print model summary - if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing: - if self.trainer.weights_summary in ModelSummary.MODES: - ref_model.summarize(mode=self.trainer.weights_summary) - else: - raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) + if self.trainer.is_global_zero: + ref_model.summarize(mode=self.trainer.weights_summary) - # track model now. - # if cluster resets state, the model will update with the saved weights - self.trainer.model = model - - # restore training and model before hpc is called - self.trainer.checkpoint_connector.restore_weights(model) + # restore training state and model weights before hpc is called + self.trainer.checkpoint_connector.restore_weights() # on pretrain routine end self.trainer.on_pretrain_routine_end(ref_model) @@ -489,6 +460,9 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ 'native PyTorch amp and lbfgs are not compatible.' 
' To request, please file a Github issue in PyTorch and tag @mcarilli') + # wraps into LightningOptimizer only for running step + optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx) + # model hook model_ref.optimizer_step( self.trainer.current_epoch, @@ -585,7 +559,7 @@ def run_training_epoch(self): # ----------------------------------------- should_check_val = self.should_check_val_fx(batch_idx, is_last_batch) if should_check_val: - self.trainer.run_evaluation(test_mode=False) + self.trainer.run_evaluation() # reset stage to train self.trainer.logger_connector.set_stage("train") @@ -831,6 +805,8 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): # backward can be called manually in the training loop if isinstance(result, torch.Tensor): + # scale loss under accumulate_grad_batches > 1 and manual_backward + result = self.scale_closure_loss(result) self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, *args, **kwargs) else: result.closure_loss = self.trainer.accelerator_backend.backward( @@ -910,9 +886,8 @@ def build_train_args(self, batch, batch_idx, opt_idx, hiddens): def save_loggers_on_train_batch_end(self): # when loggers should save to disk should_flush_logs = self.trainer.logger_connector.should_flush_logs - if should_flush_logs or self.trainer.fast_dev_run is True: - if self.trainer.is_global_zero and self.trainer.logger is not None: - self.trainer.logger.save() + if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None: + self.trainer.logger.save() def process_train_step_outputs(self, all_train_step_outputs, early_stopping_accumulator, checkpoint_accumulator): """ @@ -975,3 +950,9 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + + def scale_closure_loss(self, loss: torch.Tensor) -> torch.Tensor: + model_ref = self.trainer.get_model() + if model_ref._running_manual_backward: + loss /= self.trainer.accumulate_grad_batches + return loss diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index e5641337cc8d2..c5dade86c348a 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -23,36 +23,16 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.distributed import AllGatherGrad, rank_zero_info, rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.package_utils import _module_available from pytorch_lightning.utilities.parsing import AttributeDict, flatten_dict, is_picklable from pytorch_lightning.utilities.xla_device_utils import XLA_AVAILABLE, XLADeviceUtils - -def _module_available(module_path: str) -> bool: - """Testing if given module is avalaible in your env - - >>> _module_available('os') - True - >>> _module_available('bla.bla') - False - """ - # todo: find a better way than try / except - try: - mods = module_path.split('.') - assert mods, 'nothing given to test' - # it has to be tested as per partets - for i in range(len(mods)): - module_path = '.'.join(mods[:i + 1]) - if importlib.util.find_spec(module_path) is None: - return False - return True - except AttributeError: - return False - - +OMEGACONF_AVAILABLE = _module_available("omegaconf") APEX_AVAILABLE = _module_available("apex.amp") NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") OMEGACONF_AVAILABLE = _module_available("omegaconf") HYDRA_AVAILABLE =
_module_available("hydra") +HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") HOROVOD_AVAILABLE = _module_available("horovod.torch") BOLTS_AVAILABLE = _module_available("pl_bolts") diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 775c22dbbfa0a..76ac0a6c595aa 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -49,12 +49,14 @@ def apply_to_collection(data: Any, dtype: Union[type, tuple], function: Callable return function(data, *args, **kwargs) # Recursively apply to collection items - elif isinstance(data, Mapping): + if isinstance(data, Mapping): return elem_type({k: apply_to_collection(v, dtype, function, *args, **kwargs) for k, v in data.items()}) - elif isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple + + if isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple return elem_type(*(apply_to_collection(d, dtype, function, *args, **kwargs) for d in data)) - elif isinstance(data, Sequence) and not isinstance(data, str): + + if isinstance(data, Sequence) and not isinstance(data, str): return elem_type([apply_to_collection(d, dtype, function, *args, **kwargs) for d in data]) # data is neither of dtype, nor a collection diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 9264e2a49810d..c9fac5cc04a45 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -16,7 +16,7 @@ import time from collections import Counter from functools import wraps -from typing import Callable, Any, Optional +from typing import Any, Callable, Optional def enabled_only(fn: Callable): @@ -133,7 +133,7 @@ def track_lr_schedulers_update(self, batch_idx, interval, scheduler_idx, old_lr, self.saved_lr_scheduler_updates.append(loss_dict) @enabled_only - def track_eval_loss_history(self, test_mode, batch_idx, dataloader_idx, output): + def track_eval_loss_history(self, batch_idx, dataloader_idx, output): loss_dict = { 'sanity_check': self.trainer.running_sanity_check, 'dataloader_idx': dataloader_idx, @@ -142,7 +142,7 @@ def track_eval_loss_history(self, test_mode, batch_idx, dataloader_idx, output): 'output': output } - if test_mode: + if self.trainer.testing: self.saved_test_losses.append(loss_dict) else: self.saved_val_losses.append(loss_dict) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 9724f05247c00..2a0b989e9b9cd 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -15,14 +15,14 @@ import os import warnings from functools import wraps +from typing import Any, Optional, Union import torch + from pytorch_lightning import _logger as log -from typing import Union, Optional, Any if torch.distributed.is_available(): - from torch.distributed import ReduceOp - from torch.distributed import group + from torch.distributed import ReduceOp, group else: class ReduceOp: SUM = None @@ -145,15 +145,14 @@ def sync_ddp( if group is None: group = torch.distributed.group.WORLD - if reduce_op is None: - reduce_op = torch.distributed.ReduceOp.SUM - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = torch.distributed.ReduceOp.SUM + op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM + + if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"): divide_by_world_size = True # sync all processes before reduction 
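The `apply_to_collection` change above only replaces the `elif` chain with early returns; the recursion over mappings, named tuples, and sequences is unchanged. A small illustration of that behaviour (independent of this diff):

```python
# Small illustration of apply_to_collection's recursion; the refactor above
# does not change this behaviour. Doubles every tensor in a nested collection.
from collections import namedtuple

import torch

from pytorch_lightning.utilities.apply_func import apply_to_collection

Batch = namedtuple("Batch", ["x", "y"])
data = {
    "train": [torch.ones(2), torch.zeros(2)],     # list -> recursed as a Sequence
    "pair": Batch(torch.ones(1), torch.ones(1)),  # named tuple -> type preserved
}

doubled = apply_to_collection(data, torch.Tensor, lambda t: t * 2)
print(doubled["train"][0])    # tensor([2., 2.])
print(type(doubled["pair"]))  # the namedtuple type Batch is preserved
```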
torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) + torch.distributed.all_reduce(result, op=op, group=group, async_op=False) if divide_by_world_size: result = result / torch.distributed.get_world_size(group) @@ -203,10 +202,11 @@ def all_gather_ddp_if_available( Return: A tensor of shape (world_size, batch, ...) """ + group = group if group is not None else torch.distributed.group.WORLD if torch.distributed.is_available() and torch.distributed.is_initialized(): if sync_grads: return AllGatherGrad.apply(tensor, group) else: - with torch.no_grad: + with torch.no_grad(): return AllGatherGrad.apply(tensor, group) return tensor diff --git a/pytorch_lightning/utilities/package_utils.py b/pytorch_lightning/utilities/package_utils.py new file mode 100644 index 0000000000000..99fd6fcc7ebb5 --- /dev/null +++ b/pytorch_lightning/utilities/package_utils.py @@ -0,0 +1,36 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib + + +def _module_available(module_path: str) -> bool: + """Testing if given module is available in your env + + >>> _module_available('os') + True + >>> _module_available('bla.bla') + False + """ + # todo: find a better way than try / except + try: + mods = module_path.split('.') + assert mods, 'nothing given to test' + # it has to be tested per part + for i in range(len(mods)): + module_path = '.'.join(mods[:i + 1]) + if importlib.util.find_spec(module_path) is None: + return False + return True + except AttributeError: + return False diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index b207320c25ccc..5d90583345b4a 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -15,9 +15,11 @@ import inspect import pickle from argparse import Namespace -from typing import Dict, Union, Tuple +from typing import Dict, Tuple, Union from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.package_utils import _module_available def str_to_bool_or_str(val: str) -> Union[str, bool]: @@ -115,7 +117,6 @@ def get_init_args(frame) -> dict: self_var, args_var, kwargs_var = parse_class_init_keys(cls) filtered_vars = [n for n in (self_var, args_var, kwargs_var) if n] exclude_argnames = (*filtered_vars, '__class__', 'frame', 'frame_args') - # only collect variables that appear in the signature local_args = {k: local_vars[k] for k in init_parameters.keys()} local_args.update(local_args.get(kwargs_var, {})) diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py index 1ce782f967ebb..16bc39bd7f142 100644 --- a/pytorch_lightning/utilities/seed.py +++ b/pytorch_lightning/utilities/seed.py @@ -20,8 +20,8 @@ import numpy as np import torch - from pytorch_lightning import _logger as log +from pytorch_lightning.utilities import rank_zero_warn def
seed_everything(seed: Optional[int] = None) -> int: @@ -41,18 +41,17 @@ def seed_everything(seed: Optional[int] = None) -> int: try: if seed is None: - seed = os.environ.get("PL_GLOBAL_SEED", _select_seed_randomly(min_seed_value, max_seed_value)) + seed = os.environ.get("PL_GLOBAL_SEED") seed = int(seed) except (TypeError, ValueError): seed = _select_seed_randomly(min_seed_value, max_seed_value) + rank_zero_warn(f"No correct seed found, seed set to {seed}") - if (seed > max_seed_value) or (seed < min_seed_value): - log.warning( - f"{seed} is not in bounds, \ - numpy accepts from {min_seed_value} to {max_seed_value}" - ) + if not (min_seed_value <= seed <= max_seed_value): + rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) + log.info(f"Global seed set to {seed}") os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) @@ -62,6 +61,4 @@ def seed_everything(seed: Optional[int] = None) -> int: def _select_seed_randomly(min_seed_value: int = 0, max_seed_value: int = 255) -> int: - seed = random.randint(min_seed_value, max_seed_value) - log.warning(f"No correct seed found, seed set to {seed}") - return seed + return random.randint(min_seed_value, max_seed_value) diff --git a/requirements.txt b/requirements.txt index 4b8a3efb5c841..2dd5378649851 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,5 @@ future>=0.17.1 # required for builtins in setup.py # pyyaml>=3.13 PyYAML>=5.1 # OmegaConf requirement >=5.1 tqdm>=4.41.0 -fsspec>=0.8.0 +fsspec[http]>=0.8.1 tensorboard>=2.2.0 diff --git a/requirements/devel.txt b/requirements/devel.txt index a8c5293c8c7db..dcf66495ee46f 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -8,4 +8,4 @@ -r ./test.txt # install all extra dependencies for running examples --r ./examples.txt \ No newline at end of file +-r ./examples.txt diff --git a/requirements/loggers.txt b/requirements/loggers.txt index 3ec7b25db4643..001210855871d 100644 --- a/requirements/loggers.txt +++ b/requirements/loggers.txt @@ -3,4 +3,4 @@ neptune-client>=0.4.109 comet-ml>=3.1.12 mlflow>=1.0.0 test_tube>=0.7.5 -wandb>=0.8.21 \ No newline at end of file +wandb>=0.8.21 diff --git a/requirements/test.txt b/requirements/test.txt index 3cb538a98d7c8..e9226139d9287 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ pytest>=5.0 flake8>=3.6 flake8-black check-manifest -twine==1.13.0 +twine==3.2 # scipy>=0.13.3 scikit-learn>=0.22.2 scikit-image>=0.17.2 @@ -17,3 +17,4 @@ pre-commit>=1.0 cloudpickle>=1.3 nltk>=3.3 +pandas # needed in benchmarks diff --git a/setup.cfg b/setup.cfg index b0c2c8640bfba..27f5df8ac6961 100644 --- a/setup.cfg +++ b/setup.cfg @@ -102,6 +102,10 @@ max-line-length = 120 files = pytorch_lightning, pl_examples, benchmarks, tests disallow_untyped_defs = True ignore_missing_imports = True +show_error_codes = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True # todo: add proper typing to this module... 
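With the `seed_everything` changes above, calling it without an argument now falls back to the `PL_GLOBAL_SEED` environment variable, an invalid or out-of-range seed triggers `rank_zero_warn` plus a random fallback, and the final seed is logged. A brief behavioural sketch (the out-of-range value is just an illustration):

```python
# Brief sketch of the new seed_everything() behaviour described above.
import os

from pytorch_lightning import seed_everything

seed_everything(42)                  # logs "Global seed set to 42"
assert os.environ["PL_GLOBAL_SEED"] == "42"

seed_everything()                    # no argument: reuses PL_GLOBAL_SEED, i.e. 42 again

seed_everything(123456789012345)     # outside numpy's accepted seed range: warns
                                     # "... is not in bounds ..." and picks a random seed
```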
[mypy-pytorch_lightning.callbacks.*] diff --git a/setup.py b/setup.py index c548d508ab434..dd36842d84a38 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ url=pytorch_lightning.__homepage__, download_url='https://github.com/PyTorchLightning/pytorch-lightning', license=pytorch_lightning.__license__, - packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks']), + packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks', 'legacy', 'legacy/*']), long_description=_load_long_description(PATH_ROOT), long_description_content_type='text/markdown', diff --git a/tests/README.md b/tests/README.md index 8ef006c4d879a..7b857a1901fd7 100644 --- a/tests/README.md +++ b/tests/README.md @@ -33,8 +33,8 @@ The GPU machine must have: 3. [Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_OPERATIONS=NCCL pip install horovod` -## Running Coverage -Make sure to run coverage on a GPU machine with at least 2 GPUs and NVIDIA apex installed. +## Running Coverage +Make sure to run coverage on a GPU machine with at least 2 GPUs and NVIDIA apex installed. ```bash cd pytorch-lightning diff --git a/tests/__init__.py b/tests/__init__.py index 981d685430da9..b4a7291dfec66 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import numpy as np @@ -5,6 +18,8 @@ TEST_ROOT = os.path.dirname(__file__) PROJECT_ROOT = os.path.dirname(TEST_ROOT) TEMP_PATH = os.path.join(PROJECT_ROOT, 'test_temp') +DATASETS_PATH = os.path.join(PROJECT_ROOT, 'Datasets') +LEGACY_PATH = os.path.join(PROJECT_ROOT, 'legacy') # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages if PROJECT_ROOT not in os.getenv('PYTHONPATH', ""): diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 6ceffe8562372..6fdc3794d05f6 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch -from pytorch_lightning import LightningModule from torch.utils.data import Dataset +from pytorch_lightning import LightningModule + class RandomDictDataset(Dataset): def __init__(self, size, length): diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 223f27731fef9..08a1db65ce1e2 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -63,8 +63,13 @@ class MNIST(Dataset): TEST_FILE_NAME = 'test.pt' cache_folder_name = 'complete' - def __init__(self, root: str = PATH_DATASETS, train: bool = True, - normalize: tuple = (0.5, 1.0), download: bool = True): + def __init__( + self, + root: str = PATH_DATASETS, + train: bool = True, + normalize: tuple = (0.5, 1.0), + download: bool = True, + ): super().__init__() self.root = root self.train = train # training set or test set diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 9c88ba1b7e4d3..7b40ba4f39ead 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -14,8 +14,6 @@ import functools import os -import numpy as np - from pytorch_lightning import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger @@ -23,24 +21,6 @@ from tests.base.model_template import EvalModelTemplate -def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): - # assert speeds - diffs = np.asarray(pl_times) - np.asarray(pt_times) - # norm by vanila time - diffs = diffs / np.asarray(pt_times) - assert np.alltrue(diffs < max_diff), \ - f"lightning {diffs} was slower than PT (threshold {max_diff})" - - -def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.55): - # assert speeds - diffs = np.asarray(pl_times) - np.asarray(pt_times) - # norm by vanila time - diffs = diffs / nb_epochs - assert np.alltrue(diffs < max_diff), \ - f"lightning {diffs} was slower than PT (threshold {max_diff})" - - def get_default_logger(save_dir, version=None): # set up logger object without actually saving logs logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index c00c712bb3b13..c9baf0db6976d 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from unittest import mock -from unittest.mock import ANY, MagicMock, call +from unittest.mock import ANY, call, MagicMock from pytorch_lightning import Trainer from tests.base import BoringModel @@ -109,8 +109,6 @@ def test_trainer_callback_system(torch_save): call.on_init_end(trainer), call.setup(trainer, model, 'test'), call.on_fit_start(trainer, model), - call.on_pretrain_routine_start(trainer, model), - call.on_pretrain_routine_end(trainer, model), call.on_test_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 7cecefad03276..5c54f6a84805d 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -13,18 +13,17 @@ # limitations under the License. 
import os import pickle +from unittest import mock import cloudpickle import numpy as np import pytest import torch -from unittest import mock -from pytorch_lightning import _logger -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import _logger, seed_everything, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from tests.base import EvalModelTemplate, BoringModel from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base import BoringModel, EvalModelTemplate class EarlyStoppingTestRestore(EarlyStopping): @@ -87,15 +86,18 @@ def test_resume_early_stopping_from_checkpoint(tmpdir): def test_early_stopping_no_extraneous_invocations(tmpdir): """Test to ensure that callback methods aren't being invoked outside of the callback handler.""" model = EvalModelTemplate() + early_stop_callback = EarlyStopping() expected_count = 4 trainer = Trainer( default_root_dir=tmpdir, - callbacks=[EarlyStopping()], + callbacks=[early_stop_callback], val_check_interval=1.0, max_epochs=expected_count, ) trainer.fit(model) + assert trainer.early_stopping_callback == early_stop_callback + assert trainer.early_stopping_callbacks == [early_stop_callback] assert len(trainer.dev_debugger.early_stopping_history) == expected_count diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index d29f254df67d0..39dd821e63dcd 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +from torch import optim import tests.base.develop_utils as tutils from pytorch_lightning import Trainer @@ -47,19 +48,34 @@ def test_lr_monitor_single_lr(tmpdir): 'Names of learning rates not set correctly' -def test_lr_monitor_single_lr_with_momentum(tmpdir): - """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """ - tutils.reset_seed() +@pytest.mark.parametrize('opt', ['SGD', 'Adam']) +def test_lr_monitor_single_lr_with_momentum(tmpdir, opt): + """ + Test that learning rates and momentum are extracted and logged for single lr scheduler. 
+ """ + class LogMomentumModel(BoringModel): + def __init__(self, opt): + super().__init__() + self.opt = opt - model = EvalModelTemplate() - model.configure_optimizers = model.configure_optimizers__onecycle_scheduler + def configure_optimizers(self): + if self.opt == 'SGD': + opt_kwargs = {'momentum': 0.9} + elif self.opt == 'Adam': + opt_kwargs = {'betas': (0.9, 0.999)} + optimizer = getattr(optim, self.opt)(self.parameters(), lr=1e-2, **opt_kwargs) + lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-2, total_steps=10_000) + return [optimizer], [lr_scheduler] + + model = LogMomentumModel(opt=opt) lr_monitor = LearningRateMonitor(log_momentum=True) trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, - limit_val_batches=0.1, - limit_train_batches=0.5, + limit_val_batches=2, + limit_train_batches=5, + log_every_n_steps=1, callbacks=[lr_monitor], ) result = trainer.fit(model) @@ -69,7 +85,39 @@ def test_lr_monitor_single_lr_with_momentum(tmpdir): 'Expected momentum to be logged' assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ 'Number of momentum values logged does not match number of lr schedulers' - assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \ + assert all(k == f'lr-{opt}-momentum' for k in lr_monitor.last_momentum_values.keys()), \ + 'Names of momentum values not set correctly' + + +def test_log_momentum_no_momentum_optimizer(tmpdir): + """ + Test that if optimizer doesn't have momentum then a warning is raised with log_momentum=True. + """ + class LogMomentumModel(BoringModel): + def configure_optimizers(self): + optimizer = optim.ASGD(self.parameters(), lr=1e-2) + lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + model = LogMomentumModel() + lr_monitor = LearningRateMonitor(log_momentum=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=2, + limit_train_batches=5, + log_every_n_steps=1, + callbacks=[lr_monitor], + ) + with pytest.warns(RuntimeWarning, match="optimizers do not have momentum."): + result = trainer.fit(model) + assert result + + assert all(v == 0 for v in lr_monitor.last_momentum_values.values()), \ + 'Expected momentum to be logged' + assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ + 'Number of momentum values logged does not match number of lr schedulers' + assert all(k == 'lr-ASGD-momentum' for k in lr_monitor.last_momentum_values.keys()), \ 'Names of momentum values not set correctly' @@ -105,7 +153,7 @@ def test_lr_monitor_no_logger(tmpdir): logger=False ) - with pytest.raises(MisconfigurationException, match='Trainer that has no logger'): + with pytest.raises(MisconfigurationException, match='`Trainer` that has no logger'): trainer.fit(model) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 857877f8239ba..f9686dce159dd 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -17,55 +17,10 @@ import pytest import torch -from pytorch_lightning import Trainer, callbacks, seed_everything +from pytorch_lightning import callbacks, seed_everything, Trainer from tests.base import BoringModel -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_mc_called_on_fastdevrun(tmpdir): - seed_everything(1234) - - train_val_step_model = BoringModel() - - # fast dev run = called once - # train 
loop only, dict, eval result - trainer = Trainer(fast_dev_run=True) - trainer.fit(train_val_step_model) - - # checkpoint should have been called once with fast dev run - assert len(trainer.dev_debugger.checkpoint_callback_history) == 1 - - # ----------------------- - # also called once with no val step - # ----------------------- - class TrainingStepCalled(BoringModel): - def __init__(self): - super().__init__() - self.training_step_called = False - self.validation_step_called = False - self.test_step_called = False - - def training_step(self, batch, batch_idx): - self.training_step_called = True - return super().training_step(batch, batch_idx) - - train_step_only_model = TrainingStepCalled() - train_step_only_model.validation_step = None - - # fast dev run = called once - # train loop only, dict, eval result - trainer = Trainer(fast_dev_run=True) - trainer.fit(train_step_only_model) - - # make sure only training step was called - assert train_step_only_model.training_step_called - assert not train_step_only_model.validation_step_called - assert not train_step_only_model.test_step_called - - # checkpoint should have been called once with fast dev run - assert len(trainer.dev_debugger.checkpoint_callback_history) == 1 - - @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_mc_called(tmpdir): seed_everything(1234) diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py new file mode 100644 index 0000000000000..577362e65f1c9 --- /dev/null +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -0,0 +1,74 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import glob +import os +import sys + +import pytest + +from pytorch_lightning import Trainer +from tests import LEGACY_PATH + +LEGACY_CHECKPOINTS_PATH = os.path.join(LEGACY_PATH, 'checkpoints') +CHECKPOINT_EXTENSION = ".ckpt" + + +# todo: add more legacy checkpoints - for < v0.8 +@pytest.mark.parametrize("pl_version", [ + # "0.8.1", + "0.8.3", + "0.8.4", + # "0.8.5", # this version has problem with loading on PT<=1.4 as it seems to be archive + # "0.9.0", # this version has problem with loading on PT<=1.4 as it seems to be archive + "0.10.0", + "1.0.0", + "1.0.1", + "1.0.2", + "1.0.3", + "1.0.4", + "1.0.5", + "1.0.6", + "1.0.7", + "1.0.8", + "1.1.0", + "1.1.1", + "1.1.2", + "1.1.3", + "1.1.4", + "1.1.5", +]) +def test_resume_legacy_checkpoints(tmpdir, pl_version): + path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version) + + # todo: make this as mock, so it is cleaner... 
+ orig_sys_paths = list(sys.path) + sys.path.insert(0, path_dir) + from zero_training import DummyModel + + path_ckpts = sorted(glob.glob(os.path.join(path_dir, f'*{CHECKPOINT_EXTENSION}'))) + assert path_ckpts, 'No checkpoints found in folder "%s"' % path_dir + path_ckpt = path_ckpts[-1] + + model = DummyModel.load_from_checkpoint(path_ckpt) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=6) + result = trainer.fit(model) + assert result + + # todo + # model = DummyModel() + # trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, resume_from_checkpoint=path_ckpt) + # result = trainer.fit(model) + # assert result + + sys.path = orig_sys_paths diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 9817dfa4526c6..3de26ef1a6fb6 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -28,7 +28,7 @@ import pytorch_lightning as pl import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -636,8 +636,7 @@ def validation_epoch_end(self, outputs): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy(tmpdir): """ This test validates that the checkpoint can be called when provided to callbacks list """ @@ -657,7 +656,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, callbacks=[checkpoint_callback], - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -674,7 +672,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, resume_from_checkpoint=checkpoint_callback.best_model_path, - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -685,8 +682,7 @@ def validation_step(self, batch, batch_idx): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy_extended(tmpdir): """ This test validates checkpoint can be called several times without increasing internally its global step if nothing run. 
@@ -731,7 +727,6 @@ def assert_checkpoint_log_dir(idx): limit_train_batches=limit_train_batches, limit_val_batches=3, limit_test_batches=4, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_cb], ) trainer = pl.Trainer(**trainer_config) @@ -760,9 +755,9 @@ def assert_checkpoint_log_dir(idx): model = ExtendedBoringModel() trainer.test(model) assert not trainer.checkpoint_connector.has_trained - assert trainer.global_step == epochs * limit_train_batches - assert trainer.current_epoch == epochs - + # resume_from_checkpoint is resumed when calling `.fit` + assert trainer.global_step == 0 + assert trainer.current_epoch == 0 trainer.fit(model) assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches @@ -896,7 +891,8 @@ def training_step(self, *args): ) trainer = Trainer( default_root_dir=tmpdir, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, callbacks=[model_checkpoint], logger=False, weights_summary=None, @@ -922,7 +918,8 @@ def __init__(self, hparams): ) trainer = Trainer( default_root_dir=tmpdir, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, callbacks=[model_checkpoint], logger=False, weights_summary=None, @@ -938,3 +935,42 @@ def __init__(self, hparams): else: # make sure it's not AttributeDict assert type(ckpt[model.CHECKPOINT_HYPER_PARAMS_KEY]) == hparams_type + + +@pytest.mark.parametrize('max_epochs', [3, 4]) +@pytest.mark.parametrize( + 'save_top_k, expected', + [ + (1, ['curr_epoch.ckpt']), + (2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']), + ] +) +def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected): + """ + Test that version is added to filename if required and it already exists in dirpath. + """ + model_checkpoint = ModelCheckpoint( + dirpath=tmpdir, + filename='curr_epoch', + save_top_k=save_top_k, + monitor='epoch', + mode='max', + ) + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[model_checkpoint], + max_epochs=max_epochs, + limit_train_batches=2, + limit_val_batches=2, + logger=None, + weights_summary=None, + progress_bar_refresh_rate=0, + ) + + model = BoringModel() + trainer.fit(model) + ckpt_files = os.listdir(tmpdir) + assert set(ckpt_files) == set(expected) + + epochs_in_ckpt_files = [pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files] + assert sorted(epochs_in_ckpt_files) == list(range(max_epochs - save_top_k, max_epochs)) diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 493aa0dabe126..b322cfe5a7fd3 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -22,15 +22,13 @@ from tests.base import BoringModel -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_torch_save(tmpdir, enable_pl_optimizer): +def test_model_torch_save(tmpdir): """Test to ensure torch save does not fail for model and trainer.""" model = BoringModel() num_epochs = 1 trainer = Trainer( default_root_dir=tmpdir, max_epochs=num_epochs, - enable_pl_optimizer=enable_pl_optimizer, ) temp_path = os.path.join(tmpdir, 'temp.pt') trainer.fit(model) @@ -39,8 +37,6 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer): torch.save(trainer.model, temp_path) torch.save(trainer, temp_path) trainer = torch.load(temp_path) - is_lightning_optimizer = isinstance(trainer.optimizers[0], LightningOptimizer) - assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer 
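The new `test_model_checkpoint_file_already_exists` test above pins down what happens when a static `filename` collides across epochs: a `-v<N>` suffix is added instead of overwriting the existing checkpoint. A condensed sketch of the configuration it exercises (the `dirpath` is a placeholder and `model` stands in for any LightningModule):

```python
# Condensed sketch of the configuration exercised by the new
# test_model_checkpoint_file_already_exists test above; "checkpoints/" and
# `model` are placeholders.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints/",
    filename="curr_epoch",   # static name -> collides every epoch
    save_top_k=2,
    monitor="epoch",
    mode="max",
)
trainer = Trainer(
    callbacks=[checkpoint_cb],
    max_epochs=3,
    limit_train_batches=2,
    limit_val_batches=2,
)
# after trainer.fit(model), the test above expects the directory to contain
# {"curr_epoch.ckpt", "curr_epoch-v0.ckpt"}
```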
@pytest.mark.skipif(platform.system() == "Windows", diff --git a/tests/checkpointing/test_trainer_checkpoint.py b/tests/checkpointing/test_trainer_checkpoint.py new file mode 100644 index 0000000000000..9e93a8c297481 --- /dev/null +++ b/tests/checkpointing/test_trainer_checkpoint.py @@ -0,0 +1,87 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from copy import deepcopy +import os + +import torch + +import pytorch_lightning as pl +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.utilities.cloud_io import load as pl_load +from tests.base import BoringModel + + +def test_finetuning_with_resume_from_checkpoint(tmpdir): + """ + This test validates that generated ModelCheckpoint is pointing to the right best_model_path during test + """ + + seed_everything(3) + + checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=tmpdir, filename="{epoch:02d}", save_top_k=-1) + + class ExtendedBoringModel(BoringModel): + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("val_loss", loss, on_epoch=True, prog_bar=True) + + model = ExtendedBoringModel() + model.validation_epoch_end = None + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=12, + limit_val_batches=6, + limit_test_batches=12, + callbacks=[checkpoint_callback], + logger=False, + ) + trainer.fit(model) + assert os.listdir(tmpdir) == ['epoch=00.ckpt'] + + best_model_paths = [checkpoint_callback.best_model_path] + results = [] + + for idx in range(3, 6): + # load from checkpoint + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=idx, + limit_train_batches=12, + limit_val_batches=12, + limit_test_batches=12, + resume_from_checkpoint=best_model_paths[-1], + progress_bar_refresh_rate=0, + ) + trainer.fit(model) + trainer.test() + results.append(deepcopy(trainer.callback_metrics)) + best_model_paths.append(trainer.checkpoint_callback.best_model_path) + + for idx in range(len(results) - 1): + assert results[idx]["val_loss"] > results[idx + 1]["val_loss"] + + for idx, best_model_path in enumerate(best_model_paths): + if idx == 0: + assert best_model_path.endswith(f"epoch=0{idx}.ckpt") + else: + assert f"epoch={idx + 1}" in best_model_path diff --git a/tests/collect_env_details.py b/tests/collect_env_details.py index 1d443795d2876..2b8c4b3fafeed 100644 --- a/tests/collect_env_details.py +++ b/tests/collect_env_details.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Diagnose your system and show basic information This server mainly to get detail info for better bug reporting. diff --git a/tests/conftest.py b/tests/conftest.py index ad4b7169456a8..07188fed4dbed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import threading from functools import partial, wraps diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index d286bbf3a9de6..64dc25101eae6 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,21 +13,21 @@ # limitations under the License. import pickle from argparse import ArgumentParser -from unittest.mock import MagicMock from typing import Optional +from unittest.mock import MagicMock import pytest import torch from torch.utils.data import DataLoader, random_split -from pytorch_lightning import LightningDataModule, Trainer, seed_everything +from pytorch_lightning import LightningDataModule, seed_everything, Trainer +from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.utilities.model_utils import is_overridden from tests.base import EvalModelTemplate -from tests.base.datasets import TrialMNIST from tests.base.datamodules import TrialMNISTDataModule +from tests.base.datasets import TrialMNIST from tests.base.develop_utils import reset_seed -from pytorch_lightning.utilities.model_utils import is_overridden -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator -from pytorch_lightning.callbacks import ModelCheckpoint def test_can_prepare_data(tmpdir): @@ -170,14 +170,14 @@ def test_data_hooks_called_with_stage_kwarg(tmpdir): def test_dm_add_argparse_args(tmpdir): parser = ArgumentParser() parser = TrialMNISTDataModule.add_argparse_args(parser) - args = parser.parse_args(['--data_dir', './my_data']) - assert args.data_dir == './my_data' + args = parser.parse_args(['--data_dir', str(tmpdir)]) + assert args.data_dir == str(tmpdir) def test_dm_init_from_argparse_args(tmpdir): parser = ArgumentParser() parser = TrialMNISTDataModule.add_argparse_args(parser) - args = parser.parse_args(['--data_dir', './my_data']) + args = parser.parse_args(['--data_dir', str(tmpdir)]) dm = TrialMNISTDataModule.from_argparse_args(args) dm.prepare_data() dm.setup() diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index e3a597063d02e..64b68245ba66e 100644 --- a/tests/core/test_lightning_module.py +++ 
b/tests/core/test_lightning_module.py @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pickle from argparse import ArgumentParser +import pickle from typing import Optional from unittest.mock import MagicMock, patch import pytest import torch -from torch.optim import SGD, Adam +from torch.optim import Adam, SGD from torch.utils.data import DataLoader, random_split -from pytorch_lightning import LightningDataModule, Trainer, seed_everything +from pytorch_lightning import LightningDataModule, seed_everything, Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel @@ -45,8 +45,7 @@ def optimizer_step(self, *_, **__): assert "It ensures optimizer_step or optimizer_zero_grad are called on every batch" in str(e) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir): +def test_automatic_optimization_num_calls(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -75,16 +74,12 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, if batch_idx % 2 == 0: assert isinstance(optimizer, SGD) optimizer.step(closure=optimizer_closure) - if not enable_pl_optimizer: - optimizer.zero_grad() # update discriminator opt every 4 steps if optimizer_idx == 1: if batch_idx % 4 == 0: assert isinstance(optimizer, Adam) optimizer.step(closure=optimizer_closure) - if not enable_pl_optimizer: - optimizer.zero_grad() model = TestModel() model.training_epoch_end = None @@ -94,7 +89,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) @@ -105,8 +99,7 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, assert adam_zero_grad.call_count == 2 -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_params_groups_and_state_are_accessible(enable_pl_optimizer, tmpdir): +def test_params_groups_and_state_are_accessible(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -143,7 +136,6 @@ def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, clos default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index a9fcf918cc699..1c49844b5764b 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -14,16 +14,18 @@ import os from unittest.mock import patch +import numpy as np import pytest import torch import torch.nn as nn from torch.optim import Adam, Optimizer import pytorch_lightning as pl -from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning import LightningModule, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_utils import is_overridden from tests.base.boring_model import BoringModel, RandomDataset, RandomDictDataset, 
RandomDictStringDataset @@ -45,13 +47,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'dampening': 0, 'initial_lr': 0.1, 'lr': 0.01, 'momentum': 0, 'nesterov': False, 'weight_decay': 0}" expected = f"LightningSGD(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert trainer._lightning_optimizers[0].__repr__() == expected def test_lightning_optimizer_from_user(tmpdir): @@ -73,13 +74,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'amsgrad': False, 'betas': (0.9, 0.999), 'eps': 1e-08, 'initial_lr': 0.1, 'lr': 0.01, 'weight_decay': 0}" expected = f"LightningAdam(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert trainer._lightning_optimizers[0].__repr__() == expected @patch("torch.optim.Adam.step", autospec=True) @@ -89,6 +89,9 @@ def test_lightning_optimizer_manual_optimization(mock_sgd_step, mock_adam_step, Test that the user can use our LightningOptimizer. Not recommended for now. """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def training_step(self, batch, batch_idx, optimizer_idx=None): (opt_1, opt_2) = self.optimizers() @@ -114,10 +117,6 @@ def configure_optimizers(self): lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.training_step_end = None model.training_epoch_end = None @@ -127,7 +126,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -142,6 +140,9 @@ def test_lightning_optimizer_manual_optimization_and_accumulated_gradients(mock_ Test that the user can use our LightningOptimizer. Not recommended. 
""" class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def training_step(self, batch, batch_idx, optimizer_idx=None): (opt_1, opt_2) = self.optimizers() @@ -167,10 +168,6 @@ def configure_optimizers(self): lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.training_step_end = None model.training_epoch_end = None @@ -181,7 +178,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, weights_summary=None, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -261,7 +257,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -314,7 +309,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -374,7 +368,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -427,7 +420,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index cfeb302134d24..cb68ad04459e8 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -15,8 +15,9 @@ import torch import torch.nn as nn -from pytorch_lightning import LightningModule +from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.core.memory import UNKNOWN_SIZE, ModelSummary +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.models import ParityModuleRNN @@ -32,6 +33,22 @@ def forward(self, *args, **kwargs): return {'loss': self.parameter.sum()} +class PreCalculatedModel(LightningModule): + """ A module with precalculated total params size in MB. """ + + def __init__(self): + super().__init__() + self.layer1 = nn.Linear(10, 100) + self.layer2 = nn.Linear(100, 2) + self.pre_calculated_model_size = 0.005 + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + return x + + + class UnorderedModel(LightningModule): """ A model in which the layers not defined in order of execution """ @@ -68,6 +85,15 @@ def forward(self, x): return self.reduce(self.embed(x)) +def test_invalid_weights_summmary(): + """ Test that invalid value for weights_summary raises an error. """ + with pytest.raises(MisconfigurationException, match='`mode` can be None, .* got temp'): + UnorderedModel().summarize(mode='temp') + + with pytest.raises(MisconfigurationException, match='`weights_summary` can be None, .* got temp'): + Trainer(weights_summary='temp') + + @pytest.mark.parametrize(['mode'], [ pytest.param(ModelSummary.MODE_FULL), pytest.param(ModelSummary.MODE_TOP), @@ -237,3 +263,25 @@ def forward(self, *args, **kwargs): model.example_input_array = example_input summary = model.summarize(mode=mode) assert summary.in_sizes == [expected_size] + +@pytest.mark.parametrize(['mode'], [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), +]) +def test_model_size(mode): + """ Test that model size is calculated correctly. 
""" + model = PreCalculatedModel() + summary = model.summarize(mode=mode) + pre_calculated_model_size = torch.tensor(model.pre_calculated_model_size) + model_size = torch.tensor(summary.model_size()) + assert torch.isclose(model_size, pre_calculated_model_size, atol=1e-4) + +@pytest.mark.parametrize(['mode'], [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), +]) +def test_empty_model_size(mode): + """ Test that empty model size is zero. """ + model = EmptyModule() + summary = model.summarize(mode=mode) + assert 0.0 == summary.model_size() diff --git a/tests/deprecated_api/test_remove_1-3.py b/tests/deprecated_api/test_remove_1-3.py index 7ec69796b1e46..4a5bed4de9b55 100644 --- a/tests/deprecated_api/test_remove_1-3.py +++ b/tests/deprecated_api/test_remove_1-3.py @@ -135,3 +135,8 @@ def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, ex assert getattr(args, "profiler") == expected_parsed_arg trainer = Trainer.from_argparse_args(args) assert isinstance(trainer.profiler, expected_profiler) + + +def test_trainer_enable_pl_optimizer(tmpdir): + with pytest.deprecated_call(match='will be removed in v1.3'): + Trainer(enable_pl_optimizer=True) diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index ba5791c7b9f4a..4bf15ff8d99a1 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -74,7 +74,9 @@ def test_loggers_fit_test_all(tmpdir, monkeypatch): with mock.patch('pytorch_lightning.loggers.test_tube.Experiment'): _test_loggers_fit_test(tmpdir, TestTubeLogger) - with mock.patch('pytorch_lightning.loggers.wandb.wandb'): + with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb: + wandb.run = None + wandb.init().step = 0 _test_loggers_fit_test(tmpdir, WandbLogger) @@ -112,9 +114,9 @@ def log_metrics(self, metrics, step): trainer = Trainer( max_epochs=1, logger=logger, - limit_train_batches=0.2, - limit_val_batches=0.5, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, + log_every_n_steps=1, default_root_dir=tmpdir, ) trainer.fit(model) @@ -124,7 +126,7 @@ def log_metrics(self, metrics, step): if logger_class == TensorBoardLogger: expected = [ (0, ['hp_metric']), - (0, ['train_some_val']), + (0, ['epoch', 'train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (0, ['hp_metric']), (1, ['epoch', 'test_acc', 'test_loss']) @@ -132,7 +134,7 @@ def log_metrics(self, metrics, step): assert log_metric_names == expected else: expected = [ - (0, ['train_some_val']), + (0, ['epoch', 'train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (1, ['epoch', 'test_acc', 'test_loss']) ] @@ -368,5 +370,7 @@ def test_logger_with_prefix_all(tmpdir, monkeypatch): # WandB with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb: logger = _instantiate_logger(WandbLogger, save_idr=tmpdir, prefix=prefix) + wandb.run = None + wandb.init().step = 0 logger.log_metrics({"test": 1.0}, step=0) logger.experiment.log.assert_called_once_with({'tmp-test': 1.0}, step=0) diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py index c52dd82889f01..c6072afbb69e2 100644 --- a/tests/loggers/test_mlflow.py +++ b/tests/loggers/test_mlflow.py @@ -20,7 +20,7 @@ from pytorch_lightning import Trainer -from pytorch_lightning.loggers import MLFlowLogger +from pytorch_lightning.loggers import _MLFLOW_AVAILABLE, MLFlowLogger from tests.base import EvalModelTemplate @@ -120,7 +120,7 @@ def test_mlflow_log_dir(client, mlflow, tmpdir): def test_mlflow_logger_dirs_creation(tmpdir): """ Test that 
the logger creates the folders and files in the right place. """ - if not importlib.util.find_spec('mlflow'): + if not _MLFLOW_AVAILABLE: pytest.xfail("test for explicit file creation requires mlflow dependency to be installed.") assert not os.listdir(tmpdir) @@ -137,8 +137,13 @@ def test_mlflow_logger_dirs_creation(tmpdir): assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} model = EvalModelTemplate() - trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3, - log_gpu_memory=True) + trainer = Trainer( + default_root_dir=tmpdir, + logger=logger, + max_epochs=1, + limit_val_batches=3, + log_gpu_memory=True, + ) trainer.fit(model) assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics') diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 15a024003ebf0..148ad550e74c7 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -22,7 +22,7 @@ from omegaconf import OmegaConf from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.loggers import TensorBoardLogger from tests.base import BoringModel, EvalModelTemplate @@ -102,7 +102,7 @@ def test_tensorboard_named_version(tmpdir): expected_version = "2020-02-05-162402" logger = TensorBoardLogger(save_dir=tmpdir, name=name, version=expected_version) - logger.log_hyperparams({"a": 1, "b": 2}) # Force data to be written + logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5}) # Force data to be written assert logger.version == expected_version assert os.listdir(tmpdir / name) == [expected_version] @@ -113,7 +113,7 @@ def test_tensorboard_named_version(tmpdir): def test_tensorboard_no_name(tmpdir, name): """Verify that None or empty name works""" logger = TensorBoardLogger(save_dir=tmpdir, name=name) - logger.log_hyperparams({"a": 1, "b": 2}) # Force data to be written + logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5}) # Force data to be written assert logger.root_dir == tmpdir assert os.listdir(tmpdir / "version_0") @@ -213,8 +213,11 @@ def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmp Tests to ensure that tensorboard log properly when accumulated_gradients > 1 """ class TestModel(BoringModel): - _count = 0 - _indexes = [] + + def __init__(self): + super().__init__() + self._count = 0 + self._indexes = [] def training_step(self, batch, batch_idx): output = self.layer(batch) @@ -222,10 +225,10 @@ def training_step(self, batch, batch_idx): self.log('count', self._count, on_step=True, on_epoch=True) self.log('loss', loss, on_step=True, on_epoch=True) - if self.trainer.logger_connector.should_update_logs: - self._indexes.append(self._count) + if not self.trainer.train_loop.should_accumulate(): + if self.trainer.logger_connector.should_update_logs: + self._indexes.append(self.trainer.global_step) - self._count += 1 return loss def validation_step(self, batch, batch_idx): @@ -245,14 +248,13 @@ def configure_optimizers(self): logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False) - accumulate_grad_batches = 2 trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=12, - limit_val_batches=12, + limit_val_batches=0, max_epochs=3, gpus=0, - accumulate_grad_batches=accumulate_grad_batches, + accumulate_grad_batches=2, logger=[logger_0], 
log_every_n_steps=3, ) @@ -260,5 +262,6 @@ def configure_optimizers(self): mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]] assert mock_count_epochs == expected + mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]] assert model._indexes == mock_count_steps diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 33211e6492d91..a44b19ca39270 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -22,8 +22,14 @@ from tests.base import EvalModelTemplate, BoringModel +def get_warnings(recwarn): + warnings_text = '\n'.join(str(w.message) for w in recwarn.list) + recwarn.clear() + return warnings_text + + @mock.patch('pytorch_lightning.loggers.wandb.wandb') -def test_wandb_logger_init(wandb): +def test_wandb_logger_init(wandb, recwarn): """Verify that basic functionality of wandb logger works. Wandb doesn't work well with pytest so we have to mock it out here.""" @@ -34,6 +40,9 @@ def test_wandb_logger_init(wandb): wandb.init.assert_called_once() wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) + # mock wandb step + wandb.init().step = 0 + # test wandb.init not called if there is a W&B run wandb.init().log.reset_mock() wandb.init.reset_mock() @@ -49,15 +58,28 @@ def test_wandb_logger_init(wandb): logger.log_metrics({'acc': 1.0}, step=3) wandb.init().log.assert_called_with({'acc': 1.0}, step=6) + # log hyper parameters logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]}) wandb.init().config.update.assert_called_once_with( {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]}, allow_val_change=True, ) + # watch a model logger.watch('model', 'log', 10) wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10) + # verify warning for logging at a previous step + assert 'Trying to log at a previous step' not in get_warnings(recwarn) + # current step from wandb should be 6 (last logged step) + logger.experiment.step = 6 + # logging at step 2 should raise a warning (step_offset is still 3) + logger.log_metrics({'acc': 1.0}, step=2) + assert 'Trying to log at a previous step' in get_warnings(recwarn) + # logging again at step 2 should not display again the same warning + logger.log_metrics({'acc': 1.0}, step=2) + assert 'Trying to log at a previous step' not in get_warnings(recwarn) + assert logger.name == wandb.init().project_name() assert logger.version == wandb.init().id @@ -71,6 +93,7 @@ def test_wandb_pickle(wandb, tmpdir): class Experiment: """ """ id = 'the_id' + step = 0 def project_name(self): return 'the_project_name' @@ -108,8 +131,11 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir): assert logger.name is None # mock return values of experiment + wandb.run = None + wandb.init().step = 0 logger.experiment.id = '1' logger.experiment.project_name.return_value = 'project' + logger.experiment.step = 0 for _ in range(2): _ = logger.experiment diff --git a/tests/metrics/regression/test_ssim.py b/tests/metrics/regression/test_ssim.py index f581188e89fce..8bb304850e3f2 100644 --- a/tests/metrics/regression/test_ssim.py +++ b/tests/metrics/regression/test_ssim.py @@ -53,9 +53,7 @@ def _sk_metric(preds, target, data_range, multichannel): class TestSSIM(MetricTester): atol = 6e-5 - # TODO: for some reason this test hangs with ddp=True - # @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("ddp", [False]) + @pytest.mark.parametrize("ddp", [True, False]) 
@pytest.mark.parametrize("dist_sync_on_step", [True, False]) def test_ssim(self, preds, target, multichannel, ddp, dist_sync_on_step): self.run_class_metric_test( diff --git a/tests/metrics/test_metric.py b/tests/metrics/test_metric.py index d97cd1a176cf2..67e85624379a5 100644 --- a/tests/metrics/test_metric.py +++ b/tests/metrics/test_metric.py @@ -26,6 +26,20 @@ def compute(self): pass +class DummyList(Metric): + name = "DummyList" + + def __init__(self): + super().__init__() + self.add_state("x", list(), dist_reduce_fx=None) + + def update(self): + pass + + def compute(self): + pass + + def test_inherit(): a = Dummy() @@ -77,12 +91,21 @@ def test_reset(): class A(Dummy): pass + class B(DummyList): + pass + a = A() assert a.x == 0 a.x = torch.tensor(5) a.reset() assert a.x == 0 + b = B() + assert isinstance(b.x, list) and len(b.x) == 0 + b.x = torch.tensor(5) + b.reset() + assert isinstance(b.x, list) and len(b.x) == 0 + def test_update(): class A(Dummy): diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index c607a466b2068..4bd6608ce3fcf 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -11,6 +11,11 @@ from pytorch_lightning.metrics import Metric +try: + set_start_method("spawn") +except RuntimeError: + pass + NUM_PROCESSES = 2 NUM_BATCHES = 10 BATCH_SIZE = 32 @@ -165,10 +170,7 @@ def setup_class(self): """Setup the metric class. This will spawn the pool of workers that are used for metric testing and setup_ddp """ - try: - set_start_method("spawn") - except RuntimeError: - pass + self.poolSize = NUM_PROCESSES self.pool = Pool(processes=self.poolSize) self.pool.starmap(setup_ddp, [(rank, self.poolSize) for rank in range(self.poolSize)]) diff --git a/tests/models/conf/config.yaml b/tests/models/conf/config.yaml new file mode 100644 index 0000000000000..faf751c24f6cb --- /dev/null +++ b/tests/models/conf/config.yaml @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+defaults: + - training: default + +log: ${training.log} diff --git a/tests/models/conf/training/default.yaml b/tests/models/conf/training/default.yaml new file mode 100644 index 0000000000000..2c35b22365420 --- /dev/null +++ b/tests/models/conf/training/default.yaml @@ -0,0 +1,2 @@ +# @package training +log: "Something" diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 269a2069e4266..214c3951c80dd 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -16,6 +16,7 @@ import pytest import torch +from torch import optim import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils @@ -145,8 +146,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): +def test_cpu_model_with_amp(tmpdir): """Make sure model trains on CPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -155,7 +155,6 @@ def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.4, precision=16, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -191,9 +190,15 @@ def test_amp_without_apex(tmpdir): @pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") def test_amp_with_apex(tmpdir): """Check calling apex scaling in training.""" - - model = EvalModelTemplate() - + class CustomModel(EvalModelTemplate): + def configure_optimizers(self): + optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) + optimizer2 = optim.SGD(self.parameters(), lr=self.learning_rate) + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) + return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] + + model = CustomModel() trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -204,4 +209,7 @@ def test_amp_with_apex(tmpdir): assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert trainer.state == TrainerState.FINISHED - assert trainer.dev_debugger.count_events('AMP') == 10 + assert trainer.dev_debugger.count_events('AMP') == 20 + + assert isinstance(trainer.lr_schedulers[0]['scheduler'].optimizer, optim.Adam) + assert isinstance(trainer.lr_schedulers[1]['scheduler'].optimizer, optim.SGD) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2848ab2e74f3c..8fea2ab941418 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -11,23 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from distutils.version import LooseVersion import os import platform -from distutils.version import LooseVersion import pytest import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.step_result import TrainResult from tests.base import EvalModelTemplate +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): +def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" hparams = EvalModelTemplate.get_default_hparams() model = EvalModelTemplate(**hparams) @@ -44,7 +43,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): limit_train_batches=0.2, limit_val_batches=0.2, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) real_global_step = trainer.global_step @@ -81,7 +79,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): max_epochs=1, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate(**hparams) @@ -101,8 +98,7 @@ def assert_pred_same(): trainer.fit(model) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): +def test_early_stopping_cpu_model(tmpdir): """Test each of the trainer options.""" stopping = EarlyStopping(monitor='early_stop_on', min_delta=0.1) trainer_options = dict( @@ -114,7 +110,6 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): track_grad_norm=2, limit_train_batches=0.1, limit_val_batches=0.1, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -130,8 +125,7 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): @pytest.mark.skipif((platform.system() == "Darwin" and LooseVersion(torch.__version__) < LooseVersion("1.3.0")), reason="Distributed training is not supported on MacOS before Torch 1.3.0") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): +def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_master_port() @@ -144,7 +138,6 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): gpus=None, num_processes=2, accelerator='ddp_cpu', - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -284,8 +277,7 @@ def test_cpu_model(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): +def test_all_features_cpu_model(tmpdir): """Test each of the trainer options.""" trainer_options = dict( default_root_dir=tmpdir, @@ -297,7 +289,6 @@ def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.4, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 2393b42d27191..e34648671e12d 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -49,7 +49,7 @@ def test_multi_gpu_none_backend(tmpdir): tpipes.run_model_test(trainer_options, model) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test 
requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.parametrize('gpus', [1, [0], [1]]) def test_single_gpu_model(tmpdir, gpus): """Make sure single GPU works (DP mode).""" diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index f3af5b745a380..5352e749c5e55 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +import os +from unittest.mock import MagicMock import pytest import torch -from unittest.mock import MagicMock from pytorch_lightning import Trainer from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator -from tests.base import EvalModelTemplate, BoringModel +from tests.base import BoringModel, EvalModelTemplate, RandomDataset @pytest.mark.parametrize('max_steps', [1, 2, 3]) @@ -124,6 +125,49 @@ def transfer_batch_to_device(self, data, device): assert batch_gpu.samples.device == batch_gpu.targets.device == expected +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_transfer_batch_hook_ddp(tmpdir): + """ + Test custom data are properly moved to the right device using ddp + """ + + class CustomBatch: + + def __init__(self, data): + self.samples = data[0] + + def to(self, device, **kwargs): + self.samples = self.samples.to(device, **kwargs) + return self + + def collate_fn(batch): + return CustomBatch(batch) + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + assert batch.samples.device == self.device + assert isinstance(batch_idx, int) + + def train_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64), collate_fn=collate_fn) + + model = TestModel() + model.validation_step = None + model.training_epoch_end = None + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=0, + max_epochs=1, + weights_summary=None, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + @pytest.mark.parametrize( 'max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)] @@ -348,8 +392,6 @@ def on_test_model_train(self): expected = [ 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', 'on_test_model_eval', 'on_test_epoch_start', 'on_test_batch_start', diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index f47c13021edde..a047bfde6f7a2 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -20,18 +20,18 @@ import numpy as np import pytest -import torch from sklearn.metrics import accuracy_score +import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE, _module_available +from pytorch_lightning.utilities import _module_available, APEX_AVAILABLE, HOROVOD_AVAILABLE, NATIVE_AMP_AVAILABLE from tests.base import EvalModelTemplate from tests.base.boring_model import BoringModel +import tests.base.develop_pipelines 
as tpipes +import tests.base.develop_utils as tutils from tests.base.models import BasicGAN if HOROVOD_AVAILABLE: @@ -69,8 +69,7 @@ def _run_horovod(trainer_options, on_gpu=False): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu(enable_pl_optimizer, tmpdir): +def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -82,14 +81,12 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, accelerator='horovod', deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): +def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -100,7 +97,6 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @@ -206,8 +202,7 @@ def validation_step(self, batch, *args, **kwargs): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): +def test_horovod_multi_optimizer(tmpdir): model = BasicGAN(**EvalModelTemplate.get_default_hparams()) # fit model @@ -219,7 +214,6 @@ def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, deterministic=True, accelerator='horovod', - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) assert result == 1, 'model failed to complete' @@ -241,8 +235,7 @@ def get_optimizer_params(optimizer): @pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_result_reduce_horovod(enable_pl_optimizer, tmpdir): +def test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. 
This test mirrors tests/core/test_results.py::_ddp_test_fn @@ -282,7 +275,6 @@ def training_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - enable_pl_optimizer=enable_pl_optimizer, ) trainer.fit(model) diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 7df78d9760bd9..e354c6e708d95 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -15,19 +15,25 @@ import os import pickle from argparse import Namespace +from copy import deepcopy import cloudpickle import pytest import torch from fsspec.implementations.local import LocalFileSystem -from omegaconf import OmegaConf, Container +from omegaconf import Container, OmegaConf +from omegaconf.dictconfig import DictConfig from torch.nn import functional as F from torch.utils.data import DataLoader -from pytorch_lightning import Trainer, LightningModule -from pytorch_lightning.core.saving import save_hparams_to_yaml, load_hparams_from_yaml -from pytorch_lightning.utilities import AttributeDict, is_picklable -from tests.base import EvalModelTemplate, TrialMNIST, BoringModel +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml +from pytorch_lightning.utilities import AttributeDict, HYDRA_EXPERIMENTAL_AVAILABLE, is_picklable +from tests.base import BoringModel, EvalModelTemplate, TrialMNIST + +if HYDRA_EXPERIMENTAL_AVAILABLE: + from hydra.experimental import compose, initialize class SaveHparamsModel(BoringModel): @@ -483,13 +489,13 @@ def test_hparams_save_yaml(tmpdir): path_yaml = os.path.join(tmpdir, 'testing-hparams.yaml') save_hparams_to_yaml(path_yaml, hparams) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, Namespace(**hparams)) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, AttributeDict(hparams)) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, OmegaConf.create(hparams)) assert load_hparams_from_yaml(path_yaml) == hparams @@ -642,3 +648,46 @@ def test_model_with_fsspec_as_parameter(tmpdir): ) trainer.fit(model) trainer.test() + + +@pytest.mark.skipif(not HYDRA_EXPERIMENTAL_AVAILABLE, reason="Hydra experimental is not available") +def test_model_save_hyper_parameters_interpolation_with_hydra(tmpdir): + """ + This test relies on configuration saved under tests/models/conf/config.yaml + """ + + class TestHydraModel(BoringModel): + + def __init__(self, args_0, args_1, args_2, kwarg_1=None): + self.save_hyperparameters() + self.test_hparams() + config_file = f"{tmpdir}/hparams.yaml" + save_hparams_to_yaml(config_file, self.hparams) + self.hparams = load_hparams_from_yaml(config_file) + self.test_hparams() + super().__init__() + + def test_hparams(self): + assert self.hparams.args_0.log == "Something" + assert self.hparams.args_1['cfg'].log == "Something" + assert self.hparams.args_2[0].log == "Something" + assert self.hparams.kwarg_1['cfg'][0].log == "Something" + + with initialize(config_path="conf"): + args_0 = compose(config_name="config") + args_1 = {"cfg": compose(config_name="config")} + args_2 = [compose(config_name="config")] + kwarg_1 = {"cfg": 
[compose(config_name="config")]} + model = TestHydraModel(args_0, args_1, args_2, kwarg_1=kwarg_1) + epochs = 2 + checkpoint_callback = ModelCheckpoint(monitor=None, dirpath=tmpdir, save_top_k=-1) + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[checkpoint_callback], + limit_train_batches=10, + limit_val_batches=10, + max_epochs=epochs, + logger=False, + ) + trainer.fit(model) + _ = TestHydraModel.load_from_checkpoint(checkpoint_callback.best_model_path) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 17821570bdfa7..ded9deb0d0a45 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy import glob import logging as log import os import pickle -from copy import deepcopy import cloudpickle import pytest @@ -23,11 +23,11 @@ from torch.nn import functional as F from torch.utils.data import DataLoader +from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything -from pytorch_lightning.callbacks import ModelCheckpoint -from tests.base import EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST +from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST class ModelTrainerPropertyParity(Callback): @@ -52,8 +52,7 @@ def on_train_end(self, trainer, pl_module): self._check_properties(trainer, pl_module) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. 
""" model = EvalModelTemplate() @@ -62,7 +61,6 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): default_root_dir=tmpdir, max_epochs=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions ) trainer = Trainer(**trainer_args) @@ -73,6 +71,25 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): trainer.fit(model) +def test_try_resume_from_non_existing_checkpoint(tmpdir): + """ Test that trying to resume from non-existing `resume_from_checkpoint` fail without error.""" + model = BoringModel() + checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + logger=False, + callbacks=[checkpoint_cb], + limit_train_batches=0.1, + limit_val_batches=0.1, + ) + # Generate checkpoint `last.ckpt` with BoringModel + trainer.fit(model) + # `True` if resume/restore successfully else `False` + assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu) + assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu) + + class CaptureCallbacksBeforeTraining(Callback): callbacks = [] @@ -80,8 +97,7 @@ def on_train_start(self, trainer, pl_module): self.callbacks = deepcopy(trainer.callbacks) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_state_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint restores callbacks that persist state. """ model = EvalModelTemplate() callback_capture = CaptureCallbacksBeforeTraining() @@ -92,7 +108,6 @@ def get_trainer_args(): default_root_dir=tmpdir, max_steps=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[ checkpoint, callback_capture, @@ -119,11 +134,10 @@ def get_trainer_args(): assert before.best_model_score == after.best_model_score -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. 
""" model = EvalModelTemplate() - args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False, "enable_pl_optimizer": enable_pl_optimizer} + args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} # initial training checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index fd771c98635ab..dd54c6b5d654e 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -16,8 +16,9 @@ import torch.nn as nn import torch.nn.functional as F -from pytorch_lightning import Trainer, seed_everything, LightningModule +from pytorch_lightning import LightningModule, seed_everything, Trainer from pytorch_lightning.core.step_result import TrainResult +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import FLOAT16_EPSILON from tests.base.datamodules import MNISTDataModule from tests.base.develop_utils import set_random_master_port @@ -108,6 +109,7 @@ def test_sync_batchnorm_ddp(tmpdir): sync_batchnorm=True, num_sanity_val_steps=0, replace_sampler_ddp=False, + plugins=[DDPPlugin(find_unused_parameters=True)] ) result = trainer.fit(model, dm) diff --git a/tests/models/test_torchscript.py b/tests/models/test_torchscript.py index 3c43b201f52e4..75e1ec7724967 100644 --- a/tests/models/test_torchscript.py +++ b/tests/models/test_torchscript.py @@ -18,7 +18,7 @@ from tests.base import BoringModel from tests.base.datamodules import TrialMNISTDataModule -from tests.base.models import ParityModuleRNN, BasicGAN +from tests.base.models import BasicGAN, ParityModuleRNN @pytest.mark.parametrize("modelclass", [ @@ -116,10 +116,10 @@ def test_torchscript_retain_training_state(): ParityModuleRNN, BasicGAN, ]) -def test_torchscript_properties(modelclass): +def test_torchscript_properties(tmpdir, modelclass): """ Test that scripted LightningModule has unnecessary methods removed. 
""" model = modelclass() - model.datamodule = TrialMNISTDataModule() + model.datamodule = TrialMNISTDataModule(tmpdir) script = model.to_torchscript() assert not hasattr(script, "datamodule") assert not hasattr(model, "batch_size") or hasattr(script, "batch_size") diff --git a/tests/plugins/test_ddp_sequential_plugin.py b/tests/plugins/test_ddp_sequential_plugin.py index 23b0b9128b349..0f5b78f71da50 100644 --- a/tests/plugins/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_ddp_sequential_plugin.py @@ -47,7 +47,8 @@ def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None): limit_test_batches=2, gpus=2, distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 1])], + plugins=[DDPSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)], + enable_pl_optimizer=True, ) trainer.fit(model) @@ -148,6 +149,7 @@ class SequentialModelRPCManual(LightningModule): def __init__(self): super().__init__() self.sequential_module = nn.Sequential(torch.nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2)) + self.automatic_optimization = False def forward(self, x): return self.sequential_module(x) @@ -194,19 +196,14 @@ def val_dataloader(self): def test_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64)) - @property - def automatic_optimization(self) -> bool: - return False - class SequentialModelRPCAutomatic(SequentialModelRPCManual): + def __init__(self): + super().__init__() + self.automatic_optimization = True def training_step(self, batch, batch_idx): output = self.sequential_module(batch) loss = self.loss(output) self.log("train_loss", loss, on_epoch=True, prog_bar=True) return loss - - @property - def automatic_optimization(self) -> bool: - return True diff --git a/tests/special_tests.sh b/tests/special_tests.sh index f7cb581951783..a86243628e914 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # Running special tests +set -e export PL_RUNNING_SPECIAL_TESTS=1 DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no" python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp @@ -19,4 +20,5 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic -# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance +python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp +python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 3bce379c1115c..4728b11582dfc 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import time from pathlib import Path diff --git a/tests/trainer/dynamic_args/test_multiple_optimizers.py b/tests/trainer/dynamic_args/test_multiple_optimizers.py index 48b1bf6ab7ac9..6b8219c673009 100644 --- a/tests/trainer/dynamic_args/test_multiple_optimizers.py +++ b/tests/trainer/dynamic_args/test_multiple_optimizers.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import torch + from pytorch_lightning import Trainer from tests.base.boring_model import BoringModel -import torch def test_multiple_optimizers(tmpdir): @@ -68,6 +69,10 @@ def test_multiple_optimizers_manual(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def on_train_epoch_start(self) -> None: self.opt_0_seen = False self.opt_1_seen = False @@ -97,10 +102,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 00c62cdf48fce..2eaa6fd7f888d 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -1,14 +1,20 @@ +import os +from unittest import mock + import pytest +import torch + from pytorch_lightning import Trainer -from tests.base import EvalModelTemplate +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from pytorch_lightning.loggers.base import DummyLogger +from tests.base import BoringModel @pytest.mark.parametrize('tuner_alg', ['batch size scaler', 'learning rate finder']) def test_skip_on_fast_dev_run_tuner(tmpdir, tuner_alg): """ Test that tuner algorithms are skipped if fast dev run is enabled """ - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, @@ -19,3 +25,100 @@ def test_skip_on_fast_dev_run_tuner(tmpdir, tuner_alg): expected_message = f'Skipping {tuner_alg} since fast_dev_run is enabled.' 
     with pytest.warns(UserWarning, match=expected_message):
         trainer.tune(model)
+
+
+@pytest.mark.parametrize('fast_dev_run', [1, 4])
+@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
+def test_callbacks_and_logger_not_called_with_fastdevrun(tmpdir, fast_dev_run):
+    """
+    Test that ModelCheckpoint, EarlyStopping and Logger are turned off with fast_dev_run
+    """
+    class FastDevRunModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.training_step_call_count = 0
+            self.training_epoch_end_call_count = 0
+            self.validation_step_call_count = 0
+            self.validation_epoch_end_call_count = 0
+            self.test_step_call_count = 0
+
+        def training_step(self, batch, batch_idx):
+            self.log('some_metric', torch.tensor(7.))
+            self.logger.experiment.dummy_log('some_distribution', torch.randn(7) + batch_idx)
+            self.training_step_call_count += 1
+            return super().training_step(batch, batch_idx)
+
+        def training_epoch_end(self, outputs):
+            self.training_epoch_end_call_count += 1
+            super().training_epoch_end(outputs)
+
+        def validation_step(self, batch, batch_idx):
+            self.validation_step_call_count += 1
+            return super().validation_step(batch, batch_idx)
+
+        def validation_epoch_end(self, outputs):
+            self.validation_epoch_end_call_count += 1
+            super().validation_epoch_end(outputs)
+
+        def test_step(self, batch, batch_idx):
+            self.test_step_call_count += 1
+            return super().test_step(batch, batch_idx)
+
+    checkpoint_callback = ModelCheckpoint()
+    early_stopping_callback = EarlyStopping()
+    trainer_config = dict(
+        fast_dev_run=fast_dev_run,
+        val_check_interval=2,
+        logger=True,
+        log_every_n_steps=1,
+        callbacks=[checkpoint_callback, early_stopping_callback],
+    )
+
+    def _make_fast_dev_run_assertions(trainer, model):
+        # check the call count for train/val/test step/epoch
+        assert model.training_step_call_count == fast_dev_run
+        assert model.training_epoch_end_call_count == 1
+        assert model.validation_step_call_count == (0 if model.validation_step is None else fast_dev_run)
+        assert model.validation_epoch_end_call_count == (0 if model.validation_step is None else 1)
+        assert model.test_step_call_count == fast_dev_run
+
+        # check trainer arguments
+        assert trainer.max_steps == fast_dev_run
+        assert trainer.num_sanity_val_steps == 0
+        assert trainer.max_epochs == 1
+        assert trainer.val_check_interval == 1.0
+        assert trainer.check_val_every_n_epoch == 1
+
+        # there should be no logger with fast_dev_run
+        assert isinstance(trainer.logger, DummyLogger)
+        assert len(trainer.dev_debugger.logged_metrics) == fast_dev_run
+
+        # checkpoint callback should not have been called with fast_dev_run
+        assert trainer.checkpoint_callback == checkpoint_callback
+        assert not os.path.exists(checkpoint_callback.dirpath)
+        assert len(trainer.dev_debugger.checkpoint_callback_history) == 0
+
+        # early stopping should not have been called with fast_dev_run
+        assert trainer.early_stopping_callback == early_stopping_callback
+        assert len(trainer.dev_debugger.early_stopping_history) == 0
+
+    train_val_step_model = FastDevRunModel()
+    trainer = Trainer(**trainer_config)
+    results = trainer.fit(train_val_step_model)
+    trainer.test(ckpt_path=None)
+
+    assert results
+    _make_fast_dev_run_assertions(trainer, train_val_step_model)
+
+    # -----------------------
+    # also called once with no val step
+    # -----------------------
+    train_step_only_model = FastDevRunModel()
+    train_step_only_model.validation_step = None
+
+    trainer = Trainer(**trainer_config)
+    results = trainer.fit(train_step_only_model)
+    trainer.test(ckpt_path=None)
+
+    assert
results + _make_fast_dev_run_assertions(trainer, train_step_only_model) diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 9e2023d27d928..3a9a87f84e5d9 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -15,8 +15,9 @@ Tests to ensure that the training loop works with a dict """ import os -from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning import Trainer +from pytorch_lightning.core.lightning import LightningModule from tests.base.deterministic_model import DeterministicModel @@ -43,7 +44,7 @@ def backward(self, loss, optimizer, optimizer_idx): # out are the results of the full loop # eval_results are output of _evaluate - out, eval_results = trainer.run_evaluation(test_mode=False) + out, eval_results = trainer.run_evaluation() assert len(out) == 1 assert len(eval_results) == 0 @@ -74,7 +75,7 @@ def test_validation_step_scalar_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - out, eval_results = trainer.run_evaluation(test_mode=False) + out, eval_results = trainer.run_evaluation() assert len(out) == 1 assert len(eval_results) == 2 assert eval_results[0] == 171 and eval_results[1] == 171 @@ -106,7 +107,7 @@ def test_validation_step_arbitrary_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(eval_results) == 2 assert eval_results[0]['some'] == 171 @@ -144,7 +145,7 @@ def test_validation_step_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 5 assert len(eval_results) == 2 @@ -186,7 +187,7 @@ def test_val_step_step_end_no_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(eval_results) == 0 @@ -218,7 +219,7 @@ def test_val_step_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 @@ -264,7 +265,7 @@ def test_no_val_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 assert len(eval_results) == 1 @@ -308,7 +309,7 @@ def test_full_val_loop(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 
assert len(callback_metrics[0]) == 7 assert len(eval_results) == 1 diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index 76baa9237955f..53636bed66f56 100644 --- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -25,7 +25,7 @@ import torch from torch.utils.data import DataLoader, Dataset -from pytorch_lightning import Trainer, callbacks, seed_everything +from pytorch_lightning import callbacks, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.loggers import TensorBoardLogger @@ -292,7 +292,7 @@ def validation_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - callbacks=[ModelCheckpoint(dirpath='val_loss')], + callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) trainer.fit(model) @@ -813,7 +813,7 @@ def validation_step(self, batch, batch_idx): def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) - self.log('fake_test_acc', loss) + self.log('test_loss', loss) return {"y": loss} model = ExtendedModel() @@ -825,7 +825,7 @@ def test_step(self, batch, batch_idx): logger=TensorBoardLogger(tmpdir), limit_train_batches=2, limit_val_batches=2, - limit_test_batches=0, + limit_test_batches=2, max_epochs=2, progress_bar_refresh_rate=1, ) @@ -877,33 +877,15 @@ def get_metrics_at_idx(idx): expected = torch.stack(model.val_losses[4:]).mean() assert get_metrics_at_idx(6)["valid_loss_1"] == expected - -def test_progress_bar_dict_contains_values_on_test_epoch_end(tmpdir): - class TestModel(BoringModel): - def test_step(self, *args): - self.log("foo", torch.tensor(self.current_epoch), on_step=False, on_epoch=True, prog_bar=True) - - def test_epoch_end(self, *_): - self.epoch_end_called = True - self.log('foo_2', torch.tensor(self.current_epoch), prog_bar=True, - on_epoch=True, sync_dist=True, sync_dist_op='sum') - - def on_test_epoch_end(self, *_): - self.on_test_epoch_end_called = True - assert self.trainer.progress_bar_dict["foo"] == self.current_epoch - assert self.trainer.progress_bar_dict["foo_2"] == self.current_epoch - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=1, - num_sanity_val_steps=2, - checkpoint_callback=False, - logger=False, - weights_summary=None, - progress_bar_refresh_rate=0, - ) - model = TestModel() - trainer.test(model) - assert model.epoch_end_called - assert model.on_test_epoch_end_called + results = trainer.test(model) + expected_callback_metrics = { + 'train_loss', + 'valid_loss_0_epoch', + 'valid_loss_0', + 'debug_epoch', + 'valid_loss_1', + 'test_loss', + 'val_loss' + } + assert set(trainer.callback_metrics) == expected_callback_metrics + assert set(results[0]) == {'test_loss', 'debug_epoch'} diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 0c27d8909d760..617cd6fa3cbd1 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -18,17 +18,22 @@ import collections import itertools import os +import platform from unittest import mock import numpy as np import pytest import torch -from torch.utils.data import Dataset +from torch.nn import functional as F +from torch.utils.data import DataLoader, Dataset, random_split +from 
torchvision import transforms +from torchvision.datasets.mnist import MNIST import pytorch_lightning as pl -from pytorch_lightning import Trainer, callbacks -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.loggers import WandbLogger from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel @@ -685,6 +690,7 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') return acc def validation_step(self, batch, batch_idx): @@ -704,9 +710,46 @@ def validation_step(self, batch, batch_idx): trainer.fit(model) assert trainer.logged_metrics['foo'] == fake_result + assert trainer.logged_metrics['foo_2'] == 2 assert trainer.logged_metrics['bar'] == fake_result +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_logging_sync_dist_true_ddp(tmpdir): + """ + Tests to ensure that the sync_dist flag works with ddp + """ + class TestLoggingSyncDistModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM') + return acc + + def validation_step(self, batch, batch_idx): + self.training_step_called = True + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG') + return {"x": loss} + + model = TestLoggingSyncDistModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + weights_summary=None, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + assert trainer.logged_metrics['foo'] == 2 + assert trainer.logged_metrics['bar'] == 2 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_logging_sync_dist_true_gpu(tmpdir): """ @@ -771,3 +814,48 @@ def on_train_epoch_end(self, *_): trainer.fit(model) assert model.epoch_end_called assert model.on_train_epoch_end_called + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") +def test_metric_are_properly_reduced(tmpdir): + class TestingModel(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + + def training_step(self, batch, batch_idx): + self.train_acc(torch.rand(1, 3, device=self.device), torch.randint(0, 2, (1,), device=self.device)) + self.log('train_acc', self.train_acc, on_step=True, on_epoch=True) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + preds = torch.tensor(0, device=self.device) + targets = torch.tensor(1, device=self.device) + if batch_idx < 8: + targets = preds + self.val_acc(preds, targets) + self.log('val_acc', self.val_acc, on_step=True, on_epoch=True) + return super().validation_step(batch, batch_idx) + + early_stop = 
EarlyStopping(monitor='val_acc', mode='max') + + checkpoint = ModelCheckpoint( + monitor='val_acc', + save_last=True, + save_top_k=2, + mode='max', + ) + + model = TestingModel() + trainer = Trainer( + default_root_dir=tmpdir, + gpus=1, + max_epochs=2, + limit_train_batches=5, + limit_val_batches=32, + callbacks=[early_stop, checkpoint]) + trainer.fit(model) + + assert trainer.callback_metrics["val_acc"] == 8 / 32. + assert "train_acc" in trainer.callback_metrics diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 33d14e852b285..2fc6cb60c7fb0 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -21,7 +21,7 @@ import torch.distributed as torch_distrib import torch.nn.functional as F -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel @@ -33,6 +33,11 @@ def test_multiple_optimizers_manual(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -69,10 +74,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -98,6 +99,10 @@ def test_multiple_optimizers_manual_return(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -136,10 +141,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -165,6 +166,10 @@ def test_multiple_optimizers_manual_return_and_log(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -204,10 +209,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -238,6 +239,10 @@ def test_multiple_optimizers_manual_native_amp(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -274,10 +279,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ 
-307,6 +308,10 @@ def test_multiple_optimizers_manual_apex(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -347,10 +352,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -380,6 +381,10 @@ class ManualOptimizationExtendedModel(BoringModel): called = collections.defaultdict(int) detach = False + def __init__(self): + super().__init__() + self.automatic_optimization = False + @property def should_update(self): return self.count % 2 == 0 @@ -427,10 +432,6 @@ def on_train_end(self): assert self.called["on_train_batch_start"] == 10 assert self.called["on_train_batch_end"] == 10 - @property - def automatic_optimization(self) -> bool: - return False - @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -454,7 +455,6 @@ def test_manual_optimization_and_return_tensor(tmpdir): amp_backend='native', accelerator="ddp_spawn", gpus=2, - enable_pl_optimizer=True ) trainer.fit(model) @@ -503,6 +503,10 @@ class ExtendedModel(BoringModel): called = collections.defaultdict(int) detach = False + def __init__(self): + super().__init__() + self.automatic_optimization = False + @property def should_update(self): return self.count % 2 == 0 @@ -555,10 +559,6 @@ def on_train_end(self): assert self.called["on_train_batch_start"] == 20 assert self.called["on_train_batch_end"] == 20 - @property - def automatic_optimization(self) -> bool: - return False - model = ExtendedModel() model.training_step_end = None model.training_epoch_end = None @@ -573,7 +573,6 @@ def automatic_optimization(self) -> bool: amp_backend='native', accumulate_grad_batches=4, gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -588,6 +587,10 @@ class TestModel(BoringModel): called = False + def __init__(self): + super().__init__() + self.automatic_optimization = False + def on_after_backward(self): self.called = True norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) @@ -630,10 +633,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -648,7 +647,6 @@ def automatic_optimization(self) -> bool: precision=16, amp_backend='native', gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -658,16 +656,20 @@ def automatic_optimization(self) -> bool: assert model.called +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_step_with_optimizer_closure(tmpdir): """ Tests that `step` works with optimizer_closure """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): _losses = [] + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual @@ -715,10 +717,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None 
model.training_epoch_end = None @@ -730,7 +728,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -739,13 +736,17 @@ def automatic_optimization(self) -> bool: assert trainer.logger_connector.progress_bar_metrics["train_loss_epoch"] == torch.stack(model._losses).mean() +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_step_with_optimizer_closure_and_accumulated_grad(tmpdir): """ Tests that `step` works with optimizer_closure and accumulated_grad """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual opt = self.optimizers() @@ -779,10 +780,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -795,21 +792,24 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * 2 +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.SGD.step") def test_step_with_optimizer_closure_and_extra_arguments(step_mock, tmpdir): """ Tests that `step` works with optimizer_closure and extra arguments """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual opt = self.optimizers() @@ -835,10 +835,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -851,7 +847,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -859,15 +854,19 @@ def automatic_optimization(self) -> bool: step_mock.assert_has_calls(expected_calls) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.Adam.step") @patch("torch.optim.SGD.step") def test_step_with_optimizer_closure_with_different_frequencies(mock_sgd_step, mock_adam_step, tmpdir): """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # emulate gans training @@ -913,10 +912,6 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -929,7 +924,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -939,6 +933,7 @@ def automatic_optimization(self) -> bool: mock_adam_step.assert_has_calls(expected_calls) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": 
"1"}) @patch("torch.optim.Adam.step") @patch("torch.optim.SGD.step") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -947,9 +942,11 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_ste """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def loss_ones(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls @@ -1019,10 +1016,6 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] - @property - def automatic_optimization(self) -> bool: - return False - seed_everything(42) model = TestModel() @@ -1037,7 +1030,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, gpus=2, accelerator="ddp", ) @@ -1048,35 +1040,3 @@ def automatic_optimization(self) -> bool: expected_calls = [call(closure=ANY, optim='adam')] * 2 mock_adam_step.assert_has_calls(expected_calls) - - -def test_step_with_misconfiguraiton_error_when_overriding_optimizer_zero_grad(tmpdir): - """ - Tests that `optimizer_zero_grad` in manual_optimization triggers a MisconfigurationException - """ - try: - class TestModel(BoringModel): - - def optimizer_zero_grad(self, *_): - pass - - @property - def automatic_optimization(self) -> bool: - return False - - model = TestModel() - model.val_dataloader = None - model.training_epoch_end = None - - limit_train_batches = 8 - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=limit_train_batches, - limit_val_batches=2, - max_epochs=1, - log_every_n_steps=1, - accumulate_grad_batches=2, - enable_pl_optimizer=True, - ) - except MisconfigurationException as e: - assert "`Trainer(enable_pl_optimizer=True, ...) is not supported" in str(e) diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py new file mode 100644 index 0000000000000..a26accfab106f --- /dev/null +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -0,0 +1,70 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests to ensure that the behaviours related to multiple optimizers works +""" +import torch + +import pytorch_lightning as pl +from tests.base.boring_model import BoringModel + + +def test_unbalanced_logging_with_multiple_optimizers(tmpdir): + """ + This tests ensures reduction works in unbalanced logging settings, + even when a Callback also logs. 
+ """ + class TestModel(BoringModel): + actual = {0: [], 1: []} + + def training_step(self, batch, batch_idx, optimizer_idx): + out = super().training_step(batch, batch_idx) + loss = out["loss"] + self.log(f"loss_{optimizer_idx}", loss, on_epoch=True) + self.actual[optimizer_idx].append(loss) + return out + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) + optimizer2 = torch.optim.SGD(self.layer.parameters(), lr=0.001) + return [optimizer, optimizer2] + + model = TestModel() + model.training_epoch_end = None + + class TestCallback(pl.Callback): + def on_train_batch_end(self, trainer, pl_module, output, batch, batch_idx, dl_idx): + # when this is called, the EpochResultStore state has not been reset yet because we are still + # "INSIDE_BATCH_TRAIN_LOOP" and the LoggerConnector runs its `on_train_batch_end` after the + # Callback (see `TrainLoop.on_train_batch_end`). For this reason, opt_idx here is the index + # of the last optimizer updated (the second, index 1). This produced a KeyError as reported in #5459 + pl_module.log("test_train_batch_end", trainer.logger_connector.cached_results._opt_idx) + + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + limit_val_batches=5, + callbacks=[TestCallback()], + weights_summary=None, + ) + trainer.fit(model) + + for k, v in model.actual.items(): + assert torch.equal(trainer.callback_metrics[f"loss_{k}_step"], v[-1]) + # test loss is properly reduced + torch.testing.assert_allclose(trainer.callback_metrics[f"loss_{k}_epoch"], torch.tensor(v).mean()) + + assert trainer.callback_metrics["test_train_batch_end"] == len(model.optimizers()) - 1 diff --git a/tests/trainer/optimization/test_parity_automatic_optimization.py b/tests/trainer/optimization/test_parity_automatic_optimization.py new file mode 100644 index 0000000000000..4f5cc855a3164 --- /dev/null +++ b/tests/trainer/optimization/test_parity_automatic_optimization.py @@ -0,0 +1,407 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import Callable +from copy import deepcopy +from typing import Optional +from unittest.mock import patch + +import numpy as np +import pytest +import torch +from torch.optim import Optimizer + +import pytorch_lightning as pl +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.core.optimizer import LightningOptimizer +from tests.base.boring_model import BoringModel + +# TODO: +# For both automatic / manual optimization +# - Test dp, ddp, ddp2 +# - Apex +# - Random accumulated_grad_batches (bug) +# - Multiple optimizers + + +class BaseParityAutomaticOptimizationModel(BoringModel): + + def __init__( + self, + optimizer_cls, + optimizer_is_mocked=False, + accumulate_grad_batches=None, + lr=0.1 + ): + super().__init__() + self.optimizer_cls = optimizer_cls + self.losses = [] + self.grads = [] + self.on_before_zero_grad_count = 0 + self.optimizer_is_mocked = optimizer_is_mocked + self.grad_checked = False + self.accumulate_grad_batches = accumulate_grad_batches + self.lr = lr + + def on_before_zero_grad(self, optimizer): + self.on_before_zero_grad_count += 1 + if self.layer.weight.grad is not None: + self.grads.append(self.layer.weight.grad.clone()) + + def configure_optimizers(self): + optimizer = self.optimizer_cls(self.layer.parameters(), lr=self.lr) + assert isinstance(optimizer, Optimizer) + return optimizer + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + return {"loss": loss} + + +class AutomaticOptimizationPurePytorchOptimizerModel(BaseParityAutomaticOptimizationModel): + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + return {"loss": loss} + + def optimizer_step( + self, + epoch: int = None, + batch_idx: int = None, + optimizer: Optimizer = None, + optimizer_idx: int = None, + optimizer_closure: Optional[Callable] = None, + on_tpu: bool = None, + using_native_amp: bool = None, + using_lbfgs: bool = None, + ) -> None: + """ + Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard + """ + # Get the unwrapped optimizer + optimizer = optimizer.optimizer + assert not isinstance(optimizer, LightningOptimizer) + + optimizer_closure() + assert self.trainer.accumulate_grad_batches == 1 + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + optimizer.step() + + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +class AutomaticOptimizationPurePytorchAMPOptimizerModel(BaseParityAutomaticOptimizationModel): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.scaler = torch.cuda.amp.GradScaler() + + def training_step(self, batch, batch_idx): + with torch.cuda.amp.autocast(): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss = self.scaler.scale(loss) + return {"loss": loss} + + def optimizer_step( + self, + epoch: int = None, + batch_idx: int = None, + optimizer: Optimizer = None, + optimizer_idx: int = None, + optimizer_closure: 
Optional[Callable] = None, + on_tpu: bool = None, + using_native_amp: bool = None, + using_lbfgs: bool = None, + ) -> None: + """ + Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard + """ + # Get the unwrapped optimizer + optimizer = optimizer.optimizer + assert not isinstance(optimizer, LightningOptimizer) + + optimizer_closure() + assert self.trainer.accumulate_grad_batches == 1 + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.scaler.unscale_(optimizer) + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + self.scaler.step(optimizer) + self.scaler.update() + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +def should_accumulate(trainer, accumulate_grad_batches): + accumulation_done = (trainer.batch_idx + 1) == trainer.num_training_batches + is_final_batch = (trainer.batch_idx + 1) % accumulate_grad_batches == 0 + return not (accumulation_done or is_final_batch) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), + pytest.param(16, "native", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1, 7]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + if accumulate_grad_batches > 1: + accumulate_grad_batches = np.random.randint(1, accumulate_grad_batches) + + vanilla_model_cls = AutomaticOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else AutomaticOptimizationPurePytorchOptimizerModel + + run_lightning_optimizer_equality( + BaseParityAutomaticOptimizationModel, + vanilla_model_cls, + precision=precision, + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + accumulate_grad_batches=accumulate_grad_batches, + amp_backend=amp_backend, + gpus=gpus + ) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_calls( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + vanilla_model_cls = AutomaticOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else AutomaticOptimizationPurePytorchOptimizerModel + + with patch("torch.optim.SGD.step") as mock_sgd_step, \ + patch("torch.optim.Adam.step") as mock_adam_step, \ + patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: + + max_epochs = 2 + limit_train_batches = 10 + + # Run equality test using Lightning Optimizer + run_lightning_optimizer_equality( + BaseParityAutomaticOptimizationModel, + vanilla_model_cls, + default_root_dir=tmpdir, + optimizer_is_mocked=True, + accumulate_grad_batches=accumulate_grad_batches, + max_epochs=max_epochs, + limit_train_batches=limit_train_batches, + amp_backend=amp_backend, + precision=precision, + gpus=gpus + ) + + expected_num_batches = max_epochs * limit_train_batches + assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_step.call_count == mock_adam_step.call_count + assert 
mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count
+
+
+def train_with_restore(tmpdir, model_cls, restore_from=None):
+    # init model
+    if restore_from is not None:
+        seed_everything(42)
+    model = model_cls(torch.optim.Adam, accumulate_grad_batches=1, lr=10e-1)
+    ckpt_saver = ModelCheckpoint(dirpath=f"{tmpdir}/mckpt", save_last=True, save_weights_only=False)
+    # Initialize a trainer
+    trainer = pl.Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=(1 + bool(restore_from)),
+        limit_train_batches=8,
+        callbacks=([ckpt_saver] if restore_from is None else []),
+        checkpoint_callback=(not restore_from),
+        resume_from_checkpoint=restore_from,
+        num_sanity_val_steps=0,
+    )
+
+    # Train the model
+    trainer.fit(model)
+    return ckpt_saver.best_model_path, model
+
+
+def test_parity_checkpointing(tmpdir):
+    """
+    This test asserts that reloading a checkpoint and fine-tuning gives the same result
+    with / without LightningOptimizer
+    """
+
+    # Initial train run of the model.
+    seed_everything(0)
+    ckpt_path, first_epoch_pl_optimizer_model = train_with_restore(
+        tmpdir,
+        model_cls=BaseParityAutomaticOptimizationModel,
+        restore_from=None)
+
+    assert "last" in ckpt_path
+    _, second_epoch_pl_optimizer_model = train_with_restore(
+        tmpdir,
+        model_cls=BaseParityAutomaticOptimizationModel,
+        restore_from=ckpt_path)
+
+    seed_everything(0)
+    ckpt_path, first_epoch_pure_pytorch_optimizer_model = train_with_restore(
+        tmpdir,
+        model_cls=AutomaticOptimizationPurePytorchOptimizerModel,
+        restore_from=None)
+
+    _, second_epoch_pure_pytorch_optimizer_model = train_with_restore(
+        tmpdir,
+        model_cls=AutomaticOptimizationPurePytorchOptimizerModel,
+        restore_from=ckpt_path)
+
+    assert first_epoch_pl_optimizer_model.losses == first_epoch_pure_pytorch_optimizer_model.losses
+    assert second_epoch_pl_optimizer_model.losses == second_epoch_pure_pytorch_optimizer_model.losses
+
+
+def run_lightning_optimizer_equality(
+    lightning_model_cls,
+    vanilla_model_cls,
+    optimizer_is_mocked=False,
+    **trainer_kwargs,
+):
+
+    trainer_kwargs = {
+        "limit_val_batches": 0,
+        **trainer_kwargs
+    }
+    expected_num_batches = trainer_kwargs["max_epochs"] * trainer_kwargs["limit_train_batches"]
+    accumulate_grad_batches = trainer_kwargs["accumulate_grad_batches"]
+
+    pl_optimizer_initial_model_weights, pl_optimizer_model = train_specific_optimizer_model(
+        lightning_model_cls,
+        torch.optim.SGD,
+        expected_num_batches=expected_num_batches,
+        optimizer_is_mocked=optimizer_is_mocked,
+        **trainer_kwargs,
+    )
+
+    pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model = train_specific_optimizer_model(
+        vanilla_model_cls,
+        torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD,
+        expected_num_batches=expected_num_batches,
+        optimizer_is_mocked=optimizer_is_mocked,
+        replace_optimizer_step_with_pure_pytorch=True,
+        **trainer_kwargs,
+    )
+
+    if not optimizer_is_mocked:
+
+        assert_model_equality(
+            pl_optimizer_initial_model_weights=pl_optimizer_initial_model_weights,
+            pl_optimizer_model=pl_optimizer_model,
+            pure_pytorch_optimizer_initial_model_weights=pure_pytorch_optimizer_initial_model_weights,
+            pure_pytorch_optimizer_model=pure_pytorch_optimizer_model,
+            expected_num_batches=expected_num_batches,
+            precision=trainer_kwargs["precision"]
+        )
+
+
+def assert_model_equality(
+    pl_optimizer_initial_model_weights,
+    pl_optimizer_model,
+    pure_pytorch_optimizer_initial_model_weights,
+    pure_pytorch_optimizer_model,
+    expected_num_batches,
+    precision,
+):
+
+    assert torch.equal(pl_optimizer_initial_model_weights,
pure_pytorch_optimizer_initial_model_weights) + assert len(pl_optimizer_model.losses) == expected_num_batches + assert pure_pytorch_optimizer_model.grad_checked + assert not torch.isnan(torch.FloatTensor(pl_optimizer_model.losses)).any() + + for pytorch_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, + pl_optimizer_model.grads): + assert torch.equal(pytorch_grad, pl_optim_grad), 'Grad parameters are different' + + for pytorch_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), + pl_optimizer_model.parameters()): + assert torch.equal(pytorch_weight, pl_optim_weight), 'Model parameters are different' + + +# train function +def train_specific_optimizer_model( + model_cls, + optimizer_cls, + expected_num_batches, + optimizer_is_mocked=False, + replace_optimizer_step_with_pure_pytorch=False, + **trainer_kwargs, +): + + seed_everything(42) + trainer_kwargs = deepcopy(trainer_kwargs) + + model = model_cls( + optimizer_cls=optimizer_cls, + optimizer_is_mocked=optimizer_is_mocked, + accumulate_grad_batches=trainer_kwargs["accumulate_grad_batches"], + ) + + if replace_optimizer_step_with_pure_pytorch: + # When running pure vanilla training, accumulate_grad_batches should be 1. + trainer_kwargs["accumulate_grad_batches"] = 1 + trainer_kwargs["precision"] = 32 + + expected_global_step = expected_num_batches // trainer_kwargs["accumulate_grad_batches"] + + initial_weights = model.layer.weight.clone() + model.training_epoch_end = None + + trainer = Trainer( + **trainer_kwargs + ) + trainer.fit(model) + + assert np.abs(trainer.global_step - expected_global_step) <= 2 + return initial_weights, model diff --git a/tests/trainer/optimization/test_parity_manual_optimization.py b/tests/trainer/optimization/test_parity_manual_optimization.py new file mode 100644 index 0000000000000..08e4e9908f592 --- /dev/null +++ b/tests/trainer/optimization/test_parity_manual_optimization.py @@ -0,0 +1,207 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import Callable +from copy import deepcopy +from typing import Optional +from unittest.mock import patch + +import numpy as np +import pytest +import torch +from torch.optim import Optimizer + +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.core.optimizer import LightningOptimizer +from tests.base.boring_model import BoringModel +from tests.trainer.optimization.test_parity_automatic_optimization import ( + assert_model_equality, + run_lightning_optimizer_equality, + should_accumulate, +) + +""" +TODO: +For both Manual / manual optimization + - Test dp, ddp, ddp2 + - Apex + - Random accumulated_grad_batches (bug) + - Multiple optimizers +""" + + +class BaseParityManualOptimizationModel(BoringModel): + + def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_batches=None): + super().__init__() + self.optimizer_cls = optimizer_cls + self.losses = [] + self.grads = [] + self.on_before_zero_grad_count = 0 + self.optimizer_is_mocked = optimizer_is_mocked + self.grad_checked = False + self.accumulate_grad_batches = accumulate_grad_batches + + def on_before_zero_grad(self, optimizer): + self.on_before_zero_grad_count += 1 + if self.layer.weight.grad is not None: + self.grads.append(self.layer.weight.grad.clone()) + + def configure_optimizers(self): + optimizer = self.optimizer_cls(self.layer.parameters(), lr=0.1) + assert isinstance(optimizer, Optimizer) + return optimizer + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + if not isinstance(opt, LightningOptimizer): + opt = LightningOptimizer.to_lightning_optimizer(opt, self.trainer) + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + self.manual_backward(loss, opt) + opt.step() + + +class ManualOptimizationPurePytorchOptimizerModel(BaseParityManualOptimizationModel): + + def training_step(self, batch, batch_idx): + optimizer = self.optimizers(use_pl_optimizer=False) + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss.backward() + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + optimizer.step() + + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +class ManualOptimizationPurePytorchAMPOptimizerModel(BaseParityManualOptimizationModel): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.scaler = torch.cuda.amp.GradScaler() + + def training_step(self, batch, batch_idx): + optimizer = self.optimizers(use_pl_optimizer=False) + with torch.cuda.amp.autocast(): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss = self.scaler.scale(loss) + loss.backward() + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.scaler.unscale_(optimizer) + self.grad_checked = True + + assert torch.abs(self.layer.weight.grad).sum() > 0 + self.scaler.step(optimizer) + self.scaler.update() + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 
0), + pytest.param(16, "native", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1, 7]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches): + + if accumulate_grad_batches > 1: + accumulate_grad_batches = np.random.randint(1, accumulate_grad_batches) + + vanilla_model_cls = ManualOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else ManualOptimizationPurePytorchOptimizerModel + + run_lightning_optimizer_equality( + BaseParityManualOptimizationModel, + vanilla_model_cls, + precision=precision, + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + accumulate_grad_batches=accumulate_grad_batches, + amp_backend=amp_backend, + gpus=gpus, + automatic_optimization=False + ) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_calls( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + vanilla_model_cls = ManualOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else ManualOptimizationPurePytorchOptimizerModel + + with patch("torch.optim.SGD.step") as mock_sgd_step, \ + patch("torch.optim.Adam.step") as mock_adam_step, \ + patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: + + max_epochs = 2 + limit_train_batches = 10 + + # Run equality test using Lightning Optimizer + + run_lightning_optimizer_equality( + BaseParityManualOptimizationModel, + vanilla_model_cls, + default_root_dir=tmpdir, + optimizer_is_mocked=True, + accumulate_grad_batches=accumulate_grad_batches, + max_epochs=max_epochs, + limit_train_batches=limit_train_batches, + amp_backend=amp_backend, + precision=precision, + gpus=gpus, + automatic_optimization=False + ) + + expected_num_batches = max_epochs * limit_train_batches + assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_step.call_count == mock_adam_step.call_count + assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index 52e085b2b7b8c..2d2ebd6a6a2dd 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -180,9 +180,8 @@ def test_reducelronplateau_scheduling(tmpdir): ), 'lr scheduler was not correctly converted to dict' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_optimizer_return_options(enable_pl_optimizer): - trainer = Trainer(enable_pl_optimizer=enable_pl_optimizer) +def test_optimizer_return_options(): + trainer = Trainer() model = EvalModelTemplate() # single optimizer @@ -483,3 +482,20 @@ def test_lr_scheduler_with_no_actual_scheduler_raises(tmpdir): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) with pytest.raises(MisconfigurationException, match='The lr scheduler dict must have the key "scheduler"'): trainer.fit(model) + + +def test_invalid_optimizer_in_scheduler(tmpdir): + """ + Test exception when optimizer attatched to lr_schedulers wasn't returned + """ + class InvalidOptimizerModel(BoringModel): + def configure_optimizers(self): + opt1 = 
torch.optim.SGD(self.layer.parameters(), lr=0.1) + opt2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(opt2, step_size=1) + return [opt1], [lr_scheduler] + + model = InvalidOptimizerModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + with pytest.raises(MisconfigurationException, match="attatched with an optimizer that wasn't returned"): + trainer.fit(model) diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py new file mode 100644 index 0000000000000..b8a0e066cdef8 --- /dev/null +++ b/tests/trainer/test_supporters.py @@ -0,0 +1,38 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch + +from pytorch_lightning.trainer.supporters import TensorRunningAccum + + +def test_tensor_running_accum_reset(): + """ Test that reset would set all attributes to the initialization state """ + + window_length = 10 + + accum = TensorRunningAccum(window_length=window_length) + assert accum.last() is None + assert accum.mean() is None + + accum.append(torch.tensor(1.5)) + assert accum.last() == torch.tensor(1.5) + assert accum.mean() == torch.tensor(1.5) + + accum.reset() + assert accum.window_length == window_length + assert accum.memory is None + assert accum.current_idx == 0 + assert accum.last_idx is None + assert not accum.rotated diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9e5ceccf9b646..8b66e7141957e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from argparse import Namespace
+from copy import deepcopy
 import math
 import os
+from pathlib import Path
 import pickle
 import sys
-from argparse import Namespace
-from copy import deepcopy
-from pathlib import Path
 from unittest.mock import ANY, call, patch
 
 import cloudpickle
+from omegaconf import OmegaConf
 import pytest
 import torch
-from omegaconf import OmegaConf
 
-import tests.base.develop_utils as tutils
 from pytorch_lightning import Callback, LightningModule, Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
@@ -37,6 +36,7 @@
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import BoringModel, EvalModelTemplate
+import tests.base.develop_utils as tutils
 
 
 @pytest.mark.parametrize("url_ckpt", [True, False])
@@ -496,16 +496,14 @@ def test_model_checkpoint_only_weights(tmpdir):
 
 
 def test_model_freeze_unfreeze():
-    model = EvalModelTemplate()
-
+    model = BoringModel()
     model.freeze()
     model.unfreeze()
 
 
-@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
 @pytest.mark.parametrize("url_ckpt", [True, False])
-def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt, enable_pl_optimizer):
+def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
     """Verify resuming from checkpoint runs the right number of epochs"""
     # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
     monkeypatch.setenv("TORCH_HOME", tmpdir)
@@ -533,7 +531,6 @@ def on_load_checkpoint(self, _):
         callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)],
         default_root_dir=tmpdir,
         val_check_interval=1.0,
-        enable_pl_optimizer=enable_pl_optimizer,
         progress_bar_refresh_rate=0,
         logger=False,
         weights_summary=None,
diff --git a/tests/utilities/test_seed.py b/tests/utilities/test_seed.py
new file mode 100644
index 0000000000000..7fa6df516c304
--- /dev/null
+++ b/tests/utilities/test_seed.py
@@ -0,0 +1,55 @@
+import os
+
+from unittest import mock
+import pytest
+
+import pytorch_lightning.utilities.seed as seed_utils
+
+
+@mock.patch.dict(os.environ, {}, clear=True)
+def test_seed_stays_same_with_multiple_seed_everything_calls():
+    """
+    Ensure that after the initial seed everything,
+    the seed stays the same for the same run.
+ """ + with pytest.warns(UserWarning, match="No correct seed found"): + seed_utils.seed_everything() + initial_seed = os.environ.get("PL_GLOBAL_SEED") + + with pytest.warns(None) as record: + seed_utils.seed_everything() + assert not record # does not warn + seed = os.environ.get("PL_GLOBAL_SEED") + + assert initial_seed == seed + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "2020"}, clear=True) +def test_correct_seed_with_environment_variable(): + """ + Ensure that the PL_GLOBAL_SEED environment is read + """ + assert seed_utils.seed_everything() == 2020 + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "invalid"}, clear=True) +@mock.patch.object(seed_utils, attribute='_select_seed_randomly', new=lambda *_: 123) +def test_invalid_seed(): + """ + Ensure that we still fix the seed even if an invalid seed is given + """ + with pytest.warns(UserWarning, match="No correct seed found"): + seed = seed_utils.seed_everything() + assert seed == 123 + + +@mock.patch.dict(os.environ, {}, clear=True) +@mock.patch.object(seed_utils, attribute='_select_seed_randomly', new=lambda *_: 123) +@pytest.mark.parametrize("seed", (10e9, -10e9)) +def test_out_of_bounds_seed(seed): + """ + Ensure that we still fix the seed even if an out-of-bounds seed is given + """ + with pytest.warns(UserWarning, match="is not in bounds"): + actual = seed_utils.seed_everything(seed) + assert actual == 123