From 3dcfeb16c8c0cf23e8ab6e3bbc3e2fb63f27f90c Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Thu, 7 Oct 2021 08:52:20 +0200
Subject: [PATCH] Enable automated CoreNLP, Senna, Prover9/Mace4, Megam,
 MaltParser CI tests (#2820)

* Re-enabled CoreNLP tests, added CoreNLP zip download to CI
* Specified JDK version 16
* Specified CLASSNAME environment variable in a different way
* Run pytest with CLASSNAME env, and JDK set up
* Renamed incorrect CLASSNAME env to CLASSPATH
* Output the core_nlp folder contents and try to set up the CoreNLPServer
* Also ls the core_nlp folder after creation
* Download zip to relative path
* Use full paths instead of relative ones
* Make directory using full path
* Load third party download cache in main pytest job
* Output third party folder
* Rename stanford-corenlp-\d.\d.\d to remove the version specifier
* Ignore exit code 1
* Fixed input to execdir
* Pipe guaranteed error warning to null, load cache before Python
* Revert to using a third-party.sh script
* Output more before attempting to run third-party script
* Attempt to get more information
* Added third-party.sh
* Create envs.sh before filling it
* Create envs.sh in a new way
* Specified bash shell
* Force envs.sh to give a good exit code
* Remove printenv
* Add to GITHUB_ENV instead of setting env vars some other way
* Print CLASSPATH in multiple ways
* Hardcode environment variable paths
* Use of pwd instead of /home/runner/
* Use ~ instead of pwd
* Output folder that should have the Stanford CoreNLP jar
* Modified used environment variables
* Rename instead of sym-link downloaded folders
* Remove ls for CORENLP
* Output a tree structure of the cached third party folder
* Add os.path.expanduser to find_jar_iter
* Removed tree call
* More outputting
* Cache third party downloads on Windows too
* Fixed runs-on parameter
* Specified sh to third-party call
* Compromise by only downloading third party data for Linux tests
* Split test step
* Add automatic Prover9-Mace4 download
* Use .envs.sh instead of hardcoded environment variables
* Add environment variables to .bashrc directly
* Use export in the same step for env variables
* Revert to using echo >> ~/.bashrc
* Specify third party env's as top-level
* Replace ${HOME} with ~
* Use /third instead of ~/third
* Revert to ~/third, prepend /home/runner
* Resolve replace-all error from c2988cc0f5c5a950544f8ef1024f38cea01373d9
* Attempting to fix broken Prover9 test
* Added automatic download for MEGAM and TADM
* Set MEGAM and TADM executables to chmod 711
* Updated TADM environment variable to link to the bin folder instead
* Remove TADM, add MaltParser
* Revert to using MaltParser 1.7.2
* Gave MaltParser test mandatory 'parser_dirname' var
* Initialize MaltParser with tagger function, not tagger class instance
* Set exe mode in third-party.sh, try to enable another MEGAM test
* Use folder name instead of file name for MEGAM change mode
* Modified verbosity, changed chmod file
* Only download third party, and modify debug logs
* Move megam executable to megam directory
* Make directory before moving MEGAM into it
* Re-enable full third party & nltk_data download and tests
* Remove unnecessary, broken megam config check
* Removed BFGS as algorithm in rte_classifier, as MaxentClassifier doesn't support it
* Removed unnecessary 'import os'
* Modified docstring in third-party bash script
* Only set up Java if the runner is Linux
---
 .github/workflows/ci.yaml            |  61 +++++++++++++--
 nltk/classify/rte_classify.py        |   8 +-
 nltk/internals.py                    |   3 +-
 nltk/test/gluesemantics_malt.doctest |   4 +-
 nltk/test/gluesemantics_malt_fixt.py |   2 +-
 nltk/test/inference.doctest          |   5 +-
 nltk/test/unit/test_corenlp.py       |   7 +-
 nltk/test/unit/test_rte_classify.py  |   8 +-
 tools/github_actions/third-party.sh  | 112 +++++++++++++++++++++++++++
 9 files changed, 184 insertions(+), 26 deletions(-)
 create mode 100644 tools/github_actions/third-party.sh

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e18ac9ddf9..0beff60427 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -2,17 +2,31 @@ name: ci-workflow
 
 # run workflow for these events
 on: [push, pull_request, workflow_dispatch]
 
+env:
+  CORENLP: /home/runner/third/stanford-corenlp
+  CORENLP_MODELS: /home/runner/third/stanford-corenlp
+  STANFORD_PARSER: /home/runner/third/stanford-parser
+  STANFORD_MODELS: /home/runner/third/stanford-postagger
+  STANFORD_POSTAGGER: /home/runner/third/stanford-postagger
+  SENNA: /home/runner/third/senna
+  PROVER9: /home/runner/third/prover9/bin
+  MEGAM: /home/runner/third/megam
+  # TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more
+  # tricky to install requirements, so we don't run tests for it.
+  # TADM: /home/runner/third/tadm/bin
+  MALT_PARSER: /home/runner/third/maltparser
+
 jobs:
   pre-commit:
     name: Run pre-commit
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
-    - uses: pre-commit/action@v2.0.0
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+      - uses: pre-commit/action@v2.0.0
 
   cache_nltk_data:
-    name: cache nltk_data
+    name: Cache nltk_data
     needs: pre-commit
     strategy:
       matrix:
@@ -36,9 +50,30 @@ jobs:
         shell: bash
         if: steps.restore-cache.outputs.cache-hit != 'true'
 
+  cache_third_party:
+    name: Cache third party tools
+    needs: pre-commit
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Cache third party tools
+        uses: actions/cache@v2
+        id: restore-cache
+        with:
+          path: ~/third
+          key: third_${{ secrets.CACHE_VERSION }}
+
+      - name: Download third party data
+        run: |
+          chmod +x ./tools/github_actions/third-party.sh
+          ./tools/github_actions/third-party.sh
+        if: steps.restore-cache.outputs.cache-hit != 'true'
+
   test:
     name: Python ${{ matrix.python-version }} on ${{ matrix.os }}
-    needs: cache_nltk_data
+    needs: [cache_nltk_data, cache_third_party]
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9]
@@ -54,6 +89,12 @@
       with:
         python-version: ${{ matrix.python-version }}
 
+      - name: Set up JDK 16
+        uses: actions/setup-java@v1
+        with:
+          java-version: 16
+        if: runner.os == 'Linux'
+
       - name: Cache dependencies
         uses: actions/cache@v2
         id: restore-cache
@@ -71,6 +112,14 @@
           path: ~/nltk_data
           key: nltk_data_${{ secrets.CACHE_VERSION }}
 
+      - name: Use cached third party tools
+        uses: actions/cache@v2
+        with:
+          path: ~/third
+          key: third_${{ secrets.CACHE_VERSION }}
+        if: runner.os == 'Linux'
+
       - name: Run pytest
-        run: pytest --numprocesses auto -rsx nltk/test
         shell: bash
+        run: |
+          pytest --numprocesses auto -rsx --doctest-modules nltk/test
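
The `env:` block above is what ties the CI runner to NLTK's tool discovery: the lookup helpers in nltk.internals consult these variables before falling back to PATH and the standard install locations. A minimal sketch, not part of the patch, of that lookup for Prover9 (it assumes the path from the env block actually contains a prover9 build):

    import os
    from nltk.internals import find_binary

    # find_binary() checks the named env var before searching PATH, so
    # pointing PROVER9 at the bin folder is all the inference tests need.
    os.environ["PROVER9"] = "/home/runner/third/prover9/bin"
    print(find_binary("prover9", env_vars=["PROVER9"]))
    # -> /home/runner/third/prover9/bin/prover9
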
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index 6902904c34..f5409afad5 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -165,16 +165,14 @@ def rte_classifier(algorithm, sample_N=None):
 
     # Train the classifier
     print("Training classifier...")
-    if algorithm in ["megam", "BFGS"]:  # MEGAM based algorithms.
-        # Ensure that MEGAM is configured first.
-        check_megam_config()
-        clf = lambda x: MaxentClassifier.train(featurized_train_set, algorithm)
+    if algorithm in ["megam"]:  # MEGAM based algorithms.
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
     elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
         clf = MaxentClassifier.train(featurized_train_set, algorithm)
     else:
         err_msg = str(
             "RTEClassifier only supports these algorithms:\n "
-            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
+            "'megam', 'GIS', 'IIS'.\n"
         )
         raise Exception(err_msg)
     print("Testing classifier...")
diff --git a/nltk/internals.py b/nltk/internals.py
index 376b65e40e..0aa36b7ccf 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -738,6 +738,7 @@ def find_jar_iter(
     if env_var == "CLASSPATH":
         classpath = os.environ["CLASSPATH"]
         for cp in classpath.split(os.path.pathsep):
+            cp = os.path.expanduser(cp)
             if os.path.isfile(cp):
                 filename = os.path.basename(cp)
                 if (
@@ -773,7 +774,7 @@
                         yield os.path.join(cp, file_name)
 
     else:
-        jar_env = os.environ[env_var]
+        jar_env = os.path.expanduser(os.environ[env_var])
         jar_iter = (
             (
                 os.path.join(jar_env, path_to_jar)
diff --git a/nltk/test/gluesemantics_malt.doctest b/nltk/test/gluesemantics_malt.doctest
index 43c59cb34a..b06e8f62bf 100644
--- a/nltk/test/gluesemantics_malt.doctest
+++ b/nltk/test/gluesemantics_malt.doctest
@@ -24,8 +24,8 @@ Initialize the Dependency Parser
     ...     ('^(a)$', 'ex_quant'),
     ...     ('^(every)$', 'univ_quant'),
     ...     ('^(girl|dog)$', 'NN')
-    ... ])
-    >>> depparser = MaltParser(tagger=tagger)
+    ... ]).tag
+    >>> depparser = MaltParser("maltparser-1.7.2", tagger=tagger)
 
 --------------------
 Automated Derivation
diff --git a/nltk/test/gluesemantics_malt_fixt.py b/nltk/test/gluesemantics_malt_fixt.py
index e6be9b6166..2b70bf6106 100644
--- a/nltk/test/gluesemantics_malt_fixt.py
+++ b/nltk/test/gluesemantics_malt_fixt.py
@@ -5,5 +5,5 @@ def setup_module():
 
     try:
         depparser = MaltParser("maltparser-1.7.2")
-    except LookupError as e:
+    except (AssertionError, LookupError) as e:
         pytest.skip("MaltParser is not available")
diff --git a/nltk/test/inference.doctest b/nltk/test/inference.doctest
index 1c65f4ee69..30d1ce1863 100644
--- a/nltk/test/inference.doctest
+++ b/nltk/test/inference.doctest
@@ -109,7 +109,7 @@ The assumptions and goal may be accessed using the ``assumptions()`` and
 ``goal()`` methods, respectively.
 
     >>> prover.assumptions()
-    [<Expression man(socrates)>, <Expression all x.(man(x) -> mortal(x))>]
+    [<ApplicationExpression man(socrates)>, <AllExpression all x.(man(x) -> mortal(x))>]
    >>> prover.goal()
     <ApplicationExpression mortal(socrates)>
 
@@ -315,10 +315,9 @@ statements as new assumptions.
     % Proof 1 at ... seconds.
     % Length of proof is 13.
     % Level of proof is 4.
-    % Maximum clause weight is 0.000.
+    % Maximum clause weight is 0.
     % Given clauses 0.
 
-
     1 (all x (boxerdog(x) -> dog(x))).  [assumption].
     2 (all x (boxer(x) -> person(x))).  [assumption].
     3 (all x -(dog(x) & person(x))).  [assumption].
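
The test_corenlp.py changes below drop the blanket @pytest.mark.skip markers: setup_module() now tries to launch a real server and skips only when that fails. A rough sketch of that pattern outside pytest (it assumes CORENLP and CORENLP_MODELS point at the unzipped stanford-corenlp directory, as in the ci.yaml env block, and that port 9000 is free):

    from nltk.parse import corenlp

    server = corenlp.CoreNLPServer(port=9000)  # locates the jars via $CORENLP / $CORENLP_MODELS
    server.start()  # raises corenlp.CoreNLPServerError if the server cannot come up
    try:
        parser = corenlp.CoreNLPParser(url=server.url)
        print(next(parser.raw_parse("The quick brown fox jumps over the lazy dog.")))
    finally:
        server.stop()
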
" @@ -33,7 +33,6 @@ def teardown_module(module): server.stop() -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestTokenizerAPI(TestCase): def test_tokenize(self): corenlp_tokenizer = corenlp.CoreNLPParser() @@ -253,7 +252,6 @@ def test_tokenize(self): self.assertEqual(expected_output, tokenized_output) -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestTaggerAPI(TestCase): def test_pos_tagger(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="pos") @@ -751,7 +749,6 @@ def test_unexpected_tagtype(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="test") -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestParserAPI(TestCase): def test_parse(self): corenlp_parser = corenlp.CoreNLPParser() diff --git a/nltk/test/unit/test_rte_classify.py b/nltk/test/unit/test_rte_classify.py index 9df6f56674..0a573ea7e2 100644 --- a/nltk/test/unit/test_rte_classify.py +++ b/nltk/test/unit/test_rte_classify.py @@ -1,5 +1,6 @@ import pytest +from nltk import config_megam from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features from nltk.corpus import rte as rte_corpus @@ -85,8 +86,9 @@ def test_rte_classification_without_megam(self): clf = rte_classifier("IIS", sample_N=100) clf = rte_classifier("GIS", sample_N=100) - @pytest.mark.skip("Skipping tests with dependencies on MEGAM") def test_rte_classification_with_megam(self): - nltk.config_megam("/usr/local/bin/megam") + try: + config_megam() + except (LookupError, AttributeError) as e: + pytest.skip("Skipping tests with dependencies on MEGAM") clf = rte_classifier("megam", sample_N=100) - clf = rte_classifier("BFGS", sample_N=100) diff --git a/tools/github_actions/third-party.sh b/tools/github_actions/third-party.sh new file mode 100644 index 0000000000..b8c65b6e5d --- /dev/null +++ b/tools/github_actions/third-party.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# This install script is used in our GitHub Actions CI. +# See .github/workflows/ci.yaml + +# Installing the third-party software and the appropriate env variables. +pushd ${HOME} +[[ ! -d 'third' ]] && mkdir 'third' +pushd 'third' + +# Download nltk stanford dependencies +# Downloaded to ~/third/stanford-corenlp +# stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1) +stanford_corenlp_package_zip_name="stanford-corenlp-full-2017-06-09.zip" +[[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]] +stanford_corenlp_package_name=${BASH_REMATCH[1]} +if [[ ! -d ${stanford_corenlp_package_name} ]]; then + curl -L "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" -o ${stanford_corenlp_package_zip_name} + # wget -nv "http://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" + unzip -q ${stanford_corenlp_package_zip_name} + rm ${stanford_corenlp_package_zip_name} + mv ${stanford_corenlp_package_name} 'stanford-corenlp' +fi + + +# Downloaded to ~/third/stanford-parser +#stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1) +stanford_parser_package_zip_name="stanford-parser-full-2017-06-09.zip" +[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]] +stanford_parser_package_name=${BASH_REMATCH[1]} +if [[ ! 
diff --git a/tools/github_actions/third-party.sh b/tools/github_actions/third-party.sh
new file mode 100644
index 0000000000..b8c65b6e5d
--- /dev/null
+++ b/tools/github_actions/third-party.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# This install script is used in our GitHub Actions CI.
+# See .github/workflows/ci.yaml
+
+# Install the third-party software; the matching env variables are set in ci.yaml.
+pushd ${HOME}
+[[ ! -d 'third' ]] && mkdir 'third'
+pushd 'third'
+
+# Download nltk stanford dependencies
+# Downloaded to ~/third/stanford-corenlp
+# stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1)
+stanford_corenlp_package_zip_name="stanford-corenlp-full-2017-06-09.zip"
+[[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]]
+stanford_corenlp_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_corenlp_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" -o ${stanford_corenlp_package_zip_name}
+    # wget -nv "http://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name"
+    unzip -q ${stanford_corenlp_package_zip_name}
+    rm ${stanford_corenlp_package_zip_name}
+    mv ${stanford_corenlp_package_name} 'stanford-corenlp'
+fi
+
+
+# Downloaded to ~/third/stanford-parser
+#stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
+stanford_parser_package_zip_name="stanford-parser-full-2017-06-09.zip"
+[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]]
+stanford_parser_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_parser_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name" -o ${stanford_parser_package_zip_name}
+    # wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
+    unzip -q ${stanford_parser_package_zip_name}
+    rm ${stanford_parser_package_zip_name}
+    mv ${stanford_parser_package_name} 'stanford-parser'
+fi
+
+# Downloaded to ~/third/stanford-postagger
+#stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
+stanford_tagger_package_zip_name="stanford-postagger-full-2017-06-09.zip"
+[[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]]
+stanford_tagger_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_tagger_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name" -o ${stanford_tagger_package_zip_name}
+    # wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
+    unzip -q ${stanford_tagger_package_zip_name}
+    rm ${stanford_tagger_package_zip_name}
+    mv ${stanford_tagger_package_name} 'stanford-postagger'
+fi
+
+# Download SENNA to ~/third/senna
+senna_file_name=$(curl -s 'https://ronan.collobert.com/senna/download.html' | grep -o 'senna-v.*.tgz' | head -n1)
+senna_folder_name='senna'
+if [[ ! -d $senna_folder_name ]]; then
+    curl -L "https://ronan.collobert.com/senna/$senna_file_name" -o ${senna_file_name}
+    # wget -nv "https://ronan.collobert.com/senna/$senna_file_name"
+    tar -xzf ${senna_file_name}
+    rm ${senna_file_name}
+fi
+
+# Download PROVER9 to ~/third/prover9
+prover9_file_name="p9m4-v05.tar.gz"
+[[ ${prover9_file_name} =~ (.+)\.tar\.gz ]]
+prover9_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${prover9_folder_name} ]]; then
+    curl -L "https://www.cs.unm.edu/~mccune/prover9/gui/$prover9_file_name" -o ${prover9_file_name}
+    tar -xzf ${prover9_file_name}
+    mv ${prover9_folder_name} 'prover9'
+    rm ${prover9_file_name}
+fi
+
+# Download MEGAM to ~/third/megam
+megam_file_name="megam_i686.opt.gz"
+[[ ${megam_file_name} =~ (.+)\.gz ]]
+megam_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${megam_folder_name} ]]; then
+    curl -L "http://hal3.name/megam/$megam_file_name" -o ${megam_file_name}
+    gunzip -vf ${megam_file_name}
+    mkdir -p "megam"
+    mv ${megam_folder_name} "megam/${megam_folder_name}"
+    chmod -R 711 "megam/$megam_folder_name"
+fi
+
+# TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more
+# tricky to install requirements, so we don't run tests for it.
+
+# Download TADM to ~/third/tadm
+# tadm_file_name="tadm-0.9.8.tgz"
+# [[ ${tadm_file_name} =~ (.+)\.tgz ]]
+# tadm_folder_name=${BASH_REMATCH[1]}
+# if [[ ! -d ${tadm_folder_name} ]]; then
+#     curl -L "https://master.dl.sourceforge.net/project/tadm/tadm/tadm%200.9.8/$tadm_file_name?viasf=1" -o ${tadm_file_name}
+#     tar -xvzf ${tadm_file_name}
+#     rm ${tadm_file_name}
+#     chmod -R 711 "./tadm/bin/tadm"
+# fi
+
+# Download MaltParser to ~/third/maltparser
+malt_file_name="maltparser-1.7.2.tar.gz"
+[[ ${malt_file_name} =~ (.+)\.tar\.gz ]]
+malt_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${malt_folder_name} ]]; then
+    curl -L "http://maltparser.org/dist/$malt_file_name" -o ${malt_file_name}
+    tar -xzf ${malt_file_name}
+    mv ${malt_folder_name} 'maltparser'
+    rm ${malt_file_name}
+fi
+
+ls ~/third
+
+popd
+popd
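
Every download block in third-party.sh is guarded by a [[ ! -d ... ]] check, so re-running the script over a restored cache is a no-op. A hedged sanity check, not part of the patch, that the installed tools are visible to NLTK through the ci.yaml env block; it assumes the directory layout produced by the script above, and each lookup raises LookupError with a download hint if its tool is missing:

    from nltk.internals import find_binary, find_jar

    print(find_binary("prover9", env_vars=["PROVER9"]))
    # megam's executable keeps its download name, so list the candidates:
    print(find_binary("megam", env_vars=["MEGAM"],
                      binary_names=["megam", "megam_i686.opt"]))
    # the 2017-06-09 tagger distribution ships a stanford-postagger.jar:
    print(find_jar("stanford-postagger.jar", env_vars=["STANFORD_POSTAGGER"]))
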