From 3dcfeb16c8c0cf23e8ab6e3bbc3e2fb63f27f90c Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Thu, 7 Oct 2021 08:52:20 +0200
Subject: [PATCH] Enable automated CoreNLP, Senna, Prover9/Mace4, Megam,
 MaltParser CI tests (#2820)

* Re-enabled CoreNLP tests, added CoreNLP zip download to CI
* Specified JDK version 16
* Specified CLASSNAME environment variable in a different way
* Run pytest with CLASSNAME env, and JDK set up
* Renamed incorrect CLASSNAME env to CLASSPATH
* Output the core_nlp folder contents and try to set up the CoreNLPServer
* Also ls the core_nlp folder after creation
* Download zip to relative path
* Use full paths instead of relative ones
* Make directory using full path
* Load third party download cache in main pytest job
* Output third party folder
* Rename stanford-corenlp-\d.\d.\d to remove the version specifier
* Ignore exit code 1
* Fixed input to execdir
* Pipe guaranteed error warning to null, load cache before Python
* Revert to using a third-party.sh script
* Output more before attempting to run third-party script
* Attempt to get more information
* Added third-party.sh
* Create envs.sh before filling it
* Create envs.sh in a new way
* Specified bash shell
* Force envs.sh to give a good exit code
* Remove printenv
* Add to GITHUB_ENV instead of setting env vars some other way
* Print CLASSPATH in multiple ways
* Hardcode environment variable paths
* Use of pwd instead of /home/runner/
* Use ~ instead of pwd
* Output folder that should have the Stanford CoreNLP jar
* Modified used environment variables
* Rename instead of sym-link downloaded folders
* Remove ls for CORENLP
* Output a tree structure of the cached third party folder
* Add os.path.expanduser to find_jar_iter
* Removed tree call
* More outputting
* Cache third party downloads on Windows too
* Fixed runs-on parameter
* Specified sh to third-party call
* Compromise by only downloading third party data for Linux tests
* Split test step
* Add automatic Prover9-Mace4 download
* Use .envs.sh instead of hardcoded environment variables
* Add environment variables to .bashrc directly
* Use export in the same step for env variables
* Revert to using echo >> ~/.bashrc
* Specify third party env's as top-level
* Replace ${HOME} with ~
* Use /third instead of ~/third
* Revert to ~/third, prepend /home/runner
* Resolve replace-all error from c2988cc0f5c5a950544f8ef1024f38cea01373d9
* Attempting to fix broken Prover9 test
* Added automatic download for MEGAM and TADM
* Set MEGAM and TADM executables to chmod 711
* Updated TADM environment variable to link to the bin folder instead
* Remove TADM, add MaltParser
* Revert to using MaltParser 1.7.2
* Gave MaltParser test mandatory 'parser_dirname' var
* Initialize MaltParser with tagger function, not tagger class instance
* Set exe mode in third-party.sh, try to enable another MEGAM test
* Use folder name instead of file name for MEGAM change mode
* Modified verbosity, changed chmod file
* Only download third party, and modify debug logs
* Move megam executable to megam directory
* Make directory before moving MEGAM into it
* Re-enable full third party & nltk_data download and tests
* Remove unnecessary, broken megam config check
* Removed BFGS as algorithm in rte_classifier, as MaxentClassifier doesn't support it
* Removed unnecessary 'import os'
* Modified docstring in third-party bash script
* Only set up Java if the runner is Linux
---
 .github/workflows/ci.yaml            |  61 +++++++++++++--
 nltk/classify/rte_classify.py        |   8 +-
 nltk/internals.py                    |   3 +-
 nltk/test/gluesemantics_malt.doctest |   4 +-
 nltk/test/gluesemantics_malt_fixt.py |   2 +-
 nltk/test/inference.doctest          |   5 +-
 nltk/test/unit/test_corenlp.py       |   7 +-
 nltk/test/unit/test_rte_classify.py  |   8 +-
 tools/github_actions/third-party.sh  | 112 +++++++++++++++++++++++++++
 9 files changed, 184 insertions(+), 26 deletions(-)
 create mode 100644 tools/github_actions/third-party.sh

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e18ac9ddf9..0beff60427 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -2,17 +2,31 @@ name: ci-workflow
 
 # run workflow for these events
 on: [push, pull_request, workflow_dispatch]
 
+env:
+  CORENLP: /home/runner/third/stanford-corenlp
+  CORENLP_MODELS: /home/runner/third/stanford-corenlp
+  STANFORD_PARSER: /home/runner/third/stanford-parser
+  STANFORD_MODELS: /home/runner/third/stanford-postagger
+  STANFORD_POSTAGGER: /home/runner/third/stanford-postagger
+  SENNA: /home/runner/third/senna
+  PROVER9: /home/runner/third/prover9/bin
+  MEGAM: /home/runner/third/megam
+  # TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more
+  # tricky to install requirements, so we don't run tests for it.
+  # TADM: /home/runner/third/tadm/bin
+  MALT_PARSER: /home/runner/third/maltparser
+
 jobs:
   pre-commit:
     name: Run pre-commit
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
-    - uses: pre-commit/action@v2.0.0
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+      - uses: pre-commit/action@v2.0.0
 
   cache_nltk_data:
-    name: cache nltk_data
+    name: Cache nltk_data
     needs: pre-commit
     strategy:
       matrix:
@@ -36,9 +50,30 @@ jobs:
         shell: bash
         if: steps.restore-cache.outputs.cache-hit != 'true'
 
+  cache_third_party:
+    name: Cache third party tools
+    needs: pre-commit
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Cache third party tools
+        uses: actions/cache@v2
+        id: restore-cache
+        with:
+          path: ~/third
+          key: third_${{ secrets.CACHE_VERSION }}
+
+      - name: Download third party data
+        run: |
+          chmod +x ./tools/github_actions/third-party.sh
+          ./tools/github_actions/third-party.sh
+        if: steps.restore-cache.outputs.cache-hit != 'true'
+
   test:
     name: Python ${{ matrix.python-version }} on ${{ matrix.os }}
-    needs: cache_nltk_data
+    needs: [cache_nltk_data, cache_third_party]
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9]
@@ -54,6 +89,12 @@
       with:
         python-version: ${{ matrix.python-version }}
 
+      - name: Set up JDK 16
+        uses: actions/setup-java@v1
+        with:
+          java-version: 16
+        if: runner.os == 'Linux'
+
       - name: Cache dependencies
         uses: actions/cache@v2
         id: restore-cache
@@ -71,6 +112,14 @@
           path: ~/nltk_data
           key: nltk_data_${{ secrets.CACHE_VERSION }}
 
+      - name: Use cached third party tools
+        uses: actions/cache@v2
+        with:
+          path: ~/third
+          key: third_${{ secrets.CACHE_VERSION }}
+        if: runner.os == 'Linux'
+
       - name: Run pytest
-        run: pytest --numprocesses auto -rsx nltk/test
         shell: bash
+        run: |
+          pytest --numprocesses auto -rsx --doctest-modules nltk/test
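
The `env:` block above is what ties the CI runner to NLTK's tool discovery: the lookup helpers in nltk.internals consult these variables before falling back to PATH and the standard install locations. A minimal sketch, not part of the patch, of that lookup for Prover9 (it assumes the path from the env block actually contains a prover9 build):

    import os
    from nltk.internals import find_binary

    # find_binary() checks the named env var before searching PATH, so
    # pointing PROVER9 at the bin folder is all the inference tests need.
    os.environ["PROVER9"] = "/home/runner/third/prover9/bin"
    print(find_binary("prover9", env_vars=["PROVER9"]))
    # -> /home/runner/third/prover9/bin/prover9
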
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index 6902904c34..f5409afad5 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -165,16 +165,14 @@ def rte_classifier(algorithm, sample_N=None):
 
     # Train the classifier
     print("Training classifier...")
-    if algorithm in ["megam", "BFGS"]:  # MEGAM based algorithms.
-        # Ensure that MEGAM is configured first.
-        check_megam_config()
-        clf = lambda x: MaxentClassifier.train(featurized_train_set, algorithm)
+    if algorithm in ["megam"]:  # MEGAM based algorithms.
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
     elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
         clf = MaxentClassifier.train(featurized_train_set, algorithm)
     else:
         err_msg = str(
             "RTEClassifier only supports these algorithms:\n "
-            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
+            "'megam', 'GIS', 'IIS'.\n"
         )
         raise Exception(err_msg)
     print("Testing classifier...")
diff --git a/nltk/internals.py b/nltk/internals.py
index 376b65e40e..0aa36b7ccf 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -738,6 +738,7 @@ def find_jar_iter(
     if env_var == "CLASSPATH":
         classpath = os.environ["CLASSPATH"]
         for cp in classpath.split(os.path.pathsep):
+            cp = os.path.expanduser(cp)
             if os.path.isfile(cp):
                 filename = os.path.basename(cp)
                 if (
@@ -773,7 +774,7 @@
                         yield os.path.join(cp, file_name)
 
     else:
-        jar_env = os.environ[env_var]
+        jar_env = os.path.expanduser(os.environ[env_var])
         jar_iter = (
             (
                 os.path.join(jar_env, path_to_jar)
diff --git a/nltk/test/gluesemantics_malt.doctest b/nltk/test/gluesemantics_malt.doctest
index 43c59cb34a..b06e8f62bf 100644
--- a/nltk/test/gluesemantics_malt.doctest
+++ b/nltk/test/gluesemantics_malt.doctest
@@ -24,8 +24,8 @@ Initialize the Dependency Parser
     ...     ('^(a)$', 'ex_quant'),
     ...     ('^(every)$', 'univ_quant'),
     ...     ('^(girl|dog)$', 'NN')
-    ... ])
-    >>> depparser = MaltParser(tagger=tagger)
+    ... ]).tag
+    >>> depparser = MaltParser("maltparser-1.7.2", tagger=tagger)
 
 --------------------
 Automated Derivation
diff --git a/nltk/test/gluesemantics_malt_fixt.py b/nltk/test/gluesemantics_malt_fixt.py
index e6be9b6166..2b70bf6106 100644
--- a/nltk/test/gluesemantics_malt_fixt.py
+++ b/nltk/test/gluesemantics_malt_fixt.py
@@ -5,5 +5,5 @@ def setup_module():
 
     try:
         depparser = MaltParser("maltparser-1.7.2")
-    except LookupError as e:
+    except (AssertionError, LookupError) as e:
         pytest.skip("MaltParser is not available")
diff --git a/nltk/test/inference.doctest b/nltk/test/inference.doctest
index 1c65f4ee69..30d1ce1863 100644
--- a/nltk/test/inference.doctest
+++ b/nltk/test/inference.doctest
@@ -109,7 +109,7 @@ The assumptions and goal may be accessed using the ``assumptions()`` and
 ``goal()`` methods, respectively.
 
     >>> prover.assumptions()
-    [<Expression man(socrates)>, <Expression all x.(man(x) -> mortal(x))>]
+    [<ApplicationExpression man(socrates)>, <AllExpression all x.(man(x) -> mortal(x))>]
    >>> prover.goal()
     <ApplicationExpression mortal(socrates)>
 
@@ -315,10 +315,9 @@ statements as new assumptions.
     % Proof 1 at ... seconds.
     % Length of proof is 13.
     % Level of proof is 4.
-    % Maximum clause weight is 0.000.
+    % Maximum clause weight is 0.
     % Given clauses 0.
 
-
     1 (all x (boxerdog(x) -> dog(x))).  [assumption].
     2 (all x (boxer(x) -> person(x))).  [assumption].
     3 (all x -(dog(x) & person(x))).  [assumption].
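
The test_corenlp.py changes below drop the blanket @pytest.mark.skip markers: setup_module() now tries to launch a real server and skips only when that fails. A rough sketch of that pattern outside pytest (it assumes CORENLP and CORENLP_MODELS point at the unzipped stanford-corenlp directory, as in the ci.yaml env block, and that port 9000 is free):

    from nltk.parse import corenlp

    server = corenlp.CoreNLPServer(port=9000)  # locates the jars via $CORENLP / $CORENLP_MODELS
    server.start()  # raises corenlp.CoreNLPServerError if the server cannot come up
    try:
        parser = corenlp.CoreNLPParser(url=server.url)
        print(next(parser.raw_parse("The quick brown fox jumps over the lazy dog.")))
    finally:
        server.stop()
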
" @@ -33,7 +33,6 @@ def teardown_module(module): server.stop() -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestTokenizerAPI(TestCase): def test_tokenize(self): corenlp_tokenizer = corenlp.CoreNLPParser() @@ -253,7 +252,6 @@ def test_tokenize(self): self.assertEqual(expected_output, tokenized_output) -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestTaggerAPI(TestCase): def test_pos_tagger(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="pos") @@ -751,7 +749,6 @@ def test_unexpected_tagtype(self): corenlp_tagger = corenlp.CoreNLPParser(tagtype="test") -@pytest.mark.skip(reason="Skipping all CoreNLP tests.") class TestParserAPI(TestCase): def test_parse(self): corenlp_parser = corenlp.CoreNLPParser() diff --git a/nltk/test/unit/test_rte_classify.py b/nltk/test/unit/test_rte_classify.py index 9df6f56674..0a573ea7e2 100644 --- a/nltk/test/unit/test_rte_classify.py +++ b/nltk/test/unit/test_rte_classify.py @@ -1,5 +1,6 @@ import pytest +from nltk import config_megam from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features from nltk.corpus import rte as rte_corpus @@ -85,8 +86,9 @@ def test_rte_classification_without_megam(self): clf = rte_classifier("IIS", sample_N=100) clf = rte_classifier("GIS", sample_N=100) - @pytest.mark.skip("Skipping tests with dependencies on MEGAM") def test_rte_classification_with_megam(self): - nltk.config_megam("/usr/local/bin/megam") + try: + config_megam() + except (LookupError, AttributeError) as e: + pytest.skip("Skipping tests with dependencies on MEGAM") clf = rte_classifier("megam", sample_N=100) - clf = rte_classifier("BFGS", sample_N=100) diff --git a/tools/github_actions/third-party.sh b/tools/github_actions/third-party.sh new file mode 100644 index 0000000000..b8c65b6e5d --- /dev/null +++ b/tools/github_actions/third-party.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# This install script is used in our GitHub Actions CI. +# See .github/workflows/ci.yaml + +# Installing the third-party software and the appropriate env variables. +pushd ${HOME} +[[ ! -d 'third' ]] && mkdir 'third' +pushd 'third' + +# Download nltk stanford dependencies +# Downloaded to ~/third/stanford-corenlp +# stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1) +stanford_corenlp_package_zip_name="stanford-corenlp-full-2017-06-09.zip" +[[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]] +stanford_corenlp_package_name=${BASH_REMATCH[1]} +if [[ ! -d ${stanford_corenlp_package_name} ]]; then + curl -L "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" -o ${stanford_corenlp_package_zip_name} + # wget -nv "http://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" + unzip -q ${stanford_corenlp_package_zip_name} + rm ${stanford_corenlp_package_zip_name} + mv ${stanford_corenlp_package_name} 'stanford-corenlp' +fi + + +# Downloaded to ~/third/stanford-parser +#stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1) +stanford_parser_package_zip_name="stanford-parser-full-2017-06-09.zip" +[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]] +stanford_parser_package_name=${BASH_REMATCH[1]} +if [[ ! 
diff --git a/tools/github_actions/third-party.sh b/tools/github_actions/third-party.sh
new file mode 100644
index 0000000000..b8c65b6e5d
--- /dev/null
+++ b/tools/github_actions/third-party.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# This install script is used in our GitHub Actions CI.
+# See .github/workflows/ci.yaml
+
+# Install the third-party software; the matching env variables are set in ci.yaml.
+pushd ${HOME}
+[[ ! -d 'third' ]] && mkdir 'third'
+pushd 'third'
+
+# Download nltk stanford dependencies
+# Downloaded to ~/third/stanford-corenlp
+# stanford_corenlp_package_zip_name=$(curl -s 'https://stanfordnlp.github.io/CoreNLP/' | grep -o 'stanford-corenlp-full-.*\.zip' | head -n1)
+stanford_corenlp_package_zip_name="stanford-corenlp-full-2017-06-09.zip"
+[[ ${stanford_corenlp_package_zip_name} =~ (.+)\.zip ]]
+stanford_corenlp_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_corenlp_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name" -o ${stanford_corenlp_package_zip_name}
+    # wget -nv "http://nlp.stanford.edu/software/$stanford_corenlp_package_zip_name"
+    unzip -q ${stanford_corenlp_package_zip_name}
+    rm ${stanford_corenlp_package_zip_name}
+    mv ${stanford_corenlp_package_name} 'stanford-corenlp'
+fi
+
+
+# Downloaded to ~/third/stanford-parser
+#stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
+stanford_parser_package_zip_name="stanford-parser-full-2017-06-09.zip"
+[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]]
+stanford_parser_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_parser_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name" -o ${stanford_parser_package_zip_name}
+    # wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
+    unzip -q ${stanford_parser_package_zip_name}
+    rm ${stanford_parser_package_zip_name}
+    mv ${stanford_parser_package_name} 'stanford-parser'
+fi
+
+# Downloaded to ~/third/stanford-postagger
+#stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
+stanford_tagger_package_zip_name="stanford-postagger-full-2017-06-09.zip"
+[[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]]
+stanford_tagger_package_name=${BASH_REMATCH[1]}
+if [[ ! -d ${stanford_tagger_package_name} ]]; then
+    curl -L "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name" -o ${stanford_tagger_package_zip_name}
+    # wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
+    unzip -q ${stanford_tagger_package_zip_name}
+    rm ${stanford_tagger_package_zip_name}
+    mv ${stanford_tagger_package_name} 'stanford-postagger'
+fi
+
+# Download SENNA to ~/third/senna
+senna_file_name=$(curl -s 'https://ronan.collobert.com/senna/download.html' | grep -o 'senna-v.*.tgz' | head -n1)
+senna_folder_name='senna'
+if [[ ! -d $senna_folder_name ]]; then
+    curl -L "https://ronan.collobert.com/senna/$senna_file_name" -o ${senna_file_name}
+    # wget -nv "https://ronan.collobert.com/senna/$senna_file_name"
+    tar -xzf ${senna_file_name}
+    rm ${senna_file_name}
+fi
+
+# Download PROVER9 to ~/third/prover9
+prover9_file_name="p9m4-v05.tar.gz"
+[[ ${prover9_file_name} =~ (.+)\.tar\.gz ]]
+prover9_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${prover9_folder_name} ]]; then
+    curl -L "https://www.cs.unm.edu/~mccune/prover9/gui/$prover9_file_name" -o ${prover9_file_name}
+    tar -xzf ${prover9_file_name}
+    mv ${prover9_folder_name} 'prover9'
+    rm ${prover9_file_name}
+fi
+
+# Download MEGAM to ~/third/megam
+megam_file_name="megam_i686.opt.gz"
+[[ ${megam_file_name} =~ (.+)\.gz ]]
+megam_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${megam_folder_name} ]]; then
+    curl -L "http://hal3.name/megam/$megam_file_name" -o ${megam_file_name}
+    gunzip -vf ${megam_file_name}
+    mkdir -p "megam"
+    mv ${megam_folder_name} "megam/${megam_folder_name}"
+    chmod -R 711 "megam/$megam_folder_name"
+fi
+
+# TADM requires `libtaopetsc.so` from PETSc v2.3.3, and likely has more
+# tricky to install requirements, so we don't run tests for it.
+
+# Download TADM to ~/third/tadm
+# tadm_file_name="tadm-0.9.8.tgz"
+# [[ ${tadm_file_name} =~ (.+)\.tgz ]]
+# tadm_folder_name=${BASH_REMATCH[1]}
+# if [[ ! -d ${tadm_folder_name} ]]; then
+#     curl -L "https://master.dl.sourceforge.net/project/tadm/tadm/tadm%200.9.8/$tadm_file_name?viasf=1" -o ${tadm_file_name}
+#     tar -xvzf ${tadm_file_name}
+#     rm ${tadm_file_name}
+#     chmod -R 711 "./tadm/bin/tadm"
+# fi
+
+# Download MaltParser to ~/third/maltparser
+malt_file_name="maltparser-1.7.2.tar.gz"
+[[ ${malt_file_name} =~ (.+)\.tar\.gz ]]
+malt_folder_name=${BASH_REMATCH[1]}
+if [[ ! -d ${malt_folder_name} ]]; then
+    curl -L "http://maltparser.org/dist/$malt_file_name" -o ${malt_file_name}
+    tar -xzf ${malt_file_name}
+    mv ${malt_folder_name} 'maltparser'
+    rm ${malt_file_name}
+fi
+
+ls ~/third
+
+popd
+popd
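
Every download block in third-party.sh is guarded by a [[ ! -d ... ]] check, so re-running the script over a restored cache is a no-op. A hedged sanity check, not part of the patch, that the installed tools are visible to NLTK through the ci.yaml env block; it assumes the directory layout produced by the script above, and each lookup raises LookupError with a download hint if its tool is missing:

    from nltk.internals import find_binary, find_jar

    print(find_binary("prover9", env_vars=["PROVER9"]))
    # megam's executable keeps its download name, so list the candidates:
    print(find_binary("megam", env_vars=["MEGAM"],
                      binary_names=["megam", "megam_i686.opt"]))
    # the 2017-06-09 tagger distribution ships a stanford-postagger.jar:
    print(find_jar("stanford-postagger.jar", env_vars=["STANFORD_POSTAGGER"]))
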