From 4862b094ae4a9aa04396e06335ae52f7920e48c7 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Tue, 21 Dec 2021 11:24:52 +0930 Subject: [PATCH 1/6] updates for 3.6.6 --- ChangeLog | 41 +++++++++++++++++++++++++++++++++++++++++ nltk/VERSION | 2 +- web/conf.py | 4 ++-- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8e4a11569f..8a9ac030b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,44 @@ +Version 3.6.6 2021-12-21 + +* Refactor `gensim.doctest` to work for gensim 4.0.0 and up (#2914) +* Add Precision, Recall, F-measure, Confusion Matrix to Taggers (#2862) +* Added warnings if .zip files exist without any corresponding .csv files. (#2908) +* Fix `FileNotFoundError` when the `download_dir` is a non-existing nested folder (#2910) +* Rename omw to omw-1.4 (#2907) +* Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906) +* Support OMW 1.4 (#2899) +* Deprecate Tree get and set node methods (#2900) +* Fix broken inaugural test case (#2903) +* Use Multilingual Wordnet Data from OMW with newer Wordnet versions (#2889) +* Keep NLTKs "tokenize" module working with pathlib (#2896) +* Make prettyprinter to be more readable (#2893) +* Update links to the nltk book (#2895) +* Add `CITATION.cff` to nltk (#2880) +* Resolve serious ReDoS in PunktSentenceTokenizer (#2869) +* Delete old CI config files (#2881) +* Improve Tokenize documentation + add TokenizerI as superclass for TweetTokenizer (#2878) +* Fix expected value for BLEU score doctest after changes from #2572 +* Add multi Bleu functionality and tests (#2793) +* Deprecate 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer (#2883) +* Allow empty string in CFG's + more (#2888) +* Partition `tree.py` module into `tree` package + pickle fix (#2863) +* Fix several TreebankWordTokenizer and NLTKWordTokenizer bugs (#2877) +* Rewind Wordnet data file after each lookup (#2868) +* Correct __init__ call for SyntaxCorpusReader subclasses (#2872) +* 
Documentation fixes (#2873) +* Fix Levenshtein distance for duplicated letters (#2849) +* Support alternative Wordnet versions (#2860) +* Remove hundreds of formatting warnings for nltk.org (#2859) +* Modernize `nltk.org/howto` pages (#2856) +* Fix Bleu Score smoothing function from taking log(0) (#2839) +* Update third party tools to newer versions and remove MaltParser fixed version (#2832) +* Fix TypeError: _pretty() takes 1 positional argument but 2 were given in sem/drt.py (#2854) +* Replace `http` with `https` in most URLs (#2852) + +Thanks to the following contributors to 3.6.6 +Adam Hawley, BatMrE, Danny Sepler, Eric Kafe, Gavish Poddar, Panagiotis Simakis, +RnDevelover, Robby Horvath, Tom Aarsen, Yuta Nakamura, Mohaned Mashaly + Version 3.6.5 2021-10-11 * modernised nltk.org website diff --git a/nltk/VERSION b/nltk/VERSION index d15b8b06fa..4f2c1d15f6 100644 --- a/nltk/VERSION +++ b/nltk/VERSION @@ -1 +1 @@ -3.6.5 +3.6.6 diff --git a/web/conf.py b/web/conf.py index 6391b105a8..41781ce01f 100644 --- a/web/conf.py +++ b/web/conf.py @@ -115,9 +115,9 @@ def generate_howto(): # built documents. # # The short X.Y version. -version = "3.6.5" +version = "3.6.6" # The full version, including alpha/beta/rc tags. -release = "3.6.5" +release = "3.6.6" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 4f1a1803be2ae8d3eb551470fee51a5519ffcbb1 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Tue, 21 Dec 2021 11:44:32 +0930 Subject: [PATCH 2/6] minor clean ups --- Makefile | 3 ++- RELEASE-HOWTO.txt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7446b7f620..ca6471d5fb 100644 --- a/Makefile +++ b/Makefile @@ -51,10 +51,11 @@ windist: clean_code ######################################################################## clean: clean_code - rm -rf build iso dist api MANIFEST nltk-$(VERSION) nltk.egg-info + rm -rf build web/_build iso dist api MANIFEST nltk-$(VERSION) nltk.egg-info clean_code: rm -f `find nltk -name '*.pyc'` rm -f `find nltk -name '*.pyo'` rm -f `find . -name '*~'` + rm -rf `find . -name '__pycache__'` rm -f MANIFEST # regenerate manifest from MANIFEST.in diff --git a/RELEASE-HOWTO.txt b/RELEASE-HOWTO.txt index d40985c7a6..cc69d9e665 100644 --- a/RELEASE-HOWTO.txt +++ b/RELEASE-HOWTO.txt @@ -33,7 +33,7 @@ Building an NLTK distribution - Rebuild the API docs sphinx-build -E ./web ./build - Publish them - cd nltk.github.com; git pull (begin with current docs repo) + cd ../nltk.github.com; git pull (begin with current docs repo) cp -r ../nltk/build/* . git add . git commit -m "updates for version 3.X.Y" @@ -41,6 +41,7 @@ Building an NLTK distribution 4. 
Create a new version - Tag this version: + cd ../nltk git tag -a 3.X.Y -m "version 3.X.Y" git push --tags verify that it shows up here: https://github.com/nltk/nltk/releases From 67a09106cfa5d592b50be1505bba0e0f00ca2872 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Tue, 21 Dec 2021 11:53:53 +0930 Subject: [PATCH 3/6] updates for 3.6.6 --- web/news.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/web/news.rst b/web/news.rst index d72db6d0cb..404cac8341 100644 --- a/web/news.rst +++ b/web/news.rst @@ -4,6 +4,23 @@ Release Notes 2021 ---- +NLTK 3.6.6 release: December 2021: + add precision, recall, F-measure, confusion matrix to Taggers + support alternative Wordnet versions (#2860) + support OMW 1.4, use Multilingual Wordnet Data from OMW with newer Wordnet versions + add multi Bleu functionality + allow empty string in CFG's + more + fix several TreebankWordTokenizer and NLTKWordTokenizer bugs + fix Levenshtein distance for duplicated letters + modernize `nltk.org/howto` pages + update third party tools to newer versions + +NLTK 3.6.5 release: October 2021: + support emoji ZWJ sequences and skin tone modifiers in TweetTokenizer + METEOR evaluation now requires pre-tokenized input + code linting and type hinting + avoid re.Pattern and regex.Pattern which fail for Python 3.6, 3.7 + NLTK 3.6.4 release: October 2021 improved phone number recognition in tweet tokenizer resolved ReDoS vulnerability in Corpus Reader From dd1494ea0b88ea92a95034777863828e6552fde2 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Tue, 21 Dec 2021 03:33:26 +0100 Subject: [PATCH 4/6] Drop support for Python 3.6, support Python 3.10 (#2920) * run CI for python 3.10 * Deprecate Python 3.6, add Python 3.10 support in documentation Co-authored-by: purificant --- .github/workflows/ci.yaml | 2 +- .pre-commit-config.yaml | 2 +- CONTRIBUTING.md | 4 ++-- README.md | 2 +- nltk/__init__.py | 4 ++-- setup.py | 6 +++--- tox.ini | 28 
++++++++++------------------ web/dev/local_testing.rst | 6 +++--- web/install.rst | 2 +- 9 files changed, 24 insertions(+), 32 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5fafe82176..932a4c3464 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -76,7 +76,7 @@ jobs: needs: [cache_nltk_data, cache_third_party] strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ['3.7', '3.8', '3.9', '3.10'] os: [ubuntu-latest, macos-latest, windows-latest] fail-fast: false runs-on: ${{ matrix.os }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 312771114f..c8ac544150 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: rev: v2.23.3 hooks: - id: pyupgrade - args: ["--py36-plus"] + args: ["--py37-plus"] - repo: https://github.com/ambv/black rev: 21.7b0 hooks: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e0d667ea80..7ce10d5f56 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,7 +77,7 @@ Summary of our git branching model: - Do many small commits on that branch locally (`git add files-changed`, `git commit -m "Add some change"`); - Run the tests to make sure nothing breaks - (`tox -e py36` if you are on Python 3.6); + (`tox -e py37` if you are on Python 3.7); - Add your name to the `AUTHORS.md` file as a contributor; - Push to your fork on GitHub (with the name as your local branch: `git push origin branch-name`); @@ -169,7 +169,7 @@ The [`.github/workflows/ci.yaml`](https://github.com/nltk/nltk/blob/develop/.git - Otherwise, download all the data packages through `nltk.download('all')`. - The `test` job - - tests against supported Python versions (`3.6`, `3.7`, `3.8`, `3.9`). + - tests against supported Python versions (`3.7`, `3.8`, `3.9`). - tests on `ubuntu-latest` and `macos-latest`. - relies on the `cache_nltk_data` job to ensure that `nltk_data` is available. 
- performs these steps: diff --git a/README.md b/README.md index 156d60eaf2..e420af5ff8 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ NLTK -- the Natural Language Toolkit -- is a suite of open source Python modules, data sets, and tutorials supporting research and development in Natural -Language Processing. NLTK requires Python version 3.6, 3.7, 3.8, or 3.9. +Language Processing. NLTK requires Python version 3.7, 3.8, 3.9 or 3.10. For documentation, please visit [nltk.org](https://www.nltk.org/). diff --git a/nltk/__init__.py b/nltk/__init__.py index 9573e73791..5c7870f7c2 100644 --- a/nltk/__init__.py +++ b/nltk/__init__.py @@ -52,7 +52,7 @@ # Description of the toolkit, keywords, and the project's primary URL. __longdescr__ = """\ The Natural Language Toolkit (NLTK) is a Python package for -natural language processing. NLTK requires Python 3.6, 3.7, 3.8, or 3.9.""" +natural language processing. NLTK requires Python 3.7, 3.8, 3.9 or 3.10.""" __keywords__ = [ "NLP", "CL", @@ -84,10 +84,10 @@ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Human Machine Interfaces", diff --git a/setup.py b/setup.py index d249118a2f..f64003fe95 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ }, long_description="""\ The Natural Language Toolkit (NLTK) is a Python package for -natural language processing. NLTK requires Python 3.6, 3.7, 3.8, or 3.9.""", +natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9 or 3.10.""", license="Apache License, Version 2.0", keywords=[ "NLP", @@ -95,10 +95,10 @@ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Human Machine Interfaces", @@ -110,7 +110,7 @@ "Topic :: Text Processing :: Linguistic", ], package_data={"nltk": ["test/*.doctest", "VERSION"]}, - python_requires=">=3.6", + python_requires=">=3.7", install_requires=[ "click", "joblib", diff --git a/tox.ini b/tox.ini index b97d416505..68bfd32a66 100644 --- a/tox.ini +++ b/tox.ini @@ -1,9 +1,9 @@ [tox] envlist = - py{36,37,38,39} + py{37,38,39,310} pypy - py{36,37,38,39}-nodeps - py{36,37,38,39}-jenkins + py{37,38,39,310}-nodeps + py{37,38,39,310}-jenkins py-travis [testenv] @@ -51,13 +51,6 @@ deps = commands = pytest -[testenv:py36-nodeps] -basepython = python3.6 -deps = - pytest - pytest-mock -commands = pytest - [testenv:py37-nodeps] basepython = python3.7 deps = @@ -79,18 +72,17 @@ deps = pytest-mock commands = pytest +[testenv:py310-nodeps] +basepython = python3.10 +deps = + pytest + pytest-mock +commands = pytest + # Use minor version agnostic basepython, but specify testenv # control Python2/3 versions using jenkins' user-defined matrix instead. 
# Available Python versions: http://repository-cloudbees.forge.cloudbees.com/distributions/ci-addons/python/fc25/ -[testenv:py3.6.4-jenkins] -basepython = python3 -commands = {toxinidir}/jenkins.sh -setenv = - STANFORD_MODELS = {homedir}/third/stanford-parser/ - STANFORD_PARSER = {homedir}/third/stanford-parser/ - STANFORD_POSTAGGER = {homedir}/third/stanford-postagger/ - [testenv:py-travis] extras = all setenv = diff --git a/web/dev/local_testing.rst b/web/dev/local_testing.rst index 8e39fc4b00..b66bfd4d61 100644 --- a/web/dev/local_testing.rst +++ b/web/dev/local_testing.rst @@ -25,10 +25,10 @@ Please consult https://tox.wiki for more info about the tox tool. Examples -------- -Run tests for python 3.6 in verbose mode; executing only tests +Run tests for python 3.7 in verbose mode; executing only tests that failed in the last test run:: - tox -e py36 -- -v --failed + tox -e py37 -- -v --failed Run tree doctests for all available interpreters:: @@ -42,7 +42,7 @@ By default, numpy, scipy and scikit-learn are installed in tox virtualenvs. This is slow, requires working build toolchain and is not always feasible. In order to skip numpy & friends, use ``..-nodeps`` environments:: - tox -e py36-nodeps,py37,pypy + tox -e py37-nodeps,py37,pypy It is also possible to run tests without tox. 
This way NLTK would be tested only under single interpreter, but it may be easier to have numpy and other diff --git a/web/install.rst b/web/install.rst index 248651b9de..866d8b540d 100644 --- a/web/install.rst +++ b/web/install.rst @@ -1,7 +1,7 @@ Installing NLTK =============== -NLTK requires Python versions 3.6, 3.7, 3.8, or 3.9 +NLTK requires Python versions 3.7, 3.8, 3.9 or 3.10 For Windows users, it is strongly recommended that you go through this guide to install Python 3 successfully https://docs.python-guide.org/starting/install3/win/#install3-windows From d4d99b49a9903a6d0004d64c00409e58797c0c94 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Tue, 21 Dec 2021 15:08:34 +0100 Subject: [PATCH 5/6] Resolve IndexError in `sent_tokenize` (#2922) * Prevent IndexError if input starts with an endline character * Add doctest for Punkt sent_tokenize issue --- nltk/test/tokenize.doctest | 5 +++++ nltk/tokenize/punkt.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index ec980711b9..53e8ece914 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -310,6 +310,11 @@ Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067 >>> type(pst._lang_vars) +Testing that inputs can start with dots. + + >>> pst = PunktSentenceTokenizer(lang_vars=None) + >>> pst.tokenize(". This input starts with a dot. 
This used to cause issues.") + ['.', 'This input starts with a dot.', 'This used to cause issues.'] Regression Tests: align_tokens ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py index 54937b9ecd..19fdf31f63 100644 --- a/nltk/tokenize/punkt.py +++ b/nltk/tokenize/punkt.py @@ -1379,7 +1379,7 @@ def _match_potential_end_contexts(self, text): # Find the word before the current match split = text[: match.start()].rsplit(maxsplit=1) before_start = len(split[0]) if len(split) == 2 else 0 - before_words[match] = split[-1] + before_words[match] = split[-1] if split else "" matches.append(match) return [ From 177614cd22afb7fc2eebe0334f37047fad19dc96 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 21 Dec 2021 15:29:31 +0100 Subject: [PATCH 6/6] Temporarily pause Python 3.10 CI tests due to scikit-learn issues with Windows Scikit-learn is planning to add Python 3.10 support in the middle of December 2021, according to scikit-learn/scikit-learn#21882 --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 932a4c3464..c44ef76af8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -76,7 +76,7 @@ jobs: needs: [cache_nltk_data, cache_third_party] strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9'] os: [ubuntu-latest, macos-latest, windows-latest] fail-fast: false runs-on: ${{ matrix.os }}