From ca7950c6affdb7eb7538b9e97eaadcd9d7959d17 Mon Sep 17 00:00:00 2001
From: Danny Sepler
Date: Sun, 28 Nov 2021 08:59:27 -0500
Subject: [PATCH 1/5] update links to the nltk book (#2895)

---
 nltk/__init__.py                | 2 +-
 nltk/test/portuguese_en.doctest | 2 +-
 web/index.rst                   | 2 +-
 web/news.rst                    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nltk/__init__.py b/nltk/__init__.py
index a96ac22b25..9573e73791 100644
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -13,7 +13,7 @@
 Steven Bird, Ewan Klein, and Edward Loper (2009).
 Natural Language Processing with Python.  O'Reilly Media Inc.
-https://www.nltk.org/book
+https://www.nltk.org/book/

 isort:skip_file
 """
diff --git a/nltk/test/portuguese_en.doctest b/nltk/test/portuguese_en.doctest
index e490d4cee5..41c0da1b31 100644
--- a/nltk/test/portuguese_en.doctest
+++ b/nltk/test/portuguese_en.doctest
@@ -7,7 +7,7 @@ Examples for Portuguese Processing

 This HOWTO contains a variety of examples relating to the Portuguese language.
 It is intended to be read in conjunction with the NLTK book
-(``https://www.nltk.org/book``).  For instructions on running the Python
+(``https://www.nltk.org/book/``).  For instructions on running the Python
 interpreter, please see the section *Getting Started with Python*, in Chapter 1.
 --------------------------------------------
diff --git a/web/index.rst b/web/index.rst
index 4481c553c8..ed55f1be14 100644
--- a/web/index.rst
+++ b/web/index.rst
@@ -15,7 +15,7 @@ NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free,
 NLTK has been called "a wonderful tool for teaching, and working in, computational linguistics
 using Python," and "an amazing library to play with natural language."

-`Natural Language Processing with Python `_ provides a practical
+`Natural Language Processing with Python `_ provides a practical
 introduction to programming for language processing.  Written by the creators of NLTK,
 it guides the reader through the fundamentals of writing Python programs,
 working with corpora, categorizing text, analyzing linguistic structure,
diff --git a/web/news.rst b/web/news.rst
index 5acc8d8d43..d72db6d0cb 100644
--- a/web/news.rst
+++ b/web/news.rst
@@ -172,7 +172,7 @@ NLTK 3.0.0b2 released: August 2014
     Minor bugfixes and clean-ups.

 NLTK Book Updates: July 2014
-    The NLTK book is being updated for Python 3 and NLTK 3 `here `__.
+    The NLTK book is being updated for Python 3 and NLTK 3 `here `__.
     The original Python 2 edition is still available `here `__.

 NLTK 3.0.0b1 released: July 2014

From de27a0534441686f666ea33b4c38481706dc7ce8 Mon Sep 17 00:00:00 2001
From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com>
Date: Sun, 28 Nov 2021 23:16:42 +0200
Subject: [PATCH 2/5] refactor: refactor prettyprinter to be more readable (#2893)

* refactor: refactor prettyprinter to be more readable

* fix tests

* fix tests

* refactor to old if condition

* replace all with any
---
 nltk/tree/prettyprinter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nltk/tree/prettyprinter.py b/nltk/tree/prettyprinter.py
index 2ce106e055..24337ae329 100644
--- a/nltk/tree/prettyprinter.py
+++ b/nltk/tree/prettyprinter.py
@@ -75,7 +75,7 @@ def __init__(self, tree, sentence=None, highlight=()):
             leaves = tree.leaves()
             if (
                 leaves
-                and not any(len(a) == 0 for a in tree.subtrees())
+                and all(len(a) > 0 for a in tree.subtrees())
                 and all(isinstance(a, int) for a in leaves)
             ):
                 sentence = [str(a) for a in leaves]
@@ -291,7 +291,7 @@ def dumpmatrix():
             matrix[rowidx][i] = ids[m]
             nodes[ids[m]] = tree[m]
             # add column to the set of children for its parent
-            if m != ():
+            if len(m) > 0:
                 childcols[m[:-1]].add((rowidx, i))

         assert len(positions) == 0

From 43187f2d5fc6e173f4570cee83c7a3080b237c99 Mon Sep 17 00:00:00 2001
From: Danny Sepler
Date: Sun, 28 Nov 2021 16:18:35 -0500
Subject: [PATCH 3/5] Make hack to keep NLTKs "tokenize" module work with pathlib (#2896)

---
 nltk/decorators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/decorators.py b/nltk/decorators.py
index 7687224a23..d64fe0e1ec 100644
--- a/nltk/decorators.py
+++ b/nltk/decorators.py
@@ -20,7 +20,7 @@
 # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
 # the Python standard library.
 OLD_SYS_PATH = sys.path[:]
-sys.path = [p for p in sys.path if p and "nltk" not in p]
+sys.path = [p for p in sys.path if p and "nltk" not in str(p)]
 import inspect

 sys.path = OLD_SYS_PATH
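
Note on PATCH 3/5: the str(p) change above matters because entries on sys.path are
not guaranteed to be plain strings; some tools insert pathlib.Path objects, and a
substring test like "nltk" in p raises TypeError for a Path. A minimal sketch of
the difference, independent of NLTK (the path used here is a made-up example):

    from pathlib import Path

    p = Path("/opt/venv/lib/python3.9/site-packages/nltk")

    # "nltk" in p would raise TypeError, because Path objects do not support
    # substring membership tests. Converting to str first handles both str
    # and Path entries on sys.path.
    print("nltk" in str(p))  # True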

From f50b6b1023689bb49c0f0a76f907e035f4360b12 Mon Sep 17 00:00:00 2001
From: Eric Kafe
Date: Mon, 29 Nov 2021 09:07:24 +0100
Subject: [PATCH 4/5] Use Multilingual Wordnet Data from OMW with newer Wordnet versions (#2889)

* Map Wordnet 3.0 to newer Wordnets for OMW compatibility

* Use Multilingual Wordnets with Wordnet 3.1

* Add support for Wordnet 2021

* Use max instead of sorted
---
 nltk/corpus/__init__.py       | 13 ++++++--
 nltk/corpus/reader/wordnet.py | 62 +++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index d348810a4f..14b1ea23ef 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -361,9 +361,16 @@ WordNetCorpusReader,
     LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
 )
-wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None)
-# wordnet2021 is scheduled for release in 2021 :)
-# wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None)
+wordnet31 = LazyCorpusLoader(
+    "wordnet31",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
+wordnet2021 = LazyCorpusLoader(
+    "wordnet2021",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
 wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
 words = LazyCorpusLoader(
     "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
 )
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index 3b49d8223a..0b0011230d 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -31,6 +31,7 @@
 """

 import math
+import os
 import re
 import warnings
 from collections import defaultdict, deque
@@ -1175,6 +1176,55 @@ def __init__(self, root, omw_reader):
         # load the exception file data into memory
         self._load_exception_map()

+        # map from WordNet 3.0 for OMW data
+        self.map30 = self.map_wn30()
+
+    def corpus2sk(self, corpus=None):
+        """Read sense key to synset id mapping,
+        from index.sense file in corpus directory"""
+        fn = "index.sense"
+        if corpus:
+            fn = os.path.join(os.pardir, corpus, fn)
+        fp = self.open(fn)
+        sk_map = {}
+        for line in fp:
+            items = line.strip().split(" ")
+            sk = items[0]
+            pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
+            sk_map[sk] = f"{items[1]}-{pos}"
+        fp.close()
+        return sk_map
+
+    def map_wn30(self):
+        """Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
+        if self.get_version() == "3.0":
+            return None
+        # warnings.warn(f"Mapping WN v. 3.0 to Wordnet v. {self.version}")
+        sk1 = self.corpus2sk("wordnet")
+        sk2 = self.corpus2sk()
+
+        skmap = {}
+        for sk in set(sk1.keys()).intersection(set(sk2.keys())):
+            of1 = sk1[sk]
+            of2 = sk2[sk]
+            if of1 not in skmap.keys():
+                skmap[of1] = [of2]
+            else:
+                skmap[of1].append(of2)
+
+        map30 = {}
+        for of in skmap.keys():
+            candidates = skmap[of]
+            # map to candidate that covers most lemmas:
+            of2 = max((candidates.count(x), x) for x in set(candidates))[1]
+            # warnings.warn(f"Map {of} {of2}")
+            map30[of] = of2
+            if of[-1] == "s":
+                # Add a mapping from "a" for applications like omw,
+                # which don't use the "s" ss_type:
+                map30[of[:-1] + "a"] = of2
+        return map30
+
     # Open Multilingual WordNet functions, contributed by
     # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn

@@ -1205,7 +1255,6 @@ def _load_lang_data(self, lang):

     def langs(self):
         """return a list of languages supported by Multilingual Wordnet"""
-        import os

         langs = ["eng"]
         fileids = self._omw_reader.fileids()
@@ -1337,7 +1386,6 @@ def lemma_from_key(self, key):
             raise WordNetError("No synset found for key %r" % key)
         offset = int(synset_line.split()[1])
         synset = self.synset_from_pos_and_offset(pos, offset)
-
         # return the corresponding lemma
         for lemma in synset._lemmas:
             if lemma._key == key:
@@ -1595,7 +1643,7 @@ def synset_from_sense_key(self, sense_key):
         >>> print(wn.synset_from_sense_key("driving%1:04:03::"))
         Synset('drive.n.06')
         """
-        return self.lemma_from_key(sense_key).synset()
+        return lemma_from_key(self, key).synset()

     #############################################################
     # Retrieve synsets and lemmas.
@@ -2051,6 +2099,14 @@ def custom_lemmas(self, tab_file, lang):
             if not line.startswith("#"):
                 offset_pos, lemma_type, lemma = line.strip().split("\t")
                 lemma = lemma.strip().replace(" ", "_")
+                if self.map30:
+                    if offset_pos in self.map30.keys():
+                        # Map offset_pos to current Wordnet version:
+                        offset_pos = self.map30[offset_pos]
+                    else:
+                        # Synsets with no mapping keep their Wordnet 3.0 offset
+                        # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
+                        pass
                 self._lang_data[lang][0][offset_pos].append(lemma)
                 self._lang_data[lang][1][lemma.lower()].append(offset_pos)
         # Make sure no more entries are accidentally added subsequently
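
Note on PATCH 4/5: map_wn30() collects, for each WordNet 3.0 synset offset, the
offsets of the currently loaded WordNet that share sense keys with it, and keeps
the candidate that occurs most often. A small self-contained sketch of that
selection idiom (the offsets below are made-up placeholders, not real WordNet
data):

    # Candidate offsets gathered for one WordNet 3.0 synset; a repeated value
    # means another shared sense key pointed at the same target synset.
    candidates = ["02084071-n", "02084071-n", "02085374-n"]

    # Same idiom as in map_wn30(): keep the candidate with the highest count,
    # breaking ties by the lexicographically larger offset.
    best = max((candidates.count(x), x) for x in set(candidates))[1]
    print(best)  # -> 02084071-n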

From 189221495b3f52d4480e8e438b4c9c61b396223e Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Mon, 6 Dec 2021 12:35:05 +0100
Subject: [PATCH 5/5] Fixed broken inaugural test case (#2903)

---
 nltk/test/corpus.doctest | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index 9f283d2c0d..533970aff4 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -94,7 +94,7 @@ If the reader methods are called without any arguments, they will typically
 load all documents in the corpus.

     >>> len(inaugural.words())
-    149797
+    152901

 If a corpus contains a README file, it can be accessed with a ``readme()``
 method:
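
Note on PATCH 5/5: the expected token count changes because the packaged inaugural
corpus data was updated (newer data also includes the 2021 address). A quick local
check, assuming NLTK is installed and the corpus has been fetched with
nltk.download("inaugural"); the exact figure tracks the installed data version:

    from nltk.corpus import inaugural

    # Prints 152901 with current corpus data; older data gave 149797.
    print(len(inaugural.words()))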