Merge branch 'develop' of https://github.com/nltk/nltk into pr/2897
tomaarsen committed Dec 6, 2021
2 parents a32c19f + 1892214 commit a56ed42
Showing 9 changed files with 77 additions and 14 deletions.
2 changes: 1 addition & 1 deletion nltk/__init__.py
@@ -13,7 +13,7 @@
 Steven Bird, Ewan Klein, and Edward Loper (2009).
 Natural Language Processing with Python. O'Reilly Media Inc.
-https://www.nltk.org/book
+https://www.nltk.org/book/
 isort:skip_file
 """
13 changes: 10 additions & 3 deletions nltk/corpus/__init__.py
@@ -361,9 +361,16 @@
     WordNetCorpusReader,
     LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
 )
-wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None)
-# wordnet2021 is scheduled for release in 2021 :)
-# wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None)
+wordnet31 = LazyCorpusLoader(
+    "wordnet31",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
+wordnet2021 = LazyCorpusLoader(
+    "wordnet2021",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
 wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
 words = LazyCorpusLoader(
     "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
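Once the corresponding data packages are installed, the new loaders behave like any other lazily loaded corpus. A minimal usage sketch (it assumes a "wordnet2021" package is published on the NLTK data index, which was not yet the case when this commit landed):

    import nltk
    nltk.download("wordnet2021")  # hypothetical package id; hedged assumption

    from nltk.corpus import wordnet2021

    # The LazyCorpusLoader materializes a WordNetCorpusReader on first use:
    print(wordnet2021.synsets("dog")[:3])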
62 changes: 59 additions & 3 deletions nltk/corpus/reader/wordnet.py
@@ -31,6 +31,7 @@
 """
 
 import math
+import os
 import re
 import warnings
 from collections import defaultdict, deque
@@ -1175,6 +1176,55 @@ def __init__(self, root, omw_reader):
         # load the exception file data into memory
         self._load_exception_map()
 
+        # map from WordNet 3.0 for OMW data
+        self.map30 = self.map_wn30()
+
+    def corpus2sk(self, corpus=None):
+        """Read sense key to synset id mapping,
+        from index.sense file in corpus directory"""
+        fn = "index.sense"
+        if corpus:
+            fn = os.path.join(os.pardir, corpus, fn)
+        fp = self.open(fn)
+        sk_map = {}
+        for line in fp:
+            items = line.strip().split(" ")
+            sk = items[0]
+            pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
+            sk_map[sk] = f"{items[1]}-{pos}"
+        fp.close()
+        return sk_map
+
+    def map_wn30(self):
+        """Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
+        if self.get_version() == "3.0":
+            return None
+        # warnings.warn(f"Mapping WN v. 3.0 to Wordnet v. {self.version}")
+        sk1 = self.corpus2sk("wordnet")
+        sk2 = self.corpus2sk()
+
+        skmap = {}
+        for sk in set(sk1.keys()).intersection(set(sk2.keys())):
+            of1 = sk1[sk]
+            of2 = sk2[sk]
+            if of1 not in skmap.keys():
+                skmap[of1] = [of2]
+            else:
+                skmap[of1].append(of2)
+
+        map30 = {}
+        for of in skmap.keys():
+            candidates = skmap[of]
+            # map to candidate that covers most lemmas:
+            of2 = max((candidates.count(x), x) for x in set(candidates))[1]
+            # warnings.warn(f"Map {of} {of2}")
+            map30[of] = of2
+            if of[-1] == "s":
+                # Add a mapping from "a" for applications like omw,
+                # which don't use the "s" ss_type:
+                map30[of[:-1] + "a"] = of2
+        return map30
+
     # Open Multilingual WordNet functions, contributed by
     # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
 
@@ -1205,7 +1255,6 @@ def _load_lang_data(self, lang):
 
     def langs(self):
         """return a list of languages supported by Multilingual Wordnet"""
-        import os
 
         langs = ["eng"]
         fileids = self._omw_reader.fileids()
@@ -1337,7 +1386,6 @@ def lemma_from_key(self, key):
             raise WordNetError("No synset found for key %r" % key)
         offset = int(synset_line.split()[1])
         synset = self.synset_from_pos_and_offset(pos, offset)
-
         # return the corresponding lemma
         for lemma in synset._lemmas:
             if lemma._key == key:
@@ -1595,7 +1643,7 @@ def synset_from_sense_key(self, sense_key):
         >>> print(wn.synset_from_sense_key("driving%1:04:03::"))
         Synset('drive.n.06')
         """
-        return self.lemma_from_key(sense_key).synset()
+        return lemma_from_key(self, key).synset()
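Note that the replacement line references `key`, which is not defined in this scope (the parameter is `sense_key`), and calls `lemma_from_key` without `self.`, so as written it raises a NameError at runtime. The intended form is presumably:

    return self.lemma_from_key(sense_key).synset()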
 
     #############################################################
     # Retrieve synsets and lemmas.
@@ -2051,6 +2099,14 @@ def custom_lemmas(self, tab_file, lang):
             if not line.startswith("#"):
                 offset_pos, lemma_type, lemma = line.strip().split("\t")
                 lemma = lemma.strip().replace(" ", "_")
+                if self.map30:
+                    if offset_pos in self.map30.keys():
+                        # Map offset_pos to current Wordnet version:
+                        offset_pos = self.map30[offset_pos]
+                    else:
+                        # Synsets with no mapping keep their Wordnet 3.0 offset
+                        # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
+                        pass
                 self._lang_data[lang][0][offset_pos].append(lemma)
                 self._lang_data[lang][1][lemma.lower()].append(offset_pos)
             # Make sure no more entries are accidentally added subsequently
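The net effect in custom_lemmas is that OMW tab files, which are keyed to WordNet 3.0 offsets, are transparently remapped before the lemma data is stored. A minimal sketch with a hypothetical mapping entry:

    # Hypothetical map30 entry; the real mapping is built by map_wn30().
    map30 = {"02084071-n": "02086723-n"}

    # One line of an OMW wn-data tab file: "<offset-pos>\t<type>\t<lemma>"
    line = "02084071-n\tslv:lemma\tpes"
    offset_pos, lemma_type, lemma = line.strip().split("\t")
    offset_pos = map30.get(offset_pos, offset_pos)  # unmapped synsets keep the 3.0 offset
    print(offset_pos, lemma)  # 02086723-n pes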
2 changes: 1 addition & 1 deletion nltk/decorators.py
@@ -20,7 +20,7 @@
 # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
 # the Python standard library.
 OLD_SYS_PATH = sys.path[:]
-sys.path = [p for p in sys.path if p and "nltk" not in p]
+sys.path = [p for p in sys.path if p and "nltk" not in str(p)]
 import inspect
 
 sys.path = OLD_SYS_PATH
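The str() coercion matters because sys.path entries are not guaranteed to be strings; import hooks and test tools sometimes insert pathlib.Path objects, and a substring test against a Path raises TypeError. A small demonstration:

    from pathlib import Path

    p = Path("/home/user/nltk")
    # "nltk" not in p  would raise TypeError: argument of type 'PosixPath' is not iterable
    print("nltk" not in str(p))  # False, so this entry is correctly filtered out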
2 changes: 1 addition & 1 deletion nltk/test/corpus.doctest
@@ -94,7 +94,7 @@ If the reader methods are called without any arguments, they will
 typically load all documents in the corpus.
 
     >>> len(inaugural.words())
-    149797
+    152901
 
 If a corpus contains a README file, it can be accessed with a ``readme()`` method:
 
2 changes: 1 addition & 1 deletion nltk/test/portuguese_en.doctest
@@ -7,7 +7,7 @@ Examples for Portuguese Processing
 
 This HOWTO contains a variety of examples relating to the Portuguese language.
 It is intended to be read in conjunction with the NLTK book
-(``https://www.nltk.org/book``). For instructions on running the Python
+(``https://www.nltk.org/book/``). For instructions on running the Python
 interpreter, please see the section *Getting Started with Python*, in Chapter 1.
 
 --------------------------------------------
4 changes: 2 additions & 2 deletions nltk/tree/prettyprinter.py
@@ -75,7 +75,7 @@ def __init__(self, tree, sentence=None, highlight=()):
             leaves = tree.leaves()
             if (
                 leaves
-                and not any(len(a) == 0 for a in tree.subtrees())
+                and all(len(a) > 0 for a in tree.subtrees())
                 and all(isinstance(a, int) for a in leaves)
             ):
                 sentence = [str(a) for a in leaves]
@@ -291,7 +291,7 @@ def dumpmatrix():
                 matrix[rowidx][i] = ids[m]
                 nodes[ids[m]] = tree[m]
                 # add column to the set of children for its parent
-                if m != ():
+                if len(m) > 0:
                     childcols[m[:-1]].add((rowidx, i))
         assert len(positions) == 0
 
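Both rewrites are behavior-preserving: tree positions are tuples of child indices, with the root at the empty tuple, so `len(m) > 0` and `m != ()` test the same thing. For example:

    from nltk.tree import Tree

    t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    # Positions are tuples; the root is (), so len(m) > 0 means "not the root":
    print(t.treepositions()[:4])  # [(), (0,), (0, 0), (1,)]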
2 changes: 1 addition & 1 deletion web/index.rst
@@ -15,7 +15,7 @@ NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free,
 NLTK has been called "a wonderful tool for teaching, and working in, computational linguistics using Python,"
 and "an amazing library to play with natural language."
 
-`Natural Language Processing with Python <https://www.nltk.org/book>`_ provides a practical
+`Natural Language Processing with Python <https://www.nltk.org/book/>`_ provides a practical
 introduction to programming for language processing.
 Written by the creators of NLTK, it guides the reader through the fundamentals
 of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure,
2 changes: 1 addition & 1 deletion web/news.rst
@@ -172,7 +172,7 @@ NLTK 3.0.0b2 released: August 2014
     Minor bugfixes and clean-ups.
 
 NLTK Book Updates: July 2014
-    The NLTK book is being updated for Python 3 and NLTK 3 `here <https://www.nltk.org/book>`__.
+    The NLTK book is being updated for Python 3 and NLTK 3 `here <https://www.nltk.org/book/>`__.
     The original Python 2 edition is still available `here <https://www.nltk.org/book_1ed>`__.
 
 NLTK 3.0.0b1 released: July 2014
