Merge branch 'develop' of https://github.com/nltk/nltk into pr/2897
tomaarsen committed Dec 6, 2021
2 parents a32c19f + 1892214 commit a56ed42
Showing 9 changed files with 77 additions and 14 deletions.
2 changes: 1 addition & 1 deletion nltk/__init__.py
@@ -13,7 +13,7 @@
 Steven Bird, Ewan Klein, and Edward Loper (2009).
 Natural Language Processing with Python. O'Reilly Media Inc.
-https://www.nltk.org/book
+https://www.nltk.org/book/
 isort:skip_file
 """
13 changes: 10 additions & 3 deletions nltk/corpus/__init__.py
@@ -361,9 +361,16 @@
     WordNetCorpusReader,
     LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
 )
-wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None)
-# wordnet2021 is scheduled for release in 2021 :)
-# wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None)
+wordnet31 = LazyCorpusLoader(
+    "wordnet31",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
+wordnet2021 = LazyCorpusLoader(
+    "wordnet2021",
+    WordNetCorpusReader,
+    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+)
 wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
 words = LazyCorpusLoader(
     "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
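Once the corresponding data packages are installed, the new loaders behave like any other lazily loaded corpus. A minimal usage sketch (it assumes a "wordnet2021" package is published on the NLTK data index, which was not yet the case when this commit landed):

    import nltk
    nltk.download("wordnet2021")  # hypothetical package id; hedged assumption

    from nltk.corpus import wordnet2021

    # The LazyCorpusLoader materializes a WordNetCorpusReader on first use:
    print(wordnet2021.synsets("dog")[:3])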
62 changes: 59 additions & 3 deletions nltk/corpus/reader/wordnet.py
@@ -31,6 +31,7 @@
 """
 
 import math
+import os
 import re
 import warnings
 from collections import defaultdict, deque
@@ -1175,6 +1176,55 @@ def __init__(self, root, omw_reader):
         # load the exception file data into memory
         self._load_exception_map()
 
+        # map from WordNet 3.0 for OMW data
+        self.map30 = self.map_wn30()
+
+    def corpus2sk(self, corpus=None):
+        """Read sense key to synset id mapping,
+        from index.sense file in corpus directory"""
+        fn = "index.sense"
+        if corpus:
+            fn = os.path.join(os.pardir, corpus, fn)
+        fp = self.open(fn)
+        sk_map = {}
+        for line in fp:
+            items = line.strip().split(" ")
+            sk = items[0]
+            pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
+            sk_map[sk] = f"{items[1]}-{pos}"
+        fp.close()
+        return sk_map
+
+    def map_wn30(self):
+        """Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
+        if self.get_version() == "3.0":
+            return None
+        # warnings.warn(f"Mapping WN v. 3.0 to Wordnet v. {self.version}")
+        sk1 = self.corpus2sk("wordnet")
+        sk2 = self.corpus2sk()
+
+        skmap = {}
+        for sk in set(sk1.keys()).intersection(set(sk2.keys())):
+            of1 = sk1[sk]
+            of2 = sk2[sk]
+            if of1 not in skmap.keys():
+                skmap[of1] = [of2]
+            else:
+                skmap[of1].append(of2)
+
+        map30 = {}
+        for of in skmap.keys():
+            candidates = skmap[of]
+            # map to candidate that covers most lemmas:
+            of2 = max((candidates.count(x), x) for x in set(candidates))[1]
+            # warnings.warn(f"Map {of} {of2}")
+            map30[of] = of2
+            if of[-1] == "s":
+                # Add a mapping from "a" for applications like omw,
+                # which don't use the "s" ss_type:
+                map30[of[:-1] + "a"] = of2
+        return map30
+
     # Open Multilingual WordNet functions, contributed by
     # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
 
@@ -1205,7 +1255,6 @@ def _load_lang_data(self, lang):
 
     def langs(self):
         """return a list of languages supported by Multilingual Wordnet"""
-        import os
 
         langs = ["eng"]
         fileids = self._omw_reader.fileids()
@@ -1337,7 +1386,6 @@ def lemma_from_key(self, key):
             raise WordNetError("No synset found for key %r" % key)
         offset = int(synset_line.split()[1])
         synset = self.synset_from_pos_and_offset(pos, offset)
-
         # return the corresponding lemma
         for lemma in synset._lemmas:
             if lemma._key == key:
@@ -1595,7 +1643,7 @@ def synset_from_sense_key(self, sense_key):
         >>> print(wn.synset_from_sense_key("driving%1:04:03::"))
         Synset('drive.n.06')
         """
-        return self.lemma_from_key(sense_key).synset()
+        return lemma_from_key(self, key).synset()
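Note that the replacement line references `key`, which is not defined in this scope (the parameter is `sense_key`), and calls `lemma_from_key` without `self.`, so as written it raises a NameError at runtime. The intended form is presumably:

    return self.lemma_from_key(sense_key).synset()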
 
     #############################################################
     # Retrieve synsets and lemmas.
@@ -2051,6 +2099,14 @@ def custom_lemmas(self, tab_file, lang):
             if not line.startswith("#"):
                 offset_pos, lemma_type, lemma = line.strip().split("\t")
                 lemma = lemma.strip().replace(" ", "_")
+                if self.map30:
+                    if offset_pos in self.map30.keys():
+                        # Map offset_pos to current Wordnet version:
+                        offset_pos = self.map30[offset_pos]
+                    else:
+                        # Synsets with no mapping keep their Wordnet 3.0 offset
+                        # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
+                        pass
                 self._lang_data[lang][0][offset_pos].append(lemma)
                 self._lang_data[lang][1][lemma.lower()].append(offset_pos)
             # Make sure no more entries are accidentally added subsequently
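The net effect in custom_lemmas is that OMW tab files, which are keyed to WordNet 3.0 offsets, are transparently remapped before the lemma data is stored. A minimal sketch with a hypothetical mapping entry:

    # Hypothetical map30 entry; the real mapping is built by map_wn30().
    map30 = {"02084071-n": "02086723-n"}

    # One line of an OMW wn-data tab file: "<offset-pos>\t<type>\t<lemma>"
    line = "02084071-n\tslv:lemma\tpes"
    offset_pos, lemma_type, lemma = line.strip().split("\t")
    offset_pos = map30.get(offset_pos, offset_pos)  # unmapped synsets keep the 3.0 offset
    print(offset_pos, lemma)  # 02086723-n pes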
2 changes: 1 addition & 1 deletion nltk/decorators.py
@@ -20,7 +20,7 @@
 # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
 # the Python standard library.
 OLD_SYS_PATH = sys.path[:]
-sys.path = [p for p in sys.path if p and "nltk" not in p]
+sys.path = [p for p in sys.path if p and "nltk" not in str(p)]
 import inspect
 
 sys.path = OLD_SYS_PATH
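The str() coercion matters because sys.path entries are not guaranteed to be strings; import hooks and test tools sometimes insert pathlib.Path objects, and a substring test against a Path raises TypeError. A small demonstration:

    from pathlib import Path

    p = Path("/home/user/nltk")
    # "nltk" not in p  would raise TypeError: argument of type 'PosixPath' is not iterable
    print("nltk" not in str(p))  # False, so this entry is correctly filtered out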
2 changes: 1 addition & 1 deletion nltk/test/corpus.doctest
@@ -94,7 +94,7 @@ If the reader methods are called without any arguments, they will
 typically load all documents in the corpus.
 
     >>> len(inaugural.words())
-    149797
+    152901
 
 If a corpus contains a README file, it can be accessed with a ``readme()`` method:
 
2 changes: 1 addition & 1 deletion nltk/test/portuguese_en.doctest
@@ -7,7 +7,7 @@ Examples for Portuguese Processing
 
 This HOWTO contains a variety of examples relating to the Portuguese language.
 It is intended to be read in conjunction with the NLTK book
-(``https://www.nltk.org/book``). For instructions on running the Python
+(``https://www.nltk.org/book/``). For instructions on running the Python
 interpreter, please see the section *Getting Started with Python*, in Chapter 1.
 
 --------------------------------------------
4 changes: 2 additions & 2 deletions nltk/tree/prettyprinter.py
@@ -75,7 +75,7 @@ def __init__(self, tree, sentence=None, highlight=()):
             leaves = tree.leaves()
             if (
                 leaves
-                and not any(len(a) == 0 for a in tree.subtrees())
+                and all(len(a) > 0 for a in tree.subtrees())
                 and all(isinstance(a, int) for a in leaves)
             ):
                 sentence = [str(a) for a in leaves]
@@ -291,7 +291,7 @@ def dumpmatrix():
                 matrix[rowidx][i] = ids[m]
                 nodes[ids[m]] = tree[m]
                 # add column to the set of children for its parent
-                if m != ():
+                if len(m) > 0:
                     childcols[m[:-1]].add((rowidx, i))
         assert len(positions) == 0
 
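Both rewrites are behavior-preserving: tree positions are tuples of child indices, with the root at the empty tuple, so `len(m) > 0` and `m != ()` test the same thing. For example:

    from nltk.tree import Tree

    t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    # Positions are tuples; the root is (), so len(m) > 0 means "not the root":
    print(t.treepositions()[:4])  # [(), (0,), (0, 0), (1,)]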
2 changes: 1 addition & 1 deletion web/index.rst
@@ -15,7 +15,7 @@ NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free,
 NLTK has been called "a wonderful tool for teaching, and working in, computational linguistics using Python,"
 and "an amazing library to play with natural language."
 
-`Natural Language Processing with Python <https://www.nltk.org/book>`_ provides a practical
+`Natural Language Processing with Python <https://www.nltk.org/book/>`_ provides a practical
 introduction to programming for language processing.
 Written by the creators of NLTK, it guides the reader through the fundamentals
 of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure,
2 changes: 1 addition & 1 deletion web/news.rst
@@ -172,7 +172,7 @@ NLTK 3.0.0b2 released: August 2014
     Minor bugfixes and clean-ups.
 
 NLTK Book Updates: July 2014
-    The NLTK book is being updated for Python 3 and NLTK 3 `here <https://www.nltk.org/book>`__.
+    The NLTK book is being updated for Python 3 and NLTK 3 `here <https://www.nltk.org/book/>`__.
     The original Python 2 edition is still available `here <https://www.nltk.org/book_1ed>`__.
 
 NLTK 3.0.0b1 released: July 2014
