Skip to content

Commit

Permalink
Use Multilingual Wordnet Data from OMW with newer Wordnet versions (#2889)
Browse files Browse the repository at this point in the history

* Map Wordnet 3.0 to newer Wordnets for OMW compatibility

* Use Multilingual Wordnets with Wordnet 3.1

* Add support for Wordnet 2021

* Use max instead of sorted
  • Loading branch information
ekaf committed Nov 29, 2021
1 parent 43187f2 commit f50b6b1
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 6 deletions.
13 changes: 10 additions & 3 deletions nltk/corpus/__init__.py
Expand Up @@ -361,9 +361,16 @@
WordNetCorpusReader,
LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None)
# wordnet2021 is scheduled for release in 2021 :)
# wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None)
# Wordnet 3.1 and 2021 corpora, each paired with the Open Multilingual
# Wordnet (OMW) data so multilingual lookups work on newer Wordnet versions.
wordnet31 = LazyCorpusLoader(
    "wordnet31", WordNetCorpusReader,
    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet2021 = LazyCorpusLoader(
    "wordnet2021", WordNetCorpusReader,
    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
words = LazyCorpusLoader(
"words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
Expand Down
62 changes: 59 additions & 3 deletions nltk/corpus/reader/wordnet.py
Expand Up @@ -31,6 +31,7 @@
"""

import math
import os
import re
import warnings
from collections import defaultdict, deque
Expand Down Expand Up @@ -1175,6 +1176,55 @@ def __init__(self, root, omw_reader):
# load the exception file data into memory
self._load_exception_map()

# map from WordNet 3.0 for OMW data
self.map30 = self.map_wn30()

def corpus2sk(self, corpus=None):
    """Return a mapping from sense keys to synset ids, read from the
    ``index.sense`` file of a Wordnet corpus directory.

    :param corpus: name of a sibling corpus directory (e.g. ``"wordnet"``)
        whose ``index.sense`` should be read; when None, the index of the
        currently loaded Wordnet corpus is used.
    :return: dict mapping each sense key to an ``"<offset>-<pos>"`` string.
    """
    fn = "index.sense"
    if corpus:
        # Sibling corpus lives next to this one: ../<corpus>/index.sense
        fn = os.path.join(os.pardir, corpus, fn)
    sk_map = {}
    # "with" guarantees the stream is closed even if a line fails to parse
    # (the original closed it only on the success path).
    with self.open(fn) as fp:
        for line in fp:
            items = line.strip().split(" ")
            sk = items[0]
            # ss_type digit sits between "%" and the first ":" of the key,
            # e.g. "abandon%1:04:00::" -> 1 -> "n".
            pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
            sk_map[sk] = f"{items[1]}-{pos}"
    return sk_map

def map_wn30(self):
    """Return a mapping from Wordnet 3.0 synset ids to the ids of the
    currently loaded Wordnet version, or None when 3.0 is already loaded.

    The mapping is built by joining the two ``index.sense`` files on their
    shared sense keys: a 3.0 synset maps to whichever current-version
    synset covers the most of its lemmas.
    """
    if self.get_version() == "3.0":
        # Identity case: OMW data is already keyed on 3.0 offsets.
        return None
    sk1 = self.corpus2sk("wordnet")  # Wordnet 3.0 sense keys
    sk2 = self.corpus2sk()           # current-version sense keys

    # Group, for each 3.0 synset, the current-version synsets its
    # sense keys land in (one entry per shared lemma).
    skmap = defaultdict(list)
    for sk in sk1.keys() & sk2.keys():
        skmap[sk1[sk]].append(sk2[sk])

    map30 = {}
    for of1, candidates in skmap.items():
        # Map to the candidate that covers the most lemmas; ties break on
        # the lexicographically largest id, matching max() over (count, id).
        of2 = max((candidates.count(x), x) for x in set(candidates))[1]
        map30[of1] = of2
        if of1.endswith("s"):
            # Add a mapping from "a" for applications like omw,
            # which don't use the "s" ss_type.
            map30[of1[:-1] + "a"] = of2
    return map30

# Open Multilingual WordNet functions, contributed by
# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn

Expand Down Expand Up @@ -1205,7 +1255,6 @@ def _load_lang_data(self, lang):

def langs(self):
"""return a list of languages supported by Multilingual Wordnet"""
import os

langs = ["eng"]
fileids = self._omw_reader.fileids()
Expand Down Expand Up @@ -1337,7 +1386,6 @@ def lemma_from_key(self, key):
raise WordNetError("No synset found for key %r" % key)
offset = int(synset_line.split()[1])
synset = self.synset_from_pos_and_offset(pos, offset)

# return the corresponding lemma
for lemma in synset._lemmas:
if lemma._key == key:
def synset_from_sense_key(self, sense_key):
    """Retrieve the synset for the given sense key.

    :param sense_key: a sense key, e.g. ``"driving%1:04:03::"``
    :return: the Synset that the key's lemma belongs to.

    >>> print(wn.synset_from_sense_key("driving%1:04:03::"))
    Synset('drive.n.06')
    """
    # Must go through self.lemma_from_key with the actual parameter name:
    # the bare call `lemma_from_key(self, key)` raised NameError (`key` and
    # the free function `lemma_from_key` are both undefined here).
    return self.lemma_from_key(sense_key).synset()

#############################################################
# Retrieve synsets and lemmas.
Expand Down Expand Up @@ -2051,6 +2099,14 @@ def custom_lemmas(self, tab_file, lang):
if not line.startswith("#"):
offset_pos, lemma_type, lemma = line.strip().split("\t")
lemma = lemma.strip().replace(" ", "_")
if self.map30:
if offset_pos in self.map30.keys():
# Map offset_pos to current Wordnet version:
offset_pos = self.map30[offset_pos]
else:
# Synsets with no mapping keep their Wordnet 3.0 offset
# warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
pass
self._lang_data[lang][0][offset_pos].append(lemma)
self._lang_data[lang][1][lemma.lower()].append(offset_pos)
# Make sure no more entries are accidentally added subsequently
Expand Down

0 comments on commit f50b6b1

Please sign in to comment.