Support OMW 1.4 #2899

Merged: 13 commits, Dec 8, 2021
205 changes: 128 additions & 77 deletions nltk/corpus/reader/wordnet.py
@@ -443,11 +443,29 @@ def name(self):
def frame_ids(self):
return self._frame_ids

def definition(self):
return self._definition
def _doc(self, doc_type, default, lang="eng"):
"""Helper method for Synset.definition and Synset.examples"""
corpus = self._wordnet_corpus_reader
if lang not in corpus.langs():
return None
elif lang == "eng":
return default
else:
corpus._load_lang_data(lang)
of = corpus.ss2of(self)
i = corpus.lg_attrs.index(doc_type)
if of in corpus._lang_data[lang][i]:
return corpus._lang_data[lang][i][of]
else:
return None

def definition(self, lang="eng"):
"""Return definition in specified language"""
return self._doc("def", self._definition, lang=lang)

def examples(self):
return self._examples
def examples(self, lang="eng"):
"""Return examples in specified language"""
return self._doc("exe", self._examples, lang=lang)

def lexname(self):
return self._lexname
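A minimal usage sketch for the new lang parameter (assuming an installed OMW 1.4 corpus; the localized strings depend on the language resource and may be None when no localized text exists for a synset):

>>> from nltk.corpus import wordnet as wn
>>> # The English definition (the default) comes from Wordnet itself:
>>> d_eng = wn.synset('dog.n.01').definition()
>>> # A localized definition, if the 'ita' resource provides one for this
>>> # synset; the _doc() helper returns None when the attribute is missing:
>>> d_ita = wn.synset('dog.n.01').definition(lang='ita')
>>> ex_jpn = wn.synset('dog.n.01').examples(lang='jpn')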
@@ -1132,10 +1150,6 @@ def __init__(self, root, omw_reader):
Construct a new wordnet corpus reader, with the given root
directory.
"""
if omw_reader is None:
warnings.warn(
"The multilingual functions are not available with this Wordnet version"
)

super().__init__(root, self._FILES, encoding=self._ENCODING)

@@ -1154,6 +1168,13 @@ def __init__(self, root, omw_reader):
# Corpus reader containing omw data.
self._omw_reader = omw_reader

if self._omw_reader is None:
warnings.warn(
"The multilingual functions are not available with this Wordnet version"
)
else:
self.provenances = self.omw_prov()
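A sketch of the degraded single-language mode (constructing the reader directly with the standard wordnet corpus; normally the nltk.corpus loader supplies omw_reader):

>>> import warnings
>>> import nltk.data
>>> from nltk.corpus.reader.wordnet import WordNetCorpusReader
>>> root = nltk.data.find('corpora/wordnet')
>>> with warnings.catch_warnings(record=True) as caught:
...     warnings.simplefilter("always")
...     eng_only = WordNetCorpusReader(root, None)
>>> any('multilingual' in str(w.message) for w in caught)
True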

# A cache to store the wordnet data of multiple languages
self._lang_data = defaultdict(list)

@@ -1179,20 +1200,22 @@ def __init__(self, root, omw_reader):
# map from WordNet 3.0 for OMW data
self.map30 = self.map_wn30()

# Language data attributes
self.lg_attrs = ["lemma", "none", "def", "exe"]

def corpus2sk(self, corpus=None):
"""Read sense key to synset id mapping,
from index.sense file in corpus directory"""
fn = "index.sense"
if corpus:
fn = os.path.join(os.pardir, corpus, fn)
fp = self.open(fn)
sk_map = {}
for line in fp:
items = line.strip().split(" ")
sk = items[0]
pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
sk_map[sk] = f"{items[1]}-{pos}"
fp.close()
with self.open(fn) as fp:
sk_map = {}
for line in fp:
items = line.strip().split(" ")
sk = items[0]
pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
sk_map[sk] = f"{items[1]}-{pos}"
return sk_map
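For orientation, a hedged sketch of how one index.sense row (illustrative values) becomes a sense-key-to-offset entry in the returned map:

>>> line = 'dog%1:05:00:: 02084071 1 42\n'  # sense_key offset sense_number tag_cnt
>>> items = line.strip().split(' ')
>>> items[0].split('%')[1].split(':')[0]  # synset-type digit: 1 == noun
'1'
>>> f"{items[1]}-n"  # the value stored under the sense key
'02084071-n'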

def map_wn30(self):
@@ -1250,20 +1273,31 @@ def _load_lang_data(self, lang):
if lang not in self.langs():
raise WordNetError("Language is not supported.")

with self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang)) as fp:
with self._omw_reader.open(
f"{self.provenances[lang]}/wn-data-{lang.split('_')[0]}.tab"
) as fp:
self.custom_lemmas(fp, lang)

def langs(self):
"""return a list of languages supported by Multilingual Wordnet"""

langs = ["eng"]
def omw_prov(self):
"""Return a provenance dictionary of the languages in Multilingual Wordnet"""
provdict = {}
provdict["eng"] = ""
fileids = self._omw_reader.fileids()
for fileid in fileids:
file_name, file_extension = os.path.splitext(fileid)
prov, langfile = os.path.split(fileid)
file_name, file_extension = os.path.splitext(langfile)
if file_extension == ".tab":
langs.append(file_name.split("-")[-1])
lang = file_name.split("-")[-1]
if lang in provdict:
# We already have another resource for this lang,
# so we need to further specify the lang id:
lang = f"{lang}_{prov}"
provdict[lang] = prov
return provdict

return langs
def langs(self):
"""return a list of languages supported by Multilingual Wordnet"""
return list(self.provenances)
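With OMW 1.4, a provenance suffix disambiguates languages served by more than one wordnet project; a quick check (grounded in the updated doctest below):

>>> from nltk.corpus import wordnet as wn
>>> # A second Italian resource is distinguished by its provenance suffix:
>>> 'ita' in wn.langs() and 'ita_iwn' in wn.langs()
True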

def _load_lemma_pos_offset_map(self):
for suffix in self._FILEMAP.values():
@@ -1732,11 +1766,32 @@ def all_lemma_names(self, pos=None, lang="eng"):
lemma = iter(set(lemma))
return lemma

def all_synsets(self, pos=None):
def all_omw_synsets(self, pos=None, lang=None):
if lang not in self.langs():
return None
self._load_lang_data(lang)
for of in self._lang_data[lang][0]:
try:
ss = self.of2ss(of)
yield ss
except Exception:
# A few OMW offsets don't exist in Wordnet 3.0.
# Additionally, when mapped to later Wordnets,
# increasing numbers of synsets are lost in the mapping.
# warnings.warn(f"Language {lang}: no synset found for {of}")
pass

def all_synsets(self, pos=None, lang="eng"):
"""Iterate over all synsets with a given part of speech tag.
If no pos is specified, all synsets for all parts of speech
will be loaded.
"""
if lang == "eng":
return self.all_eng_synsets(pos=pos)
else:
return self.all_omw_synsets(pos=pos, lang=lang)
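A short sketch of iterating another language's synsets (the generator above silently skips OMW offsets absent from the installed Wordnet):

>>> from itertools import islice
>>> from nltk.corpus import wordnet as wn
>>> first_three = list(islice(wn.all_synsets(lang='jpn'), 3))
>>> len(first_three)
3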

def all_eng_synsets(self, pos=None):
if pos is None:
pos_tags = self._FILEMAP.keys()
else:
@@ -1794,59 +1849,38 @@ def words(self, lang="eng"):
"""return lemmas of the given language as list of words"""
return self.all_lemma_names(lang=lang)

def doc(self, file="README", lang="eng"):
"""Return the contents of readme, license or citation file
use lang=lang to get the file for an individual language"""
if lang == "eng":
reader = self
else:
reader = self._omw_reader
if lang in self.langs():
file = f"{os.path.join(self.provenances[lang],file)}"
try:
with reader.open(file) as fp:
return fp.read()
except Exception:
if lang in self._lang_data:
return f"Cannot determine {file} for {lang}"
else:
return f"Language {lang} is not supported."

def license(self, lang="eng"):
"""Return the contents of LICENSE (for omw)
use lang=lang to get the license for an individual language"""
if lang == "eng":
with self.open("LICENSE") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/LICENSE") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("LICENSE") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("Cannot determine license for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="LICENSE", lang=lang)

def readme(self, lang="omw"):
def readme(self, lang="eng"):
"""Return the contents of README (for omw)
use lang=lang to get the readme for an individual language"""
if lang == "eng":
with self.open("README") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/README") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("README") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("No README for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="README", lang=lang)

def citation(self, lang="omw"):
def citation(self, lang="eng"):
"""Return the contents of citation.bib file (for omw)
use lang=lang to get the citation for an individual language"""
if lang == "eng":
with self.open("citation.bib") as fp:
return fp.read()
elif lang in self.langs():
with self._omw_reader.open(f"{lang}/citation.bib") as fp:
return fp.read()
elif lang == "omw":
# under the assumption you don't mean Omwunra-Toqura
with self._omw_reader.open("citation.bib") as fp:
return fp.read()
elif lang in self._lang_data:
raise WordNetError("citation not known for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
return self.doc(file="citation.bib", lang=lang)

#############################################################
# Misc
@@ -2088,17 +2122,26 @@ def custom_lemmas(self, tab_file, lang):
:type lang: str
:param lang: ISO 639-3 code of the language of the tab file
"""
if len(lang) != 3:
lg = lang.split("_")[0]
if len(lg) != 3:
raise ValueError("lang should be a (3 character) ISO 639-3 code")
self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
self._lang_data[lang] = [
defaultdict(list),
defaultdict(list),
defaultdict(list),
defaultdict(list),
]
for line in tab_file.readlines():
if isinstance(line, bytes):
# Support byte-stream files (e.g. as returned by Python 2's
# open() function) as well as text-stream ones
line = line.decode("utf-8")
if not line.startswith("#"):
offset_pos, lemma_type, lemma = line.strip().split("\t")
lemma = lemma.strip().replace(" ", "_")
triple = line.strip().split("\t")
if len(triple) < 3:
continue
offset_pos, label = triple[:2]
val = triple[-1]
if self.map30:
if offset_pos in self.map30:
# Map offset_pos to current Wordnet version:
@@ -2107,11 +2150,19 @@ def custom_lemmas(self, tab_file, lang):
# Synsets with no mapping keep their Wordnet 3.0 offset
# warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
pass
self._lang_data[lang][0][offset_pos].append(lemma)
self._lang_data[lang][1][lemma.lower()].append(offset_pos)
pair = label.split(":")
attr = pair[-1]
if len(pair) == 1 or pair[0] == lg:
if attr == "lemma":
val = val.strip().replace(" ", "_")
self._lang_data[lang][1][val.lower()].append(offset_pos)
if attr in self.lg_attrs:
self._lang_data[lang][self.lg_attrs.index(attr)][
offset_pos
].append(val)
# Make sure no more entries are accidentally added subsequently
self._lang_data[lang][0].default_factory = None
self._lang_data[lang][1].default_factory = None
for n in range(len(self.lg_attrs)):
self._lang_data[lang][n].default_factory = None
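For reference, a hedged sketch of the extended tab format parsed above; the language code 'qaa' (ISO 639 private-use range) and both rows are hypothetical, following the offset-pos / lang:attribute / value layout:

>>> from io import StringIO
>>> from nltk.corpus import wordnet as wn
>>> rows = StringIO(
...     '# comment lines and malformed lines are skipped\n'
...     '02084071-n\tqaa:lemma\tcane\n'
...     '02084071-n\tqaa:def\tun mammifero domestico\n'
... )
>>> wn.custom_lemmas(rows, lang='qaa')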

######################################################################
# Visualize WordNet relation graphs using Graphviz
17 changes: 9 additions & 8 deletions nltk/test/wordnet.doctest
@@ -48,19 +48,20 @@ The WordNet corpus reader gives access to the Open Multilingual
WordNet, using ISO-639 language codes.

>>> sorted(wn.langs())
['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fas',
'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno',
'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus',
'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn',
'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk',
'slv', 'spa', 'swe', 'tha', 'zsm']
>>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
[Synset('dog.n.01'), Synset('spy.n.01')]

>>> wn.synset('spy.n.01').lemma_names('jpn')
['いぬ', 'スパイ', '回者', '回し者', '密偵', '工作員',
'廻者', '廻し者', '探', '探り', '犬', '秘密捜査員',
'まわし者', '諜報員', '諜者', '間者', '間諜', '隠密']
['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵',
'工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員',
'諜報員', '諜者', '間者', '間諜', '隠密']

>>> wn.synset('dog.n.01').lemma_names('ita')
['cane', 'Canis_familiaris']
['Canis_familiaris', 'cane']
>>> wn.lemmas('cane', lang='ita')
[Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
Lemma('incompetent.n.01.cane')]
Expand All @@ -77,7 +78,7 @@ WordNet, using ISO-639 language codes.
>>> dog_lemma.lang()
'por'
>>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
64797
66031

-------
Synsets