Skip to content

Commit

Permalink
Merge pull request #3126 from ekaf/hotfix-3125
Browse files Browse the repository at this point in the history
Avoid duplicate merged OMW synsets and lemmas
  • Loading branch information
stevenbird committed Dec 17, 2023
2 parents 796b03b + a3a7e53 commit b718276
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions nltk/corpus/reader/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,7 @@ def __init__(self, root, omw_reader):
self.map30 = self.map_wn()

# Language data attributes
- self.lg_attrs = ["lemma", "none", "def", "exe"]
+ self.lg_attrs = ["lemma", "of", "def", "exe"]

def index_sense(self, version=None):
"""Read sense key to synset id mapping from index.sense file in corpus directory"""
Expand Down Expand Up @@ -1251,7 +1251,7 @@ def map_to_many(self, version="wordnet"):
return synset_to_many

def map_to_one(self, version="wordnet"):
- self.nomap[version] = []
+ self.nomap[version] = set()
self.splits[version] = {}
synset_to_many = self.map_to_many(version)
synset_to_one = {}
Expand All @@ -1273,7 +1273,7 @@ def map_to_one(self, version="wordnet"):
# where only Lithuanian and Slovak use the "s" ss_type.
synset_to_one[f"{source[:-1]}a"] = target
else:
- self.nomap[version].append(source)
+ self.nomap[version].add(source)
return synset_to_one

def map_wn(self, version="wordnet"):
Expand All @@ -1294,7 +1294,9 @@ def merged_synsets(self, version="wordnet"):
for source, targets in self.map_to_many(version).items():
for target in targets:
merge[target].add(source)
- self.merges[version] = {s: t for s, t in merge.items() if len(t) > 1}
+ self.merges[version] = {
+ trg: src for trg, src in merge.items() if len(src) > 1
+ }
return self.merges[version]

# Open Multilingual WordNet functions, contributed by
Expand Down Expand Up @@ -2230,8 +2232,9 @@ def custom_lemmas(self, tab_file, lang):
else:
# Some OMW offsets were never in Wordnet:
if (
- offset_pos not in self.nomap
- and offset_pos.replace("a", "s") not in self.nomap
+ offset_pos not in self.nomap["wordnet"]
+ and offset_pos.replace("a", "s")
+ not in self.nomap["wordnet"]
):
warnings.warn(
f"{lang}: invalid offset {offset_pos} in '{line}'"
Expand All @@ -2247,11 +2250,15 @@ def custom_lemmas(self, tab_file, lang):
if len(pair) == 1 or pair[0] == lg:
if attr == "lemma":
val = val.strip().replace(" ", "_")
- self._lang_data[lang][1][val.lower()].append(offset_pos)
+ lang_offsets = self._lang_data[lang][1][val.lower()]
+ if offset_pos not in lang_offsets:
+ lang_offsets.append(offset_pos)
if attr in self.lg_attrs:
- self._lang_data[lang][self.lg_attrs.index(attr)][
+ lang_lemmas = self._lang_data[lang][self.lg_attrs.index(attr)][
offset_pos
- ].append(val)
+ ]
+ if val not in lang_lemmas:
+ lang_lemmas.append(val)

def disable_custom_lemmas(self, lang):
"""prevent synsets from being mistakenly added"""
Expand Down

0 comments on commit b718276

Please sign in to comment.