Change extended_omw reader w.r.t. review, add pre-commit
ExplorerFreda committed Dec 24, 2021
1 parent eaf840a commit 1057d66
Showing 3 changed files with 14 additions and 19 deletions.
nltk/corpus/__init__.py (4 additions, 5 deletions)
@@ -128,13 +128,12 @@
     "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
 )
 extended_omw = LazyCorpusLoader(
-    'wordnet', ExtendedOpenMultilingualWordNetCorpusReader,
+    "wordnet",
+    ExtendedOpenMultilingualWordNetCorpusReader,
+    LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"),
     LazyCorpusLoader(
-        'omw-1.4', CorpusReader, r'.*/wn-[a-z\-]*\.tab', encoding='utf8'
+        "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
     ),
-    LazyCorpusLoader(
-        'extended_omw', CorpusReader, r'.*/wn-[a-z\-]*\.tab', encoding='utf8'
-    )
 )
 floresta = LazyCorpusLoader(
     "floresta",
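For reference, a minimal usage sketch of the loader defined above. It assumes the wordnet, omw-1.4, and extended_omw data packages are installed (the "extended_omw" download id is an assumption mirroring the LazyCorpusLoader arguments, not a confirmed NLTK data package) and that ExtendedOpenMultilingualWordNetCorpusReader keeps the query API it inherits from WordNetCorpusReader:

import nltk

# "wordnet" and "omw-1.4" are standard NLTK data packages; "extended_omw" is a
# hypothetical id taken from the LazyCorpusLoader arguments above.
for pkg in ("wordnet", "omw-1.4", "extended_omw"):
    nltk.download(pkg)

from nltk.corpus import extended_omw

print(extended_omw.langs())             # languages discovered from the .tab files
dog = extended_omw.synset("dog.n.01")
print(dog.lemma_names(lang="fra"))      # lemmas mapped to this synset in French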
nltk/corpus/reader/__init__.py (1 addition, 1 deletion)
@@ -182,5 +182,5 @@
     "UnicharsCorpusReader",
     "MWAPPDBCorpusReader",
     "PanlexSwadeshCorpusReader",
-    "ExtendedOpenMultilingualWordNetCorpusReader"
+    "ExtendedOpenMultilingualWordNetCorpusReader",
 ]
nltk/corpus/reader/extended_omw.py (9 additions, 13 deletions)
@@ -1,4 +1,4 @@
-# Natural Language Toolkit: Extened Open Multilingual WordNet Reader
+# Natural Language Toolkit: Extended Open Multilingual WordNet Reader
 #
 # Copyright (C) 2001-2021 NLTK Project
 # Author: Freda Shi <freda@ttic.edu>
@@ -10,11 +10,11 @@
 An NLTK interface for Extended Open Multilingual WordNet
 
 Extended Open Multilingual WordNet automatically maps WordNet synsets
-to multiple languages with data extracted from Wiktionary and CLDR.
+to multiple languages with data extracted from Wiktionary and CLDR.
 
-Currently ignoring all languages marked with "*" in the released corpus.
-All synsets, whether manually annotated or automatically extracted, are
-treated equally.
+Currently ignoring all languages marked with "*" in the released corpus.
+All synsets, whether manually annotated or automatically extracted, are
+treated equally.
 
 For details about WordNet, see:
 https://wordnet.princeton.edu/
@@ -28,9 +28,7 @@
 
 import os
 from collections import defaultdict
-from IPython.core.pylabtools import configure_inline_support
 
-from traitlets.traitlets import default
 from nltk.corpus.reader.wordnet import WordNetCorpusReader
 
 
@@ -64,7 +62,7 @@ def exomw_prov(self):
             if file_extension == ".tab":
                 lang = file_name.split("-")[-1]
                 # only use wordnet English data -- should be examined.
-                if lang == "eng":
+                if lang == "eng":
                     continue
                 provdict[lang].append(prov)
         return provdict
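To make the grouping in exomw_prov concrete, here is a small standalone sketch of the same logic. The file names are hypothetical but follow the wn-<prov>-<lang>.tab pattern used by _load_lang_data below; English files are skipped just as in the method:

import os
from collections import defaultdict

# Hypothetical provenance -> file layout, e.g. wikt/wn-wikt-fra.tab
files = {
    "wikt": ["wn-wikt-fra.tab", "wn-wikt-jpn.tab", "wn-wikt-eng.tab"],
    "cldr": ["wn-cldr-fra.tab"],
}

provdict = defaultdict(list)
for prov, names in files.items():
    for name in names:
        file_name, file_extension = os.path.splitext(name)
        if file_extension == ".tab":
            lang = file_name.split("-")[-1]
            if lang == "eng":  # skip English, as in exomw_prov
                continue
            provdict[lang].append(prov)

print(dict(provdict))  # {'fra': ['wikt', 'cldr'], 'jpn': ['wikt']}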
@@ -77,12 +75,10 @@ def _load_lang_data(self, lang):
         for prov in self.provenances[lang]:
             with self._omw_reader.open(f"{prov}/wn-data-{lang}.tab") as fp:
                 self.custom_lemmas(fp, lang)
-            fp.close()
-        # load extended open multilingual wordnet
+        # load extended open multilingual wordnet
         for prov in self.exomw_provenances[lang]:
             with self._exomw_reader.open(f"{prov}/wn-{prov}-{lang}.tab") as fp:
                 self.custom_lemmas(fp, lang)
-            fp.close()
         for index in range(len(self.lg_attrs)):
             for key in self._lang_data[lang][index]:
                 data_item = self._lang_data[lang][index][key]
@@ -96,7 +92,7 @@ def langs(self):
 
     def custom_lemmas(self, tab_file, lang):
         """
-        Adapted from Open Multilingual WordNet Loader.
+        Adapted from Open Multilingual WordNet Loader.
 
         :param tab_file: Tab file as a file or file-like object
         :type lang: str
@@ -136,6 +132,6 @@ def custom_lemmas(self, tab_file, lang):
                     self._lang_data[lang][index][offset_pos].add(val)
 
     def disable_custom_lemmas(self, lang):
-        """prevent synsets from mistakenly added"""
+        """prevent synsets from being mistakenly added"""
         for n in range(len(self.lg_attrs)):
             self._lang_data[lang][n].default_factory = None
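custom_lemmas consumes OMW-style tab files. The exact columns in the extended release may differ, but as a rough standalone sketch of the parsing (not the reader's actual implementation), each non-comment line maps an offset-pos synset key plus a lang:attribute label to a value:

import io

# Hypothetical OMW-style content: offset-pos <TAB> lang:attr <TAB> value
sample = io.StringIO(
    "# exomw-fra: license header\n"
    "02084071-n\tfra:lemma\tchien\n"
    "02084071-n\tfra:lemma\ttoutou\n"
)

lemmas = {}
for line in sample:
    if line.startswith("#"):
        continue
    offset_pos, label, val = line.rstrip("\n").split("\t")
    lang, attr = label.split(":")
    if attr == "lemma":
        lemmas.setdefault(offset_pos, set()).add(val)

print(lemmas)  # {'02084071-n': {'chien', 'toutou'}}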
