From a9b04078eba245e50121cf4bde0be6654f203142 Mon Sep 17 00:00:00 2001 From: Eric Kafe Date: Tue, 19 Oct 2021 11:33:25 +0200 Subject: [PATCH 1/5] Load alternative Wordnet versions --- nltk/corpus/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py index e951c6028c..d2631e712b 100644 --- a/nltk/corpus/__init__.py +++ b/nltk/corpus/__init__.py @@ -361,6 +361,16 @@ WordNetCorpusReader, LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) +wordnet31 = LazyCorpusLoader( + "wordnet31", + WordNetCorpusReader, + None +) +wordnet2021 = LazyCorpusLoader( + "wordnet2021", + WordNetCorpusReader, + None +) wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat") words = LazyCorpusLoader( "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" From 5c8ede794de925d3e5e05dfbb069a2a6ad1f8fbb Mon Sep 17 00:00:00 2001 From: Eric Kafe Date: Tue, 19 Oct 2021 11:35:10 +0200 Subject: [PATCH 2/5] Support alternative Wordnet version names --- nltk/corpus/reader/wordnet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index be12d2852f..d5a6804990 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -32,6 +32,7 @@ import math import re +import warnings from collections import defaultdict, deque from functools import total_ordering from itertools import chain, islice @@ -1130,6 +1131,9 @@ def __init__(self, root, omw_reader): Construct a new wordnet corpus reader, with the given root directory. """ + if omw_reader is None: + warnings.warn("The multilingual functions are not available with this Wordnet version") + super().__init__(root, self._FILES, encoding=self._ENCODING) # A index that provides the file offset @@ -1288,7 +1292,7 @@ def _compute_max_depth(self, pos, simulate_root): def get_version(self): fh = self._data_file(ADJ) for line in fh: - match = re.search(r"WordNet (\d+\.\d+) Copyright", line) + match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line) if match is not None: version = match.group(1) fh.seek(0) From d11bbd96ec31c115a494aa054a09094d1c82aa19 Mon Sep 17 00:00:00 2001 From: Eric Kafe Date: Tue, 19 Oct 2021 18:19:55 +0200 Subject: [PATCH 3/5] Installed pre-commit --- nltk/corpus/__init__.py | 12 ++---------- nltk/corpus/reader/wordnet.py | 4 +++- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py index d2631e712b..6ed4bb427e 100644 --- a/nltk/corpus/__init__.py +++ b/nltk/corpus/__init__.py @@ -361,16 +361,8 @@ WordNetCorpusReader, LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) -wordnet31 = LazyCorpusLoader( - "wordnet31", - WordNetCorpusReader, - None -) -wordnet2021 = LazyCorpusLoader( - "wordnet2021", - WordNetCorpusReader, - None -) +wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None) +wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None) wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat") words = LazyCorpusLoader( "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index d5a6804990..cac0b8c876 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -1132,7 +1132,9 @@ def __init__(self, root, omw_reader): directory. """ if omw_reader is None: - warnings.warn("The multilingual functions are not available with this Wordnet version") + warnings.warn( + "The multilingual functions are not available with this Wordnet version" + ) super().__init__(root, self._FILES, encoding=self._ENCODING) From 40261f690c60828c20ffff5802c9e50563f81871 Mon Sep 17 00:00:00 2001 From: Eric Kafe Date: Mon, 25 Oct 2021 12:20:25 +0200 Subject: [PATCH 4/5] Added doctests --- nltk/corpus/__init__.py | 3 ++- nltk/test/wordnet.doctest | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py index 6ed4bb427e..d348810a4f 100644 --- a/nltk/corpus/__init__.py +++ b/nltk/corpus/__init__.py @@ -362,7 +362,8 @@ LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), ) wordnet31 = LazyCorpusLoader("wordnet31", WordNetCorpusReader, None) -wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None) +# wordnet2021 is scheduled for release in 2021 :) +# wordnet2021 = LazyCorpusLoader("wordnet2021", WordNetCorpusReader, None) wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat") words = LazyCorpusLoader( "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" diff --git a/nltk/test/wordnet.doctest b/nltk/test/wordnet.doctest index e1ed56657b..642f19ab5b 100644 --- a/nltk/test/wordnet.doctest +++ b/nltk/test/wordnet.doctest @@ -758,6 +758,27 @@ classified.a.02): [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] +---------------------------------------------------------------- +Loading alternative Wordnet versions +---------------------------------------------------------------- + + >>> print("Wordnet {}".format(wn.get_version())) + Wordnet 3.0 + + >>> from nltk.corpus import wordnet31 as wn31 + >>> print("Wordnet {}".format(wn31.get_version())) + Wordnet 3.1 + + >>> print(wn.synset('restrain.v.01').hyponyms()) + [Synset('confine.v.03'), Synset('control.v.02'), Synset('hold.v.36'), Synset('inhibit.v.04')] + + >>> print(wn31.synset('restrain.v.01').hyponyms()) + [Synset('enchain.v.01'), Synset('fetter.v.01'), Synset('ground.v.02'), Synset('impound.v.02'), Synset('pen_up.v.01'), Synset('pinion.v.01'), Synset('pound.v.06'), Synset('tie_down.v.01')] + + >>> print(wn31.synset('restrain.v.04').hyponyms()) + [Synset('baffle.v.03'), Synset('confine.v.02'), Synset('control.v.02'), Synset('hold.v.36'), Synset('rule.v.07'), Synset('swallow.v.06'), Synset('wink.v.04')] + + ------------- Teardown test ------------- From 1c7263c4cb4b0fff47971b7b7b9de32aadc4faf4 Mon Sep 17 00:00:00 2001 From: Eric Kafe Date: Tue, 26 Oct 2021 10:22:51 +0200 Subject: [PATCH 5/5] Rewind data file in get_version --- nltk/corpus/reader/wordnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index a856774a43..fda24a4360 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -1293,6 +1293,7 @@ def _compute_max_depth(self, pos, simulate_root): def get_version(self): fh = self._data_file(ADJ) + fh.seek(0) for line in fh: match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line) if match is not None: