Refactor gensim.doctest to work for gensim 4.0.0 and up (#2914)
* Refactor gensim.doctest to work for gensim 4.0.0 and up

* Removed unused gensim dependency

* Removed gensim from extras_require optional dependencies
tomaarsen committed Dec 19, 2021
1 parent 59aa3fb commit 6b60213
Showing 4 changed files with 11 additions and 17 deletions.
24 changes: 10 additions & 14 deletions nltk/test/gensim.doctest
@@ -32,7 +32,7 @@ It might take some time to train the model. So, after it is trained, it can be s

The model will be the list of words with their embedding. We can easily get the vector representation of a word.

->>> len(new_model['university'])
+>>> len(new_model.wv['university'])
100

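Once each word maps to a fixed-length vector, similarity between words reduces to arithmetic on those vectors. A stdlib-only sketch (toy 3-dimensional vectors, not real embeddings) of cosine similarity, the measure behind Gensim's `most_similar`-style queries:

```python
import math

def cosine_similarity(u, v):
    # Cosine of the angle between two embedding vectors.
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

# Toy vectors standing in for model.wv['university'] etc.
university = [0.9, 0.1, 0.3]
college = [0.8, 0.2, 0.4]
banana = [0.1, 0.9, 0.0]

# Related words should score higher than unrelated ones.
print(cosine_similarity(university, college) > cosine_similarity(university, banana))
```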
There are some supporting functions already implemented in Gensim to manipulate word embeddings.
@@ -54,7 +54,7 @@ The full model is from https://code.google.com/p/word2vec/ (about 3 GB).

We pruned the model to only include the most common words (~44k words).

->>> len(model.vocab)
+>>> len(model)
43981

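In gensim 4.x the vocabulary lives directly on the `KeyedVectors` object: `len(model)` replaces `len(model.vocab)`, and `model.index_to_key` replaces iterating `model.vocab`. A minimal stand-in class (hypothetical, for illustration only) showing the surface this doctest relies on:

```python
class TinyKeyedVectors:
    """Hypothetical stand-in for the gensim 4 KeyedVectors surface used here."""

    def __init__(self, vectors):
        self._vectors = dict(vectors)
        # gensim 4: model.index_to_key replaces iterating model.vocab
        self.index_to_key = list(self._vectors)

    def __len__(self):
        # gensim 4: len(model) replaces len(model.vocab)
        return len(self.index_to_key)

    def __getitem__(self, word):
        return self._vectors[word]

kv = TinyKeyedVectors({"university": [0.9, 0.1], "college": [0.8, 0.2]})
print(len(kv), kv.index_to_key)
```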
Each word is represented in the space of 300 dimensions:
@@ -89,7 +89,7 @@ We can visualize the word embeddings using t-SNE (https://lvdmaaten.github.io/ts
| max_count = 1000
| X = np.zeros(shape=(max_count,len(model['university'])))
|
-| for term in model.vocab:
+| for term in model.index_to_key:
| X[count] = model[term]
| labels.append(term)
| count+= 1
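The loop above just stacks one embedding row per word until `max_count` rows are collected, which t-SNE then projects to 2-D. A self-contained sketch of the same bookkeeping, with a toy dict standing in for the model and plain lists instead of the NumPy array used in the doctest:

```python
# Toy stand-in for the word2vec model: term -> embedding vector.
toy_model = {
    "university": [0.9, 0.1, 0.3],
    "college": [0.8, 0.2, 0.4],
    "banana": [0.1, 0.9, 0.0],
}

max_count = 2   # cap on the number of rows, as in the doctest
labels = []
X = []          # rows of the matrix that t-SNE would consume

for term in toy_model:            # doctest (gensim 4): for term in model.index_to_key
    if len(X) >= max_count:
        break
    X.append(toy_model[term])     # doctest: X[count] = model[term]
    labels.append(term)

print(len(X), labels)
```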
@@ -123,23 +123,19 @@ Here is the supporting code to extract part of the binary model (GoogleNews-vect
We use this code to get the `word2vec_sample` model.

| import gensim
-| from gensim.models.word2vec import Word2Vec
| # Load the binary model
-| model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True);
+| model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
|
| # Only output word that appear in the Brown corpus
| from nltk.corpus import brown
| words = set(brown.words())
-| print (len(words))
+| print(len(words))
|
| # Output presented word to a temporary file
| out_file = 'pruned.word2vec.txt'
-| f = open(out_file,'wb')
-|
-| word_presented = words.intersection(model.vocab.keys())
-| f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
-|
-| for word in word_presented:
-|     f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
-|
-| f.close()
+| with open(out_file,'w') as f:
+|     word_presented = words.intersection(model.index_to_key)
+|     f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
+|     for word in word_presented:
+|         f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
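The pruning script emits the word2vec text format: a header line `<vocab_size> <dim>`, followed by one `<word> <v1> ... <vn>` line per word. A stdlib-only sketch of that serialization with hypothetical toy vectors, writing to an in-memory buffer instead of `pruned.word2vec.txt`:

```python
import io

# Toy embeddings standing in for the pruned GoogleNews vectors.
vectors = {
    "university": [0.1, 0.2, 0.3],
    "college": [0.4, 0.5, 0.6],
}
dim = 3

# word2vec text format: header "<vocab_size> <dim>", then one word per line.
buf = io.StringIO()
buf.write('{} {}\n'.format(len(vectors), dim))
for word, vec in vectors.items():
    buf.write('{} {}\n'.format(word, ' '.join(str(v) for v in vec)))

text = buf.getvalue()
print(text.splitlines()[0])  # the header line
```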
1 change: 0 additions & 1 deletion pip-req.txt
@@ -6,7 +6,6 @@ scipy>=0.13.2
matplotlib>=1.3.1
scikit-learn>=0.14.1
python-crfsuite>=0.8.2
-gensim>=0.11.1,<4.0.0
pyparsing>=2.0.3
twython>=3.2.0
regex>=2021.8.3
2 changes: 1 addition & 1 deletion requirements-ci.txt
@@ -1,4 +1,4 @@
-gensim<4.0.0
+gensim>=4.0.0
matplotlib
pytest
pytest-mock
1 change: 0 additions & 1 deletion setup.py
@@ -31,7 +31,6 @@
# Specify groups of optional dependencies
extras_require = {
"machine_learning": [
-    "gensim<4.0.0",
"numpy",
"python-crfsuite",
"scikit-learn",
