Skip to content
This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

update CI #1594

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 13 additions & 14 deletions docs/tutorials/word_embedding/word_embedding.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ To begin, let's first import a few packages that we'll need for this example:
import warnings
warnings.filterwarnings('ignore')

from mxnet import gluon, nd
from mxnet import gluon, np
import gluonnlp as nlp
import re
import collections
import numpy as np
import numpy as onp

```

Expand Down Expand Up @@ -160,7 +160,7 @@ For example,

```{.python .input}
def simple(words):
return np.ones((len(words), 300))
return onp.ones((len(words), 300))
matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple)
```

Expand Down Expand Up @@ -217,7 +217,7 @@ input_dim, output_dim = matrix.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
layer.weight.set_data(matrix)
layer(nd.array([5, 4]))[:, :5]
layer(np.array([5, 4]))[:, :5]
```

### Creating Vocabulary from Pre-trained Word Embeddings
Expand Down Expand Up @@ -257,18 +257,17 @@ To apply word embeddings, we need to define
cosine similarity. Cosine similarity measures how similar the directions of two vectors are, regardless of their magnitudes.

```{.python .input}
import numpy as np
def cos_sim(x, y):
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
return onp.dot(x, y) / (onp.linalg.norm(x) * onp.linalg.norm(y))
```

The cosine similarity between two vectors ranges from -1 to 1. The larger the
value, the more similar the two vectors are.

```{.python .input}
x = np.array([1, 2])
y = np.array([10, 20])
z = np.array([-1, -2])
x = onp.array([1, 2])
y = onp.array([10, 20])
z = onp.array([-1, -2])

print(cos_sim(x, y))
print(cos_sim(x, z))
Expand All @@ -287,16 +286,16 @@ We can then find the indices for which the dot product is greatest (`topk`), whi

```{.python .input}
def norm_vecs_by_row(x):
return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
return x / onp.sqrt(onp.sum(x * x, axis=1) + 1E-10).reshape((-1,1))

def topk(res, k):
part = np.argpartition(res, -k)[-k:]
return part[np.argsort(res[part])].tolist()[::-1]
part = onp.argpartition(res, -k)[-k:]
return part[onp.argsort(res[part])].tolist()[::-1]

def get_knn(vocab, matrix, k, word):
word_vec = matrix[vocab[word]].reshape((-1, 1))
vocab_vecs = norm_vecs_by_row(matrix)
dot_prod = np.dot(vocab_vecs, word_vec)
dot_prod = onp.dot(vocab_vecs, word_vec)
indices = topk(dot_prod.reshape((len(vocab), )), k=k+1)
# Remove unknown and input tokens.
return vocab.to_tokens(indices[1:])
Expand Down Expand Up @@ -351,7 +350,7 @@ def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3):
word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]]
word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
vocab_vecs = norm_vecs_by_row(matrix)
dot_prod = np.dot(vocab_vecs, word_diff)
dot_prod = onp.dot(vocab_vecs, word_diff)
indices = topk(dot_prod.reshape((len(vocab), )), k=k)
return vocab.to_tokens(indices)
```
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def find_version(*file_paths):
'contextvars',
'pyarrow',
'sentencepiece==0.1.95',
'protobuf',
'protobuf<=3.20.1',
'pandas',
'tokenizers==0.9.4',
'dataclasses;python_version<"3.7"', # Dataclass for python <= 3.6
Expand Down
1 change: 1 addition & 0 deletions tests/test_utils_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def test_download_s3(overwrite):
overwrite=overwrite)


@pytest.mark.skip("RuntimeError: Failed downloading url https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2014-41/cc-index.paths.gz")
@pytest.mark.remote_required
@pytest.mark.parametrize('overwrite', [False, True])
def test_download_https(overwrite):
Expand Down