dmlc · barry-jin · Apr 4, 2022 · Apr 5, 2022 · Apr 5, 2022 · Apr 6, 2022
@@ -48,7 +48,7 @@ jobs:
                                              --saved-output coverage.xml \
                                              --save-path coverage.xml \
                                              --remote https://github.com/${{ github.repository }} \
-                                             --command "python3 -m pip install pytest-forked && python3 -m pytest --forked --cov=. --cov-config=./.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ./tests/" \
+                                             --command "python3 -m pip install pytest-forked && python3 -m pytest -vv --forked --cov=. --cov-config=./.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ./tests/" \
                                              --wait | tee batch_job.log
 
 
@@ -64,7 +64,7 @@ jobs:
                                              --saved-output coverage.xml \
                                              --save-path coverage.xml \
                                              --remote https://github.com/${{ github.event.pull_request.head.repo.full_name }} \
-                                             --command "python3 -m pip install pytest-forked && python3 -m pytest --forked --cov=. --cov-config=./.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ./tests/" \
+                                             --command "python3 -m pip install pytest-forked && python3 -m pytest -vv --forked --cov=. --cov-config=./.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow ./tests/" \
                                              --wait | tee batch_job.log
 
       - name: Wait for job and copy files from AWS s3

@@ -64,7 +64,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install setuptools pytest pytest-cov contextvars
           python -m pip install --upgrade cython
-          python -m pip install --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python
+          python -m pip install mxnet==2.0.0b1
           python -m pip install -U -e .[extras,dev]
       - name: Build and Install TVM
         if: matrix.os == 'ubuntu-latest'

@@ -35,13 +35,13 @@ following commands:
 
 ```bash
 # Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a"
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b"
 
 # Install the version with CUDA 11
-python3 -m pip install -U --pre "mxnet-cu110>=2.0.0a"
+python3 -m pip install -U --pre "mxnet-cu110>=2.0.0b"
 
 # Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0a"
+python3 -m pip install -U --pre "mxnet>=2.0.0b"
 ```
 
 

@@ -231,5 +231,5 @@ def pytest_generate_tests(metafunc):
     devices = metafunc.config.option.device
     if not devices:
         devices = ['cpu']
-    if 'ctx' in metafunc.fixturenames:
-        metafunc.parametrize("ctx", [getattr(mx, device)() for device in devices])
+    if 'device' in metafunc.fixturenames:
+        metafunc.parametrize("device", [getattr(mx, device)() for device in devices])
@@ -57,7 +57,7 @@ Select your preferences and run the install command.
            .. code-block:: bash
 
               # Install Apache MXNet (incubating) 2 Alpha or newer.
-              python3 -m pip install -U --pre "mxnet>=2.0.0a"
+              python3 -m pip install -U --pre "mxnet>=2.0.0b"
 
               # Install GluonNLP
               git clone https://github.com/dmlc/gluon-nlp.git
@@ -71,7 +71,7 @@ Select your preferences and run the install command.
               # Install Apache MXNet (incubating) 2 Alpha or newer.
               # Here we assume CUDA 10.2 is installed. You can change the number
               # according to your own CUDA version, e.g., cu101, cu110
-              python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a"
+              python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b"
 
               # Install GluonNLP
               git clone https://github.com/dmlc/gluon-nlp.git
@@ -85,7 +85,7 @@ Select your preferences and run the install command.
            .. code-block:: bash
 
               # Install Apache MXNet (incubating) 2 Alpha or newer.
-              python3 -m pip install -U --pre "mxnet>=2.0.0a"
+              python3 -m pip install -U --pre "mxnet>=2.0.0b"
 
               # Install GluonNLP
               git clone https://github.com/dmlc/gluon-nlp.git
@@ -99,7 +99,7 @@ Select your preferences and run the install command.
               # Install Apache MXNet (incubating) 2 Alpha or newer.
               # Here we assume CUDA 10.2 is installed. You can change the number
               # according to your own CUDA version, e.g., cu100, cu101
-              python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a"
+              python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b"
 
               # Install GluonNLP
               git clone https://github.com/dmlc/gluon-nlp.git

@@ -33,11 +33,11 @@ To begin, let's first import a few packages that we'll need for this example:
 import warnings
 warnings.filterwarnings('ignore')
 
-from mxnet import gluon, nd
+from mxnet import gluon, np
 import gluonnlp as nlp
 import re
 import collections
-import numpy as np
+import numpy as onp
 
 ```
 
@@ -160,7 +160,7 @@ For example,
 
 ```{.python .input}
 def simple(words):
-    return np.ones((len(words), 300))
+    return onp.ones((len(words), 300))
 matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple)
 ```
 
@@ -217,7 +217,7 @@ input_dim, output_dim = matrix.shape
 layer = gluon.nn.Embedding(input_dim, output_dim)
 layer.initialize()
 layer.weight.set_data(matrix)
-layer(nd.array([5, 4]))[:, :5]
+layer(np.array([5, 4]))[:, :5]
 ```
 
 ### Creating Vocabulary from Pre-trained Word Embeddings
@@ -259,16 +259,16 @@ cosine similarity. Cosine similarity determines the similarity between two vecto
 ```{.python .input}
 import numpy as np
 def cos_sim(x, y):
-    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+    return onp.dot(x, y) / (onp.linalg.norm(x) * onp.linalg.norm(y))
 ```
 
 The range of cosine similarity between two vectors can be between -1 and 1. The
 larger the value, the larger the similarity between the two vectors.
 
 ```{.python .input}
-x = np.array([1, 2])
-y = np.array([10, 20])
-z = np.array([-1, -2])
+x = onp.array([1, 2])
+y = onp.array([10, 20])
+z = onp.array([-1, -2])
 
 print(cos_sim(x, y))
 print(cos_sim(x, z))
@@ -287,16 +287,16 @@ We can then find the indices for which the dot product is greatest (`topk`), whi
 
 ```{.python .input}
 def norm_vecs_by_row(x):
-    return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
+    return x / onp.sqrt(onp.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
 
 def topk(res, k):
-    part = np.argpartition(res, -k)[-k:]
-    return part[np.argsort(res[part])].tolist()[::-1]
+    part = onp.argpartition(res, -k)[-k:]
+    return part[onp.argsort(res[part])].tolist()[::-1]
 
 def get_knn(vocab, matrix, k, word):
     word_vec = matrix[vocab[word]].reshape((-1, 1))
     vocab_vecs = norm_vecs_by_row(matrix)
-    dot_prod = np.dot(vocab_vecs, word_vec)
+    dot_prod = onp.dot(vocab_vecs, word_vec)
     indices = topk(dot_prod.reshape((len(vocab), )), k=k+1)
     # Remove unknown and input tokens.
     return vocab.to_tokens(indices[1:])
@@ -351,7 +351,7 @@ def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3):
     word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]]
     word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
     vocab_vecs = norm_vecs_by_row(matrix)
-    dot_prod = np.dot(vocab_vecs, word_diff)
+    dot_prod = onp.dot(vocab_vecs, word_diff)
     indices = topk(dot_prod.reshape((len(vocab), )), k=k)
     return vocab.to_tokens(indices)
 ```

@@ -5,7 +5,6 @@
 from benchmark_utils import GluonNLPBackboneBenchmark
 import multiprocessing as mp
 from multiprocessing import Process
-mx.npx.set_np()
 
 
 MODELS = [

@@ -471,8 +471,8 @@ def traceit(frame, event, args):
         if log_gpu:
             # Clear GPU caches
             if is_mxnet_available():
-                for ctx in mx_all_contexts:
-                    ctx.empty_cache()
+                for device in mx_all_contexts:
+                    device.empty_cache()
             if is_torch_available():
                 torch_empty_cache()
             if is_tf_available():
@@ -665,10 +665,10 @@ def compile_tvm_graph_executor(model, model_name, layout, compute_layout,
     with tvm.transform.PassContext(opt_level=opt_level, required_pass=required_pass):
         lib = relay.build(mod, target, params=params)
     if use_gpu:
-        ctx = tvm.gpu()
+        device = tvm.gpu()
     else:
-        ctx = tvm.cpu()
-    rt = graph_executor.GraphModule(lib["default"](ctx))
+        device = tvm.cpu()
+    rt = graph_executor.GraphModule(lib["default"](device))
     _TVM_RT_CACHE[key] = rt
     return rt
 
@@ -767,9 +767,9 @@ def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_len
         else:
             dtype = 'float32'
         if self._use_gpu:
-            ctx = mxnet.gpu()
+            device = mxnet.gpu()
         else:
-            ctx = mxnet.cpu()
+            device = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
         cfg.defrost()
         cfg.MODEL.layout = self._layout
@@ -780,22 +780,22 @@ def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_len
             model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype)
         else:
             model = model_cls.from_cfg(cfg, dtype=dtype)
-        model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
+        model.load_parameters(backbone_param_path, device=device, cast_dtype=True)
         model.cast(dtype)
         model.hybridize(static_alloc=True, static_shape=True)
         vocab_size = cfg.MODEL.vocab_size
         if self._layout == 'NT':
             input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
-                                                dtype=np.int32, ctx=ctx)
-            token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+                                                dtype=np.int32, device=device)
+            token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, device=device)
             valid_length = mxnet.np.full((batch_size,), sequence_length,
-                                         dtype=np.int32, ctx=ctx)
+                                         dtype=np.int32, device=device)
         elif self._layout == 'TN':
             input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
-                                                dtype=np.int32, ctx=ctx)
-            token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+                                                dtype=np.int32, device=device)
+            token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, device=device)
             valid_length = mxnet.np.full((batch_size,), sequence_length,
-                                         dtype=np.int32, ctx=ctx)
+                                         dtype=np.int32, device=device)
         else:
             raise NotImplementedError
         mxnet.npx.waitall()
@@ -817,17 +817,17 @@ def run_forward():
             tvm = try_import_tvm()
             run_forward()
             if self._use_gpu:
-                ctx = tvm.gpu()
+                device = tvm.gpu()
             else:
-                ctx = tvm.cpu()
+                device = tvm.cpu()
             rt = compile_tvm_graph_executor(model=model, model_name=model_name,
                                            layout=self._layout, compute_layout=self._compute_layout,
                                            batch_size=batch_size, seq_length=sequence_length,
                                            instance_type=self._instance_type,
                                            dtype='float32' if not self._use_fp16 else 'float16')
-            tvm_input_ids = tvm.nd.array(input_ids.asnumpy(), ctx=ctx)
-            tvm_token_types = tvm.nd.array(token_types.asnumpy(), ctx=ctx)
-            tvm_valid_length = tvm.nd.array(valid_length.asnumpy(), ctx=ctx)
+            tvm_input_ids = tvm.nd.array(input_ids.asnumpy(), device=device)
+            tvm_token_types = tvm.nd.array(token_types.asnumpy(), device=device)
+            tvm_valid_length = tvm.nd.array(valid_length.asnumpy(), device=device)
 
             if 'roberta' in model_name or 'xlmr' in model_name:
                 rt.set_input(data0=tvm_input_ids, data1=tvm_valid_length)
@@ -837,7 +837,7 @@ def run_forward():
                 rt.set_input(data0=tvm_input_ids, data1=tvm_token_types,
                              data2=tvm_valid_length)
             # ftimer returns a ProfileResult
-            ftimer = rt.module.time_evaluator("run", ctx, number=3, repeat=self._repeat)
+            ftimer = rt.module.time_evaluator("run", device, number=3, repeat=self._repeat)
             runtimes = np.min(ftimer().results)
         else:
             timeit.repeat(run_forward, repeat=1, number=3)
@@ -867,9 +867,9 @@ def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length:
             amp.init()
 
         if self._use_gpu:
-            ctx = mxnet.gpu()
+            device = mxnet.gpu()
         else:
-            ctx = mxnet.cpu()
+            device = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
         cfg.defrost()
         cfg.MODEL.layout = self._layout
@@ -880,7 +880,7 @@ def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length:
             model = model_cls.from_cfg(cfg, extract_feature=True)
         else:
             model = model_cls.from_cfg(cfg)
-        model.load_parameters(backbone_param_path, ctx=ctx)
+        model.load_parameters(backbone_param_path, device=device)
         model.hybridize(static_alloc=True)
         vocab_size = cfg.MODEL.vocab_size
         if hasattr(cfg.MODEL, 'units'):
@@ -889,27 +889,27 @@ def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length:
             out_units = cfg.MODEL.DECODER.units
         if self._layout == 'NT':
             input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
-                                                dtype=np.int32, ctx=ctx)
-            token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+                                                dtype=np.int32, device=device)
+            token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, device=device)
             valid_length = mxnet.np.full((batch_size,), sequence_length,
-                                         dtype=np.int32, ctx=ctx)
+                                         dtype=np.int32, device=device)
             contextual_embedding_ograd = mxnet.np.random.normal(
                 0, 1, (batch_size, sequence_length, out_units),
-                dtype=np.float32, ctx=ctx)
+                dtype=np.float32, device=device)
             pooled_out_ograd = mxnet.np.random.normal(
-                0, 1, (batch_size, out_units), dtype=np.float32, ctx=ctx)
+                0, 1, (batch_size, out_units), dtype=np.float32, device=device)
         elif self._layout == 'TN':
             input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
-                                                dtype=np.int32, ctx=ctx)
-            token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+                                                dtype=np.int32, device=device)
+            token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, device=device)
             valid_length = mxnet.np.full((batch_size,), sequence_length,
-                                         dtype=np.int32, ctx=ctx)
+                                         dtype=np.int32, device=device)
             contextual_embedding_ograd = mxnet.np.random.normal(
                 0, 1, (sequence_length, batch_size, out_units),
-                dtype=np.float32, ctx=ctx)
+                dtype=np.float32, device=device)
             pooled_out_ograd = mxnet.np.random.normal(0, 1, (batch_size, out_units),
                                                       dtype=np.float32,
-                                                      ctx=ctx)
+                                                      device=device)
         else:
             raise NotImplementedError
         if model_cls.__name__ in ['BertModel', 'AlbertModel', 'ElectraModel', 'MobileBertModel']:
@@ -939,7 +939,7 @@ def train_step():
         mxnet.npx.waitall()
         runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3)
         mxnet.npx.waitall()
-        ctx.empty_cache()
+        device.empty_cache()
         mxnet.npx.waitall()
         # Profile memory
         if self._use_gpu:

@@ -9,7 +9,7 @@
 from gluonnlp.models import get_backbone
 from gluonnlp.utils.parameter import clip_grad_global_norm
 from gluonnlp.utils.preprocessing import get_trimmed_lengths
-from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat
+from gluonnlp.utils.misc import get_mxnet_visible_device, grouper, repeat
 from mxnet.gluon.data import batchify as bf
 from mxnet.gluon.data import DataLoader
 from mxnet.lr_scheduler import PolyScheduler
@@ -30,7 +30,7 @@ def forward(self, data, token_types, valid_length):
         out = self.out_proj(pooled_out)
         return out
 
-    def initialize_with_pretrained_backbone(self, backbone_params_path, ctx=None):
-        self.backbone.load_parameters(backbone_params_path, ctx=ctx)
-        self.out_proj.initialize(ctx=ctx)
+    def initialize_with_pretrained_backbone(self, backbone_params_path, device=None):
+        self.backbone.load_parameters(backbone_params_path, device=device)
+        self.out_proj.initialize(device=device)
 
@@ -10,7 +10,7 @@
 from gluonnlp.models import get_backbone
 from gluonnlp.utils.parameter import clip_grad_global_norm
 from gluonnlp.utils.preprocessing import get_trimmed_lengths
-from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat
+from gluonnlp.utils.misc import get_mxnet_visible_device, grouper, repeat
 from mxnet.gluon.data import batchify as bf
 from mxnet.gluon.data import DataLoader
 from mxnet.lr_scheduler import PolyScheduler