huggingface · sgugger · May 24, 2022 · Apr 5, 2022 · Apr 7, 2022 · Apr 7, 2022
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -272,6 +272,8 @@
       title: GPT-J
     - local: model_doc/gpt_neo
       title: GPT Neo
+    - local: model_doc/gpt_neox
+      title: GPT NeoX
     - local: model_doc/hubert
       title: Hubert
     - local: model_doc/perceiver

diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
@@ -208,6 +208,7 @@ Flax), PyTorch, and/or TensorFlow.
 |            GLPN             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 |            GPT-J            |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
+|           GPTNeoX           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          ImageGPT           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |

diff --git a/docs/source/en/model_doc/gpt_neox.mdx b/docs/source/en/model_doc/gpt_neox.mdx
@@ -0,0 +1,71 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# GPT-NeoX
+
+## Overview
+
+The GPT-NeoX-20B model was released in the [EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox) repository.
+It is a GPT-2-like causal language model trained on the
+[Pile](https://pile.eleuther.ai/) dataset.
+
+Development of the model was led by Sid Black, Stella Biderman and Eric Hallahan, and the model was trained with the generous support of [CoreWeave](https://www.coreweave.com/).
+More details about the model can be found in the [manuscript](https://arxiv.org/abs/2204.06745).
+
+### Generation
+
+The `generate()` method can be used to generate text using the GPT-NeoX model.
+
+```python
+>>> from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizer
+
+>>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b")
+>>> tokenizer = GPTNeoXTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )
+
+>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
+>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+```
+
+## GPTNeoXConfig
+
+[[autodoc]] GPTNeoXConfig
+
+## GPTNeoXTokenizer
+
+[[autodoc]] GPTNeoXTokenizer
+
+## GPTNeoXTokenizerFast
+
+[[autodoc]] GPTNeoXTokenizerFast
+
+## GPTNeoXModel
+
+[[autodoc]] GPTNeoXModel
+    - forward
+
+## GPTNeoXForCausalLM
+
+[[autodoc]] GPTNeoXForCausalLM
+    - forward
diff --git a/docs/source/model_doc/gpt_neox.mdx b/docs/source/model_doc/gpt_neox.mdx
@@ -0,0 +1,86 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# GPT-NeoX-20B
+
+## Overview
+
+The GPT-NeoX-20B model was proposed in [Black et al. (2022)](http://eaidata.bmk.sh/data/GPT_NeoX_20B.pdf) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, and Samuel Weinbach. It is a 20 billion parameter autoregressive language model.
+
+The abstract from the paper is the following:
+
+*GPT-NeoX-20B is a 20 billion parameter autoregressive language model whose weights will be made freely and openly available to the public through a permissive license. It is, to the best of our knowledge, the largest dense autoregressive model that has publicly available weights. In this paper, we describe the model architecture and training, evaluate its performance, and discuss the broader impacts of its release. We are open-sourcing the training and evaluation code, as well as the model weights, at https://github.com/EleutherAI/gpt-neox.*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [zphang](https://huggingface.co/zphang). The original code can be found [here](https://github.com/EleutherAI/gpt-neox) and [here](https://github.com/zphang/minimal-gpt-neox-20b).
+
+## GPTNeoXConfig
+
+[[autodoc]] GPTNeoXConfig
+
+
+## GPTNeoXTokenizer
+
+[[autodoc]] GPTNeoXTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+
+## GPTNeoXTokenizerFast
+
+[[autodoc]] GPTNeoXTokenizerFast
+
+
+## GPTNeoXModel
+
+[[autodoc]] GPTNeoXModel
+    - forward
+
+
+## GPTNeoXForCausalLM
+
+[[autodoc]] GPTNeoXForCausalLM
+    - forward
+
+
+## GPTNeoXForMaskedLM
+
+[[autodoc]] GPTNeoXForMaskedLM
+    - forward
+
+
+## GPTNeoXForSequenceClassification
+
+[[autodoc]] transformers.GPTNeoXForSequenceClassification
+    - forward
+
+## GPTNeoXForMultipleChoice
+
+[[autodoc]] transformers.GPTNeoXForMultipleChoice
+    - forward
+
+
+## GPTNeoXForTokenClassification
+
+[[autodoc]] transformers.GPTNeoXForTokenClassification
+    - forward
+
+
+## GPTNeoXForQuestionAnswering
+
+[[autodoc]] GPTNeoXForQuestionAnswering
+    - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -203,6 +203,7 @@
     "models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"],
     "models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"],
     "models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"],
+    "models.gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig", "GPTNeoXTokenizer"],
     "models.gptj": ["GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTJConfig"],
     "models.herbert": ["HerbertTokenizer"],
     "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
@@ -466,6 +467,7 @@
     _import_structure["models.fnet"].append("FNetTokenizerFast")
     _import_structure["models.funnel"].append("FunnelTokenizerFast")
     _import_structure["models.gpt2"].append("GPT2TokenizerFast")
+    _import_structure["models.gpt_neox"].append("GPTNeoXTokenizerFast")
     _import_structure["models.herbert"].append("HerbertTokenizerFast")
     _import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
     _import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
@@ -648,6 +650,7 @@
     _import_structure["modeling_utils"] = ["PreTrainedModel"]
 
     # PyTorch models structure
+
     _import_structure["models.albert"].extend(
         [
             "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1061,6 +1064,15 @@
             "load_tf_weights_in_gpt_neo",
         ]
     )
+    _import_structure["models.gpt_neox"].extend(
+        [
+            "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "GPTNeoXForCausalLM",
+            "GPTNeoXLayer",
+            "GPTNeoXModel",
+            "GPTNeoXPreTrainedModel",
+        ]
+    )
     _import_structure["models.gptj"].extend(
         [
             "GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2601,6 +2613,7 @@
     from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
     from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
     from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig
+    from .models.gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig, GPTNeoXTokenizer
     from .models.gptj import GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTJConfig
     from .models.herbert import HerbertTokenizer
     from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
@@ -2837,6 +2850,7 @@
         from .models.fnet import FNetTokenizerFast
         from .models.funnel import FunnelTokenizerFast
         from .models.gpt2 import GPT2TokenizerFast
+        from .models.gpt_neox import GPTNeoXTokenizerFast
         from .models.herbert import HerbertTokenizerFast
         from .models.layoutlm import LayoutLMTokenizerFast
         from .models.layoutlmv2 import LayoutLMv2TokenizerFast
@@ -2930,6 +2944,7 @@
         from .utils.dummy_scatter_objects import *
 
     if is_torch_available():
+
         # Benchmarks
         from .benchmark.benchmark import PyTorchBenchmark
         from .benchmark.benchmark_args import PyTorchBenchmarkArguments
@@ -3327,6 +3342,13 @@
             GPTNeoPreTrainedModel,
             load_tf_weights_in_gpt_neo,
         )
+        from .models.gpt_neox import (
+            GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            GPTNeoXForCausalLM,
+            GPTNeoXLayer,
+            GPTNeoXModel,
+            GPTNeoXPreTrainedModel,
+        )
         from .models.gptj import (
             GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST,
             GPTJForCausalLM,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -60,6 +60,7 @@
     glpn,
     gpt2,
     gpt_neo,
+    gpt_neox,
     gptj,
     herbert,
     hubert,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -33,6 +33,7 @@
         ("dpt", "DPTConfig"),
         ("decision_transformer", "DecisionTransformerConfig"),
         ("glpn", "GLPNConfig"),
+        ("gpt_neox", "GPTNeoXConfig"),
         ("maskformer", "MaskFormerConfig"),
         ("decision_transformer", "DecisionTransformerConfig"),
         ("poolformer", "PoolFormerConfig"),
@@ -140,6 +141,7 @@
         # Add archive maps here)
         ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("gpt_neox", "GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("maskformer", "MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("poolformer", "POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -235,6 +237,7 @@
         ("dpt", "DPT"),
         ("decision_transformer", "Decision Transformer"),
         ("glpn", "GLPN"),
+        ("gpt_neox", "GPTNeoX"),
         ("maskformer", "MaskFormer"),
         ("poolformer", "PoolFormer"),
         ("convnext", "ConvNext"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -31,6 +31,7 @@
         ("dpt", "DPTModel"),
         ("decision_transformer", "DecisionTransformerModel"),
         ("glpn", "GLPNModel"),
+        ("gpt_neox", "GPTNeoXModel"),
         ("maskformer", "MaskFormerModel"),
         ("decision_transformer", "DecisionTransformerModel"),
         ("decision_transformer_gpt2", "DecisionTransformerGPT2Model"),
@@ -231,6 +232,7 @@
 MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Causal LM mapping
+        ("gpt_neox", "GPTNeoXForCausalLM"),
         ("xglm", "XGLMForCausalLM"),
         ("plbart", "PLBartForCausalLM"),
         ("qdqbert", "QDQBertLMHeadModel"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -180,6 +180,7 @@
             ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
             ("hubert", ("Wav2Vec2CTCTokenizer", None)),
             ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("gpt_neox", ("GPTNeoXTokenizer", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
             ("luke", ("LukeTokenizer", None)),
             ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
             ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),

diff --git a/src/transformers/models/gpt_neox/__init__.py b/src/transformers/models/gpt_neox/__init__.py
@@ -0,0 +1,62 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+
+
+_import_structure = {
+    "configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"],
+    "tokenization_gpt_neox": ["GPTNeoXTokenizer"],
+}
+
+if is_tokenizers_available():
+    _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
+
+if is_torch_available():
+    _import_structure["modeling_gpt_neox"] = [
+        "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "GPTNeoXForCausalLM",
+        "GPTNeoXLayer",
+        "GPTNeoXModel",
+        "GPTNeoXPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
+    from .tokenization_gpt_neox import GPTNeoXTokenizer
+
+    if is_tokenizers_available():
+        from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
+
+    if is_torch_available():
+        from .modeling_gpt_neox import (
+            GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            GPTNeoXForCausalLM,
+            GPTNeoXLayer,
+            GPTNeoXModel,
+            GPTNeoXPreTrainedModel,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)