From 9e935007b02b27d7f4690c1a3148006de31fb747 Mon Sep 17 00:00:00 2001
From: rightyonghu
Date: Sat, 27 Aug 2022 17:59:36 +0800
Subject: [PATCH 01/23] add_ernie

---
 docs/source/en/model_doc/ernie.mdx | 74 +
 src/transformers/__init__.py | 44 +
 src/transformers/models/__init__.py | 1 +
 .../models/auto/configuration_auto.py | 3 +
 src/transformers/models/auto/modeling_auto.py | 10 +
 .../models/auto/tokenization_auto.py | 1 +
 src/transformers/models/ernie/__init__.py | 88 +
 .../models/ernie/configuration_ernie.py | 169 ++
 .../models/ernie/modeling_ernie.py | 1819 +++++++++++++++++
 tests/models/ernie/__init__.py | 0
 tests/models/ernie/test_modeling_ernie.py | 579 ++++++
 11 files changed, 2788 insertions(+)
 create mode 100644 docs/source/en/model_doc/ernie.mdx
 create mode 100644 src/transformers/models/ernie/__init__.py
 create mode 100644 src/transformers/models/ernie/configuration_ernie.py
 create mode 100644 src/transformers/models/ernie/modeling_ernie.py
 create mode 100644 tests/models/ernie/__init__.py
 create mode 100644 tests/models/ernie/test_modeling_ernie.py

diff --git a/docs/source/en/model_doc/ernie.mdx b/docs/source/en/model_doc/ernie.mdx
new file mode 100644
index 0000000000000..db04a99c992c2
--- /dev/null
+++ b/docs/source/en/model_doc/ernie.mdx
@@ -0,0 +1,74 @@
+
+
+# Ernie
+
+## Overview
+ERNIE is a series of powerful models proposed by Baidu that perform especially well on Chinese tasks,
+including [ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
+[ERNIE3.0](https://arxiv.org/abs/2107.02137), [ERNIE-Gram](https://arxiv.org/abs/2010.12148), [ERNIE-health](https://arxiv.org/abs/2110.07244), etc.
+
+These models were contributed by [nghuyong](https://huggingface.co/nghuyong), and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).
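+
+The snippet below is a minimal usage sketch. It assumes the `nghuyong/ernie-3.0-base-zh` checkpoint listed in this patch ships tokenizer files on the Hub; since Ernie reuses the BERT tokenizer classes (see the tokenizer mapping added in `tokenization_auto.py`), the input is prepared with [`BertTokenizer`]:
+
+```python
+>>> from transformers import BertTokenizer, ErnieModel
+
+>>> # load the tokenizer and the bare encoder from the same checkpoint
+>>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")
+>>> model = ErnieModel.from_pretrained("nghuyong/ernie-3.0-base-zh")
+
+>>> inputs = tokenizer("欢迎使用 ERNIE!", return_tensors="pt")
+>>> outputs = model(**inputs)
+
+>>> # encoder output, shape (batch_size, sequence_length, hidden_size)
+>>> last_hidden_states = outputs.last_hidden_state
+```
+
+The task-specific heads below (for example [`ErnieForSequenceClassification`]) can be loaded from the same checkpoints in the same way.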
+ +## ErnieConfig + +[[autodoc]] ErnieConfig + - all + +## Ernie specific outputs + +[[autodoc]] models.ernie.modeling_ernie.ErnieForPreTrainingOutput + +## ErnieModel + +[[autodoc]] ErnieModel + - forward + +## ErnieForPreTraining + +[[autodoc]] ErnieForPreTraining + - forward + +## ErnieLMHeadModel + +[[autodoc]] ErnieLMHeadModel + - forward + +## ErnieForMaskedLM + +[[autodoc]] ErnieForMaskedLM + - forward + +## ErnieForNextSentencePrediction + +[[autodoc]] ErnieForNextSentencePrediction + - forward + +## ErnieForSequenceClassification + +[[autodoc]] ErnieForSequenceClassification + - forward + +## ErnieForMultipleChoice + +[[autodoc]] ErnieForMultipleChoice + - forward + +## ErnieForTokenClassification + +[[autodoc]] ErnieForTokenClassification + - forward + +## ErnieForQuestionAnswering + +[[autodoc]] ErnieForQuestionAnswering + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4651c3b5b9e92..c42265698c347 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -143,6 +143,13 @@ "BertTokenizer", "WordpieceTokenizer", ], + "models.ernie": [ + "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BasicTokenizer", + "ErnieConfig", + + "WordpieceTokenizer", + ], "models.bert_generation": ["BertGenerationConfig"], "models.bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"], "models.bertweet": ["BertweetTokenizer"], @@ -888,6 +895,22 @@ "load_tf_weights_in_bert", ] ) + _import_structure["models.ernie"].extend( + [ + "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ErnieForMaskedLM", + "ErnieForMultipleChoice", + "ErnieForNextSentencePrediction", + "ErnieForPreTraining", + "ErnieForQuestionAnswering", + "ErnieForSequenceClassification", + "ErnieForTokenClassification", + "ErnieLayer", + "ErnieLMHeadModel", + "ErnieModel", + "ErniePreTrainedModel", + ] + ) _import_structure["models.bert_generation"].extend( [ "BertGenerationDecoder", @@ -2989,6 +3012,13 @@ BertTokenizer, WordpieceTokenizer, ) + from .models.ernie import ( + ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, + BasicTokenizer, + ErnieConfig, + + WordpieceTokenizer, + ) from .models.bert_generation import BertGenerationConfig from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .models.bertweet import BertweetTokenizer @@ -3630,6 +3660,20 @@ BertPreTrainedModel, load_tf_weights_in_bert, ) + from .models.ernie import ( + ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ErnieLayer, + ErnieLMHeadModel, + ErnieModel, + ErniePreTrainedModel, + ) from .models.bert_generation import ( BertGenerationDecoder, BertGenerationEncoder, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 05e0d6ea13207..2f964cf4219e1 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -24,6 +24,7 @@ bartpho, beit, bert, + ernie, bert_generation, bert_japanese, bertweet, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c387b89530d91..6017fd8896402 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -33,6 +33,7 @@ ("bart", "BartConfig"), ("beit", "BeitConfig"), ("bert", "BertConfig"), + ("ernie", "ErnieConfig"), 
("bert-generation", "BertGenerationConfig"), ("big_bird", "BigBirdConfig"), ("bigbird_pegasus", "BigBirdPegasusConfig"), @@ -162,6 +163,7 @@ ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ernie", "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -278,6 +280,7 @@ ("bartpho", "BARTpho"), ("beit", "BEiT"), ("bert", "BERT"), + ("ernie", "Ernie"), ("bert-generation", "Bert Generation"), ("bert-japanese", "BertJapanese"), ("bertweet", "BERTweet"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5060b535b05da..1af787e0f642f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -32,6 +32,7 @@ ("bart", "BartModel"), ("beit", "BeitModel"), ("bert", "BertModel"), + ("ernie", "ErnieModel"), ("bert-generation", "BertGenerationEncoder"), ("big_bird", "BigBirdModel"), ("bigbird_pegasus", "BigBirdPegasusModel"), @@ -155,6 +156,7 @@ ("albert", "AlbertForPreTraining"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForPreTraining"), + ("ernie", "ErnieForPreTraining"), ("big_bird", "BigBirdForPreTraining"), ("bloom", "BloomForCausalLM"), ("camembert", "CamembertForMaskedLM"), @@ -208,6 +210,7 @@ ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), + ("ernie", "ErnieForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), ("blenderbot-small", "BlenderbotSmallForConditionalGeneration"), @@ -272,6 +275,7 @@ # Model for Causal LM mapping ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), + ("ernie", "ErnieLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), ("big_bird", "BigBirdForCausalLM"), ("bigbird_pegasus", "BigBirdPegasusForCausalLM"), @@ -404,6 +408,7 @@ ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), + ("ernie", "ErnieForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), ("camembert", "CamembertForMaskedLM"), ("convbert", "ConvBertForMaskedLM"), @@ -489,6 +494,7 @@ ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), + ("ernie", "ErnieForSequenceClassification"), ("big_bird", "BigBirdForSequenceClassification"), ("bigbird_pegasus", "BigBirdPegasusForSequenceClassification"), ("bloom", "BloomForSequenceClassification"), @@ -547,6 +553,7 @@ ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), + ("ernie", "ErnieForQuestionAnswering"), ("big_bird", "BigBirdForQuestionAnswering"), ("bigbird_pegasus", "BigBirdPegasusForQuestionAnswering"), ("camembert", "CamembertForQuestionAnswering"), @@ -608,6 +615,7 @@ # Model for Token Classification mapping ("albert", "AlbertForTokenClassification"), ("bert", "BertForTokenClassification"), + ("ernie", "ErnieForTokenClassification"), ("big_bird", "BigBirdForTokenClassification"), ("bloom", "BloomForTokenClassification"), ("camembert", "CamembertForTokenClassification"), @@ -651,6 +659,7 @@ # Model for Multiple Choice mapping ("albert", "AlbertForMultipleChoice"), ("bert", "BertForMultipleChoice"), + ("ernie", "ErnieForMultipleChoice"), ("big_bird", 
"BigBirdForMultipleChoice"), ("camembert", "CamembertForMultipleChoice"), ("canine", "CanineForMultipleChoice"), @@ -686,6 +695,7 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict( [ ("bert", "BertForNextSentencePrediction"), + ("ernie", "ErnieForNextSentencePrediction"), ("fnet", "FNetForNextSentencePrediction"), ("megatron-bert", "MegatronBertForNextSentencePrediction"), ("mobilebert", "MobileBertForNextSentencePrediction"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 8ece13b79fe3f..3add419e73909 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -63,6 +63,7 @@ ), ("bartpho", ("BartphoTokenizer", None)), ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), ("bert-japanese", ("BertJapaneseTokenizer", None)), ("bertweet", ("BertweetTokenizer", None)), diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py new file mode 100644 index 0000000000000..b1d9013528db4 --- /dev/null +++ b/src/transformers/models/ernie/__init__.py @@ -0,0 +1,88 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_ernie": ["ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieConfig", "ErnieOnnxConfig"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_ernie"] = [
+        "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "ErnieForMaskedLM",
+        "ErnieForMultipleChoice",
+        "ErnieForNextSentencePrediction",
+        "ErnieForPreTraining",
+        "ErnieForQuestionAnswering",
+        "ErnieForSequenceClassification",
+        "ErnieForTokenClassification",
+        "ErnieLayer",
+        "ErnieLMHeadModel",
+        "ErnieModel",
+        "ErniePreTrainedModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig, ErnieOnnxConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_ernie import (
+            ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST,
+            ErnieForMaskedLM,
+            ErnieForMultipleChoice,
+            ErnieForNextSentencePrediction,
+            ErnieForPreTraining,
+            ErnieForQuestionAnswering,
+            ErnieForSequenceClassification,
+            ErnieForTokenClassification,
+            ErnieLayer,
+            ErnieLMHeadModel,
+            ErnieModel,
+            ErniePreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py
new file mode 100644
index 0000000000000..59c86d8c5d65a
--- /dev/null
+++ b/src/transformers/models/ernie/configuration_ernie.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" ERNIE model configuration""" +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "nghuyong/ernie-1.0-base-zh": "https://huggingface.co/nghuyong/ernie-1.0-base-zh/resolve/main/config.json", + "nghuyong/ernie-2.0-base-en": "https://huggingface.co/nghuyong/ernie-2.0-base-en/resolve/main/config.json", + "nghuyong/ernie-2.0-large-en": "https://huggingface.co/nghuyong/ernie-2.0-large-en/resolve/main/config.json", + "nghuyong/ernie-3.0-base-zh": "https://huggingface.co/nghuyong/ernie-3.0-base-zh/resolve/main/config.json", + "nghuyong/ernie-3.0-medium-zh": "https://huggingface.co/nghuyong/ernie-3.0-medium-zh/resolve/main/config.json", + "nghuyong/ernie-3.0-mini-zh": "https://huggingface.co/nghuyong/ernie-3.0-mini-zh/resolve/main/config.json", + "nghuyong/ernie-3.0-micro-zh": "https://huggingface.co/nghuyong/ernie-3.0-micro-zh/resolve/main/config.json", + "nghuyong/ernie-3.0-nano-zh": "https://huggingface.co/nghuyong/ernie-3.0-nano-zh/resolve/main/config.json", + "nghuyong/ernie-gram-zh": "https://huggingface.co/nghuyong/ernie-gram-zh/resolve/main/config.json", + "nghuyong/ernie-health-zh": "https://huggingface.co/nghuyong/ernie-health-zh/resolve/main/config.json", +} + + + +class ErnieConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieModel`] or a [`TFErnieModel`]. It is used to + instantiate a ERNIE model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ERNIE + [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ERNIE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieModel`] or [`TFErnieModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`] or [`TFErnieModel`]. + task_type_vocab_size (`int`, *optional*, defaults to 3): + The vocabulary size of the `task_type_ids` for ERNIE2.0/ERNIE3.0 model + use_task_id (`bool`, *optional*, defaults to `False`): + Whether or not the model support `task_type_ids` + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import ErnieModel, ErnieConfig + + >>> # Initializing a ERNIE nghuyong/ernie-3.0-base-zh style configuration + >>> configuration = ErnieConfig() + + >>> # Initializing a model from the nghuyong/ernie-3.0-base-zh style configuration + >>> model = ErnieModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "ernie" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + task_type_vocab_size=3, + use_task_id=False, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.task_type_vocab_size = task_type_vocab_size + self.use_task_id = use_task_id + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class ErnieOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( 
+ [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py new file mode 100644 index 0000000000000..f57c02832d8f7 --- /dev/null +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -0,0 +1,1819 @@ +# coding=utf-8 +# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ERNIE model.""" + + +import math +import os +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + is_torch_greater_than_1_6, + prune_linear_layer, +) +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_ernie import ErnieConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "nghuyong/ernie-3.0-base-zh" +_CONFIG_FOR_DOC = "ErnieConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +# # TokenClassification docstring +# _CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/ernie-large-cased-finetuned-conll03-english" +# _TOKEN_CLASS_EXPECTED_OUTPUT = ( +# "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +# ) +# _TOKEN_CLASS_EXPECTED_LOSS = 0.01 +# +# # QuestionAnswering docstring +# _CHECKPOINT_FOR_QA = "deepset/ernie-base-cased-squad2" +# _QA_EXPECTED_OUTPUT = "'a nice puppet'" +# _QA_EXPECTED_LOSS = 7.41 +# _QA_TARGET_START_INDEX = 14 +# _QA_TARGET_END_INDEX = 15 +# +# # SequenceClassification docstring +# _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/nghuyong/ernie-3.0-base-zh-yelp-polarity" +# _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +# _SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nghuyong/ernie-1.0-base-zh", + "nghuyong/ernie-2.0-base-en", + "nghuyong/ernie-2.0-large-en", + "nghuyong/ernie-3.0-base-zh", + "nghuyong/ernie-3.0-medium-zh", + "nghuyong/ernie-3.0-mini-zh", + "nghuyong/ernie-3.0-micro-zh", + "nghuyong/ernie-3.0-nano-zh", + "nghuyong/ernie-gram-zh", + "nghuyong/ernie-health-zh", + # See all Ernie models at https://huggingface.co/models?filter=ernie +] + + +class 
ErnieEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.use_task_id = config.use_task_id + if config.use_task_id: + self.task_type_embeddings = nn.Embedding(config.task_type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if is_torch_greater_than_1_6: + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + task_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + # add task_type_id for ERNIE model + if self.use_task_id: + if task_type_ids is None: + task_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + task_type_embeddings = self.task_type_embeddings(task_type_ids) + embeddings += task_type_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ErnieSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise 
ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ErnieModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class ErnieSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie +class ErnieAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ErnieSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = ErnieSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Ernie +class ErnieIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = 
self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Ernie +class ErnieOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Ernie +class ErnieLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ErnieAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = ErnieAttention(config, position_embedding_type="absolute") + self.intermediate = ErnieIntermediate(config) + self.output = ErnieOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + 
present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Ernie +class ErnieEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ErnieLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Ernie +class ErniePooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Ernie +class ErniePredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Ernie +class ErnieLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = ErniePredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Ernie +class ErnieOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = ErnieLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Ernie +class ErnieOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Ernie +class ErniePreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = ErnieLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->Ernie,bert->ernie +class ErniePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ErnieConfig + base_model_prefix = "ernie" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ErnieEncoder): + module.gradient_checkpointing = value + + +@dataclass +# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie +class ErnieForPreTrainingOutput(ModelOutput): + """ + Output type of [`ErnieForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. 
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +ERNIE_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`ErnieConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ERNIE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Ernie Model transformer outputting raw hidden-states without any specific head on top.", + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertModel with BERT->ERNIE,Bert->Ernie +class ErnieModel(ErniePreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = ErnieEmbeddings(config) + self.encoder = ErnieEncoder(config) + + self.pooler = ErniePooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Ernie Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForPreTraining with BERT->ERNIE,Bert->Ernie,bert->ernie,bert-base-uncased->nghuyong/ernie-3.0-base-zh +class ErnieForPreTraining(ErniePreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.ernie = ErnieModel(config) + self.cls = ErniePreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], ErnieForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, ErnieForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh") + >>> model = ErnieForPreTraining.from_pretrained("nghuyong/ernie-3.0-base-zh") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return ErnieForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING +) +# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieLMHeadModel(ErniePreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `ErnieLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.ernie = ErnieModel(config, add_pooling_layer=False) + self.cls = ErnieOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] 
= None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
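+
+        Example (a minimal causal-LM sketch, assuming the `nghuyong/ernie-3.0-base-zh` checkpoint; `is_decoder=True`
+        is set explicitly since the class is meant to be used as a decoder):
+
+        ```python
+        >>> from transformers import BertTokenizer, ErnieLMHeadModel
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")
+        >>> model = ErnieLMHeadModel.from_pretrained("nghuyong/ernie-3.0-base-zh", is_decoder=True)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs, labels=inputs["input_ids"])
+
+        >>> loss = outputs.loss  # next-token prediction loss (labels are shifted internally)
+        >>> logits = outputs.logits  # shape (batch_size, sequence_length, vocab_size)
+        ```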
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING) +# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieForMaskedLM(ErniePreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `ErnieForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.ernie = ErnieModel(config, add_pooling_layer=False) + self.cls = ErnieOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Ernie Model with a `next sentence prediction (classification)` head on 
top.""", + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction with BERT->ERNIE,Bert->Ernie,bert->ernie,bert-base-uncased->nghuyong/ernie-3.0-base-zh +class ErnieForNextSentencePrediction(ErniePreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.ernie = ErnieModel(config) + self.cls = ErnieOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, ErnieForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh") + >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-3.0-base-zh") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Ernie Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieForSequenceClassification(ErniePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.ernie = ErnieModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
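+
+        Example (a minimal single-label classification sketch; the checkpoint is the one used elsewhere in this
+        patch and `num_labels=2` is an illustrative assumption, so the classification head is freshly initialized):
+
+        ```python
+        >>> from transformers import BertTokenizer, ErnieForSequenceClassification
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")
+        >>> model = ErnieForSequenceClassification.from_pretrained("nghuyong/ernie-3.0-base-zh", num_labels=2)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs, labels=torch.tensor([1]))
+
+        >>> loss = outputs.loss  # cross-entropy, since num_labels > 1 and labels are integers
+        >>> logits = outputs.logits  # shape (batch_size, num_labels)
+        ```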
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Ernie Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieForMultipleChoice(ErniePreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.ernie = ErnieModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Ernie Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieForTokenClassification(ErniePreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie = ErnieModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
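+
+        Example (a minimal token classification sketch; `num_labels=5` and the all-zero labels are illustrative
+        assumptions, so the token classification head is freshly initialized):
+
+        ```python
+        >>> from transformers import BertTokenizer, ErnieForTokenClassification
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")
+        >>> model = ErnieForTokenClassification.from_pretrained("nghuyong/ernie-3.0-base-zh", num_labels=5)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> labels = torch.zeros_like(inputs["input_ids"])  # one label id per token, here all set to class 0
+        >>> outputs = model(**inputs, labels=labels)
+
+        >>> logits = outputs.logits  # shape (batch_size, sequence_length, num_labels)
+        ```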
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Ernie Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ERNIE_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with BERT->ERNIE,Bert->Ernie,bert->ernie +class ErnieForQuestionAnswering(ErniePreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie = ErnieModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
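+
+        Example (a minimal extractive QA sketch; the question/context strings are placeholders and the span
+        classification head of the base checkpoint is freshly initialized):
+
+        ```python
+        >>> from transformers import BertTokenizer, ErnieForQuestionAnswering
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")
+        >>> model = ErnieForQuestionAnswering.from_pretrained("nghuyong/ernie-3.0-base-zh")
+
+        >>> inputs = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> start_index = outputs.start_logits.argmax(dim=-1)  # predicted start token position
+        >>> end_index = outputs.end_logits.argmax(dim=-1)  # predicted end token position
+        ```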
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/tests/models/ernie/__init__.py b/tests/models/ernie/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py new file mode 100644 index 0000000000000..be9af7d7688d0 --- /dev/null +++ b/tests/models/ernie/test_modeling_ernie.py @@ -0,0 +1,579 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import tempfile +import unittest + +from transformers import ErnieConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device + +from ...generation.test_generation_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ErnieLMHeadModel, + ErnieModel, + ) + from transformers.models.ernie.modeling_ernie import ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST + + +class ErnieModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + """ + Returns a tiny configuration by default. 
+ """ + return ErnieConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ErnieModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = ErnieLMHeadModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, 
(self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_model_for_causal_lm_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ErnieLMHeadModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = ErnieLMHeadModel(config=config).to(torch_device).eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + 
input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ErnieForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ErnieForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ErnieForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + ErnieModel, + ErnieLMHeadModel, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = 
(ErnieLMHeadModel,) if is_torch_available() else () + fx_compatible = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = ErnieModelTester(self) + self.config_tester = ConfigTester(self, config_class=ErnieConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_causal_lm_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ErnieModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + @require_torch_gpu + def test_torchscript_device_change(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + + # ErnieForMultipleChoice behaves incorrectly in JIT environments. + if model_class == ErnieForMultipleChoice: + return + + config.torchscript = True + model = model_class(config=config) + + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + traced_model = torch.jit.trace( + model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) + ) + + with tempfile.TemporaryDirectory() as tmp: + torch.jit.save(traced_model, os.path.join(tmp, "ernie.pt")) + loaded = torch.jit.load(os.path.join(tmp, "ernie.pt"), map_location=torch_device) + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) \ No newline at end of file From eb48d3dd39f5cdd23882870e24cb6346c45700e4 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 1 Sep 2022 00:17:47 +0800 Subject: [PATCH 02/23] remove Tokenizer in ernie --- src/transformers/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c42265698c347..5ac0edebc0055 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -145,10 +145,7 @@ ], "models.ernie": [ "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", - "BasicTokenizer", "ErnieConfig", - - "WordpieceTokenizer", ], "models.bert_generation": ["BertGenerationConfig"], "models.bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"], @@ -3014,10 +3011,7 @@ ) from .models.ernie import ( ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, - BasicTokenizer, ErnieConfig, - - WordpieceTokenizer, ) from .models.bert_generation import BertGenerationConfig from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer From 21a57beeda19701d1d467fd995233fdc0f88df05 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Wed, 7 Sep 2022 23:29:33 +0800 Subject: [PATCH 03/23] polish code --- src/transformers/__init__.py | 8 +-- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 18 ++--- .../models/auto/tokenization_auto.py | 2 +- src/transformers/models/ernie/__init__.py | 7 -- .../models/ernie/configuration_ernie.py | 2 +- .../models/ernie/modeling_ernie.py | 68 +++++++++---------- 8 files changed, 51 insertions(+), 62 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5ac0edebc0055..80c959b171093 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -143,10 +143,6 @@ "BertTokenizer", "WordpieceTokenizer", ], - "models.ernie": [ - "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", - 
"ErnieConfig", - ], "models.bert_generation": ["BertGenerationConfig"], "models.bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"], "models.bertweet": ["BertweetTokenizer"], @@ -206,6 +202,10 @@ "models.dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"], "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"], "models.encoder_decoder": ["EncoderDecoderConfig"], + "models.ernie": [ + "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", + "ErnieConfig", + ], "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 2f964cf4219e1..ce70853ba2948 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -24,7 +24,6 @@ bartpho, beit, bert, - ernie, bert_generation, bert_japanese, bertweet, @@ -58,6 +57,7 @@ dpt, electra, encoder_decoder, + ernie, flaubert, flava, fnet, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6017fd8896402..3dd72dd64391b 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -33,7 +33,6 @@ ("bart", "BartConfig"), ("beit", "BeitConfig"), ("bert", "BertConfig"), - ("ernie", "ErnieConfig"), ("bert-generation", "BertGenerationConfig"), ("big_bird", "BigBirdConfig"), ("bigbird_pegasus", "BigBirdPegasusConfig"), @@ -62,6 +61,7 @@ ("dpt", "DPTConfig"), ("electra", "ElectraConfig"), ("encoder-decoder", "EncoderDecoderConfig"), + ("ernie", "ErnieConfig"), ("flaubert", "FlaubertConfig"), ("flava", "FlavaConfig"), ("fnet", "FNetConfig"), @@ -163,7 +163,6 @@ ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("ernie", "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -189,6 +188,7 @@ ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ernie", "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("flava", "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -280,7 +280,6 @@ ("bartpho", "BARTpho"), ("beit", "BEiT"), ("bert", "BERT"), - ("ernie", "Ernie"), ("bert-generation", "Bert Generation"), ("bert-japanese", "BertJapanese"), ("bertweet", "BERTweet"), @@ -317,6 +316,7 @@ ("dpt", "DPT"), ("electra", "ELECTRA"), ("encoder-decoder", "Encoder decoder"), + ("ernie", "Ernie"), ("flaubert", "FlauBERT"), ("flava", "FLAVA"), ("fnet", "FNet"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1af787e0f642f..c084ec0545f43 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -32,7 +32,6 @@ ("bart", "BartModel"), ("beit", "BeitModel"), ("bert", "BertModel"), - ("ernie", "ErnieModel"), ("bert-generation", "BertGenerationEncoder"), ("big_bird", "BigBirdModel"), ("bigbird_pegasus", "BigBirdPegasusModel"), @@ -61,6 +60,7 @@ ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), ("electra", "ElectraModel"), 
+ ("ernie", "ErnieModel"), ("flaubert", "FlaubertModel"), ("flava", "FlavaModel"), ("fnet", "FNetModel"), @@ -156,7 +156,6 @@ ("albert", "AlbertForPreTraining"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForPreTraining"), - ("ernie", "ErnieForPreTraining"), ("big_bird", "BigBirdForPreTraining"), ("bloom", "BloomForCausalLM"), ("camembert", "CamembertForMaskedLM"), @@ -166,6 +165,7 @@ ("deberta-v2", "DebertaV2ForMaskedLM"), ("distilbert", "DistilBertForMaskedLM"), ("electra", "ElectraForPreTraining"), + ("ernie", "ErnieForPreTraining"), ("flaubert", "FlaubertWithLMHeadModel"), ("flava", "FlavaForPreTraining"), ("fnet", "FNetForPreTraining"), @@ -210,7 +210,6 @@ ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), - ("ernie", "ErnieForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), ("blenderbot-small", "BlenderbotSmallForConditionalGeneration"), @@ -225,6 +224,7 @@ ("distilbert", "DistilBertForMaskedLM"), ("electra", "ElectraForMaskedLM"), ("encoder-decoder", "EncoderDecoderModel"), + ("ernie", "ErnieForMaskedLM"), ("flaubert", "FlaubertWithLMHeadModel"), ("fnet", "FNetForMaskedLM"), ("fsmt", "FSMTForConditionalGeneration"), @@ -275,7 +275,6 @@ # Model for Causal LM mapping ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), - ("ernie", "ErnieLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), ("big_bird", "BigBirdForCausalLM"), ("bigbird_pegasus", "BigBirdPegasusForCausalLM"), @@ -287,6 +286,7 @@ ("ctrl", "CTRLLMHeadModel"), ("data2vec-text", "Data2VecTextForCausalLM"), ("electra", "ElectraForCausalLM"), + ("ernie", "ErnieLMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), ("gpt_neox", "GPTNeoXForCausalLM"), @@ -408,7 +408,6 @@ ("albert", "AlbertForMaskedLM"), ("bart", "BartForConditionalGeneration"), ("bert", "BertForMaskedLM"), - ("ernie", "ErnieForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), ("camembert", "CamembertForMaskedLM"), ("convbert", "ConvBertForMaskedLM"), @@ -417,6 +416,7 @@ ("deberta-v2", "DebertaV2ForMaskedLM"), ("distilbert", "DistilBertForMaskedLM"), ("electra", "ElectraForMaskedLM"), + ("ernie", "ErnieForMaskedLM"), ("flaubert", "FlaubertWithLMHeadModel"), ("fnet", "FNetForMaskedLM"), ("funnel", "FunnelForMaskedLM"), @@ -494,7 +494,6 @@ ("albert", "AlbertForSequenceClassification"), ("bart", "BartForSequenceClassification"), ("bert", "BertForSequenceClassification"), - ("ernie", "ErnieForSequenceClassification"), ("big_bird", "BigBirdForSequenceClassification"), ("bigbird_pegasus", "BigBirdPegasusForSequenceClassification"), ("bloom", "BloomForSequenceClassification"), @@ -507,6 +506,7 @@ ("deberta-v2", "DebertaV2ForSequenceClassification"), ("distilbert", "DistilBertForSequenceClassification"), ("electra", "ElectraForSequenceClassification"), + ("ernie", "ErnieForSequenceClassification"), ("flaubert", "FlaubertForSequenceClassification"), ("fnet", "FNetForSequenceClassification"), ("funnel", "FunnelForSequenceClassification"), @@ -553,7 +553,6 @@ ("albert", "AlbertForQuestionAnswering"), ("bart", "BartForQuestionAnswering"), ("bert", "BertForQuestionAnswering"), - ("ernie", "ErnieForQuestionAnswering"), ("big_bird", "BigBirdForQuestionAnswering"), ("bigbird_pegasus", "BigBirdPegasusForQuestionAnswering"), ("camembert", "CamembertForQuestionAnswering"), @@ -564,6 +563,7 @@ ("deberta-v2", "DebertaV2ForQuestionAnswering"), ("distilbert", "DistilBertForQuestionAnswering"), ("electra", 
"ElectraForQuestionAnswering"), + ("ernie", "ErnieForQuestionAnswering"), ("flaubert", "FlaubertForQuestionAnsweringSimple"), ("fnet", "FNetForQuestionAnswering"), ("funnel", "FunnelForQuestionAnswering"), @@ -615,7 +615,6 @@ # Model for Token Classification mapping ("albert", "AlbertForTokenClassification"), ("bert", "BertForTokenClassification"), - ("ernie", "ErnieForTokenClassification"), ("big_bird", "BigBirdForTokenClassification"), ("bloom", "BloomForTokenClassification"), ("camembert", "CamembertForTokenClassification"), @@ -626,6 +625,7 @@ ("deberta-v2", "DebertaV2ForTokenClassification"), ("distilbert", "DistilBertForTokenClassification"), ("electra", "ElectraForTokenClassification"), + ("ernie", "ErnieForTokenClassification"), ("flaubert", "FlaubertForTokenClassification"), ("fnet", "FNetForTokenClassification"), ("funnel", "FunnelForTokenClassification"), @@ -659,7 +659,6 @@ # Model for Multiple Choice mapping ("albert", "AlbertForMultipleChoice"), ("bert", "BertForMultipleChoice"), - ("ernie", "ErnieForMultipleChoice"), ("big_bird", "BigBirdForMultipleChoice"), ("camembert", "CamembertForMultipleChoice"), ("canine", "CanineForMultipleChoice"), @@ -668,6 +667,7 @@ ("deberta-v2", "DebertaV2ForMultipleChoice"), ("distilbert", "DistilBertForMultipleChoice"), ("electra", "ElectraForMultipleChoice"), + ("ernie", "ErnieForMultipleChoice"), ("flaubert", "FlaubertForMultipleChoice"), ("fnet", "FNetForMultipleChoice"), ("funnel", "FunnelForMultipleChoice"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 3add419e73909..7308f042da1e0 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -63,7 +63,6 @@ ), ("bartpho", ("BartphoTokenizer", None)), ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), ("bert-japanese", ("BertJapaneseTokenizer", None)), ("bertweet", ("BertweetTokenizer", None)), @@ -122,6 +121,7 @@ ), ), ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), + ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flaubert", ("FlaubertTokenizer", None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index b1d9013528db4..14bf0d85a2949 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -51,13 +51,6 @@ "ErniePreTrainedModel", ] -try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_ernie_tf"] = ["TFBertTokenizer"] if TYPE_CHECKING: from .configuration_ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig, ErnieOnnxConfig diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 59c86d8c5d65a..26a6ab535f58d 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -38,7 +38,6 @@ } - class ErnieConfig(PretrainedConfig): r""" This is the configuration 
class to store the configuration of a [`ErnieModel`] or a [`TFErnieModel`]. It is used to @@ -165,5 +164,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis), + ("task_type_ids", dynamic_axis), ] ) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index f57c02832d8f7..7ac6a1e6906fc 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -17,7 +17,6 @@ import math -import os import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -59,29 +58,10 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "nghuyong/ernie-3.0-base-zh" +_CHECKPOINT_FOR_DOC = "nghuyong/ernie-1.0-base-zh" _CONFIG_FOR_DOC = "ErnieConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" -# # TokenClassification docstring -# _CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/ernie-large-cased-finetuned-conll03-english" -# _TOKEN_CLASS_EXPECTED_OUTPUT = ( -# "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " -# ) -# _TOKEN_CLASS_EXPECTED_LOSS = 0.01 -# -# # QuestionAnswering docstring -# _CHECKPOINT_FOR_QA = "deepset/ernie-base-cased-squad2" -# _QA_EXPECTED_OUTPUT = "'a nice puppet'" -# _QA_EXPECTED_LOSS = 7.41 -# _QA_TARGET_START_INDEX = 14 -# _QA_TARGET_END_INDEX = 15 -# -# # SequenceClassification docstring -# _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/nghuyong/ernie-3.0-base-zh-yelp-polarity" -# _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" -# _SEQ_CLASS_EXPECTED_LOSS = 0.01 - ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = [ "nghuyong/ernie-1.0-base-zh", @@ -94,7 +74,7 @@ "nghuyong/ernie-3.0-nano-zh", "nghuyong/ernie-gram-zh", "nghuyong/ernie-health-zh", - # See all Ernie models at https://huggingface.co/models?filter=ernie + # See all ERNIE models at https://huggingface.co/models?filter=ernie ] @@ -163,7 +143,7 @@ def forward( position_embeddings = self.position_embeddings(position_ids) embeddings += position_embeddings - # add task_type_id for ERNIE model + # add `task_type_id` for ERNIE model if self.use_task_id: if task_type_ids is None: task_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) @@ -175,6 +155,7 @@ def forward( return embeddings +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Ernie class ErnieSelfAttention(nn.Module): def __init__(self, config, position_embedding_type=None): super().__init__() @@ -302,6 +283,7 @@ def forward( return outputs +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Ernie class ErnieSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -780,6 +762,9 @@ class ErnieForPreTrainingOutput(ModelOutput): - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) + task_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of task types, the values of `task_type_ids` are usually the same. Indices are selected in `[0, + config.task_type_vocab_size]`. position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. 
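The `task_type_ids` argument documented above is the main interface difference from BERT. Below is a minimal sketch of passing it through the public API; the zero-filled tensor is an illustrative assumption, and checkpoints whose config leaves `use_task_id` disabled will, per the embedding code in this hunk, simply ignore it:

```python
import torch
from transformers import BertTokenizer, ErnieModel

tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
model = ErnieModel.from_pretrained("nghuyong/ernie-1.0-base-zh")

inputs = tokenizer("你好,世界", return_tensors="pt")
# same shape as input_ids; all zeros = a single default task (illustrative assumption)
task_type_ids = torch.zeros_like(inputs["input_ids"])

outputs = model(**inputs, task_type_ids=task_type_ids)
print(outputs.last_hidden_state.shape)  # (1, seq_len, hidden_size)
```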
@@ -810,7 +795,7 @@ class ErnieForPreTrainingOutput(ModelOutput): "The bare Ernie Model transformer outputting raw hidden-states without any specific head on top.", ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertModel with BERT->ERNIE,Bert->Ernie + class ErnieModel(ErniePreTrainedModel): """ @@ -862,6 +847,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -956,6 +942,7 @@ def forward( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) @@ -994,7 +981,7 @@ def forward( """, ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForPreTraining with BERT->ERNIE,Bert->Ernie,bert->ernie,bert-base-uncased->nghuyong/ernie-3.0-base-zh + class ErnieForPreTraining(ErniePreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1018,6 +1005,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1049,8 +1037,8 @@ def forward( >>> from transformers import BertTokenizer, ErnieForPreTraining >>> import torch - >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh") - >>> model = ErnieForPreTraining.from_pretrained("nghuyong/ernie-3.0-base-zh") + >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh") + >>> model = ErnieForPreTraining.from_pretrained("nghuyong/ernie-1.0-base-zh") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1065,6 +1053,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1099,7 +1088,6 @@ def forward( @add_start_docstrings( """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING ) -# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel with BERT->ERNIE,Bert->Ernie,bert->ernie class ErnieLMHeadModel(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1135,6 +1123,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1179,6 +1168,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1235,7 +1225,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING) -# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM with BERT->ERNIE,Bert->Ernie,bert->ernie class 
ErnieForMaskedLM(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1276,6 +1265,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1299,6 +1289,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1349,7 +1340,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """Ernie Model with a `next sentence prediction (classification)` head on top.""", ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction with BERT->ERNIE,Bert->Ernie,bert->ernie,bert-base-uncased->nghuyong/ernie-3.0-base-zh class ErnieForNextSentencePrediction(ErniePreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1367,6 +1357,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1392,8 +1383,8 @@ def forward( >>> from transformers import BertTokenizer, ErnieForNextSentencePrediction >>> import torch - >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh") - >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-3.0-base-zh") + >>> tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh") + >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-1.0-base-zh") >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
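The masked-LM head gains the same `task_type_ids` plumbing in this hunk. A short fill-mask sketch of that head follows, not part of the diff; the Chinese prompt and the greedy decoding of the masked position are illustrative assumptions:

```python
import torch
from transformers import BertTokenizer, ErnieForMaskedLM

tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
model = ErnieForMaskedLM.from_pretrained("nghuyong/ernie-1.0-base-zh")

inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# locate the [MASK] position, then greedily decode the top prediction for it
mask_index = (inputs["input_ids"] == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))
```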
@@ -1419,6 +1410,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1455,7 +1447,6 @@ def forward( """, ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with BERT->ERNIE,Bert->Ernie,bert->ernie class ErnieForSequenceClassification(ErniePreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1478,6 +1469,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1498,6 +1490,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1552,7 +1545,6 @@ def forward( """, ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with BERT->ERNIE,Bert->Ernie,bert->ernie class ErnieForMultipleChoice(ErniePreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1579,6 +1571,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1610,6 +1603,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1648,7 +1642,6 @@ def forward( """, ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with BERT->ERNIE,Bert->Ernie,bert->ernie class ErnieForTokenClassification(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1673,6 +1666,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1691,6 +1685,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, @@ -1728,7 +1723,6 @@ def forward( """, ERNIE_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with BERT->ERNIE,Bert->Ernie,bert->ernie class ErnieForQuestionAnswering(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] @@ -1749,6 +1743,7 @@ def forward( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, + task_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1774,6 +1769,7 @@ def forward( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + 
task_type_ids=task_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, From e23184c999355d921c1b9503f8c212ee19feb148 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Wed, 7 Sep 2022 23:38:41 +0800 Subject: [PATCH 04/23] format code style --- src/transformers/models/ernie/__init__.py | 1 + src/transformers/models/ernie/modeling_ernie.py | 7 ------- tests/models/ernie/test_modeling_ernie.py | 4 +--- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index 14bf0d85a2949..bbd254f1bb404 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -78,4 +78,5 @@ else: import sys + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 7ac6a1e6906fc..6592411e2b146 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -500,7 +500,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -795,7 +794,6 @@ class ErnieForPreTrainingOutput(ModelOutput): "The bare Ernie Model transformer outputting raw hidden-states without any specific head on top.", ERNIE_START_DOCSTRING, ) - class ErnieModel(ErniePreTrainedModel): """ @@ -981,7 +979,6 @@ def forward( """, ERNIE_START_DOCSTRING, ) - class ErnieForPreTraining(ErniePreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1089,7 +1086,6 @@ def forward( """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING ) class ErnieLMHeadModel(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1226,7 +1222,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING) class ErnieForMaskedLM(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1643,7 +1638,6 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForTokenClassification(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1724,7 +1718,6 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForQuestionAnswering(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index be9af7d7688d0..eefacdeebebb1 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -427,7 +427,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( ErnieModel, @@ -560,7 +559,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # 
ErnieForMultipleChoice behaves incorrectly in JIT environments. if model_class == ErnieForMultipleChoice: return @@ -576,4 +574,4 @@ def test_torchscript_device_change(self): with tempfile.TemporaryDirectory() as tmp: torch.jit.save(traced_model, os.path.join(tmp, "ernie.pt")) loaded = torch.jit.load(os.path.join(tmp, "ernie.pt"), map_location=torch_device) - loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) \ No newline at end of file + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) From ec50d06fc8740b6dffd6b396d98a1e17c9d32a44 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 00:41:03 +0800 Subject: [PATCH 05/23] polish code --- src/transformers/models/ernie/modeling_ernie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 6592411e2b146..3de4306cc7fb2 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -500,6 +500,7 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: + if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -653,7 +654,6 @@ def forward(self, sequence_output, pooled_output): return prediction_scores, seq_relationship_score -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->Ernie,bert->ernie class ErniePreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained From b57ae7cd11f3bdd88a0dd7d677e05afd39f7d7a0 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 01:34:39 +0800 Subject: [PATCH 06/23] fix style --- src/transformers/__init__.py | 33 +++++++++---------- src/transformers/dependency_versions_table.py | 1 - src/transformers/models/ernie/__init__.py | 7 +--- .../models/ernie/modeling_ernie.py | 4 +-- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 80c959b171093..6216841863f71 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3009,10 +3009,6 @@ BertTokenizer, WordpieceTokenizer, ) - from .models.ernie import ( - ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, - ErnieConfig, - ) from .models.bert_generation import BertGenerationConfig from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .models.bertweet import BertweetTokenizer @@ -3068,6 +3064,7 @@ from .models.dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer from .models.encoder_decoder import EncoderDecoderConfig + from .models.ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -3654,20 +3651,6 @@ BertPreTrainedModel, load_tf_weights_in_bert, ) - from .models.ernie import ( - ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST, - ErnieForMaskedLM, - ErnieForMultipleChoice, - ErnieForNextSentencePrediction, - ErnieForPreTraining, - ErnieForQuestionAnswering, - ErnieForSequenceClassification, - 
ErnieForTokenClassification, - ErnieLayer, - ErnieLMHeadModel, - ErnieModel, - ErniePreTrainedModel, - ) from .models.bert_generation import ( BertGenerationDecoder, BertGenerationEncoder, @@ -3885,6 +3868,20 @@ load_tf_weights_in_electra, ) from .models.encoder_decoder import EncoderDecoderModel + from .models.ernie import ( + ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ErnieLayer, + ErnieLMHeadModel, + ErnieModel, + ErniePreTrainedModel, + ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, FlaubertForMultipleChoice, diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 58e4a2cd42c37..4bde0425cee2d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -62,7 +62,6 @@ "starlette": "starlette", "tensorflow-cpu": "tensorflow-cpu>=2.3,<2.10", "tensorflow": "tensorflow>=2.3,<2.10", - "tensorflow-text": "tensorflow-text", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index bbd254f1bb404..548e37da8e5f8 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -18,12 +18,7 @@ from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_tensorflow_text_available, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tensorflow_text_available, is_torch_available _import_structure = { diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 3de4306cc7fb2..e51764a06cac1 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -762,8 +762,8 @@ class ErnieForPreTrainingOutput(ModelOutput): [What are token type IDs?](../glossary#token-type-ids) task_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of task types, the values of `task_type_ids` are usually the same. Indices are selected in `[0, - config.task_type_vocab_size]`. + Indices of task types, the values of `task_type_ids` are usually the same. Indices are selected in `[0, + config.task_type_vocab_size]`. position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. 
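For orientation, here is a hedged sketch of building a small ERNIE from scratch with the new config surface; the `task_type_vocab_size` and `use_task_id` names follow the modeling code in this PR, while every concrete value below is an illustrative assumption:

```python
from transformers import ErnieConfig, ErnieModel

config = ErnieConfig(
    vocab_size=18000,          # illustrative; real checkpoints define their own
    hidden_size=312,
    num_hidden_layers=4,
    num_attention_heads=12,
    task_type_vocab_size=3,    # number of distinct task ids (assumption)
    use_task_id=True,          # enable the extra task-type embedding
)
model = ErnieModel(config)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```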
From 23ab164078643355a12d3afc9662be8b9a69bfda Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 01:42:38 +0800 Subject: [PATCH 07/23] update doc --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/model_doc/ernie.mdx | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 3353e82ad1e42..6f5aa7b13005f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -237,6 +237,8 @@ title: ELECTRA - local: model_doc/encoder-decoder title: Encoder Decoder Models + - local: model_doc/ernie + title: ERNIE - local: model_doc/flaubert title: FlauBERT - local: model_doc/fnet diff --git a/docs/source/en/model_doc/ernie.mdx b/docs/source/en/model_doc/ernie.mdx index db04a99c992c2..9edc35ff2ca37 100644 --- a/docs/source/en/model_doc/ernie.mdx +++ b/docs/source/en/model_doc/ernie.mdx @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Ernie +# ERNIE ## Overview ERNIE is a series of powerful models proposed by baidu, especially in Chinese tasks, From ae3132d17a787ffdb1bc6097b06330fb249ae779 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 02:08:36 +0800 Subject: [PATCH 08/23] make fix-copies --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 2 + docs/source/en/serialization.mdx | 1 + src/transformers/utils/dummy_pt_objects.py | 80 ++++++++++++++++++++++ 7 files changed, 87 insertions(+) diff --git a/README.md b/README.md index 2b728c212b3da..ac1803891d88d 100644 --- a/README.md +++ b/README.md @@ -295,6 +295,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](https://huggingface.co/docs/transformers/main/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. 
**[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. diff --git a/README_ko.md b/README_ko.md index a0df54a86c1a9..52b1afadcf987 100644 --- a/README_ko.md +++ b/README_ko.md @@ -247,6 +247,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](https://huggingface.co/docs/transformers/main/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. diff --git a/README_zh-hans.md b/README_zh-hans.md index 1141748161510..249b8ae4d7733 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -271,6 +271,7 @@ conda install -c huggingface transformers 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 1. 
**[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 +1. **[ERNIE](https://huggingface.co/docs/transformers/main/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index c0444b8ba6a1b..79236e4b23b2c 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -283,6 +283,7 @@ conda install -c huggingface transformers 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](https://huggingface.co/docs/transformers/main/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. 
**[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 15ac6aa50aefd..fddbb11c62aed 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -87,6 +87,7 @@ The documentation is organized into five sections: 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. @@ -229,6 +230,7 @@ Flax), PyTorch, and/or TensorFlow. 
| DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | | Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | +| Ernie | ❌ | ❌ | ✅ | ❌ | ❌ | | FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | | FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | | FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 31ad430e06434..466e6d815ece7 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -67,6 +67,7 @@ Ready-made configurations include the following architectures: - DETR - DistilBERT - ELECTRA +- Ernie - FlauBERT - GPT Neo - GPT-J diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 32ba979f78b62..0fb5b1e66ca46 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1865,6 +1865,86 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ErnieForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForMultipleChoice(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForNextSentencePrediction(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForPreTraining(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieLayer(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieLMHeadModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErnieModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ErniePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None From dc0b44ca1ee8c77ab87f49eb3c86b684f7feccd9 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 02:29:09 +0800 Subject: [PATCH 09/23] change model name --- src/transformers/models/auto/configuration_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 3dd72dd64391b..9ee459df932d8 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -316,7 +316,7 @@ ("dpt", "DPT"), ("electra", "ELECTRA"), ("encoder-decoder", "Encoder decoder"), - ("ernie", "Ernie"), + ("ernie", "ERNIE"), ("flaubert", "FlauBERT"), ("flava", "FLAVA"), ("fnet", "FNet"), From 
2ad263b63b88bbee5f003b8d1f6e7c9052cce387 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 02:33:44 +0800 Subject: [PATCH 10/23] change model name --- docs/source/en/index.mdx | 2 +- docs/source/en/serialization.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index fddbb11c62aed..753ac59a688cd 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -230,7 +230,7 @@ Flax), PyTorch, and/or TensorFlow. | DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | | Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | -| Ernie | ❌ | ❌ | ✅ | ❌ | ❌ | +| ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | | FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | | FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | | FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 466e6d815ece7..74f50c78513ce 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -67,7 +67,7 @@ Ready-made configurations include the following architectures: - DETR - DistilBERT - ELECTRA -- Ernie +- ERNIE - FlauBERT - GPT Neo - GPT-J From ddcfe13bda5a805301f6ef0cd9b4213c20d517eb Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 02:38:22 +0800 Subject: [PATCH 11/23] fix dependency --- src/transformers/dependency_versions_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 4bde0425cee2d..58e4a2cd42c37 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -62,6 +62,7 @@ "starlette": "starlette", "tensorflow-cpu": "tensorflow-cpu>=2.3,<2.10", "tensorflow": "tensorflow>=2.3,<2.10", + "tensorflow-text": "tensorflow-text", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", From 1514ca636152b2a5d7062349089838d7cac7f37b Mon Sep 17 00:00:00 2001 From: ydshieh Date: Thu, 8 Sep 2022 10:39:45 +0200 Subject: [PATCH 12/23] add more copied from --- .../models/ernie/modeling_ernie.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index e51764a06cac1..fee40744a1702 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -807,6 +807,7 @@ class ErnieModel(ErniePreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Ernie def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -819,12 +820,15 @@ def __init__(self, config, add_pooling_layer=True): # Initialize weights and apply final processing self.post_init() + # Copied from transformers.models.bert.modeling_bert.BertModel.get_input_embeddings def get_input_embeddings(self): return self.embeddings.word_embeddings + # Copied from transformers.models.bert.modeling_bert.BertModel.set_input_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base @@ -980,6 +984,7 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForPreTraining(ErniePreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -989,9 +994,11 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.get_output_embeddings def get_output_embeddings(self): return self.cls.predictions.decoder + # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @@ -1089,6 +1096,7 @@ class ErnieLMHeadModel(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1101,9 +1109,11 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.get_output_embeddings def get_output_embeddings(self): return self.cls.predictions.decoder + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @@ -1201,6 +1211,7 @@ def forward( cross_attentions=outputs.cross_attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly @@ -1213,6 +1224,7 @@ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=Non return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: @@ -1225,6 +1237,7 @@ class ErnieForMaskedLM(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1240,9 +1253,11 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.get_output_embeddings def get_output_embeddings(self): return self.cls.predictions.decoder + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @@ -1314,6 +1329,7 @@ def forward( attentions=outputs.attentions, ) + # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): input_shape = input_ids.shape 
effective_batch_size = input_shape[0] @@ -1336,6 +1352,7 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ ERNIE_START_DOCSTRING, ) class ErnieForNextSentencePrediction(ErniePreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1443,6 +1460,7 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForSequenceClassification(ErniePreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1541,6 +1559,7 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForMultipleChoice(ErniePreTrainedModel): + # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1640,6 +1659,7 @@ def forward( class ErnieForTokenClassification(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] + # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1720,6 +1740,7 @@ def forward( class ErnieForQuestionAnswering(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] + # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels From 3a4b5d91021d9db21b34026277138dddb97983c1 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 23:35:55 +0800 Subject: [PATCH 13/23] rename ErnieLMHeadModel to ErnieForCausalLM do not expose ErnieLayer update doc --- docs/source/en/model_doc/ernie.mdx | 32 +++++++++++++++++-- src/transformers/__init__.py | 6 ++-- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/models/ernie/__init__.py | 7 ++-- .../models/ernie/modeling_ernie.py | 13 ++++---- src/transformers/utils/dummy_pt_objects.py | 2 +- tests/models/ernie/test_modeling_ernie.py | 12 +++---- 7 files changed, 49 insertions(+), 25 deletions(-) diff --git a/docs/source/en/model_doc/ernie.mdx b/docs/source/en/model_doc/ernie.mdx index 9edc35ff2ca37..6ec3f10473200 100644 --- a/docs/source/en/model_doc/ernie.mdx +++ b/docs/source/en/model_doc/ernie.mdx @@ -19,6 +19,34 @@ including [ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.a These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle). 
+### How to use +Take `ernie-1.0-base-zh` as an example: + +```Python +from transformers import AutoTokenizer, AutoModel +tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh") +model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh") +``` + +### Supported Models + +| Model Name | Language | Description | +|:-------------------:|:--------:|:-------------------------------:| +| ernie-1.0-base-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-2.0-base-en | English | Layer:12, Heads:12, Hidden:768 | +| ernie-2.0-large-en | English | Layer:24, Heads:16, Hidden:1024 | +| ernie-3.0-base-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-3.0-medium-zh | Chinese | Layer:6, Heads:12, Hidden:768 | +| ernie-3.0-mini-zh | Chinese | Layer:6, Heads:12, Hidden:384 | +| ernie-3.0-micro-zh | Chinese | Layer:4, Heads:12, Hidden:384 | +| ernie-3.0-nano-zh | Chinese | Layer:4, Heads:12, Hidden:312 | +| ernie-health-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-gram-zh | Chinese | Layer:12, Heads:12, Hidden:768 | + +You can find all the supported models from huggingface's model hub: [huggingface.co/nghuyong](https://huggingface.co/nghuyong), and model details from paddle's official +repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html) +and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro). + ## ErnieConfig [[autodoc]] ErnieConfig @@ -38,9 +66,9 @@ These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and [[autodoc]] ErnieForPreTraining - forward -## ErnieLMHeadModel +## ErnieForCausalLM -[[autodoc]] ErnieLMHeadModel +[[autodoc]] ErnieForCausalLM - forward ## ErnieForMaskedLM diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6216841863f71..430240c29b566 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -902,8 +902,7 @@ "ErnieForQuestionAnswering", "ErnieForSequenceClassification", "ErnieForTokenClassification", - "ErnieLayer", - "ErnieLMHeadModel", + "ErnieForCausalLM", "ErnieModel", "ErniePreTrainedModel", ] @@ -3877,8 +3876,7 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieLayer, - ErnieLMHeadModel, + ErnieForCausalLM, ErnieModel, ErniePreTrainedModel, ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c084ec0545f43..4fb13417f42c8 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -286,7 +286,7 @@ ("ctrl", "CTRLLMHeadModel"), ("data2vec-text", "Data2VecTextForCausalLM"), ("electra", "ElectraForCausalLM"), - ("ernie", "ErnieLMHeadModel"), + ("ernie", "ErnieForCausalLM"), ("gpt2", "GPT2LMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), ("gpt_neox", "GPTNeoXForCausalLM"), diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index 548e37da8e5f8..af38f8027f613 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -40,9 +40,7 @@ "ErnieForQuestionAnswering", "ErnieForSequenceClassification", "ErnieForTokenClassification", - "ErnieLayer", - "ErnieLMHeadModel", - "ErnieModel", + "ErnieForCausalLM", "ErniePreTrainedModel", ] @@ -65,8 +63,7 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieLayer, - ErnieLMHeadModel, + ErnieForCausalLM, ErnieModel, ErniePreTrainedModel, ) diff --git 
a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index fee40744a1702..3ad77d1bcdcd2 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -762,8 +761,10 @@ class ErnieForPreTrainingOutput(ModelOutput): [What are token type IDs?](../glossary#token-type-ids) task_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of task types, the values of `task_type_ids` are usually the same. Indices are selected in `[0, - config.task_type_vocab_size]`. + Task type embedding is a special embedding to represent the characteristic of different tasks, such as + word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. + We assign a `task_type_id` to each task and the `task_type_id` is in the range + `[0, config.task_type_vocab_size-1] position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. @@ -1092,7 +1093,7 @@ def forward( @add_start_docstrings( """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING ) -class ErnieLMHeadModel(ErniePreTrainedModel): +class ErnieForCausalLM(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1101,7 +1102,7 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `ErnieLMHeadModel` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`") self.ernie = ErnieModel(config, add_pooling_layer=False) self.cls = ErnieOnlyMLMHead(config) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0fb5b1e66ca46..f8ff95318308b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1924,7 +1924,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ErnieLMHeadModel(metaclass=DummyObject): +class ErnieForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index eefacdeebebb1..fe982a4738593 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -37,7 +37,7 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieLMHeadModel, + ErnieForCausalLM, ErnieModel, ) from transformers.models.ernie.modeling_ernie import ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST @@ -218,7 +218,7 @@ def create_and_check_for_causal_lm( encoder_hidden_states, encoder_attention_mask, ): - model = ErnieLMHeadModel(config=config) + model = ErnieForCausalLM(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) 
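With the rename in this commit, the decoder entry point becomes `ErnieForCausalLM`, and its `__init__` warns unless `is_decoder=True` is set. A hedged usage sketch of the renamed class — the checkpoint name is taken from the doc table above, and whether that checkpoint ships a fully trained LM head is an assumption, so treat this as illustrative only:

```python
from transformers import AutoTokenizer, ErnieConfig, ErnieForCausalLM

config = ErnieConfig.from_pretrained("nghuyong/ernie-1.0-base-zh")
config.is_decoder = True  # silences the standalone-decoder warning from __init__

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
model = ErnieForCausalLM.from_pretrained("nghuyong/ernie-1.0-base-zh", config=config)

inputs = tokenizer("百度是一家公司", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])  # CLM loss over the input tokens
print(float(outputs.loss), outputs.logits.shape)
```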
@@ -246,7 +246,7 @@ def create_and_check_model_for_causal_lm_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = ErnieLMHeadModel(config=config) + model = ErnieForCausalLM(config=config) model.to(torch_device) model.eval() result = model( @@ -280,7 +280,7 @@ def create_and_check_decoder_model_past_large_inputs( ): config.is_decoder = True config.add_cross_attention = True - model = ErnieLMHeadModel(config=config).to(torch_device).eval() + model = ErnieForCausalLM(config=config).to(torch_device).eval() # first forward pass outputs = model( @@ -430,7 +430,7 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase) all_model_classes = ( ( ErnieModel, - ErnieLMHeadModel, + ErnieForCausalLM, ErnieForMaskedLM, ErnieForMultipleChoice, ErnieForNextSentencePrediction, @@ -442,7 +442,7 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase) if is_torch_available() else () ) - all_generative_model_classes = (ErnieLMHeadModel,) if is_torch_available() else () + all_generative_model_classes = (ErnieForCausalLM,) if is_torch_available() else () fx_compatible = False # special case for ForPreTraining model From 2ee2a093f96203a5bbb06209013d90214057cc73 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 23:50:07 +0800 Subject: [PATCH 14/23] fix --- src/transformers/models/ernie/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index af38f8027f613..13707762bfb44 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -41,6 +41,7 @@ "ErnieForSequenceClassification", "ErnieForTokenClassification", "ErnieForCausalLM", + "ErnieModel" "ErniePreTrainedModel", ] From 46c3fee2526d135a939b1b6a62bc132046dff727 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Thu, 8 Sep 2022 23:59:40 +0800 Subject: [PATCH 15/23] make style --- src/transformers/__init__.py | 2 +- src/transformers/models/ernie/__init__.py | 5 ++--- src/transformers/models/ernie/modeling_ernie.py | 8 ++++---- tests/models/ernie/test_modeling_ernie.py | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 430240c29b566..b081c294d2de6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3869,6 +3869,7 @@ from .models.encoder_decoder import EncoderDecoderModel from .models.ernie import ( ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST, + ErnieForCausalLM, ErnieForMaskedLM, ErnieForMultipleChoice, ErnieForNextSentencePrediction, @@ -3876,7 +3877,6 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieForCausalLM, ErnieModel, ErniePreTrainedModel, ) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index 13707762bfb44..a42cc581e662c 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -41,8 +41,7 @@ "ErnieForSequenceClassification", "ErnieForTokenClassification", "ErnieForCausalLM", - "ErnieModel" - "ErniePreTrainedModel", + "ErnieModelErniePreTrainedModel", ] @@ -57,6 +56,7 @@ else: from .modeling_ernie import ( ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST, + ErnieForCausalLM, ErnieForMaskedLM, ErnieForMultipleChoice, ErnieForNextSentencePrediction, @@ -64,7 +64,6 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieForCausalLM, ErnieModel, 
ErniePreTrainedModel, ) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 3ad77d1bcdcd2..5c1b7549b2e1a 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -761,10 +761,10 @@ class ErnieForPreTrainingOutput(ModelOutput): [What are token type IDs?](../glossary#token-type-ids) task_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Task type embedding is a special embedding to represent the characteristic of different tasks, such as - word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. - We assign a `task_type_id` to each task and the `task_type_id` is in the range - `[0, config.task_type_vocab_size-1] + Task type embedding is a special embedding to represent the characteristic of different tasks, such as + word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We + assign a `task_type_id` to each task and the `task_type_id` is in the range `[0, + config.task_type_vocab_size-1] position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index fe982a4738593..243550cea8d61 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -30,6 +30,7 @@ from transformers import ( MODEL_FOR_PRETRAINING_MAPPING, + ErnieForCausalLM, ErnieForMaskedLM, ErnieForMultipleChoice, ErnieForNextSentencePrediction, @@ -37,7 +38,6 @@ ErnieForQuestionAnswering, ErnieForSequenceClassification, ErnieForTokenClassification, - ErnieForCausalLM, ErnieModel, ) from transformers.models.ernie.modeling_ernie import ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST From 6d802470f6f40aef54ac6dfb2a954a21f7968f13 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:09:14 +0800 Subject: [PATCH 16/23] polish code --- src/transformers/__init__.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b081c294d2de6..0637da2e28c6a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -892,21 +892,6 @@ "load_tf_weights_in_bert", ] ) - _import_structure["models.ernie"].extend( - [ - "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST", - "ErnieForMaskedLM", - "ErnieForMultipleChoice", - "ErnieForNextSentencePrediction", - "ErnieForPreTraining", - "ErnieForQuestionAnswering", - "ErnieForSequenceClassification", - "ErnieForTokenClassification", - "ErnieForCausalLM", - "ErnieModel", - "ErniePreTrainedModel", - ] - ) _import_structure["models.bert_generation"].extend( [ "BertGenerationDecoder", @@ -1168,6 +1153,21 @@ ] ) _import_structure["models.encoder_decoder"].append("EncoderDecoderModel") + _import_structure["models.ernie"].extend( + [ + "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ErnieForMaskedLM", + "ErnieForMultipleChoice", + "ErnieForNextSentencePrediction", + "ErnieForPreTraining", + "ErnieForQuestionAnswering", + "ErnieForSequenceClassification", + "ErnieForTokenClassification", + "ErnieForCausalLM", + "ErnieModel", + "ErniePreTrainedModel", + ] + ) _import_structure["models.flaubert"].extend( [ "FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", From 9963d11392557a647b235a9ec824042fcaa4c035 Mon 
Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:14:28 +0800 Subject: [PATCH 17/23] polish code --- .../models/ernie/modeling_ernie.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 21 +++++++------------ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 5c1b7549b2e1a..878840c83fe4a 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -1102,7 +1102,7 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `ErnieLMHeadModel` as a standalone, add `is_decoder=True.`") self.ernie = ErnieModel(config, add_pooling_layer=False) self.cls = ErnieOnlyMLMHead(config) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f8ff95318308b..f5509359d9462 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1868,6 +1868,13 @@ def __init__(self, *args, **kwargs): ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = None +class ErnieForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class ErnieForMaskedLM(metaclass=DummyObject): _backends = ["torch"] @@ -1917,20 +1924,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ErnieLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class ErnieForCausalLM(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ErnieModel(metaclass=DummyObject): _backends = ["torch"] From ca6b58f53b2698d25803865a3b75580198333432 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:22:50 +0800 Subject: [PATCH 18/23] fix --- src/transformers/models/ernie/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index a42cc581e662c..b348341c0a8d2 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -41,6 +41,7 @@ "ErnieForSequenceClassification", "ErnieForTokenClassification", "ErnieForCausalLM", + "ErnieModel", "ErnieModelErniePreTrainedModel", ] From 31ed2da650f4c7382497c323d4bc5792a8c8b9fe Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:26:23 +0800 Subject: [PATCH 19/23] fix --- src/transformers/models/ernie/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index b348341c0a8d2..433dede367feb 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -42,10 +42,10 @@ "ErnieForTokenClassification", "ErnieForCausalLM", "ErnieModel", + "ErniePreTrainedModel", "ErnieModelErniePreTrainedModel", ] - if TYPE_CHECKING: from .configuration_ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig, ErnieOnnxConfig From 56a9b1e1870cd280ab1f88fac152ab4044effcfe Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:27:26 +0800 Subject: [PATCH 20/23] fix --- src/transformers/models/ernie/__init__.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index 433dede367feb..e8e5a21abfce6 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -43,7 +43,6 @@ "ErnieForCausalLM", "ErnieModel", "ErniePreTrainedModel", - "ErnieModelErniePreTrainedModel", ] if TYPE_CHECKING: From 4c13047a2569d2f7e8d9ade7e6202c8d2e6f9b52 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:27:54 +0800 Subject: [PATCH 21/23] fix --- src/transformers/models/ernie/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/ernie/__init__.py b/src/transformers/models/ernie/__init__.py index e8e5a21abfce6..b8dce9a15b594 100644 --- a/src/transformers/models/ernie/__init__.py +++ b/src/transformers/models/ernie/__init__.py @@ -33,6 +33,7 @@ else: _import_structure["modeling_ernie"] = [ "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ErnieForCausalLM", "ErnieForMaskedLM", "ErnieForMultipleChoice", "ErnieForNextSentencePrediction", @@ -40,7 +41,6 @@ "ErnieForQuestionAnswering", "ErnieForSequenceClassification", "ErnieForTokenClassification", - "ErnieForCausalLM", "ErnieModel", "ErniePreTrainedModel", ] From 0f1f343341e43c7260de596935c040f14e3e10d8 Mon Sep 17 00:00:00 2001 From: rightyonghu Date: Fri, 9 Sep 2022 00:28:47 +0800 Subject: [PATCH 22/23] fix --- src/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0637da2e28c6a..98479d4693661 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1156,6 +1156,7 @@ _import_structure["models.ernie"].extend( [ "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST", + "ErnieForCausalLM", "ErnieForMaskedLM", "ErnieForMultipleChoice", "ErnieForNextSentencePrediction", @@ -1163,7 +1164,6 @@ "ErnieForQuestionAnswering", "ErnieForSequenceClassification", "ErnieForTokenClassification", - "ErnieForCausalLM", "ErnieModel", "ErniePreTrainedModel", ] From 685728232335792f1746f2a5772e60545a545810 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Fri, 9 Sep 2022 10:34:28 +0200 Subject: [PATCH 23/23] final fix --- src/transformers/models/ernie/modeling_ernie.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 878840c83fe4a..0af6be3a707f0 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -1097,12 +1097,12 @@ class ErnieForCausalLM(ErniePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with Bert->Ernie,bert->ernie + # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `ErnieLMHeadModel` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`") self.ernie = ErnieModel(config, add_pooling_layer=False) self.cls = ErnieOnlyMLMHead(config)
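Most of the follow-up commits above revolve around one Python pitfall in the `_import_structure` list: without a separating comma, adjacent string literals are concatenated implicitly, which is how the bogus `"ErnieModelErniePreTrainedModel"` entry appeared and then had to be unpicked over several fixes. A tiny standalone illustration (the variable names are made up for the example):

```python
exports_buggy = [
    "ErnieForCausalLM",
    "ErnieModel"  # missing comma: the next literal is glued onto this one
    "ErniePreTrainedModel",
]
assert exports_buggy == ["ErnieForCausalLM", "ErnieModelErniePreTrainedModel"]

exports_fixed = [
    "ErnieForCausalLM",
    "ErnieModel",
    "ErniePreTrainedModel",
]
assert "ErnieModel" in exports_fixed and "ErniePreTrainedModel" in exports_fixed
```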